Compare commits
	
		
			35 Commits
		
	
	
		
			superclust
			...
			mucosim_cu
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					b024adaf5b | ||
| 
						 | 
					696e6da01d | ||
| 
						 | 
					b2a6574426 | ||
| 
						 | 
					c4080e866e | ||
| 
						 | 
					7b592b5fc7 | ||
| 
						 | 
					4690542db5 | ||
| 
						 | 
					8c131a7699 | ||
| 
						 | 
					dc4d5f1a9c | ||
| 
						 | 
					50007216ed | ||
| 
						 | 
					72e4599acc | ||
| 
						 | 
					8fa03733e9 | ||
| 
						 | 
					bf1ae3d013 | ||
| 
						 | 
					8009b54113 | ||
| 
						 | 
					0ea0587442 | ||
| 
						 | 
					134e3f4b78 | ||
| 
						 | 
					c2bfa3ca3f | ||
| 
						 | 
					2a099da5b7 | ||
| 
						 | 
					7691b23d67 | ||
| 
						 | 
					da90466f98 | ||
| 
						 | 
					8f723c1299 | ||
| 
						 | 
					0586ef150a | ||
| 
						 | 
					2e5d973f7d | ||
| 
						 | 
					e2fd1a0476 | ||
| 
						 | 
					4105c844c6 | ||
| 
						 | 
					1f5c9c4b23 | ||
| 
						 | 
					29e115464b | ||
| 
						 | 
					1a54314c8b | ||
| 
						 | 
					280f595b7f | ||
| 
						 | 
					3428974730 | ||
| 
						 | 
					b54842f764 | ||
| 
						 | 
					9730164e6f | ||
| 
						 | 
					0f5fdd3708 | ||
| 
						 | 
					3f7fb7f22a | ||
| 
						 | 
					bfa6c581c3 | ||
| 
						 | 
					fd886e77eb | 
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -27,6 +27,7 @@
 | 
				
			|||||||
*.so
 | 
					*.so
 | 
				
			||||||
*.so.*
 | 
					*.so.*
 | 
				
			||||||
*.dylib
 | 
					*.dylib
 | 
				
			||||||
 | 
					.DS_Store
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Executables
 | 
					# Executables
 | 
				
			||||||
*.exe
 | 
					*.exe
 | 
				
			||||||
@@ -52,6 +53,7 @@ Mkfile.old
 | 
				
			|||||||
dkms.conf
 | 
					dkms.conf
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Build directories and executables
 | 
					# Build directories and executables
 | 
				
			||||||
 | 
					.vscode/
 | 
				
			||||||
GCC/
 | 
					GCC/
 | 
				
			||||||
ICC/
 | 
					ICC/
 | 
				
			||||||
MDBench-GCC*
 | 
					MDBench-GCC*
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										8
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										8
									
								
								Makefile
									
									
									
									
									
								
							@@ -58,6 +58,9 @@ ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.
 | 
				
			|||||||
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
 | 
					OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
 | 
				
			||||||
OBJ       = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
 | 
					OBJ       = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
 | 
				
			||||||
OBJ      += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
 | 
					OBJ      += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
 | 
				
			||||||
 | 
					OBJ		 += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
 | 
					CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# $(warning $(OBJ))
 | 
					# $(warning $(OBJ))
 | 
				
			||||||
@@ -88,6 +91,11 @@ $(BUILD_DIR)/%.o:  %.s
 | 
				
			|||||||
	$(info ===>  ASSEMBLE  $@)
 | 
						$(info ===>  ASSEMBLE  $@)
 | 
				
			||||||
	$(Q)$(AS) $< -o $@
 | 
						$(Q)$(AS) $< -o $@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					$(BUILD_DIR)/%.o:  %.cu
 | 
				
			||||||
 | 
						$(info ===>  COMPILE  $@)
 | 
				
			||||||
 | 
						$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
 | 
				
			||||||
 | 
						$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM  $< > $(BUILD_DIR)/$*.d
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.PHONY: clean distclean tags info asm
 | 
					.PHONY: clean distclean tags info asm
 | 
				
			||||||
 | 
					
 | 
				
			||||||
clean:
 | 
					clean:
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,5 +1,5 @@
 | 
				
			|||||||
# Compiler tag (GCC/CLANG/ICC)
 | 
					# Compiler tag (GCC/CLANG/ICC/NVCC)
 | 
				
			||||||
TAG ?= CLANG
 | 
					TAG ?= NVCC
 | 
				
			||||||
# Enable likwid (true or false)
 | 
					# Enable likwid (true or false)
 | 
				
			||||||
ENABLE_LIKWID ?= false
 | 
					ENABLE_LIKWID ?= false
 | 
				
			||||||
# SP or DP
 | 
					# SP or DP
 | 
				
			||||||
@@ -22,7 +22,7 @@ INDEX_TRACER ?= false
 | 
				
			|||||||
# Vector width (elements) for index and distance tracer
 | 
					# Vector width (elements) for index and distance tracer
 | 
				
			||||||
VECTOR_WIDTH ?= 8
 | 
					VECTOR_WIDTH ?= 8
 | 
				
			||||||
# Compute statistics
 | 
					# Compute statistics
 | 
				
			||||||
COMPUTE_STATS ?= true
 | 
					COMPUTE_STATS ?= false
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#Feature options
 | 
					#Feature options
 | 
				
			||||||
OPTIONS =  -DALIGNMENT=64
 | 
					OPTIONS =  -DALIGNMENT=64
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										10
									
								
								evaluate_cpu_openmpi_metrics.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								evaluate_cpu_openmpi_metrics.sh
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
				
			|||||||
 | 
					END=32
 | 
				
			||||||
 | 
					for ((i=1;i<=END;i++)); do
 | 
				
			||||||
 | 
						output=$(eval "likwid-mpirun -np 1 -t $i -m -g FLOPS_DP -omp gnu ./MDBench-GCC -n 50")
 | 
				
			||||||
 | 
						echo -n "$i,"
 | 
				
			||||||
 | 
						echo "$output" > "FLOPS_DP/thread_$i.txt"
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## likwid perf measurements on testfront1:
 | 
				
			||||||
 | 
					# srun --nodes=1 --exclusive --nodelist=rome1 --time=00:30:00 --export=NONE -c 64 -C hwperf --pty /bin/bash -l
 | 
				
			||||||
 | 
					# likwid-mpirun -np 1 -t 32 -m -g MEM -omp gnu -d ./MDBench-GCC
 | 
				
			||||||
							
								
								
									
										6
									
								
								evaluate_cpu_runtime.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								evaluate_cpu_runtime.sh
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,6 @@
 | 
				
			|||||||
 | 
					#!/bin/bash
 | 
				
			||||||
 | 
					for i in $(seq 1 32); do
 | 
				
			||||||
 | 
						echo "$i"
 | 
				
			||||||
 | 
						export "OMP_NUM_THREADS=$i"
 | 
				
			||||||
 | 
						./MDBench-GCC -n 50 | grep "Performance"
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
							
								
								
									
										5
									
								
								evaluate_gpu_ncu_profiles_per_thread.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								evaluate_gpu_ncu_profiles_per_thread.sh
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,5 @@
 | 
				
			|||||||
 | 
					END=32
 | 
				
			||||||
 | 
					for ((i=16;i<=END;i++)); do
 | 
				
			||||||
 | 
						export NUM_THREADS=$i
 | 
				
			||||||
 | 
						$(eval "ncu --set full -o /home/hpc/rzku/ptfs410h/MD-Bench/log/MG/presentation_2/Resources/GPU/Metrics/threads_$i ./MDBench-NVCC -n 50")
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
							
								
								
									
										6
									
								
								evaluate_gpu_perf_per_thread.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								evaluate_gpu_perf_per_thread.sh
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,6 @@
 | 
				
			|||||||
 | 
					END=64
 | 
				
			||||||
 | 
					for ((i=1;i<=END;i*=2)); do
 | 
				
			||||||
 | 
						output=$(eval "NUM_THREADS=$i ./MDBench-NVCC -n 2000")
 | 
				
			||||||
 | 
						echo -n "$i,"
 | 
				
			||||||
 | 
						echo "$output" | grep 'atom updates per second' | sed 's/[^0-9.,]//g' | awk '{print $1"e6"}'
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
							
								
								
									
										15
									
								
								include_NVCC.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								include_NVCC.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,15 @@
 | 
				
			|||||||
 | 
					CC  = nvcc
 | 
				
			||||||
 | 
					LINKER = $(CC)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					ANSI_CFLAGS  = -ansi
 | 
				
			||||||
 | 
					ANSI_CFLAGS += -std=c99
 | 
				
			||||||
 | 
					ANSI_CFLAGS += -pedantic
 | 
				
			||||||
 | 
					ANSI_CFLAGS += -Wextra
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# CFLAGS   = -O0 -g  -std=c99 -fargument-noalias
 | 
				
			||||||
 | 
					CFLAGS   = -O3 -g -arch=sm_61 # -fopenmp
 | 
				
			||||||
 | 
					ASFLAGS  =  -masm=intel
 | 
				
			||||||
 | 
					LFLAGS   =
 | 
				
			||||||
 | 
					DEFINES  = -D_GNU_SOURCE -DLIKWID_PERFMON
 | 
				
			||||||
 | 
					INCLUDES = $(LIKWID_INC)
 | 
				
			||||||
 | 
					LIBS     = -lm $(LIKWID_LIB) -llikwid -lcuda -lcudart
 | 
				
			||||||
@@ -25,11 +25,29 @@
 | 
				
			|||||||
#include <string.h>
 | 
					#include <string.h>
 | 
				
			||||||
#include <errno.h>
 | 
					#include <errno.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <cuda_runtime.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void checkCUDAError(const char *msg, cudaError_t err)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    if (err != cudaSuccess)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					        //print a human readable error message
 | 
				
			||||||
 | 
					        printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
 | 
				
			||||||
 | 
					        exit(-1);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void* allocate (int alignment, size_t bytesize)
 | 
					void* allocate (int alignment, size_t bytesize)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    int errorCode;
 | 
					    int errorCode;
 | 
				
			||||||
    void* ptr;
 | 
					    void* ptr;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "allocate", cudaMallocHost((void**)&ptr, bytesize) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return ptr;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    /*
 | 
				
			||||||
    errorCode =  posix_memalign(&ptr, alignment, bytesize);
 | 
					    errorCode =  posix_memalign(&ptr, alignment, bytesize);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (errorCode) {
 | 
					    if (errorCode) {
 | 
				
			||||||
@@ -51,6 +69,7 @@ void* allocate (int alignment, size_t bytesize)
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return ptr;
 | 
					    return ptr;
 | 
				
			||||||
 | 
					    */
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void* reallocate (
 | 
					void* reallocate (
 | 
				
			||||||
@@ -59,11 +78,11 @@ void* reallocate (
 | 
				
			|||||||
        size_t newBytesize,
 | 
					        size_t newBytesize,
 | 
				
			||||||
        size_t oldBytesize)
 | 
					        size_t oldBytesize)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    void* newarray =  allocate(alignment, newBytesize);
 | 
					    void* newarray = allocate(alignment, newBytesize);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if(ptr != NULL) {
 | 
					    if(ptr != NULL) {
 | 
				
			||||||
        memcpy(newarray, ptr, oldBytesize);
 | 
					        memcpy(newarray, ptr, oldBytesize);
 | 
				
			||||||
        free(ptr);
 | 
					        cudaFreeHost(ptr);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return newarray;
 | 
					    return newarray;
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										33
									
								
								src/atom.c
									
									
									
									
									
								
							
							
						
						
									
										33
									
								
								src/atom.c
									
									
									
									
									
								
							@@ -30,6 +30,9 @@
 | 
				
			|||||||
#include <allocate.h>
 | 
					#include <allocate.h>
 | 
				
			||||||
#include <util.h>
 | 
					#include <util.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <cuda_runtime.h>
 | 
				
			||||||
 | 
					#include <device_launch_parameters.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DELTA 20000
 | 
					#define DELTA 20000
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void initAtom(Atom *atom)
 | 
					void initAtom(Atom *atom)
 | 
				
			||||||
@@ -57,10 +60,10 @@ void createAtom(Atom *atom, Parameter *param)
 | 
				
			|||||||
    atom->Natoms = 4 * param->nx * param->ny * param->nz;
 | 
					    atom->Natoms = 4 * param->nx * param->ny * param->nz;
 | 
				
			||||||
    atom->Nlocal = 0;
 | 
					    atom->Nlocal = 0;
 | 
				
			||||||
    atom->ntypes = param->ntypes;
 | 
					    atom->ntypes = param->ntypes;
 | 
				
			||||||
    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
					    checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
				
			||||||
    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
					    checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
				
			||||||
    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
					    checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
				
			||||||
    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
					    checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
				
			||||||
    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
 | 
					    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
 | 
				
			||||||
        atom->epsilon[i] = param->epsilon;
 | 
					        atom->epsilon[i] = param->epsilon;
 | 
				
			||||||
        atom->sigma6[i] = param->sigma6;
 | 
					        atom->sigma6[i] = param->sigma6;
 | 
				
			||||||
@@ -134,9 +137,9 @@ void createAtom(Atom *atom, Parameter *param)
 | 
				
			|||||||
                atom_x(atom->Nlocal) = xtmp;
 | 
					                atom_x(atom->Nlocal) = xtmp;
 | 
				
			||||||
                atom_y(atom->Nlocal) = ytmp;
 | 
					                atom_y(atom->Nlocal) = ytmp;
 | 
				
			||||||
                atom_z(atom->Nlocal) = ztmp;
 | 
					                atom_z(atom->Nlocal) = ztmp;
 | 
				
			||||||
                atom->vx[atom->Nlocal] = vxtmp;
 | 
					                atom_vx(atom->Nlocal) = vxtmp;
 | 
				
			||||||
                atom->vy[atom->Nlocal] = vytmp;
 | 
					                atom_vy(atom->Nlocal) = vytmp;
 | 
				
			||||||
                atom->vz[atom->Nlocal] = vztmp;
 | 
					                atom_vz(atom->Nlocal) = vztmp;
 | 
				
			||||||
                atom->type[atom->Nlocal] = rand() % atom->ntypes;
 | 
					                atom->type[atom->Nlocal] = rand() % atom->ntypes;
 | 
				
			||||||
                atom->Nlocal++;
 | 
					                atom->Nlocal++;
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
@@ -159,16 +162,24 @@ void growAtom(Atom *atom)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    #ifdef AOS
 | 
					    #ifdef AOS
 | 
				
			||||||
    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
 | 
					    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
 | 
				
			||||||
    #else
 | 
					    #else
 | 
				
			||||||
    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
					    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
				
			||||||
    atom->y  = (MD_FLOAT*) reallocate(atom->y,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
					    atom->y  = (MD_FLOAT*) reallocate(atom->y,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
				
			||||||
    atom->z  = (MD_FLOAT*) reallocate(atom->z,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
					    atom->z  = (MD_FLOAT*) reallocate(atom->z,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
				
			||||||
    #endif
 | 
					
 | 
				
			||||||
    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
					 | 
				
			||||||
    atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
					 | 
				
			||||||
    atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
					 | 
				
			||||||
    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
					    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
				
			||||||
    atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
					    atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
				
			||||||
    atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
					    atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
				
			||||||
 | 
					    atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
				
			||||||
 | 
					    atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    #endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
 | 
					    atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										106
									
								
								src/force.c
									
									
									
									
									
								
							
							
						
						
									
										106
									
								
								src/force.c
									
									
									
									
									
								
							@@ -1,106 +0,0 @@
 | 
				
			|||||||
/*
 | 
					 | 
				
			||||||
 * =======================================================================================
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
					 | 
				
			||||||
 *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 *   This file is part of MD-Bench.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 *   MD-Bench is free software: you can redistribute it and/or modify it
 | 
					 | 
				
			||||||
 *   under the terms of the GNU Lesser General Public License as published
 | 
					 | 
				
			||||||
 *   by the Free Software Foundation, either version 3 of the License, or
 | 
					 | 
				
			||||||
 *   (at your option) any later version.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
 | 
					 | 
				
			||||||
 *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 | 
					 | 
				
			||||||
 *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 | 
					 | 
				
			||||||
 *   details.
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 *   You should have received a copy of the GNU Lesser General Public License along
 | 
					 | 
				
			||||||
 *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
 | 
					 | 
				
			||||||
 * =======================================================================================
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
#include <likwid-marker.h>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <timing.h>
 | 
					 | 
				
			||||||
#include <neighbor.h>
 | 
					 | 
				
			||||||
#include <parameter.h>
 | 
					 | 
				
			||||||
#include <atom.h>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
double computeForce(
 | 
					 | 
				
			||||||
        Parameter *param,
 | 
					 | 
				
			||||||
        Atom *atom,
 | 
					 | 
				
			||||||
        Neighbor *neighbor
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
{
 | 
					 | 
				
			||||||
    int Nlocal = atom->Nlocal;
 | 
					 | 
				
			||||||
    int* neighs;
 | 
					 | 
				
			||||||
    MD_FLOAT* fx = atom->fx;
 | 
					 | 
				
			||||||
    MD_FLOAT* fy = atom->fy;
 | 
					 | 
				
			||||||
    MD_FLOAT* fz = atom->fz;
 | 
					 | 
				
			||||||
#ifndef EXPLICIT_TYPES
 | 
					 | 
				
			||||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
					 | 
				
			||||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
					 | 
				
			||||||
    MD_FLOAT epsilon = param->epsilon;
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for(int i = 0; i < Nlocal; i++) {
 | 
					 | 
				
			||||||
        fx[i] = 0.0;
 | 
					 | 
				
			||||||
        fy[i] = 0.0;
 | 
					 | 
				
			||||||
        fz[i] = 0.0;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    double S = getTimeStamp();
 | 
					 | 
				
			||||||
    LIKWID_MARKER_START("force");
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#pragma omp parallel for
 | 
					 | 
				
			||||||
    for(int i = 0; i < Nlocal; i++) {
 | 
					 | 
				
			||||||
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
 | 
					 | 
				
			||||||
        int numneighs = neighbor->numneigh[i];
 | 
					 | 
				
			||||||
        MD_FLOAT xtmp = atom_x(i);
 | 
					 | 
				
			||||||
        MD_FLOAT ytmp = atom_y(i);
 | 
					 | 
				
			||||||
        MD_FLOAT ztmp = atom_z(i);
 | 
					 | 
				
			||||||
        MD_FLOAT fix = 0;
 | 
					 | 
				
			||||||
        MD_FLOAT fiy = 0;
 | 
					 | 
				
			||||||
        MD_FLOAT fiz = 0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef EXPLICIT_TYPES
 | 
					 | 
				
			||||||
        const int type_i = atom->type[i];
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        for(int k = 0; k < numneighs; k++) {
 | 
					 | 
				
			||||||
            int j = neighs[k];
 | 
					 | 
				
			||||||
            MD_FLOAT delx = xtmp - atom_x(j);
 | 
					 | 
				
			||||||
            MD_FLOAT dely = ytmp - atom_y(j);
 | 
					 | 
				
			||||||
            MD_FLOAT delz = ztmp - atom_z(j);
 | 
					 | 
				
			||||||
            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#ifdef EXPLICIT_TYPES
 | 
					 | 
				
			||||||
            const int type_j = atom->type[j];
 | 
					 | 
				
			||||||
            const int type_ij = type_i * atom->ntypes + type_j;
 | 
					 | 
				
			||||||
            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
 | 
					 | 
				
			||||||
            const MD_FLOAT sigma6 = atom->sigma6[type_ij];
 | 
					 | 
				
			||||||
            const MD_FLOAT epsilon = atom->epsilon[type_ij];
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            if(rsq < cutforcesq) {
 | 
					 | 
				
			||||||
                MD_FLOAT sr2 = 1.0 / rsq;
 | 
					 | 
				
			||||||
                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
 | 
					 | 
				
			||||||
                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
 | 
					 | 
				
			||||||
                fix += delx * force;
 | 
					 | 
				
			||||||
                fiy += dely * force;
 | 
					 | 
				
			||||||
                fiz += delz * force;
 | 
					 | 
				
			||||||
            }
 | 
					 | 
				
			||||||
        }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        fx[i] += fix;
 | 
					 | 
				
			||||||
        fy[i] += fiy;
 | 
					 | 
				
			||||||
        fz[i] += fiz;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    LIKWID_MARKER_STOP("force");
 | 
					 | 
				
			||||||
    double E = getTimeStamp();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    return E-S;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
							
								
								
									
										274
									
								
								src/force.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										274
									
								
								src/force.cu
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,274 @@
 | 
				
			|||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * =======================================================================================
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
				
			||||||
 | 
					 *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *   This file is part of MD-Bench.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *   MD-Bench is free software: you can redistribute it and/or modify it
 | 
				
			||||||
 | 
					 *   under the terms of the GNU Lesser General Public License as published
 | 
				
			||||||
 | 
					 *   by the Free Software Foundation, either version 3 of the License, or
 | 
				
			||||||
 | 
					 *   (at your option) any later version.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
 | 
				
			||||||
 | 
					 *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 | 
				
			||||||
 | 
					 *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 | 
				
			||||||
 | 
					 *   details.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *   You should have received a copy of the GNU Lesser General Public License along
 | 
				
			||||||
 | 
					 *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
 | 
				
			||||||
 | 
					 * =======================================================================================
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					#include <math.h>
 | 
				
			||||||
 | 
					#include <stdio.h>
 | 
				
			||||||
 | 
					#include <stdlib.h>
 | 
				
			||||||
 | 
					#include <stddef.h>
 | 
				
			||||||
 | 
					#include <cuda_profiler_api.h>
 | 
				
			||||||
 | 
					#include <cuda_runtime.h>
 | 
				
			||||||
 | 
					#include <device_launch_parameters.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern "C" {
 | 
				
			||||||
 | 
					    #include <likwid-marker.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    #include <timing.h>
 | 
				
			||||||
 | 
					    #include <neighbor.h>
 | 
				
			||||||
 | 
					    #include <parameter.h>
 | 
				
			||||||
 | 
					    #include <atom.h>
 | 
				
			||||||
 | 
					    #include <allocate.h>
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// cuda kernel
 | 
				
			||||||
 | 
					__global__ void calc_force(
 | 
				
			||||||
 | 
					    Atom a,
 | 
				
			||||||
 | 
					    MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon,
 | 
				
			||||||
 | 
					    int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int i = blockIdx.x * blockDim.x + threadIdx.x;
 | 
				
			||||||
 | 
					    if( i >= Nlocal ) {
 | 
				
			||||||
 | 
					        return;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Atom *atom = &a;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int numneighs = neigh_numneigh[i];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    MD_FLOAT xtmp = atom_x(i);
 | 
				
			||||||
 | 
					    MD_FLOAT ytmp = atom_y(i);
 | 
				
			||||||
 | 
					    MD_FLOAT ztmp = atom_z(i);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    MD_FLOAT fix = 0;
 | 
				
			||||||
 | 
					    MD_FLOAT fiy = 0;
 | 
				
			||||||
 | 
					    MD_FLOAT fiz = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int k = 0; k < numneighs; k++) {
 | 
				
			||||||
 | 
					        int j = neigh_neighbors[atom->Nlocal * k + i];
 | 
				
			||||||
 | 
					        MD_FLOAT delx = xtmp - atom_x(j);
 | 
				
			||||||
 | 
					        MD_FLOAT dely = ytmp - atom_y(j);
 | 
				
			||||||
 | 
					        MD_FLOAT delz = ztmp - atom_z(j);
 | 
				
			||||||
 | 
					        MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef EXPLICIT_TYPES
 | 
				
			||||||
 | 
					        const int type_j = atom->type[j];
 | 
				
			||||||
 | 
					        const int type_ij = type_i * atom->ntypes + type_j;
 | 
				
			||||||
 | 
					        const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
 | 
				
			||||||
 | 
					        const MD_FLOAT sigma6 = atom->sigma6[type_ij];
 | 
				
			||||||
 | 
					        const MD_FLOAT epsilon = atom->epsilon[type_ij];
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if(rsq < cutforcesq) {
 | 
				
			||||||
 | 
					            MD_FLOAT sr2 = 1.0 / rsq;
 | 
				
			||||||
 | 
					            MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
 | 
				
			||||||
 | 
					            MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
 | 
				
			||||||
 | 
					            fix += delx * force;
 | 
				
			||||||
 | 
					            fiy += dely * force;
 | 
				
			||||||
 | 
					            fiz += delz * force;
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    atom_fx(i) = fix;
 | 
				
			||||||
 | 
					    atom_fy(i) = fiy;
 | 
				
			||||||
 | 
					    atom_fz(i) = fiz;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int i = blockIdx.x * blockDim.x + threadIdx.x;
 | 
				
			||||||
 | 
					    if( i >= Nlocal ) {
 | 
				
			||||||
 | 
					        return;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Atom *atom = &a;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    atom_vx(i) += dtforce * atom_fx(i);
 | 
				
			||||||
 | 
					    atom_vy(i) += dtforce * atom_fy(i);
 | 
				
			||||||
 | 
					    atom_vz(i) += dtforce * atom_fz(i);
 | 
				
			||||||
 | 
					    atom_x(i) = atom_x(i) + dt * atom_vx(i);
 | 
				
			||||||
 | 
					    atom_y(i) = atom_y(i) + dt * atom_vy(i);
 | 
				
			||||||
 | 
					    atom_z(i) = atom_z(i) + dt * atom_vz(i);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int i = blockIdx.x * blockDim.x + threadIdx.x;
 | 
				
			||||||
 | 
					    if( i >= Nlocal ) {
 | 
				
			||||||
 | 
					        return;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Atom *atom = &a;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    atom_vx(i) += dtforce * atom_fx(i);
 | 
				
			||||||
 | 
					    atom_vy(i) += dtforce * atom_fy(i);
 | 
				
			||||||
 | 
					    atom_vz(i) += dtforce * atom_fz(i);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern "C" {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static Atom c_atom;
 | 
				
			||||||
 | 
					int *c_neighs;
 | 
				
			||||||
 | 
					int *c_neigh_numneigh;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int get_num_threads() {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const char *num_threads_env = getenv("NUM_THREADS");
 | 
				
			||||||
 | 
					    int num_threads = 0;
 | 
				
			||||||
 | 
					    if(num_threads_env == nullptr)
 | 
				
			||||||
 | 
					        num_threads = 32;
 | 
				
			||||||
 | 
					    else {
 | 
				
			||||||
 | 
					        num_threads = atoi(num_threads_env);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return num_threads;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int Nlocal = atom->Nlocal;
 | 
				
			||||||
 | 
					    const int num_threads = get_num_threads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
 | 
				
			||||||
 | 
					    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, c_atom);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
 | 
				
			||||||
 | 
					    checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if(doReneighbour) {
 | 
				
			||||||
 | 
					        checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int Nlocal = atom->Nlocal;
 | 
				
			||||||
 | 
					    const int num_threads = get_num_threads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
 | 
				
			||||||
 | 
					    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, c_atom);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
 | 
				
			||||||
 | 
					    checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if(doReneighbour) {
 | 
				
			||||||
 | 
					        checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    checkCUDAError( "InitialIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void initCudaAtom(Atom *atom, Neighbor *neighbor) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int Nlocal = atom->Nlocal;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal * 3) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.vx malloc", cudaMalloc((void**)&(c_atom.vx), sizeof(MD_FLOAT) * Nlocal * 3) );
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.vx memcpy", cudaMemcpy(c_atom.vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) );
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) );
 | 
				
			||||||
 | 
					    checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					double computeForce(
 | 
				
			||||||
 | 
					        bool reneighbourHappenend,
 | 
				
			||||||
 | 
					        Parameter *param,
 | 
				
			||||||
 | 
					        Atom *atom,
 | 
				
			||||||
 | 
					        Neighbor *neighbor
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    int Nlocal = atom->Nlocal;
 | 
				
			||||||
 | 
					#ifndef EXPLICIT_TYPES
 | 
				
			||||||
 | 
					    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
				
			||||||
 | 
					    MD_FLOAT sigma6 = param->sigma6;
 | 
				
			||||||
 | 
					    MD_FLOAT epsilon = param->epsilon;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int num_threads = get_num_threads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    c_atom.Natoms = atom->Natoms;
 | 
				
			||||||
 | 
					    c_atom.Nlocal = atom->Nlocal;
 | 
				
			||||||
 | 
					    c_atom.Nghost = atom->Nghost;
 | 
				
			||||||
 | 
					    c_atom.Nmax = atom->Nmax;
 | 
				
			||||||
 | 
					    c_atom.ntypes = atom->ntypes;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    /*
 | 
				
			||||||
 | 
					    int nDevices;
 | 
				
			||||||
 | 
					    cudaGetDeviceCount(&nDevices);
 | 
				
			||||||
 | 
					    size_t free, total;
 | 
				
			||||||
 | 
					    for(int i = 0; i < nDevices; ++i) {
 | 
				
			||||||
 | 
					        cudaMemGetInfo( &free, &total );
 | 
				
			||||||
 | 
					        cudaDeviceProp prop;
 | 
				
			||||||
 | 
					        cudaGetDeviceProperties(&prop, i);
 | 
				
			||||||
 | 
					        printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // checkCUDAError( "c_atom.fx memset", cudaMemset(c_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cudaProfilerStart();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if(reneighbourHappenend) {
 | 
				
			||||||
 | 
					        checkCUDAError( "c_neigh_numneigh memcpy", cudaMemcpy(c_neigh_numneigh, neighbor->numneigh, sizeof(int) * Nlocal, cudaMemcpyHostToDevice) );
 | 
				
			||||||
 | 
					        checkCUDAError( "c_neighs memcpy", cudaMemcpy(c_neighs, neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs, cudaMemcpyHostToDevice) );
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
 | 
				
			||||||
 | 
					    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    double S = getTimeStamp();
 | 
				
			||||||
 | 
					    LIKWID_MARKER_START("force");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    calc_force <<< num_blocks, num_threads_per_block >>> (c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighs, c_neigh_numneigh);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
 | 
				
			||||||
 | 
					    checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cudaProfilerStop();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    LIKWID_MARKER_STOP("force");
 | 
				
			||||||
 | 
					    double E = getTimeStamp();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return E-S;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@@ -22,8 +22,12 @@
 | 
				
			|||||||
 */
 | 
					 */
 | 
				
			||||||
#include <stdlib.h>
 | 
					#include <stdlib.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <cuda_runtime.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifndef __ALLOCATE_H_
 | 
					#ifndef __ALLOCATE_H_
 | 
				
			||||||
#define __ALLOCATE_H_
 | 
					#define __ALLOCATE_H_
 | 
				
			||||||
extern void* allocate (int alignment, size_t bytesize);
 | 
					extern void* allocate (int alignment, size_t bytesize);
 | 
				
			||||||
extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
 | 
					extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern void checkCUDAError(const char *msg, cudaError_t err);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -45,14 +45,34 @@ extern void growAtom(Atom*);
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#ifdef AOS
 | 
					#ifdef AOS
 | 
				
			||||||
#define POS_DATA_LAYOUT     "AoS"
 | 
					#define POS_DATA_LAYOUT     "AoS"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define atom_x(i)           atom->x[(i) * 3 + 0]
 | 
					#define atom_x(i)           atom->x[(i) * 3 + 0]
 | 
				
			||||||
#define atom_y(i)           atom->x[(i) * 3 + 1]
 | 
					#define atom_y(i)           atom->x[(i) * 3 + 1]
 | 
				
			||||||
#define atom_z(i)           atom->x[(i) * 3 + 2]
 | 
					#define atom_z(i)           atom->x[(i) * 3 + 2]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define atom_fx(i)          atom->fx[(i) * 3 + 0]
 | 
				
			||||||
 | 
					#define atom_fy(i)          atom->fx[(i) * 3 + 1]
 | 
				
			||||||
 | 
					#define atom_fz(i)          atom->fx[(i) * 3 + 2]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define atom_vx(i)          atom->vx[(i) * 3 + 0]
 | 
				
			||||||
 | 
					#define atom_vy(i)          atom->vx[(i) * 3 + 1]
 | 
				
			||||||
 | 
					#define atom_vz(i)          atom->vx[(i) * 3 + 2]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
#define POS_DATA_LAYOUT     "SoA"
 | 
					#define POS_DATA_LAYOUT     "SoA"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define atom_x(i)           atom->x[i]
 | 
					#define atom_x(i)           atom->x[i]
 | 
				
			||||||
#define atom_y(i)           atom->y[i]
 | 
					#define atom_y(i)           atom->y[i]
 | 
				
			||||||
#define atom_z(i)           atom->z[i]
 | 
					#define atom_z(i)           atom->z[i]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define atom_fx(i)           atom->fx[i]
 | 
				
			||||||
 | 
					#define atom_fy(i)           atom->fy[i]
 | 
				
			||||||
 | 
					#define atom_fz(i)           atom->fz[i]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define atom_vx(i)           atom->vx[i]
 | 
				
			||||||
 | 
					#define atom_vy(i)           atom->vy[i]
 | 
				
			||||||
 | 
					#define atom_vz(i)           atom->vz[i]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										52
									
								
								src/main.c
									
									
									
									
									
								
							
							
						
						
									
										52
									
								
								src/main.c
									
									
									
									
									
								
							@@ -23,6 +23,7 @@
 | 
				
			|||||||
#include <stdlib.h>
 | 
					#include <stdlib.h>
 | 
				
			||||||
#include <stdio.h>
 | 
					#include <stdio.h>
 | 
				
			||||||
#include <string.h>
 | 
					#include <string.h>
 | 
				
			||||||
 | 
					#include <stdbool.h>
 | 
				
			||||||
#include <unistd.h>
 | 
					#include <unistd.h>
 | 
				
			||||||
#include <limits.h>
 | 
					#include <limits.h>
 | 
				
			||||||
#include <math.h>
 | 
					#include <math.h>
 | 
				
			||||||
@@ -44,7 +45,12 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#define HLINE "----------------------------------------------------------------------------\n"
 | 
					#define HLINE "----------------------------------------------------------------------------\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern double computeForce(Parameter*, Atom*, Neighbor*);
 | 
					extern void initCudaAtom(Atom *atom, Neighbor *neighbor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom);
 | 
				
			||||||
 | 
					extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern double computeForce(bool, Parameter*, Atom*, Neighbor*);
 | 
				
			||||||
extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
 | 
					extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
 | 
				
			||||||
extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
 | 
					extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -100,6 +106,8 @@ double setup(
 | 
				
			|||||||
    buildNeighbor(atom, neighbor);
 | 
					    buildNeighbor(atom, neighbor);
 | 
				
			||||||
    E = getTimeStamp();
 | 
					    E = getTimeStamp();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    initCudaAtom(atom, neighbor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return E-S;
 | 
					    return E-S;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -125,28 +133,22 @@ double reneighbour(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
void initialIntegrate(Parameter *param, Atom *atom)
 | 
					void initialIntegrate(Parameter *param, Atom *atom)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
 | 
					 | 
				
			||||||
    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for(int i = 0; i < atom->Nlocal; i++) {
 | 
					    for(int i = 0; i < atom->Nlocal; i++) {
 | 
				
			||||||
        vx[i] += param->dtforce * fx[i];
 | 
					        atom_vx(i) += param->dtforce * atom_fx(i);
 | 
				
			||||||
        vy[i] += param->dtforce * fy[i];
 | 
					        atom_vy(i) += param->dtforce * atom_fy(i);
 | 
				
			||||||
        vz[i] += param->dtforce * fz[i];
 | 
					        atom_vz(i) += param->dtforce * atom_fz(i);
 | 
				
			||||||
        atom_x(i) = atom_x(i) + param->dt * vx[i];
 | 
					        atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
 | 
				
			||||||
        atom_y(i) = atom_y(i) + param->dt * vy[i];
 | 
					        atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
 | 
				
			||||||
        atom_z(i) = atom_z(i) + param->dt * vz[i];
 | 
					        atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void finalIntegrate(Parameter *param, Atom *atom)
 | 
					void finalIntegrate(Parameter *param, Atom *atom)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
 | 
					 | 
				
			||||||
    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for(int i = 0; i < atom->Nlocal; i++) {
 | 
					    for(int i = 0; i < atom->Nlocal; i++) {
 | 
				
			||||||
        vx[i] += param->dtforce * fx[i];
 | 
					        atom_vx(i) += param->dtforce * atom_fx(i);
 | 
				
			||||||
        vy[i] += param->dtforce * fy[i];
 | 
					        atom_vy(i) += param->dtforce * atom_fy(i);
 | 
				
			||||||
        vz[i] += param->dtforce * fz[i];
 | 
					        atom_vz(i) += param->dtforce * atom_fz(i);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -262,7 +264,7 @@ int main(int argc, char** argv)
 | 
				
			|||||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
 | 
					#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
 | 
				
			||||||
        computeForceTracing(¶m, &atom, &neighbor, &stats, 1, 0);
 | 
					        computeForceTracing(¶m, &atom, &neighbor, &stats, 1, 0);
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
        computeForce(¶m, &atom, &neighbor);
 | 
					        computeForce(true, ¶m, &atom, &neighbor);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -275,12 +277,15 @@ int main(int argc, char** argv)
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for(int n = 0; n < param.ntimes; n++) {
 | 
					    for(int n = 0; n < param.ntimes; n++) {
 | 
				
			||||||
        initialIntegrate(¶m, &atom);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if((n + 1) % param.every) {
 | 
					        const bool doReneighbour = (n + 1) % param.every == 0;
 | 
				
			||||||
            updatePbc(&atom, ¶m);
 | 
					
 | 
				
			||||||
        } else {
 | 
					        cuda_initial_integrate(doReneighbour, ¶m, &atom);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if(doReneighbour) {
 | 
				
			||||||
            timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
 | 
					            timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
 | 
				
			||||||
 | 
					        } else {
 | 
				
			||||||
 | 
					            updatePbc(&atom, ¶m);
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if(param.force_field == FF_EAM) {
 | 
					        if(param.force_field == FF_EAM) {
 | 
				
			||||||
@@ -289,10 +294,11 @@ int main(int argc, char** argv)
 | 
				
			|||||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
 | 
					#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
 | 
				
			||||||
            timer[FORCE] += computeForceTracing(¶m, &atom, &neighbor, &stats, 0, n + 1);
 | 
					            timer[FORCE] += computeForceTracing(¶m, &atom, &neighbor, &stats, 0, n + 1);
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
            timer[FORCE] += computeForce(¶m, &atom, &neighbor);
 | 
					            timer[FORCE] += computeForce(doReneighbour, ¶m, &atom, &neighbor);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
        finalIntegrate(¶m, &atom);
 | 
					
 | 
				
			||||||
 | 
					        cuda_final_integrate(doReneighbour, ¶m, &atom);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
 | 
					        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
 | 
				
			||||||
            computeThermo(n + 1, ¶m, &atom);
 | 
					            computeThermo(n + 1, ¶m, &atom);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -26,6 +26,7 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include <neighbor.h>
 | 
					#include <neighbor.h>
 | 
				
			||||||
#include <parameter.h>
 | 
					#include <parameter.h>
 | 
				
			||||||
 | 
					#include <allocate.h>
 | 
				
			||||||
#include <atom.h>
 | 
					#include <atom.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define SMALL 1.0e-6
 | 
					#define SMALL 1.0e-6
 | 
				
			||||||
@@ -174,10 +175,12 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor)
 | 
				
			|||||||
    /* extend atom arrays if necessary */
 | 
					    /* extend atom arrays if necessary */
 | 
				
			||||||
    if(nall > nmax) {
 | 
					    if(nall > nmax) {
 | 
				
			||||||
        nmax = nall;
 | 
					        nmax = nall;
 | 
				
			||||||
        if(neighbor->numneigh) free(neighbor->numneigh);
 | 
					        if(neighbor->numneigh) cudaFreeHost(neighbor->numneigh);
 | 
				
			||||||
        if(neighbor->neighbors) free(neighbor->neighbors);
 | 
					        if(neighbor->neighbors) cudaFreeHost(neighbor->neighbors);
 | 
				
			||||||
        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
 | 
					        checkCUDAError( "buildNeighbor numneigh", cudaMallocHost((void**)&(neighbor->numneigh), nmax * sizeof(int)) );
 | 
				
			||||||
        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
 | 
					        checkCUDAError( "buildNeighbor neighbors", cudaMallocHost((void**)&(neighbor->neighbors), nmax * neighbor->maxneighs * sizeof(int)) );
 | 
				
			||||||
 | 
					        // neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
 | 
				
			||||||
 | 
					        // neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    /* bin local & ghost atoms */
 | 
					    /* bin local & ghost atoms */
 | 
				
			||||||
@@ -190,7 +193,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor)
 | 
				
			|||||||
        resize = 0;
 | 
					        resize = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for(int i = 0; i < atom->Nlocal; i++) {
 | 
					        for(int i = 0; i < atom->Nlocal; i++) {
 | 
				
			||||||
            int* neighptr = &(neighbor->neighbors[i * neighbor->maxneighs]);
 | 
					            int* neighptr = &(neighbor->neighbors[i]);
 | 
				
			||||||
            int n = 0;
 | 
					            int n = 0;
 | 
				
			||||||
            MD_FLOAT xtmp = atom_x(i);
 | 
					            MD_FLOAT xtmp = atom_x(i);
 | 
				
			||||||
            MD_FLOAT ytmp = atom_y(i);
 | 
					            MD_FLOAT ytmp = atom_y(i);
 | 
				
			||||||
@@ -221,8 +224,11 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor)
 | 
				
			|||||||
                    #else
 | 
					                    #else
 | 
				
			||||||
                    const MD_FLOAT cutoff = cutneighsq;
 | 
					                    const MD_FLOAT cutoff = cutneighsq;
 | 
				
			||||||
                    #endif
 | 
					                    #endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                    if( rsq <= cutoff ) {
 | 
					                    if( rsq <= cutoff ) {
 | 
				
			||||||
                        neighptr[n++] = j;
 | 
					                        int idx = atom->Nlocal * n;
 | 
				
			||||||
 | 
					                        neighptr[idx] = j;
 | 
				
			||||||
 | 
					                        n += 1;
 | 
				
			||||||
                    }
 | 
					                    }
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
@@ -323,7 +329,10 @@ void binatoms(Atom *atom)
 | 
				
			|||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for(int i = 0; i < nall; i++) {
 | 
					        for(int i = 0; i < nall; i++) {
 | 
				
			||||||
            int ibin = coord2bin(atom_x(i), atom_y(i), atom_z(i));
 | 
					            MD_FLOAT x = atom_x(i);
 | 
				
			||||||
 | 
					            MD_FLOAT y = atom_y(i);
 | 
				
			||||||
 | 
					            MD_FLOAT z = atom_z(i);
 | 
				
			||||||
 | 
					            int ibin = coord2bin(x, y, z);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if(bincount[ibin] < atoms_per_bin) {
 | 
					            if(bincount[ibin] < atoms_per_bin) {
 | 
				
			||||||
                int ac = bincount[ibin]++;
 | 
					                int ac = bincount[ibin]++;
 | 
				
			||||||
@@ -352,14 +361,18 @@ void sortAtom(Atom* atom) {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#ifdef AOS
 | 
					#ifdef AOS
 | 
				
			||||||
    double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
 | 
					    double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
    double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
					    double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
				
			||||||
    double* new_y = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
					    double* new_y = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
				
			||||||
    double* new_z = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
					    double* new_z = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
				
			||||||
#endif
 | 
					
 | 
				
			||||||
    double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
					    double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
				
			||||||
    double* new_vy = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
					    double* new_vy = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
				
			||||||
    double* new_vz = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
					    double* new_vz = (double*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    double* old_x = atom->x; double* old_y = atom->y; double* old_z = atom->z;
 | 
					    double* old_x = atom->x; double* old_y = atom->y; double* old_z = atom->z;
 | 
				
			||||||
    double* old_vx = atom->vx; double* old_vy = atom->vy; double* old_vz = atom->vz;
 | 
					    double* old_vx = atom->vx; double* old_vy = atom->vy; double* old_vz = atom->vz;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -373,24 +386,34 @@ void sortAtom(Atom* atom) {
 | 
				
			|||||||
            new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
 | 
					            new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
 | 
				
			||||||
            new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
 | 
					            new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
 | 
				
			||||||
            new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
 | 
					            new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
 | 
				
			||||||
 | 
					            new_vx[new_i * 3 + 1] = old_vy[old_i * 3 + 1];
 | 
				
			||||||
 | 
					            new_vx[new_i * 3 + 2] = old_vz[old_i * 3 + 2];
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
            new_x[new_i] = old_x[old_i];
 | 
					            new_x[new_i] = old_x[old_i];
 | 
				
			||||||
            new_y[new_i] = old_y[old_i];
 | 
					            new_y[new_i] = old_y[old_i];
 | 
				
			||||||
            new_z[new_i] = old_z[old_i];
 | 
					            new_z[new_i] = old_z[old_i];
 | 
				
			||||||
#endif
 | 
					
 | 
				
			||||||
            new_vx[new_i] = old_vx[old_i];
 | 
					            new_vx[new_i] = old_vx[old_i];
 | 
				
			||||||
            new_vy[new_i] = old_vy[old_i];
 | 
					            new_vy[new_i] = old_vy[old_i];
 | 
				
			||||||
            new_vz[new_i] = old_vz[old_i];
 | 
					            new_vz[new_i] = old_vz[old_i];
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    free(atom->x);
 | 
					    free(atom->x);
 | 
				
			||||||
    atom->x = new_x;
 | 
					    atom->x = new_x;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    free(atom->vx);
 | 
				
			||||||
 | 
					    atom->vx = new_vx;
 | 
				
			||||||
#ifndef AOS
 | 
					#ifndef AOS
 | 
				
			||||||
    free(atom->y);
 | 
					    free(atom->y);
 | 
				
			||||||
    free(atom->z);
 | 
					    free(atom->z);
 | 
				
			||||||
    atom->y = new_y; atom->z = new_z;
 | 
					    atom->y = new_y; atom->z = new_z;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    free(atom->vy); free(atom->vz);
 | 
				
			||||||
 | 
					    atom->vy = new_vy; atom->vz = new_vz;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
    free(atom->vx); free(atom->vy); free(atom->vz);
 | 
					 | 
				
			||||||
    atom->vx = new_vx; atom->vy = new_vy; atom->vz = new_vz;
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										26
									
								
								src/thermo.c
									
									
									
									
									
								
							
							
						
						
									
										26
									
								
								src/thermo.c
									
									
									
									
									
								
							@@ -71,12 +71,9 @@ void setupThermo(Parameter *param, int natoms)
 | 
				
			|||||||
void computeThermo(int iflag, Parameter *param, Atom *atom)
 | 
					void computeThermo(int iflag, Parameter *param, Atom *atom)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    MD_FLOAT t = 0.0, p;
 | 
					    MD_FLOAT t = 0.0, p;
 | 
				
			||||||
    MD_FLOAT* vx = atom->vx;
 | 
					 | 
				
			||||||
    MD_FLOAT* vy = atom->vy;
 | 
					 | 
				
			||||||
    MD_FLOAT* vz = atom->vz;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for(int i = 0; i < atom->Nlocal; i++) {
 | 
					    for(int i = 0; i < atom->Nlocal; i++) {
 | 
				
			||||||
        t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
 | 
					        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    t = t * t_scale;
 | 
					    t = t * t_scale;
 | 
				
			||||||
@@ -101,12 +98,11 @@ void adjustThermo(Parameter *param, Atom *atom)
 | 
				
			|||||||
{
 | 
					{
 | 
				
			||||||
    /* zero center-of-mass motion */
 | 
					    /* zero center-of-mass motion */
 | 
				
			||||||
    MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
 | 
					    MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
 | 
				
			||||||
    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for(int i = 0; i < atom->Nlocal; i++) {
 | 
					    for(int i = 0; i < atom->Nlocal; i++) {
 | 
				
			||||||
        vxtot += vx[i];
 | 
					        vxtot += atom_vx(i);
 | 
				
			||||||
        vytot += vy[i];
 | 
					        vytot += atom_vy(i);
 | 
				
			||||||
        vztot += vz[i];
 | 
					        vztot += atom_vz(i);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    vxtot = vxtot / atom->Natoms;
 | 
					    vxtot = vxtot / atom->Natoms;
 | 
				
			||||||
@@ -114,24 +110,24 @@ void adjustThermo(Parameter *param, Atom *atom)
 | 
				
			|||||||
    vztot = vztot / atom->Natoms;
 | 
					    vztot = vztot / atom->Natoms;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for(int i = 0; i < atom->Nlocal; i++) {
 | 
					    for(int i = 0; i < atom->Nlocal; i++) {
 | 
				
			||||||
        vx[i] -= vxtot;
 | 
					        atom_vx(i) -= vxtot;
 | 
				
			||||||
        vy[i] -= vytot;
 | 
					        atom_vy(i) -= vytot;
 | 
				
			||||||
        vz[i] -= vztot;
 | 
					        atom_vz(i) -= vztot;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    t_act = 0;
 | 
					    t_act = 0;
 | 
				
			||||||
    MD_FLOAT t = 0.0;
 | 
					    MD_FLOAT t = 0.0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for(int i = 0; i < atom->Nlocal; i++) {
 | 
					    for(int i = 0; i < atom->Nlocal; i++) {
 | 
				
			||||||
        t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
 | 
					        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    t *= t_scale;
 | 
					    t *= t_scale;
 | 
				
			||||||
    MD_FLOAT factor = sqrt(param->temp / t);
 | 
					    MD_FLOAT factor = sqrt(param->temp / t);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for(int i = 0; i < atom->Nlocal; i++) {
 | 
					    for(int i = 0; i < atom->Nlocal; i++) {
 | 
				
			||||||
        vx[i] *= factor;
 | 
					        atom_vx(i) *= factor;
 | 
				
			||||||
        vy[i] *= factor;
 | 
					        atom_vy(i) *= factor;
 | 
				
			||||||
        vz[i] *= factor;
 | 
					        atom_vz(i) *= factor;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user