Refactor code
This commit is contained in:
532
src/clusterpair/atom.c
Normal file
532
src/clusterpair/atom.c
Normal file
@@ -0,0 +1,532 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <util.h>
|
||||
|
||||
void initAtom(Atom *atom) {
|
||||
atom->x = NULL; atom->y = NULL; atom->z = NULL;
|
||||
atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
|
||||
atom->cl_x = NULL;
|
||||
atom->cl_v = NULL;
|
||||
atom->cl_f = NULL;
|
||||
atom->cl_type = NULL;
|
||||
atom->Natoms = 0;
|
||||
atom->Nlocal = 0;
|
||||
atom->Nghost = 0;
|
||||
atom->Nmax = 0;
|
||||
atom->Nclusters = 0;
|
||||
atom->Nclusters_local = 0;
|
||||
atom->Nclusters_ghost = 0;
|
||||
atom->Nclusters_max = 0;
|
||||
atom->type = NULL;
|
||||
atom->ntypes = 0;
|
||||
atom->epsilon = NULL;
|
||||
atom->sigma6 = NULL;
|
||||
atom->cutforcesq = NULL;
|
||||
atom->cutneighsq = NULL;
|
||||
atom->iclusters = NULL;
|
||||
atom->jclusters = NULL;
|
||||
atom->icluster_bin = NULL;
|
||||
initMasks(atom);
|
||||
}
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param) {
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
|
||||
atom->Natoms = 4 * param->nx * param->ny * param->nz;
|
||||
atom->Nlocal = 0;
|
||||
atom->ntypes = param->ntypes;
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
|
||||
int ilo = (int) (xlo / (0.5 * alat) - 1);
|
||||
int ihi = (int) (xhi / (0.5 * alat) + 1);
|
||||
int jlo = (int) (ylo / (0.5 * alat) - 1);
|
||||
int jhi = (int) (yhi / (0.5 * alat) + 1);
|
||||
int klo = (int) (zlo / (0.5 * alat) - 1);
|
||||
int khi = (int) (zhi / (0.5 * alat) + 1);
|
||||
|
||||
ilo = MAX(ilo, 0);
|
||||
ihi = MIN(ihi, 2 * param->nx - 1);
|
||||
jlo = MAX(jlo, 0);
|
||||
jhi = MIN(jhi, 2 * param->ny - 1);
|
||||
klo = MAX(klo, 0);
|
||||
khi = MIN(khi, 2 * param->nz - 1);
|
||||
|
||||
MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
|
||||
int i, j, k, m, n;
|
||||
int sx = 0; int sy = 0; int sz = 0;
|
||||
int ox = 0; int oy = 0; int oz = 0;
|
||||
int subboxdim = 8;
|
||||
|
||||
while(oz * subboxdim <= khi) {
|
||||
k = oz * subboxdim + sz;
|
||||
j = oy * subboxdim + sy;
|
||||
i = ox * subboxdim + sx;
|
||||
|
||||
if(((i + j + k) % 2 == 0) && (i >= ilo) && (i <= ihi) && (j >= jlo) && (j <= jhi) && (k >= klo) && (k <= khi)) {
|
||||
xtmp = 0.5 * alat * i;
|
||||
ytmp = 0.5 * alat * j;
|
||||
ztmp = 0.5 * alat * k;
|
||||
|
||||
if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) {
|
||||
n = k * (2 * param->ny) * (2 * param->nx) + j * (2 * param->nx) + i + 1;
|
||||
for(m = 0; m < 5; m++) { myrandom(&n); }
|
||||
vxtmp = myrandom(&n);
|
||||
for(m = 0; m < 5; m++){ myrandom(&n); }
|
||||
vytmp = myrandom(&n);
|
||||
for(m = 0; m < 5; m++) { myrandom(&n); }
|
||||
vztmp = myrandom(&n);
|
||||
|
||||
if(atom->Nlocal == atom->Nmax) { growAtom(atom); }
|
||||
atom_x(atom->Nlocal) = xtmp;
|
||||
atom_y(atom->Nlocal) = ytmp;
|
||||
atom_z(atom->Nlocal) = ztmp;
|
||||
atom->vx[atom->Nlocal] = vxtmp;
|
||||
atom->vy[atom->Nlocal] = vytmp;
|
||||
atom->vz[atom->Nlocal] = vztmp;
|
||||
atom->type[atom->Nlocal] = rand() % atom->ntypes;
|
||||
atom->Nlocal++;
|
||||
}
|
||||
}
|
||||
|
||||
sx++;
|
||||
if(sx == subboxdim) { sx = 0; sy++; }
|
||||
if(sy == subboxdim) { sy = 0; sz++; }
|
||||
if(sz == subboxdim) { sz = 0; ox++; }
|
||||
if(ox * subboxdim > ihi) { ox = 0; oy++; }
|
||||
if(oy * subboxdim > jhi) { oy = 0; oz++; }
|
||||
}
|
||||
}
|
||||
|
||||
int type_str2int(const char *type) {
|
||||
if(strncmp(type, "Ar", 2) == 0) { return 0; } // Argon
|
||||
fprintf(stderr, "Invalid atom type: %s\n", type);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int readAtom(Atom* atom, Parameter* param) {
|
||||
int len = strlen(param->input_file);
|
||||
if(strncmp(¶m->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
|
||||
if(strncmp(¶m->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
|
||||
if(strncmp(¶m->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
|
||||
fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int read_atoms = 0;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
while(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
char *item = strtok(line, " ");
|
||||
if(strncmp(item, "CRYST1", 6) == 0) {
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(NULL, " "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
// alpha, beta, gamma, sGroup, z
|
||||
} else if(strncmp(item, "ATOM", 4) == 0) {
|
||||
char *label;
|
||||
int atom_id, comp_id;
|
||||
MD_FLOAT occupancy, charge;
|
||||
atom_id = atoi(strtok(NULL, " ")) - 1;
|
||||
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type_str2int(strtok(NULL, " "));
|
||||
label = strtok(NULL, " ");
|
||||
comp_id = atoi(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom->vx[atom_id] = 0.0;
|
||||
atom->vy[atom_id] = 0.0;
|
||||
atom->vz[atom_id] = 0.0;
|
||||
occupancy = atof(strtok(NULL, " "));
|
||||
charge = atof(strtok(NULL, " "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
read_atoms++;
|
||||
} else if(strncmp(item, "HEADER", 6) == 0 ||
|
||||
strncmp(item, "REMARK", 6) == 0 ||
|
||||
strncmp(item, "MODEL", 5) == 0 ||
|
||||
strncmp(item, "TER", 3) == 0 ||
|
||||
strncmp(item, "ENDMDL", 6) == 0) {
|
||||
// Do nothing
|
||||
} else {
|
||||
fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(!read_atoms) {
|
||||
fprintf(stderr, "Input error: No atoms read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
char desc[MAXLINE];
|
||||
int read_atoms = 0;
|
||||
int atoms_to_read = 0;
|
||||
int i = 0;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
readline(desc, fp);
|
||||
for(i = 0; desc[i] != '\n'; i++);
|
||||
desc[i] = '\0';
|
||||
readline(line, fp);
|
||||
atoms_to_read = atoi(strtok(line, " "));
|
||||
fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
|
||||
|
||||
while(!feof(fp) && read_atoms < atoms_to_read) {
|
||||
readline(line, fp);
|
||||
char *label = strtok(line, " ");
|
||||
int type = type_str2int(strtok(NULL, " "));
|
||||
int atom_id = atoi(strtok(NULL, " ")) - 1;
|
||||
atom_id = read_atoms;
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type;
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom->vx[atom_id] = atof(strtok(NULL, " "));
|
||||
atom->vy[atom_id] = atof(strtok(NULL, " "));
|
||||
atom->vz[atom_id] = atof(strtok(NULL, " "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
read_atoms++;
|
||||
}
|
||||
|
||||
if(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(line, " "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
}
|
||||
|
||||
if(read_atoms != atoms_to_read) {
|
||||
fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int natoms = 0;
|
||||
int read_atoms = 0;
|
||||
int atom_id = -1;
|
||||
int ts = -1;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
while(!feof(fp) && ts < 1 && !read_atoms) {
|
||||
readline(line, fp);
|
||||
if(strncmp(line, "ITEM: ", 6) == 0) {
|
||||
char *item = &line[6];
|
||||
|
||||
if(strncmp(item, "TIMESTEP", 8) == 0) {
|
||||
readline(line, fp);
|
||||
ts = atoi(line);
|
||||
} else if(strncmp(item, "NUMBER OF ATOMS", 15) == 0) {
|
||||
readline(line, fp);
|
||||
natoms = atoi(line);
|
||||
atom->Natoms = natoms;
|
||||
atom->Nlocal = natoms;
|
||||
while(atom->Nlocal >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
} else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
|
||||
readline(line, fp);
|
||||
param->xlo = atof(strtok(line, " "));
|
||||
param->xhi = atof(strtok(NULL, " "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
|
||||
readline(line, fp);
|
||||
param->ylo = atof(strtok(line, " "));
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
|
||||
readline(line, fp);
|
||||
param->zlo = atof(strtok(line, " "));
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
} else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
|
||||
for(int i = 0; i < natoms; i++) {
|
||||
readline(line, fp);
|
||||
atom_id = atoi(strtok(line, " ")) - 1;
|
||||
atom->type[atom_id] = atoi(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom->vx[atom_id] = atof(strtok(NULL, " "));
|
||||
atom->vy[atom_id] = atof(strtok(NULL, " "));
|
||||
atom->vz[atom_id] = atof(strtok(NULL, " "));
|
||||
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
|
||||
read_atoms++;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(ts < 0 || !natoms || !read_atoms) {
|
||||
fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
fclose(fp);
|
||||
return natoms;
|
||||
}
|
||||
|
||||
void initMasks(Atom *atom) {
|
||||
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
||||
unsigned int mask0, mask1, mask2, mask3;
|
||||
|
||||
atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
|
||||
atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
|
||||
atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));
|
||||
//atom->masks_2xnn = allocate(ALIGNMENT, 8 * sizeof(unsigned int));
|
||||
|
||||
for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
|
||||
atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
|
||||
}
|
||||
|
||||
for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
|
||||
atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
|
||||
atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
|
||||
}
|
||||
|
||||
for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
|
||||
atom->exclusion_filter[i] = (1U << i);
|
||||
}
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
|
||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0xf - 0x3 * cond0);
|
||||
mask2 = (unsigned int)(0xf - 0x7 * cond0);
|
||||
mask3 = (unsigned int)(0xf - 0xf * cond0);
|
||||
atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
|
||||
atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
|
||||
|
||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0xf - 0x2 * cond0);
|
||||
mask2 = (unsigned int)(0xf - 0x4 * cond0);
|
||||
mask3 = (unsigned int)(0xf - 0x8 * cond0);
|
||||
atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
|
||||
atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
|
||||
|
||||
atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
|
||||
atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
|
||||
atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
|
||||
atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);
|
||||
|
||||
atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
|
||||
atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
|
||||
atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
|
||||
atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
|
||||
}
|
||||
#else
|
||||
for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
|
||||
for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
|
||||
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
|
||||
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
|
||||
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
|
||||
#else
|
||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
|
||||
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
|
||||
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
|
||||
#endif
|
||||
|
||||
atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
|
||||
atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
|
||||
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
|
||||
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
|
||||
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
|
||||
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
|
||||
#else
|
||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
|
||||
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
|
||||
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
|
||||
#endif
|
||||
|
||||
atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
|
||||
atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
|
||||
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
|
||||
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
|
||||
#else
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
|
||||
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);
|
||||
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x2 * cond0);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond1);
|
||||
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x2 * cond1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void growAtom(Atom *atom) {
|
||||
int nold = atom->Nmax;
|
||||
atom->Nmax += DELTA;
|
||||
|
||||
#ifdef AOS
|
||||
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
#else
|
||||
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->z = (MD_FLOAT*) reallocate(atom->z, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
#endif
|
||||
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
|
||||
}
|
||||
|
||||
void growClusters(Atom *atom) {
|
||||
int nold = atom->Nclusters_max;
|
||||
int jterm = MAX(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters
|
||||
atom->Nclusters_max += DELTA;
|
||||
atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
|
||||
atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jterm * sizeof(Cluster), nold * jterm * sizeof(Cluster));
|
||||
atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int));
|
||||
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
|
||||
}
|
171
src/clusterpair/atom.h
Normal file
171
src/clusterpair/atom.h
Normal file
@@ -0,0 +1,171 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __ATOM_H_
|
||||
#define __ATOM_H_
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
// Nbnxn layouts (as of GROMACS):
|
||||
// Simd4xN: M=4, N=VECTOR_WIDTH
|
||||
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
|
||||
// Cuda: M=8, N=VECTOR_WIDTH
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
# undef VECTOR_WIDTH
|
||||
# define VECTOR_WIDTH 8
|
||||
# define KERNEL_NAME "CUDA"
|
||||
# define CLUSTER_M 8
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define UNROLL_J 1
|
||||
# define computeForceLJ computeForceLJ_cuda
|
||||
# define initialIntegrate cudaInitialIntegrate
|
||||
# define finalIntegrate cudaFinalIntegrate
|
||||
# define updatePbc cudaUpdatePbc
|
||||
#else
|
||||
# define CLUSTER_M 4
|
||||
// Simd2xNN (here used for single-precision)
|
||||
# if VECTOR_WIDTH > CLUSTER_M * 2
|
||||
# define KERNEL_NAME "Simd2xNN"
|
||||
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
||||
# define UNROLL_I 4
|
||||
# define UNROLL_J 2
|
||||
# define computeForceLJ computeForceLJ_2xnn
|
||||
// Simd4xN
|
||||
# else
|
||||
# define KERNEL_NAME "Simd4xN"
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define UNROLL_I 4
|
||||
# define UNROLL_J 1
|
||||
# define computeForceLJ computeForceLJ_4xn
|
||||
# endif
|
||||
# ifdef USE_REFERENCE_VERSION
|
||||
# undef KERNEL_NAME
|
||||
# undef computeForceLJ
|
||||
# define KERNEL_NAME "Reference"
|
||||
# define computeForceLJ computeForceLJ_ref
|
||||
# endif
|
||||
# define initialIntegrate cpuInitialIntegrate
|
||||
# define finalIntegrate cpuFinalIntegrate
|
||||
# define updatePbc cpuUpdatePbc
|
||||
#endif
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
# define CJ0_FROM_CI(a) (a)
|
||||
# define CJ1_FROM_CI(a) (a)
|
||||
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
|
||||
# define CJ0_FROM_CI(a) ((a) << 1)
|
||||
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
|
||||
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
|
||||
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
|
||||
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
|
||||
# define CJ0_FROM_CI(a) ((a) >> 1)
|
||||
# define CJ1_FROM_CI(a) ((a) >> 1)
|
||||
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
|
||||
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||
#else
|
||||
# error "Invalid cluster configuration!"
|
||||
#endif
|
||||
|
||||
#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8
|
||||
# error "Cluster N dimension can be only 2, 4 and 8"
|
||||
#endif
|
||||
|
||||
#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1))
|
||||
#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3))
|
||||
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
|
||||
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
|
||||
|
||||
#if CLUSTER_M >= CLUSTER_N
|
||||
# define CL_X_OFFSET (0 * CLUSTER_M)
|
||||
# define CL_Y_OFFSET (1 * CLUSTER_M)
|
||||
# define CL_Z_OFFSET (2 * CLUSTER_M)
|
||||
#else
|
||||
# define CL_X_OFFSET (0 * CLUSTER_N)
|
||||
# define CL_Y_OFFSET (1 * CLUSTER_N)
|
||||
# define CL_Z_OFFSET (2 * CLUSTER_N)
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
int natoms;
|
||||
MD_FLOAT bbminx, bbmaxx;
|
||||
MD_FLOAT bbminy, bbmaxy;
|
||||
MD_FLOAT bbminz, bbmaxz;
|
||||
} Cluster;
|
||||
|
||||
typedef struct {
|
||||
int Natoms, Nlocal, Nghost, Nmax;
|
||||
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
|
||||
MD_FLOAT *x, *y, *z;
|
||||
MD_FLOAT *vx, *vy, *vz;
|
||||
int *border_map;
|
||||
int *type;
|
||||
int ntypes;
|
||||
MD_FLOAT *epsilon;
|
||||
MD_FLOAT *sigma6;
|
||||
MD_FLOAT *cutforcesq;
|
||||
MD_FLOAT *cutneighsq;
|
||||
int *PBCx, *PBCy, *PBCz;
|
||||
// Data in cluster format
|
||||
MD_FLOAT *cl_x;
|
||||
MD_FLOAT *cl_v;
|
||||
MD_FLOAT *cl_f;
|
||||
int *cl_type;
|
||||
Cluster *iclusters, *jclusters;
|
||||
int *icluster_bin;
|
||||
int dummy_cj;
|
||||
MD_UINT *exclusion_filter;
|
||||
MD_FLOAT *diagonal_4xn_j_minus_i;
|
||||
MD_FLOAT *diagonal_2xnn_j_minus_i;
|
||||
unsigned int masks_2xnn_hn[8];
|
||||
unsigned int masks_2xnn_fn[8];
|
||||
unsigned int masks_4xn_hn[16];
|
||||
unsigned int masks_4xn_fn[16];
|
||||
} Atom;
|
||||
|
||||
extern void initAtom(Atom*);
|
||||
extern void initMasks(Atom*);
|
||||
extern void createAtom(Atom*, Parameter*);
|
||||
extern int readAtom(Atom*, Parameter*);
|
||||
extern int readAtom_pdb(Atom*, Parameter*);
|
||||
extern int readAtom_gro(Atom*, Parameter*);
|
||||
extern int readAtom_dmp(Atom*, Parameter*);
|
||||
extern void growAtom(Atom*);
|
||||
extern void growClusters(Atom*);
|
||||
|
||||
#ifdef AOS
|
||||
# define POS_DATA_LAYOUT "AoS"
|
||||
# define atom_x(i) atom->x[(i) * 3 + 0]
|
||||
# define atom_y(i) atom->x[(i) * 3 + 1]
|
||||
# define atom_z(i) atom->x[(i) * 3 + 2]
|
||||
/*
|
||||
# define atom_vx(i) atom->vx[(i) * 3 + 0]
|
||||
# define atom_vy(i) atom->vx[(i) * 3 + 1]
|
||||
# define atom_vz(i) atom->vx[(i) * 3 + 2]
|
||||
# define atom_fx(i) atom->fx[(i) * 3 + 0]
|
||||
# define atom_fy(i) atom->fx[(i) * 3 + 1]
|
||||
# define atom_fz(i) atom->fx[(i) * 3 + 2]
|
||||
*/
|
||||
#else
|
||||
# define POS_DATA_LAYOUT "SoA"
|
||||
# define atom_x(i) atom->x[i]
|
||||
# define atom_y(i) atom->y[i]
|
||||
# define atom_z(i) atom->z[i]
|
||||
#endif
|
||||
|
||||
// TODO: allow to switch velocites and forces to AoS
|
||||
# define atom_vx(i) atom->vx[i]
|
||||
# define atom_vy(i) atom->vy[i]
|
||||
# define atom_vz(i) atom->vz[i]
|
||||
# define atom_fx(i) atom->fx[i]
|
||||
# define atom_fy(i) atom->fy[i]
|
||||
# define atom_fz(i) atom->fz[i]
|
||||
|
||||
#endif
|
317
src/clusterpair/cuda/force_lj.cu
Normal file
317
src/clusterpair/cuda/force_lj.cu
Normal file
@@ -0,0 +1,317 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
extern "C" {
|
||||
|
||||
#include <stdio.h>
|
||||
//---
|
||||
#include <cuda.h>
|
||||
#include <driver_types.h>
|
||||
//---
|
||||
#include <likwid-marker.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
MD_FLOAT *cuda_cl_x;
|
||||
MD_FLOAT *cuda_cl_v;
|
||||
MD_FLOAT *cuda_cl_f;
|
||||
int *cuda_neighbors;
|
||||
int *cuda_numneigh;
|
||||
int *cuda_natoms;
|
||||
int *natoms;
|
||||
int *ngatoms;
|
||||
int *cuda_border_map;
|
||||
int *cuda_jclusters_natoms;
|
||||
MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
|
||||
MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
|
||||
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
|
||||
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
|
||||
int isReneighboured;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
cuda_assert("cudaDeviceSetup", cudaDeviceReset());
|
||||
cuda_assert("cudaDeviceSetup", cudaSetDevice(0));
|
||||
cuda_cl_x = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_cl_v = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_cl_f = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_natoms = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_jclusters_natoms = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_border_map = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_PBCx = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_PBCy = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_PBCz = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_numneigh = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_neighbors = (int *) allocateGPU(atom->Nclusters_max * neighbor->maxneighs * sizeof(int));
|
||||
natoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
|
||||
ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
|
||||
isReneighboured = 1;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void copyDataToCUDADevice(Atom *atom) {
|
||||
memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
natoms[ci] = atom->iclusters[ci].natoms;
|
||||
}
|
||||
|
||||
memcpyToGPU(cuda_natoms, natoms, atom->Nclusters_local * sizeof(int));
|
||||
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = atom->Nclusters_local / jfac;
|
||||
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
|
||||
const int cj = ncj + cg;
|
||||
ngatoms[cg] = atom->jclusters[cj].natoms;
|
||||
}
|
||||
|
||||
memcpyToGPU(cuda_jclusters_natoms, ngatoms, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_border_map, atom->border_map, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void copyDataFromCUDADevice(Atom *atom) {
|
||||
memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void cudaDeviceFree() {
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_x));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_v));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_f));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_numneigh));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_neighbors));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_natoms));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_border_map));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_jclusters_natoms));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCx));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCy));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
|
||||
free(natoms);
|
||||
free(ngatoms);
|
||||
}
|
||||
|
||||
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
||||
int *cuda_natoms,
|
||||
int Nclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
|
||||
|
||||
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (ci_pos >= Nclusters_local) return;
|
||||
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
|
||||
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
||||
|
||||
for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
ci_x[CL_X_OFFSET + cii] += dt * ci_v[CL_X_OFFSET + cii];
|
||||
ci_x[CL_Y_OFFSET + cii] += dt * ci_v[CL_Y_OFFSET + cii];
|
||||
ci_x[CL_Z_OFFSET + cii] += dt * ci_v[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
|
||||
int *cuda_jclusters_natoms,
|
||||
int *cuda_PBCx,
|
||||
int *cuda_PBCy,
|
||||
int *cuda_PBCz,
|
||||
int Nclusters_local,
|
||||
int Nclusters_ghost,
|
||||
MD_FLOAT param_xprd,
|
||||
MD_FLOAT param_yprd,
|
||||
MD_FLOAT param_zprd) {
|
||||
unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (cg >= Nclusters_ghost) return;
|
||||
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = Nclusters_local / jfac;
|
||||
MD_FLOAT xprd = param_xprd;
|
||||
MD_FLOAT yprd = param_yprd;
|
||||
MD_FLOAT zprd = param_zprd;
|
||||
|
||||
const int cj = ncj + cg;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[cg]);
|
||||
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
|
||||
MD_FLOAT *bmap_x = &cuda_cl_x[bmap_vec_base];
|
||||
|
||||
for(int cjj = 0; cjj < cuda_jclusters_natoms[cg]; cjj++) {
|
||||
cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * xprd;
|
||||
cj_x[CL_Y_OFFSET + cjj] = bmap_x[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * yprd;
|
||||
cj_x[CL_Z_OFFSET + cjj] = bmap_x[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * zprd;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
|
||||
int Nclusters_local, int Nclusters_max,
|
||||
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
|
||||
MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
|
||||
|
||||
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
|
||||
if ((ci_pos >= Nclusters_local) || (cii_pos >= CLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
|
||||
|
||||
int ci_cj0 = CJ0_FROM_CI(ci_pos);
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
|
||||
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
||||
int numneighs = cuda_numneigh[ci_pos];
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = (&cuda_neighs[ci_pos * maxneighs])[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
|
||||
|
||||
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii_pos];
|
||||
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii_pos];
|
||||
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii_pos];
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
int cond;
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
|
||||
(ci_cj0 != cj || cii_pos != cjj_pos);
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
|
||||
(ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
|
||||
#endif
|
||||
if(cond) {
|
||||
MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj_pos];
|
||||
MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj_pos];
|
||||
MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj_pos];
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
|
||||
if(half_neigh) {
|
||||
atomicAdd(&cj_f[CL_X_OFFSET + cjj_pos], -delx * force);
|
||||
atomicAdd(&cj_f[CL_Y_OFFSET + cjj_pos], -dely * force);
|
||||
atomicAdd(&cj_f[CL_Z_OFFSET + cjj_pos], -delz * force);
|
||||
}
|
||||
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
|
||||
atomicAdd(&ci_f[CL_X_OFFSET + cii_pos], fix);
|
||||
atomicAdd(&ci_f[CL_Y_OFFSET + cii_pos], fiy);
|
||||
atomicAdd(&ci_f[CL_Z_OFFSET + cii_pos], fiz);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void cudaFinalIntegrate_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
||||
int *cuda_natoms,
|
||||
int Nclusters_local, MD_FLOAT dtforce) {
|
||||
|
||||
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (ci_pos >= Nclusters_local) return;
|
||||
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
|
||||
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
||||
|
||||
for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
|
||||
const int threads_num = 16;
|
||||
dim3 block_size = dim3(threads_num, 1, 1);
|
||||
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
|
||||
cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
|
||||
cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
|
||||
cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
|
||||
cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
extern "C"
|
||||
void cudaUpdatePbc(Atom *atom, Parameter *param) {
|
||||
const int threads_num = 512;
|
||||
dim3 block_size = dim3(threads_num, 1, 1);;
|
||||
dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);;
|
||||
cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
|
||||
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
|
||||
atom->Nclusters_local, atom->Nclusters_ghost,
|
||||
param->xprd, param->yprd, param->zprd);
|
||||
cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
|
||||
cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
|
||||
memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
if (isReneighboured) {
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
|
||||
memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
|
||||
}
|
||||
|
||||
isReneighboured = 0;
|
||||
}
|
||||
|
||||
const int threads_num = 1;
|
||||
dim3 block_size = dim3(threads_num, CLUSTER_M, CLUSTER_N);
|
||||
dim3 grid_size = dim3(atom->Nclusters_local/threads_num+1, 1, 1);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
computeForceLJ_cuda_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_f,
|
||||
atom->Nclusters_local, atom->Nclusters_max,
|
||||
cuda_numneigh, cuda_neighbors,
|
||||
neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
|
||||
sigma6, epsilon);
|
||||
cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
|
||||
cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
|
||||
LIKWID_MARKER_STOP("force");
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
const int threads_num = 16;
|
||||
dim3 block_size = dim3(threads_num, 1, 1);
|
||||
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
|
||||
cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dt);
|
||||
cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
|
||||
cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
|
||||
}
|
201
src/clusterpair/force_eam.c
Normal file
201
src/clusterpair/force_eam.c
Normal file
@@ -0,0 +1,201 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <likwid-marker.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <allocate.h>
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <stats.h>
|
||||
#include <eam.h>
|
||||
#include <util.h>
|
||||
|
||||
double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
/*
|
||||
if(eam->nmax < atom->Nmax) {
|
||||
eam->nmax = atom->Nmax;
|
||||
if(eam->fp != NULL) { free(eam->fp); }
|
||||
eam->fp = (MD_FLOAT *) allocate(ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT));
|
||||
}
|
||||
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
|
||||
MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
|
||||
MD_FLOAT rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; MD_FLOAT rdrho = eam->rdrho;
|
||||
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
|
||||
*/
|
||||
double S = getTimeStamp();
|
||||
|
||||
LIKWID_MARKER_START("force_eam_fp");
|
||||
/*
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT rhoi = 0;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
#pragma ivdep
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
#else
|
||||
const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
#endif
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT p = sqrt(rsq) * rdr + 1.0;
|
||||
int m = (int)(p);
|
||||
m = m < nr - 1 ? m : nr - 1;
|
||||
p -= m;
|
||||
p = p < 1.0 ? p : 1.0;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
rhoi += ((rhor_spline[type_ij * nr_tot + m * 7 + 3] * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 4]) * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 5]) * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 6];
|
||||
#else
|
||||
rhoi += ((rhor_spline[m * 7 + 3] * p +
|
||||
rhor_spline[m * 7 + 4]) * p +
|
||||
rhor_spline[m * 7 + 5]) * p +
|
||||
rhor_spline[m * 7 + 6];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_ii = type_i * type_i;
|
||||
#endif
|
||||
MD_FLOAT p = 1.0 * rhoi * rdrho + 1.0;
|
||||
int m = (int)(p);
|
||||
m = MAX(1, MIN(m, nrho - 1));
|
||||
p -= m;
|
||||
p = MIN(p, 1.0);
|
||||
#ifdef EXPLICIT_TYPES
|
||||
fp[i] = (frho_spline[type_ii * nrho_tot + m * 7 + 0] * p +
|
||||
frho_spline[type_ii * nrho_tot + m * 7 + 1]) * p +
|
||||
frho_spline[type_ii * nrho_tot + m * 7 + 2];
|
||||
#else
|
||||
fp[i] = (frho_spline[m * 7 + 0] * p + frho_spline[m * 7 + 1]) * p + frho_spline[m * 7 + 2];
|
||||
#endif
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam_fp");
|
||||
|
||||
// We still need to update fp for PBC atoms
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
fp[Nlocal + i] = fp[atom->border_map[i]];
|
||||
}
|
||||
|
||||
LIKWID_MARKER_START("force_eam");
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
#pragma ivdep
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
#else
|
||||
const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT r = sqrt(rsq);
|
||||
MD_FLOAT p = r * rdr + 1.0;
|
||||
int m = (int)(p);
|
||||
m = m < nr - 1 ? m : nr - 1;
|
||||
p -= m;
|
||||
p = p < 1.0 ? p : 1.0;
|
||||
|
||||
|
||||
// rhoip = derivative of (density at atom j due to atom i)
|
||||
// rhojp = derivative of (density at atom i due to atom j)
|
||||
// phi = pair potential energy
|
||||
// phip = phi'
|
||||
// z2 = phi * r
|
||||
// z2p = (phi * r)' = (phi' r) + phi
|
||||
// psip needs both fp[i] and fp[j] terms since r_ij appears in two
|
||||
// terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
|
||||
// hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
MD_FLOAT rhoip = (rhor_spline[type_ij * nr_tot + m * 7 + 0] * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 1]) * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 2];
|
||||
|
||||
MD_FLOAT z2p = (z2r_spline[type_ij * nr_tot + m * 7 + 0] * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 1]) * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 2];
|
||||
|
||||
MD_FLOAT z2 = ((z2r_spline[type_ij * nr_tot + m * 7 + 3] * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 4]) * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 5]) * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 6];
|
||||
#else
|
||||
MD_FLOAT rhoip = (rhor_spline[m * 7 + 0] * p + rhor_spline[m * 7 + 1]) * p + rhor_spline[m * 7 + 2];
|
||||
MD_FLOAT z2p = (z2r_spline[m * 7 + 0] * p + z2r_spline[m * 7 + 1]) * p + z2r_spline[m * 7 + 2];
|
||||
MD_FLOAT z2 = ((z2r_spline[m * 7 + 3] * p +
|
||||
z2r_spline[m * 7 + 4]) * p +
|
||||
z2r_spline[m * 7 + 5]) * p +
|
||||
z2r_spline[m * 7 + 6];
|
||||
#endif
|
||||
|
||||
MD_FLOAT recip = 1.0 / r;
|
||||
MD_FLOAT phi = z2 * recip;
|
||||
MD_FLOAT phip = z2p * recip - phi * recip;
|
||||
MD_FLOAT psip = fp[i] * rhoip + fp[j] * rhoip + phip;
|
||||
MD_FLOAT fpair = -psip * recip;
|
||||
|
||||
fix += delx * fpair;
|
||||
fiy += dely * fpair;
|
||||
fiz += delz * fpair;
|
||||
//fpair *= 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
fx[i] = fix;
|
||||
fy[i] = fiy;
|
||||
fz[i] = fiz;
|
||||
addStat(stats->total_force_neighs, numneighs);
|
||||
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
}
|
||||
|
||||
*/
|
||||
LIKWID_MARKER_STOP("force_eam");
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
1072
src/clusterpair/force_lj.c
Normal file
1072
src/clusterpair/force_lj.c
Normal file
File diff suppressed because it is too large
Load Diff
56
src/clusterpair/integrate.h
Normal file
56
src/clusterpair/integrate.h
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdbool.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
|
||||
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
|
||||
DEBUG_MESSAGE("cpuInitialIntegrate start\n");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
|
||||
ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
|
||||
ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("cpuInitialIntegrate end\n");
|
||||
}
|
||||
|
||||
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate start\n");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate end\n");
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
void cudaInitialIntegrate(Parameter*, Atom*);
|
||||
void cudaFinalIntegrate(Parameter*, Atom*);
|
||||
#endif
|
344
src/clusterpair/main-stub.c
Normal file
344
src/clusterpair/main-stub.c
Normal file
@@ -0,0 +1,344 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
//---
|
||||
#include <likwid-marker.h>
|
||||
//---
|
||||
#include <timing.h>
|
||||
#include <allocate.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <stats.h>
|
||||
#include <thermo.h>
|
||||
#include <eam.h>
|
||||
#include <pbc.h>
|
||||
#include <timers.h>
|
||||
#include <util.h>
|
||||
|
||||
#define HLINE "----------------------------------------------------------------------------\n"
|
||||
|
||||
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||
|
||||
// Patterns
|
||||
#define P_SEQ 0
|
||||
#define P_FIX 1
|
||||
#define P_RAND 2
|
||||
|
||||
void init(Parameter *param) {
|
||||
param->input_file = NULL;
|
||||
param->force_field = FF_LJ;
|
||||
param->epsilon = 1.0;
|
||||
param->sigma6 = 1.0;
|
||||
param->rho = 0.8442;
|
||||
param->ntypes = 4;
|
||||
param->ntimes = 200;
|
||||
param->nx = 1;
|
||||
param->ny = 1;
|
||||
param->nz = 1;
|
||||
param->lattice = 1.0;
|
||||
param->cutforce = 1000000.0;
|
||||
param->cutneigh = param->cutforce;
|
||||
param->mass = 1.0;
|
||||
param->half_neigh = 0;
|
||||
// Unused
|
||||
param->dt = 0.005;
|
||||
param->dtforce = 0.5 * param->dt;
|
||||
param->nstat = 100;
|
||||
param->temp = 1.44;
|
||||
param->reneigh_every = 20;
|
||||
param->proc_freq = 2.4;
|
||||
param->eam_file = NULL;
|
||||
}
|
||||
|
||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
|
||||
const int maxneighs = nneighs * nreps;
|
||||
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
const int ncj = atom->Nclusters_local / jfac;
|
||||
const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
|
||||
neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
|
||||
neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
|
||||
neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
|
||||
|
||||
if(pattern == P_RAND && ncj <= nneighs) {
|
||||
fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
|
||||
int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
|
||||
int m = (pattern == P_SEQ) ? ncj : nneighs;
|
||||
int k = 0;
|
||||
|
||||
for(int k = 0; k < nneighs; k++) {
|
||||
if(pattern == P_RAND) {
|
||||
int found = 0;
|
||||
do {
|
||||
int cj = rand() % ncj;
|
||||
neighptr[k] = cj;
|
||||
neighptr_imask[k] = imask;
|
||||
found = 0;
|
||||
for(int l = 0; l < k; l++) {
|
||||
if(neighptr[l] == cj) {
|
||||
found = 1;
|
||||
}
|
||||
}
|
||||
} while(found == 1);
|
||||
} else {
|
||||
neighptr[k] = j;
|
||||
neighptr_imask[k] = imask;
|
||||
j = (j + 1) % m;
|
||||
}
|
||||
}
|
||||
|
||||
for(int r = 1; r < nreps; r++) {
|
||||
for(int k = 0; k < nneighs; k++) {
|
||||
neighptr[r * nneighs + k] = neighptr[k];
|
||||
neighptr_imask[r * nneighs + k] = neighptr_imask[k];
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = nneighs * nreps;
|
||||
neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, const char *argv[]) {
|
||||
Eam eam;
|
||||
Atom atom_data;
|
||||
Atom *atom = (Atom *)(&atom_data);
|
||||
Neighbor neighbor;
|
||||
Stats stats;
|
||||
Parameter param;
|
||||
char *pattern_str = NULL;
|
||||
int pattern = P_SEQ;
|
||||
int niclusters = 256; // Number of local i-clusters
|
||||
int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters
|
||||
int nneighs = 9; // Number of j-cluster neighbors per i-cluster
|
||||
int masked = 0; // Use masked loop
|
||||
int nreps = 1;
|
||||
int csv = 0;
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
DEBUG_MESSAGE("Initializing parameters...\n");
|
||||
init(¶m);
|
||||
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if((strcmp(argv[i], "-f") == 0)) {
|
||||
if((param.force_field = str2ff(argv[++i])) < 0) {
|
||||
fprintf(stderr, "Invalid force field!\n");
|
||||
exit(-1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-p") == 0)) {
|
||||
pattern_str = strdup(argv[++i]);
|
||||
if(strncmp(pattern_str, "seq", 3) == 0) { pattern = P_SEQ; }
|
||||
else if(strncmp(pattern_str, "fix", 3) == 0) { pattern = P_FIX; }
|
||||
else if(strncmp(pattern_str, "rand", 3) == 0) { pattern = P_RAND; }
|
||||
else {
|
||||
fprintf(stderr, "Invalid pattern!\n");
|
||||
exit(-1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-e") == 0)) {
|
||||
param.eam_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-m") == 0)) {
|
||||
masked = 1;
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
|
||||
param.ntimes = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-ni") == 0)) {
|
||||
niclusters = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-na") == 0)) {
|
||||
iclusters_natoms = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nn") == 0)) {
|
||||
nneighs = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nr") == 0)) {
|
||||
nreps = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--freq") == 0)) {
|
||||
param.proc_freq = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--csv") == 0)) {
|
||||
csv = 1;
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
|
||||
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
|
||||
printf(HLINE);
|
||||
printf("-f <string>: force field (lj or eam), default lj\n");
|
||||
printf("-p <string>: pattern for data accesses (seq, fix or rand)\n");
|
||||
printf("-n / --nsteps <int>: number of timesteps for simulation\n");
|
||||
printf("-ni <int>: number of i-clusters (default 256)\n");
|
||||
printf("-na <int>: number of atoms per i-cluster (default %d)\n", CLUSTER_M);
|
||||
printf("-nn <int>: number of j-cluster neighbors per i-cluster (default 9)\n");
|
||||
printf("-nr <int>: number of times neighbor lists should be replicated (default 1)\n");
|
||||
printf("--freq <real>: set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
|
||||
printf("--csv: set output as CSV style\n");
|
||||
printf(HLINE);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
if(pattern_str == NULL) {
|
||||
pattern_str = strdup("seq\0");
|
||||
}
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
DEBUG_MESSAGE("Initializing EAM parameters...\n");
|
||||
initEam(&eam, ¶m);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Initializing atoms...\n");
|
||||
initAtom(atom);
|
||||
initStats(&stats);
|
||||
|
||||
atom->ntypes = param.ntypes;
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param.epsilon;
|
||||
atom->sigma6[i] = param.sigma6;
|
||||
atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
|
||||
atom->cutforcesq[i] = param.cutforce * param.cutforce;
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Creating atoms...\n");
|
||||
while(atom->Nmax < niclusters * iclusters_natoms) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
while(atom->Nclusters_max < niclusters) {
|
||||
growClusters(atom);
|
||||
}
|
||||
|
||||
for(int ci = 0; ci < niclusters; ++ci) {
|
||||
int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
int *ci_type = &atom->cl_type[ci_sca_base];
|
||||
|
||||
for(int cii = 0; cii < iclusters_natoms; ++cii) {
|
||||
ci_x[CL_X_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
|
||||
ci_x[CL_Y_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
|
||||
ci_x[CL_Z_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
|
||||
ci_v[CL_X_OFFSET + cii] = 0.0;
|
||||
ci_v[CL_Y_OFFSET + cii] = 0.0;
|
||||
ci_v[CL_Z_OFFSET + cii] = 0.0;
|
||||
ci_type[cii] = rand() % atom->ntypes;
|
||||
atom->Nlocal++;
|
||||
}
|
||||
|
||||
for(int cii = iclusters_natoms; cii < CLUSTER_M; cii++) {
|
||||
ci_x[CL_X_OFFSET + cii] = INFINITY;
|
||||
ci_x[CL_Y_OFFSET + cii] = INFINITY;
|
||||
ci_x[CL_Z_OFFSET + cii] = INFINITY;
|
||||
}
|
||||
|
||||
atom->iclusters[ci].natoms = iclusters_natoms;
|
||||
atom->Nclusters_local++;
|
||||
}
|
||||
|
||||
const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
|
||||
const double estim_neighbors_volume = (double)(atom->Nlocal * (nneighs + 2) * sizeof(int));
|
||||
const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);
|
||||
|
||||
if(!csv) {
|
||||
printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
|
||||
printf("Floating-point precision: %s\n", PRECISION_STRING);
|
||||
printf("Pattern: %s\n", pattern_str);
|
||||
printf("Number of timesteps: %d\n", param.ntimes);
|
||||
printf("Number of i-clusters: %d\n", niclusters);
|
||||
printf("Number of atoms per i-cluster: %d\n", iclusters_natoms);
|
||||
printf("Number of j-cluster neighbors per i-cluster: %d\n", nneighs);
|
||||
printf("Number of times to replicate neighbor lists: %d\n", nreps);
|
||||
printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
|
||||
printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
|
||||
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Defining j-clusters...\n");
|
||||
defineJClusters(atom);
|
||||
DEBUG_MESSAGE("Initializing neighbor lists...\n");
|
||||
initNeighbor(&neighbor, ¶m);
|
||||
DEBUG_MESSAGE("Creating neighbor lists...\n");
|
||||
createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
|
||||
DEBUG_MESSAGE("Computing forces...\n");
|
||||
|
||||
double T_accum = 0.0;
|
||||
for(int i = 0; i < param.ntimes; i++) {
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, atom, &neighbor, i + 1);
|
||||
#endif
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
T_accum += computeForceEam(&eam, ¶m, atom, &neighbor, &stats);
|
||||
} else {
|
||||
T_accum += computeForceLJ(¶m, atom, &neighbor, &stats);
|
||||
}
|
||||
}
|
||||
|
||||
double freq_hz = param.proc_freq * 1.e9;
|
||||
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
|
||||
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
|
||||
const double cycles_per_neigh = cycles_per_atom / (double)(nneighs);
|
||||
|
||||
if(!csv) {
|
||||
printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
|
||||
if(param.proc_freq > 0.0) {
|
||||
printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
|
||||
}
|
||||
} else {
|
||||
printf("steps,pattern,niclusters,iclusters_natoms,nneighs,nreps,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
|
||||
if(param.proc_freq > 0.0) {
|
||||
printf(",cy/atom,cy/neigh");
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("%d,%s,%d,%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
|
||||
param.ntimes, pattern_str, niclusters, iclusters_natoms, nneighs, nreps,
|
||||
estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
|
||||
|
||||
if(param.proc_freq > 0.0) {
|
||||
printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
double timer[NUMTIMER];
|
||||
timer[FORCE] = T_accum;
|
||||
displayStatistics(atom, ¶m, &stats, timer);
|
||||
LIKWID_MARKER_CLOSE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
344
src/clusterpair/main.c
Normal file
344
src/clusterpair/main.c
Normal file
@@ -0,0 +1,344 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <omp.h>
|
||||
//--
|
||||
#include <likwid-marker.h>
|
||||
//--
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <device.h>
|
||||
#include <eam.h>
|
||||
#include <integrate.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <pbc.h>
|
||||
#include <stats.h>
|
||||
#include <thermo.h>
|
||||
#include <timers.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
#include <vtk.h>
|
||||
#include <xtc.h>
|
||||
|
||||
#define HLINE "----------------------------------------------------------------------------\n"
|
||||
|
||||
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern int isReneighboured;
|
||||
extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
|
||||
extern void copyDataToCUDADevice(Atom *atom);
|
||||
extern void copyDataFromCUDADevice(Atom *atom);
|
||||
extern void cudaDeviceFree();
|
||||
#endif
|
||||
|
||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||
double S, E;
|
||||
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
|
||||
param->xprd = param->nx * param->lattice;
|
||||
param->yprd = param->ny * param->lattice;
|
||||
param->zprd = param->nz * param->lattice;
|
||||
|
||||
S = getTimeStamp();
|
||||
initAtom(atom);
|
||||
initPbc(atom);
|
||||
initStats(stats);
|
||||
initNeighbor(neighbor, param);
|
||||
if(param->input_file == NULL) {
|
||||
createAtom(atom, param);
|
||||
} else {
|
||||
readAtom(atom, param);
|
||||
}
|
||||
|
||||
setupNeighbor(param, atom);
|
||||
setupThermo(param, atom->Natoms);
|
||||
if(param->input_file == NULL) { adjustThermo(param, atom); }
|
||||
buildClusters(atom);
|
||||
defineJClusters(atom);
|
||||
setupPbc(atom, param);
|
||||
binClusters(atom);
|
||||
buildNeighbor(atom, neighbor);
|
||||
initDevice(atom, neighbor);
|
||||
E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
double S, E;
|
||||
S = getTimeStamp();
|
||||
LIKWID_MARKER_START("reneighbour");
|
||||
updateSingleAtoms(atom);
|
||||
updateAtomsPbc(atom, param);
|
||||
buildClusters(atom);
|
||||
defineJClusters(atom);
|
||||
setupPbc(atom, param);
|
||||
binClusters(atom);
|
||||
buildNeighbor(atom, neighbor);
|
||||
LIKWID_MARKER_STOP("reneighbour");
|
||||
E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
void printAtomState(Atom *atom) {
|
||||
printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
|
||||
atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
|
||||
|
||||
/* int nall = atom->Nlocal + atom->Nghost; */
|
||||
|
||||
/* for (int i=0; i<nall; i++) { */
|
||||
/* printf("%d %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]); */
|
||||
/* } */
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
double timer[NUMTIMER];
|
||||
Eam eam;
|
||||
Atom atom;
|
||||
Neighbor neighbor;
|
||||
Stats stats;
|
||||
Parameter param;
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
//LIKWID_MARKER_REGISTER("reneighbour");
|
||||
//LIKWID_MARKER_REGISTER("pbc");
|
||||
}
|
||||
|
||||
initParameter(¶m);
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
|
||||
readParameter(¶m, argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-f") == 0)) {
|
||||
if((param.force_field = str2ff(argv[++i])) < 0) {
|
||||
fprintf(stderr, "Invalid force field!\n");
|
||||
exit(-1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-i") == 0)) {
|
||||
param.input_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-e") == 0)) {
|
||||
param.eam_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
|
||||
param.ntimes = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nx") == 0)) {
|
||||
param.nx = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-ny") == 0)) {
|
||||
param.ny = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nz") == 0)) {
|
||||
param.nz = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-half") == 0)) {
|
||||
param.half_neigh = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
|
||||
param.mass = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
|
||||
param.cutforce = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
|
||||
param.skin = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--freq") == 0)) {
|
||||
param.proc_freq = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--vtk") == 0)) {
|
||||
param.vtk_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--xtc") == 0)) {
|
||||
#ifndef XTC_OUTPUT
|
||||
fprintf(stderr, "XTC not available, set XTC_OUTPUT option in config.mk file and recompile MD-Bench!");
|
||||
exit(-1);
|
||||
#else
|
||||
param.xtc_file = strdup(argv[++i]);
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
|
||||
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
|
||||
printf(HLINE);
|
||||
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
|
||||
printf("-f <string>: force field (lj or eam), default lj\n");
|
||||
printf("-i <string>: input file with atom positions (dump)\n");
|
||||
printf("-e <string>: input file for EAM\n");
|
||||
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
|
||||
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
|
||||
printf("-r / --radius <real>: set cutoff radius\n");
|
||||
printf("-s / --skin <real>: set skin (verlet buffer)\n");
|
||||
printf("--freq <real>: processor frequency (GHz)\n");
|
||||
printf("--vtk <string>: VTK file for visualization\n");
|
||||
printf("--xtc <string>: XTC file for visualization\n");
|
||||
printf(HLINE);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
param.cutneigh = param.cutforce + param.skin;
|
||||
setup(¶m, &eam, &atom, &neighbor, &stats);
|
||||
printParameter(¶m);
|
||||
printf(HLINE);
|
||||
|
||||
printf("step\ttemp\t\tpressure\n");
|
||||
computeThermo(0, ¶m, &atom);
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataToCUDADevice(&atom);
|
||||
#endif
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
timer[FORCE] = computeForceEam(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
} else {
|
||||
timer[FORCE] = computeForceLJ(¶m, &atom, &neighbor, &stats);
|
||||
}
|
||||
|
||||
timer[NEIGH] = 0.0;
|
||||
timer[TOTAL] = getTimeStamp();
|
||||
|
||||
if(param.vtk_file != NULL) {
|
||||
write_data_to_vtk_file(param.vtk_file, &atom, 0);
|
||||
}
|
||||
|
||||
if(param.xtc_file != NULL) {
|
||||
xtc_init(param.xtc_file, &atom, 0);
|
||||
}
|
||||
|
||||
for(int n = 0; n < param.ntimes; n++) {
|
||||
initialIntegrate(¶m, &atom);
|
||||
|
||||
if((n + 1) % param.reneigh_every) {
|
||||
if(!((n + 1) % param.prune_every)) {
|
||||
pruneNeighbor(¶m, &atom, &neighbor);
|
||||
}
|
||||
|
||||
updatePbc(&atom, ¶m, 0);
|
||||
} else {
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataFromCUDADevice(&atom);
|
||||
#endif
|
||||
|
||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataToCUDADevice(&atom);
|
||||
isReneighboured = 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
timer[FORCE] += computeForceEam(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
} else {
|
||||
timer[FORCE] += computeForceLJ(¶m, &atom, &neighbor, &stats);
|
||||
}
|
||||
|
||||
finalIntegrate(¶m, &atom);
|
||||
|
||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||
computeThermo(n + 1, ¶m, &atom);
|
||||
}
|
||||
|
||||
int write_pos = !((n + 1) % param.x_out_every);
|
||||
int write_vel = !((n + 1) % param.v_out_every);
|
||||
if(write_pos || write_vel) {
|
||||
if(param.vtk_file != NULL) {
|
||||
write_data_to_vtk_file(param.vtk_file, &atom, n + 1);
|
||||
}
|
||||
|
||||
if(param.xtc_file != NULL) {
|
||||
xtc_write(&atom, n + 1, write_pos, write_vel);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataFromCUDADevice(&atom);
|
||||
#endif
|
||||
|
||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||
updateSingleAtoms(&atom);
|
||||
computeThermo(-1, ¶m, &atom);
|
||||
|
||||
if(param.xtc_file != NULL) {
|
||||
xtc_end();
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
cudaDeviceFree();
|
||||
#endif
|
||||
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
|
||||
printf(HLINE);
|
||||
|
||||
int nthreads = 0;
|
||||
int chunkSize = 0;
|
||||
omp_sched_t schedKind;
|
||||
char schedType[10];
|
||||
#pragma omp parallel
|
||||
#pragma omp master
|
||||
{
|
||||
omp_get_schedule(&schedKind, &chunkSize);
|
||||
|
||||
switch (schedKind)
|
||||
{
|
||||
case omp_sched_static: strcpy(schedType, "static"); break;
|
||||
case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
|
||||
case omp_sched_guided: strcpy(schedType, "guided"); break;
|
||||
case omp_sched_auto: strcpy(schedType, "auto"); break;
|
||||
}
|
||||
|
||||
nthreads = omp_get_max_threads();
|
||||
}
|
||||
|
||||
printf("Num threads: %d\n", nthreads);
|
||||
printf("Schedule: (%s,%d)\n", schedType, chunkSize);
|
||||
|
||||
printf("Performance: %.2f million atom updates per second\n",
|
||||
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||
#ifdef COMPUTE_STATS
|
||||
displayStatistics(&atom, ¶m, &stats, timer);
|
||||
#endif
|
||||
LIKWID_MARKER_CLOSE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
939
src/clusterpair/neighbor.c
Normal file
939
src/clusterpair/neighbor.c
Normal file
@@ -0,0 +1,939 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <util.h>
|
||||
|
||||
#define SMALL 1.0e-6
|
||||
#define FACTOR 0.999
|
||||
|
||||
static MD_FLOAT xprd, yprd, zprd;
|
||||
static MD_FLOAT bininvx, bininvy;
|
||||
static int mbinxlo, mbinylo;
|
||||
static int nbinx, nbiny;
|
||||
static int mbinx, mbiny; // n bins in x, y
|
||||
static int *bincount;
|
||||
static int *bins;
|
||||
static int *bin_nclusters;
|
||||
static int *bin_clusters;
|
||||
static int mbins; //total number of bins
|
||||
static int atoms_per_bin; // max atoms per bin
|
||||
static int clusters_per_bin; // max clusters per bin
|
||||
static MD_FLOAT cutneigh;
|
||||
static MD_FLOAT cutneighsq; // neighbor cutoff squared
|
||||
static int nmax;
|
||||
static int nstencil; // # of bins in stencil
|
||||
static int* stencil; // stencil list of bin offsets
|
||||
static MD_FLOAT binsizex, binsizey;
|
||||
|
||||
static int coord2bin(MD_FLOAT, MD_FLOAT);
|
||||
static MD_FLOAT bindist(int, int);
|
||||
|
||||
/* exported subroutines */
|
||||
void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
||||
MD_FLOAT neighscale = 5.0 / 6.0;
|
||||
xprd = param->nx * param->lattice;
|
||||
yprd = param->ny * param->lattice;
|
||||
zprd = param->nz * param->lattice;
|
||||
cutneigh = param->cutneigh;
|
||||
nmax = 0;
|
||||
atoms_per_bin = 8;
|
||||
clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10;
|
||||
stencil = NULL;
|
||||
bins = NULL;
|
||||
bincount = NULL;
|
||||
bin_clusters = NULL;
|
||||
bin_nclusters = NULL;
|
||||
neighbor->half_neigh = param->half_neigh;
|
||||
neighbor->maxneighs = 100;
|
||||
neighbor->numneigh = NULL;
|
||||
neighbor->numneigh_masked = NULL;
|
||||
neighbor->neighbors = NULL;
|
||||
neighbor->neighbors_imask = NULL;
|
||||
}
|
||||
|
||||
void setupNeighbor(Parameter *param, Atom *atom) {
|
||||
MD_FLOAT coord;
|
||||
int mbinxhi, mbinyhi;
|
||||
int nextx, nexty, nextz;
|
||||
|
||||
if(param->input_file != NULL) {
|
||||
xprd = param->xprd;
|
||||
yprd = param->yprd;
|
||||
zprd = param->zprd;
|
||||
}
|
||||
|
||||
// TODO: update lo and hi for standard case and use them here instead
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
|
||||
|
||||
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
|
||||
MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
|
||||
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
|
||||
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
|
||||
nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
|
||||
nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey));
|
||||
binsizex = (xhi - xlo) / nbinx;
|
||||
binsizey = (yhi - ylo) / nbiny;
|
||||
bininvx = 1.0 / binsizex;
|
||||
bininvy = 1.0 / binsizey;
|
||||
cutneighsq = cutneigh * cutneigh;
|
||||
|
||||
coord = xlo - cutneigh - SMALL * xprd;
|
||||
mbinxlo = (int)(coord * bininvx);
|
||||
if(coord < 0.0) { mbinxlo = mbinxlo - 1; }
|
||||
coord = xhi + cutneigh + SMALL * xprd;
|
||||
mbinxhi = (int)(coord * bininvx);
|
||||
|
||||
coord = ylo - cutneigh - SMALL * yprd;
|
||||
mbinylo = (int)(coord * bininvy);
|
||||
if(coord < 0.0) { mbinylo = mbinylo - 1; }
|
||||
coord = yhi + cutneigh + SMALL * yprd;
|
||||
mbinyhi = (int)(coord * bininvy);
|
||||
|
||||
mbinxlo = mbinxlo - 1;
|
||||
mbinxhi = mbinxhi + 1;
|
||||
mbinx = mbinxhi - mbinxlo + 1;
|
||||
|
||||
mbinylo = mbinylo - 1;
|
||||
mbinyhi = mbinyhi + 1;
|
||||
mbiny = mbinyhi - mbinylo + 1;
|
||||
|
||||
nextx = (int)(cutneigh * bininvx);
|
||||
nexty = (int)(cutneigh * bininvy);
|
||||
if(nextx * binsizex < FACTOR * cutneigh) nextx++;
|
||||
if(nexty * binsizey < FACTOR * cutneigh) nexty++;
|
||||
|
||||
if (stencil) { free(stencil); }
|
||||
stencil = (int *) malloc((2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
|
||||
nstencil = 0;
|
||||
|
||||
for(int j = -nexty; j <= nexty; j++) {
|
||||
for(int i = -nextx; i <= nextx; i++) {
|
||||
if(bindist(i, j) < cutneighsq) {
|
||||
stencil[nstencil++] = j * mbinx + i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(bincount) { free(bincount); }
|
||||
if(bins) { free(bins); }
|
||||
if(bin_nclusters) { free(bin_nclusters); }
|
||||
if(bin_clusters) { free(bin_clusters); }
|
||||
mbins = mbinx * mbiny;
|
||||
bincount = (int*) malloc(mbins * sizeof(int));
|
||||
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
||||
bin_nclusters = (int*) malloc(mbins * sizeof(int));
|
||||
bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
|
||||
|
||||
/*
|
||||
DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi);
|
||||
DEBUG_MESSAGE("binsize = %e, %e\n", binsizex, binsizey);
|
||||
DEBUG_MESSAGE("mbin lo, hi = (%d, %d), (%d, %d)\n", mbinxlo, mbinylo, mbinxhi, mbinyhi);
|
||||
DEBUG_MESSAGE("mbins = %d (%d x %d)\n", mbins, mbinx, mbiny);
|
||||
DEBUG_MESSAGE("nextx = %d, nexty = %d\n", nextx, nexty);
|
||||
*/
|
||||
}
|
||||
|
||||
MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
|
||||
MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
|
||||
MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx;
|
||||
MD_FLOAT dm = MAX(dl, dh);
|
||||
MD_FLOAT dm0 = MAX(dm, 0.0);
|
||||
MD_FLOAT d2 = dm0 * dm0;
|
||||
|
||||
dl = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy;
|
||||
dh = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy;
|
||||
dm = MAX(dl, dh);
|
||||
dm0 = MAX(dm, 0.0);
|
||||
d2 += dm0 * dm0;
|
||||
|
||||
dl = atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz;
|
||||
dh = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz;
|
||||
dm = MAX(dl, dh);
|
||||
dm0 = MAX(dm, 0.0);
|
||||
d2 += dm0 * dm0;
|
||||
return d2;
|
||||
}
|
||||
|
||||
int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj];
|
||||
MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj];
|
||||
MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj];
|
||||
if(delx * delx + dely * dely + delz * delz < rsq) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
|
||||
static unsigned int get_imask(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
|
||||
static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
|
||||
: (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
|
||||
: NBNXN_INTERACTION_MASK_ALL));
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
|
||||
static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
|
||||
static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
|
||||
: (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
|
||||
: NBNXN_INTERACTION_MASK_ALL));
|
||||
}
|
||||
|
||||
#if VECTOR_WIDTH == 2
|
||||
# define get_imask_simd_4xn get_imask_simd_j2
|
||||
#elif VECTOR_WIDTH== 4
|
||||
# define get_imask_simd_4xn get_imask_simd_j4
|
||||
#elif VECTOR_WIDTH == 8
|
||||
# define get_imask_simd_4xn get_imask_simd_j8
|
||||
# define get_imask_simd_2xnn get_imask_simd_j4
|
||||
#elif VECTOR_WIDTH == 16
|
||||
# define get_imask_simd_2xnn get_imask_simd_j8
|
||||
#else
|
||||
# error "Invalid cluster configuration"
|
||||
#endif
|
||||
|
||||
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
DEBUG_MESSAGE("buildNeighbor start\n");
|
||||
|
||||
/* extend atom arrays if necessary */
|
||||
if(atom->Nclusters_local > nmax) {
|
||||
nmax = atom->Nclusters_local;
|
||||
if(neighbor->numneigh) free(neighbor->numneigh);
|
||||
if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
|
||||
if(neighbor->neighbors) free(neighbor->neighbors);
|
||||
if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
|
||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
|
||||
neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
|
||||
}
|
||||
|
||||
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
|
||||
MD_FLOAT bby = 0.5 * (binsizey + binsizey);
|
||||
MD_FLOAT rbb_sq = MAX(0.0, cutneigh - 0.5 * sqrt(bbx * bbx + bby * bby));
|
||||
rbb_sq = rbb_sq * rbb_sq;
|
||||
int resize = 1;
|
||||
|
||||
/* loop over each atom, storing neighbors */
|
||||
while(resize) {
|
||||
int new_maxneighs = neighbor->maxneighs;
|
||||
resize = 0;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
|
||||
int n = 0, nmasked = 0;
|
||||
int ibin = atom->icluster_bin[ci];
|
||||
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
|
||||
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
|
||||
MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
|
||||
MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
|
||||
MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
|
||||
MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;
|
||||
|
||||
for(int k = 0; k < nstencil; k++) {
|
||||
int jbin = ibin + stencil[k];
|
||||
int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
|
||||
int cj, m = -1;
|
||||
MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
|
||||
const int c = bin_nclusters[jbin];
|
||||
|
||||
if(c > 0) {
|
||||
MD_FLOAT dl, dh, dm, dm0, d_bb_sq;
|
||||
|
||||
do {
|
||||
m++;
|
||||
cj = loc_bin[m];
|
||||
if(neighbor->half_neigh && ci_cj1 > cj) {
|
||||
continue;
|
||||
}
|
||||
jbb_zmin = atom->jclusters[cj].bbminz;
|
||||
jbb_zmax = atom->jclusters[cj].bbmaxz;
|
||||
dl = ibb_zmin - jbb_zmax;
|
||||
dh = jbb_zmin - ibb_zmax;
|
||||
dm = MAX(dl, dh);
|
||||
dm0 = MAX(dm, 0.0);
|
||||
d_bb_sq = dm0 * dm0;
|
||||
} while(m + 1 < c && d_bb_sq > cutneighsq);
|
||||
|
||||
jbb_xmin = atom->jclusters[cj].bbminx;
|
||||
jbb_xmax = atom->jclusters[cj].bbmaxx;
|
||||
jbb_ymin = atom->jclusters[cj].bbminy;
|
||||
jbb_ymax = atom->jclusters[cj].bbmaxy;
|
||||
|
||||
while(m < c) {
|
||||
if(!neighbor->half_neigh || ci_cj1 <= cj) {
|
||||
dl = ibb_zmin - jbb_zmax;
|
||||
dh = jbb_zmin - ibb_zmax;
|
||||
dm = MAX(dl, dh);
|
||||
dm0 = MAX(dm, 0.0);
|
||||
d_bb_sq = dm0 * dm0;
|
||||
|
||||
/*if(d_bb_sq > cutneighsq) {
|
||||
break;
|
||||
}*/
|
||||
|
||||
dl = ibb_ymin - jbb_ymax;
|
||||
dh = jbb_ymin - ibb_ymax;
|
||||
dm = MAX(dl, dh);
|
||||
dm0 = MAX(dm, 0.0);
|
||||
d_bb_sq += dm0 * dm0;
|
||||
|
||||
dl = ibb_xmin - jbb_xmax;
|
||||
dh = jbb_xmin - ibb_xmax;
|
||||
dm = MAX(dl, dh);
|
||||
dm0 = MAX(dm, 0.0);
|
||||
d_bb_sq += dm0 * dm0;
|
||||
|
||||
if(d_bb_sq < cutneighsq) {
|
||||
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
|
||||
// We use true (1) for rdiag because we only care if there are masks
|
||||
// at all, and when this is set to false (0) the self-exclusions are
|
||||
// not accounted for, which makes the optimized version to not work!
|
||||
unsigned int imask;
|
||||
#if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
|
||||
imask = get_imask_simd_2xnn(1, ci, cj);
|
||||
#else // 4xn
|
||||
imask = get_imask_simd_4xn(1, ci, cj);
|
||||
#endif
|
||||
|
||||
if(n < neighbor->maxneighs) {
|
||||
if(imask == NBNXN_INTERACTION_MASK_ALL) {
|
||||
neighptr[n] = cj;
|
||||
neighptr_imask[n] = imask;
|
||||
} else {
|
||||
neighptr[n] = neighptr[nmasked];
|
||||
neighptr_imask[n] = neighptr_imask[nmasked];
|
||||
neighptr[nmasked] = cj;
|
||||
neighptr_imask[nmasked] = imask;
|
||||
nmasked++;
|
||||
}
|
||||
}
|
||||
|
||||
n++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m++;
|
||||
if(m < c) {
|
||||
cj = loc_bin[m];
|
||||
jbb_xmin = atom->jclusters[cj].bbminx;
|
||||
jbb_xmax = atom->jclusters[cj].bbmaxx;
|
||||
jbb_ymin = atom->jclusters[cj].bbminy;
|
||||
jbb_ymax = atom->jclusters[cj].bbmaxy;
|
||||
jbb_zmin = atom->jclusters[cj].bbminz;
|
||||
jbb_zmax = atom->jclusters[cj].bbmaxz;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fill neighbor list with dummy values to fit vector width
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||
neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighptr_imask[n] = 0;
|
||||
n++;
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = n;
|
||||
neighbor->numneigh_masked[ci] = nmasked;
|
||||
if(n >= neighbor->maxneighs) {
|
||||
resize = 1;
|
||||
|
||||
if(n >= new_maxneighs) {
|
||||
new_maxneighs = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
|
||||
free(neighbor->neighbors);
|
||||
free(neighbor->neighbors_imask);
|
||||
neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
|
||||
neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
|
||||
for(int ci = 0; ci < 6; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
|
||||
DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
|
||||
ci,
|
||||
atom->iclusters[ci].bbminx,
|
||||
atom->iclusters[ci].bbmaxx,
|
||||
atom->iclusters[ci].bbminy,
|
||||
atom->iclusters[ci].bbmaxy,
|
||||
atom->iclusters[ci].bbminz,
|
||||
atom->iclusters[ci].bbmaxz);
|
||||
|
||||
for(int cii = 0; cii < CLUSTER_M; cii++) {
|
||||
DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Neighbors:\n");
|
||||
for(int k = 0; k < neighbor->numneigh[ci]; k++) {
|
||||
int cj = neighptr[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
|
||||
DEBUG_MESSAGE(" Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
|
||||
cj,
|
||||
atom->jclusters[cj].bbminx,
|
||||
atom->jclusters[cj].bbmaxx,
|
||||
atom->jclusters[cj].bbminy,
|
||||
atom->jclusters[cj].bbmaxy,
|
||||
atom->jclusters[cj].bbminz,
|
||||
atom->jclusters[cj].bbmaxz);
|
||||
|
||||
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
|
||||
DEBUG_MESSAGE(" %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
DEBUG_MESSAGE("buildNeighbor end\n");
|
||||
}
|
||||
|
||||
void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
DEBUG_MESSAGE("pruneNeighbor start\n");
|
||||
//MD_FLOAT cutsq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT cutsq = cutneighsq;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
int k = 0;
|
||||
|
||||
// Remove dummy clusters if necessary
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(neighs[numneighs - 1] == atom->dummy_cj) {
|
||||
numneighs--;
|
||||
}
|
||||
}
|
||||
|
||||
while(k < numneighs) {
|
||||
int cj = neighs[k];
|
||||
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
|
||||
k++;
|
||||
} else {
|
||||
numneighs--;
|
||||
if(k < numneighs_masked) {
|
||||
numneighs_masked--;
|
||||
}
|
||||
neighs[k] = neighs[numneighs];
|
||||
}
|
||||
}
|
||||
|
||||
// Readd dummy clusters if necessary
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||
neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighs_imask[numneighs] = 0;
|
||||
numneighs++;
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = numneighs;
|
||||
neighbor->numneigh_masked[ci] = numneighs_masked;
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("pruneNeighbor end\n");
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
MD_FLOAT bindist(int i, int j) {
|
||||
MD_FLOAT delx, dely, delz;
|
||||
|
||||
if(i > 0) {
|
||||
delx = (i - 1) * binsizex;
|
||||
} else if(i == 0) {
|
||||
delx = 0.0;
|
||||
} else {
|
||||
delx = (i + 1) * binsizex;
|
||||
}
|
||||
|
||||
if(j > 0) {
|
||||
dely = (j - 1) * binsizey;
|
||||
} else if(j == 0) {
|
||||
dely = 0.0;
|
||||
} else {
|
||||
dely = (j + 1) * binsizey;
|
||||
}
|
||||
|
||||
return (delx * delx + dely * dely);
|
||||
}
|
||||
|
||||
int coord2bin(MD_FLOAT xin, MD_FLOAT yin) {
|
||||
int ix, iy;
|
||||
|
||||
if(xin >= xprd) {
|
||||
ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
ix = (int)(xin * bininvx) - mbinxlo;
|
||||
} else {
|
||||
ix = (int)(xin * bininvx) - mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= yprd) {
|
||||
iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
iy = (int)(yin * bininvy) - mbinylo;
|
||||
} else {
|
||||
iy = (int)(yin * bininvy) - mbinylo - 1;
|
||||
}
|
||||
|
||||
return (iy * mbinx + ix + 1);
|
||||
}
|
||||
|
||||
void coord2bin2D(MD_FLOAT xin, MD_FLOAT yin, int *ix, int *iy) {
|
||||
if(xin >= xprd) {
|
||||
*ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
*ix = (int)(xin * bininvx) - mbinxlo;
|
||||
} else {
|
||||
*ix = (int)(xin * bininvx) - mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= yprd) {
|
||||
*iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
*iy = (int)(yin * bininvy) - mbinylo;
|
||||
} else {
|
||||
*iy = (int)(yin * bininvy) - mbinylo - 1;
|
||||
}
|
||||
}
|
||||
|
||||
void binAtoms(Atom *atom) {
|
||||
DEBUG_MESSAGE("binAtoms start\n");
|
||||
int resize = 1;
|
||||
|
||||
while(resize > 0) {
|
||||
resize = 0;
|
||||
|
||||
for(int i = 0; i < mbins; i++) {
|
||||
bincount[i] = 0;
|
||||
}
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
int ibin = coord2bin(atom_x(i), atom_y(i));
|
||||
if(bincount[ibin] < atoms_per_bin) {
|
||||
int ac = bincount[ibin]++;
|
||||
bins[ibin * atoms_per_bin + ac] = i;
|
||||
} else {
|
||||
resize = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
free(bins);
|
||||
atoms_per_bin *= 2;
|
||||
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("binAtoms end\n");
|
||||
}
|
||||
|
||||
// TODO: Use pigeonhole sorting
|
||||
void sortAtomsByZCoord(Atom *atom) {
|
||||
DEBUG_MESSAGE("sortAtomsByZCoord start\n");
|
||||
for(int bin = 0; bin < mbins; bin++) {
|
||||
int c = bincount[bin];
|
||||
int *bin_ptr = &bins[bin * atoms_per_bin];
|
||||
|
||||
for(int ac_i = 0; ac_i < c; ac_i++) {
|
||||
int i = bin_ptr[ac_i];
|
||||
int min_ac = ac_i;
|
||||
int min_idx = i;
|
||||
MD_FLOAT min_z = atom_z(i);
|
||||
|
||||
for(int ac_j = ac_i + 1; ac_j < c; ac_j++) {
|
||||
int j = bin_ptr[ac_j];
|
||||
MD_FLOAT zj = atom_z(j);
|
||||
if(zj < min_z) {
|
||||
min_ac = ac_j;
|
||||
min_idx = j;
|
||||
min_z = zj;
|
||||
}
|
||||
}
|
||||
|
||||
bin_ptr[ac_i] = min_idx;
|
||||
bin_ptr[min_ac] = i;
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("sortAtomsByZCoord end\n");
|
||||
}
|
||||
|
||||
void buildClusters(Atom *atom) {
|
||||
DEBUG_MESSAGE("buildClusters start\n");
|
||||
atom->Nclusters_local = 0;
|
||||
|
||||
/* bin local atoms */
|
||||
binAtoms(atom);
|
||||
sortAtomsByZCoord(atom);
|
||||
|
||||
for(int bin = 0; bin < mbins; bin++) {
|
||||
int c = bincount[bin];
|
||||
int ac = 0;
|
||||
int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M);
|
||||
if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; }
|
||||
for(int cl = 0; cl < nclusters; cl++) {
|
||||
const int ci = atom->Nclusters_local;
|
||||
if(ci >= atom->Nclusters_max) {
|
||||
growClusters(atom);
|
||||
}
|
||||
|
||||
int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
int *ci_type = &atom->cl_type[ci_sca_base];
|
||||
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
|
||||
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
|
||||
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
|
||||
|
||||
atom->iclusters[ci].natoms = 0;
|
||||
for(int cii = 0; cii < CLUSTER_M; cii++) {
|
||||
if(ac < c) {
|
||||
int i = bins[bin * atoms_per_bin + ac];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
|
||||
ci_x[CL_X_OFFSET + cii] = xtmp;
|
||||
ci_x[CL_Y_OFFSET + cii] = ytmp;
|
||||
ci_x[CL_Z_OFFSET + cii] = ztmp;
|
||||
ci_v[CL_X_OFFSET + cii] = atom->vx[i];
|
||||
ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
|
||||
ci_v[CL_Z_OFFSET + cii] = atom->vz[i];
|
||||
|
||||
// TODO: To create the bounding boxes faster, we can use SIMD operations
|
||||
if(bbminx > xtmp) { bbminx = xtmp; }
|
||||
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
|
||||
if(bbminy > ytmp) { bbminy = ytmp; }
|
||||
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
|
||||
if(bbminz > ztmp) { bbminz = ztmp; }
|
||||
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
|
||||
|
||||
ci_type[cii] = atom->type[i];
|
||||
atom->iclusters[ci].natoms++;
|
||||
} else {
|
||||
ci_x[CL_X_OFFSET + cii] = INFINITY;
|
||||
ci_x[CL_Y_OFFSET + cii] = INFINITY;
|
||||
ci_x[CL_Z_OFFSET + cii] = INFINITY;
|
||||
}
|
||||
|
||||
ac++;
|
||||
}
|
||||
|
||||
atom->icluster_bin[ci] = bin;
|
||||
atom->iclusters[ci].bbminx = bbminx;
|
||||
atom->iclusters[ci].bbmaxx = bbmaxx;
|
||||
atom->iclusters[ci].bbminy = bbminy;
|
||||
atom->iclusters[ci].bbmaxy = bbmaxy;
|
||||
atom->iclusters[ci].bbminz = bbminz;
|
||||
atom->iclusters[ci].bbmaxz = bbmaxz;
|
||||
atom->Nclusters_local++;
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("buildClusters end\n");
|
||||
}
|
||||
|
||||
void defineJClusters(Atom *atom) {
|
||||
DEBUG_MESSAGE("defineJClusters start\n");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int cj0 = CJ0_FROM_CI(ci);
|
||||
|
||||
if(CLUSTER_M == CLUSTER_N) {
|
||||
atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx;
|
||||
atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx;
|
||||
atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy;
|
||||
atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy;
|
||||
atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz;
|
||||
atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz;
|
||||
atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms;
|
||||
|
||||
} else if(CLUSTER_M > CLUSTER_N) {
|
||||
int cj1 = CJ1_FROM_CI(ci);
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
|
||||
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
|
||||
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
|
||||
|
||||
for(int cii = 0; cii < MAX(atom->iclusters[ci].natoms, CLUSTER_N); cii++) {
|
||||
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
|
||||
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
|
||||
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
|
||||
|
||||
// TODO: To create the bounding boxes faster, we can use SIMD operations
|
||||
if(bbminx > xtmp) { bbminx = xtmp; }
|
||||
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
|
||||
if(bbminy > ytmp) { bbminy = ytmp; }
|
||||
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
|
||||
if(bbminz > ztmp) { bbminz = ztmp; }
|
||||
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
|
||||
}
|
||||
|
||||
atom->jclusters[cj0].bbminx = bbminx;
|
||||
atom->jclusters[cj0].bbmaxx = bbmaxx;
|
||||
atom->jclusters[cj0].bbminy = bbminy;
|
||||
atom->jclusters[cj0].bbmaxy = bbmaxy;
|
||||
atom->jclusters[cj0].bbminz = bbminz;
|
||||
atom->jclusters[cj0].bbmaxz = bbmaxz;
|
||||
atom->jclusters[cj0].natoms = MAX(atom->iclusters[ci].natoms, CLUSTER_N);
|
||||
|
||||
bbminx = INFINITY, bbmaxx = -INFINITY;
|
||||
bbminy = INFINITY, bbmaxy = -INFINITY;
|
||||
bbminz = INFINITY, bbmaxz = -INFINITY;
|
||||
|
||||
for(int cii = CLUSTER_N; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
|
||||
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
|
||||
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
|
||||
|
||||
// TODO: To create the bounding boxes faster, we can use SIMD operations
|
||||
if(bbminx > xtmp) { bbminx = xtmp; }
|
||||
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
|
||||
if(bbminy > ytmp) { bbminy = ytmp; }
|
||||
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
|
||||
if(bbminz > ztmp) { bbminz = ztmp; }
|
||||
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
|
||||
}
|
||||
|
||||
atom->jclusters[cj1].bbminx = bbminx;
|
||||
atom->jclusters[cj1].bbmaxx = bbmaxx;
|
||||
atom->jclusters[cj1].bbminy = bbminy;
|
||||
atom->jclusters[cj1].bbmaxy = bbmaxy;
|
||||
atom->jclusters[cj1].bbminz = bbminz;
|
||||
atom->jclusters[cj1].bbmaxz = bbmaxz;
|
||||
atom->jclusters[cj1].natoms = MIN(0, atom->iclusters[ci].natoms - CLUSTER_N);
|
||||
|
||||
} else {
|
||||
if(ci % 2 == 0) {
|
||||
const int ci1 = ci + 1;
|
||||
atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci1].bbminx);
|
||||
atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci1].bbmaxx);
|
||||
atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci1].bbminy);
|
||||
atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci1].bbmaxy);
|
||||
atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci1].bbminz);
|
||||
atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci1].bbmaxz);
|
||||
atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci1].natoms;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("defineJClusters end\n");
|
||||
}
|
||||
|
||||
void binClusters(Atom *atom) {
|
||||
DEBUG_MESSAGE("binClusters start\n");
|
||||
|
||||
/*
|
||||
DEBUG_MESSAGE("Nghost = %d\n", atom->Nclusters_ghost);
|
||||
for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + 4; ci++) {
|
||||
MD_FLOAT *cptr = cluster_pos_ptr(ci);
|
||||
DEBUG_MESSAGE("Cluster %d:\n", ci);
|
||||
DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
|
||||
atom->icluster_bin[ci],
|
||||
atom->clusters[ci].natoms,
|
||||
atom->clusters[ci].bbminx,
|
||||
atom->clusters[ci].bbmaxx,
|
||||
atom->clusters[ci].bbminy,
|
||||
atom->clusters[ci].bbmaxy,
|
||||
atom->clusters[ci].bbminz,
|
||||
atom->clusters[ci].bbmaxz);
|
||||
|
||||
for(int cii = 0; cii < CLUSTER_M; cii++) {
|
||||
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
const int nlocal = atom->Nclusters_local;
|
||||
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
const int ncj = atom->Nclusters_local / jfac;
|
||||
|
||||
int resize = 1;
|
||||
while(resize > 0) {
|
||||
resize = 0;
|
||||
|
||||
for(int bin = 0; bin < mbins; bin++) {
|
||||
bin_nclusters[bin] = 0;
|
||||
}
|
||||
|
||||
for(int ci = 0; ci < nlocal && !resize; ci++) {
|
||||
// Assure we add this j-cluster only once in the bin
|
||||
if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
|
||||
int bin = atom->icluster_bin[ci];
|
||||
int c = bin_nclusters[bin];
|
||||
if(c + 1 < clusters_per_bin) {
|
||||
bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
|
||||
bin_nclusters[bin]++;
|
||||
|
||||
if(CLUSTER_M > CLUSTER_N) {
|
||||
int cj1 = CJ1_FROM_CI(ci);
|
||||
if(atom->jclusters[cj1].natoms > 0) {
|
||||
bin_clusters[bin * clusters_per_bin + c + 1] = cj1;
|
||||
bin_nclusters[bin]++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
resize = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) {
|
||||
const int cj = ncj + cg;
|
||||
int ix = -1, iy = -1;
|
||||
MD_FLOAT xtmp, ytmp;
|
||||
|
||||
if(atom->jclusters[cj].natoms > 0) {
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT cj_minz = atom->jclusters[cj].bbminz;
|
||||
|
||||
xtmp = cj_x[CL_X_OFFSET + 0];
|
||||
ytmp = cj_x[CL_Y_OFFSET + 0];
|
||||
coord2bin2D(xtmp, ytmp, &ix, &iy);
|
||||
ix = MAX(MIN(ix, mbinx - 1), 0);
|
||||
iy = MAX(MIN(iy, mbiny - 1), 0);
|
||||
for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
int nix, niy;
|
||||
xtmp = cj_x[CL_X_OFFSET + cjj];
|
||||
ytmp = cj_x[CL_Y_OFFSET + cjj];
|
||||
coord2bin2D(xtmp, ytmp, &nix, &niy);
|
||||
nix = MAX(MIN(nix, mbinx - 1), 0);
|
||||
niy = MAX(MIN(niy, mbiny - 1), 0);
|
||||
|
||||
// Always put the cluster on the bin of its innermost atom so
|
||||
// the cluster should be closer to local clusters
|
||||
if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; }
|
||||
if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; }
|
||||
if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; }
|
||||
if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; }
|
||||
}
|
||||
|
||||
int bin = iy * mbinx + ix + 1;
|
||||
int c = bin_nclusters[bin];
|
||||
if(c < clusters_per_bin) {
|
||||
// Insert the current ghost cluster in the bin keeping clusters
|
||||
// sorted by z coordinate
|
||||
int inserted = 0;
|
||||
for(int i = 0; i < c; i++) {
|
||||
int last_cl = bin_clusters[bin * clusters_per_bin + i];
|
||||
if(atom->jclusters[last_cl].bbminz > cj_minz) {
|
||||
bin_clusters[bin * clusters_per_bin + i] = cj;
|
||||
|
||||
for(int j = i + 1; j <= c; j++) {
|
||||
int tmp = bin_clusters[bin * clusters_per_bin + j];
|
||||
bin_clusters[bin * clusters_per_bin + j] = last_cl;
|
||||
last_cl = tmp;
|
||||
}
|
||||
|
||||
inserted = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(!inserted) {
|
||||
bin_clusters[bin * clusters_per_bin + c] = cj;
|
||||
}
|
||||
|
||||
bin_nclusters[bin]++;
|
||||
} else {
|
||||
resize = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
free(bin_clusters);
|
||||
clusters_per_bin *= 2;
|
||||
bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
DEBUG_MESSAGE("bin_nclusters\n");
|
||||
for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); }
|
||||
DEBUG_MESSAGE("\n");
|
||||
*/
|
||||
|
||||
DEBUG_MESSAGE("binClusters stop\n");
|
||||
}
|
||||
|
||||
void updateSingleAtoms(Atom *atom) {
|
||||
DEBUG_MESSAGE("updateSingleAtoms start\n");
|
||||
int Natom = 0;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
atom_x(Natom) = ci_x[CL_X_OFFSET + cii];
|
||||
atom_y(Natom) = ci_x[CL_Y_OFFSET + cii];
|
||||
atom_z(Natom) = ci_x[CL_Z_OFFSET + cii];
|
||||
atom->vx[Natom] = ci_v[CL_X_OFFSET + cii];
|
||||
atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii];
|
||||
atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii];
|
||||
Natom++;
|
||||
}
|
||||
}
|
||||
|
||||
if(Natom != atom->Nlocal) {
|
||||
fprintf(stderr, "updateSingleAtoms(): Number of atoms changed!\n");
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("updateSingleAtoms stop\n");
|
||||
}
|
49
src/clusterpair/neighbor.h
Normal file
49
src/clusterpair/neighbor.h
Normal file
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __NEIGHBOR_H_
|
||||
#define __NEIGHBOR_H_
|
||||
// Interaction masks from GROMACS, things to remember (maybe these confused just me):
|
||||
// 1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
|
||||
// interaction masks (1 = interaction, 0 = no interaction)
|
||||
// 2. These are inverted (maybe because that is how you use in AVX2/AVX512 masking),
|
||||
// so read them from right to left (least significant to most significant bit)
|
||||
// All interaction mask is the same for all kernels
|
||||
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
|
||||
// 4x4 kernel diagonal mask
|
||||
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
|
||||
// 4x2 kernel diagonal masks
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
|
||||
// 4x8 kernel diagonal masks
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
|
||||
|
||||
typedef struct {
|
||||
int every;
|
||||
int ncalls;
|
||||
int maxneighs;
|
||||
int* numneigh;
|
||||
int* numneigh_masked;
|
||||
int half_neigh;
|
||||
int* neighbors;
|
||||
unsigned int* neighbors_imask;
|
||||
} Neighbor;
|
||||
|
||||
extern void initNeighbor(Neighbor*, Parameter*);
|
||||
extern void setupNeighbor(Parameter*, Atom*);
|
||||
extern void binatoms(Atom*);
|
||||
extern void buildNeighbor(Atom*, Neighbor*);
|
||||
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
|
||||
extern void sortAtom(Atom*);
|
||||
extern void buildClusters(Atom*);
|
||||
extern void defineJClusters(Atom*);
|
||||
extern void binClusters(Atom*);
|
||||
extern void updateSingleAtoms(Atom*);
|
||||
#endif
|
231
src/clusterpair/pbc.c
Normal file
231
src/clusterpair/pbc.c
Normal file
@@ -0,0 +1,231 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <pbc.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <neighbor.h>
|
||||
#include <util.h>
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
static int NmaxGhost;
|
||||
|
||||
static void growPbc(Atom*);
|
||||
|
||||
/* exported subroutines */
|
||||
void initPbc(Atom* atom) {
|
||||
NmaxGhost = 0;
|
||||
atom->border_map = NULL;
|
||||
atom->PBCx = NULL; atom->PBCy = NULL; atom->PBCz = NULL;
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
|
||||
DEBUG_MESSAGE("updatePbc start\n");
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = atom->Nclusters_local / jfac;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
|
||||
const int cj = ncj + cg;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
|
||||
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
|
||||
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
|
||||
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
|
||||
|
||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
|
||||
MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
|
||||
MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
|
||||
|
||||
cj_x[CL_X_OFFSET + cjj] = xtmp;
|
||||
cj_x[CL_Y_OFFSET + cjj] = ytmp;
|
||||
cj_x[CL_Z_OFFSET + cjj] = ztmp;
|
||||
|
||||
if(firstUpdate) {
|
||||
// TODO: To create the bounding boxes faster, we can use SIMD operations
|
||||
if(bbminx > xtmp) { bbminx = xtmp; }
|
||||
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
|
||||
if(bbminy > ytmp) { bbminy = ytmp; }
|
||||
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
|
||||
if(bbminz > ztmp) { bbminz = ztmp; }
|
||||
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
|
||||
}
|
||||
}
|
||||
|
||||
if(firstUpdate) {
|
||||
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
|
||||
cj_x[CL_X_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
|
||||
}
|
||||
|
||||
atom->jclusters[cj].bbminx = bbminx;
|
||||
atom->jclusters[cj].bbmaxx = bbmaxx;
|
||||
atom->jclusters[cj].bbminy = bbminy;
|
||||
atom->jclusters[cj].bbmaxy = bbmaxy;
|
||||
atom->jclusters[cj].bbminz = bbminz;
|
||||
atom->jclusters[cj].bbmaxz = bbmaxz;
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("updatePbc end\n");
|
||||
}
|
||||
|
||||
/* relocate atoms that have left domain according
|
||||
* to periodic boundary conditions */
|
||||
void updateAtomsPbc(Atom *atom, Parameter *param) {
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
if(atom_x(i) < 0.0) {
|
||||
atom_x(i) += xprd;
|
||||
} else if(atom_x(i) >= xprd) {
|
||||
atom_x(i) -= xprd;
|
||||
}
|
||||
|
||||
if(atom_y(i) < 0.0) {
|
||||
atom_y(i) += yprd;
|
||||
} else if(atom_y(i) >= yprd) {
|
||||
atom_y(i) -= yprd;
|
||||
}
|
||||
|
||||
if(atom_z(i) < 0.0) {
|
||||
atom_z(i) += zprd;
|
||||
} else if(atom_z(i) >= zprd) {
|
||||
atom_z(i) -= zprd;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* setup periodic boundary conditions by
|
||||
* defining ghost atoms around domain
|
||||
* only creates mapping and coordinate corrections
|
||||
* that are then enforced in updatePbc */
|
||||
#define ADDGHOST(dx,dy,dz); \
|
||||
Nghost++; \
|
||||
const int cg = ncj + Nghost; \
|
||||
const int cj_natoms = atom->jclusters[cj].natoms; \
|
||||
atom->border_map[Nghost] = cj; \
|
||||
atom->PBCx[Nghost] = dx; \
|
||||
atom->PBCy[Nghost] = dy; \
|
||||
atom->PBCz[Nghost] = dz; \
|
||||
atom->jclusters[cg].natoms = cj_natoms; \
|
||||
Nghost_atoms += cj_natoms; \
|
||||
int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); \
|
||||
int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg); \
|
||||
for(int cjj = 0; cjj < cj_natoms; cjj++) { \
|
||||
atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj]; \
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
void growPbc(Atom* atom) {
|
||||
int nold = NmaxGhost;
|
||||
NmaxGhost += DELTA;
|
||||
|
||||
atom->border_map = (int*) reallocate(atom->border_map, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
}
|
||||
|
||||
void setupPbc(Atom *atom, Parameter *param) {
|
||||
DEBUG_MESSAGE("setupPbc start\n");
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
MD_FLOAT Cutneigh = param->cutneigh;
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = atom->Nclusters_local / jfac;
|
||||
int Nghost = -1;
|
||||
int Nghost_atoms = 0;
|
||||
|
||||
for(int cj = 0; cj < ncj; cj++) {
|
||||
if(atom->jclusters[cj].natoms > 0) {
|
||||
if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) {
|
||||
growClusters(atom);
|
||||
}
|
||||
|
||||
if((Nghost + 7) * jfac >= NmaxGhost) {
|
||||
growPbc(atom);
|
||||
}
|
||||
|
||||
MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
|
||||
MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
|
||||
MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
|
||||
MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
|
||||
MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
|
||||
MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
|
||||
|
||||
/* Setup ghost atoms */
|
||||
/* 6 planes */
|
||||
if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); }
|
||||
if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
|
||||
if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); }
|
||||
if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
|
||||
if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); }
|
||||
if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
|
||||
/* 8 corners */
|
||||
if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); }
|
||||
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); }
|
||||
if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
|
||||
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
|
||||
/* 12 edges */
|
||||
if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); }
|
||||
if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
|
||||
if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); }
|
||||
if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
|
||||
if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); }
|
||||
if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
|
||||
}
|
||||
}
|
||||
|
||||
if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) {
|
||||
growClusters(atom);
|
||||
}
|
||||
|
||||
// Add dummy cluster at the end
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
|
||||
cj_x[CL_X_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
|
||||
}
|
||||
|
||||
// increase by one to make it the ghost atom count
|
||||
atom->dummy_cj = ncj + Nghost + 1;
|
||||
atom->Nghost = Nghost_atoms;
|
||||
atom->Nclusters_ghost = Nghost + 1;
|
||||
atom->Nclusters = atom->Nclusters_local + Nghost + 1;
|
||||
|
||||
// Update created ghost clusters positions
|
||||
cpuUpdatePbc(atom, param, 1);
|
||||
DEBUG_MESSAGE("setupPbc end\n");
|
||||
}
|
20
src/clusterpair/pbc.h
Normal file
20
src/clusterpair/pbc.h
Normal file
@@ -0,0 +1,20 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __PBC_H_
|
||||
#define __PBC_H_
|
||||
extern void initPbc();
|
||||
extern void cpuUpdatePbc(Atom*, Parameter*, int);
|
||||
extern void updateAtomsPbc(Atom*, Parameter*);
|
||||
extern void setupPbc(Atom*, Parameter*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern void cudaUpdatePbc(Atom*, Parameter*, int);
|
||||
#endif
|
||||
#endif
|
58
src/clusterpair/stats.c
Normal file
58
src/clusterpair/stats.c
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timers.h>
|
||||
|
||||
void initStats(Stats *s) {
|
||||
s->calculated_forces = 0;
|
||||
s->num_neighs = 0;
|
||||
s->force_iters = 0;
|
||||
s->atoms_within_cutoff = 0;
|
||||
s->atoms_outside_cutoff = 0;
|
||||
s->clusters_within_cutoff = 0;
|
||||
s->clusters_outside_cutoff = 0;
|
||||
}
|
||||
|
||||
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
|
||||
#ifdef COMPUTE_STATS
|
||||
|
||||
const int MxN = CLUSTER_M * CLUSTER_N;
|
||||
double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local);
|
||||
double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
|
||||
(double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
|
||||
double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / (double)(atom->Nlocal * (param->ntimes + 1));
|
||||
double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces);
|
||||
double avg_simd = stats->force_iters / (double)(atom->Nlocal * (param->ntimes + 1));
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
force_useful_volume += 1e-9 * (double)((atom->Nlocal * (param->ntimes + 1)) + stats->num_neighs) * sizeof(int);
|
||||
#endif
|
||||
|
||||
printf("Statistics:\n");
|
||||
printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq);
|
||||
printf("\tAverage atoms per cluster: %.4f\n", avg_atoms_cluster);
|
||||
printf("\tAverage neighbors per atom: %.4f\n", avg_neigh_atom);
|
||||
printf("\tAverage neighbors per cluster: %.4f\n", avg_neigh_cluster);
|
||||
printf("\tAverage SIMD iterations per atom: %.4f\n", avg_simd);
|
||||
printf("\tTotal number of computed pair interactions: %lld\n", stats->num_neighs * MxN);
|
||||
printf("\tTotal number of SIMD iterations: %lld\n", stats->force_iters);
|
||||
printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
|
||||
printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->force_iters);
|
||||
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
const double atoms_eff = (double)stats->atoms_within_cutoff / (double)(stats->atoms_within_cutoff + stats->atoms_outside_cutoff) * 100.0;
|
||||
printf("\tAtoms within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->atoms_within_cutoff, stats->atoms_outside_cutoff, atoms_eff);
|
||||
const double clusters_eff = (double)stats->clusters_within_cutoff / (double)(stats->clusters_within_cutoff + stats->clusters_outside_cutoff) * 100.0;
|
||||
printf("\tClusters within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->clusters_within_cutoff, stats->clusters_outside_cutoff, clusters_eff);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
35
src/clusterpair/stats.h
Normal file
35
src/clusterpair/stats.h
Normal file
@@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __STATS_H_
|
||||
#define __STATS_H_
|
||||
typedef struct {
|
||||
long long int calculated_forces;
|
||||
long long int num_neighs;
|
||||
long long int force_iters;
|
||||
long long int atoms_within_cutoff;
|
||||
long long int atoms_outside_cutoff;
|
||||
long long int clusters_within_cutoff;
|
||||
long long int clusters_outside_cutoff;
|
||||
} Stats;
|
||||
|
||||
void initStats(Stats *s);
|
||||
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
|
||||
|
||||
#ifdef COMPUTE_STATS
|
||||
# define addStat(stat, value) stat += value;
|
||||
# define beginStatTimer() double Si = getTimeStamp();
|
||||
# define endStatTimer(stat) stat += getTimeStamp() - Si;
|
||||
#else
|
||||
# define addStat(stat, value)
|
||||
# define beginStatTimer()
|
||||
# define endStatTimer(stat)
|
||||
#endif
|
||||
|
||||
#endif
|
61
src/clusterpair/tracing.c
Normal file
61
src/clusterpair/tracing.c
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <tracing.h>
|
||||
|
||||
void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep) {
|
||||
MEM_TRACER_INIT;
|
||||
INDEX_TRACER_INIT;
|
||||
int Nlocal = atom->Nlocal;
|
||||
int *neighs;
|
||||
unsigned int *neighs_imask;
|
||||
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||
|
||||
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MEM_TRACE(atom_x(i), 'R');
|
||||
MEM_TRACE(atom_y(i), 'R');
|
||||
MEM_TRACE(atom_z(i), 'R');
|
||||
INDEX_TRACE_ATOM(i);
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
MEM_TRACE(atom->type[i], 'R');
|
||||
#endif
|
||||
|
||||
DIST_TRACE_SORT(neighs, numneighs);
|
||||
INDEX_TRACE(neighs, numneighs);
|
||||
DIST_TRACE(neighs, numneighs);
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MEM_TRACE(j, 'R');
|
||||
MEM_TRACE(atom_x(j), 'R');
|
||||
MEM_TRACE(atom_y(j), 'R');
|
||||
MEM_TRACE(atom_z(j), 'R');
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
MEM_TRACE(atom->type[j], 'R');
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
MEM_TRACE(fx[i], 'R');
|
||||
MEM_TRACE(fx[i], 'W');
|
||||
MEM_TRACE(fy[i], 'R');
|
||||
MEM_TRACE(fy[i], 'W');
|
||||
MEM_TRACE(fz[i], 'R');
|
||||
MEM_TRACE(fz[i], 'W');
|
||||
*/
|
||||
}
|
||||
|
||||
INDEX_TRACER_END;
|
||||
MEM_TRACER_END;
|
||||
}
|
102
src/clusterpair/tracing.h
Normal file
102
src/clusterpair/tracing.h
Normal file
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
|
||||
#ifndef VECTOR_WIDTH
|
||||
# define VECTOR_WIDTH 8
|
||||
#endif
|
||||
|
||||
#ifndef TRACER_CONDITION
|
||||
# define TRACER_CONDITION (!(timestep % param->every))
|
||||
#endif
|
||||
|
||||
#ifdef MEM_TRACER
|
||||
# define MEM_TRACER_INIT FILE *mem_tracer_fp; \
|
||||
if(TRACER_CONDITION) { \
|
||||
char mem_tracer_fn[128]; \
|
||||
snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
|
||||
mem_tracer_fp = fopen(mem_tracer_fn, "w");
|
||||
}
|
||||
|
||||
# define MEM_TRACER_END if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
|
||||
# define MEM_TRACE(addr, op) if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
|
||||
#else
|
||||
# define MEM_TRACER_INIT
|
||||
# define MEM_TRACER_END
|
||||
# define MEM_TRACE(addr, op)
|
||||
#endif
|
||||
|
||||
#ifdef INDEX_TRACER
|
||||
# define INDEX_TRACER_INIT FILE *index_tracer_fp; \
|
||||
if(TRACER_CONDITION) { \
|
||||
char index_tracer_fn[128]; \
|
||||
snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
|
||||
index_tracer_fp = fopen(index_tracer_fn, "w"); \
|
||||
}
|
||||
|
||||
# define INDEX_TRACER_END if(TRACER_CONDITION) { fclose(index_tracer_fp); }
|
||||
# define INDEX_TRACE_NATOMS(nl, ng, mn) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
|
||||
# define INDEX_TRACE_ATOM(a) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
|
||||
# define INDEX_TRACE(l, e) if(TRACER_CONDITION) { \
|
||||
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
|
||||
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
|
||||
fprintf(index_tracer_fp, "I: "); \
|
||||
for(int __j = 0; __j < __e; ++__j) { \
|
||||
fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
|
||||
} \
|
||||
fprintf(index_tracer_fp, "\n"); \
|
||||
} \
|
||||
}
|
||||
|
||||
# define DIST_TRACE_SORT(l, e) if(TRACER_CONDITION) { \
|
||||
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
|
||||
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
|
||||
if(__e > 1) { \
|
||||
for(int __j = __i; __j < __i + __e - 1; ++__j) { \
|
||||
for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
|
||||
if(l[__k] > l[__k + 1]) { \
|
||||
int __t = l[__k]; \
|
||||
l[__k] = l[__k + 1]; \
|
||||
l[__k + 1] = __t; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
# define DIST_TRACE(l, e) if(TRACER_CONDITION) { \
|
||||
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
|
||||
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
|
||||
if(__e > 1) { \
|
||||
fprintf(index_tracer_fp, "D: "); \
|
||||
for(int __j = 0; __j < __e - 1; ++__j) { \
|
||||
int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
|
||||
fprintf(index_tracer_fp, "%d ", __dist); \
|
||||
} \
|
||||
fprintf(index_tracer_fp, "\n"); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
# define INDEX_TRACER_INIT
|
||||
# define INDEX_TRACER_END
|
||||
# define INDEX_TRACE_NATOMS(nl, ng, mn)
|
||||
# define INDEX_TRACE_ATOM(a)
|
||||
# define INDEX_TRACE(l, e)
|
||||
# define DIST_TRACE_SORT(l, e)
|
||||
# define DIST_TRACE(l, e)
|
||||
#endif
|
||||
|
||||
extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep);
|
190
src/clusterpair/vtk.c
Normal file
190
src/clusterpair/vtk.c
Normal file
@@ -0,0 +1,190 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <atom.h>
|
||||
#include <vtk.h>
|
||||
|
||||
void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
|
||||
write_local_atoms_to_vtk_file(filename, atom, timestep);
|
||||
write_ghost_atoms_to_vtk_file(filename, atom, timestep);
|
||||
write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
|
||||
write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
|
||||
}
|
||||
|
||||
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep);
|
||||
FILE* fp = fopen(timestep_filename, "wb");
|
||||
|
||||
if(fp == NULL) {
|
||||
fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(fp, "# vtk DataFile Version 2.0\n");
|
||||
fprintf(fp, "Particle data\n");
|
||||
fprintf(fp, "ASCII\n");
|
||||
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
|
||||
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
||||
}
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
|
||||
for(int i = 0; i < atom->Nlocal; ++i) {
|
||||
fprintf(fp, "1 %d\n", i);
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
|
||||
for(int i = 0; i < atom->Nlocal; ++i) {
|
||||
fprintf(fp, "1\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "POINT_DATA %d\n", atom->Nlocal);
|
||||
fprintf(fp, "SCALARS mass double\n");
|
||||
fprintf(fp, "LOOKUP_TABLE default\n");
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
fprintf(fp, "1.0\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_ghost_%d.vtk", filename, timestep);
|
||||
FILE* fp = fopen(timestep_filename, "wb");
|
||||
|
||||
if(fp == NULL) {
|
||||
fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(fp, "# vtk DataFile Version 2.0\n");
|
||||
fprintf(fp, "Particle data\n");
|
||||
fprintf(fp, "ASCII\n");
|
||||
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
|
||||
fprintf(fp, "POINTS %d double\n", atom->Nghost);
|
||||
for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + atom->Nclusters_ghost; ++ci) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
||||
}
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "CELLS %d %d\n", atom->Nghost, atom->Nghost * 2);
|
||||
for(int i = 0; i < atom->Nghost; ++i) {
|
||||
fprintf(fp, "1 %d\n", i);
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "CELL_TYPES %d\n", atom->Nghost);
|
||||
for(int i = 0; i < atom->Nghost; ++i) {
|
||||
fprintf(fp, "1\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "POINT_DATA %d\n", atom->Nghost);
|
||||
fprintf(fp, "SCALARS mass double\n");
|
||||
fprintf(fp, "LOOKUP_TABLE default\n");
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
fprintf(fp, "1.0\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_edges_%d.vtk", filename, timestep);
|
||||
FILE* fp = fopen(timestep_filename, "wb");
|
||||
int N = atom->Nclusters_local;
|
||||
int tot_lines = 0;
|
||||
int i = 0;
|
||||
|
||||
if(fp == NULL) {
|
||||
fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(fp, "# vtk DataFile Version 2.0\n");
|
||||
fprintf(fp, "Particle data\n");
|
||||
fprintf(fp, "ASCII\n");
|
||||
fprintf(fp, "DATASET POLYDATA\n");
|
||||
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
|
||||
for(int ci = 0; ci < N; ++ci) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
||||
}
|
||||
|
||||
tot_lines += atom->iclusters[ci].natoms;
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "LINES %d %d\n", N, N + tot_lines);
|
||||
for(int ci = 0; ci < N; ++ci) {
|
||||
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%d ", i++);
|
||||
}
|
||||
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_ghost_edges_%d.vtk", filename, timestep);
|
||||
FILE* fp = fopen(timestep_filename, "wb");
|
||||
int N = atom->Nclusters_local + atom->Nclusters_ghost;
|
||||
int tot_lines = 0;
|
||||
int i = 0;
|
||||
|
||||
if(fp == NULL) {
|
||||
fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(fp, "# vtk DataFile Version 2.0\n");
|
||||
fprintf(fp, "Particle data\n");
|
||||
fprintf(fp, "ASCII\n");
|
||||
fprintf(fp, "DATASET POLYDATA\n");
|
||||
fprintf(fp, "POINTS %d double\n", atom->Nghost);
|
||||
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
||||
}
|
||||
|
||||
tot_lines += atom->iclusters[ci].natoms;
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + tot_lines);
|
||||
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
|
||||
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
|
||||
fprintf(fp, "%d ", i++);
|
||||
}
|
||||
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
16
src/clusterpair/vtk.h
Normal file
16
src/clusterpair/vtk.h
Normal file
@@ -0,0 +1,16 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
|
||||
#ifndef __VTK_H_
|
||||
#define __VTK_H_
|
||||
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
|
||||
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
#endif
|
56
src/clusterpair/xtc.c
Normal file
56
src/clusterpair/xtc.c
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <xtc.h>
|
||||
|
||||
#ifdef XTC_OUTPUT
|
||||
#include <gromacs/fileio/xtcio.h>
|
||||
|
||||
static struct t_fileio *xtc_file = NULL;
|
||||
static rvec *x_buf = NULL;
|
||||
static rvec basis[3];
|
||||
|
||||
void xtc_init(const char *filename, Atom *atom, int timestep) {
|
||||
basis[0][XX] = 1.0;
|
||||
basis[0][YY] = 0.0;
|
||||
basis[0][ZZ] = 0.0;
|
||||
basis[1][XX] = 0.0;
|
||||
basis[1][YY] = 1.0;
|
||||
basis[1][ZZ] = 0.0;
|
||||
basis[2][XX] = 0.0;
|
||||
basis[2][YY] = 0.0;
|
||||
basis[2][ZZ] = 1.0;
|
||||
|
||||
xtc_file = open_xtc(filename, "w");
|
||||
x_buf = (rvec *) allocate(ALIGNMENT, sizeof(rvec) * (atom->Nlocal + 1));
|
||||
xtc_write(atom, timestep, 1, 1);
|
||||
}
|
||||
|
||||
void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) {
|
||||
int i = 0;
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
|
||||
x_buf[i][XX] = ci_x[CL_X_OFFSET + cii];
|
||||
x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii];
|
||||
x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii];
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
write_xtc(xtc_file, atom->Nlocal, timestep, 0.0, (const rvec *) basis, (const rvec *) x_buf, 1000);
|
||||
}
|
||||
|
||||
void xtc_end() {
|
||||
free(x_buf);
|
||||
close_xtc(xtc_file);
|
||||
}
|
||||
#endif
|
21
src/clusterpair/xtc.h
Normal file
21
src/clusterpair/xtc.h
Normal file
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
|
||||
#ifndef __XTC_H_
|
||||
#define __XTC_H_
|
||||
|
||||
#ifdef XTC_OUTPUT
|
||||
void xtc_init(const char *, Atom*, int);
|
||||
void xtc_write(Atom*, int, int, int);
|
||||
void xtc_end();
|
||||
#else
|
||||
#define xtc_init(a,b,c)
|
||||
#define xtc_write(a,b,c,d)
|
||||
#define xtc_end()
|
||||
#endif
|
||||
#endif
|
44
src/common/allocate.c
Normal file
44
src/common/allocate.c
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <util.h>
|
||||
|
||||
void *allocate(int alignment, size_t bytesize) {
|
||||
void *ptr;
|
||||
int errorCode;
|
||||
|
||||
errorCode = posix_memalign(&ptr, alignment, bytesize);
|
||||
if(errorCode == EINVAL) {
|
||||
fprintf(stderr, "Error: Alignment parameter is not a power of two\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if(errorCode == ENOMEM) {
|
||||
fprintf(stderr, "Error: Insufficient memory to fulfill the request\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if(ptr == NULL) {
|
||||
fprintf(stderr, "Error: posix_memalign failed!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void *reallocate(void* ptr, int alignment, size_t new_bytesize, size_t old_bytesize) {
|
||||
void *newarray = allocate(alignment, new_bytesize);
|
||||
if(ptr != NULL) {
|
||||
memcpy(newarray, ptr, old_bytesize);
|
||||
free(ptr);
|
||||
}
|
||||
|
||||
return newarray;
|
||||
}
|
13
src/common/allocate.h
Normal file
13
src/common/allocate.h
Normal file
@@ -0,0 +1,13 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifndef __ALLOCATE_H_
|
||||
#define __ALLOCATE_H_
|
||||
extern void* allocate (int alignment, size_t bytesize);
|
||||
extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
|
||||
#endif
|
68
src/common/device.c
Normal file
68
src/common/device.c
Normal file
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//---
|
||||
#include <device.h>
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
void cuda_assert(const char *label, cudaError_t err) {
|
||||
if (err != cudaSuccess) {
|
||||
printf("[CUDA Error]: %s: %s\r\n", label, cudaGetErrorString(err));
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void *allocateGPU(size_t bytesize) {
|
||||
void *ptr;
|
||||
#ifdef CUDA_HOST_MEMORY
|
||||
cuda_assert("allocateGPU", cudaMallocHost((void **) &ptr, bytesize));
|
||||
#else
|
||||
cuda_assert("allocateGPU", cudaMalloc((void **) &ptr, bytesize));
|
||||
#endif
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// Data is not preserved
|
||||
void *reallocateGPU(void *ptr, size_t new_bytesize) {
|
||||
if(ptr != NULL) {
|
||||
#ifdef CUDA_HOST_MEMORY
|
||||
cudaFreeHost(ptr);
|
||||
#else
|
||||
cudaFree(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
return allocateGPU(new_bytesize);
|
||||
}
|
||||
|
||||
void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {
|
||||
#ifndef CUDA_HOST_MEMORY
|
||||
cuda_assert("memcpyToGPU", cudaMemcpy(d_ptr, h_ptr, bytesize, cudaMemcpyHostToDevice));
|
||||
#endif
|
||||
}
|
||||
|
||||
void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {
|
||||
#ifndef CUDA_HOST_MEMORY
|
||||
cuda_assert("memcpyFromGPU", cudaMemcpy(h_ptr, d_ptr, bytesize, cudaMemcpyDeviceToHost));
|
||||
#endif
|
||||
}
|
||||
|
||||
void memsetGPU(void *d_ptr, int value, size_t bytesize) {
|
||||
cuda_assert("memsetGPU", cudaMemset(d_ptr, value, bytesize));
|
||||
}
|
||||
|
||||
#else
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {}
|
||||
void *allocateGPU(size_t bytesize) { return NULL; }
|
||||
void *reallocateGPU(void *ptr, size_t new_bytesize) { return NULL; }
|
||||
void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {}
|
||||
void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {}
|
||||
void memsetGPU(void *d_ptr, int value, size_t bytesize) {}
|
||||
#endif
|
26
src/common/device.h
Normal file
26
src/common/device.h
Normal file
@@ -0,0 +1,26 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stddef.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <neighbor.h>
|
||||
|
||||
#ifndef __DEVICE_H_
|
||||
#define __DEVICE_H_
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
#include <cuda_runtime.h>
|
||||
extern void cuda_assert(const char *msg, cudaError_t err);
|
||||
#endif
|
||||
|
||||
extern void initDevice(Atom*, Neighbor*);
|
||||
extern void *allocateGPU(size_t bytesize);
|
||||
extern void *reallocateGPU(void *ptr, size_t new_bytesize);
|
||||
extern void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize);
|
||||
extern void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize);
|
||||
extern void memsetGPU(void *d_ptr, int value, size_t bytesize);
|
||||
#endif
|
39
src/common/eam.h
Normal file
39
src/common/eam.h
Normal file
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __EAM_H_
|
||||
#define __EAM_H_
|
||||
typedef struct {
|
||||
int nrho, nr;
|
||||
MD_FLOAT drho, dr, cut, mass;
|
||||
MD_FLOAT *frho, *rhor, *zr;
|
||||
} Funcfl;
|
||||
|
||||
typedef struct {
|
||||
MD_FLOAT* fp;
|
||||
int nmax;
|
||||
int nrho, nr;
|
||||
int nrho_tot, nr_tot;
|
||||
MD_FLOAT dr, rdr, drho, rdrho;
|
||||
MD_FLOAT *frho, *rhor, *z2r;
|
||||
MD_FLOAT *rhor_spline, *frho_spline, *z2r_spline;
|
||||
Funcfl file;
|
||||
} Eam;
|
||||
|
||||
void initEam(Eam* eam, Parameter* param);
|
||||
void coeff(Eam* eam, Parameter* param);
|
||||
void init_style(Eam* eam, Parameter *param);
|
||||
void read_eam_file(Funcfl* file, const char* filename);
|
||||
void file2array(Eam* eam);
|
||||
void array2spline(Eam* eam, Parameter* param);
|
||||
void interpolate(int n, MD_FLOAT delta, MD_FLOAT* f, MD_FLOAT* spline);
|
||||
void grab(FILE* fptr, int n, MD_FLOAT* list);
|
||||
#endif
|
269
src/common/eam_utils.c
Normal file
269
src/common/eam_utils.c
Normal file
@@ -0,0 +1,269 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <eam.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
|
||||
#ifndef MAXLINE
|
||||
#define MAXLINE 4096
|
||||
#endif
|
||||
|
||||
void initEam(Eam* eam, Parameter* param) {
|
||||
int ntypes = param->ntypes;
|
||||
eam->nmax = 0;
|
||||
eam->fp = NULL;
|
||||
coeff(eam, param);
|
||||
init_style(eam, param);
|
||||
}
|
||||
|
||||
void coeff(Eam* eam, Parameter* param) {
|
||||
read_eam_file(&eam->file, param->eam_file);
|
||||
param->mass = eam->file.mass;
|
||||
param->cutforce = eam->file.cut;
|
||||
param->cutneigh = param->cutforce + 1.0;
|
||||
param->temp = 600.0;
|
||||
param->dt = 0.001;
|
||||
param->rho = 0.07041125;
|
||||
param->dtforce = 0.5 * param->dt / param->mass;
|
||||
}
|
||||
|
||||
void init_style(Eam* eam, Parameter* param) {
|
||||
// convert read-in file(s) to arrays and spline them
|
||||
file2array(eam);
|
||||
array2spline(eam, param);
|
||||
}
|
||||
|
||||
void read_eam_file(Funcfl* file, const char* filename) {
|
||||
FILE* fptr;
|
||||
char line[MAXLINE];
|
||||
|
||||
fptr = fopen(filename, "r");
|
||||
if(fptr == NULL) {
|
||||
printf("Can't open EAM Potential file: %s\n", filename);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
int tmp;
|
||||
readline(line, fptr);
|
||||
readline(line, fptr);
|
||||
sscanf(line, "%d %lg", &tmp, &(file->mass));
|
||||
readline(line, fptr);
|
||||
sscanf(line, "%d %lg %d %lg %lg", &file->nrho, &file->drho, &file->nr, &file->dr, &file->cut);
|
||||
|
||||
//printf("Read: %lf %i %lf %i %lf %lf\n",file->mass,file->nrho,file->drho,file->nr,file->dr,file->cut);
|
||||
file->frho = (MD_FLOAT *) allocate(ALIGNMENT, (file->nrho + 1) * sizeof(MD_FLOAT));
|
||||
file->rhor = (MD_FLOAT *) allocate(ALIGNMENT, (file->nr + 1) * sizeof(MD_FLOAT));
|
||||
file->zr = (MD_FLOAT *) allocate(ALIGNMENT, (file->nr + 1) * sizeof(MD_FLOAT));
|
||||
grab(fptr, file->nrho, file->frho);
|
||||
grab(fptr, file->nr, file->zr);
|
||||
grab(fptr, file->nr, file->rhor);
|
||||
for(int i = file->nrho; i > 0; i--) file->frho[i] = file->frho[i - 1];
|
||||
for(int i = file->nr; i > 0; i--) file->rhor[i] = file->rhor[i - 1];
|
||||
for(int i = file->nr; i > 0; i--) file->zr[i] = file->zr[i - 1];
|
||||
fclose(fptr);
|
||||
}
|
||||
|
||||
void file2array(Eam* eam) {
|
||||
int i, j, k, m, n;
|
||||
double sixth = 1.0 / 6.0;
|
||||
|
||||
// determine max function params from all active funcfl files
|
||||
// active means some element is pointing at it via map
|
||||
int active;
|
||||
double rmax, rhomax;
|
||||
eam->dr = eam->drho = rmax = rhomax = 0.0;
|
||||
active = 0;
|
||||
Funcfl* file = &eam->file;
|
||||
eam->dr = MAX(eam->dr, file->dr);
|
||||
eam->drho = MAX(eam->drho, file->drho);
|
||||
rmax = MAX(rmax, (file->nr - 1) * file->dr);
|
||||
rhomax = MAX(rhomax, (file->nrho - 1) * file->drho);
|
||||
|
||||
// set nr,nrho from cutoff and spacings
|
||||
// 0.5 is for round-off in divide
|
||||
eam->nr = (int)(rmax / eam->dr + 0.5);
|
||||
eam->nrho = (int)(rhomax / eam->drho + 0.5);
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// setup frho arrays
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
// allocate frho arrays
|
||||
// nfrho = # of funcfl files + 1 for zero array
|
||||
eam->frho = (MD_FLOAT *) allocate(ALIGNMENT, (eam->nrho + 1) * sizeof(MD_FLOAT));
|
||||
|
||||
// interpolate each file's frho to a single grid and cutoff
|
||||
double r, p, cof1, cof2, cof3, cof4;
|
||||
n = 0;
|
||||
|
||||
for(m = 1; m <= eam->nrho; m++) {
|
||||
r = (m - 1) * eam->drho;
|
||||
p = r / file->drho + 1.0;
|
||||
k = (int)(p);
|
||||
k = MIN(k, file->nrho - 2);
|
||||
k = MAX(k, 2);
|
||||
p -= k;
|
||||
p = MIN(p, 2.0);
|
||||
cof1 = -sixth * p * (p - 1.0) * (p - 2.0);
|
||||
cof2 = 0.5 * (p * p - 1.0) * (p - 2.0);
|
||||
cof3 = -0.5 * p * (p + 1.0) * (p - 2.0);
|
||||
cof4 = sixth * p * (p * p - 1.0);
|
||||
eam->frho[m] = cof1 * file->frho[k - 1] + cof2 * file->frho[k] +
|
||||
cof3 * file->frho[k + 1] + cof4 * file->frho[k + 2];
|
||||
}
|
||||
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// setup rhor arrays
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
// allocate rhor arrays
|
||||
// nrhor = # of funcfl files
|
||||
eam->rhor = (MD_FLOAT *) allocate(ALIGNMENT, (eam->nr + 1) * sizeof(MD_FLOAT));
|
||||
|
||||
// interpolate each file's rhor to a single grid and cutoff
|
||||
for(m = 1; m <= eam->nr; m++) {
|
||||
r = (m - 1) * eam->dr;
|
||||
p = r / file->dr + 1.0;
|
||||
k = (int)(p);
|
||||
k = MIN(k, file->nr - 2);
|
||||
k = MAX(k, 2);
|
||||
p -= k;
|
||||
p = MIN(p, 2.0);
|
||||
cof1 = -sixth * p * (p - 1.0) * (p - 2.0);
|
||||
cof2 = 0.5 * (p * p - 1.0) * (p - 2.0);
|
||||
cof3 = -0.5 * p * (p + 1.0) * (p - 2.0);
|
||||
cof4 = sixth * p * (p * p - 1.0);
|
||||
eam->rhor[m] = cof1 * file->rhor[k - 1] + cof2 * file->rhor[k] +
|
||||
cof3 * file->rhor[k + 1] + cof4 * file->rhor[k + 2];
|
||||
//if(m==119)printf("BuildRho: %e %e %e %e %e %e\n",rhor[m],cof1,cof2,cof3,cof4,file->rhor[k]);
|
||||
}
|
||||
|
||||
// type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
|
||||
// for funcfl files, I,J mapping only depends on I
|
||||
// OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
|
||||
|
||||
// ------------------------------------------------------------------
|
||||
// setup z2r arrays
|
||||
// ------------------------------------------------------------------
|
||||
|
||||
// allocate z2r arrays
|
||||
// nz2r = N*(N+1)/2 where N = # of funcfl files
|
||||
eam->z2r = (MD_FLOAT *) allocate(ALIGNMENT, (eam->nr + 1) * sizeof(MD_FLOAT));
|
||||
|
||||
// create a z2r array for each file against other files, only for I >= J
|
||||
// interpolate zri and zrj to a single grid and cutoff
|
||||
double zri, zrj;
|
||||
Funcfl* ifile = &eam->file;
|
||||
Funcfl* jfile = &eam->file;
|
||||
|
||||
for(m = 1; m <= eam->nr; m++) {
|
||||
r = (m - 1) * eam->dr;
|
||||
p = r / ifile->dr + 1.0;
|
||||
k = (int)(p);
|
||||
k = MIN(k, ifile->nr - 2);
|
||||
k = MAX(k, 2);
|
||||
p -= k;
|
||||
p = MIN(p, 2.0);
|
||||
cof1 = -sixth * p * (p - 1.0) * (p - 2.0);
|
||||
cof2 = 0.5 * (p * p - 1.0) * (p - 2.0);
|
||||
cof3 = -0.5 * p * (p + 1.0) * (p - 2.0);
|
||||
cof4 = sixth * p * (p * p - 1.0);
|
||||
zri = cof1 * ifile->zr[k - 1] + cof2 * ifile->zr[k] +
|
||||
cof3 * ifile->zr[k + 1] + cof4 * ifile->zr[k + 2];
|
||||
|
||||
p = r / jfile->dr + 1.0;
|
||||
k = (int)(p);
|
||||
k = MIN(k, jfile->nr - 2);
|
||||
k = MAX(k, 2);
|
||||
p -= k;
|
||||
p = MIN(p, 2.0);
|
||||
cof1 = -sixth * p * (p - 1.0) * (p - 2.0);
|
||||
cof2 = 0.5 * (p * p - 1.0) * (p - 2.0);
|
||||
cof3 = -0.5 * p * (p + 1.0) * (p - 2.0);
|
||||
cof4 = sixth * p * (p * p - 1.0);
|
||||
zrj = cof1 * jfile->zr[k - 1] + cof2 * jfile->zr[k] +
|
||||
cof3 * jfile->zr[k + 1] + cof4 * jfile->zr[k + 2];
|
||||
|
||||
eam->z2r[m] = 27.2 * 0.529 * zri * zrj;
|
||||
}
|
||||
}
|
||||
|
||||
void array2spline(Eam* eam, Parameter* param) {
|
||||
eam->rdr = 1.0 / eam->dr;
|
||||
eam->rdrho = 1.0 / eam->drho;
|
||||
eam->nrho_tot = (eam->nrho + 1) * 7 + 64;
|
||||
eam->nr_tot = (eam->nr + 1) * 7 + 64;
|
||||
eam->nrho_tot -= eam->nrho_tot%64;
|
||||
eam->nr_tot -= eam->nr_tot%64;
|
||||
|
||||
int ntypes = param->ntypes;
|
||||
eam->frho_spline = (MD_FLOAT *) allocate(ALIGNMENT, ntypes * ntypes * eam->nrho_tot * sizeof(MD_FLOAT));
|
||||
eam->rhor_spline = (MD_FLOAT *) allocate(ALIGNMENT, ntypes * ntypes * eam->nr_tot * sizeof(MD_FLOAT));
|
||||
eam->z2r_spline = (MD_FLOAT *) allocate(ALIGNMENT, ntypes * ntypes * eam->nr_tot * sizeof(MD_FLOAT));
|
||||
interpolate(eam->nrho, eam->drho, eam->frho, eam->frho_spline);
|
||||
interpolate(eam->nr, eam->dr, eam->rhor, eam->rhor_spline);
|
||||
interpolate(eam->nr, eam->dr, eam->z2r, eam->z2r_spline);
|
||||
|
||||
// replicate data for multiple types;
|
||||
for(int tt = 0; tt < ntypes * ntypes; tt++) {
|
||||
for(int k = 0; k < eam->nrho_tot; k++)
|
||||
eam->frho_spline[tt*eam->nrho_tot + k] = eam->frho_spline[k];
|
||||
for(int k = 0; k < eam->nr_tot; k++)
|
||||
eam->rhor_spline[tt*eam->nr_tot + k] = eam->rhor_spline[k];
|
||||
for(int k = 0; k < eam->nr_tot; k++)
|
||||
eam->z2r_spline[tt*eam->nr_tot + k] = eam->z2r_spline[k];
|
||||
}
|
||||
}
|
||||
|
||||
void interpolate(int n, MD_FLOAT delta, MD_FLOAT* f, MD_FLOAT* spline) {
|
||||
for(int m = 1; m <= n; m++) spline[m * 7 + 6] = f[m];
|
||||
|
||||
spline[1 * 7 + 5] = spline[2 * 7 + 6] - spline[1 * 7 + 6];
|
||||
spline[2 * 7 + 5] = 0.5 * (spline[3 * 7 + 6] - spline[1 * 7 + 6]);
|
||||
spline[(n - 1) * 7 + 5] = 0.5 * (spline[n * 7 + 6] - spline[(n - 2) * 7 + 6]);
|
||||
spline[n * 7 + 5] = spline[n * 7 + 6] - spline[(n - 1) * 7 + 6];
|
||||
|
||||
for(int m = 3; m <= n - 2; m++)
|
||||
spline[m * 7 + 5] = ((spline[(m - 2) * 7 + 6] - spline[(m + 2) * 7 + 6]) +
|
||||
8.0 * (spline[(m + 1) * 7 + 6] - spline[(m - 1) * 7 + 6])) / 12.0;
|
||||
|
||||
for(int m = 1; m <= n - 1; m++) {
|
||||
spline[m * 7 + 4] = 3.0 * (spline[(m + 1) * 7 + 6] - spline[m * 7 + 6]) -
|
||||
2.0 * spline[m * 7 + 5] - spline[(m + 1) * 7 + 5];
|
||||
spline[m * 7 + 3] = spline[m * 7 + 5] + spline[(m + 1) * 7 + 5] -
|
||||
2.0 * (spline[(m + 1) * 7 + 6] - spline[m * 7 + 6]);
|
||||
}
|
||||
|
||||
spline[n * 7 + 4] = 0.0;
|
||||
spline[n * 7 + 3] = 0.0;
|
||||
|
||||
for(int m = 1; m <= n; m++) {
|
||||
spline[m * 7 + 2] = spline[m * 7 + 5] / delta;
|
||||
spline[m * 7 + 1] = 2.0 * spline[m * 7 + 4] / delta;
|
||||
spline[m * 7 + 0] = 3.0 * spline[m * 7 + 3] / delta;
|
||||
}
|
||||
}
|
||||
|
||||
void grab(FILE* fptr, int n, MD_FLOAT* list) {
|
||||
char* ptr;
|
||||
char line[MAXLINE];
|
||||
int i = 0;
|
||||
|
||||
while(i < n) {
|
||||
readline(line, fptr);
|
||||
ptr = strtok(line, " \t\n\r\f");
|
||||
list[i++] = atof(ptr);
|
||||
while((ptr = strtok(NULL, " \t\n\r\f"))) list[i++] = atof(ptr);
|
||||
}
|
||||
}
|
146
src/common/likwid-marker.h
Normal file
146
src/common/likwid-marker.h
Normal file
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#ifndef LIKWID_MARKER_H
|
||||
#define LIKWID_MARKER_H
|
||||
|
||||
|
||||
/** \addtogroup MarkerAPI Marker API module
|
||||
* @{
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_MARKER_INIT
|
||||
Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_MARKER_THREADINIT
|
||||
Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_MARKER_REGISTER(regionTag)
|
||||
Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_MARKER_START(regionTag)
|
||||
Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_MARKER_STOP(regionTag)
|
||||
Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
|
||||
Shortcut for likwid_markerGetResults() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_MARKER_SWITCH
|
||||
Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_MARKER_RESET(regionTag)
|
||||
Shortcut for likwid_markerResetRegion() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_MARKER_CLOSE
|
||||
Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/** @}*/
|
||||
|
||||
#ifdef LIKWID_PERFMON
|
||||
#include <likwid.h>
|
||||
#define LIKWID_MARKER_INIT likwid_markerInit()
|
||||
#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
|
||||
#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
|
||||
#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
|
||||
#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
|
||||
#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
|
||||
#define LIKWID_MARKER_CLOSE likwid_markerClose()
|
||||
#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag)
|
||||
#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
|
||||
#else /* LIKWID_PERFMON */
|
||||
#define LIKWID_MARKER_INIT
|
||||
#define LIKWID_MARKER_THREADINIT
|
||||
#define LIKWID_MARKER_SWITCH
|
||||
#define LIKWID_MARKER_REGISTER(regionTag)
|
||||
#define LIKWID_MARKER_START(regionTag)
|
||||
#define LIKWID_MARKER_STOP(regionTag)
|
||||
#define LIKWID_MARKER_CLOSE
|
||||
#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
|
||||
#define LIKWID_MARKER_RESET(regionTag)
|
||||
#endif /* LIKWID_PERFMON */
|
||||
|
||||
|
||||
/** \addtogroup NvMarkerAPI NvMarker API module (MarkerAPI for Nvidia GPUs)
|
||||
* @{
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_NVMARKER_INIT
|
||||
Shortcut for likwid_gpuMarkerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_NVMARKER_THREADINIT
|
||||
Shortcut for likwid_gpuMarkerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_NVMARKER_REGISTER(regionTag)
|
||||
Shortcut for likwid_gpuMarkerRegisterRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_NVMARKER_START(regionTag)
|
||||
Shortcut for likwid_gpuMarkerStartRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_NVMARKER_STOP(regionTag)
|
||||
Shortcut for likwid_gpuMarkerStopRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count)
|
||||
Shortcut for likwid_gpuMarkerGetRegion() for \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_NVMARKER_SWITCH
|
||||
Shortcut for likwid_gpuMarkerNextGroup() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_NVMARKER_RESET(regionTag)
|
||||
Shortcut for likwid_gpuMarkerResetRegion() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
|
||||
*/
|
||||
/*!
|
||||
\def LIKWID_NVMARKER_CLOSE
|
||||
Shortcut for likwid_gpuMarkerClose() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
|
||||
*/
|
||||
/** @}*/
|
||||
|
||||
#ifdef LIKWID_NVMON
|
||||
#ifndef LIKWID_WITH_NVMON
|
||||
#define LIKWID_WITH_NVMON
|
||||
#endif
|
||||
#include <likwid.h>
|
||||
#define LIKWID_NVMARKER_INIT likwid_gpuMarkerInit()
|
||||
#define LIKWID_NVMARKER_THREADINIT likwid_gpuMarkerThreadInit()
|
||||
#define LIKWID_NVMARKER_SWITCH likwid_gpuMarkerNextGroup()
|
||||
#define LIKWID_NVMARKER_REGISTER(regionTag) likwid_gpuMarkerRegisterRegion(regionTag)
|
||||
#define LIKWID_NVMARKER_START(regionTag) likwid_gpuMarkerStartRegion(regionTag)
|
||||
#define LIKWID_NVMARKER_STOP(regionTag) likwid_gpuMarkerStopRegion(regionTag)
|
||||
#define LIKWID_NVMARKER_CLOSE likwid_gpuMarkerClose()
|
||||
#define LIKWID_NVMARKER_RESET(regionTag) likwid_gpuMarkerResetRegion(regionTag)
|
||||
#define LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count) \
|
||||
likwid_gpuMarkerGetRegion(regionTag, ngpus, nevents, events, time, count)
|
||||
#else /* LIKWID_NVMON */
|
||||
#define LIKWID_NVMARKER_INIT
|
||||
#define LIKWID_NVMARKER_THREADINIT
|
||||
#define LIKWID_NVMARKER_SWITCH
|
||||
#define LIKWID_NVMARKER_REGISTER(regionTag)
|
||||
#define LIKWID_NVMARKER_START(regionTag)
|
||||
#define LIKWID_NVMARKER_STOP(regionTag)
|
||||
#define LIKWID_NVMARKER_CLOSE
|
||||
#define LIKWID_NVMARKER_GET(regionTag, nevents, events, time, count)
|
||||
#define LIKWID_NVMARKER_RESET(regionTag)
|
||||
#endif /* LIKWID_NVMON */
|
||||
|
||||
|
||||
|
||||
#endif /* LIKWID_MARKER_H */
|
186
src/common/parameter.c
Normal file
186
src/common/parameter.c
Normal file
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
|
||||
void initParameter(Parameter *param) {
|
||||
param->input_file = NULL;
|
||||
param->vtk_file = NULL;
|
||||
param->xtc_file = NULL;
|
||||
param->eam_file = NULL;
|
||||
param->write_atom_file = NULL;
|
||||
param->force_field = FF_LJ;
|
||||
param->epsilon = 1.0;
|
||||
param->sigma = 1.0;
|
||||
param->sigma6 = 1.0;
|
||||
param->rho = 0.8442;
|
||||
param->ntypes = 4;
|
||||
param->ntimes = 200;
|
||||
param->dt = 0.005;
|
||||
param->nx = 32;
|
||||
param->ny = 32;
|
||||
param->nz = 32;
|
||||
param->pbc_x = 1;
|
||||
param->pbc_y = 1;
|
||||
param->pbc_z = 1;
|
||||
param->cutforce = 2.5;
|
||||
param->skin = 0.3;
|
||||
param->cutneigh = param->cutforce + param->skin;
|
||||
param->temp = 1.44;
|
||||
param->nstat = 100;
|
||||
param->mass = 1.0;
|
||||
param->dtforce = 0.5 * param->dt;
|
||||
param->reneigh_every = 20;
|
||||
param->prune_every = 1000;
|
||||
param->x_out_every = 20;
|
||||
param->v_out_every = 5;
|
||||
param->half_neigh = 0;
|
||||
param->proc_freq = 2.4;
|
||||
// DEM
|
||||
param->k_s = 1.0;
|
||||
param->k_dn = 1.0;
|
||||
param->gx = 0.0;
|
||||
param->gy = 0.0;
|
||||
param->gz = 0.0;
|
||||
param->reflect_x = 0.0;
|
||||
param->reflect_y = 0.0;
|
||||
param->reflect_z = 0.0;
|
||||
}
|
||||
|
||||
void readParameter(Parameter *param, const char *filename) {
|
||||
FILE *fp = fopen(filename, "r");
|
||||
char line[MAXLINE];
|
||||
int i;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open parameter file: %s\n", filename);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
while(!feof(fp)) {
|
||||
line[0] = '\0';
|
||||
readline(line, fp);
|
||||
for(i = 0; line[i] != '\0' && line[i] != '#'; i++);
|
||||
line[i] = '\0';
|
||||
|
||||
char *tok = strtok(line, " ");
|
||||
char *val = strtok(NULL, " ");
|
||||
|
||||
#define PARSE_PARAM(p,f) if(strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) { param->p = f(val); }
|
||||
#define PARSE_STRING(p) PARSE_PARAM(p, strdup)
|
||||
#define PARSE_INT(p) PARSE_PARAM(p, atoi)
|
||||
#define PARSE_REAL(p) PARSE_PARAM(p, atof)
|
||||
|
||||
if(tok != NULL && val != NULL) {
|
||||
PARSE_PARAM(force_field, str2ff);
|
||||
PARSE_STRING(input_file);
|
||||
PARSE_STRING(eam_file);
|
||||
PARSE_STRING(vtk_file);
|
||||
PARSE_STRING(xtc_file);
|
||||
PARSE_REAL(epsilon);
|
||||
PARSE_REAL(sigma);
|
||||
PARSE_REAL(k_s);
|
||||
PARSE_REAL(k_dn);
|
||||
PARSE_REAL(reflect_x);
|
||||
PARSE_REAL(reflect_y);
|
||||
PARSE_REAL(reflect_z);
|
||||
PARSE_REAL(gx);
|
||||
PARSE_REAL(gy);
|
||||
PARSE_REAL(gz);
|
||||
PARSE_REAL(rho);
|
||||
PARSE_REAL(dt);
|
||||
PARSE_REAL(cutforce);
|
||||
PARSE_REAL(skin);
|
||||
PARSE_REAL(temp);
|
||||
PARSE_REAL(mass);
|
||||
PARSE_REAL(proc_freq);
|
||||
PARSE_INT(ntypes);
|
||||
PARSE_INT(ntimes);
|
||||
PARSE_INT(nx);
|
||||
PARSE_INT(ny);
|
||||
PARSE_INT(nz);
|
||||
PARSE_INT(pbc_x);
|
||||
PARSE_INT(pbc_y);
|
||||
PARSE_INT(pbc_z);
|
||||
PARSE_INT(nstat);
|
||||
PARSE_INT(reneigh_every);
|
||||
PARSE_INT(prune_every);
|
||||
PARSE_INT(x_out_every);
|
||||
PARSE_INT(v_out_every);
|
||||
PARSE_INT(half_neigh);
|
||||
}
|
||||
}
|
||||
|
||||
// Update dtforce
|
||||
param->dtforce = 0.5 * param->dt;
|
||||
|
||||
// Update sigma6 parameter
|
||||
MD_FLOAT s2 = param->sigma * param->sigma;
|
||||
param->sigma6 = s2 * s2 * s2;
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
void printParameter(Parameter *param) {
|
||||
printf("Parameters:\n");
|
||||
if(param->input_file != NULL) {
|
||||
printf("\tInput file: %s\n", param->input_file);
|
||||
}
|
||||
|
||||
if(param->vtk_file != NULL) {
|
||||
printf("\tVTK file: %s\n", param->vtk_file);
|
||||
}
|
||||
|
||||
if(param->xtc_file != NULL) {
|
||||
printf("\tXTC file: %s\n", param->xtc_file);
|
||||
}
|
||||
|
||||
if(param->eam_file != NULL) {
|
||||
printf("\tEAM file: %s\n", param->eam_file);
|
||||
}
|
||||
|
||||
printf("\tForce field: %s\n", ff2str(param->force_field));
|
||||
#ifdef CLUSTER_M
|
||||
printf("\tKernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
|
||||
#else
|
||||
printf("\tKernel: %s\n", KERNEL_NAME);
|
||||
#endif
|
||||
printf("\tData layout: %s\n", POS_DATA_LAYOUT);
|
||||
printf("\tFloating-point precision: %s\n", PRECISION_STRING);
|
||||
printf("\tUnit cells (nx, ny, nz): %d, %d, %d\n", param->nx, param->ny, param->nz);
|
||||
printf("\tDomain box sizes (x, y, z): %e, %e, %e\n", param->xprd, param->yprd, param->zprd);
|
||||
printf("\tPeriodic (x, y, z): %d, %d, %d\n", param->pbc_x, param->pbc_y, param->pbc_z);
|
||||
printf("\tLattice size: %e\n", param->lattice);
|
||||
printf("\tEpsilon: %e\n", param->epsilon);
|
||||
printf("\tSigma: %e\n", param->sigma);
|
||||
printf("\tSpring constant: %e\n", param->k_s);
|
||||
printf("\tDamping constant: %e\n", param->k_dn);
|
||||
printf("\tTemperature: %e\n", param->temp);
|
||||
printf("\tRHO: %e\n", param->rho);
|
||||
printf("\tMass: %e\n", param->mass);
|
||||
printf("\tNumber of types: %d\n", param->ntypes);
|
||||
printf("\tNumber of timesteps: %d\n", param->ntimes);
|
||||
printf("\tReport stats every (timesteps): %d\n", param->nstat);
|
||||
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
|
||||
#ifdef SORT_ATOMS
|
||||
printf("\tSort atoms when reneighboring: yes\n");
|
||||
#else
|
||||
printf("\tSort atoms when reneighboring: no\n");
|
||||
#endif
|
||||
printf("\tPrune every (timesteps): %d\n", param->prune_every);
|
||||
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
|
||||
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
|
||||
printf("\tDelta time (dt): %e\n", param->dt);
|
||||
printf("\tCutoff radius: %e\n", param->cutforce);
|
||||
printf("\tSkin: %e\n", param->skin);
|
||||
printf("\tHalf neighbor lists: %d\n", param->half_neigh);
|
||||
printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
|
||||
}
|
62
src/common/parameter.h
Normal file
62
src/common/parameter.h
Normal file
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#ifndef __PARAMETER_H_
|
||||
#define __PARAMETER_H_
|
||||
|
||||
#if PRECISION == 1
|
||||
# define MD_FLOAT float
|
||||
# define MD_UINT unsigned int
|
||||
#else
|
||||
# define MD_FLOAT double
|
||||
# define MD_UINT unsigned long long int
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
int force_field;
|
||||
char* param_file;
|
||||
char* input_file;
|
||||
char* vtk_file;
|
||||
char* xtc_file;
|
||||
char* write_atom_file;
|
||||
MD_FLOAT epsilon;
|
||||
MD_FLOAT sigma;
|
||||
MD_FLOAT sigma6;
|
||||
MD_FLOAT temp;
|
||||
MD_FLOAT rho;
|
||||
MD_FLOAT mass;
|
||||
int ntypes;
|
||||
int ntimes;
|
||||
int nstat;
|
||||
int reneigh_every;
|
||||
int prune_every;
|
||||
int x_out_every;
|
||||
int v_out_every;
|
||||
int half_neigh;
|
||||
MD_FLOAT dt;
|
||||
MD_FLOAT dtforce;
|
||||
MD_FLOAT skin;
|
||||
MD_FLOAT cutforce;
|
||||
MD_FLOAT cutneigh;
|
||||
int nx, ny, nz;
|
||||
int pbc_x, pbc_y, pbc_z;
|
||||
MD_FLOAT lattice;
|
||||
MD_FLOAT xlo, xhi, ylo, yhi, zlo, zhi;
|
||||
MD_FLOAT xprd, yprd, zprd;
|
||||
double proc_freq;
|
||||
char* eam_file;
|
||||
// DEM
|
||||
MD_FLOAT k_s;
|
||||
MD_FLOAT k_dn;
|
||||
MD_FLOAT gx, gy, gz;
|
||||
MD_FLOAT reflect_x, reflect_y, reflect_z;
|
||||
} Parameter;
|
||||
|
||||
void initParameter(Parameter*);
|
||||
void readParameter(Parameter*, const char*);
|
||||
void printParameter(Parameter*);
|
||||
|
||||
#endif
|
68
src/common/simd.h
Normal file
68
src/common/simd.h
Normal file
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#ifndef __SIMD_H__
|
||||
#define __SIMD_H__
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
#ifndef NO_ZMM_INTRIN
|
||||
# include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifndef CLUSTER_M
|
||||
# define CLUSTER_M 1
|
||||
#endif
|
||||
|
||||
#ifndef CLUSTER_N
|
||||
# define CLUSTER_N 1
|
||||
#endif
|
||||
|
||||
#if defined(__ISA_AVX512__)
|
||||
# if PRECISION == 2
|
||||
# include "simd/avx512_double.h"
|
||||
# else
|
||||
# include "simd/avx512_float.h"
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(__ISA_AVX2__)
|
||||
# if PRECISION == 2
|
||||
# include "simd/avx2_double.h"
|
||||
# else
|
||||
# include "simd/avx2_float.h"
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(__ISA_AVX__)
|
||||
# if PRECISION == 2
|
||||
# include "simd/avx_double.h"
|
||||
# else
|
||||
# include "simd/avx_float.h"
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
|
||||
#define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);
|
||||
|
||||
static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
|
||||
double x[VECTOR_WIDTH];
|
||||
memcpy(x, &a, sizeof(x));
|
||||
|
||||
fprintf(stdout, "%s: ", ref);
|
||||
for(int i = 0; i < VECTOR_WIDTH; i++) {
|
||||
fprintf(stdout, "%f ", x[i]);
|
||||
}
|
||||
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
static inline void simd_print_mask(const char *ref, MD_SIMD_MASK a) { fprintf(stdout, "%s: %x\n", ref, simd_mask_to_u32(a)); }
|
||||
|
||||
#endif // __SIMD_H__
|
103
src/common/simd/avx2_double.h
Normal file
103
src/common/simd/avx2_double.h
Normal file
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
#define MD_SIMD_FLOAT __m256d
|
||||
#define MD_SIMD_INT __m128i
|
||||
#define MD_SIMD_MASK __m256d
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_pd(p, a); }
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
|
||||
MD_SIMD_FLOAT ret;
|
||||
fprintf(stderr, "simd_load_h_duplicate(): Not implemented for AVX2 with double precision!");
|
||||
exit(-1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
|
||||
MD_SIMD_FLOAT ret;
|
||||
fprintf(stderr, "simd_load_h_dual(): Not implemented for AVX2 with double precision!");
|
||||
exit(-1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
fprintf(stderr, "simd_h_dual_incr_reduced_sum(): Not implemented for AVX2 with double precision!");
|
||||
exit(-1);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
__m256d t0, t1, t2;
|
||||
__m128d a0, a1;
|
||||
|
||||
t0 = _mm256_hadd_pd(v0, v1);
|
||||
t1 = _mm256_hadd_pd(v2, v3);
|
||||
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
|
||||
t0 = _mm256_add_pd(t0, t2);
|
||||
t1 = _mm256_add_pd(t1, t2);
|
||||
t0 = _mm256_blend_pd(t0, t1, 0xC);
|
||||
//t0 = _mm256_blend_pd(t0, t1, 0b1100);
|
||||
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
|
||||
_mm256_store_pd(m, t1);
|
||||
|
||||
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
|
||||
//t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
|
||||
a0 = _mm256_castpd256_pd128(t0);
|
||||
a1 = _mm256_extractf128_pd(t0, 0x1);
|
||||
a0 = _mm_add_sd(a0, a1);
|
||||
return *((MD_FLOAT *) &a0);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_and_pd(a, m); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
|
||||
//static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cvtepi32_pd(_mm_cmplt_epi32(a, b)); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); }
|
||||
// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
|
||||
const unsigned long long int all = 0xFFFFFFFFFFFFFFFF;
|
||||
const unsigned long long int none = 0x0;
|
||||
return _mm256_castsi256_pd(_mm256_set_epi64x((a & 0x8) ? all : none, (a & 0x4) ? all : none, (a & 0x2) ? all : none, (a & 0x1) ? all : none));
|
||||
}
|
||||
// TODO: Implement this, althrough it is just required for debugging
|
||||
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
__m128d a0, a1;
|
||||
// test with shuffle & add as an alternative to hadd later
|
||||
a = _mm256_hadd_pd(a, a);
|
||||
a0 = _mm256_castpd256_pd128(a);
|
||||
a1 = _mm256_extractf128_pd(a, 0x1);
|
||||
a0 = _mm_add_sd(a0, a1);
|
||||
return *((MD_FLOAT *) &a0);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
fprintf(stderr, "simd_h_decr3(): Not implemented for AVX2 with double precision!");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// Functions used in LAMMPS kernel
|
||||
#define simd_gather(vidx, m, s) _mm256_i32gather_pd(m, vidx, s);
|
||||
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
|
||||
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
|
||||
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
|
||||
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm_load_si128((__m128i const *) m); }
|
||||
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_add_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_mul_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return simd_int_load(m) & _mm256_cvtpd_epi32(k); }
|
84
src/common/simd/avx2_float.h
Normal file
84
src/common/simd/avx2_float.h
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <immintrin.h>
|
||||
#include <zmmintrin.h>
|
||||
|
||||
#define MD_SIMD_FLOAT __m256
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_ps(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_ps(p, a); }
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_mask_mov_ps(_mm256_setzero_ps(), m, a); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
|
||||
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
__m128 t0;
|
||||
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
__m128 t0, t2;
|
||||
v0 = _mm256_hadd_ps(v0, v1);
|
||||
v2 = _mm256_hadd_ps(v2, v3);
|
||||
v0 = _mm256_hadd_ps(v0, v2);
|
||||
t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 0x1));
|
||||
t2 = _mm_add_ps(t0, _mm_load_ps(m));
|
||||
_mm_store_ps(m, t2);
|
||||
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
|
||||
return _mm256_broadcast_ps((const __m128 *)(m));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
|
||||
__m128 t0, t1;
|
||||
t0 = _mm_broadcast_ss(m);
|
||||
t1 = _mm_broadcast_ss(m + 1);
|
||||
return _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t1, 0x1);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
__m128 t0, t1;
|
||||
v0 = _mm256_hadd_ps(v0, v1);
|
||||
t0 = _mm256_extractf128_ps(v0, 0x1);
|
||||
t0 = _mm_hadd_ps(_mm256_castps256_ps128(v0), t0);
|
||||
t0 = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
t1 = _mm_add_ps(t0, _mm_load_ps(m));
|
||||
_mm_store_ps(m, t1);
|
||||
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m128 asum = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
|
||||
_mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
simd_h_decr(m, a0);
|
||||
simd_h_decr(m + CLUSTER_N, a1);
|
||||
simd_h_decr(m + CLUSTER_N * 2, a2);
|
||||
}
|
114
src/common/simd/avx512_double.h
Normal file
114
src/common/simd/avx512_double.h
Normal file
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <immintrin.h>
|
||||
#ifndef NO_ZMM_INTRIN
|
||||
# include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#define MD_SIMD_FLOAT __m512d
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_BITMASK MD_SIMD_INT
|
||||
#define MD_SIMD_IBOOL __mmask16
|
||||
|
||||
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_pd(a, b, c); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_pd(a); }
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_pd(a, m, a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
|
||||
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm512_store_pd(p, a); }
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_pd(_mm512_setzero_pd(), m, a); }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
|
||||
x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
|
||||
x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
|
||||
return *((MD_FLOAT *) &x);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
__m512d t0, t2;
|
||||
__m256d t3, t4;
|
||||
|
||||
t0 = _mm512_add_pd(v0, _mm512_permute_pd(v0, 0x55));
|
||||
t2 = _mm512_add_pd(v2, _mm512_permute_pd(v2, 0x55));
|
||||
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xaa), v1, _mm512_permute_pd(v1, 0x55));
|
||||
t2 = _mm512_mask_add_pd(t2, simd_mask_from_u32(0xaa), v3, _mm512_permute_pd(v3, 0x55));
|
||||
t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0x4e));
|
||||
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xF0), t2, _mm512_shuffle_f64x2(t2, t2, 0x4e));
|
||||
t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0xb1));
|
||||
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0x0C), t0, t0, 0xee);
|
||||
t3 = _mm512_castpd512_pd256(t0);
|
||||
t4 = _mm256_load_pd(m);
|
||||
t4 = _mm256_add_pd(t4, t3);
|
||||
_mm256_store_pd(m, t4);
|
||||
|
||||
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
|
||||
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
|
||||
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
|
||||
return _mm512_broadcast_f64x4(_mm256_load_pd(m));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
|
||||
return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
__m512d t0;
|
||||
__m256d t2, t3;
|
||||
|
||||
t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
|
||||
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
|
||||
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
|
||||
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
|
||||
t2 = _mm512_castpd512_pd256(t0);
|
||||
t3 = _mm256_load_pd(m);
|
||||
t3 = _mm256_add_pd(t3, t2);
|
||||
_mm256_store_pd(m, t3);
|
||||
|
||||
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
|
||||
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
|
||||
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
|
||||
}
|
||||
|
||||
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m256d t;
|
||||
a = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
|
||||
t = _mm256_load_pd(m);
|
||||
t = _mm256_sub_pd(t, _mm512_castpd512_pd256(a));
|
||||
_mm256_store_pd(m, t);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
simd_h_decr(m, a0);
|
||||
simd_h_decr(m + CLUSTER_N, a1);
|
||||
simd_h_decr(m + CLUSTER_N * 2, a2);
|
||||
}
|
||||
|
||||
// Functions used in LAMMPS kernel
|
||||
//static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm512_i32gather_pd(vidx, m, s); }
|
||||
#define simd_gather(vidx,m,s) (_mm512_i32gather_pd(vidx, m, s))
|
||||
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm256_set1_epi32(scalar); }
|
||||
static inline MD_SIMD_INT simd_int_zero() { return _mm256_setzero_si256(); }
|
||||
static inline MD_SIMD_INT simd_int_seq() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
|
||||
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_si256((const MD_SIMD_INT *) m); }
|
||||
//static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_epi32(m); }
|
||||
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_add_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_mul_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return _mm256_mask_load_epi32(simd_int_zero(), k, m); }
|
||||
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cmp_epi32_mask(a, b, _MM_CMPINT_LT); }
|
103
src/common/simd/avx512_float.h
Normal file
103
src/common/simd/avx512_float.h
Normal file
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#ifndef NO_ZMM_INTRIN
|
||||
# include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#define MD_SIMD_FLOAT __m512
|
||||
#define MD_SIMD_MASK __mmask16
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_IBOOL __mmask16
|
||||
#define MD_SIMD_INT32 __m512i
|
||||
#define MD_SIMD_BITMASK MD_SIMD_INT32
|
||||
|
||||
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
|
||||
return _mm512_load_si512(m);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
|
||||
return _mm512_set1_epi32(a);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
|
||||
return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
|
||||
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm512_store_ps(p, a); }
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_ps(_mm512_setzero_ps(), m, a); }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
// This would only be called in a Mx16 configuration, which is not valid in GROMACS
|
||||
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
|
||||
exit(-1);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
// This would only be called in a Mx16 configuration, which is not valid in GROMACS
|
||||
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
|
||||
exit(-1);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
|
||||
return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
|
||||
return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
__m512 t0, t1;
|
||||
__m128 t2, t3;
|
||||
|
||||
t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
|
||||
t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
|
||||
t0 = _mm512_add_ps(t0, t1);
|
||||
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
|
||||
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
|
||||
t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
|
||||
t3 = _mm512_castps512_ps128(t0);
|
||||
t2 = _mm_load_ps(m);
|
||||
t2 = _mm_add_ps(t2, t3);
|
||||
_mm_store_ps(m, t2);
|
||||
|
||||
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
|
||||
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
|
||||
return _mm_cvtss_f32(t3);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m256 t;
|
||||
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
|
||||
t = _mm256_load_ps(m);
|
||||
t = _mm256_sub_ps(t, _mm512_castps512_ps256(a));
|
||||
_mm256_store_ps(m, t);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
simd_h_decr(m, a0);
|
||||
simd_h_decr(m + CLUSTER_N, a1);
|
||||
simd_h_decr(m + CLUSTER_N * 2, a2);
|
||||
}
|
103
src/common/simd/avx_double.h
Normal file
103
src/common/simd/avx_double.h
Normal file
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
#define MD_SIMD_FLOAT __m256d
|
||||
#define MD_SIMD_INT __m128i
|
||||
#define MD_SIMD_MASK __m256d
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_pd(p, a); }
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
|
||||
MD_SIMD_FLOAT ret;
|
||||
fprintf(stderr, "simd_load_h_duplicate(): Not implemented for AVX with double precision!");
|
||||
exit(-1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
|
||||
MD_SIMD_FLOAT ret;
|
||||
fprintf(stderr, "simd_load_h_dual(): Not implemented for AVX with double precision!");
|
||||
exit(-1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
fprintf(stderr, "simd_h_dual_incr_reduced_sum(): Not implemented for AVX with double precision!");
|
||||
exit(-1);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
__m256d t0, t1, t2;
|
||||
__m128d a0, a1;
|
||||
|
||||
t0 = _mm256_hadd_pd(v0, v1);
|
||||
t1 = _mm256_hadd_pd(v2, v3);
|
||||
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
|
||||
t0 = _mm256_add_pd(t0, t2);
|
||||
t1 = _mm256_add_pd(t1, t2);
|
||||
t0 = _mm256_blend_pd(t0, t1, 0b1100);
|
||||
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
|
||||
_mm256_store_pd(m, t1);
|
||||
|
||||
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
|
||||
a0 = _mm256_castpd256_pd128(t0);
|
||||
a1 = _mm256_extractf128_pd(t0, 0x1);
|
||||
a0 = _mm_add_sd(a0, a1);
|
||||
return *((MD_FLOAT *) &a0);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_and_pd(a, m); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
|
||||
#ifdef __ISA_AVX_FMA__
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
|
||||
#else
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
|
||||
#endif
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cvtepi32_pd(_mm_cmplt_epi32(a, b)); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); }
|
||||
// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
|
||||
const unsigned long long int all = 0xFFFFFFFFFFFFFFFF;
|
||||
const unsigned long long int none = 0x0;
|
||||
return _mm256_castsi256_pd(_mm256_set_epi64x((a & 0x8) ? all : none, (a & 0x4) ? all : none, (a & 0x2) ? all : none, (a & 0x1) ? all : none));
|
||||
}
|
||||
// TODO: Implement this, althrough it is just required for debugging
|
||||
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
__m128d a0, a1;
|
||||
a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
|
||||
a0 = _mm256_castpd256_pd128(a);
|
||||
a1 = _mm256_extractf128_pd(a, 0x1);
|
||||
a0 = _mm_add_sd(a0, a1);
|
||||
return *((MD_FLOAT *) &a0);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
fprintf(stderr, "simd_h_decr3(): Not implemented for AVX with double precision!");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// Functions used in LAMMPS kernel
|
||||
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
|
||||
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
|
||||
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
|
||||
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
|
||||
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm_load_si128((__m128i const *) m); }
|
||||
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_add_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_mul_epi32(a, b); }
|
||||
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return simd_int_load(m) & _mm256_cvtpd_epi32(k); }
|
84
src/common/simd/avx_float.h
Normal file
84
src/common/simd/avx_float.h
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <immintrin.h>
|
||||
#include <zmmintrin.h>
|
||||
|
||||
#define MD_SIMD_FLOAT __m256
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_ps(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
|
||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
|
||||
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_ps(p, a); }
|
||||
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_mask_mov_ps(_mm256_setzero_ps(), m, a); }
|
||||
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
|
||||
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
|
||||
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
|
||||
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
|
||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
|
||||
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
|
||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||
__m128 t0;
|
||||
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
|
||||
__m128 t0, t2;
|
||||
v0 = _mm256_hadd_ps(v0, v1);
|
||||
v2 = _mm256_hadd_ps(v2, v3);
|
||||
v0 = _mm256_hadd_ps(v0, v2);
|
||||
t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 0x1));
|
||||
t2 = _mm_add_ps(t0, _mm_load_ps(m));
|
||||
_mm_store_ps(m, t2);
|
||||
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
|
||||
return _mm256_broadcast_ps((const __m128 *)(m));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
|
||||
__m128 t0, t1;
|
||||
t0 = _mm_broadcast_ss(m);
|
||||
t1 = _mm_broadcast_ss(m + 1);
|
||||
return _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t1, 0x1);
|
||||
}
|
||||
|
||||
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
|
||||
__m128 t0, t1;
|
||||
v0 = _mm256_hadd_ps(v0, v1);
|
||||
t0 = _mm256_extractf128_ps(v0, 0x1);
|
||||
t0 = _mm_hadd_ps(_mm256_castps256_ps128(v0), t0);
|
||||
t0 = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
t1 = _mm_add_ps(t0, _mm_load_ps(m));
|
||||
_mm_store_ps(m, t1);
|
||||
|
||||
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
|
||||
return *((MD_FLOAT *) &t0);
|
||||
}
|
||||
|
||||
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m128 asum = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
|
||||
_mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
|
||||
}
|
||||
|
||||
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||
simd_h_decr(m, a0);
|
||||
simd_h_decr(m + CLUSTER_N, a1);
|
||||
simd_h_decr(m + CLUSTER_N * 2, a2);
|
||||
}
|
115
src/common/thermo.c
Normal file
115
src/common/thermo.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <thermo.h>
|
||||
#include <util.h>
|
||||
|
||||
static int *steparr;
|
||||
static MD_FLOAT *tmparr;
|
||||
static MD_FLOAT *engarr;
|
||||
static MD_FLOAT *prsarr;
|
||||
static MD_FLOAT mvv2e;
|
||||
static MD_FLOAT dof_boltz;
|
||||
static MD_FLOAT t_scale;
|
||||
static MD_FLOAT p_scale;
|
||||
static MD_FLOAT e_scale;
|
||||
static MD_FLOAT t_act;
|
||||
static MD_FLOAT p_act;
|
||||
static MD_FLOAT e_act;
|
||||
static int mstat;
|
||||
|
||||
/* exported subroutines */
|
||||
void setupThermo(Parameter *param, int natoms)
|
||||
{
|
||||
int maxstat = param->ntimes / param->nstat + 2;
|
||||
|
||||
steparr = (int*) malloc(maxstat * sizeof(int));
|
||||
tmparr = (MD_FLOAT*) malloc(maxstat * sizeof(MD_FLOAT));
|
||||
engarr = (MD_FLOAT*) malloc(maxstat * sizeof(MD_FLOAT));
|
||||
prsarr = (MD_FLOAT*) malloc(maxstat * sizeof(MD_FLOAT));
|
||||
|
||||
if(param->force_field == FF_LJ) {
|
||||
mvv2e = 1.0;
|
||||
dof_boltz = (natoms * 3 - 3);
|
||||
t_scale = mvv2e / dof_boltz;
|
||||
p_scale = 1.0 / 3 / param->xprd / param->yprd / param->zprd;
|
||||
e_scale = 0.5;
|
||||
} else if(param->force_field == FF_EAM) {
|
||||
mvv2e = 1.036427e-04;
|
||||
dof_boltz = (natoms * 3 - 3) * 8.617343e-05;
|
||||
t_scale = mvv2e / dof_boltz;
|
||||
p_scale = 1.602176e+06 / 3 / param->xprd / param->yprd / param->zprd;
|
||||
e_scale = 524287.985533;//16.0;
|
||||
param->dtforce /= mvv2e;
|
||||
}
|
||||
}
|
||||
|
||||
void computeThermo(int iflag, Parameter *param, Atom *atom)
|
||||
{
|
||||
MD_FLOAT t = 0.0, p;
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
|
||||
}
|
||||
|
||||
t = t * t_scale;
|
||||
p = (t * dof_boltz) * p_scale;
|
||||
int istep = iflag;
|
||||
|
||||
if(iflag == -1){
|
||||
istep = param->ntimes;
|
||||
}
|
||||
if(iflag == 0){
|
||||
mstat = 0;
|
||||
}
|
||||
|
||||
steparr[mstat] = istep;
|
||||
tmparr[mstat] = t;
|
||||
prsarr[mstat] = p;
|
||||
mstat++;
|
||||
fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
|
||||
}
|
||||
|
||||
void adjustThermo(Parameter *param, Atom *atom)
|
||||
{
|
||||
/* zero center-of-mass motion */
|
||||
MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
vxtot += atom_vx(i);
|
||||
vytot += atom_vy(i);
|
||||
vztot += atom_vz(i);
|
||||
}
|
||||
|
||||
vxtot = vxtot / atom->Natoms;
|
||||
vytot = vytot / atom->Natoms;
|
||||
vztot = vztot / atom->Natoms;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
atom_vx(i) -= vxtot;
|
||||
atom_vy(i) -= vytot;
|
||||
atom_vz(i) -= vztot;
|
||||
}
|
||||
|
||||
t_act = 0;
|
||||
MD_FLOAT t = 0.0;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
|
||||
}
|
||||
|
||||
t *= t_scale;
|
||||
MD_FLOAT factor = sqrt(param->temp / t);
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
atom_vx(i) *= factor;
|
||||
atom_vy(i) *= factor;
|
||||
atom_vz(i) *= factor;
|
||||
}
|
||||
}
|
15
src/common/thermo.h
Normal file
15
src/common/thermo.h
Normal file
@@ -0,0 +1,15 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
|
||||
#ifndef __THERMO_H_
|
||||
#define __THERMO_H_
|
||||
extern void setupThermo(Parameter*, int);
|
||||
extern void computeThermo(int, Parameter*, Atom*);
|
||||
extern void adjustThermo(Parameter*, Atom*);
|
||||
#endif
|
17
src/common/timers.h
Normal file
17
src/common/timers.h
Normal file
@@ -0,0 +1,17 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#ifndef __TIMERS_H_
|
||||
#define __TIMERS_H_
|
||||
|
||||
typedef enum {
|
||||
TOTAL = 0,
|
||||
NEIGH,
|
||||
FORCE,
|
||||
NUMTIMER
|
||||
} timertype;
|
||||
|
||||
#endif
|
21
src/common/timing.c
Normal file
21
src/common/timing.c
Normal file
@@ -0,0 +1,21 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <time.h>
|
||||
|
||||
double getTimeStamp(void)
|
||||
{
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
|
||||
}
|
||||
|
||||
double getTimeResolution(void)
|
||||
{
|
||||
struct timespec ts;
|
||||
clock_getres(CLOCK_MONOTONIC, &ts);
|
||||
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
|
||||
}
|
13
src/common/timing.h
Normal file
13
src/common/timing.h
Normal file
@@ -0,0 +1,13 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#ifndef __TIMING_H_
|
||||
#define __TIMING_H_
|
||||
|
||||
extern double getTimeStamp(void);
|
||||
extern double getTimeResolution(void);
|
||||
|
||||
#endif
|
120
src/common/util.c
Normal file
120
src/common/util.c
Normal file
@@ -0,0 +1,120 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <util.h>
|
||||
|
||||
/* Park/Miller RNG w/out MASKING, so as to be like f90s version */
|
||||
#define IA 16807
|
||||
#define IM 2147483647
|
||||
#define AM (1.0 / IM)
|
||||
#define IQ 127773
|
||||
#define IR 2836
|
||||
#define MASK 123459876
|
||||
|
||||
double myrandom(int* seed)
|
||||
{
|
||||
int k = (*seed) / IQ;
|
||||
double ans;
|
||||
|
||||
*seed = IA * (*seed - k * IQ) - IR * k;
|
||||
if (*seed < 0) *seed += IM;
|
||||
ans = AM * (*seed);
|
||||
return ans;
|
||||
}
|
||||
|
||||
void random_reset(int* seed, int ibase, double* coord)
|
||||
{
|
||||
int i;
|
||||
char* str = (char*)&ibase;
|
||||
int n = sizeof(int);
|
||||
unsigned int hash = 0;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
hash += str[i];
|
||||
hash += (hash << 10);
|
||||
hash ^= (hash >> 6);
|
||||
}
|
||||
|
||||
str = (char*)coord;
|
||||
n = 3 * sizeof(double);
|
||||
for (i = 0; i < n; i++) {
|
||||
hash += str[i];
|
||||
hash += (hash << 10);
|
||||
hash ^= (hash >> 6);
|
||||
}
|
||||
|
||||
hash += (hash << 3);
|
||||
hash ^= (hash >> 11);
|
||||
hash += (hash << 15);
|
||||
|
||||
// keep 31 bits of unsigned int as new seed
|
||||
// do not allow seed = 0, since will cause hang in gaussian()
|
||||
|
||||
*seed = hash & 0x7ffffff;
|
||||
if (!(*seed)) *seed = 1;
|
||||
|
||||
// warm up the RNG
|
||||
|
||||
for (i = 0; i < 5; i++)
|
||||
myrandom(seed);
|
||||
// save = 0;
|
||||
}
|
||||
|
||||
int str2ff(const char* string)
|
||||
{
|
||||
if (strncmp(string, "lj", 2) == 0) return FF_LJ;
|
||||
if (strncmp(string, "eam", 3) == 0) return FF_EAM;
|
||||
if (strncmp(string, "dem", 3) == 0) return FF_DEM;
|
||||
return -1;
|
||||
}
|
||||
|
||||
const char* ff2str(int ff)
|
||||
{
|
||||
if (ff == FF_LJ) {
|
||||
return "lj";
|
||||
}
|
||||
if (ff == FF_EAM) {
|
||||
return "eam";
|
||||
}
|
||||
if (ff == FF_DEM) {
|
||||
return "dem";
|
||||
}
|
||||
return "invalid";
|
||||
}
|
||||
|
||||
int get_cuda_num_threads(void)
|
||||
{
|
||||
const char* num_threads_env = getenv("NUM_THREADS");
|
||||
return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
|
||||
}
|
||||
|
||||
void readline(char* line, FILE* fp)
|
||||
{
|
||||
if (fgets(line, MAXLINE, fp) == NULL) {
|
||||
if (errno != 0) {
|
||||
perror("readline()");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void debug_printf(const char* format, ...)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
va_list arg;
|
||||
int ret;
|
||||
|
||||
va_start(arg, format);
|
||||
if ((vfprintf(stdout, format, arg)) < 0) {
|
||||
perror("debug_printf()");
|
||||
}
|
||||
va_end(arg);
|
||||
#endif
|
||||
}
|
47
src/common/util.h
Normal file
47
src/common/util.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#ifndef __UTIL_H_
|
||||
#define __UTIL_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#ifndef MIN
|
||||
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef ABS
|
||||
#define ABS(a) ((a) >= 0 ? (a) : -(a))
|
||||
#endif
|
||||
|
||||
#define DEBUG_MESSAGE debug_printf
|
||||
|
||||
#ifndef MAXLINE
|
||||
#define MAXLINE 4096
|
||||
#endif
|
||||
|
||||
#define FF_LJ 0
|
||||
#define FF_EAM 1
|
||||
#define FF_DEM 2
|
||||
|
||||
#if PRECISION == 1
|
||||
#define PRECISION_STRING "single"
|
||||
#else
|
||||
#define PRECISION_STRING "double"
|
||||
#endif
|
||||
|
||||
extern double myrandom(int *);
|
||||
extern void random_reset(int *seed, int ibase, double *coord);
|
||||
extern int str2ff(const char *string);
|
||||
extern const char *ff2str(int ff);
|
||||
extern void readline(char *line, FILE *fp);
|
||||
extern void debug_printf(const char *format, ...);
|
||||
extern int get_cuda_num_threads(void);
|
||||
|
||||
#endif
|
551
src/verletlist/atom.c
Normal file
551
src/verletlist/atom.c
Normal file
@@ -0,0 +1,551 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <device.h>
|
||||
#include <util.h>
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
#ifndef MAXLINE
|
||||
#define MAXLINE 4096
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
void initAtom(Atom *atom) {
|
||||
atom->x = NULL; atom->y = NULL; atom->z = NULL;
|
||||
atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
|
||||
atom->fx = NULL; atom->fy = NULL; atom->fz = NULL;
|
||||
atom->Natoms = 0;
|
||||
atom->Nlocal = 0;
|
||||
atom->Nghost = 0;
|
||||
atom->Nmax = 0;
|
||||
atom->type = NULL;
|
||||
atom->ntypes = 0;
|
||||
atom->epsilon = NULL;
|
||||
atom->sigma6 = NULL;
|
||||
atom->cutforcesq = NULL;
|
||||
atom->cutneighsq = NULL;
|
||||
atom->radius = NULL;
|
||||
atom->av = NULL;
|
||||
atom->r = NULL;
|
||||
|
||||
DeviceAtom *d_atom = &(atom->d_atom);
|
||||
d_atom->x = NULL; d_atom->y = NULL; d_atom->z = NULL;
|
||||
d_atom->vx = NULL; d_atom->vy = NULL; d_atom->vz = NULL;
|
||||
d_atom->fx = NULL; d_atom->fy = NULL; d_atom->fz = NULL;
|
||||
d_atom->border_map = NULL;
|
||||
d_atom->type = NULL;
|
||||
d_atom->epsilon = NULL;
|
||||
d_atom->sigma6 = NULL;
|
||||
d_atom->cutforcesq = NULL;
|
||||
d_atom->cutneighsq = NULL;
|
||||
}
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param) {
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
|
||||
atom->Natoms = 4 * param->nx * param->ny * param->nz;
|
||||
atom->Nlocal = 0;
|
||||
atom->ntypes = param->ntypes;
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
|
||||
int ilo = (int) (xlo / (0.5 * alat) - 1);
|
||||
int ihi = (int) (xhi / (0.5 * alat) + 1);
|
||||
int jlo = (int) (ylo / (0.5 * alat) - 1);
|
||||
int jhi = (int) (yhi / (0.5 * alat) + 1);
|
||||
int klo = (int) (zlo / (0.5 * alat) - 1);
|
||||
int khi = (int) (zhi / (0.5 * alat) + 1);
|
||||
|
||||
ilo = MAX(ilo, 0);
|
||||
ihi = MIN(ihi, 2 * param->nx - 1);
|
||||
jlo = MAX(jlo, 0);
|
||||
jhi = MIN(jhi, 2 * param->ny - 1);
|
||||
klo = MAX(klo, 0);
|
||||
khi = MIN(khi, 2 * param->nz - 1);
|
||||
|
||||
MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
|
||||
int i, j, k, m, n;
|
||||
int sx = 0; int sy = 0; int sz = 0;
|
||||
int ox = 0; int oy = 0; int oz = 0;
|
||||
int subboxdim = 8;
|
||||
|
||||
while(oz * subboxdim <= khi) {
|
||||
|
||||
k = oz * subboxdim + sz;
|
||||
j = oy * subboxdim + sy;
|
||||
i = ox * subboxdim + sx;
|
||||
|
||||
if(((i + j + k) % 2 == 0) &&
|
||||
(i >= ilo) && (i <= ihi) &&
|
||||
(j >= jlo) && (j <= jhi) &&
|
||||
(k >= klo) && (k <= khi)) {
|
||||
|
||||
xtmp = 0.5 * alat * i;
|
||||
ytmp = 0.5 * alat * j;
|
||||
ztmp = 0.5 * alat * k;
|
||||
|
||||
if( xtmp >= xlo && xtmp < xhi &&
|
||||
ytmp >= ylo && ytmp < yhi &&
|
||||
ztmp >= zlo && ztmp < zhi ) {
|
||||
|
||||
n = k * (2 * param->ny) * (2 * param->nx) +
|
||||
j * (2 * param->nx) +
|
||||
i + 1;
|
||||
|
||||
for(m = 0; m < 5; m++) {
|
||||
myrandom(&n);
|
||||
}
|
||||
vxtmp = myrandom(&n);
|
||||
|
||||
for(m = 0; m < 5; m++){
|
||||
myrandom(&n);
|
||||
}
|
||||
vytmp = myrandom(&n);
|
||||
|
||||
for(m = 0; m < 5; m++) {
|
||||
myrandom(&n);
|
||||
}
|
||||
vztmp = myrandom(&n);
|
||||
|
||||
if(atom->Nlocal == atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom_x(atom->Nlocal) = xtmp;
|
||||
atom_y(atom->Nlocal) = ytmp;
|
||||
atom_z(atom->Nlocal) = ztmp;
|
||||
atom_vx(atom->Nlocal) = vxtmp;
|
||||
atom_vy(atom->Nlocal) = vytmp;
|
||||
atom_vz(atom->Nlocal) = vztmp;
|
||||
atom->type[atom->Nlocal] = rand() % atom->ntypes;
|
||||
atom->Nlocal++;
|
||||
}
|
||||
}
|
||||
|
||||
sx++;
|
||||
|
||||
if(sx == subboxdim) { sx = 0; sy++; }
|
||||
if(sy == subboxdim) { sy = 0; sz++; }
|
||||
if(sz == subboxdim) { sz = 0; ox++; }
|
||||
if(ox * subboxdim > ihi) { ox = 0; oy++; }
|
||||
if(oy * subboxdim > jhi) { oy = 0; oz++; }
|
||||
}
|
||||
}
|
||||
|
||||
int type_str2int(const char *type) {
|
||||
if(strncmp(type, "Ar", 2) == 0) { return 0; } // Argon
|
||||
fprintf(stderr, "Invalid atom type: %s\n", type);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int readAtom(Atom* atom, Parameter* param) {
|
||||
int len = strlen(param->input_file);
|
||||
if(strncmp(¶m->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
|
||||
if(strncmp(¶m->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
|
||||
if(strncmp(¶m->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
|
||||
if(strncmp(¶m->input_file[len - 3], ".in", 3) == 0) { return readAtom_in(atom, param); }
|
||||
fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int read_atoms = 0;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
while(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
char *item = strtok(line, " ");
|
||||
if(strncmp(item, "CRYST1", 6) == 0) {
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(NULL, " "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
// alpha, beta, gamma, sGroup, z
|
||||
} else if(strncmp(item, "ATOM", 4) == 0) {
|
||||
char *label;
|
||||
int atom_id, comp_id;
|
||||
MD_FLOAT occupancy, charge;
|
||||
atom_id = atoi(strtok(NULL, " ")) - 1;
|
||||
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type_str2int(strtok(NULL, " "));
|
||||
label = strtok(NULL, " ");
|
||||
comp_id = atoi(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vx(atom_id) = 0.0;
|
||||
atom_vy(atom_id) = 0.0;
|
||||
atom_vz(atom_id) = 0.0;
|
||||
occupancy = atof(strtok(NULL, " "));
|
||||
charge = atof(strtok(NULL, " "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
read_atoms++;
|
||||
} else if(strncmp(item, "HEADER", 6) == 0 ||
|
||||
strncmp(item, "REMARK", 6) == 0 ||
|
||||
strncmp(item, "MODEL", 5) == 0 ||
|
||||
strncmp(item, "TER", 3) == 0 ||
|
||||
strncmp(item, "ENDMDL", 6) == 0) {
|
||||
// Do nothing
|
||||
} else {
|
||||
fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(!read_atoms) {
|
||||
fprintf(stderr, "Input error: No atoms read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
char desc[MAXLINE];
|
||||
int read_atoms = 0;
|
||||
int atoms_to_read = 0;
|
||||
int i = 0;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
readline(desc, fp);
|
||||
for(i = 0; desc[i] != '\n'; i++);
|
||||
desc[i] = '\0';
|
||||
readline(line, fp);
|
||||
atoms_to_read = atoi(strtok(line, " "));
|
||||
fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
|
||||
|
||||
while(!feof(fp) && read_atoms < atoms_to_read) {
|
||||
readline(line, fp);
|
||||
char *label = strtok(line, " ");
|
||||
int type = type_str2int(strtok(NULL, " "));
|
||||
int atom_id = atoi(strtok(NULL, " ")) - 1;
|
||||
atom_id = read_atoms;
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type;
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, " "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
read_atoms++;
|
||||
}
|
||||
|
||||
if(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(line, " "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
}
|
||||
|
||||
if(read_atoms != atoms_to_read) {
|
||||
fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int natoms = 0;
|
||||
int read_atoms = 0;
|
||||
int atom_id = -1;
|
||||
int ts = -1;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
while(!feof(fp) && ts < 1 && !read_atoms) {
|
||||
readline(line, fp);
|
||||
if(strncmp(line, "ITEM: ", 6) == 0) {
|
||||
char *item = &line[6];
|
||||
|
||||
if(strncmp(item, "TIMESTEP", 8) == 0) {
|
||||
readline(line, fp);
|
||||
ts = atoi(line);
|
||||
} else if(strncmp(item, "NUMBER OF ATOMS", 15) == 0) {
|
||||
readline(line, fp);
|
||||
natoms = atoi(line);
|
||||
atom->Natoms = natoms;
|
||||
atom->Nlocal = natoms;
|
||||
while(atom->Nlocal >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
} else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
|
||||
readline(line, fp);
|
||||
param->xlo = atof(strtok(line, " "));
|
||||
param->xhi = atof(strtok(NULL, " "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
|
||||
readline(line, fp);
|
||||
param->ylo = atof(strtok(line, " "));
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
|
||||
readline(line, fp);
|
||||
param->zlo = atof(strtok(line, " "));
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
} else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
|
||||
for(int i = 0; i < natoms; i++) {
|
||||
readline(line, fp);
|
||||
atom_id = atoi(strtok(line, " ")) - 1;
|
||||
atom->type[atom_id] = atoi(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, " "));
|
||||
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
|
||||
read_atoms++;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(ts < 0 || !natoms || !read_atoms) {
|
||||
fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
return natoms;
|
||||
}
|
||||
|
||||
int readAtom_in(Atom* atom, Parameter* param) {
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int natoms = 0;
|
||||
int atom_id = 0;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
readline(line, fp);
|
||||
natoms = atoi(strtok(line, " "));
|
||||
param->xlo = atof(strtok(NULL, " "));
|
||||
param->xhi = atof(strtok(NULL, " "));
|
||||
param->ylo = atof(strtok(NULL, " "));
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->zlo = atof(strtok(NULL, " "));
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
atom->Natoms = natoms;
|
||||
atom->Nlocal = natoms;
|
||||
atom->ntypes = 1;
|
||||
|
||||
while(atom->Nlocal >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
for(int i = 0; i < natoms; i++) {
|
||||
readline(line, fp);
|
||||
|
||||
// TODO: store mass per atom
|
||||
char *s_mass = strtok(line, " ");
|
||||
if(strncmp(s_mass, "inf", 3) == 0) {
|
||||
// Set atom's mass to INFINITY
|
||||
} else {
|
||||
param->mass = atof(s_mass);
|
||||
}
|
||||
|
||||
atom->radius[atom_id] = atof(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, " "));
|
||||
atom->type[atom_id] = 0;
|
||||
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
|
||||
atom_id++;
|
||||
}
|
||||
|
||||
if(!natoms) {
|
||||
fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
return natoms;
|
||||
}
|
||||
|
||||
void writeAtom(Atom *atom, Parameter *param) {
|
||||
FILE *fp = fopen(param->write_atom_file, "w");
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
fprintf(fp, "%d,%f,%f,%f,%f,%f,%f,%f,0\n",
|
||||
atom->type[i], 1.0,
|
||||
atom_x(i), atom_y(i), atom_z(i),
|
||||
atom_vx(i), atom_vy(i), atom_vz(i));
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
fprintf(stdout, "Wrote input data to %s, grid size: %f, %f, %f\n",
|
||||
param->write_atom_file, param->xprd, param->yprd, param->zprd);
|
||||
}
|
||||
|
||||
void growAtom(Atom *atom) {
|
||||
DeviceAtom *d_atom = &(atom->d_atom);
|
||||
int nold = atom->Nmax;
|
||||
atom->Nmax += DELTA;
|
||||
|
||||
#undef REALLOC
|
||||
#define REALLOC(p,t,ns,os); \
|
||||
atom->p = (t *) reallocate(atom->p, ALIGNMENT, ns, os); \
|
||||
atom->d_atom.p = (t *) reallocateGPU(atom->d_atom.p, ns);
|
||||
|
||||
#ifdef AOS
|
||||
REALLOC(x, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
#else
|
||||
REALLOC(x, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(y, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(z, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(vy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(vz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(fy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
REALLOC(fz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
#endif
|
||||
REALLOC(type, int, atom->Nmax * sizeof(int), nold * sizeof(int));
|
||||
|
||||
// DEM
|
||||
atom->radius = (MD_FLOAT *) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
atom->r = (MD_FLOAT*) reallocate(atom->r, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 4, nold * sizeof(MD_FLOAT) * 4);
|
||||
}
|
102
src/verletlist/atom.h
Normal file
102
src/verletlist/atom.h
Normal file
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __ATOM_H_
|
||||
#define __ATOM_H_
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
#define KERNEL_NAME "CUDA"
|
||||
#define computeForceLJFullNeigh computeForceLJFullNeigh_cuda
|
||||
#define initialIntegrate initialIntegrate_cuda
|
||||
#define finalIntegrate finalIntegrate_cuda
|
||||
#define buildNeighbor buildNeighbor_cuda
|
||||
#define updatePbc updatePbc_cuda
|
||||
#define updateAtomsPbc updateAtomsPbc_cuda
|
||||
#else
|
||||
#ifdef USE_SIMD_KERNEL
|
||||
#define KERNEL_NAME "SIMD"
|
||||
#define computeForceLJFullNeigh computeForceLJFullNeigh_simd
|
||||
#else
|
||||
#define KERNEL_NAME "PLAIN"
|
||||
#endif
|
||||
#define initialIntegrate initialIntegrate_cpu
|
||||
#define finalIntegrate finalIntegrate_cpu
|
||||
#define buildNeighbor buildNeighbor_cpu
|
||||
#define updatePbc updatePbc_cpu
|
||||
#define updateAtomsPbc updateAtomsPbc_cpu
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
MD_FLOAT *x, *y, *z;
|
||||
MD_FLOAT *vx, *vy, *vz;
|
||||
MD_FLOAT *fx, *fy, *fz;
|
||||
int* border_map;
|
||||
int* type;
|
||||
MD_FLOAT* epsilon;
|
||||
MD_FLOAT* sigma6;
|
||||
MD_FLOAT* cutforcesq;
|
||||
MD_FLOAT* cutneighsq;
|
||||
} DeviceAtom;
|
||||
|
||||
typedef struct {
|
||||
int Natoms, Nlocal, Nghost, Nmax;
|
||||
MD_FLOAT *x, *y, *z;
|
||||
MD_FLOAT *vx, *vy, *vz;
|
||||
MD_FLOAT *fx, *fy, *fz;
|
||||
int* border_map;
|
||||
int* type;
|
||||
int ntypes;
|
||||
MD_FLOAT* epsilon;
|
||||
MD_FLOAT* sigma6;
|
||||
MD_FLOAT* cutforcesq;
|
||||
MD_FLOAT* cutneighsq;
|
||||
|
||||
// DEM
|
||||
MD_FLOAT* radius;
|
||||
MD_FLOAT* av;
|
||||
MD_FLOAT* r;
|
||||
|
||||
// Device data
|
||||
DeviceAtom d_atom;
|
||||
} Atom;
|
||||
|
||||
extern void initAtom(Atom*);
|
||||
extern void createAtom(Atom*, Parameter*);
|
||||
extern int readAtom(Atom*, Parameter*);
|
||||
extern int readAtom_pdb(Atom*, Parameter*);
|
||||
extern int readAtom_gro(Atom*, Parameter*);
|
||||
extern int readAtom_dmp(Atom*, Parameter*);
|
||||
extern int readAtom_in(Atom*, Parameter*);
|
||||
extern void writeAtom(Atom*, Parameter*);
|
||||
extern void growAtom(Atom*);
|
||||
|
||||
#ifdef AOS
|
||||
#define POS_DATA_LAYOUT "AoS"
|
||||
#define atom_x(i) atom->x[(i) * 3 + 0]
|
||||
#define atom_y(i) atom->x[(i) * 3 + 1]
|
||||
#define atom_z(i) atom->x[(i) * 3 + 2]
|
||||
#define atom_vx(i) atom->vx[(i) * 3 + 0]
|
||||
#define atom_vy(i) atom->vx[(i) * 3 + 1]
|
||||
#define atom_vz(i) atom->vx[(i) * 3 + 2]
|
||||
#define atom_fx(i) atom->fx[(i) * 3 + 0]
|
||||
#define atom_fy(i) atom->fx[(i) * 3 + 1]
|
||||
#define atom_fz(i) atom->fx[(i) * 3 + 2]
|
||||
#else
|
||||
#define POS_DATA_LAYOUT "SoA"
|
||||
#define atom_x(i) atom->x[i]
|
||||
#define atom_y(i) atom->y[i]
|
||||
#define atom_z(i) atom->z[i]
|
||||
#define atom_vx(i) atom->vx[i]
|
||||
#define atom_vy(i) atom->vy[i]
|
||||
#define atom_vz(i) atom->vz[i]
|
||||
#define atom_fx(i) atom->fx[i]
|
||||
#define atom_fy(i) atom->fy[i]
|
||||
#define atom_fz(i) atom->fz[i]
|
||||
#endif
|
||||
|
||||
#endif
|
180
src/verletlist/cuda/force.cu
Normal file
180
src/verletlist/cuda/force.cu
Normal file
@@ -0,0 +1,180 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stddef.h>
|
||||
//---
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
//---
|
||||
#include <likwid-marker.h>
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <device.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
// cuda kernel
|
||||
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= Nlocal) {
|
||||
return;
|
||||
}
|
||||
|
||||
DeviceAtom *atom = &a;
|
||||
const int numneighs = neigh_numneigh[i];
|
||||
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neigh_neighbors[Nlocal * k + i];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
}
|
||||
}
|
||||
|
||||
atom_fx(i) = fix;
|
||||
atom_fy(i) = fiy;
|
||||
atom_fz(i) = fiz;
|
||||
}
|
||||
|
||||
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, DeviceAtom a) {
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if( i >= Nlocal ) {
|
||||
return;
|
||||
}
|
||||
|
||||
DeviceAtom *atom = &a;
|
||||
|
||||
atom_vx(i) += dtforce * atom_fx(i);
|
||||
atom_vy(i) += dtforce * atom_fy(i);
|
||||
atom_vz(i) += dtforce * atom_fz(i);
|
||||
atom_x(i) = atom_x(i) + dt * atom_vx(i);
|
||||
atom_y(i) = atom_y(i) + dt * atom_vy(i);
|
||||
atom_z(i) = atom_z(i) + dt * atom_vz(i);
|
||||
}
|
||||
|
||||
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, DeviceAtom a) {
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if( i >= Nlocal ) {
|
||||
return;
|
||||
}
|
||||
|
||||
DeviceAtom *atom = &a;
|
||||
|
||||
atom_vx(i) += dtforce * atom_fx(i);
|
||||
atom_vy(i) += dtforce * atom_fy(i);
|
||||
atom_vz(i) += dtforce * atom_fz(i);
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
const int Nlocal = atom->Nlocal;
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
|
||||
cuda_assert("kernel_final_integrate", cudaPeekAtLastError());
|
||||
cuda_assert("kernel_final_integrate", cudaDeviceSynchronize());
|
||||
|
||||
if(reneigh) {
|
||||
memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
||||
}
|
||||
}
|
||||
|
||||
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
const int Nlocal = atom->Nlocal;
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
|
||||
cuda_assert("kernel_initial_integrate", cudaPeekAtLastError());
|
||||
cuda_assert("kernel_initial_integrate", cudaDeviceSynchronize());
|
||||
|
||||
if(reneigh) {
|
||||
memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
||||
}
|
||||
}
|
||||
|
||||
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
int Nlocal = atom->Nlocal;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
|
||||
/*
|
||||
int nDevices;
|
||||
cudaGetDeviceCount(&nDevices);
|
||||
size_t free, total;
|
||||
for(int i = 0; i < nDevices; ++i) {
|
||||
cudaMemGetInfo( &free, &total );
|
||||
cudaDeviceProp prop;
|
||||
cudaGetDeviceProperties(&prop, i);
|
||||
printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
// HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
|
||||
// memsetGPU(atom->d_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3);
|
||||
|
||||
cudaProfilerStart();
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
|
||||
cuda_assert("calc_force", cudaPeekAtLastError());
|
||||
cuda_assert("calc_force", cudaDeviceSynchronize());
|
||||
cudaProfilerStop();
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
}
|
293
src/verletlist/cuda/neighbor.cu
Normal file
293
src/verletlist/cuda/neighbor.cu
Normal file
@@ -0,0 +1,293 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
//---
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <parameter.h>
|
||||
#include <neighbor.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
extern MD_FLOAT xprd, yprd, zprd;
|
||||
extern MD_FLOAT bininvx, bininvy, bininvz;
|
||||
extern int mbinxlo, mbinylo, mbinzlo;
|
||||
extern int nbinx, nbiny, nbinz;
|
||||
extern int mbinx, mbiny, mbinz; // n bins in x, y, z
|
||||
extern int mbins; //total number of bins
|
||||
extern int atoms_per_bin; // max atoms per bin
|
||||
extern MD_FLOAT cutneighsq; // neighbor cutoff squared
|
||||
extern int nmax;
|
||||
extern int nstencil; // # of bins in stencil
|
||||
extern int* stencil; // stencil list of bin offsets
|
||||
static int* c_stencil = NULL;
|
||||
static int* c_resize_needed = NULL;
|
||||
static int* c_new_maxneighs = NULL;
|
||||
static Binning c_binning {
|
||||
.bincount = NULL,
|
||||
.bins = NULL,
|
||||
.mbins = 0,
|
||||
.atoms_per_bin = 0
|
||||
};
|
||||
|
||||
__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, Neighbor_params np) {
|
||||
int ix, iy, iz;
|
||||
|
||||
if(xin >= np.xprd) {
|
||||
ix = (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
ix = (int)(xin * np.bininvx) - np.mbinxlo;
|
||||
} else {
|
||||
ix = (int)(xin * np.bininvx) - np.mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= np.yprd) {
|
||||
iy = (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
iy = (int)(yin * np.bininvy) - np.mbinylo;
|
||||
} else {
|
||||
iy = (int)(yin * np.bininvy) - np.mbinylo - 1;
|
||||
}
|
||||
|
||||
if(zin >= np.zprd) {
|
||||
iz = (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo;
|
||||
} else if(zin >= 0.0) {
|
||||
iz = (int)(zin * np.bininvz) - np.mbinzlo;
|
||||
} else {
|
||||
iz = (int)(zin * np.bininvz) - np.mbinzlo - 1;
|
||||
}
|
||||
|
||||
return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
|
||||
}
|
||||
|
||||
/* sorts the contents of a bin to make it comparable to the CPU version */
|
||||
/* uses bubble sort since atoms per bin should be relatively small and can be done in situ */
|
||||
__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= mbins) {
|
||||
return;
|
||||
}
|
||||
|
||||
int atoms_in_bin = bincount[i];
|
||||
int *bin_ptr = &bins[i * atoms_per_bin];
|
||||
int sorted;
|
||||
do {
|
||||
sorted = 1;
|
||||
int tmp;
|
||||
for(int index = 0; index < atoms_in_bin - 1; index++){
|
||||
if (bin_ptr[index] > bin_ptr[index + 1]){
|
||||
tmp = bin_ptr[index];
|
||||
bin_ptr[index] = bin_ptr[index + 1];
|
||||
bin_ptr[index + 1] = tmp;
|
||||
sorted = 0;
|
||||
}
|
||||
}
|
||||
} while (!sorted);
|
||||
}
|
||||
|
||||
__global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
|
||||
DeviceAtom* atom = &a;
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= nall) {
|
||||
return;
|
||||
}
|
||||
|
||||
MD_FLOAT x = atom_x(i);
|
||||
MD_FLOAT y = atom_y(i);
|
||||
MD_FLOAT z = atom_z(i);
|
||||
int ibin = coord2bin_device(x, y, z, np);
|
||||
int ac = atomicAdd(&bincount[ibin], 1);
|
||||
|
||||
if(ac < atoms_per_bin){
|
||||
bins[ibin * atoms_per_bin + ac] = i;
|
||||
} else {
|
||||
atomicMax(resize_needed, ac);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void compute_neighborhood(
|
||||
DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
|
||||
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {
|
||||
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= nlocal) {
|
||||
return;
|
||||
}
|
||||
|
||||
DeviceAtom *atom = &a;
|
||||
DeviceNeighbor *neighbor = &neigh;
|
||||
|
||||
int* neighptr = &(neighbor->neighbors[i]);
|
||||
int n = 0;
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_i = atom->type[i];
|
||||
#endif
|
||||
for(int k = 0; k < nstencil; k++) {
|
||||
int jbin = ibin + stencil[k];
|
||||
int* loc_bin = &bins[jbin * atoms_per_bin];
|
||||
|
||||
for(int m = 0; m < bincount[jbin]; m++) {
|
||||
int j = loc_bin[m];
|
||||
|
||||
if ( j == i ){
|
||||
continue;
|
||||
}
|
||||
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_j = atom->type[j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
|
||||
#else
|
||||
const MD_FLOAT cutoff = cutneighsq;
|
||||
#endif
|
||||
|
||||
if( rsq <= cutoff ) {
|
||||
int idx = nlocal * n;
|
||||
neighptr[idx] = j;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[i] = n;
|
||||
if(n > maxneighs) {
|
||||
atomicMax(new_maxneighs, n);
|
||||
}
|
||||
}
|
||||
|
||||
void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
int resize = 1;
|
||||
const int num_blocks = ceil((float) nall / (float) threads_per_block);
|
||||
|
||||
while(resize > 0) {
|
||||
resize = 0;
|
||||
memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
|
||||
memsetGPU(c_resize_needed, 0, sizeof(int));
|
||||
|
||||
binatoms_kernel<<<num_blocks, threads_per_block>>>(atom->d_atom, atom->Nlocal + atom->Nghost, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
|
||||
cuda_assert("binatoms", cudaPeekAtLastError());
|
||||
cuda_assert("binatoms", cudaDeviceSynchronize());
|
||||
|
||||
memcpyFromGPU(&resize, c_resize_needed, sizeof(int));
|
||||
if(resize) {
|
||||
c_binning->atoms_per_bin *= 2;
|
||||
c_binning->bins = (int *) reallocateGPU(c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int));
|
||||
}
|
||||
}
|
||||
|
||||
atoms_per_bin = c_binning->atoms_per_bin;
|
||||
const int sortBlocks = ceil((float) mbins / (float) threads_per_block);
|
||||
sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
|
||||
cuda_assert("sort_bin", cudaPeekAtLastError());
|
||||
cuda_assert("sort_bin", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
|
||||
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
|
||||
cudaProfilerStart();
|
||||
|
||||
// TODO move all of this initialization into its own method
|
||||
if(c_stencil == NULL) {
|
||||
c_stencil = (int *) allocateGPU(nstencil * sizeof(int));
|
||||
memcpyToGPU(c_stencil, stencil, nstencil * sizeof(int));
|
||||
}
|
||||
|
||||
if(c_binning.mbins == 0) {
|
||||
c_binning.mbins = mbins;
|
||||
c_binning.atoms_per_bin = atoms_per_bin;
|
||||
c_binning.bincount = (int *) allocateGPU(c_binning.mbins * sizeof(int));
|
||||
c_binning.bins = (int *) allocateGPU(c_binning.mbins * c_binning.atoms_per_bin * sizeof(int));
|
||||
}
|
||||
|
||||
Neighbor_params np {
|
||||
.xprd = xprd,
|
||||
.yprd = yprd,
|
||||
.zprd = zprd,
|
||||
.bininvx = bininvx,
|
||||
.bininvy = bininvy,
|
||||
.bininvz = bininvz,
|
||||
.mbinxlo = mbinxlo,
|
||||
.mbinylo = mbinylo,
|
||||
.mbinzlo = mbinzlo,
|
||||
.nbinx = nbinx,
|
||||
.nbiny = nbiny,
|
||||
.nbinz = nbinz,
|
||||
.mbinx = mbinx,
|
||||
.mbiny = mbiny,
|
||||
.mbinz = mbinz
|
||||
};
|
||||
|
||||
if(c_resize_needed == NULL) {
|
||||
c_resize_needed = (int *) allocateGPU(sizeof(int));
|
||||
}
|
||||
|
||||
/* bin local & ghost atoms */
|
||||
binatoms_cuda(atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
|
||||
if(c_new_maxneighs == NULL) {
|
||||
c_new_maxneighs = (int *) allocateGPU(sizeof(int));
|
||||
}
|
||||
|
||||
int resize = 1;
|
||||
|
||||
if(nall > nmax) {
|
||||
nmax = nall;
|
||||
d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, nmax * neighbor->maxneighs * sizeof(int));
|
||||
d_neighbor->numneigh = (int *) reallocateGPU(d_neighbor->numneigh, nmax * sizeof(int));
|
||||
}
|
||||
|
||||
/* loop over each atom, storing neighbors */
|
||||
while(resize) {
|
||||
resize = 0;
|
||||
memsetGPU(c_new_maxneighs, 0, sizeof(int));
|
||||
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||
compute_neighborhood<<<num_blocks, num_threads_per_block>>>(atom->d_atom, *d_neighbor,
|
||||
np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
|
||||
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
|
||||
c_new_maxneighs,
|
||||
cutneighsq, atom->ntypes);
|
||||
|
||||
cuda_assert("compute_neighborhood", cudaPeekAtLastError());
|
||||
cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
|
||||
|
||||
int new_maxneighs;
|
||||
memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
|
||||
if(new_maxneighs > neighbor->maxneighs){
|
||||
resize = 1;
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
printf("RESIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
printf("NEW SIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->neighbors = (int *) reallocateGPU(neighbor->neighbors, atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
cudaProfilerStop();
|
||||
}
|
111
src/verletlist/cuda/pbc.cu
Normal file
111
src/verletlist/cuda/pbc.cu
Normal file
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
//---
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <pbc.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
extern int NmaxGhost;
|
||||
extern int *PBCx, *PBCy, *PBCz;
|
||||
static int c_NmaxGhost = 0;
|
||||
static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
|
||||
|
||||
__global__ void computeAtomsPbcUpdate(DeviceAtom a, int nlocal, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
DeviceAtom *atom = &a;
|
||||
if(i >= nlocal) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (atom_x(i) < 0.0) {
|
||||
atom_x(i) += xprd;
|
||||
} else if (atom_x(i) >= xprd) {
|
||||
atom_x(i) -= xprd;
|
||||
}
|
||||
|
||||
if (atom_y(i) < 0.0) {
|
||||
atom_y(i) += yprd;
|
||||
} else if (atom_y(i) >= yprd) {
|
||||
atom_y(i) -= yprd;
|
||||
}
|
||||
|
||||
if (atom_z(i) < 0.0) {
|
||||
atom_z(i) += zprd;
|
||||
} else if (atom_z(i) >= zprd) {
|
||||
atom_z(i) -= zprd;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= nghost) {
|
||||
return;
|
||||
}
|
||||
|
||||
DeviceAtom* atom = &a;
|
||||
int *border_map = atom->border_map;
|
||||
atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
|
||||
atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
|
||||
atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
|
||||
if(reneigh) {
|
||||
memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(atom->d_atom.type, atom->type, sizeof(int) * atom->Nmax);
|
||||
|
||||
if(c_NmaxGhost < NmaxGhost) {
|
||||
c_NmaxGhost = NmaxGhost;
|
||||
c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
|
||||
c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
|
||||
c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
|
||||
atom->d_atom.border_map = (int *) reallocateGPU(atom->d_atom.border_map, NmaxGhost * sizeof(int));
|
||||
}
|
||||
|
||||
memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
|
||||
memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
|
||||
memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
|
||||
memcpyToGPU(atom->d_atom.border_map, atom->border_map, NmaxGhost * sizeof(int));
|
||||
cuda_assert("updatePbc.reneigh", cudaPeekAtLastError());
|
||||
cuda_assert("updatePbc.reneigh", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
|
||||
computePbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, atom->Nghost, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
|
||||
cuda_assert("updatePbc", cudaPeekAtLastError());
|
||||
cuda_assert("updatePbc", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||
computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, xprd, yprd, zprd);
|
||||
cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError());
|
||||
cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize());
|
||||
memcpyFromGPU(atom->x, atom->d_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
||||
}
|
31
src/verletlist/device_spec.c
Normal file
31
src/verletlist/device_spec.c
Normal file
@@ -0,0 +1,31 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <device.h>
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
DeviceAtom *d_atom = &(atom->d_atom);
|
||||
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
|
||||
|
||||
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->cutneighsq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
|
||||
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
|
||||
|
||||
memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->cutneighsq, atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
|
||||
}
|
||||
|
||||
#endif
|
123
src/verletlist/force_dem.c
Normal file
123
src/verletlist/force_dem.c
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <math.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <likwid-marker.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
|
||||
|
||||
double computeForceDemFullNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
MD_FLOAT k_s = param->k_s;
|
||||
MD_FLOAT k_dn = param->k_dn;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
atom_fy(i) = 0.0;
|
||||
atom_fz(i) = 0.0;
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT irad = atom->radius[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT jrad = atom->radius[j];
|
||||
MD_FLOAT xj = atom_x(j);
|
||||
MD_FLOAT yj = atom_y(j);
|
||||
MD_FLOAT zj = atom_z(j);
|
||||
MD_FLOAT delx = xtmp - xj;
|
||||
MD_FLOAT dely = ytmp - yj;
|
||||
MD_FLOAT delz = ztmp - zj;
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT r = sqrt(rsq);
|
||||
MD_FLOAT p = irad + jrad - r;
|
||||
|
||||
if(p > 0) {
|
||||
MD_FLOAT delvx = atom_vx(i) - atom_vx(j);
|
||||
MD_FLOAT delvy = atom_vy(i) - atom_vy(j);
|
||||
MD_FLOAT delvz = atom_vz(i) - atom_vz(j);
|
||||
MD_FLOAT vr = sqrt(delvx * delvx + delvy * delvy + delvz * delvz);
|
||||
|
||||
// normal distance
|
||||
MD_FLOAT nx = delx / r;
|
||||
MD_FLOAT ny = dely / r;
|
||||
MD_FLOAT nz = delz / r;
|
||||
|
||||
// normal contact velocity
|
||||
MD_FLOAT nvx = delvx / vr;
|
||||
MD_FLOAT nvy = delvy / vr;
|
||||
MD_FLOAT nvz = delvz / vr;
|
||||
|
||||
// forces
|
||||
atom_fx(i) += k_s * p * nx - k_dn * nvx;
|
||||
atom_fy(i) += k_s * p * ny - k_dn * nvy;
|
||||
atom_fz(i) += k_s * p * nz - k_dn * nvz;
|
||||
atom_fx(j) += -k_s * p * nx - k_dn * nvx;
|
||||
atom_fy(j) += -k_s * p * ny - k_dn * nvy;
|
||||
atom_fz(j) += -k_s * p * nz - k_dn * nvz;
|
||||
|
||||
// contact position
|
||||
//MD_FLOAT cterm = jrad / (irad + jrad);
|
||||
//MD_FLOAT cx = xj + cterm * delx;
|
||||
//MD_FLOAT cy = yj + cterm * dely;
|
||||
//MD_FLOAT cz = zj + cterm * delz;
|
||||
|
||||
// delta contact and particle position
|
||||
//MD_FLOAT delcx = cx - xtmp;
|
||||
//MD_FLOAT delcy = cy - ytmp;
|
||||
//MD_FLOAT delcz = cz - ztmp;
|
||||
|
||||
// contact velocity
|
||||
//MD_FLOAT cvx = (atom_vx(i) + atom_avx(i) * delcx) - (atom_vx(j) + atom_avx(j) * (cx - xj));
|
||||
//MD_FLOAT cvy = (atom_vy(i) + atom_avy(i) * delcy) - (atom_vy(j) + atom_avy(j) * (cy - yj));
|
||||
//MD_FLOAT cvz = (atom_vz(i) + atom_avz(i) * delcz) - (atom_vz(j) + atom_avz(j) * (cz - zj));
|
||||
|
||||
// tangential force
|
||||
//fix += MIN(kdt * vtsq, kf * fnx) * tx;
|
||||
//fiy += MIN(kdt * vtsq, kf * fny) * ty;
|
||||
//fiz += MIN(kdt * vtsq, kf * fnz) * tz;
|
||||
// torque
|
||||
//MD_FLOAT taux = delcx * ftx;
|
||||
//MD_FLOAT tauy = delcy * fty;
|
||||
//MD_FLOAT tauz = delcz * ftz;
|
||||
}
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
addStat(stats->atoms_within_cutoff, 1);
|
||||
} else {
|
||||
addStat(stats->atoms_outside_cutoff, 1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
addStat(stats->total_force_neighs, numneighs);
|
||||
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
209
src/verletlist/force_eam.c
Normal file
209
src/verletlist/force_eam.c
Normal file
@@ -0,0 +1,209 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <likwid-marker.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <allocate.h>
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <stats.h>
|
||||
#include <eam.h>
|
||||
#include <util.h>
|
||||
|
||||
double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
if(eam->nmax < atom->Nmax) {
|
||||
eam->nmax = atom->Nmax;
|
||||
if(eam->fp != NULL) { free(eam->fp); }
|
||||
eam->fp = (MD_FLOAT *) allocate(ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT));
|
||||
}
|
||||
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
|
||||
MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
|
||||
MD_FLOAT rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; MD_FLOAT rdrho = eam->rdrho;
|
||||
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
|
||||
double S = getTimeStamp();
|
||||
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force_eam_fp");
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT rhoi = 0;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
#pragma ivdep
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
#else
|
||||
const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
#endif
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT p = sqrt(rsq) * rdr + 1.0;
|
||||
int m = (int)(p);
|
||||
m = m < nr - 1 ? m : nr - 1;
|
||||
p -= m;
|
||||
p = p < 1.0 ? p : 1.0;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
rhoi += ((rhor_spline[type_ij * nr_tot + m * 7 + 3] * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 4]) * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 5]) * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 6];
|
||||
#else
|
||||
rhoi += ((rhor_spline[m * 7 + 3] * p +
|
||||
rhor_spline[m * 7 + 4]) * p +
|
||||
rhor_spline[m * 7 + 5]) * p +
|
||||
rhor_spline[m * 7 + 6];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_ii = type_i * type_i;
|
||||
#endif
|
||||
MD_FLOAT p = 1.0 * rhoi * rdrho + 1.0;
|
||||
int m = (int)(p);
|
||||
m = MAX(1, MIN(m, nrho - 1));
|
||||
p -= m;
|
||||
p = MIN(p, 1.0);
|
||||
#ifdef EXPLICIT_TYPES
|
||||
fp[i] = (frho_spline[type_ii * nrho_tot + m * 7 + 0] * p +
|
||||
frho_spline[type_ii * nrho_tot + m * 7 + 1]) * p +
|
||||
frho_spline[type_ii * nrho_tot + m * 7 + 2];
|
||||
#else
|
||||
fp[i] = (frho_spline[m * 7 + 0] * p + frho_spline[m * 7 + 1]) * p + frho_spline[m * 7 + 2];
|
||||
#endif
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam_fp");
|
||||
}
|
||||
|
||||
// We still need to update fp for PBC atoms
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
fp[Nlocal + i] = fp[atom->border_map[i]];
|
||||
}
|
||||
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force_eam");
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
#pragma ivdep
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
#else
|
||||
const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT r = sqrt(rsq);
|
||||
MD_FLOAT p = r * rdr + 1.0;
|
||||
int m = (int)(p);
|
||||
m = m < nr - 1 ? m : nr - 1;
|
||||
p -= m;
|
||||
p = p < 1.0 ? p : 1.0;
|
||||
|
||||
|
||||
// rhoip = derivative of (density at atom j due to atom i)
|
||||
// rhojp = derivative of (density at atom i due to atom j)
|
||||
// phi = pair potential energy
|
||||
// phip = phi'
|
||||
// z2 = phi * r
|
||||
// z2p = (phi * r)' = (phi' r) + phi
|
||||
// psip needs both fp[i] and fp[j] terms since r_ij appears in two
|
||||
// terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
|
||||
// hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
MD_FLOAT rhoip = (rhor_spline[type_ij * nr_tot + m * 7 + 0] * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 1]) * p +
|
||||
rhor_spline[type_ij * nr_tot + m * 7 + 2];
|
||||
|
||||
MD_FLOAT z2p = (z2r_spline[type_ij * nr_tot + m * 7 + 0] * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 1]) * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 2];
|
||||
|
||||
MD_FLOAT z2 = ((z2r_spline[type_ij * nr_tot + m * 7 + 3] * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 4]) * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 5]) * p +
|
||||
z2r_spline[type_ij * nr_tot + m * 7 + 6];
|
||||
#else
|
||||
MD_FLOAT rhoip = (rhor_spline[m * 7 + 0] * p + rhor_spline[m * 7 + 1]) * p + rhor_spline[m * 7 + 2];
|
||||
MD_FLOAT z2p = (z2r_spline[m * 7 + 0] * p + z2r_spline[m * 7 + 1]) * p + z2r_spline[m * 7 + 2];
|
||||
MD_FLOAT z2 = ((z2r_spline[m * 7 + 3] * p +
|
||||
z2r_spline[m * 7 + 4]) * p +
|
||||
z2r_spline[m * 7 + 5]) * p +
|
||||
z2r_spline[m * 7 + 6];
|
||||
#endif
|
||||
|
||||
MD_FLOAT recip = 1.0 / r;
|
||||
MD_FLOAT phi = z2 * recip;
|
||||
MD_FLOAT phip = z2p * recip - phi * recip;
|
||||
MD_FLOAT psip = fp[i] * rhoip + fp[j] * rhoip + phip;
|
||||
MD_FLOAT fpair = -psip * recip;
|
||||
|
||||
fix += delx * fpair;
|
||||
fiy += dely * fpair;
|
||||
fiz += delz * fpair;
|
||||
//fpair *= 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
atom_fx(i) = fix;
|
||||
atom_fy(i) = fiy;
|
||||
atom_fz(i) = fiz;
|
||||
addStat(stats->total_force_neighs, numneighs);
|
||||
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
112
src/verletlist/force_lj-x86.c
Normal file
112
src/verletlist/force_lj-x86.c
Normal file
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <likwid-marker.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
|
||||
#ifdef __SIMD_KERNEL__
|
||||
#include <simd.h>
|
||||
#endif
|
||||
|
||||
double computeForceLJFullNeigh_simd(
|
||||
Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats)
|
||||
{
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
|
||||
for (int i = 0; i < Nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
atom_fy(i) = 0.0;
|
||||
atom_fz(i) = 0.0;
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#ifndef __SIMD_KERNEL__
|
||||
fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
|
||||
exit(-1);
|
||||
#else
|
||||
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
|
||||
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for schedule(runtime)
|
||||
for (int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_SIMD_INT numneighs_vec = simd_int_broadcast(numneighs);
|
||||
MD_SIMD_FLOAT xtmp = simd_broadcast(atom_x(i));
|
||||
MD_SIMD_FLOAT ytmp = simd_broadcast(atom_y(i));
|
||||
MD_SIMD_FLOAT ztmp = simd_broadcast(atom_z(i));
|
||||
MD_SIMD_FLOAT fix = simd_zero();
|
||||
MD_SIMD_FLOAT fiy = simd_zero();
|
||||
MD_SIMD_FLOAT fiz = simd_zero();
|
||||
|
||||
for (int k = 0; k < numneighs; k += VECTOR_WIDTH) {
|
||||
// If the last iteration of this loop is separated from the rest, this
|
||||
// mask can be set only there
|
||||
MD_SIMD_MASK mask_numneighs = simd_mask_int_cond_lt(
|
||||
simd_int_add(simd_int_broadcast(k), simd_int_seq()),
|
||||
numneighs_vec);
|
||||
MD_SIMD_INT j = simd_int_mask_load(&neighs[k], mask_numneighs);
|
||||
#ifdef AOS
|
||||
MD_SIMD_INT j3 = simd_int_add(simd_int_add(j, j), j); // j * 3
|
||||
MD_SIMD_FLOAT delx = xtmp -
|
||||
simd_gather(j3, &(atom->x[0]), sizeof(MD_FLOAT));
|
||||
MD_SIMD_FLOAT dely = ytmp -
|
||||
simd_gather(j3, &(atom->x[1]), sizeof(MD_FLOAT));
|
||||
MD_SIMD_FLOAT delz = ztmp -
|
||||
simd_gather(j3, &(atom->x[2]), sizeof(MD_FLOAT));
|
||||
#else
|
||||
MD_SIMD_FLOAT delx = xtmp - simd_gather(j, atom->x, sizeof(MD_FLOAT));
|
||||
MD_SIMD_FLOAT dely = ytmp - simd_gather(j, atom->y, sizeof(MD_FLOAT));
|
||||
MD_SIMD_FLOAT delz = ztmp - simd_gather(j, atom->z, sizeof(MD_FLOAT));
|
||||
#endif
|
||||
MD_SIMD_FLOAT rsq = simd_fma(delx,
|
||||
delx,
|
||||
simd_fma(dely, dely, simd_mul(delz, delz)));
|
||||
MD_SIMD_MASK cutoff_mask = simd_mask_and(mask_numneighs,
|
||||
simd_mask_cond_lt(rsq, cutforcesq_vec));
|
||||
MD_SIMD_FLOAT sr2 = simd_reciprocal(rsq);
|
||||
MD_SIMD_FLOAT sr6 = simd_mul(sr2,
|
||||
simd_mul(sr2, simd_mul(sr2, sigma6_vec)));
|
||||
MD_SIMD_FLOAT force = simd_mul(c48_vec,
|
||||
simd_mul(sr6,
|
||||
simd_mul(simd_sub(sr6, c05_vec), simd_mul(sr2, eps_vec))));
|
||||
|
||||
fix = simd_masked_add(fix, simd_mul(delx, force), cutoff_mask);
|
||||
fiy = simd_masked_add(fiy, simd_mul(dely, force), cutoff_mask);
|
||||
fiz = simd_masked_add(fiz, simd_mul(delz, force), cutoff_mask);
|
||||
}
|
||||
|
||||
atom_fx(i) += simd_h_reduce_sum(fix);
|
||||
atom_fy(i) += simd_h_reduce_sum(fiy);
|
||||
atom_fz(i) += simd_h_reduce_sum(fiz);
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
#endif
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E - S;
|
||||
}
|
198
src/verletlist/force_lj.c
Normal file
198
src/verletlist/force_lj.c
Normal file
@@ -0,0 +1,198 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <likwid-marker.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
|
||||
double computeForceLJFullNeigh(
|
||||
Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats)
|
||||
{
|
||||
int nLocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
const MD_FLOAT num1 = 1.0;
|
||||
const MD_FLOAT num48 = 48.0;
|
||||
const MD_FLOAT num05 = 0.5;
|
||||
|
||||
for (int i = 0; i < nLocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
atom_fy(i) = 0.0;
|
||||
atom_fz(i) = 0.0;
|
||||
}
|
||||
double timeStart = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for schedule(runtime)
|
||||
for (int i = 0; i < nLocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
for (int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * atom->ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
#endif
|
||||
|
||||
if (rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = num1 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
addStat(stats->atoms_within_cutoff, 1);
|
||||
} else {
|
||||
addStat(stats->atoms_outside_cutoff, 1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
atom_fx(i) += fix;
|
||||
atom_fy(i) += fiy;
|
||||
atom_fz(i) += fiz;
|
||||
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
if (numneighs % VECTOR_WIDTH > 0) {
|
||||
addStat(stats->atoms_outside_cutoff,
|
||||
VECTOR_WIDTH - (numneighs % VECTOR_WIDTH));
|
||||
}
|
||||
#endif
|
||||
|
||||
addStat(stats->total_force_neighs, numneighs);
|
||||
addStat(stats->total_force_iters,
|
||||
(numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double timeStop = getTimeStamp();
|
||||
return timeStop - timeStart;
|
||||
}
|
||||
|
||||
double computeForceLJHalfNeigh(
|
||||
Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats)
|
||||
{
|
||||
int nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
const MD_FLOAT num1 = 1.0;
|
||||
const MD_FLOAT num48 = 48.0;
|
||||
const MD_FLOAT num05 = 0.5;
|
||||
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
atom_fy(i) = 0.0;
|
||||
atom_fz(i) = 0.0;
|
||||
}
|
||||
|
||||
double timeStart = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("forceLJ-halfneigh");
|
||||
|
||||
#pragma omp for schedule(runtime)
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
// Pragma required to vectorize the inner loop
|
||||
#ifdef ENABLE_OMP_SIMD
|
||||
#pragma omp simd reduction(+ : fix, fiy, fiz)
|
||||
#endif
|
||||
for (int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * atom->ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
#endif
|
||||
|
||||
if (rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = num1 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
|
||||
// We do not need to update forces for ghost atoms
|
||||
if (j < nlocal) {
|
||||
atom_fx(j) -= delx * force;
|
||||
atom_fy(j) -= dely * force;
|
||||
atom_fz(j) -= delz * force;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
atom_fx(i) += fix;
|
||||
atom_fy(i) += fiy;
|
||||
atom_fz(i) += fiz;
|
||||
|
||||
addStat(stats->total_force_neighs, numneighs);
|
||||
addStat(stats->total_force_iters,
|
||||
(numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("forceLJ-halfneigh");
|
||||
}
|
||||
|
||||
double timeStop = getTimeStamp();
|
||||
return timeStop - timeStart;
|
||||
}
|
34
src/verletlist/integrate.h
Normal file
34
src/verletlist/integrate.h
Normal file
@@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdbool.h>
|
||||
//---
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
|
||||
void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) {
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
atom_vx(i) += param->dtforce * atom_fx(i);
|
||||
atom_vy(i) += param->dtforce * atom_fy(i);
|
||||
atom_vz(i) += param->dtforce * atom_fz(i);
|
||||
atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
|
||||
atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
|
||||
atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
|
||||
}
|
||||
}
|
||||
|
||||
void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) {
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
atom_vx(i) += param->dtforce * atom_fx(i);
|
||||
atom_vy(i) += param->dtforce * atom_fy(i);
|
||||
atom_vz(i) += param->dtforce * atom_fz(i);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
void initialIntegrate_cuda(bool, Parameter*, Atom*);
|
||||
void finalIntegrate_cuda(bool, Parameter*, Atom*);
|
||||
#endif
|
299
src/verletlist/main-stub.c
Normal file
299
src/verletlist/main-stub.c
Normal file
@@ -0,0 +1,299 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
//---
|
||||
#include <likwid-marker.h>
|
||||
//---
|
||||
#include <timing.h>
|
||||
#include <allocate.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <stats.h>
|
||||
#include <thermo.h>
|
||||
#include <eam.h>
|
||||
#include <pbc.h>
|
||||
#include <timers.h>
|
||||
#include <util.h>
|
||||
|
||||
#define HLINE "----------------------------------------------------------------------------\n"
|
||||
|
||||
extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||
|
||||
// Patterns
|
||||
#define P_SEQ 0
|
||||
#define P_FIX 1
|
||||
#define P_RAND 2
|
||||
|
||||
void init(Parameter *param) {
|
||||
param->input_file = NULL;
|
||||
param->force_field = FF_LJ;
|
||||
param->epsilon = 1.0;
|
||||
param->sigma6 = 1.0;
|
||||
param->rho = 0.8442;
|
||||
param->ntypes = 4;
|
||||
param->ntimes = 200;
|
||||
param->nx = 1;
|
||||
param->ny = 1;
|
||||
param->nz = 1;
|
||||
param->lattice = 1.0;
|
||||
param->cutforce = 1000000.0;
|
||||
param->cutneigh = param->cutforce;
|
||||
param->mass = 1.0;
|
||||
param->half_neigh = 0;
|
||||
// Unused
|
||||
param->dt = 0.005;
|
||||
param->dtforce = 0.5 * param->dt;
|
||||
param->nstat = 100;
|
||||
param->temp = 1.44;
|
||||
param->reneigh_every = 20;
|
||||
param->proc_freq = 2.4;
|
||||
param->eam_file = NULL;
|
||||
}
|
||||
|
||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
|
||||
const int maxneighs = nneighs * nreps;
|
||||
neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(atom->Nmax * maxneighs * sizeof(int));
|
||||
|
||||
if(pattern == P_RAND && atom->Nlocal <= nneighs) {
|
||||
fprintf(stderr, "Error: When using random pattern, number of atoms should be higher than number of neighbors per atom!\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
int *neighptr = &(neighbor->neighbors[i * neighbor->maxneighs]);
|
||||
int j = (pattern == P_SEQ) ? (i + 1) : 0;
|
||||
int m = (pattern == P_SEQ) ? atom->Nlocal : nneighs;
|
||||
|
||||
for(int k = 0; k < nneighs; k++) {
|
||||
if(pattern == P_RAND) {
|
||||
int found = 0;
|
||||
do {
|
||||
j = rand() % atom->Nlocal;
|
||||
neighptr[k] = j;
|
||||
found = (int)(i == j);
|
||||
for(int l = 0; l < k; l++) {
|
||||
if(neighptr[l] == j) {
|
||||
found = 1;
|
||||
}
|
||||
}
|
||||
} while(found == 1);
|
||||
} else {
|
||||
neighptr[k] = j;
|
||||
j = (j + 1) % m;
|
||||
}
|
||||
}
|
||||
|
||||
for(int r = 1; r < nreps; r++) {
|
||||
for(int k = 0; k < nneighs; k++) {
|
||||
neighptr[r * nneighs + k] = neighptr[k];
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[i] = nneighs * nreps;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, const char *argv[]) {
|
||||
Eam eam;
|
||||
Atom atom_data;
|
||||
Atom *atom = (Atom *)(&atom_data);
|
||||
Neighbor neighbor;
|
||||
Stats stats;
|
||||
Parameter param;
|
||||
char *pattern_str = NULL;
|
||||
int pattern = P_SEQ;
|
||||
int natoms = 256;
|
||||
int nneighs = 76;
|
||||
int nreps = 1;
|
||||
int csv = 0;
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
DEBUG_MESSAGE("Initializing parameters...\n");
|
||||
init(¶m);
|
||||
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if((strcmp(argv[i], "-f") == 0)) {
|
||||
if((param.force_field = str2ff(argv[++i])) < 0) {
|
||||
fprintf(stderr, "Invalid force field!\n");
|
||||
exit(-1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-p") == 0)) {
|
||||
pattern_str = strdup(argv[++i]);
|
||||
if(strncmp(pattern_str, "seq", 3) == 0) { pattern = P_SEQ; }
|
||||
else if(strncmp(pattern_str, "fix", 3) == 0) { pattern = P_FIX; }
|
||||
else if(strncmp(pattern_str, "rand", 3) == 0) { pattern = P_RAND; }
|
||||
else {
|
||||
fprintf(stderr, "Invalid pattern!\n");
|
||||
exit(-1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-e") == 0)) {
|
||||
param.eam_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
|
||||
param.ntimes = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-na") == 0)) {
|
||||
natoms = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nn") == 0)) {
|
||||
nneighs = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nr") == 0)) {
|
||||
nreps = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--freq") == 0)) {
|
||||
param.proc_freq = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--csv") == 0)) {
|
||||
csv = 1;
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
|
||||
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
|
||||
printf(HLINE);
|
||||
printf("-f <string>: force field (lj or eam), default lj\n");
|
||||
printf("-p <string>: pattern for data accesses (seq, fix or rand)\n");
|
||||
printf("-n / --nsteps <int>: number of timesteps for simulation\n");
|
||||
printf("-na <int>: number of atoms (default 256)\n");
|
||||
printf("-nn <int>: number of neighbors per atom (default 76)\n");
|
||||
printf("-nr <int>: number of times neighbor lists should be replicated (default 1)\n");
|
||||
printf("--freq <real>: set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
|
||||
printf("--csv: set output as CSV style\n");
|
||||
printf(HLINE);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
if(pattern_str == NULL) {
|
||||
pattern_str = strdup("seq\0");
|
||||
}
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
DEBUG_MESSAGE("Initializing EAM parameters...\n");
|
||||
initEam(&eam, ¶m);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Initializing atoms...\n");
|
||||
initAtom(atom);
|
||||
initStats(&stats);
|
||||
|
||||
atom->ntypes = param.ntypes;
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param.epsilon;
|
||||
atom->sigma6[i] = param.sigma6;
|
||||
atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
|
||||
atom->cutforcesq[i] = param.cutforce * param.cutforce;
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Creating atoms...\n");
|
||||
for(int i = 0; i < natoms; ++i) {
|
||||
while(atom->Nlocal > atom->Nmax - natoms) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom->Nlocal] = rand() % atom->ntypes;
|
||||
atom_x(atom->Nlocal) = (MD_FLOAT)(i) * 0.00001;
|
||||
atom_y(atom->Nlocal) = (MD_FLOAT)(i) * 0.00001;
|
||||
atom_z(atom->Nlocal) = (MD_FLOAT)(i) * 0.00001;
|
||||
atom_vx(atom->Nlocal) = 0.0;
|
||||
atom_vy(atom->Nlocal) = 0.0;
|
||||
atom_vz(atom->Nlocal) = 0.0;
|
||||
atom->Nlocal++;
|
||||
}
|
||||
|
||||
const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
|
||||
const double estim_neighbors_volume = (double)(atom->Nlocal * (nneighs + 2) * sizeof(int));
|
||||
const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);
|
||||
|
||||
if(!csv) {
|
||||
printf("Pattern: %s\n", pattern_str);
|
||||
printf("Number of timesteps: %d\n", param.ntimes);
|
||||
printf("Number of atoms: %d\n", natoms);
|
||||
printf("Number of neighbors per atom: %d\n", nneighs);
|
||||
printf("Number of times to replicate neighbor lists: %d\n", nreps);
|
||||
printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
|
||||
printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
|
||||
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Initializing neighbor lists...\n");
|
||||
initNeighbor(&neighbor, ¶m);
|
||||
DEBUG_MESSAGE("Creating neighbor lists...\n");
|
||||
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
|
||||
DEBUG_MESSAGE("Computing forces...\n");
|
||||
|
||||
double T_accum = 0.0;
|
||||
for(int i = 0; i < param.ntimes; i++) {
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, atom, &neighbor, i + 1);
|
||||
#endif
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
computeForceEam(&eam, ¶m, atom, &neighbor, &stats);
|
||||
} else {
|
||||
if(param.half_neigh) {
|
||||
T_accum += computeForceLJHalfNeigh(¶m, atom, &neighbor, &stats);
|
||||
} else {
|
||||
T_accum += computeForceLJFullNeigh(¶m, atom, &neighbor, &stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double freq_hz = param.proc_freq * 1.e9;
|
||||
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
|
||||
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
|
||||
const double cycles_per_neigh = cycles_per_atom / (double)(nneighs);
|
||||
|
||||
if(!csv) {
|
||||
printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
|
||||
if(param.proc_freq > 0.0) {
|
||||
printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
|
||||
}
|
||||
} else {
|
||||
printf("steps,pattern,natoms,nneighs,nreps,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
|
||||
if(param.proc_freq > 0.0) {
|
||||
printf(",cy/atom,cy/neigh");
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("%d,%s,%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
|
||||
param.ntimes, pattern_str, natoms, nneighs, nreps,
|
||||
estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
|
||||
|
||||
if(param.proc_freq > 0.0) {
|
||||
printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
double timer[NUMTIMER];
|
||||
timer[FORCE] = T_accum;
|
||||
displayStatistics(atom, ¶m, &stats, timer);
|
||||
LIKWID_MARKER_CLOSE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
368
src/verletlist/main.c
Normal file
368
src/verletlist/main.c
Normal file
@@ -0,0 +1,368 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <likwid-marker.h>
|
||||
#include <omp.h>
|
||||
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <eam.h>
|
||||
#include <integrate.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <pbc.h>
|
||||
#include <stats.h>
|
||||
#include <thermo.h>
|
||||
#include <timers.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
#include <vtk.h>
|
||||
|
||||
#define HLINE "------------------------------------------------------------------\n"
|
||||
|
||||
extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
|
||||
#endif
|
||||
|
||||
double setup(Parameter* param, Eam* eam, Atom* atom, Neighbor* neighbor, Stats* stats)
|
||||
{
|
||||
if (param->force_field == FF_EAM) {
|
||||
initEam(eam, param);
|
||||
}
|
||||
double timeStart, timeStop;
|
||||
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
|
||||
param->xprd = param->nx * param->lattice;
|
||||
param->yprd = param->ny * param->lattice;
|
||||
param->zprd = param->nz * param->lattice;
|
||||
|
||||
timeStart = getTimeStamp();
|
||||
initAtom(atom);
|
||||
initPbc(atom);
|
||||
initStats(stats);
|
||||
initNeighbor(neighbor, param);
|
||||
if (param->input_file == NULL) {
|
||||
createAtom(atom, param);
|
||||
} else {
|
||||
readAtom(atom, param);
|
||||
}
|
||||
|
||||
setupNeighbor(param);
|
||||
setupThermo(param, atom->Natoms);
|
||||
if (param->input_file == NULL) {
|
||||
adjustThermo(param, atom);
|
||||
}
|
||||
#ifdef SORT_ATOMS
|
||||
atom->Nghost = 0;
|
||||
sortAtom(atom);
|
||||
#endif
|
||||
setupPbc(atom, param);
|
||||
initDevice(atom, neighbor);
|
||||
updatePbc(atom, param, true);
|
||||
buildNeighbor(atom, neighbor);
|
||||
timeStop = getTimeStamp();
|
||||
return timeStop - timeStart;
|
||||
}
|
||||
|
||||
double reneighbour(Parameter* param, Atom* atom, Neighbor* neighbor)
|
||||
{
|
||||
double timeStart, timeStop;
|
||||
timeStart = getTimeStamp();
|
||||
LIKWID_MARKER_START("reneighbour");
|
||||
updateAtomsPbc(atom, param);
|
||||
#ifdef SORT_ATOMS
|
||||
atom->Nghost = 0;
|
||||
sortAtom(atom);
|
||||
#endif
|
||||
setupPbc(atom, param);
|
||||
updatePbc(atom, param, true);
|
||||
buildNeighbor(atom, neighbor);
|
||||
LIKWID_MARKER_STOP("reneighbour");
|
||||
timeStop = getTimeStamp();
|
||||
return timeStop - timeStart;
|
||||
}
|
||||
|
||||
void printAtomState(Atom* atom)
|
||||
{
|
||||
printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
|
||||
atom->Natoms,
|
||||
atom->Nlocal,
|
||||
atom->Nghost,
|
||||
atom->Nmax);
|
||||
// int nall = atom->Nlocal + atom->Nghost;
|
||||
// for (int i=0; i<nall; i++) {
|
||||
// printf("%d %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]);
|
||||
// }
|
||||
}
|
||||
|
||||
double computeForce(
|
||||
Eam* eam, Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats)
|
||||
{
|
||||
if (param->force_field == FF_EAM) {
|
||||
return computeForceEam(eam, param, atom, neighbor, stats);
|
||||
} else if (param->force_field == FF_DEM) {
|
||||
if (param->half_neigh) {
|
||||
fprintf(stderr, "Error: DEM cannot use half neighbor-lists!\n");
|
||||
return 0.0;
|
||||
} else {
|
||||
return computeForceDemFullNeigh(param, atom, neighbor, stats);
|
||||
}
|
||||
}
|
||||
|
||||
if (param->half_neigh) {
|
||||
return computeForceLJHalfNeigh(param, atom, neighbor, stats);
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
return computeForceLJFullNeigh(param, atom, neighbor);
|
||||
#else
|
||||
return computeForceLJFullNeigh(param, atom, neighbor, stats);
|
||||
#endif
|
||||
}
|
||||
|
||||
void writeInput(Parameter* param, Atom* atom)
|
||||
{
|
||||
FILE* fpin = fopen("input.in", "w");
|
||||
fprintf(fpin, "0,%f,0,%f,0,%f\n", param->xprd, param->yprd, param->zprd);
|
||||
|
||||
for (int i = 0; i < atom->Nlocal; i++) {
|
||||
fprintf(fpin,
|
||||
"1,%f,%f,%f,%f,%f,%f\n",
|
||||
atom_x(i),
|
||||
atom_y(i),
|
||||
atom_z(i),
|
||||
atom_vx(i),
|
||||
atom_vy(i),
|
||||
atom_vz(i));
|
||||
}
|
||||
|
||||
fclose(fpin);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
double timer[NUMTIMER];
|
||||
Eam eam;
|
||||
Atom atom;
|
||||
Neighbor neighbor;
|
||||
Stats stats;
|
||||
Parameter param;
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
// LIKWID_MARKER_REGISTER("reneighbour");
|
||||
// LIKWID_MARKER_REGISTER("pbc");
|
||||
}
|
||||
|
||||
initParameter(¶m);
|
||||
for (int i = 0; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "-p") == 0) || strcmp(argv[i], "--params") == 0) {
|
||||
readParameter(¶m, argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-f") == 0)) {
|
||||
if ((param.force_field = str2ff(argv[++i])) < 0) {
|
||||
fprintf(stderr, "Invalid force field!\n");
|
||||
exit(-1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-i") == 0)) {
|
||||
param.input_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-e") == 0)) {
|
||||
param.eam_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
|
||||
param.ntimes = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-nx") == 0)) {
|
||||
param.nx = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-ny") == 0)) {
|
||||
param.ny = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-nz") == 0)) {
|
||||
param.nz = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-half") == 0)) {
|
||||
param.half_neigh = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
|
||||
param.cutforce = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
|
||||
param.skin = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "--freq") == 0)) {
|
||||
param.proc_freq = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "--vtk") == 0)) {
|
||||
param.vtk_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-w") == 0)) {
|
||||
param.write_atom_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if ((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
|
||||
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
|
||||
printf(HLINE);
|
||||
printf("-p / --params <string>: file to read parameters from (can be "
|
||||
"specified more than once)\n");
|
||||
printf("-f <string>: force field (lj, eam or dem), "
|
||||
"default lj\n");
|
||||
printf("-i <string>: input file with atom positions "
|
||||
"(dump)\n");
|
||||
printf("-e <string>: input file for EAM\n");
|
||||
printf("-n / --nsteps <int>: set number of timesteps for "
|
||||
"simulation\n");
|
||||
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in "
|
||||
"x/y/z direction\n");
|
||||
printf("-half <int>: use half (1) or full (0) neighbor "
|
||||
"lists\n");
|
||||
printf("-r / --radius <real>: set cutoff radius\n");
|
||||
printf("-s / --skin <real>: set skin (verlet buffer)\n");
|
||||
printf("-w <file>: write input atoms to file\n");
|
||||
printf("--freq <real>: processor frequency (GHz)\n");
|
||||
printf("--vtk <string>: VTK file for visualization\n");
|
||||
printf(HLINE);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
param.cutneigh = param.cutforce + param.skin;
|
||||
setup(¶m, &eam, &atom, &neighbor, &stats);
|
||||
printParameter(¶m);
|
||||
printf(HLINE);
|
||||
|
||||
printf("step\ttemp\t\tpressure\n");
|
||||
computeThermo(0, ¶m, &atom);
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
if (param.write_atom_file != NULL) {
|
||||
writeAtom(&atom, ¶m);
|
||||
}
|
||||
|
||||
// writeInput(¶m, &atom);
|
||||
|
||||
timer[FORCE] = computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
timer[NEIGH] = 0.0;
|
||||
timer[TOTAL] = getTimeStamp();
|
||||
|
||||
if (param.vtk_file != NULL) {
|
||||
write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
|
||||
}
|
||||
|
||||
for (int n = 0; n < param.ntimes; n++) {
|
||||
bool reneigh = (n + 1) % param.reneigh_every == 0;
|
||||
initialIntegrate(reneigh, ¶m, &atom);
|
||||
if ((n + 1) % param.reneigh_every) {
|
||||
updatePbc(&atom, ¶m, false);
|
||||
} else {
|
||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
|
||||
}
|
||||
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
timer[FORCE] += computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
finalIntegrate(reneigh, ¶m, &atom);
|
||||
|
||||
if (!((n + 1) % param.nstat) && (n + 1) < param.ntimes) {
|
||||
#ifdef CUDA_TARGET
|
||||
memcpyFromGPU(atom.x, atom.d_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
|
||||
#endif
|
||||
computeThermo(n + 1, ¶m, &atom);
|
||||
}
|
||||
|
||||
if (param.vtk_file != NULL) {
|
||||
write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
|
||||
}
|
||||
}
|
||||
|
||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||
computeThermo(-1, ¶m, &atom);
|
||||
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n",
|
||||
atom.Natoms,
|
||||
atom.Nghost,
|
||||
param.ntimes);
|
||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||
timer[TOTAL],
|
||||
timer[FORCE],
|
||||
timer[NEIGH],
|
||||
timer[TOTAL] - timer[FORCE] - timer[NEIGH]);
|
||||
printf(HLINE);
|
||||
|
||||
int nthreads = 0;
|
||||
int chunkSize = 0;
|
||||
omp_sched_t schedKind;
|
||||
char schedType[10];
|
||||
#pragma omp parallel
|
||||
#pragma omp master
|
||||
{
|
||||
omp_get_schedule(&schedKind, &chunkSize);
|
||||
|
||||
switch (schedKind) {
|
||||
case omp_sched_static:
|
||||
strcpy(schedType, "static");
|
||||
break;
|
||||
case omp_sched_dynamic:
|
||||
strcpy(schedType, "dynamic");
|
||||
break;
|
||||
case omp_sched_guided:
|
||||
strcpy(schedType, "guided");
|
||||
break;
|
||||
case omp_sched_auto:
|
||||
strcpy(schedType, "auto");
|
||||
break;
|
||||
case omp_sched_monotonic:
|
||||
strcpy(schedType, "auto");
|
||||
break;
|
||||
}
|
||||
|
||||
nthreads = omp_get_max_threads();
|
||||
}
|
||||
|
||||
printf("Num threads: %d\n", nthreads);
|
||||
printf("Schedule: (%s,%d)\n", schedType, chunkSize);
|
||||
|
||||
printf("Performance: %.2f million atom updates per second\n",
|
||||
1e-6 * (double)atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||
#ifdef COMPUTE_STATS
|
||||
displayStatistics(&atom, ¶m, &stats, timer);
|
||||
#endif
|
||||
LIKWID_MARKER_CLOSE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
385
src/verletlist/neighbor.c
Normal file
385
src/verletlist/neighbor.c
Normal file
@@ -0,0 +1,385 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
|
||||
#define SMALL 1.0e-6
|
||||
#define FACTOR 0.999
|
||||
|
||||
MD_FLOAT xprd, yprd, zprd;
|
||||
MD_FLOAT bininvx, bininvy, bininvz;
|
||||
int mbinxlo, mbinylo, mbinzlo;
|
||||
int nbinx, nbiny, nbinz;
|
||||
int mbinx, mbiny, mbinz; // n bins in x, y, z
|
||||
int *bincount;
|
||||
int *bins;
|
||||
int mbins; //total number of bins
|
||||
int atoms_per_bin; // max atoms per bin
|
||||
MD_FLOAT cutneigh;
|
||||
MD_FLOAT cutneighsq; // neighbor cutoff squared
|
||||
int nmax;
|
||||
int nstencil; // # of bins in stencil
|
||||
int* stencil; // stencil list of bin offsets
|
||||
MD_FLOAT binsizex, binsizey, binsizez;
|
||||
static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
|
||||
static MD_FLOAT bindist(int, int, int);
|
||||
|
||||
/* exported subroutines */
|
||||
void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
||||
MD_FLOAT neighscale = 5.0 / 6.0;
|
||||
xprd = param->nx * param->lattice;
|
||||
yprd = param->ny * param->lattice;
|
||||
zprd = param->nz * param->lattice;
|
||||
cutneigh = param->cutneigh;
|
||||
nbinx = neighscale * param->nx;
|
||||
nbiny = neighscale * param->ny;
|
||||
nbinz = neighscale * param->nz;
|
||||
nmax = 0;
|
||||
atoms_per_bin = 8;
|
||||
stencil = NULL;
|
||||
bins = NULL;
|
||||
bincount = NULL;
|
||||
neighbor->maxneighs = 100;
|
||||
neighbor->numneigh = NULL;
|
||||
neighbor->neighbors = NULL;
|
||||
neighbor->half_neigh = param->half_neigh;
|
||||
}
|
||||
|
||||
void setupNeighbor(Parameter* param) {
|
||||
MD_FLOAT coord;
|
||||
int mbinxhi, mbinyhi, mbinzhi;
|
||||
int nextx, nexty, nextz;
|
||||
|
||||
if(param->input_file != NULL) {
|
||||
xprd = param->xprd;
|
||||
yprd = param->yprd;
|
||||
zprd = param->zprd;
|
||||
}
|
||||
|
||||
// TODO: update lo and hi for standard case and use them here instead
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
|
||||
|
||||
cutneighsq = cutneigh * cutneigh;
|
||||
|
||||
if(param->input_file != NULL) {
|
||||
binsizex = cutneigh * 0.5;
|
||||
binsizey = cutneigh * 0.5;
|
||||
binsizez = cutneigh * 0.5;
|
||||
nbinx = (int)((param->xhi - param->xlo) / binsizex);
|
||||
nbiny = (int)((param->yhi - param->ylo) / binsizey);
|
||||
nbinz = (int)((param->zhi - param->zlo) / binsizez);
|
||||
if(nbinx == 0) { nbinx = 1; }
|
||||
if(nbiny == 0) { nbiny = 1; }
|
||||
if(nbinz == 0) { nbinz = 1; }
|
||||
bininvx = nbinx / (param->xhi - param->xlo);
|
||||
bininvy = nbiny / (param->yhi - param->ylo);
|
||||
bininvz = nbinz / (param->zhi - param->zlo);
|
||||
} else {
|
||||
binsizex = xprd / nbinx;
|
||||
binsizey = yprd / nbiny;
|
||||
binsizez = zprd / nbinz;
|
||||
bininvx = 1.0 / binsizex;
|
||||
bininvy = 1.0 / binsizey;
|
||||
bininvz = 1.0 / binsizez;
|
||||
}
|
||||
|
||||
coord = xlo - cutneigh - SMALL * xprd;
|
||||
mbinxlo = (int) (coord * bininvx);
|
||||
if (coord < 0.0) { mbinxlo = mbinxlo - 1; }
|
||||
coord = xhi + cutneigh + SMALL * xprd;
|
||||
mbinxhi = (int) (coord * bininvx);
|
||||
|
||||
coord = ylo - cutneigh - SMALL * yprd;
|
||||
mbinylo = (int) (coord * bininvy);
|
||||
if (coord < 0.0) { mbinylo = mbinylo - 1; }
|
||||
coord = yhi + cutneigh + SMALL * yprd;
|
||||
mbinyhi = (int) (coord * bininvy);
|
||||
|
||||
coord = zlo - cutneigh - SMALL * zprd;
|
||||
mbinzlo = (int) (coord * bininvz);
|
||||
if (coord < 0.0) { mbinzlo = mbinzlo - 1; }
|
||||
coord = zhi + cutneigh + SMALL * zprd;
|
||||
mbinzhi = (int) (coord * bininvz);
|
||||
|
||||
mbinxlo = mbinxlo - 1;
|
||||
mbinxhi = mbinxhi + 1;
|
||||
mbinx = mbinxhi - mbinxlo + 1;
|
||||
|
||||
mbinylo = mbinylo - 1;
|
||||
mbinyhi = mbinyhi + 1;
|
||||
mbiny = mbinyhi - mbinylo + 1;
|
||||
|
||||
mbinzlo = mbinzlo - 1;
|
||||
mbinzhi = mbinzhi + 1;
|
||||
mbinz = mbinzhi - mbinzlo + 1;
|
||||
|
||||
nextx = (int) (cutneigh * bininvx);
|
||||
if(nextx * binsizex < FACTOR * cutneigh) nextx++;
|
||||
nexty = (int) (cutneigh * bininvy);
|
||||
if(nexty * binsizey < FACTOR * cutneigh) nexty++;
|
||||
nextz = (int) (cutneigh * bininvz);
|
||||
if(nextz * binsizez < FACTOR * cutneigh) nextz++;
|
||||
|
||||
if (stencil) { free(stencil); }
|
||||
stencil = (int*) malloc((2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
|
||||
nstencil = 0;
|
||||
int kstart = -nextz;
|
||||
|
||||
for(int k = kstart; k <= nextz; k++) {
|
||||
for(int j = -nexty; j <= nexty; j++) {
|
||||
for(int i = -nextx; i <= nextx; i++) {
|
||||
if(bindist(i, j, k) < cutneighsq) {
|
||||
stencil[nstencil++] = k * mbiny * mbinx + j * mbinx + i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mbins = mbinx * mbiny * mbinz;
|
||||
if (bincount) { free(bincount); }
|
||||
bincount = (int*) malloc(mbins * sizeof(int));
|
||||
if (bins) { free(bins); }
|
||||
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
||||
}
|
||||
|
||||
void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
|
||||
/* extend atom arrays if necessary */
|
||||
if(nall > nmax) {
|
||||
nmax = nall;
|
||||
if(neighbor->numneigh) free(neighbor->numneigh);
|
||||
if(neighbor->neighbors) free(neighbor->neighbors);
|
||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
||||
}
|
||||
|
||||
/* bin local & ghost atoms */
|
||||
binatoms(atom);
|
||||
int resize = 1;
|
||||
|
||||
/* loop over each atom, storing neighbors */
|
||||
while(resize) {
|
||||
int new_maxneighs = neighbor->maxneighs;
|
||||
resize = 0;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
int* neighptr = &(neighbor->neighbors[i * neighbor->maxneighs]);
|
||||
int n = 0;
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
int ibin = coord2bin(xtmp, ytmp, ztmp);
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_i = atom->type[i];
|
||||
#endif
|
||||
for(int k = 0; k < nstencil; k++) {
|
||||
int jbin = ibin + stencil[k];
|
||||
int* loc_bin = &bins[jbin * atoms_per_bin];
|
||||
|
||||
for(int m = 0; m < bincount[jbin]; m++) {
|
||||
int j = loc_bin[m];
|
||||
if((j == i) || (neighbor->half_neigh && (j < i))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_j = atom->type[j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
|
||||
#else
|
||||
const MD_FLOAT cutoff = cutneighsq;
|
||||
#endif
|
||||
if(rsq <= cutoff) {
|
||||
neighptr[n++] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[i] = n;
|
||||
if(n >= neighbor->maxneighs) {
|
||||
resize = 1;
|
||||
|
||||
if(n >= new_maxneighs) {
|
||||
new_maxneighs = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
printf("RESIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
free(neighbor->neighbors);
|
||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
MD_FLOAT bindist(int i, int j, int k) {
|
||||
MD_FLOAT delx, dely, delz;
|
||||
|
||||
if(i > 0) {
|
||||
delx = (i - 1) * binsizex;
|
||||
} else if(i == 0) {
|
||||
delx = 0.0;
|
||||
} else {
|
||||
delx = (i + 1) * binsizex;
|
||||
}
|
||||
|
||||
if(j > 0) {
|
||||
dely = (j - 1) * binsizey;
|
||||
} else if(j == 0) {
|
||||
dely = 0.0;
|
||||
} else {
|
||||
dely = (j + 1) * binsizey;
|
||||
}
|
||||
|
||||
if(k > 0) {
|
||||
delz = (k - 1) * binsizez;
|
||||
} else if(k == 0) {
|
||||
delz = 0.0;
|
||||
} else {
|
||||
delz = (k + 1) * binsizez;
|
||||
}
|
||||
|
||||
return (delx * delx + dely * dely + delz * delz);
|
||||
}
|
||||
|
||||
int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin) {
|
||||
int ix, iy, iz;
|
||||
|
||||
if(xin >= xprd) {
|
||||
ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
ix = (int)(xin * bininvx) - mbinxlo;
|
||||
} else {
|
||||
ix = (int)(xin * bininvx) - mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= yprd) {
|
||||
iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
iy = (int)(yin * bininvy) - mbinylo;
|
||||
} else {
|
||||
iy = (int)(yin * bininvy) - mbinylo - 1;
|
||||
}
|
||||
|
||||
if(zin >= zprd) {
|
||||
iz = (int)((zin - zprd) * bininvz) + nbinz - mbinzlo;
|
||||
} else if(zin >= 0.0) {
|
||||
iz = (int)(zin * bininvz) - mbinzlo;
|
||||
} else {
|
||||
iz = (int)(zin * bininvz) - mbinzlo - 1;
|
||||
}
|
||||
|
||||
return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
|
||||
}
|
||||
|
||||
void binatoms(Atom *atom) {
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
int resize = 1;
|
||||
|
||||
while(resize > 0) {
|
||||
resize = 0;
|
||||
|
||||
for(int i = 0; i < mbins; i++) {
|
||||
bincount[i] = 0;
|
||||
}
|
||||
|
||||
for(int i = 0; i < nall; i++) {
|
||||
int ibin = coord2bin(atom_x(i), atom_y(i), atom_z(i));
|
||||
|
||||
if(bincount[ibin] < atoms_per_bin) {
|
||||
int ac = bincount[ibin]++;
|
||||
bins[ibin * atoms_per_bin + ac] = i;
|
||||
} else {
|
||||
resize = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
free(bins);
|
||||
atoms_per_bin *= 2;
|
||||
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void sortAtom(Atom* atom) {
|
||||
binatoms(atom);
|
||||
int Nmax = atom->Nmax;
|
||||
int* binpos = bincount;
|
||||
|
||||
for(int i = 1; i < mbins; i++) {
|
||||
binpos[i] += binpos[i - 1];
|
||||
}
|
||||
|
||||
#ifdef AOS
|
||||
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
||||
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
||||
#else
|
||||
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
#endif
|
||||
MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
|
||||
MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
|
||||
|
||||
for(int mybin = 0; mybin < mbins; mybin++) {
|
||||
int start = mybin > 0 ? binpos[mybin - 1] : 0;
|
||||
int count = binpos[mybin] - start;
|
||||
for(int k = 0; k < count; k++) {
|
||||
int new_i = start + k;
|
||||
int old_i = bins[mybin * atoms_per_bin + k];
|
||||
#ifdef AOS
|
||||
new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
|
||||
new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
|
||||
new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
|
||||
new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
|
||||
new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
|
||||
new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
|
||||
#else
|
||||
new_x[new_i] = old_x[old_i];
|
||||
new_y[new_i] = old_y[old_i];
|
||||
new_z[new_i] = old_z[old_i];
|
||||
new_vx[new_i] = old_vx[old_i];
|
||||
new_vy[new_i] = old_vy[old_i];
|
||||
new_vz[new_i] = old_vz[old_i];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
free(atom->x);
|
||||
free(atom->vx);
|
||||
atom->x = new_x;
|
||||
atom->vx = new_vx;
|
||||
#ifndef AOS
|
||||
free(atom->y);
|
||||
free(atom->z);
|
||||
free(atom->vy);
|
||||
free(atom->vz);
|
||||
atom->y = new_y;
|
||||
atom->z = new_z;
|
||||
atom->vy = new_vy;
|
||||
atom->vz = new_vz;
|
||||
#endif
|
||||
}
|
55
src/verletlist/neighbor.h
Normal file
55
src/verletlist/neighbor.h
Normal file
@@ -0,0 +1,55 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __NEIGHBOR_H_
|
||||
#define __NEIGHBOR_H_
|
||||
|
||||
typedef struct {
|
||||
int *neighbors;
|
||||
int *numneigh;
|
||||
} DeviceNeighbor;
|
||||
|
||||
typedef struct {
|
||||
int every;
|
||||
int ncalls;
|
||||
int maxneighs;
|
||||
int half_neigh;
|
||||
int *neighbors;
|
||||
int *numneigh;
|
||||
|
||||
// Device data
|
||||
DeviceNeighbor d_neighbor;
|
||||
} Neighbor;
|
||||
|
||||
typedef struct {
|
||||
MD_FLOAT xprd; MD_FLOAT yprd; MD_FLOAT zprd;
|
||||
MD_FLOAT bininvx; MD_FLOAT bininvy; MD_FLOAT bininvz;
|
||||
int mbinxlo; int mbinylo; int mbinzlo;
|
||||
int nbinx; int nbiny; int nbinz;
|
||||
int mbinx; int mbiny; int mbinz;
|
||||
} Neighbor_params;
|
||||
|
||||
typedef struct {
|
||||
int* bincount;
|
||||
int* bins;
|
||||
int mbins;
|
||||
int atoms_per_bin;
|
||||
} Binning;
|
||||
|
||||
extern void initNeighbor(Neighbor*, Parameter*);
|
||||
extern void setupNeighbor(Parameter*);
|
||||
extern void binatoms(Atom*);
|
||||
extern void buildNeighbor_cpu(Atom*, Neighbor*);
|
||||
extern void sortAtom(Atom*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern void buildNeighbor_cuda(Atom*, Neighbor*);
|
||||
#endif
|
||||
|
||||
#endif
|
234
src/verletlist/pbc.c
Normal file
234
src/verletlist/pbc.c
Normal file
@@ -0,0 +1,234 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//---
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <pbc.h>
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
int nmaxGhost;
|
||||
int *PBCx, *PBCy, *PBCz;
|
||||
|
||||
static void growPbc(Atom*);
|
||||
|
||||
/* exported subroutines */
|
||||
void initPbc(Atom* atom)
|
||||
{
|
||||
nmaxGhost = 0;
|
||||
atom->border_map = NULL;
|
||||
PBCx = NULL;
|
||||
PBCy = NULL;
|
||||
PBCz = NULL;
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc_cpu(Atom* atom, Parameter* param, bool doReneighbor)
|
||||
{
|
||||
int* borderMap = atom->border_map;
|
||||
int nlocal = atom->Nlocal;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for (int i = 0; i < atom->Nghost; i++) {
|
||||
atom_x(nlocal + i) = atom_x(borderMap[i]) + PBCx[i] * xprd;
|
||||
atom_y(nlocal + i) = atom_y(borderMap[i]) + PBCy[i] * yprd;
|
||||
atom_z(nlocal + i) = atom_z(borderMap[i]) + PBCz[i] * zprd;
|
||||
}
|
||||
}
|
||||
|
||||
/* relocate atoms that have left domain according
|
||||
* to periodic boundary conditions */
|
||||
void updateAtomsPbc_cpu(Atom* atom, Parameter* param)
|
||||
{
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for (int i = 0; i < atom->Nlocal; i++) {
|
||||
if (atom_x(i) < 0.0) {
|
||||
atom_x(i) += xprd;
|
||||
} else if (atom_x(i) >= xprd) {
|
||||
atom_x(i) -= xprd;
|
||||
}
|
||||
|
||||
if (atom_y(i) < 0.0) {
|
||||
atom_y(i) += yprd;
|
||||
} else if (atom_y(i) >= yprd) {
|
||||
atom_y(i) -= yprd;
|
||||
}
|
||||
|
||||
if (atom_z(i) < 0.0) {
|
||||
atom_z(i) += zprd;
|
||||
} else if (atom_z(i) >= zprd) {
|
||||
atom_z(i) -= zprd;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* setup periodic boundary conditions by
|
||||
* defining ghost atoms around domain
|
||||
* only creates mapping and coordinate corrections
|
||||
* that are then enforced in updatePbc */
|
||||
#define ADDGHOST(dx, dy, dz) \
|
||||
Nghost++; \
|
||||
border_map[Nghost] = i; \
|
||||
PBCx[Nghost] = dx; \
|
||||
PBCy[Nghost] = dy; \
|
||||
PBCz[Nghost] = dz; \
|
||||
atom->type[atom->Nlocal + Nghost] = atom->type[i]
|
||||
|
||||
void setupPbc(Atom* atom, Parameter* param)
|
||||
{
|
||||
int* border_map = atom->border_map;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
MD_FLOAT cutneigh = param->cutneigh;
|
||||
int Nghost = -1;
|
||||
|
||||
for (int i = 0; i < atom->Nlocal; i++) {
|
||||
if (atom->Nlocal + Nghost + 7 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
if (Nghost + 7 >= nmaxGhost) {
|
||||
growPbc(atom);
|
||||
border_map = atom->border_map;
|
||||
}
|
||||
|
||||
MD_FLOAT x = atom_x(i);
|
||||
MD_FLOAT y = atom_y(i);
|
||||
MD_FLOAT z = atom_z(i);
|
||||
|
||||
/* Setup ghost atoms */
|
||||
/* 6 planes */
|
||||
if (param->pbc_x != 0) {
|
||||
if (x < cutneigh) {
|
||||
ADDGHOST(+1, 0, 0);
|
||||
}
|
||||
if (x >= (xprd - cutneigh)) {
|
||||
ADDGHOST(-1, 0, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (param->pbc_y != 0) {
|
||||
if (y < cutneigh) {
|
||||
ADDGHOST(0, +1, 0);
|
||||
}
|
||||
if (y >= (yprd - cutneigh)) {
|
||||
ADDGHOST(0, -1, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (param->pbc_z != 0) {
|
||||
if (z < cutneigh) {
|
||||
ADDGHOST(0, 0, +1);
|
||||
}
|
||||
if (z >= (zprd - cutneigh)) {
|
||||
ADDGHOST(0, 0, -1);
|
||||
}
|
||||
}
|
||||
|
||||
/* 8 corners */
|
||||
if (param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) {
|
||||
if (x < cutneigh && y < cutneigh && z < cutneigh) {
|
||||
ADDGHOST(+1, +1, +1);
|
||||
}
|
||||
if (x < cutneigh && y >= (yprd - cutneigh) && z < cutneigh) {
|
||||
ADDGHOST(+1, -1, +1);
|
||||
}
|
||||
if (x < cutneigh && y < cutneigh && z >= (zprd - cutneigh)) {
|
||||
ADDGHOST(+1, +1, -1);
|
||||
}
|
||||
if (x < cutneigh && y >= (yprd - cutneigh) && z >= (zprd - cutneigh)) {
|
||||
ADDGHOST(+1, -1, -1);
|
||||
}
|
||||
if (x >= (xprd - cutneigh) && y < cutneigh && z < cutneigh) {
|
||||
ADDGHOST(-1, +1, +1);
|
||||
}
|
||||
if (x >= (xprd - cutneigh) && y >= (yprd - cutneigh) && z < cutneigh) {
|
||||
ADDGHOST(-1, -1, +1);
|
||||
}
|
||||
if (x >= (xprd - cutneigh) && y < cutneigh && z >= (zprd - cutneigh)) {
|
||||
ADDGHOST(-1, +1, -1);
|
||||
}
|
||||
if (x >= (xprd - cutneigh) && y >= (yprd - cutneigh) &&
|
||||
z >= (zprd - cutneigh)) {
|
||||
ADDGHOST(-1, -1, -1);
|
||||
}
|
||||
}
|
||||
|
||||
/* 12 edges */
|
||||
if (param->pbc_x != 0 && param->pbc_z != 0) {
|
||||
if (x < cutneigh && z < cutneigh) {
|
||||
ADDGHOST(+1, 0, +1);
|
||||
}
|
||||
if (x < cutneigh && z >= (zprd - cutneigh)) {
|
||||
ADDGHOST(+1, 0, -1);
|
||||
}
|
||||
if (x >= (xprd - cutneigh) && z < cutneigh) {
|
||||
ADDGHOST(-1, 0, +1);
|
||||
}
|
||||
if (x >= (xprd - cutneigh) && z >= (zprd - cutneigh)) {
|
||||
ADDGHOST(-1, 0, -1);
|
||||
}
|
||||
}
|
||||
|
||||
if (param->pbc_y != 0 && param->pbc_z != 0) {
|
||||
if (y < cutneigh && z < cutneigh) {
|
||||
ADDGHOST(0, +1, +1);
|
||||
}
|
||||
if (y < cutneigh && z >= (zprd - cutneigh)) {
|
||||
ADDGHOST(0, +1, -1);
|
||||
}
|
||||
if (y >= (yprd - cutneigh) && z < cutneigh) {
|
||||
ADDGHOST(0, -1, +1);
|
||||
}
|
||||
if (y >= (yprd - cutneigh) && z >= (zprd - cutneigh)) {
|
||||
ADDGHOST(0, -1, -1);
|
||||
}
|
||||
}
|
||||
|
||||
if (param->pbc_x != 0 && param->pbc_y != 0) {
|
||||
if (y < cutneigh && x < cutneigh) {
|
||||
ADDGHOST(+1, +1, 0);
|
||||
}
|
||||
if (y < cutneigh && x >= (xprd - cutneigh)) {
|
||||
ADDGHOST(-1, +1, 0);
|
||||
}
|
||||
if (y >= (yprd - cutneigh) && x < cutneigh) {
|
||||
ADDGHOST(+1, -1, 0);
|
||||
}
|
||||
if (y >= (yprd - cutneigh) && x >= (xprd - cutneigh)) {
|
||||
ADDGHOST(-1, -1, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
// increase by one to make it the ghost atom count
|
||||
atom->Nghost = Nghost + 1;
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
void growPbc(Atom* atom)
|
||||
{
|
||||
int nold = nmaxGhost;
|
||||
nmaxGhost += DELTA;
|
||||
|
||||
atom->border_map = (int*)reallocate(atom->border_map,
|
||||
ALIGNMENT,
|
||||
nmaxGhost * sizeof(int),
|
||||
nold * sizeof(int));
|
||||
PBCx = (int*)reallocate(PBCx, ALIGNMENT, nmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
PBCy = (int*)reallocate(PBCy, ALIGNMENT, nmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
PBCz = (int*)reallocate(PBCz, ALIGNMENT, nmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
}
|
24
src/verletlist/pbc.h
Normal file
24
src/verletlist/pbc.h
Normal file
@@ -0,0 +1,24 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdbool.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __PBC_H_
|
||||
#define __PBC_H_
|
||||
extern void initPbc(Atom*);
|
||||
extern void updatePbc_cpu(Atom*, Parameter*, bool);
|
||||
extern void updateAtomsPbc_cpu(Atom*, Parameter*);
|
||||
extern void setupPbc(Atom*, Parameter*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern void updatePbc_cuda(Atom*, Parameter*, bool);
|
||||
extern void updateAtomsPbc_cuda(Atom*, Parameter*);
|
||||
#endif
|
||||
|
||||
#endif
|
48
src/verletlist/stats.c
Normal file
48
src/verletlist/stats.c
Normal file
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timers.h>
|
||||
|
||||
void initStats(Stats *s) {
|
||||
s->total_force_neighs = 0;
|
||||
s->total_force_iters = 0;
|
||||
s->atoms_within_cutoff = 0;
|
||||
s->atoms_outside_cutoff = 0;
|
||||
}
|
||||
|
||||
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
|
||||
#ifdef COMPUTE_STATS
|
||||
|
||||
double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
|
||||
(double)(stats->total_force_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
|
||||
double avg_neigh = stats->total_force_neighs / (double)(atom->Nlocal * (param->ntimes + 1));
|
||||
double avg_simd = stats->total_force_iters / (double)(atom->Nlocal * (param->ntimes + 1));
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
force_useful_volume += 1e-9 * (double)((atom->Nlocal * (param->ntimes + 1)) + stats->total_force_neighs) * sizeof(int);
|
||||
#endif
|
||||
|
||||
printf("Statistics:\n");
|
||||
printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq);
|
||||
printf("\tAverage neighbors per atom: %.4f\n", avg_neigh);
|
||||
printf("\tAverage SIMD iterations per atom: %.4f\n", avg_simd);
|
||||
printf("\tTotal number of computed pair interactions: %lld\n", stats->total_force_neighs);
|
||||
printf("\tTotal number of SIMD iterations: %lld\n", stats->total_force_iters);
|
||||
printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
|
||||
printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->total_force_iters);
|
||||
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
const double eff_pct = (double)stats->atoms_within_cutoff / (double)(stats->atoms_within_cutoff + stats->atoms_outside_cutoff) * 100.0;
|
||||
printf("\tAtoms within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->atoms_within_cutoff, stats->atoms_outside_cutoff, eff_pct);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
32
src/verletlist/stats.h
Normal file
32
src/verletlist/stats.h
Normal file
@@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __STATS_H_
|
||||
#define __STATS_H_
|
||||
typedef struct {
|
||||
long long int total_force_neighs;
|
||||
long long int total_force_iters;
|
||||
long long int atoms_within_cutoff;
|
||||
long long int atoms_outside_cutoff;
|
||||
} Stats;
|
||||
|
||||
void initStats(Stats *s);
|
||||
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
|
||||
|
||||
#ifdef COMPUTE_STATS
|
||||
# define addStat(stat, value) stat += value;
|
||||
# define beginStatTimer() double Si = getTimeStamp();
|
||||
# define endStatTimer(stat) stat += getTimeStamp() - Si;
|
||||
#else
|
||||
# define addStat(stat, value)
|
||||
# define beginStatTimer()
|
||||
# define endStatTimer(stat)
|
||||
#endif
|
||||
|
||||
#endif
|
56
src/verletlist/tracing.c
Normal file
56
src/verletlist/tracing.c
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <tracing.h>
|
||||
|
||||
void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep) {
|
||||
MEM_TRACER_INIT;
|
||||
INDEX_TRACER_INIT;
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
|
||||
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MEM_TRACE(atom_x(i), 'R');
|
||||
MEM_TRACE(atom_y(i), 'R');
|
||||
MEM_TRACE(atom_z(i), 'R');
|
||||
INDEX_TRACE_ATOM(i);
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
MEM_TRACE(atom->type[i], 'R');
|
||||
#endif
|
||||
|
||||
DIST_TRACE_SORT(neighs, numneighs);
|
||||
INDEX_TRACE(neighs, numneighs);
|
||||
DIST_TRACE(neighs, numneighs);
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
MEM_TRACE(neighs[k], 'R');
|
||||
MEM_TRACE(atom_x(j), 'R');
|
||||
MEM_TRACE(atom_y(j), 'R');
|
||||
MEM_TRACE(atom_z(j), 'R');
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
MEM_TRACE(atom->type[j], 'R');
|
||||
#endif
|
||||
}
|
||||
|
||||
MEM_TRACE(atom_fx(i), 'R');
|
||||
MEM_TRACE(atom_fx(i), 'W');
|
||||
MEM_TRACE(atom_fy(i), 'R');
|
||||
MEM_TRACE(atom_fy(i), 'W');
|
||||
MEM_TRACE(atom_fz(i), 'R');
|
||||
MEM_TRACE(atom_fz(i), 'W');
|
||||
}
|
||||
|
||||
INDEX_TRACER_END;
|
||||
MEM_TRACER_END;
|
||||
}
|
102
src/verletlist/tracing.h
Normal file
102
src/verletlist/tracing.h
Normal file
@@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
|
||||
#ifndef VECTOR_WIDTH
|
||||
# define VECTOR_WIDTH 8
|
||||
#endif
|
||||
|
||||
#ifndef TRACER_CONDITION
|
||||
# define TRACER_CONDITION (!(timestep % param->every))
|
||||
#endif
|
||||
|
||||
#ifdef MEM_TRACER
|
||||
# define MEM_TRACER_INIT FILE *mem_tracer_fp; \
|
||||
if(TRACER_CONDITION) { \
|
||||
char mem_tracer_fn[128]; \
|
||||
snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
|
||||
mem_tracer_fp = fopen(mem_tracer_fn, "w");
|
||||
}
|
||||
|
||||
# define MEM_TRACER_END if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
|
||||
# define MEM_TRACE(addr, op) if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
|
||||
#else
|
||||
# define MEM_TRACER_INIT
|
||||
# define MEM_TRACER_END
|
||||
# define MEM_TRACE(addr, op)
|
||||
#endif
|
||||
|
||||
#ifdef INDEX_TRACER
|
||||
# define INDEX_TRACER_INIT FILE *index_tracer_fp; \
|
||||
if(TRACER_CONDITION) { \
|
||||
char index_tracer_fn[128]; \
|
||||
snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
|
||||
index_tracer_fp = fopen(index_tracer_fn, "w"); \
|
||||
}
|
||||
|
||||
# define INDEX_TRACER_END if(TRACER_CONDITION) { fclose(index_tracer_fp); }
|
||||
# define INDEX_TRACE_NATOMS(nl, ng, mn) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
|
||||
# define INDEX_TRACE_ATOM(a) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
|
||||
# define INDEX_TRACE(l, e) if(TRACER_CONDITION) { \
|
||||
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
|
||||
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
|
||||
fprintf(index_tracer_fp, "I: "); \
|
||||
for(int __j = 0; __j < __e; ++__j) { \
|
||||
fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
|
||||
} \
|
||||
fprintf(index_tracer_fp, "\n"); \
|
||||
} \
|
||||
}
|
||||
|
||||
# define DIST_TRACE_SORT(l, e) if(TRACER_CONDITION) { \
|
||||
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
|
||||
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
|
||||
if(__e > 1) { \
|
||||
for(int __j = __i; __j < __i + __e - 1; ++__j) { \
|
||||
for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
|
||||
if(l[__k] > l[__k + 1]) { \
|
||||
int __t = l[__k]; \
|
||||
l[__k] = l[__k + 1]; \
|
||||
l[__k + 1] = __t; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
# define DIST_TRACE(l, e) if(TRACER_CONDITION) { \
|
||||
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
|
||||
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
|
||||
if(__e > 1) { \
|
||||
fprintf(index_tracer_fp, "D: "); \
|
||||
for(int __j = 0; __j < __e - 1; ++__j) { \
|
||||
int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
|
||||
fprintf(index_tracer_fp, "%d ", __dist); \
|
||||
} \
|
||||
fprintf(index_tracer_fp, "\n"); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
# define INDEX_TRACER_INIT
|
||||
# define INDEX_TRACER_END
|
||||
# define INDEX_TRACE_NATOMS(nl, ng, mn)
|
||||
# define INDEX_TRACE_ATOM(a)
|
||||
# define INDEX_TRACE(l, e)
|
||||
# define DIST_TRACE_SORT(l, e)
|
||||
# define DIST_TRACE(l, e)
|
||||
#endif
|
||||
|
||||
extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep);
|
50
src/verletlist/vtk.c
Normal file
50
src/verletlist/vtk.c
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <atom.h>
|
||||
|
||||
int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep);
|
||||
FILE* fp = fopen(timestep_filename, "wb");
|
||||
|
||||
if(fp == NULL) {
|
||||
fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(fp, "# vtk DataFile Version 2.0\n");
|
||||
fprintf(fp, "Particle data\n");
|
||||
fprintf(fp, "ASCII\n");
|
||||
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
|
||||
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
|
||||
for(int i = 0; i < atom->Nlocal; ++i) {
|
||||
fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
|
||||
for(int i = 0; i < atom->Nlocal; ++i) {
|
||||
fprintf(fp, "1 %d\n", i);
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
|
||||
for(int i = 0; i < atom->Nlocal; ++i) {
|
||||
fprintf(fp, "1\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fprintf(fp, "POINT_DATA %d\n", atom->Nlocal);
|
||||
fprintf(fp, "SCALARS mass double\n");
|
||||
fprintf(fp, "LOOKUP_TABLE default\n");
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
fprintf(fp, "1.0\n");
|
||||
}
|
||||
fprintf(fp, "\n\n");
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
12
src/verletlist/vtk.h
Normal file
12
src/verletlist/vtk.h
Normal file
@@ -0,0 +1,12 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
|
||||
#ifndef __VTK_H_
|
||||
#define __VTK_H_
|
||||
extern int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
#endif
|
Reference in New Issue
Block a user