From dc4d5f1a9cc4b505a5bc66a672ac2018c314150c Mon Sep 17 00:00:00 2001
From: Maximilian Gaul
Date: Sat, 1 Jan 2022 18:18:12 +0100
Subject: [PATCH] Porting atom velocity memory layout to AoS, porting velocity integration to CUDA, adding measurements + logbook update

---
 .gitignore          |   1 +
 src/allocate.c      |   2 +-
 src/atom.c          |  12 ++--
 src/force.cu        | 142 ++++++++++++++++++++++++++++++++++----------
 src/includes/atom.h |   8 +++
 src/main.c          |  34 ++++++-----
 src/neighbor.c      |  22 +++++--
 src/thermo.c        |  26 ++++----
 8 files changed, 175 insertions(+), 72 deletions(-)

diff --git a/.gitignore b/.gitignore
index 52c86c7..50d6fbe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,6 +52,7 @@ Mkfile.old
 dkms.conf
 
 # Build directories and executables
+.vscode/
 GCC/
 ICC/
 MDBench-GCC*
diff --git a/src/allocate.c b/src/allocate.c
index 39199c7..2593291 100644
--- a/src/allocate.c
+++ b/src/allocate.c
@@ -78,7 +78,7 @@ void* reallocate (
     size_t newBytesize,
     size_t oldBytesize)
 {
-    void* newarray = allocate(alignment, newBytesize);
+    void* newarray = allocate(alignment, newBytesize);
 
     if(ptr != NULL) {
         memcpy(newarray, ptr, oldBytesize);
diff --git a/src/atom.c b/src/atom.c
index 55fb103..1b3d4c0 100644
--- a/src/atom.c
+++ b/src/atom.c
@@ -137,9 +137,9 @@ void createAtom(Atom *atom, Parameter *param)
                 atom_x(atom->Nlocal) = xtmp;
                 atom_y(atom->Nlocal) = ytmp;
                 atom_z(atom->Nlocal) = ztmp;
-                atom->vx[atom->Nlocal] = vxtmp;
-                atom->vy[atom->Nlocal] = vytmp;
-                atom->vz[atom->Nlocal] = vztmp;
+                atom_vx(atom->Nlocal) = vxtmp;
+                atom_vy(atom->Nlocal) = vytmp;
+                atom_vz(atom->Nlocal) = vztmp;
                 atom->type[atom->Nlocal] = rand() % atom->ntypes;
                 atom->Nlocal++;
             }
@@ -164,6 +164,8 @@ void growAtom(Atom *atom)
 
     atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
     atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+
+    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
 #else
     atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
@@ -172,10 +174,12 @@ void growAtom(Atom *atom)
     atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    #endif
     atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+
+    #endif
+
     atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
 }
diff --git a/src/force.cu b/src/force.cu
index 5b9fb90..e044585 100644
--- a/src/force.cu
+++ b/src/force.cu
@@ -92,13 +92,115 @@ __global__ void calc_force(
     atom_fz(i) = fiz;
 }
 
+__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+    atom_x(i) = atom_x(i) + dt * atom_vx(i);
+    atom_y(i) = atom_y(i) + dt * atom_vy(i);
+    atom_z(i) = atom_z(i) + dt * atom_vz(i);
+}
+
+__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+}
+
 extern "C" {
 
-bool initialized = false;
 static Atom c_atom;
 int *c_neighs;
 int *c_neigh_numneigh;
 
+int get_num_threads() {
+
+    const char *num_threads_env = getenv("NUM_THREADS");
+    int num_threads = 0;
+    if(num_threads_env == nullptr)
+        num_threads = 32;
+    else {
+        num_threads = atoi(num_threads_env);
+    }
+
+    return num_threads;
+}
+
+void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
+
+    const int Nlocal = atom->Nlocal;
+    const int num_threads = get_num_threads();
+
+    const int num_threads_per_block = num_threads; // this should be a multiple of 32 as operations are performed at the level of warps
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, c_atom);
+
+    checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+        checkCUDAError( "FinalIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
+
+    const int Nlocal = atom->Nlocal;
+    const int num_threads = get_num_threads();
+
+    const int num_threads_per_block = num_threads; // this should be a multiple of 32 as operations are performed at the level of warps
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, c_atom);
+
+    checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
+
+    checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    checkCUDAError( "InitialIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+}
+
+void initCudaAtom(Atom *atom, Neighbor *neighbor) {
+
+    const int Nlocal = atom->Nlocal;
+
+    checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+    checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal * 3) );
+    checkCUDAError( "c_atom.vx malloc", cudaMalloc((void**)&(c_atom.vx), sizeof(MD_FLOAT) * Nlocal * 3) );
+    checkCUDAError( "c_atom.vx memcpy", cudaMemcpy(c_atom.vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) );
+    checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); + checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); + + checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) ); + checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) ); + + checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); + + checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); +} + double computeForce( bool reneighbourHappenend, Parameter *param, @@ -113,15 +215,7 @@ double computeForce( MD_FLOAT epsilon = param->epsilon; #endif - cudaProfilerStart(); - - const char *num_threads_env = getenv("NUM_THREADS"); - int num_threads = 0; - if(num_threads_env == nullptr) - num_threads = 32; - else { - num_threads = atoi(num_threads_env); - } + const int num_threads = get_num_threads(); c_atom.Natoms = atom->Natoms; c_atom.Nlocal = atom->Nlocal; @@ -149,25 +243,9 @@ double computeForce( // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error // HINT: Only works for data layout = AOS!!! - if(!initialized) { - checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) ); - checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal * 3) ); - checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) ); - checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); - checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); - checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); + // checkCUDAError( "c_atom.fx memset", cudaMemset(c_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) ); - checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) ); - checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) ); - - checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) ); - checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); - checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); - - checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); - } - - checkCUDAError( "c_atom.fx memset", cudaMemset(c_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) ); + cudaProfilerStart(); checkCUDAError( 
"c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) ); @@ -184,12 +262,12 @@ double computeForce( calc_force <<< num_blocks, num_threads_per_block >>> (c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighs, c_neigh_numneigh); - checkCUDAError( "PeekAtLastError", cudaPeekAtLastError() ); - checkCUDAError( "DeviceSync", cudaDeviceSynchronize() ); + checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() ); + checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() ); // copy results in c_atom.fx/fy/fz to atom->fx/fy/fz - cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyDeviceToHost); + checkCUDAError( "memcpy force back", cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyDeviceToHost) ); /* cudaFree(c_atom.x); @@ -207,8 +285,6 @@ double computeForce( LIKWID_MARKER_STOP("force"); double E = getTimeStamp(); - initialized = true; - return E-S; } } \ No newline at end of file diff --git a/src/includes/atom.h b/src/includes/atom.h index 5640517..a91e02a 100644 --- a/src/includes/atom.h +++ b/src/includes/atom.h @@ -54,6 +54,10 @@ extern void growAtom(Atom*); #define atom_fy(i) atom->fx[(i) * 3 + 1] #define atom_fz(i) atom->fx[(i) * 3 + 2] +#define atom_vx(i) atom->vx[(i) * 3 + 0] +#define atom_vy(i) atom->vx[(i) * 3 + 1] +#define atom_vz(i) atom->vx[(i) * 3 + 2] + #else #define POS_DATA_LAYOUT "SoA" @@ -65,6 +69,10 @@ extern void growAtom(Atom*); #define atom_fy(i) atom->fy[i] #define atom_fz(i) atom->fz[i] +#define atom_vx(i) atom->vx[i] +#define atom_vy(i) atom->vy[i] +#define atom_vz(i) atom->vz[i] + #endif #endif diff --git a/src/main.c b/src/main.c index 86b59a2..28898b0 100644 --- a/src/main.c +++ b/src/main.c @@ -45,6 +45,11 @@ #define HLINE "----------------------------------------------------------------------------\n" +extern void initCudaAtom(Atom *atom, Neighbor *neighbor); + +extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom); +extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom); + extern double computeForce(bool, Parameter*, Atom*, Neighbor*); extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int); extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep); @@ -101,6 +106,8 @@ double setup( buildNeighbor(atom, neighbor); E = getTimeStamp(); + initCudaAtom(atom, neighbor); + return E-S; } @@ -126,26 +133,22 @@ double reneighbour( void initialIntegrate(Parameter *param, Atom *atom) { - MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz; - for(int i = 0; i < atom->Nlocal; i++) { - vx[i] += param->dtforce * atom_fx(i); - vy[i] += param->dtforce * atom_fy(i); - vz[i] += param->dtforce * atom_fz(i); - atom_x(i) = atom_x(i) + param->dt * vx[i]; - atom_y(i) = atom_y(i) + param->dt * vy[i]; - atom_z(i) = atom_z(i) + param->dt * vz[i]; + atom_vx(i) += param->dtforce * atom_fx(i); + atom_vy(i) += param->dtforce * atom_fy(i); + atom_vz(i) += param->dtforce * atom_fz(i); + atom_x(i) = atom_x(i) + param->dt * atom_vx(i); + atom_y(i) = atom_y(i) + param->dt * atom_vy(i); + atom_z(i) = atom_z(i) + param->dt * atom_vz(i); } } void finalIntegrate(Parameter *param, Atom *atom) { - MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz; - for(int i = 0; i < atom->Nlocal; i++) { - vx[i] += param->dtforce * atom_fx(i); - vy[i] += param->dtforce * atom_fy(i); - 
-        vz[i] += param->dtforce * atom_fz(i);
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
     }
 }
 
@@ -274,10 +277,11 @@ int main(int argc, char** argv)
     }
 
     for(int n = 0; n < param.ntimes; n++) {
-        initialIntegrate(&param, &atom);
 
         const bool doReneighbour = (n + 1) % param.every == 0;
 
+        cuda_initial_integrate(doReneighbour, &param, &atom); // initialIntegrate(&param, &atom);
+
         if(doReneighbour) {
             timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
         } else {
@@ -293,7 +297,7 @@ int main(int argc, char** argv)
             timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor);
 #endif
         }
-        finalIntegrate(&param, &atom);
+        cuda_final_integrate(doReneighbour, &param, &atom); // finalIntegrate(&param, &atom);
 
         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
             computeThermo(n + 1, &param, &atom);
diff --git a/src/neighbor.c b/src/neighbor.c
index c9f9634..cce3756 100644
--- a/src/neighbor.c
+++ b/src/neighbor.c
@@ -359,14 +359,18 @@ void sortAtom(Atom* atom) {
 
 #ifdef AOS
     double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+
+    double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
 #else
     double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_y = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_z = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-#endif
+
     double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_vy = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_vz = (double*) malloc(Nmax * sizeof(MD_FLOAT));
+#endif
+
     double* old_x = atom->x; double* old_y = atom->y; double* old_z = atom->z;
     double* old_vx = atom->vx; double* old_vy = atom->vy; double* old_vz = atom->vz;
@@ -380,24 +384,34 @@ void sortAtom(Atom* atom) {
             new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
             new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
             new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
+
+            new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
+            new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
+            new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
 #else
             new_x[new_i] = old_x[old_i];
             new_y[new_i] = old_y[old_i];
             new_z[new_i] = old_z[old_i];
-#endif
+
             new_vx[new_i] = old_vx[old_i];
             new_vy[new_i] = old_vy[old_i];
            new_vz[new_i] = old_vz[old_i];
+#endif
+
         }
     }
 
     free(atom->x);
     atom->x = new_x;
+
+    free(atom->vx);
+    atom->vx = new_vx;
 #ifndef AOS
     free(atom->y); free(atom->z);
     atom->y = new_y; atom->z = new_z;
+
+    free(atom->vy); free(atom->vz);
+    atom->vy = new_vy; atom->vz = new_vz;
 #endif
-    free(atom->vx); free(atom->vy); free(atom->vz);
-    atom->vx = new_vx; atom->vy = new_vy; atom->vz = new_vz;
 }
diff --git a/src/thermo.c b/src/thermo.c
index 70b897d..6161bf4 100644
--- a/src/thermo.c
+++ b/src/thermo.c
@@ -71,12 +71,9 @@ void setupThermo(Parameter *param, int natoms)
 void computeThermo(int iflag, Parameter *param, Atom *atom)
 {
     MD_FLOAT t = 0.0, p;
-    MD_FLOAT* vx = atom->vx;
-    MD_FLOAT* vy = atom->vy;
-    MD_FLOAT* vz = atom->vz;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
     }
 
     t = t * t_scale;
@@ -101,12 +98,11 @@ void adjustThermo(Parameter *param, Atom *atom)
 {
     /* zero center-of-mass motion */
     MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
-    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        vxtot += vx[i];
-        vytot += vy[i];
-        vztot += vz[i];
+        vxtot += atom_vx(i);
+        vytot += atom_vy(i);
+        vztot += atom_vz(i);
     }
 
     vxtot = vxtot / atom->Natoms;
@@ -114,24 +110,24 @@ void adjustThermo(Parameter *param, Atom *atom)
     vztot = vztot / atom->Natoms;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] -= vxtot;
-        vy[i] -= vytot;
-        vz[i] -= vztot;
+        atom_vx(i) -= vxtot;
+        atom_vy(i) -= vytot;
+        atom_vz(i) -= vztot;
     }
 
     t_act = 0;
     MD_FLOAT t = 0.0;
     for(int i = 0; i < atom->Nlocal; i++) {
-        t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
     }
 
     t *= t_scale;
     MD_FLOAT factor = sqrt(param->temp / t);
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] *= factor;
-        vy[i] *= factor;
-        vz[i] *= factor;
+        atom_vx(i) *= factor;
+        atom_vy(i) *= factor;
+        atom_vz(i) *= factor;
     }
 }