MD-Bench/gromacs/cuda/force_lj_sup.cu


extern "C" {

#include <stdio.h>
//---
#include <cuda.h>
#include <driver_types.h>
//---
#include <likwid-marker.h>
//---
#include <atom.h>
#include <device.h>
#include <neighbor.h>
#include <parameter.h>
#include <stats.h>
#include <timing.h>
#include <util.h>

}

extern "C" {
    extern MD_FLOAT *cuda_cl_x;
    extern MD_FLOAT *cuda_cl_v;
    extern MD_FLOAT *cuda_cl_f;
    extern int *cuda_neighbors;
    extern int *cuda_numneigh;
    extern int *cuda_natoms;
    extern int *natoms;
    extern int *ngatoms;
    extern int *cuda_border_map;
    extern int *cuda_jclusters_natoms;
    extern MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
    extern MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
    extern MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
    extern int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
    extern int isReneighboured;

    extern int *cuda_iclusters;
    extern int *cuda_nclusters;

    extern MD_FLOAT *cuda_scl_x;
    extern MD_FLOAT *cuda_scl_v;
    extern MD_FLOAT *cuda_scl_f;
}

#ifdef USE_SUPER_CLUSTERS
extern "C"
void alignDataToSuperclusters(Atom *atom) {
    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;

        for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {

            MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
            MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
            MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];

            /*
            MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
            MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
            MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
             */

            memcpy(&atom->scl_x[scci], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));

            memcpy(&atom->scl_v[scci], &ci_v[0], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));

            memcpy(&atom->scl_f[scci], &ci_f[0], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));

        }
    }
}

extern "C"
void alignDataFromSuperclusters(Atom *atom) {
    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;

        for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {


            MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
            MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
            MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];

            /*
            MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
            MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
            MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
             */

            memcpy(&ci_x[0], &atom->scl_x[scci], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_x[0 + CLUSTER_M], &atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_x[0 + 2 * CLUSTER_M], &atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));

            memcpy(&ci_v[0], &atom->scl_v[scci], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_v[0 + CLUSTER_M], &atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_v[0 + 2 * CLUSTER_M], &atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));

            memcpy(&ci_f[0], &atom->scl_f[scci], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_f[0 + CLUSTER_M], &atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_f[0 + 2 * CLUSTER_M], &atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));

        }
    }
}

__global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
                                             int *cuda_nclusters,
                                             int *cuda_natoms,
                                             int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {

    unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
    //unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
    if (sci_pos >= Nsclusters_local) return;

    //unsigned int ci_pos = cii_pos / CLUSTER_M;
    //unsigned int scii_pos = cii_pos % CLUSTER_M;

    //if (ci_pos >= cuda_nclusters[sci_pos]) return;
    //if (scii_pos >= cuda_natoms[ci_pos]) return;

    int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
    MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];

    for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
        ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
        ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
        ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
        ci_x[SCL_X_OFFSET + scii_pos] += dt * ci_v[SCL_X_OFFSET + scii_pos];
        ci_x[SCL_Y_OFFSET + scii_pos] += dt * ci_v[SCL_Y_OFFSET + scii_pos];
        ci_x[SCL_Z_OFFSET + scii_pos] += dt * ci_v[SCL_Z_OFFSET + scii_pos];
    }
}

__global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
                                           int *cuda_nclusters, int *cuda_natoms,
                                           int Nsclusters_local, MD_FLOAT dtforce) {

    unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
    //unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
    if (sci_pos >= Nsclusters_local) return;

    //unsigned int ci_pos = cii_pos / CLUSTER_M;
    //unsigned int scii_pos = cii_pos % CLUSTER_M;

    //if (ci_pos >= cuda_nclusters[sci_pos]) return;
    //if (scii_pos >= cuda_natoms[ci_pos]) return;

    int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
    MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];

    for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
        ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
        ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
        ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
    }

}

__global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
                                            int *cuda_nclusters, int *cuda_iclusters,
                                            int Nsclusters_local,
                                            int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
                                            MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {

    unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int scii_pos = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
    if ((sci_pos >= Nsclusters_local) || (scii_pos >= SCLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;

    unsigned int ci_pos = scii_pos / CLUSTER_M;
    unsigned int cii_pos = scii_pos % CLUSTER_M;

    if (ci_pos >= cuda_nclusters[sci_pos]) return;

    int ci_cj0 = CJ0_FROM_CI(ci_pos);
    int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];


    //int numneighs = cuda_numneigh[ci_pos];
    int numneighs = cuda_numneigh[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos]];

    for(int k = 0; k < numneighs; k++) {
        int cj = (&cuda_neighs[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] * maxneighs])[k];
        // TODO Make cj accessible from super cluster data alignment (not reachable right now)
        int cj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);
        MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
        MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];

        MD_FLOAT xtmp = ci_x[SCL_CL_X_OFFSET(ci_pos) + cii_pos];
        MD_FLOAT ytmp = ci_x[SCL_CL_Y_OFFSET(ci_pos) + cii_pos];
        MD_FLOAT ztmp = ci_x[SCL_CL_Z_OFFSET(ci_pos) + cii_pos];
        MD_FLOAT fix = 0;
        MD_FLOAT fiy = 0;
        MD_FLOAT fiz = 0;

        int cond;
#if CLUSTER_M == CLUSTER_N
        cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
               (ci_cj0 != cj || cii_pos != cjj_pos);
#elif CLUSTER_M < CLUSTER_N
        cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
                            (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
#endif
        if(cond) {
            MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos];
            MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos];
            MD_FLOAT delz = ztmp - cj_x[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos];
            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
            if(rsq < cutforcesq) {
                MD_FLOAT sr2 = 1.0 / rsq;
                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;

                if(half_neigh) {
                    atomicAdd(&cj_f[SCL_CL_X_OFFSET(ci_pos) + cjj_pos], -delx * force);
                    atomicAdd(&cj_f[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos], -dely * force);
                    atomicAdd(&cj_f[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos], -delz * force);
                }

                fix += delx * force;
                fiy += dely * force;
                fiz += delz * force;

                atomicAdd(&ci_f[SCL_CL_X_OFFSET(ci_pos) + cii_pos], fix);
                atomicAdd(&ci_f[SCL_CL_Y_OFFSET(ci_pos) + cii_pos], fiy);
                atomicAdd(&ci_f[SCL_CL_Z_OFFSET(ci_pos) + cii_pos], fiz);
            }
        }
    }

}

extern "C"
double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    DEBUG_MESSAGE("computeForceLJSup_cuda start\r\n");

    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;

    memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
    if (isReneighboured) {

        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
            memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
            memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
        }

        for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
            memcpyToGPU(&cuda_nclusters[sci], &atom->siclusters[sci].nclusters, sizeof(int));
            //memcpyToGPU(&cuda_iclusters[sci * SCLUSTER_SIZE], &atom->siclusters[sci].iclusters, sizeof(int) * atom->siclusters[sci].nclusters);
        }

        memcpyToGPU(cuda_iclusters, atom->icluster_idx, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));

        isReneighboured = 0;
    }

    const int threads_num = 1;
    dim3 block_size = dim3(threads_num, SCLUSTER_M, CLUSTER_N);
    dim3 grid_size = dim3(atom->Nsclusters_local/threads_num+1, 1, 1);
    double S = getTimeStamp();
    LIKWID_MARKER_START("force");
    computeForceLJSup_cuda_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_f,
                                                           cuda_nclusters, cuda_iclusters,
                                                           atom->Nsclusters_local,
                                                           cuda_numneigh, cuda_neighbors,
                                                           neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
                                                           sigma6, epsilon);
    cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
    cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
    LIKWID_MARKER_STOP("force");
    double E = getTimeStamp();
    DEBUG_MESSAGE("computeForceLJSup_cuda stop\r\n");
    return E-S;
}
#endif //USE_SUPER_CLUSTERS
Building of super clusters complete, force computation kernel WIP 2022-12-18 14:28:29 +01:00
			`extern "C" {`

			`#include <stdio.h>`
			`//---`
			`#include <cuda.h>`
			`#include <driver_types.h>`
			`//---`
			`#include <likwid-marker.h>`
			`//---`
			`#include <atom.h>`
			`#include <device.h>`
			`#include <neighbor.h>`
			`#include <parameter.h>`
			`#include <stats.h>`
			`#include <timing.h>`
			`#include <util.h>`

			`}`

			`extern "C" {`
			`extern MD_FLOAT *cuda_cl_x;`
			`extern MD_FLOAT *cuda_cl_v;`
			`extern MD_FLOAT *cuda_cl_f;`
			`extern int *cuda_neighbors;`
			`extern int *cuda_numneigh;`
			`extern int *cuda_natoms;`
			`extern int *natoms;`
			`extern int *ngatoms;`
			`extern int *cuda_border_map;`
			`extern int *cuda_jclusters_natoms;`
			`extern MD_FLOAT cuda_bbminx, cuda_bbmaxx;`
			`extern MD_FLOAT cuda_bbminy, cuda_bbmaxy;`
			`extern MD_FLOAT cuda_bbminz, cuda_bbmaxz;`
			`extern int cuda_PBCx, cuda_PBCy, *cuda_PBCz;`
			`extern int isReneighboured;`

			`extern int *cuda_iclusters;`
			`extern int *cuda_nclusters;`

			`extern MD_FLOAT *cuda_scl_x;`
			`extern MD_FLOAT *cuda_scl_v;`
			`extern MD_FLOAT *cuda_scl_f;`
			`}`

			`#ifdef USE_SUPER_CLUSTERS`
			`extern "C"`
			`void alignDataToSuperclusters(Atom *atom) {`
			`for (int sci = 0; sci < atom->Nsclusters_local; sci++) {`
			`const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;`

			`for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {`

			`MD_FLOAT ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE sci + ci])];`
			`MD_FLOAT ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE sci + ci])];`
			`MD_FLOAT ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE sci + ci])];`

			`/*`
			`MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];`
			`MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];`
			`MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];`
			`*/`

			`memcpy(&atom->scl_x[scci], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`

			`memcpy(&atom->scl_v[scci], &ci_v[0], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`

			`memcpy(&atom->scl_f[scci], &ci_f[0], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`

			`}`
			`}`
			`}`

			`extern "C"`
			`void alignDataFromSuperclusters(Atom *atom) {`
			`for (int sci = 0; sci < atom->Nsclusters_local; sci++) {`
			`const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;`

			`for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {`


			`MD_FLOAT ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE sci + ci])];`
			`MD_FLOAT ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE sci + ci])];`
			`MD_FLOAT ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE sci + ci])];`

			`/*`
			`MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];`
			`MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];`
			`MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];`
			`*/`

			`memcpy(&ci_x[0], &atom->scl_x[scci], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&ci_x[0 + CLUSTER_M], &atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&ci_x[0 + 2 * CLUSTER_M], &atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`

			`memcpy(&ci_v[0], &atom->scl_v[scci], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&ci_v[0 + CLUSTER_M], &atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&ci_v[0 + 2 * CLUSTER_M], &atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`

			`memcpy(&ci_f[0], &atom->scl_f[scci], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&ci_f[0 + CLUSTER_M], &atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`
			`memcpy(&ci_f[0 + 2 * CLUSTER_M], &atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));`

			`}`
			`}`
			`}`

			`__global__ void cudaInitialIntegrateSup_warp(MD_FLOAT cuda_cl_x, MD_FLOAT cuda_cl_v, MD_FLOAT *cuda_cl_f,`
			`int *cuda_nclusters,`
			`int *cuda_natoms,`
			`int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {`

			`unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;`
			`//unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;`
			`if (sci_pos >= Nsclusters_local) return;`

			`//unsigned int ci_pos = cii_pos / CLUSTER_M;`
			`//unsigned int scii_pos = cii_pos % CLUSTER_M;`

			`//if (ci_pos >= cuda_nclusters[sci_pos]) return;`
			`//if (scii_pos >= cuda_natoms[ci_pos]) return;`

			`int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);`
			`MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];`
			`MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];`
			`MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];`

			`for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {`
			`ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];`
			`ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];`
			`ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];`
			`ci_x[SCL_X_OFFSET + scii_pos] += dt * ci_v[SCL_X_OFFSET + scii_pos];`
			`ci_x[SCL_Y_OFFSET + scii_pos] += dt * ci_v[SCL_Y_OFFSET + scii_pos];`
			`ci_x[SCL_Z_OFFSET + scii_pos] += dt * ci_v[SCL_Z_OFFSET + scii_pos];`
			`}`
			`}`

			`__global__ void cudaFinalIntegrateSup_warp(MD_FLOAT cuda_cl_v, MD_FLOAT cuda_cl_f,`
			`int cuda_nclusters, int cuda_natoms,`
			`int Nsclusters_local, MD_FLOAT dtforce) {`

			`unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;`
			`//unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;`
			`if (sci_pos >= Nsclusters_local) return;`

			`//unsigned int ci_pos = cii_pos / CLUSTER_M;`
			`//unsigned int scii_pos = cii_pos % CLUSTER_M;`

			`//if (ci_pos >= cuda_nclusters[sci_pos]) return;`
			`//if (scii_pos >= cuda_natoms[ci_pos]) return;`

			`int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);`
			`MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];`
			`MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];`

			`for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {`
			`ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];`
			`ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];`
			`ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];`
			`}`

			`}`

			`__global__ void computeForceLJSup_cuda_warp(MD_FLOAT cuda_cl_x, MD_FLOAT cuda_cl_f,`
			`int cuda_nclusters, int cuda_iclusters,`
			`int Nsclusters_local,`
			`int cuda_numneigh, int cuda_neighs, int half_neigh, int maxneighs,`
			`MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {`

			`unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;`
			`unsigned int scii_pos = blockDim.y * blockIdx.y + threadIdx.y;`
			`unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;`
			`if ((sci_pos >= Nsclusters_local) \|\| (scii_pos >= SCLUSTER_M) \|\| (cjj_pos >= CLUSTER_N)) return;`

			`unsigned int ci_pos = scii_pos / CLUSTER_M;`
			`unsigned int cii_pos = scii_pos % CLUSTER_M;`

			`if (ci_pos >= cuda_nclusters[sci_pos]) return;`

			`int ci_cj0 = CJ0_FROM_CI(ci_pos);`
			`int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);`
			`MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];`
			`MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];`


			`//int numneighs = cuda_numneigh[ci_pos];`
			`int numneighs = cuda_numneigh[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos]];`

			`for(int k = 0; k < numneighs; k++) {`
			`int cj = (&cuda_neighs[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] * maxneighs])[k];`
			`// TODO Make cj accessible from super cluster data alignment (not reachable right now)`
			`int cj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);`
			`MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];`
			`MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];`

			`MD_FLOAT xtmp = ci_x[SCL_CL_X_OFFSET(ci_pos) + cii_pos];`
			`MD_FLOAT ytmp = ci_x[SCL_CL_Y_OFFSET(ci_pos) + cii_pos];`
			`MD_FLOAT ztmp = ci_x[SCL_CL_Z_OFFSET(ci_pos) + cii_pos];`
			`MD_FLOAT fix = 0;`
			`MD_FLOAT fiy = 0;`
			`MD_FLOAT fiz = 0;`

			`int cond;`
			`#if CLUSTER_M == CLUSTER_N`
			`cond = half_neigh ? (ci_cj0 != cj \|\| cii_pos < cjj_pos) :`
			`(ci_cj0 != cj \|\| cii_pos != cjj_pos);`
			`#elif CLUSTER_M < CLUSTER_N`
			`cond = half_neigh ? (ci_cj0 != cj \|\| cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :`
			`(ci_cj0 != cj \|\| cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);`
			`#endif`
			`if(cond) {`
			`MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos];`
			`MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos];`
			`MD_FLOAT delz = ztmp - cj_x[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos];`
			`MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;`
			`if(rsq < cutforcesq) {`
			`MD_FLOAT sr2 = 1.0 / rsq;`
			`MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;`
			`MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;`

			`if(half_neigh) {`
			`atomicAdd(&cj_f[SCL_CL_X_OFFSET(ci_pos) + cjj_pos], -delx * force);`
			`atomicAdd(&cj_f[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos], -dely * force);`
			`atomicAdd(&cj_f[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos], -delz * force);`
			`}`

			`fix += delx * force;`
			`fiy += dely * force;`
			`fiz += delz * force;`

			`atomicAdd(&ci_f[SCL_CL_X_OFFSET(ci_pos) + cii_pos], fix);`
			`atomicAdd(&ci_f[SCL_CL_Y_OFFSET(ci_pos) + cii_pos], fiy);`
			`atomicAdd(&ci_f[SCL_CL_Z_OFFSET(ci_pos) + cii_pos], fiz);`
			`}`
			`}`
			`}`

			`}`

			`extern "C"`
			`double computeForceLJSup_cuda(Parameter param, Atom atom, Neighbor neighbor, Stats stats) {`
			`DEBUG_MESSAGE("computeForceLJSup_cuda start\r\n");`

			`MD_FLOAT cutforcesq = param->cutforce * param->cutforce;`
			`MD_FLOAT sigma6 = param->sigma6;`
			`MD_FLOAT epsilon = param->epsilon;`

			`memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));`
			`if (isReneighboured) {`

			`for(int ci = 0; ci < atom->Nclusters_local; ci++) {`
			`memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));`
			`memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));`
			`}`

			`for(int sci = 0; sci < atom->Nsclusters_local; sci++) {`
			`memcpyToGPU(&cuda_nclusters[sci], &atom->siclusters[sci].nclusters, sizeof(int));`
			`//memcpyToGPU(&cuda_iclusters[sci * SCLUSTER_SIZE], &atom->siclusters[sci].iclusters, sizeof(int) * atom->siclusters[sci].nclusters);`
			`}`

			`memcpyToGPU(cuda_iclusters, atom->icluster_idx, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));`

			`isReneighboured = 0;`
			`}`

			`const int threads_num = 1;`
			`dim3 block_size = dim3(threads_num, SCLUSTER_M, CLUSTER_N);`
			`dim3 grid_size = dim3(atom->Nsclusters_local/threads_num+1, 1, 1);`
			`double S = getTimeStamp();`
			`LIKWID_MARKER_START("force");`
			`computeForceLJSup_cuda_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_f,`
			`cuda_nclusters, cuda_iclusters,`
			`atom->Nsclusters_local,`
			`cuda_numneigh, cuda_neighbors,`
			`neighbor->half_neigh, neighbor->maxneighs, cutforcesq,`
			`sigma6, epsilon);`
			`cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());`
			`cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());`
			`LIKWID_MARKER_STOP("force");`
			`double E = getTimeStamp();`
			`DEBUG_MESSAGE("computeForceLJSup_cuda stop\r\n");`
			`return E-S;`
			`}`
			`#endif //USE_SUPER_CLUSTERS`