/***********************************************************************
                              EGBr6()
************************************************************************/

/*
 * Calculate the generalized Born energy and first derivatives.
 *
 * Calling parameters are as follows:
 *
 * lpears - the number of pairs on the lower triangle pair list
 * upears - the number of pairs on the upper triangle pair list
 * pearlist - the pair list, contiguous for the upper and lower triangles
 * lpearsnp - number of pairs on the non-polar lower triangle pair list
 * upearsnp - number of pairs on the non-polar upper triangle pair list
 * pearlistnp - non-polar pair list, contiguous for upper & lower triangles
 * x - input: the atomic (x,y,z) coordinates
 * f - updated: the gradient vector
 * fs - input: overlap parameters
 * rborn - input: atomic radii
 * q - input: atomic charges
 * kappa - input: inverse of the Debye-Huckel length
 * diel_ext - input: solvent dielectric constant
 * enb - updated: Lennard-Jones energy
 * eelt - updated: gas-phase electrostatic energy
 * esurf - updated: nonpolar surface area solvation free energy
 * enp - updated: nonpolar van der Waals solvation free energy
 * freevectors - if !=0 free the static vectors and return
 */

#define GOFFSET 0.0

static REAL_T egbr6(INT_T * lpears, INT_T * upears, INT_T ** pearlist,
                    INT_T * lpearsnp, INT_T * upearsnp, INT_T ** pearlistnp,
                    REAL_T * x, REAL_T * f, REAL_T * fs, REAL_T * rborn,
                    REAL_T * q, REAL_T * kappa, REAL_T * diel_ext,
                    REAL_T * enb, REAL_T * eelt, REAL_T * esurf,
                    REAL_T * enp, INT_T freevectors)
{

#if defined(MPI) || defined(SCALAPACK)
   int ierror;
   static REAL_T *reductarr = NULL;
#endif

   static REAL_T *reff = NULL, *sumdeijda = NULL, *psi = NULL;
   int i, i34, j, j34, k, threadnum, numthreads, maxthreads, foff, soff;
   int npairs, ic, iaci;
   int *iexw;
   size_t n;
   REAL_T epol, dielfac, qi, qiqj, fgbi, fgbk, rb2, expmkf;
   REAL_T elec, evdw, sumda, daix, daiy, daiz;
   REAL_T xi, yi, zi, xij, yij, zij;
   REAL_T dedx, dedy, dedz, de;
   REAL_T dij1i, dij3i, temp1;
   REAL_T qi2h, qid2h, datmp;
   REAL_T ri1i, dij2i;
   REAL_T theta;
   REAL_T dij, sumi;
   REAL_T eel, f6, f12, rinv, r2inv, r6inv;
   REAL_T r2, ri, rj, sj, sj2;
   REAL_T uij, efac, temp4, temp5, temp6, reffi2;

   /*
    * Determine the size of the sumdeijda array.  If OPENMP is defined,
    * a copy of this array must be allocated for each thread; otherwise
    * only one copy is allocated.
    */

#ifdef OPENMP
   maxthreads = omp_get_max_threads();
#else
   maxthreads = 1;
#endif

   n = (size_t) prm->Natom;

   *enp = 0.0;

   /*
    * If freevectors != 0, deallocate the static arrays that have been
    * previously allocated and return.
    */

   if (freevectors != 0) {
      if (reff != NULL)
         free_vector(reff, 0, n);
      reff = NULL;
      if (sumdeijda != NULL)
         free_vector(sumdeijda, 0, maxthreads * n);
      sumdeijda = NULL;
      if (psi != NULL)
         free_vector(psi, 0, n);
      psi = NULL;
#if defined(MPI) || defined(SCALAPACK)
      if (reductarr != NULL)
         free_vector(reductarr, 0, n);
      reductarr = NULL;
#endif
      return (0.0);
   }

   /* Allocate some static arrays if they have not been allocated already. */

   if (reff == NULL)
      reff = vector(0, n);
   if (sumdeijda == NULL)
      sumdeijda = vector(0, maxthreads * n);
   if ((psi == NULL) && (gb == 2 || gb == 5))
      psi = vector(0, n);
#if defined(MPI) || defined(SCALAPACK)
   if (reductarr == NULL)
      reductarr = vector(0, n);
#endif

   if (gb_debug)
      fprintf(nabout, "Effective Born radii:\n");

   /*
    * For MPI or ScaLAPACK, initialize all elements of the reff array.
    * Although each task will calculate only a subset of the elements,
    * a reduction is used to combine the results from all tasks.
    * If a gather were used instead of a reduction, no initialization
    * would be necessary.
    */

#if defined(MPI) || defined(SCALAPACK)
   for (i = 0; i < prm->Natom; i++) {
      reff[i] = 0.0;
   }
#endif
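
   /*
    * Note (added for clarity): the loop below accumulates, for each atom i,
    * the pairwise descreening sum "sumi".  With the default (non-HCT) code
    * path this sum contributes to the inverse-cube (R6) radius of
    * Tjong & Zhou, and the effective radius is recovered at the bottom of
    * the loop as
    *
    *    reff[i] = ( 1/ri^3 + sumi )^(-1/3)
    *
    * whereas the HCT comparison path uses the familiar pairwise form
    *
    *    reff[i] = ( 1/ri + sumi )^(-1)
    *
    * Both relations simply restate the expressions coded at the end of
    * the loop; they are collected here for reference.
    */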
#pragma omp parallel \
   private (i, xi, yi, zi, ri, ri1i, sumi, j, k, xij, yij, zij, \
            r2, dij1i, dij, sj, sj2, uij, dij2i, theta, temp1, \
            threadnum, numthreads)
   {
      /*
       * Get the thread number and the number of threads for multi-threaded
       * execution under OpenMP.  For all other cases, including ScaLAPACK,
       * MPI and single-threaded execution, use the values that have been
       * stored in mytaskid and numtasks, respectively.
       */

#if defined(OPENMP)
      threadnum = omp_get_thread_num();
      numthreads = omp_get_num_threads();
#else
      threadnum = mytaskid;
      numthreads = numtasks;
#endif

      /*
       * Loop over all atoms i.
       *
       * Explicitly assign threads to loop indices for the following loop,
       * in a manner equivalent to (static, N) scheduling with OpenMP, and
       * identical to the manner in which threads are assigned in nblist.
       *
       * The reff array is written in the following loops.  The OpenMP
       * threads or MPI tasks that execute these loops must be synchronized
       * once the loops complete, so that there is no race condition in
       * which the reff array is read before it has been written.  Even if
       * all subsequent loops used a loop-index-to-thread (or task) mapping
       * identical to that of the following loop, elements of the reff
       * array are indexed by other loop indices, so synchronization is
       * necessary.
       *
       * OpenMP synchronization is accomplished by the implied barrier
       * at the end of this parallel region.  MPI synchronization is
       * accomplished by MPI_Allreduce.
       */

      for (i = 0; i < prm->Natom; i++) {

#if defined(OPENMP) || defined(MPI) || defined(SCALAPACK)
         if (!myroc(i, blocksize, numthreads, threadnum))
            continue;
#endif

         xi = x[3 * i];
         yi = x[3 * i + 1];
         zi = x[3 * i + 2];
         ri = rborn[i] - GOFFSET;
         ri1i = 1. / ri;
         sumi = 0.0;

         /* Select atom j from the pair list.  Non-graceful error handling. */

         for (k = 0; k < lpears[i] + upears[i]; k++) {

            if (pearlist[i] == NULL) {
               fprintf(nabout,
                       "NULL pair list entry in egb loop 1, taskid = %d\n",
                       mytaskid);
               fflush(nabout);
            }
            j = pearlist[i][k];

            xij = xi - x[3 * j];
            yij = yi - x[3 * j + 1];
            zij = zi - x[3 * j + 2];
            r2 = xij * xij + yij * yij + zij * zij;

            dij1i = 1.0 / sqrt(r2);
            dij = r2 * dij1i;
            sj = fs[j] * (rborn[j] - GOFFSET);
            sj2 = sj * sj;

            /*
             * Contributions to the inverse cube effective radii;
             * see Eqs. 10-12 of Tjong & Zhou, JPCB 111:3055, 2007.
             */

            if (dij > ri + sj) {

               /* Case I */

#ifdef HCT
               /* for comparison, "HCT" holds formulas for gb=1 */
               sumi -= 0.5 * (sj / (r2 - sj2) +
                              0.5 * dij1i * log((dij - sj) / (dij + sj)));
#else
               temp1 = sj / (r2 - sj2);
               sumi -= temp1 * temp1 * temp1;
#endif

            } else if (dij > fabs(ri - sj)) {

               /* Case II */

#ifdef HCT
               theta = 0.5 * ri1i * dij1i * (r2 + ri * ri - sj2);
               uij = 1. / (dij + sj);
               sumi -= 0.25 * (ri1i * (2. - theta) - uij +
                               dij1i * log(ri * uij));
#else
               temp1 = 1. / (dij + sj);
               sumi -= 0.0625 * dij1i *
                   (8. * dij * (1. / (ri * ri * ri) - temp1 * temp1 * temp1) -
                    3. * (r2 - sj2) * (pow(ri, -4.) - pow(temp1, 4.)) -
                    6. * (1. / (ri * ri) - temp1 * temp1));
#endif

            } else if (ri < sj) {

               /* Case III */

#ifdef HCT
               sumi -= 0.5 * (sj / (r2 - sj2) + 2. * ri1i +
                              0.5 * dij1i * log((sj - dij) / (sj + dij)));
#else
               temp1 = sj / (r2 - sj2);
               sumi -= dij1i / r2 + temp1 * temp1 * temp1;
#endif
            }
         }

#ifdef HCT
         /* "standard" (HCT) effective radii: */
         reff[i] = 1.0 / (ri1i + sumi);
         if (reff[i] < 0.0)
            reff[i] = 30.0;
#else
         temp1 = ri1i * ri1i * ri1i + sumi;
         if (temp1 <= 0.0) {
            reff[i] = 30.0;
         } else {
            reff[i] = pow(temp1, (-1. / 3.));
         }
#endif

         if (gb_debug)
            fprintf(nabout, "%d\t%15.7f\t%15.7f\n", i + 1, rborn[i], reff[i]);
      }
   }

   /* The MPI synchronization is accomplished via reduction of the reff array. */

#if defined(MPI) || defined(SCALAPACK)
   ierror = MPI_Allreduce(reff, reductarr, prm->Natom,
                          MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   if (ierror != MPI_SUCCESS) {
      fprintf(nabout,
              "Error in egb reff reduction, error = %d mytaskid = %d\n",
              ierror, mytaskid);
   }
   for (i = 0; i < prm->Natom; i++) {
      reff[i] = reductarr[i];
   }
#endif

   *esurf = 0.0;   /* no LCPO in this version */

   /* Compute the GB, Coulomb and Lennard-Jones energies and derivatives. */

   epol = elec = evdw = 0.0;
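
   /*
    * Note (added for clarity): the pair loop below evaluates the standard
    * generalized Born pair energy using the effective radii computed above.
    * With the default code path the Still-style interaction distance is
    *
    *    fgb = sqrt( r2 + Ri*Rj*exp(-r2 / (4*Ri*Rj)) )
    *
    * (or sqrt(r2 + Ri*Rj) when GRYCUK_FGB is defined), and each pair adds
    *
    *    epol += -qi*qj * (1 - exp(-KSCALE*kappa*fgb)/diel_ext) / fgb
    *
    * These expressions restate the fgbi, fgbk and epol statements that
    * follow; they are gathered here for reference.
    */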
#pragma omp parallel reduction (+: epol, elec, evdw) \
   private (i, i34, ri, qi, expmkf, dielfac, qi2h, qid2h, iaci, \
            xi, yi, zi, k, j, j34, xij, yij, zij, r2, qiqj, \
            rj, rb2, efac, fgbi, fgbk, temp4, temp6, eel, de, temp5, \
            rinv, r2inv, ic, r6inv, f6, f12, dedx, dedy, dedz, \
            iexw, threadnum, numthreads, foff, soff, \
            sumda, ri1i, dij1i, datmp, daix, daiy, daiz, \
            dij2i, dij, sj, sj2, temp1, dij3i, npairs)
   {
      /*
       * Get the thread number and the number of threads for multi-threaded
       * execution under OpenMP.  For all other cases, including ScaLAPACK,
       * MPI and single-threaded execution, use the values that have been
       * stored in mytaskid and numtasks, respectively.
       */

#if defined(OPENMP)
      threadnum = omp_get_thread_num();
      numthreads = omp_get_num_threads();
#else
      threadnum = mytaskid;
      numthreads = numtasks;
#endif

      /*
       * Compute offsets into the gradient and sumdeijda arrays for this
       * thread, but only if OPENMP is defined.
       */

#ifdef OPENMP
      soff = prm->Natom * threadnum;
      foff = 3 * soff;
#else
      soff = 0;
      foff = 0;
#endif

      /*
       * Initialize the sumdeijda array inside of the parallel region.
       *
       * For OpenMP, the "first touch" memory allocation strategy will
       * locate the copy of the sumdeijda array that is initialized by
       * a particular CPU in memory that is local to that CPU.
       *
       * It is not necessary to synchronize OpenMP threads following
       * this loop because each thread initializes the particular copy
       * of the sumdeijda array that it subsequently updates.  A similar
       * argument applies for MPI tasks.
       */

      for (i = 0; i < prm->Natom; i++) {
         sumdeijda[soff + i] = 0.0;
      }

      /*
       * Allocate and initialize the iexw array used for skipping excluded
       * atoms.  Note that because of the manner in which iexw is used, it
       * is only necessary to initialize it before the first iteration of
       * the following loop.
       */

      iexw = ivector(-1, prm->Natom);
      for (i = -1; i < prm->Natom; i++) {
         iexw[i] = -1;
      }
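
      /*
       * Illustration (added; not part of the computation): the iexw
       * marking scheme works as follows.  While processing atom i, each
       * excluded partner j is tagged by setting iexw[j] = i, so the test
       * applied later in the pair loop is simply
       *
       *    if (iexw[j] != i) { ...compute Coulomb and Lennard-Jones... }
       *
       * For example, if atom 7 excludes atom 12, then iexw[12] == 7 while
       * i == 7, and the (7,12) pair skips the gas-phase terms.  When i
       * advances to 8, the stale entry iexw[12] == 7 no longer matches,
       * so the array never needs to be reset between iterations.
       */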
      /*
       * Loop over all atoms i.
       *
       * Explicitly assign threads to loop indices for the following loop,
       * in a manner equivalent to (static, N) scheduling with OpenMP, and
       * identical to the manner in which threads are assigned in nblist.
       *
       * Synchronization of OpenMP threads will occur following this loop
       * because the parallel region ends after this loop.  Following
       * synchronization, a reduction of the sumdeijda array will be
       * performed.
       *
       * Synchronization of MPI tasks will occur via the MPI_Allreduce
       * function.
       */

      for (i = 0; i < prm->Natom; i++) {

#if defined(OPENMP) || defined(MPI) || defined(SCALAPACK)
         if (!myroc(i, blocksize, numthreads, threadnum))
            continue;
#endif

         ri = reff[i];
         qi = q[i];

         /*
          * If atom i is not frozen, compute the "diagonal" energy that
          * is a function of only the effective radius Ri but not of the
          * interatomic distance Dij.  Compute also the contribution of
          * the diagonal energy term to the sum by which the derivative
          * of Ri will be multiplied.
          */

         if (!frozen[i]) {
            expmkf = exp(-KSCALE * (*kappa) * ri) / (*diel_ext);
            dielfac = 1.0 - expmkf;
            qi2h = 0.5 * qi * qi;
            qid2h = qi2h * dielfac;

            epol += -qid2h / ri;

            sumdeijda[soff + i] +=
                qid2h - KSCALE * (*kappa) * qi2h * expmkf * ri;
         }

         /*
          * Skip the pair calculations if there are no atoms j on the
          * pair list of atom i.
          */

         npairs = upears[i];
         if (!npairs)
            continue;

         i34 = 3 * i;
         xi = x[i34];
         yi = x[i34 + 1];
         zi = x[i34 + 2];

         iaci = prm->Ntypes * (prm->Iac[i] - 1);

         /*
          * Expand the excluded atom list into the iexw array by storing i
          * at array address j.
          */

         for (j = 0; j < prm->Iblo[i]; j++) {
            iexw[IexclAt[i][j] - 1] = i;
         }

         /* Initialize the derivative accumulators. */

         daix = daiy = daiz = 0.0;

         /* Select atoms j from the pair list.  Non-graceful error handling. */

         for (k = lpears[i]; k < lpears[i] + npairs; k++) {

            if (pearlist[i] == NULL) {
               fprintf(nabout,
                       "NULL pair list entry in egb loop 3, taskid = %d\n",
                       mytaskid);
               fflush(nabout);
            }
            j = pearlist[i][k];
            j34 = 3 * j;

            /* Continue computing the non-diagonal energy term. */

            xij = xi - x[j34];
            yij = yi - x[j34 + 1];
            zij = zi - x[j34 + 2];
            r2 = xij * xij + yij * yij + zij * zij;

            /*
             * Because index j is retrieved from the pairlist array it is
             * not constrained to a particular range of values; therefore,
             * the threads that have loaded the reff array must be
             * synchronized prior to the use of reff below.
             */

            qiqj = qi * q[j];
            rj = reff[j];
            rb2 = ri * rj;

#ifdef GRYCUK_FGB
            fgbi = 1.0 / sqrt(r2 + rb2);
#else
            efac = exp(-r2 / (4.0 * rb2));
            fgbi = 1.0 / sqrt(r2 + rb2 * efac);
#endif
            fgbk = -(*kappa) * KSCALE / fgbi;

            expmkf = exp(fgbk) / (*diel_ext);
            dielfac = 1.0 - expmkf;

            epol += -qiqj * dielfac * fgbi;

            temp4 = fgbi * fgbi * fgbi;
            temp6 = qiqj * temp4 * (dielfac + fgbk * expmkf);

            /* de holds (1/r) dE/dr, where r is the bond distance */

#ifdef GRYCUK_FGB
            de = temp6;
#else
            de = temp6 * (1.0 - 0.25 * efac);
#endif

            /*
             * temp5 holds -(1/ri) dE/dai, where ai = 1/ri;
             * this is also equal to -(1/rj) dE/daj
             */

#ifdef GRYCUK_FGB
            temp5 = 0.5 * temp6 * rb2;
#else
            temp5 = 0.5 * efac * temp6 * (rb2 + 0.25 * r2);
#endif

            /*
             * sumdeijda[] will hold minus the derivative of the GB
             * energy with respect to the inverse effective radii.
             */

            sumdeijda[soff + i] += ri * temp5;
            sumdeijda[soff + j] += rj * temp5;

            /*
             * Compute the Van der Waals and Coulombic energies for only
             * those pairs that are not on the excluded atom list.  Any
             * pair on the excluded atom list will have atom i stored at
             * address j of the iexw array.  It is not necessary to reset
             * the elements of the iexw array to -1 between successive
             * iterations in i because an i,j pair is uniquely identified
             * by atom i stored at array address j.  Thus, for example,
             * the i+1,j pair would be stored at the same address as the
             * i,j pair, but only after the i,j pair has been used.
             */

            if (iexw[j] != i) {

               rinv = 1. / sqrt(r2);
               r2inv = rinv * rinv;

               /* gas-phase Coulomb energy: */

               eel = qiqj * rinv;
               elec += eel;
               de -= eel * r2inv;

               /* Lennard-Jones energy: */

               ic = prm->Cno[iaci + prm->Iac[j] - 1] - 1;
               if (ic >= 0) {
                  r6inv = r2inv * r2inv * r2inv;
                  f6 = prm->Cn2[ic] * r6inv;
                  f12 = prm->Cn1[ic] * r6inv * r6inv;
                  evdw += f12 - f6;
                  de -= (12. * f12 - 6. * f6) * r2inv;
               }
            }
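
            /*
             * Note (added for clarity): de accumulates (1/r) dE/dr for all
             * distance-dependent terms of this pair, so the Cartesian
             * derivative components follow from the chain rule
             *
             *    dE/dxi = (dE/dr) * (xi - xj) / r = de * xij
             *
             * which is why de is simply multiplied by xij, yij and zij
             * below, added to the accumulators for atom i, and subtracted
             * from the entries for atom j.
             */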
            /*
             * Sum to the gradient vector the derivatives of Dij that are
             * computed relative to the Cartesian coords of atoms i and j.
             */

            dedx = de * xij;
            dedy = de * yij;
            dedz = de * zij;

            daix += dedx;
            daiy += dedy;
            daiz += dedz;

            f[foff + j34] -= dedx;
            f[foff + j34 + 1] -= dedy;
            f[foff + j34 + 2] -= dedz;
         }

         /* Update the i elements of the gradient. */

         f[foff + i34] += daix;
         f[foff + i34 + 1] += daiy;
         f[foff + i34 + 2] += daiz;
      }

      /* Free the iexw array within this potentially parallel region of code. */

      free_ivector(iexw, -1, prm->Natom);
   }

#if defined(MPI) || defined(SCALAPACK)

   /* Perform a reduction of sumdeijda. */

   ierror = MPI_Allreduce(sumdeijda, reductarr, prm->Natom,
                          MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
   if (ierror != MPI_SUCCESS) {
      fprintf(nabout,
              "Error in egb sumdeijda reduction, error = %d mytaskid = %d\n",
              ierror, mytaskid);
   }
   for (i = 0; i < prm->Natom; i++) {
      sumdeijda[i] = reductarr[i];
   }
#endif

#ifdef OPENMP

   /*
    * Perform a reduction of sumdeijda.
    * In the following, the (j,i) loop nesting is more efficient
    * than (i,j) loop nesting.
    */

#pragma omp parallel for private (i) schedule(static, 8)
   for (j = 0; j < prm->Natom; j++) {
      for (i = 1; i < maxthreads; i++) {
         sumdeijda[j] += sumdeijda[prm->Natom * i + j];
      }
   }
#endif

#pragma omp parallel \
   private (i, i34, ri, qi, expmkf, dielfac, qi2h, qid2h, iaci, \
            xi, yi, zi, k, j, j34, xij, yij, zij, r2, qiqj, \
            rj, rb2, efac, fgbi, fgbk, temp4, temp6, eel, de, temp5, \
            rinv, r2inv, ic, r6inv, f6, f12, dedx, dedy, dedz, \
            iexw, threadnum, numthreads, foff, \
            sumda, ri1i, dij1i, datmp, daix, daiy, daiz, \
            dij2i, dij, sj, sj2, temp1, dij3i, reffi2, npairs)
   {
      /*
       * Get the thread number and the number of threads for multi-threaded
       * execution under OpenMP.  For all other cases, including ScaLAPACK,
       * MPI and single-threaded execution, use the values that have been
       * stored in mytaskid and numtasks, respectively.
       */

#if defined(OPENMP)
      threadnum = omp_get_thread_num();
      numthreads = omp_get_num_threads();
#else
      threadnum = mytaskid;
      numthreads = numtasks;
#endif

      /*
       * Compute an offset into the gradient array for this thread,
       * but only if OPENMP is defined.  There is no need to
       * compute an offset into the sumdeijda array because all
       * copies of this array have been reduced into copy zero.
       */

#ifdef OPENMP
      foff = prm->Natom * 3 * threadnum;
#else
      foff = 0;
#endif

      /*
       * Compute the derivatives of the effective radius Ri of atom i
       * with respect to the Cartesian coordinates of each atom j.  Sum
       * all of these derivatives into the gradient vector.
       *
       * Loop over all atoms i.
       *
       * Synchronization of OpenMP threads will occur following this loop
       * because the parallel region ends after this loop.  A reduction
       * of the gradient array will occur in the mme34 function, either
       * for OpenMP or MPI.  This reduction will synchronize the MPI
       * tasks, so an explicit barrier is not necessary at the end of
       * this loop.
       *
       * Explicitly assign threads to loop indices for the following loop,
       * in a manner equivalent to (static, N) scheduling with OpenMP, and
       * identical to the manner in which threads are assigned in nblist.
       */
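
      /*
       * Note (added for clarity): myroc() is the ownership test used for
       * the explicit loop-index-to-thread (or task) mapping described
       * above.  Presumably it implements a block-cyclic assignment,
       * conceptually similar to
       *
       *    owner(i) = (i / blocksize) % numthreads
       *
       * so that a thread or task processes index i only when owner(i)
       * equals its own threadnum, mirroring OpenMP schedule(static,
       * blocksize).  This is a sketch of the intent, not a restatement of
       * the myroc() implementation.
       */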
      for (i = 0; i < prm->Natom; i++) {

#if defined(OPENMP) || defined(MPI) || defined(SCALAPACK)
         if (!myroc(i, blocksize, numthreads, threadnum))
            continue;
#endif

         /*
          * Don't calculate derivatives of the effective radius of atom i
          * if atom i is frozen or if there are no pair atoms j associated
          * with atom i.
          */

         npairs = lpears[i] + upears[i];
         if (frozen[i] || !npairs)
            continue;

         i34 = 3 * i;
         xi = x[i34];
         yi = x[i34 + 1];
         zi = x[i34 + 2];

         ri = rborn[i] - GOFFSET;
         ri1i = 1. / ri;
         reffi2 = reff[i] * reff[i];
         sumda = sumdeijda[i];

         /* Initialize the derivative accumulators. */

         daix = daiy = daiz = 0.0;

         /* Select atom j from the pair list.  Non-graceful error handling. */

         for (k = 0; k < npairs; k++) {

            if (pearlist[i] == NULL) {
               fprintf(nabout,
                       "NULL pair list entry in egb loop 4, taskid = %d\n",
                       mytaskid);
               fflush(nabout);
            }
            j = pearlist[i][k];
            j34 = 3 * j;

            xij = xi - x[j34];
            yij = yi - x[j34 + 1];
            zij = zi - x[j34 + 2];
            r2 = xij * xij + yij * yij + zij * zij;

            dij1i = 1.0 / sqrt(r2);
            dij2i = dij1i * dij1i;
            dij = r2 * dij1i;

            sj = fs[j] * (rborn[j] - GOFFSET);
            sj2 = sj * sj;

            /*
             * datmp will hold -(1/d) D[ ai, d ], where ai is the inverse
             * of the effective radius, and d is the distance between
             * atoms i and j.
             */

            if (dij > ri + sj) {

               /* Case I */

#ifdef HCT
               /* for comparison, "HCT" holds formulas for gb=1 */
               temp1 = 1. / (r2 - sj2);
               datmp = temp1 * sj * (-0.5 * dij2i + temp1) +
                   0.25 * dij1i * dij2i * log((dij - sj) / (dij + sj));
#else
               datmp = reffi2 * 2. * sj * sj2 * pow((r2 - sj2), -4.);
#endif

            } else if (dij > fabs(ri - sj)) {

               /* Case II */

#ifdef HCT
               temp1 = 1. / (dij + sj);
               dij3i = dij1i * dij1i * dij1i;
               datmp = -0.25 * (-0.5 * (r2 - ri * ri + sj2) *
                                dij3i * ri1i * ri1i +
                                dij1i * temp1 * (temp1 - dij1i) -
                                dij3i * log(ri * temp1));
#else
               temp1 = 1. / (dij + sj);
               temp1 = temp1 * temp1 * temp1 * temp1;
               dij3i = dij1i * dij1i * dij1i;
               datmp = 0.0625 * reffi2 * dij3i *
                   ((r2 + sj2 - 2. * ri * ri) * ri1i * ri1i * ri1i * ri1i +
                    (r2 + 4. * dij * sj + sj2) * temp1);
#endif

            } else if (ri < sj) {

               /* Case III */

#ifdef HCT
               temp1 = 1. / (r2 - sj2);
               datmp = -0.5 * (sj * dij2i * temp1 -
                               2. * sj * temp1 * temp1 -
                               0.5 * dij2i * dij1i *
                               log((sj - dij) / (sj + dij)));
#else
               datmp = reffi2 * 2. * sj * sj2 * pow((r2 - sj2), -4.);
#endif

            } else {
               datmp = 0.;
            }

            /* Sum the derivatives into daix, daiy and daiz. */

            daix += xij * datmp;
            daiy += yij * datmp;
            daiz += zij * datmp;

            datmp *= sumda;
            f[foff + j34] += xij * datmp;
            f[foff + j34 + 1] += yij * datmp;
            f[foff + j34 + 2] += zij * datmp;
         }

         f[foff + i34] -= sumda * daix;
         f[foff + i34 + 1] -= sumda * daiy;
         f[foff + i34 + 2] -= sumda * daiz;
      }
   }

   /* Return elec and evdw through the reference parameters eelt and enb. */

   *eelt = elec;
   *enb = evdw;

   /* Free the static arrays if static_arrays is 0. */

   if (!static_arrays) {
      if (reff != NULL)
         free_vector(reff, 0, n);
      reff = NULL;
      if (sumdeijda != NULL)
         free_vector(sumdeijda, 0, maxthreads * n);
      sumdeijda = NULL;
      if (psi != NULL)
         free_vector(psi, 0, n);
      psi = NULL;
#if defined(MPI) || defined(SCALAPACK)
      if (reductarr != NULL)
         free_vector(reductarr, 0, n);
      reductarr = NULL;
#endif
   }

   return (epol);
}
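
/*
 * Usage sketch (added; illustrative only, the surrounding driver variables
 * are hypothetical): a typical caller first invokes egbr6() with
 * freevectors == 0 to obtain the polarization energy and to update f, enb,
 * eelt, esurf and enp, and later invokes it once more with freevectors != 0
 * so that the static work arrays are released, e.g.
 *
 *    epol = egbr6(lpears, upears, pearlist, lpearsnp, upearsnp, pearlistnp,
 *                 x, f, fs, rborn, q, &kappa, &diel_ext,
 *                 &enb, &eelt, &esurf, &enp, 0);
 *    ...
 *    (void) egbr6(lpears, upears, pearlist, lpearsnp, upearsnp, pearlistnp,
 *                 x, f, fs, rborn, q, &kappa, &diel_ext,
 *                 &enb, &eelt, &esurf, &enp, 1);
 */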