#include "copyright.h"
#ifdef MPI
#include "../include/dprec.fh"

!     The AMBER/MPI implementation and support routines were
!     originally and independently implemented and contributed
!     by James Vincent (JV) 7/94.  Modified by tec3, dac and JV.


!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!+ broadcast data from master to all other nodes, at beginning
subroutine startup(xx,ix,ih)
   !************************************************************
   !     Send data needed by all nodes once at startup from master
   !     after master has read in all data
   !************************************************************

   use trace
   use amoeba_mdin
   use amoeba_bonds, only : AM_BONDS_bcast
   use amoeba_ureyb, only : AM_UREYB_bcast
   use amoeba_reg_angles, only : AM_REG_ANGLES_bcast
   use amoeba_trig_angles, only : AM_TRIG_ANGLES_bcast
   use amoeba_opbend_angles, only : AM_OPBEND_ANGLES_bcast
   use amoeba_torsions, only : AM_TORSIONS_bcast
   use amoeba_stretch_torsions, only : AM_STRETCH_TORSIONS_bcast
   use amoeba_pitorsions, only : AM_PITORSIONS_bcast
   use amoeba_stretch_bend, only : AM_STRETCH_BEND_bcast
   use amoeba_torsion_torsion, only : AM_TOR_TOR_bcast
   use amoeba_multipoles, only : AM_MPOLE_bcast
   use amoeba_adjust, only : AM_ADJUST_bcast
   use amoeba_vdw, only : AM_VDW_bcast
   use amoeba_induced, only : AM_INDUCED_bcast,polarizability
   use amoeba_self, only : AM_SELF_bcast
   use amoeba_recip, only : AM_RECIP_bcast,AM_RECIP_allocate
   use amoeba_runmd, only : AM_RUNMD_init
   use amoeba_direct, only : AM_DIRECT_bcast
   use parms, only : bcast_parms
   use charmm_mod, only : mpi_bcast_charmm_params
   use ff11_mod, only : mpi_bcast_cmap_params
   use nblist, only : ucell,bc_ewucr,bc_ewuci,nbflag, &
                     BC_DIRPARS,numnptrs
   use file_io_dat
   use pimd_vars, only: ipimd
   use neb_vars, only: ineb
   use cmd_vars,  only: adiab_param, eq_cmd, restart_cmd
   use nose_hoover_module, only: nchain      ! APJ
   use lscivr_vars, only: ilscivr
   use fft,only:column_fft_flag
! SOFT CORE
   use softcore, only : ifsc, scalpha, scbeta, scmask, dynlmb, &
                       sceeorder, tishake, emil_sc
! end SOFT CORE
   use mbar, only : ifmbar, bar_intervall, bar_l_min, bar_l_max, bar_l_incr
   use linear_response, only : ilrt, lrt_interval, lrtmask
! SGLD
   use sgld, only : isgld, isgsta,isgend,fixcom, &
                    sgft,sgff,sgfd,tempsg,tsgavg,tsgavp,treflf
! IPS parameters
   use nbips, only : ips,mipsx,mipsy,mipsz,mipso,raips,gridips,dvbips
! AMD parameters
   use amd_mod, only : iamd,iamdlag,EthreshD,alphaD,EthreshP,alphaP, &
        w_amd,EthreshD_w,alphaD_w,EthreshP_w,alphaP_w
! scaledMD parameters
   use scaledMD_mod, only : scaledMD,scaledMD_lambda
! EMAP parameters
   use emap, only : temap,gammamap,nemap,nrigid
! constant pH
   use constantph, only : cnstph_bcast
! crg_reloc
   use crg_reloc, only : ifcr

#ifdef EMIL
   use emil_mod, only: emil_do_calc
#endif


   implicit none
#  include "parallel.h"
#  include "ew_parallel.h"
#ifdef MPI_DOUBLE_PRECISION
#undef MPI_DOUBLE_PRECISION
#endif
   include 'mpif.h'
   integer ierr
#ifdef CRAY_PVP
#define MPI_DOUBLE_PRECISION MPI_REAL8
#endif
#  include "extra.h"
#  include "../include/md.h"
#  include "../include/memory.h"
#  include "nmr.h"
#  include "box.h"
#ifdef LES
#  include "les.h"
#endif
#  include "extra_pts.h"
#  include "ew_pme_recip.h"
#  include "ew_mpole.h"
#  include "ew_erfc_spline.h"
#  include "ew_cntrl.h"
#  include "flocntrl.h"
#  include "debug.h"
#  include "new_time.h"
#  include "tgtmd.h"
   
   _REAL_ xx(*)
   integer ix(*), ier
   character(len=4) ih(*)
   integer i_column_fft
   
   call trace_enter( 'startup' )
   
   !     Send and receive common blocks from the master node:
   
   !  file_io_dat
   
   call mpi_bcast(ntpr, 1, MPI_INTEGER, 0, commsander, ierr)
   call mpi_bcast(ntwr, 1, MPI_INTEGER, 0, commsander, ierr)
   call mpi_bcast(ntwx, 1, MPI_INTEGER, 0, commsander, ierr)
   call mpi_bcast(ntwv, 1, MPI_INTEGER, 0, commsander, ierr)
   call mpi_bcast(ntwf, 1, MPI_INTEGER, 0, commsander, ierr)
   call mpi_bcast(ntwe, 1, MPI_INTEGER, 0, commsander, ierr)
   call mpi_bcast(ntpp, 1, MPI_INTEGER, 0, commsander, ierr)
   call mpi_bcast(ioutfm, 1, MPI_INTEGER, 0, commsander, ierr)
   call mpi_bcast(ntwprt, 1, MPI_INTEGER, 0, commsander, ierr)
   call mpi_bcast(ntave, 1, MPI_INTEGER, 0, commsander, ierr)
   call mpi_bcast(iredir, 9, MPI_INTEGER, 0, commsander, ierr)
   
   !  nmr.h:
   
   call mpi_bcast(nmropt,7,MPI_INTEGER,0,commsander,ierr)   ! /nmr1/
   call mpi_bcast(intreq,6,MPI_INTEGER,0,commsander,ierr)   ! /nmrstf/
   call mpi_bcast(wnoesy,6,MPI_DOUBLE_PRECISION,0,commsander,ierr) ! /wremar/
   call mpi_bcast(scalm,6,MPI_DOUBLE_PRECISION,0,commsander,ierr) ! /nmr1/
   call mpi_bcast(dobsu,BC_ALIGNR,MPI_DOUBLE_PRECISION,0,commsander,ierr)
                                                                  ! /align/
   call mpi_bcast(ndip, BC_ALIGNI,MPI_INTEGER,0,commsander,ierr)  ! /align/
   call mpi_bcast(nath,BC_METHYLI,MPI_INTEGER,0,commsander,ierr)  ! /methyli/
   call mpi_bcast(tau,BC_METHYLR,MPI_DOUBLE_PRECISION,0,commsander,ierr)
                                                                  ! /methylr/
   
   !  md.h:
   
   call mpi_bcast(nrp,BC_MDI,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(t,BC_MDR,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   
   !  box.h:
   
   call mpi_bcast(ntb,BC_BOXI,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(box,BC_BOXR,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   
   !  parms.f:
   call bcast_parms()
   
   call mpi_bcast(nchain, 1,MPI_INTEGER, 0, commsander, ierr ) 

#ifdef LES
   !   les.h:
   call mpi_bcast(lesfac,BC_LESR,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(nlesty,BC_LESI,MPI_INTEGER,0,commsander,ierr)
#endif
   call mpi_bcast(ipimd, 1, mpi_integer, 0,commsander,ierr)
   call mpi_bcast(ineb, 1, MPI_INTEGER, 0, commsander, ierr )
   call mpi_bcast(adiab_param, 1, MPI_DOUBLE_PRECISION, 0, commsander, ierr)
   call mpi_bcast(eq_cmd,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(restart_cmd,1,MPI_INTEGER,0,commsander,ierr)

   call mpi_bcast(ilscivr, 1, mpi_integer, 0, commsander, ierr)


   ! carlos: targeted MD
   
   call mpi_bcast(itgtmd,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(tgtrmsd,2,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   ! end targeted md
   
#ifdef EMIL
   ! josh: EMIL
   call mpi_bcast(emil_do_calc,1,MPI_INTEGER,0,commsander,ier)
#endif

   !  ew_pme_recip.h:
   
   call mpi_bcast(sizfftab,BC_PME_PARS_INT,MPI_INTEGER,0,commsander,ier)
   call mpi_bcast(dsum_tol,BC_PME_PARS_REAL,MPI_DOUBLE_PRECISION, &
         0,commsander,ier)

   i_column_fft = 0
   if(master .and. column_fft_flag) i_column_fft = 1
   call mpi_bcast(i_column_fft,1,MPI_INTEGER, &
         0,commsander,ier)
   column_fft_flag = (i_column_fft == 1)
   
   ! ew_mpole.h
   
   call mpi_bcast(ifirst,BC_MULTPOLE,MPI_INTEGER,0,commsander,ier)
   call mpi_bcast(diptol,BC_INDDIPR,MPI_DOUBLE_PRECISION,0,commsander,ier)
   call mpi_bcast(maxiter,BC_INDDIPI,MPI_INTEGER,0,commsander,ier)
   
   !  erfc_spline.h
   
   call mpi_bcast(leed_cub,6,MPI_INTEGER,0,commsander,ier)
   call mpi_bcast(eedtbdns,2,MPI_DOUBLE_PRECISION,0,commsander,ier)
   
   
   !---- from nonbond_list.f module nblist -----------------------
   !     common/dirpars/
   call mpi_bcast(numnptrs,BC_DIRPARS,MPI_INTEGER,0,commsander,ier)
   ! was in  ew_unitcell.h now in module nblist
   call mpi_bcast(ucell,BC_EWUCR,MPI_DOUBLE_PRECISION,0,commsander,ier)
   call mpi_bcast(nbflag,BC_EWUCI,MPI_INTEGER,0,commsander,ier)

   
   !  ewcntrl.h
   
   call mpi_bcast(verbose,BC_EWCNTRL,MPI_INTEGER,0,commsander,ier)
   inocutoff=0
   inogrdptrs=0
   if(master)then
      if(nocutoff)inocutoff=1
      if(nogrdptrs)inogrdptrs=1
   end if
   call mpi_bcast(inogrdptrs,BC_EWCNTRL_NP,MPI_INTEGER, &
         0,commsander,ier)
   nogrdptrs=inogrdptrs == 1
   nocutoff=inocutoff == 1
   
   ! flocntrl.h
   
   call mpi_bcast(do_dir,BC_FLOCNTRL,MPI_INTEGER,0,commsander,ier)
   
   ! debug.h
   
   call mpi_bcast(do_debugf,BC_DEBUG,MPI_INTEGER,0,commsander,ier)
   call mpi_bcast(lscg,BC_DEB_HEAP,MPI_INTEGER,0,commsander,ier)
   
   ! timer info new_time.h
   
   call mpi_bcast(tpar_p,BC_TIME_PAR,MPI_INTEGER,0,commsander,ier)
   
   ! extra points
   
   call mpi_bcast(ifrtyp,BC_EXTRA_PT,MPI_INTEGER,0,commsander,ier)
   
   !  ew_parallel.h
   
   call mpi_bcast(indz,BC_SLABS,MPI_INTEGER,0,commsander,ier)

   
   ! sgld

   call mpi_bcast(isgld,1,MPI_INTEGER,0,commsander,ier)
   call mpi_bcast(isgsta,1,MPI_INTEGER,0,commsander,ier)
   call mpi_bcast(isgend,1,MPI_INTEGER,0,commsander,ier)
   call mpi_bcast(fixcom,1,MPI_INTEGER,0,commsander,ier)
   call mpi_bcast(tsgavg,1,MPI_DOUBLE_PRECISION,0,commsander,ier)
   call mpi_bcast(tsgavp,1,MPI_DOUBLE_PRECISION,0,commsander,ier)
   call mpi_bcast(sgft,1,MPI_DOUBLE_PRECISION,0,commsander,ier)
   call mpi_bcast(sgff,1,MPI_DOUBLE_PRECISION,0,commsander,ier)
   call mpi_bcast(sgfd,1,MPI_DOUBLE_PRECISION,0,commsander,ier)
   call mpi_bcast(tempsg,1,MPI_DOUBLE_PRECISION,0,commsander,ier)
   call mpi_bcast(treflf,1,MPI_DOUBLE_PRECISION,0,commsander,ier)

   !     IX,XX,IH
   
   call mpi_bcast(ix(1),lasti,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(ih(1),4*lasth,MPI_CHARACTER,0,commsander,ierr)
   call mpi_bcast(xx(1),lastr,MPI_DOUBLE_PRECISION,0,commsander,ierr)

! SOFT CORE
   ! Get all nodes informed about the SC parameters
   call mpi_bcast(ifsc,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(scalpha,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(scbeta,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(sceeorder,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(scmask,256,MPI_CHARACTER,0,commsander,ierr)
   call mpi_bcast(dynlmb,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(tishake,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(emil_sc,1,MPI_INTEGER,0,commsander,ierr)   
! end SOFT CORE
   call mpi_bcast(ifmbar,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(bar_intervall,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(bar_l_min,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(bar_l_max,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(bar_l_incr,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)

   ! Parameters for LIE module
   call mpi_bcast(ilrt,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(lrtmask,256,MPI_CHARACTER,0,commsander,ierr)
   call mpi_bcast(lrt_interval,1,MPI_INTEGER,0,commsander,ierr)

! IPS
   call mpi_bcast(ips,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(mipsx,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(mipsy,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(mipsz,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(mipso,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(raips,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(dvbips,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(gridips,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
! end IPS

! AMD
   call mpi_bcast(iamd,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(w_amd,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(iamdlag,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(EthreshP,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(alphaP,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(EthreshD,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(alphaD,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(EthreshP_w,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(alphaP_w,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(EthreshD_w,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
   call mpi_bcast(alphaD_w,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
! end AMD

!scaledMD 
   call mpi_bcast(scaledMD,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(scaledMD_lambda,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
! end scaledMD 

! EMAP
   call mpi_bcast(temap,1,MPI_LOGICAL,0,commsander,ierr)
   call mpi_bcast(NEMAP,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(NRIGID,1,MPI_INTEGER,0,commsander,ierr)
   call mpi_bcast(GAMMAMAP,1,MPI_DOUBLE_PRECISION,0,commsander,ierr)
! end EMAP

! crg_reloc -- everyone has to know ifcr
   call mpi_bcast(ifcr,1,MPI_INTEGER,0,commsander,ierr)

! constant pH
   if (icnstph .gt. 0) call cnstph_bcast(ierr)

   ! Charmm force field support - will simply return if charmm is not in use.
   call mpi_bcast_charmm_params(master)
   ! FF11 Cmap support - will simply return if ff11cmap is not in use.
   call mpi_bcast_cmap_params(master)

   call mpi_barrier(commsander,ierr)
   
   !   ---- divide atoms up among the processors, always splitting on
   !        residue boundaries:
   
   call setpar(nspm, ix(i70), ntp, ix(i02), xx(lmass))

   if ( no_ntt3_sync == 1 ) then
     !Here we are not synching the random number generator across threads for
     !NTT=3 but we don't want every thread to have the same random seed. So for
     !the moment just add mytask id to ig.
     ig = ig+mytaskid
   end if

   call trace_exit( 'startup' )
   return
end subroutine startup 
!----------------------------------------------------------------------

!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!+ [Enter a one-line description of subroutine fdist here]
subroutine fdist(f,forcetmp,pot,vir,newbalance)
   !************************************************************
   !   for the "original version", (when mpi_orig is set):
   
   !     James Vincent 7/94
   !     Gather all copies of the force array with ene and vir
   !     arrays tacked onto the end, then extract ene and vir
   !     arrays back out.
   !     Input array f is current local PE copy, forcetmp is
   !     scratch space, but results are put back into f, thus
   !     overwriting what was there.
   !     f: final force array - result of reduce operation
   !     forcetmp: scratch space
   !     ene: final ene array with sum of all ene values
   !     vir: final vir array
   !     MJW TODO Correct the above to match my changes
  
   !   when mpi_orig is false, does a distributed sum of the forces,
   !     so that each node ends up with only a piece of the total
   !     force array; does a global sum on the energy and virial
   
   !************************************************************

   use trace
   use neb_vars, only : ineb
   use qmmm_module, only : qmmm_nml
   use state
   implicit none
#  include "../include/memory.h"
#  include "parallel.h"
#ifdef MPI_DOUBLE_PRECISION
#undef MPI_DOUBLE_PRECISION
#endif
   include 'mpif.h'
   integer ierr
#ifdef CRAY_PVP
#define MPI_DOUBLE_PRECISION MPI_REAL8
#endif
#  include "../include/md.h"
#  include "extra.h"
#  include "nmr.h"
   
   !     Parameters:
   
   _REAL_, intent(inout)           :: f(*),vir(*)
   _REAL_                          :: forcetmp(*)
   type(potential_energy_rec), intent(inout)  :: pot
   type(potential_energy_rec)                 :: pot_tmp
   integer newbalance
   
   !     Local:
   
   integer i,j

   if (numtasks == 1) return
   call trace_enter( 'fdist' )
   
   !     Tack ene, and vir, onto end of f:
   
   j = 3*natom+iscale+1
   f(j)   = vir(1)
   f(j+1)   = vir(2)
   f(j+2)   = vir(3)
  
   !f(j+32) = newbalance !what is this for?  !TODO mjw fix this
   
   ! carlos removed ineb here, no need for master to know all forces with new
   ! NEB code


   if( mpi_orig.or. ievb>0 .or. icfe>0 ) then


      !  ---Reduce the force array and energies back to the master node;
      !      (hence, the master will know all coordinates and all forces):
      
      call trace_mpi('mpi_reduce', &
            (3*natom+iscale+3),'MPI_DOUBLE_PRECISION',mytaskid)
#ifdef USE_MPI_IN_PLACE
      if (master) then
        call mpi_reduce(MPI_IN_PLACE,f,(3*natom+iscale+3),MPI_DOUBLE_PRECISION,mpi_sum,0,commsander,ierr)
        call mpi_reduce(MPI_IN_PLACE,pot,potential_energy_rec_len,MPI_DOUBLE_PRECISION,mpi_sum,0,commsander,ierr)
      else
        call mpi_reduce(f,0,(3*natom+iscale+3),MPI_DOUBLE_PRECISION,mpi_sum,0,commsander,ierr)
        call mpi_reduce(pot,0,potential_energy_rec_len,MPI_DOUBLE_PRECISION,mpi_sum,0,commsander,ierr)
      end if
      vir(1) = f(j)
      vir(2) = f(j+1)
      vir(3) = f(j+2)
#else
      call mpi_reduce(f,forcetmp,(3*natom+iscale+3), &
            MPI_DOUBLE_PRECISION,mpi_sum,0,commsander,ierr)

      vir(1) = forcetmp(j)
      vir(2) = forcetmp(j+1)
      vir(3) = forcetmp(j+2)

      ! 2nd call to reduce pot to master
      call mpi_reduce(pot,pot_tmp,potential_energy_rec_len, &
            MPI_DOUBLE_PRECISION,mpi_sum,0,commsander,ierr)

      pot = pot_tmp

      ! Empty out global forcetmp() to local thread
      do i=1,3*natom
         f(i) = forcetmp(i)
      end do
#endif
   else
      
      ! ---Add all copies of virial and energy and put result back on ALL nodes:
      
      call trace_mpi('mpi_allreduce',3,'MPI_DOUBLE_PRECISION',mytaskid)
#ifdef USE_MPI_IN_PLACE
      call mpi_allreduce(MPI_IN_PLACE,f(j),3,MPI_DOUBLE_PRECISION,mpi_sum,commsander,ierr)
      vir(1) = f(j)
      vir(2) = f(j+1)
      vir(3) = f(j+2)
      newbalance=0
      !if(f(j+32) > 0.d0)newbalance=1 !MJW TO FIX
#else
      call mpi_allreduce(f(j),forcetmp(j),3, &
            MPI_DOUBLE_PRECISION,mpi_sum,commsander,ierr)


      vir(1) = forcetmp(j)
      vir(2) = forcetmp(j+1)
      vir(3) = forcetmp(j+2)

      ! Reduce all values of potential to pot_tmp
      ! and then assign to pot
      call mpi_allreduce(pot,pot_tmp,potential_energy_rec_len, &
            MPI_DOUBLE_PRECISION,mpi_sum,commsander,ierr)

      pot  = pot_tmp
      newbalance=0
      !if(forcetmp(j+32) > 0.d0)newbalance=1  !TODO mjw
#endif
      
      if (init /= 3 .AND.ineb==0 .AND. qmmm_nml%vsolv < 2) then
         !  ---Do a distributed sum of the force array:
         call fsum(f,forcetmp)
      else

         !  Due to lack of parallelization in the initial parts
         !    of runmd in the init=3 case, the more efficient
         !    reduce_scatter needs to be replaced with an mpi_allreduce call;
         !    this is also required for NEB simulations so that all the forces 
         !    are available for dot products and removing forces along path
         !    tangent.
         !    in the future we might consider a respa-type NEB to overcome this
         !    extra work
         !   CARLOS: ONLY MASTER NEEDS FORCES FOR NEB
         !   AWG: adaptive QM/MM needs the complete force arrays for force mixing

         call trace_mpi('mpi_allreduce', &
               3*natom,'MPI_DOUBLE_PRECISION',mytaskid)
#ifdef USE_MPI_IN_PLACE
         call mpi_allreduce(MPI_IN_PLACE, f, 3*natom, &
               MPI_DOUBLE_PRECISION,mpi_sum,commsander,ierr)
#else
         call mpi_allreduce(f, forcetmp, 3*natom, &
               MPI_DOUBLE_PRECISION,mpi_sum,commsander,ierr)
         do i=1, 3*natom
            f(i) = forcetmp(i)
         end do
#endif

      end if
      
   end if  ! mpi
   
   call trace_exit( 'fdist' )
   return
end subroutine fdist 

!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!+ [Enter a one-line description of subroutine fsum here]
subroutine fsum(f,tmp)
   
   !     equivalent to an MPI_REDUCE_SCATTER on f:  all processors contribute
   !       to f, and the appropriate part of the result winds up on each
   !       processor

   use trace
   implicit none
   _REAL_ f(*),tmp(*)
   
#  include "parallel.h"
#  include "extra.h"
#  include "../include/memory.h"
#ifdef MPI_DOUBLE_PRECISION
#  undef MPI_DOUBLE_PRECISION
#endif
   include 'mpif.h'
   integer ierr
#ifdef CRAY_PVP
#  define MPI_DOUBLE_PRECISION MPI_REAL8
#endif
  
   !Used for Binary Tree 
   integer other,ncyclesm1,k,bit,cs,cr,ns,nr,istart,iend
   integer ist(mpi_status_size)

   if (numtasks <= 1) return

   call trace_enter( 'fsum' )
   
   ! If we have a power of two cpus do a binary tree:

   if (logtwo(numtasks)>=1) then  !We have a power of two cpus
     ncyclesm1 = logtwo(numtasks) - 1
     bit = ishft(numtasks,-1)
   
     do k = 0,ncyclesm1
      
        other=ieor(mytaskid,bit)
      
        !        send chunk:
      
        cs = ishft(other,-((ncyclesm1)-k))*bit
        ns = iparpt3(cs+bit)-iparpt3(cs)
      
        !        recv chunk:
      
        cr = ishft(mytaskid,-((ncyclesm1)-k))*bit
        istart = iparpt3(cr)
        iend = iparpt3(cr+bit)
        nr = iend-istart
      
      
        call trace_mpi('mpi_sendrecv', &
              (ns+nr),'MPI_DOUBLE_PRECISION', other)
        call mpi_sendrecv( &
              f(iparpt3(cs)+1),ns,MPI_DOUBLE_PRECISION,other,5, &
              tmp(iparpt3(cr)+1),nr,MPI_DOUBLE_PRECISION,other,5, &
              commsander, ist, ierr )
        f(istart+1:iend) = f(istart+1:iend) + tmp(istart+1:iend)
      
        bit = ishft(bit,-1)
      
     end do  !  k = 0,ncyclesm1

   else

     ! We don't have a power of two - do things the old fashioned way.
     call trace_mpi('mpi_reduce_scatter', &
           rcvcnt3(mytaskid),'MPI_DOUBLE_PRECISION', mytaskid)
#ifdef RED_SCAT_INPLACE
     call mpi_reduce_scatter(f, f(iparpt3(mytaskid)+1), &
           rcvcnt3, MPI_DOUBLE_PRECISION, mpi_sum, &
           commsander, ierr)
#else
     ! Reduce scaller used to work with most MPI installations even if the send
     ! and receive buffers aliased each other but now it seems that newer 
     ! mpi installations are checking this and quiting with an error. The standard
     ! does not allow this in MPI v1.0. For now we will make the non-inplace version
     ! the default.
     call mpi_reduce_scatter(f, tmp(iparpt3(mytaskid)+1), &
           rcvcnt3, MPI_DOUBLE_PRECISION, mpi_sum, &
           commsander, ierr)
     f(iparpt3(mytaskid)+1:iparpt3(mytaskid+1)) = tmp(iparpt3(mytaskid)+1:iparpt3(mytaskid+1))
#endif
   end if !Power of two cpus.
   call trace_exit( 'fsum' )
   return
end subroutine fsum 

!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!+ Distribute the coordinates to all processors.
subroutine xdist(x, tmp, natom)

   use trace
   implicit none
  
   integer, intent(in) :: natom 
   _REAL_ x(3*natom), tmp(3*natom)
   
#  include "parallel.h"
#    ifdef MPI_DOUBLE_PRECISION
#      undef MPI_DOUBLE_PRECISION
#    endif
   include 'mpif.h'
   integer ierr
#    ifdef CRAY_PVP
#      define MPI_DOUBLE_PRECISION MPI_REAL8
#    endif

   !Used for Binary Tree   
   integer other,ncyclesm1,k,bit,cs,cr,ns,nr
   integer ist(mpi_status_size),ireq
   
   if (numtasks <= 1) return
   call trace_enter( 'xdist' )

   ! If we have a power of two cpus do a binary tree:

   if (logtwo(numtasks)>=1) then
     ncyclesm1 = logtwo(numtasks) - 1
     bit=1
     do k = 0,ncyclesm1
      other=ieor(mytaskid,bit)
      cs = ishft(mytaskid,-k)*bit
      cr = ishft(other,-k)*bit
      ns = iparpt3(cs+bit)-iparpt3(cs)
      nr = iparpt3(cr+bit)-iparpt3(cr)
      call trace_mpi('mpi_sendrecv', &
            (ns+nr),'MPI_DOUBLE_PRECISION', other)
      call mpi_sendrecv( &
            x(iparpt3(cs)+1),ns,MPI_DOUBLE_PRECISION,other,5, &
            x(iparpt3(cr)+1),nr,MPI_DOUBLE_PRECISION,other,5, &
            commsander, ist, ierr )
      bit = ishft(bit,1)
     end do

   else

   ! We don't have a power of two - do things the old fashioned way.

     !       --- Assume an "in-place" allgatherv works: this seems(?) to
     !           be true everywhere....

     ! RCW 2011/11/01
     ! The inplace allgatherv is not legal and modern version of mpi - e.g.
     ! mpich2-1.4 check for it and quit with an error.
   
     call trace_mpi('mpi_allgatherv', &
           rcvcnt3(mytaskid),'MPI_DOUBLE_PRECISION', mytaskid)
     !call mpi_allgatherv( &
     !      x(iparpt3(mytaskid)+1),rcvcnt3(mytaskid), &
     !      MPI_DOUBLE_PRECISION,x,rcvcnt3,iparpt3, &
     !      MPI_DOUBLE_PRECISION,commsander, ierr)
     call mpi_allgatherv( &
           x(iparpt3(mytaskid)+1),rcvcnt3(mytaskid), &
           MPI_DOUBLE_PRECISION,tmp,rcvcnt3,iparpt3, &
           MPI_DOUBLE_PRECISION,commsander, ierr)
     x(1:3*natom) = tmp(1:3*natom)
   endif
   call trace_exit( 'xdist' )
   return
end subroutine xdist 

!+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
!+ [Enter a one-line description of subroutine fgblsum here]
subroutine fgblsum(x,tmp, natom)
   implicit none
   integer, intent(in) :: natom
   _REAL_ x(3*natom),tmp(3*natom)
   
   call fsum(x,tmp)
   call xdist(x, tmp, natom)
   
   return
end subroutine fgblsum 

#else
subroutine dummy_parallel()
end subroutine dummy_parallel
#endif  /* MPI  */