#include "definitions.h"
!----------------------------------------------------------------------
      subroutine force_us(&
                 force, eigen, occ, wtkpt ,nplwkp,cptwfp,&
#                include "apply_h_args.h"
                 ,timer,nconso )
!----------------------------------------------------------------------
!
! nonlocal (separable pseudopotential) contribution to the force
! for a non-local US pseudo-potential 
! see PRB 47, 10142 (1993)
! 
!      F = 
!      (1)   Int( dr Veff sum(nm) d(Qnm(r))/dR * becsum
!      (2)   sum(n,m) Dnm d(becsum)/dR 
!      (3)   sum(n,m) qnm eigen * becsum 
!          
!
! first calculate the forces associated with the projectors
! there is the normal kb term and a
! term sum_alfa e_a <a|d(S)/dR|a>
! note d(<a|n><m|a>)/d(R) = d(<n|a>^* <m|a>)/dR
!                         = <n|iG|a>^*<m|a> + <n|a>^*<m|iG|a>
! see PRB 47, 10142 (1993)
! cptwfp is wavefunction for k-point nkp
!
      use non_local_projectors
      use van_us_data_module
#ifdef PARAL
      use par_functions_module
#endif PARAL
      implicit  none
#     include  "apply_h_decl.h"
      real*8    force(3,nions,nspec)
      real*8    eigen(nbands,nkprun),occ(nbands,nkprun),wtkpt(nkprun)

      integer   nplwkp,nconso
      complex*WF_PRECISION cptwfp(nrplwv,nbands)
      real*8    timer(*)

!     locals  
#ifdef BLAS2
      real*8  a(nkbpmaxatom,nions),b(nkbpmaxatom,nions)
#endif
#ifdef PARAL
      real*8  vkb1(nrplwv,nkbpmaxatom)
#endif
      complex*16 work1(nrplwv),temp(nrplwv)
      real*8     a2(nkbpmaxatom,2,nions),temp2(nrplwv,2,nions) 
      real*8     wg,for(3,nions)
      integer    jkbp,ipol,it,jkb,jkb1,ibnd,iv,jv,i,nsp,mu
      integer    nhjkb, nhjkbm,na,j,ni,np 
      complex*16 dbecp(nkb,3),ps,pss,zdotc,h
      real*8     zero 
#include "etime.h"
      integer    idebug
#ifdef PARAL
      integer    nlocal(par_pw_np),offset(par_pw_np)
#endif
      parameter  (zero=0.0d0,idebug=0) 
      logical*4  linit
      integer    nkpinit,nplwkp_local,offs,nkpeff,kp_index
      data       linit/.true./
      save       linit,nkpinit

      call uttime(time)
      timer(TNLFORC)=timer(TNLFORC)-time(1)

      call geteff(nkp,nkpmem,nconso,nkpeff)

#ifdef SERIAL
      nplwkp_local = nplwkp
      offs = 0
#else
!     For parallel job set offset for dnlg
!     and define the number of planewaves on this node (nlocal)
!
      call par_defwfk (nrplwv,nrplwv_global,nplwkp,nlocal,offset,&
                         nplwkp_local,&
#include PARAL_ARGS
        , nconso)
      offs = offset(par_process+1)
#endif


      if (luse_rs_proj) return  !  not implemented yet for real space projectors
      if (linit) then
!        remember which nkp (first nkp for each process)
         nkpinit = nkp
         linit = .false.
      endif

   
!     zero temp array for forces  
      call setv(3*nions,zero,for(1,1),1)
!
      kp_index = (nkp + (nspin-1))/nspin     ! map back:  nkp the collapsed spin/kp index

      do ibnd = 1,nbands

         call uttime(time)
         timer(TUSFOR)=timer(TUSFOR)-time(1)

         wg = occ(ibnd,nkp) 
         do ipol=1, 3

      na = 1
      do np = 1,nspec
        do ni = 1,nionsp(np)
          do i=1,nplwkp_local
            h      = conjg(skb(i,na,nkpeff))*cptwfp(i,ibnd)
            h      = h*g_soft(i+offs,ipol,kspin(nkp),kp_index)*(0.0,1.0)
!           tempr(i,ni)= dreal(h)
!           tempc(i,ni)= dimag(h)
            temp2(i,1,ni)= dreal(h)
            temp2(i,2,ni)= dimag(h)
          enddo
          na = na + 1
        enddo
        na = na - nionsp(np)

!        multiply with nhnionsp(np) projectors for this type of atom
         call dgemm('T','N', nh(np), 2*nionsp(np),nplwkp_local,1.0d0,&
                 vkb(1,1,np,nkpeff), nrplwv,&
                 temp2,       nrplwv,&
                 0.0d0,&
                 a2,           nkbpmaxatom )
!        call dgemm('T','N', nh(np), nionsp(np),nplwkp_local,1.0d0,
!    &           vkb(1,1,np,nkpeff), nrplwv,
!    &           tempc,       nrplwv,
!    &           0.0d0,
!    &           b,           nkbpmaxatom )

        do ni = 1,nionsp(np)
          do iv = 1,nh(np)
             jkb=nkbc(na,iv)
             if (vkbreal(iv,np))  then
               dbecp(jkb,ipol) = dcmplx(a2(iv,1,ni),a2(iv,2,ni))
             else
               dbecp(jkb,ipol) = dcmplx(a2(iv,2,ni),-a2(iv,1,ni))
             endif
          enddo
          na = na + 1
        enddo

      enddo  ! np = 1,nspec

         enddo   ! ipol
         call uttime(time)
         timer(TUSFOR)=timer(TUSFOR)+time(1)

!        at this point sum the becp and dbecp on each processor 
#ifdef PARAL
         if (idebug.gt.0) then 
           write(nconso,*) 'FORCE US becp1:',ipol,dbecp(1,1)
         endif
         call par_sum_complex('A',dbecp,dbecp,3*nkb,3*nkb,1,&
#include PARAL_ARGS
           ,   timer)
         if (idebug.gt.0) then 
           write(nconso,*) 'FORCE US becp2:',ipol,dbecp(1,1)
         endif
#endif PARAL


         do ipol=1, 3
!*multiply on projectors
            do jkb = 1, nkb
               nhjkb = nkbtonh(jkb)
               na = nkbtona(jkb)
               it = ityp(na)
               nhjkbm = nh(it)
               jkb1 = jkb - nhjkb
! now the sums sum_m (d_nm-e_a qnm) {<m|a>, <m|iG|a>}
               ps = ZERO
	       pss= ZERO
               do j = 1,nhjkbm
                  ps = ps + becp(jkb1+j,ibnd,nkp)*&
                       (deeq(nhjkb,j,na,kspin(nkp))-&
                              eigen(ibnd,nkp)*qq(nhjkb,j,it))
                  pss = pss + dbecp(jkb1+j,ipol)*&
                       (deeq(nhjkb,j,na,kspin(nkp))-&
                              eigen(ibnd,nkp)*qq(nhjkb,j,it))
               enddo
               
! multiply with {<n|ig|a>^*,<n|a>^*}
                for(ipol,na) = for(ipol,na) -&
                 wg*wtkpt(nkp)*&
                dble(conjg(dbecp(jkb,ipol))*ps+&
                conjg(becp(jkb,ibnd,nkp))*pss)
            enddo
         enddo
      enddo ! ibnd = 1,nbands

      na = 1
      do nsp = 1,nspec 
       do mu = 1,nionsp(nsp) 
         do ipol = 1,3
          force(ipol,mu,nsp) = force(ipol,mu,nsp) + for(ipol,na)
         enddo
         if (idebug.gt.0) then 
           write(nconso,100) 'FORCE US1:',nsp,mu,(force(ipol,mu,nsp),&
                           ipol=1,3)
100        format(1x,a20,i2,i2,1x,3(f12.8,1x))
         endif
         na = na + 1
       enddo
      enddo

! The D matrix depends on the ionic position, the term
! connected  with dV/dr has already been included (forloc)
! first we calculate ddeeq, the calculation is very similar to 
! that performed in newd
      if (nkp.eq.nkpinit) then       
        call newdd(idebug,&
#                include "apply_h_args.h"
                 ,timer,nconso )
      endif
! with the newdd perform the eigensum, similar to addusdens
! where we have already calculated the becsum=sum_{a in occ} <a|b_n><b_m|a>
! for this k-point
      do ipol=1, 3
         na = 1
         do it = 1,nspec
           do mu = 1,nionsp(it)
             do iv = 1, nh(it)
               do jv = 1, nh(it)
                 force(ipol,mu,it) = force(ipol,mu,it) -&
                   dble(becsum(iv,jv,na,nkp)*&
                       ddeeq(iv,jv,na,ipol,kspin(nkp))) 
               enddo
             enddo
             na = na + 1
           enddo
         enddo
      enddo
!
      if (idebug.gt.0) then 
      do nsp = 1,nspec 
       do mu = 1,nionsp(nsp) 
         write(nconso,100) 'FORCE US2:',nsp,mu,(force(ipol,mu,nsp),&
                          ipol=1,3)
       enddo
      enddo
      endif

      call uttime(time)
      timer(TNLFORC)=timer(TNLFORC)+time(1)


      return
      end


!----------------------------------------------------------------------
      subroutine newdd(idebug,&
#include         "apply_h_args.h"
                ,timer,nconso)
!-----------------------------------------------------------------------
! 
!    This routine calculates 
! 
!       ddeeq =    Int( dr Veff dQnm(r)/dR )
! 
!    that enters in the non-local force because the deeq matrix 
!    depends on the ionic positions. 
!    The routine is identical to newd, except that 
!    exp(-i R G) is (-iG) exp(-i R G) 
!
!          Dnm(ispin) =  sum_G(Veff(G,ispin) Qnm(G) (-iG) exp(-iG.R))
!
!    ispin : 1,nspin 
!
!-----------------------------------------------------------------------
      use van_us_data_module

#ifdef PARAL
      use par_functions_module
#endif PARAL
      implicit none

#     include  "apply_h_decl.h"
      integer   nconso,idebug
      real*8    timer(*)

!     local arrays (automatic) 
      complex*16  auxfft(nplwv)
      integer     recl 
      parameter   (recl = 1024) 
      complex*16  qgm(recl,(nhm*(nhm+1))/2)
      real*8 ylmk0(recl ,lqx*lqx)
#include "etime.h"
      complex*16  aux(ngdens_max),qg(recl),aux1(recl)

!     locals
      integer na,nsp,iv,jv,ik,i,inr,mu,m
      integer imin,imax,ispin,ipol,j,ndeeq,nnh
      real*8  arg,xyz(3),DDOT ,ps,rinplw
      real*8     g1(ngdens,3),gg1(ngdens)
      real*8     zero 
      parameter  (zero=0.0d0) 

      logical*4 lrec
      external  DDOT 
!
      call uttime(time)
      timer(TNEWDD)=timer(TNEWDD)-time(1)

!     zero ddeeq 
      call setv(2*nhm*nhm*nions*3,zero,ddeeq(1,1,1,1,1),1)

#ifdef QRAD_SWAP
      if (nspec.gt.1) call openqrad(ngl) 
#endif QRAD_SWAP

      do i = 1,ngdens
        do j = 1,3
          g1(i,j) = g(i+goffs,j)
        enddo
        gg1(i) = gg(i+goffs)
      enddo

!     loop over spin
      do ispin = 1,nspin

!-----------------------------------------------------------------------
!     Get Fourier transform of effective potential cveff* (in aux)
      rinplw=1.0d0/nplwv
      do i=1, nplwv
         auxfft(i) = cveff(i,ispin)*rinplw
      enddo
      call fft3d(auxfft,ngx,ngy,ngz,-1)
      do m = 1,ngdens_max
        aux(m) = conjg(auxfft(ipwpadG(m,0)))
      enddo
!-----------------------------------------------------------------------
!     calculate Qnm*Veff for each type, D = sum_G exp(-iR.G) Q_nm veff(G)*
      do nsp =1,nspec

#ifdef QRAD_SWAP
          if (nspec.gt.1)&
             call readqrad(ngldim,ngl,nsp,qrad,maxindex)
#endif QRAD_SWAP

          imax = 0
          lrec=.true. 
10        continue
          imin=imax
          imax = imin + recl         
          if (imax.ge.ngdens) then
             imax=ngdens
             lrec=.false.
          endif     

!         calculate ylmk0 for imin,imax
          call ylmr2(lqx**2,imin,imax,ngdens,g1,gg1,recl,&
                    (lqx)**2,ylmk0)

          inr = 1
          nnh = nh(nsp) 
          do iv = 1, nnh
            do jv = iv, nnh
               call qvan2(imin,imax,iv,jv,nsp,recl,qgm(1,inr),&
                   ylmk0,qrad,ngdens_max,&
                   nspec,ngldim,igtongl,nhm,indv,nhtol,nhtom, &
                   ap,lpx,lpl,ivjv2index,nbrx,nlx,mx,nbrx2dim,lqx)
               do i=imin+1,imax
                 qgm(i-imin,inr) = qgm(i-imin,inr) * aux(i+goffs)
               enddo
               inr = inr + 1
            enddo
          enddo
                 
!         setup the structure constant (qg) for each atom
          do na = 1, nions
            if (ityp(na).eq.nsp) then 
              call xyzposna(nions,nspec,nionsp,posion,na,dirc,xyz) 
              do ik=imin+1,imax
               arg = (g(ik+goffs,1)*xyz(1) + g(ik+goffs,2)*xyz(2) &
                      + g(ik+goffs,3)*xyz(3))
                 qg(ik-imin) = dcmplx(-sin(arg),cos(arg))    
              enddo
              do ipol = 1,3                                    
                do i = 1,imax-imin 
                  aux1(i) = g1(i+imin,ipol)*qg(i)
                enddo
                inr = 1
                do iv = 1, nh(nsp)
                 do jv = iv, nh(nsp)
!                  dirty (but fast) way to calculate: 
!                   sum_G real(exp(-iR.G) Q_nm v^* )
                    ps = DDOT(2*(imax-imin),aux1(1),1,qgm(1,inr),1)
                    ddeeq(iv,jv,na,ipol,ispin) = &
                        ddeeq(iv,jv,na,ipol,ispin)+ps
                    ddeeq(jv,iv,na,ipol,ispin) = &
                        ddeeq(iv,jv,na,ipol,ispin)
                    inr = inr + 1
                 enddo
                enddo
              enddo
            endif   ! ityp.eq.nsp
          enddo
          if (lrec) goto 10
      enddo

      enddo  ! ispin = 1,nspin

#ifdef PARAL
!     sum of the Dmatrix on each node
      ndeeq = nhm*nhm*nions*2*3
      call par_sum_double('A',ddeeq,ddeeq,ndeeq,ndeeq,1,&
#include PARAL_ARGS
           ,   timer)
#endif PARAL

      if (idebug.gt.0) then 
!      write out
       do ispin = 1,nspin
       na = 1
       do nsp =1,nspec 
        do mu = 1, nionsp(nsp)
         write(nconso,*) 'DDmatrix atm:',na, ' spin:',ispin,nspin
         DO IV=1,NH(nsp)
          WRITE(nconso,101) mu,iv,&
               (DDEEQ(IV,JV,na,1,ispin),JV=1,NH(nsp))
          WRITE(nconso,101) mu,iv,&
               (DDEEQ(IV,JV,na,2,ispin),JV=1,NH(nsp))
          WRITE(nconso,101)  mu,iv,&
               (DDEEQ(IV,JV,na,3,ispin),JV=1,NH(nsp))
         END DO
         na = na + 1
        enddo
       enddo
       enddo
101    format(1x,'DDEEQ ',i1,1x,i1,20(f8.4,1x))
      endif


#ifdef QRAD_SWAP
      if (nspec.gt.1) close(44) ! qrad swap-file
#endif QRAD_SWAP

      call uttime(time)
      timer(TNEWDD)=timer(TNEWDD)+time(1)
      return 
      end

