#include "definitions.h"
    module  stress_module

    use netcdfinterface
    use run_context
#ifdef PARAL
    use par_functions_module
#endif PARAL
!=====================================================================
!   This module calculates the stree on the unit cell
!=====================================================================

!=====================================================================
!   module data
!=====================================================================

!   index to upper part of x1,x2 matrix 
    integer,parameter :: x1_index(6) = (/3,1,2,2,3,3/)
    integer,parameter :: x2_index(6) = (/3,1,1,2,1,2/)

    logical, save     :: cal_xc_gga_stress = .false.

!   keep a copy of the point group operations
!   calculated in symana nspama
!   should be removed then symana is a module
    real*8,save,allocatable  :: point_group_op(:,:,:) ! 3,3,nspama

!   stress components 
    type stress_type
        real*8 :: kinetic(3,3)
        real*8 :: hartree(3,3)
        real*8 :: xc(3,3)
        real*8 :: xc_cc(3,3)
        real*8 :: xc_gga(3,3)
        real*8 :: localpsp(3,3)
        real*8 :: alphaZ(3,3)
        real*8 :: nonlocal(3,3)
        real*8 :: us_Qnm(3,3)
        real*8 :: ion_ion(3,3)
        real*8 :: total(3,3)
    end type stress_type 
    type(stress_type) :: stress_all 
!   

!##################################################################### 
    contains
!#####################################################################

    subroutine kinetic_stress(cptwfp, nplwkp,occupation,wtkpt,ndcbyt,ldonkp,&
#include "apply_h_args.h"
                             ,timer)
                              
!=====================================================================
!   Kinetic energy part of stress
!       rho_x1x2  |psi(k+G)|^2  (G+k)_x1 * (G+k)_x2 , x1,x2 = x,y,z) 
!=====================================================================
    use basicdata
    implicit none
#include "apply_h_decl.h"    
    integer,intent(in)   :: nplwkp(nkprun)
    complex*WF_PRECISION :: cptwfp(nrplwv,nbands,nkpmem) 
    real*8               :: occupation(nbands,nkprun),wtkpt(nkprun)
    integer              :: ndcbyt
    logical*4, intent(in):: ldonkp(nkprun)   
    real*8,intent(inout) :: timer(*)
!--------------------------------------------------------------------- 
!   locals 
    real*8  :: h(3),wght,h1(3)
    integer :: nb,x1,n,nkpeff
    real*8  :: stress(3,3) , stress_local(3,3)
    integer :: nplwkp_local,offs
#ifdef PARAL
    integer :: nlocal(par_pw_np),offset(par_pw_np)
#endif
!=======================================================================

!   zero stress
    stress(1:3,1:3) = 0.0d0 

!   loop over k-points
    do nkp = 1,nkprun 

      stress_local(1:3,1:3) = 0.0d0 

      if (.not.ldonkp(nkp)) cycle

!----------------------------------------------------------------------------
#ifdef SERIAL
         nplwkp_local = nplwkp(nkp)
         offs = 0
#else
         ! for parallel job set offset for dnlg
         ! and define the number of planewaves on this node (nlocal)
         call par_defwfk(nrplwv,nrplwv_global,nplwkp(nkp),nlocal,offset,&
                         nplwkp_local,&
#include PARAL_ARGS
                        ,nconso)
         offs = offset(par_process+1)
#endif
!----------------------------------------------------------------------------
         call  wfswap_get(cptwfp,ndcbyt,nkpeff,&
#include                "apply_h_args.h"
                         )
!        loop over bands
         do nb = 1,nbands 
 
           wght = occupation(nb,nkp)*wtkpt(nkp)

           ! loop over x1, x2=1:3
           do x1 = 1,3
!             loop over G-vectors
              do n = 1,nplwkp_local
                h1(1:3) = dnlkg(n+offs,x1,nkp)*dnlkg(n+offs,1:3,nkp)*(dnlkg(n+offs,0,nkp)**2)
                h(1:3)  = wght*dble(cptwfp(n,nb,nkpeff)*conjg(cptwfp(n,nb,nkpeff)))*h1(1:3)
                stress_local(x1,1:3) = stress_local(x1,1:3) + h(1:3) 
              enddo

           enddo ! x1
         enddo   ! nb 

#ifdef PARAL
!       Make global sum of  stress contributions
        call par_sum_double ('A',stress_local,stress_local,3,3,3,&
#include PARAL_ARGS
        ,timer)
#endif

    stress(1:3,1:3) = stress(1:3,1:3)+ stress_local(1:3,1:3)
    enddo      ! nkp 

    stress(1:3,1:3) = -hsqdtm*2.0d0*stress(1:3,1:3)/volc

#ifdef PARAL
!   sum stress contributions for all k-points
    call mssum (stress, stress,3,3,3,&
#include PARAL_ARGS
         ,nconso,timer )
#endif PARAL

    call symmetrize_stress(stress)

    call write_stress(stress,'kinetic') 
    stress_all%kinetic = stress

    end subroutine kinetic_stress


!======================================================================================

    subroutine hartree_stress(rdensr,nspin,ngx,ngy,ngz,dnlg0,hartree_energy,volume)
!======================================================================================
!   Hartree energy part of stress
!       2pi e^2 sum(G') |rho(G)|^2/|G|^2 [2*G_i*G_j/|G|^2 - u(i,j)], i,j = x,y,z
!======================================================================================
    use basicdata, only : edeps
    implicit none
    integer,intent(in)  :: nspin,ngx,ngy,ngz    ! spin,fft grid
    real*8,intent(in)   :: rdensr(:,:)          ! real space density (nspin,ngx*ngy*ngz)
    real*8, intent(in)  :: dnlg0(:,:)           ! G array (3,ngx*ngy*ngz)
    real*8, intent(in)  :: hartree_energy 
    real*8, intent(in)  :: volume               ! volume of unit cell
!---------------------------------------------------------------------
!   locals
    complex*16, allocatable :: densG(:)  ! too hold density in reciprocal space
    integer                 :: nplwv,ng,x1
    real*8                  :: rinplw
    real*8                  :: stress(3,3),sizeG2,h(3),h1
 
!=======================================================================
 
!   zero stress
    stress(1:3,1:3) = 0.0d0

    nplwv = ngx*ngy*ngz
    rinplw=1.0d0/dble(nplwv)
    allocate(densG(ngx*ngy*ngz))


    if (nspin.eq.1) then 
      densG(1:nplwv) = rdensr(1:nplwv,1)*rinplw
    else 
      densG(1:nplwv) = (rdensr(1:nplwv,1)+rdensr(1:nplwv,2))*rinplw
    endif
    call fft3d(densG,ngx,ngy,ngz,-1)   

    h1 = 0.0d0
    ! loop over G vectors excluding G=0
    do ng = 1,nplwv
         sizeG2 = sum(dnlg0(ng,1:3)**2) 
         if (sizeG2.gt.0.0000001d0) then 
           h1 = h1 + (edeps/volume)*dble(densG(ng)*conjg(densG(ng)))/sizeG2
           do x1 = 1,3 
             h(1:3) = dble(densG(ng)*conjg(densG(ng)))* & 
                      2.0d0*dnlg0(ng,x1)*dnlg0(ng,1:3)/(sizeG2**2)
             stress(x1,1:3) = stress(x1,1:3) + (edeps/volume)*h(1:3)
           enddo 
         endif
    enddo 
    deallocate(densG)  

    ! volume term 
    do x1 = 1,3
        stress(x1,x1) = stress(x1,x1)-hartree_energy
    enddo
    stress = stress/volume/2.0d0

    call write_stress(stress,'hartree') 
    stress_all%hartree = stress


    end subroutine hartree_stress


!======================================================================================
    subroutine xc_stress(rdensr,vxc,exc,nspin,ngx,ngy,ngz,dnlg0,volume,rho_core, & 
                         nions,nspec,nionsp, posion, dirc )
!======================================================================================
!   exchange-correlation energy part of stress
!       u(i,j) (-Exc - dExc/d_rho rho) 
!
!   exchange-correlation core correction part of stress
!       E = sum(G) n_val(G) * exc(n_val+n_core)(G) 
!                                                         I         I
!       => u(i,j) = sum(G) n_val(G)*Vxc(G)*sum(I) [ strfac(G) dn_core(G)/de(i,j) ]
!
!              I                    I                              I
!       dn_core(G)/de(i,j) = -dn_core(G)/d|G|^2* 2*Gi*Gj - d(i,j)*n_core(G)
!                                       (1)                      (2)
!       (2) is done in real space
!
!        
!======================================================================================
    use basicdata, only : bohr
    implicit none
    integer,intent(in)  :: nspin
    integer,intent(in)  :: ngx,ngy,ngz    ! fft grid
    real*8, intent(in)  :: dnlg0(:,:)     ! G array (ngx*ngy*ngz,3)    
    real*8,intent(in)   :: rdensr(:,:)    ! real space density (nspin,ngx*ngy*ngz)
    real*8 ,intent(in)  :: vxc(:,:)       ! exchange-correlation potential
    real*8 ,intent(in)  :: exc            ! exchange-correlation energy
    real*8 ,intent(in)  :: volume         ! volume of unit cell
    real*8, intent(in)  :: rho_core(:)    ! partial core on realspace grid
    integer,intent(in)  :: nions,nspec,nionsp(nspec)
    real*8 ,intent(in)  :: posion(3,nions,nspec),dirc(3,3)
!---------------------------------------------------------------------
!   locals
    integer                 :: nplwv,ng,x1,nr,nsp,ni,na,index,spin,ispin
    real*8                  :: stress(3,3),h,str(6),arg,gx1,gx2,xyz(3),rinplw
    real*8, allocatable     :: dncore(:)
    complex*16, allocatable :: vxcG(:)
    complex*16, allocatable :: strfac(:,:)  ! too hold structure factor 

!=======================================================================

!   zero stress
    stress(1:3,1:3) = 0.0d0

    nplwv = ngx*ngy*ngz

    do x1 = 1,3 
        stress(x1,x1) = exc/volume   ! ??? -exc
    enddo

    ! loop over real space grid points 
    h = 0.0d0
    if (nspin.eq.1) then 
         do nr = 1,nplwv
            h = h + vxc(nr,1)*rdensr(nr,1)
         enddo
    else 
         do ispin = 1,nspin
            do nr = 1,nplwv
               h = h + vxc(nr,ispin)*rdensr(nr,ispin)
            enddo
         enddo
         
    endif
    do x1 = 1,3
        stress(x1,x1) = stress(x1,x1) + (-h/dble(nplwv))/volume
    enddo

    call write_stress(stress,'xc')
    stress_all%xc = stress

!   core correction part of stress 
    stress(1:3,1:3) = 0.0d0 

    allocate(strfac(nspec,ngx*ngy*ngz))
    allocate(dncore(ngx*ngy*ngz))

    rinplw=1.0d0/dble(nplwv)
    allocate(vxcG(ngx*ngy*ngz))
 


    str(1:6) = 0.0d0
    do ispin = 1,nspin

      ! fourier transform xc potential to reciprocal space
      vxcG(1:nplwv) = vxc(1:nplwv,ispin)*rinplw
      call fft3d(vxcG,ngx,ngy,ngz,-1)

      na = 1
!     loop over different elements
      do nsp = 1,nspec

        ! get dn_core(G)/d|G|^2
        call dncore_dg2(nplwv,dnlg0,volume,dncore,nsp)  
        dncore = dncore*(bohr**2)

        strfac(nsp,1:nplwv) = 0.0d0
        do ni = 1,nionsp(nsp)
           call xyzposna(nions,nspec,nionsp,posion,na,dirc,xyz)
           do ng = 1,nplwv
             ! sum structure factor for each kind of element
             arg = (dnlg0(ng,1)*xyz(1)+dnlg0(ng,2)*xyz(2)+ dnlg0(ng,3)*xyz(3))
             strfac(nsp,ng) = strfac(nsp,ng) + cmplx(cos(arg),-sin(arg))
           enddo
           na = na + 1
        enddo
        do index = 1,6
           ! do this part in reciprocal space
           do ng = 1,nplwv
              gx1 = dnlg0(ng,x1_index(index))
              gx2 = dnlg0(ng,x2_index(index))
              str(index) = str(index) - & 
                  dble(conjg(vxcG(ng))*strfac(nsp,ng))*dncore(ng)*2.0d0*gx1*gx2
           enddo 
        enddo
  
      enddo ! nspec

    enddo   ! spin

    str(1:6) = str(1:6)/dble(nspin)

!   do (2) in real space
    h = 0.0d0
    if (nspin.eq.1) then 
       do nr = 1,nplwv 
          h = h - vxc(nr,1)*rho_core(nr)
       enddo 
       h = h/dble(nplwv)
    else 
       do nr = 1,nplwv 
          h = h - (vxc(nr,1)+vxc(nr,2))*rho_core(nr)
       enddo 
       h = h/2.0d0/dble(nplwv)
    endif


    Do index=1,6 
        if (x1_index(index).eq.x2_index(index)) then 
          str(index) = str(index) + h 
        endif
    enddo 

    do index=1,6 
        stress(x1_index(index),x2_index(index)) = str(index)
        stress(x2_index(index),x1_index(index)) = str(index)
    enddo 
     
    stress = stress/volume
    call symmetrize_stress(stress)
    call write_stress(stress,'xc_cc')
    stress_all%xc_cc = stress

    deallocate(dncore,vxcG,strfac)

    end subroutine xc_stress

!======================================================================================
    subroutine xc_gga_stress(ngx,ngy,ngz,df2,dnabln,absnab,volume )
!======================================================================================
!   calculate GGA stress contribution
!   se Corso and Resta, PRB 50, 4327 (1994)
!   eg. 24 second term (first term calculated in xc_stress)
!        
!        sum(R) dfxc/d(d_x1n) * dn/dr_x2
!
!--------------------------------------------------------------------------------------
    use basicdata, only : bohr
    implicit none
    integer ,intent(in) :: ngx,ngy,ngz 
    real*8 , intent(in) :: df2(ngx*ngy*ngz)       ! dfxc/d|nable n|
    real*8 , intent(in) :: dnabln(ngx*ngy*ngz,3)  ! nabla n
    real*8 , intent(in) :: absnab(ngx*ngy*ngz)    ! |nabla n|
    real*8 , intent(in) :: volume
!---------------------------------------------------------------------
!   locals
    integer   :: ng,index,x1,x2
    real*8    :: str(6),stress(3,3)   

    if (.not.cal_xc_gga_stress) return

    str(1:6) = 0.0d0  
    do index = 1,6 
      x1 = x1_index(index) 
      x2 = x2_index(index) 

      do ng = 1,ngx*ngy*ngz 
         str(index) = str(index) + df2(ng)*dnabln(ng,x1)*dnabln(ng,x2)/absnab(ng)
      enddo 
    enddo 
    str(1:6) = -str(1:6)/dble(ngx*ngy*ngz)*bohr/volume
    do index=1,6 
        stress(x1_index(index),x2_index(index)) = str(index)
        stress(x2_index(index),x1_index(index)) = str(index)
    enddo 

    call symmetrize_stress(stress)
    call write_stress(stress,'xc_gga')
    stress_all%xc_gga = stress              

    end subroutine xc_gga_stress


!======================================================================================
    subroutine xc_spin_gga_stress(ngx,ngy,ngz,df2,dfud,dnabln,absnab,volume)
!======================================================================================
!   calculate GGA stress contribution
!   for spin-polarized GGA
!   se Corso and Resta, PRB 50, 4327 (1994)
!   eg. 24 second term (first term calculated in xc_stress)
!
!        sum(R) dfxc/d(d_x1n) * dn/dr_x2  (skal rettes) 
!
!--------------------------------------------------------------------------------------
    use basicdata, only : bohr
    implicit none
    integer,parameter   :: nspin=2
    integer ,intent(in) :: ngx,ngy,ngz
    real*8 , intent(in) :: df2 (ngx*ngy*ngz,nspin)     ! dfx/d|nable n_up| * 1/|nable n_up|      (exchange)
    real*8 , intent(in) :: dfud(ngx*ngy*ngz)           ! dfc/d|nable(n_up + n_dn)| * 1/|nable n| (correlation)
    real*8 , intent(in) :: dnabln(ngx*ngy*ngz,nspin,3) ! nabla n_up
    real*8 , intent(in) :: absnab(ngx*ngy*ngz,nspin)   ! |nabla n_up|
    real*8 , intent(in) :: volume
!---------------------------------------------------------------------
!   locals
    integer   :: ng,index,x1,x2,ispin
    real*8    :: str(6),stress(3,3)
    real*8    :: dnx1,dnx2
 
    if (.not.cal_xc_gga_stress) return
 
    str(1:6) = 0.0d0
    do index = 1,6
      x1 = x1_index(index)
      x2 = x2_index(index)


      ! exchange
      do ispin = 1,nspin
         do ng = 1,ngx*ngy*ngz
            str(index) = str(index)+df2(ng,ispin)*dnabln(ng,ispin,x1)*dnabln(ng,ispin,x2)
         enddo
      enddo

      ! correlation
      do ng = 1,ngx*ngy*ngz
         dnx1  = dnabln(ng,1,x1) + dnabln(ng,2,x1)
         dnx2  = dnabln(ng,1,x2) + dnabln(ng,2,x2)
         str(index) = str(index)+dfud(ng)*dnx1*dnx2
      enddo

    enddo ! index

    str(1:6) = -str(1:6)/dble(ngx*ngy*ngz)*bohr/volume
    do index=1,6
        stress(x1_index(index),x2_index(index)) = str(index)
        stress(x2_index(index),x1_index(index)) = str(index)
    enddo
    call symmetrize_stress(stress)
    call write_stress(stress,'xc_gga')
    stress_all%xc_gga = stress
 
    end subroutine xc_spin_gga_stress

!======================================================================================
    subroutine localpsp_stress(rdensr,nspin,ngx,ngy,ngz,nions,nspec,nionsp, & 
                               volume,posion, dirc, cvion,dnlg0, enalp )
!======================================================================================
!   local pseudo-potential contribution to stress
!                                    I                                  I
!   rho_x1x2 = -sum(G,I) S_I(G) [dVloc/d|G|^2 2*G_x1*G_x2 + u(x1,x2)*Vloc(G)]*rho(G) 
!   calculated like: 
!             do nsp = 1,nspec 
!                do ni = 1,nionsp(nsp) 
!                  do ng = 1,nplvw 
!                     sum strfac(ng,nsp) 
!                  enddo 
!                enddo 
!                calculate dvloc/dG^2
!                do ng = 1,nplwv 
!                  rho(x1,x2) sum strfac(ng,nsp)*dvloc(ng)*G_x1*G_x2 * rho(G) 
!                  rho(x1,x1) sum strfac(ng,nsp)*Vloc(G)*rho(G)
!                enddo 
!             enddo 
!                     
!======================================================================================
    use van_us_data_module, only : g,rydb3_eva3,bohr
    implicit none
   
    integer, intent(in)   :: nspin
    integer,intent(in)    :: ngx,ngy,ngz                      ! fft grid
    real*8,intent(in)     :: rdensr(ngx*ngy*ngz,nspin)        ! real space density (ngx*ngy*ngz,nspin)     
    real*8 ,intent(in)    :: volume                           ! volume of unit cell
    integer,intent(in)    :: nions,nspec,nionsp(nspec)
    real*8 ,intent(in)    :: posion(3,nions,nspec),dirc(3,3)
    complex*16,intent(in) :: cvion(*)
    real*8, intent(in)    :: dnlg0(:,:)                       ! G array (ngx*ngy*ngz,3)  
    real*8, intent(in)    :: enalp                            ! aZ contribution to energy
!---------------------------------------------------------------------
!   locals
    integer                 :: nplwv,ng,x1,nsp,ni,na,n,x2,i
    complex*16, allocatable :: densG(:)     ! too hold density in reciprocal space
    complex*16, allocatable :: strfac(:,:)  ! too hold structure factor
    real*8,     allocatable :: dvloc(:)     ! too hold dVloc/dG
    real*8                  :: stress(3,3),rinplw,xyz(3),arg,h(3),h1
    real*8,     allocatable :: vloc(:)      ! too hold Vloc(G)
 
!=======================================================================
 
!   zero stress
    stress(1:3,1:3) = 0.0d0

    nplwv = ngx*ngy*ngz

    ! fourier transform rdensr to reciprocal space (densG)
    rinplw=1.0d0/dble(nplwv)
    allocate(densG(ngx*ngy*ngz))
    allocate(strfac(nspec,ngx*ngy*ngz))
    allocate(dvloc(ngx*ngy*ngz))

    if (nspin.eq.1) then  
      densG(1:nplwv) = rdensr(1:nplwv,1)*rinplw
    else 
      densG(1:nplwv) = (rdensr(1:nplwv,1)+rdensr(1:nplwv,2))*rinplw
    endif
      
    call fft3d(densG,ngx,ngy,ngz,-1)

    na = 1
    do nsp = 1,nspec 
      strfac(nsp,1:nplwv) = 0.0d0
      do ni = 1,nionsp(nsp) 
         call xyzposna(nions,nspec,nionsp,posion,na,dirc,xyz)
         do ng = 1,nplwv 
           ! sum structure factor for each kind of element
           arg = (dnlg0(ng,1)*xyz(1)+dnlg0(ng,2)*xyz(2)+ dnlg0(ng,3)*xyz(3)) 
           strfac(nsp,ng) = strfac(nsp,ng) + cmplx(cos(arg),-sin(arg)) 
         enddo 
         na = na + 1
      enddo 

      write(6,*) 'dvloc_of_g '
      call uflush(6)
  
      ! calculate dvloc_of_G
      call dvloc_of_g(nplwv,dnlg0,volume,dvloc,nsp) 
      dvloc(1:nplwv) = dvloc(1:nplwv)*rydb3_eva3*bohr**2

      write(6,*) 'end dvloc_of_g '
      call uflush(6)

      do x1 = 1,3
        do x2 = 1,3
        do ng = 1,nplwv 
          h1 = -dble(strfac(nsp,ng)*conjg(densG(ng)))*dvloc(ng)*2.0d0*dnlg0(ng,x1)*dnlg0(ng,x2)
          stress(x1,x2) = stress(x1,x2) + h1
        enddo
        enddo
      enddo 
    
    enddo  ! nspec    

    h1 = 0.0d0
    if (nspin.eq.1) then 
      do n = 1,nplwv
        h1 = h1 + dble(cvion(n))*rdensr(n,1) 
      enddo 
    else 
      do n = 1,nplwv
        h1 = h1 + dble(cvion(n))*(rdensr(n,1)+rdensr(n,2)) 
      enddo 
    endif
    h1 = -h1*rinplw
    do x1 = 1,3 
        stress(x1,x1) = stress(x1,x1) + h1
    enddo 


    stress = stress/volume

    call write_stress(stress,'localpsp') 
    stress_all%localpsp = stress

!   alpha*Z contribtion to stress 
    stress = 0.0d0 
    do x1 = 1,3 
      stress(x1,x1) = -enalp/volume
    enddo 
    call write_stress(stress,'alphaZ')
    stress_all%alphaZ = stress


    allocate(vloc(nplwv))
    call setlocal1(1,nplwv,dnlg0,volume,vloc) 
    vloc(1:nplwv) = vloc(1:nplwv)*rydb3_eva3

    h1 = 0.0d0 
    do nsp = 1,nspec
      do ng = 2,nplwv 
          h1 = h1 + dble(strfac(nsp,ng)*conjg(densG(ng)))*vloc(ng)  
      enddo 
    enddo

    deallocate(densG,dvloc,vloc) 
 
    end subroutine localpsp_stress


!======================================================================================
    subroutine nonlocal_stress(cptwfp,nplwkp,ndcbyt,ldonkp, & 
#include                       "apply_h_args.h"                        
                               ,occupation,wtkpt,eigen,timer ) 
!======================================================================================
!   non-local kb energy part of stress
!                                                *                *
!     rho(x1,x2)  = sum(nb) sum(n,m) Dnm [d<n|nb>/de*<m|nb> + <n|a>*d<m|nb>/de]
!     (de=de_x1x2) 
!                                        *
!     d<n|nb>/de = d(exp(-iGR)*psi(G)*beta(G))/de,   beta(G) = f(G)*Ylm(G)  
!                
!     dbeta(G)/de = df(G)/de Ylm(G) + f(g)*dYlm(G)/de 
!     
!     dpsi(G)/de  = -1/2*u(x1,x2)*psi(G)   (d(V^0.5*psi(G))/de = 0) 
! 
!     => 
!     d<n|nb>/de =     
!
!======================================================================================
    use basicdata 
    use van_us_data_module
    use us_hpsi_module                    ! cal_vkb and cal_bec
    use non_local_projectors
    implicit none
#include "apply_h_decl.h"  
include 'readvan.h'
    integer, intent(in)  :: nplwkp(nkprun),ndcbyt 
    complex*WF_PRECISION, & 
             intent(in)  :: cptwfp(nrplwv,nbands,nkpmem) 
    real*8, intent(in)   :: occupation(nbands,nkprun),wtkpt(nkprun),eigen(nbands,nkprun)
    logical*4, intent(in):: ldonkp(nkprun)    
    real*8,intent(inout) :: timer(*) 
!---------------------------------------------------------------------
!   locals
    integer            :: ng,x1,nb,n,index,nplw,np,j,x2,nkpeff,nstart
    integer            :: jkb,nhjkb,na,it,nhjkbm,jkb1
    complex*16         :: becp0(nkb,nbands),dbecp(nkb,nbands,9)
    real*8,allocatable :: Gx1Gx2divabsG(:)  ! nrplwv
    real*8             :: stress(3,3),wght,dnm_m_eqnm,h,stress_local(3,3)
    complex*16         :: ps,pss
    real*8             :: vq(nrplwv_global,nbrx,nspec)  
    real*8             :: dvq(nrplwv_global,nbrx,nspec)  
    real*8             :: vq1(nrplwv_global,nbrx,nspec)  
    real*8             :: ylm(nrplwv_global,(lqx)**2),dylm(nrplwv_global,(lqx)**2)
    real*8             :: vkb1(nrplwv,nkbpmaxatom,nspec,nkpmem)
    real*8             :: vkb2(nrplwv,nkbpmaxatom,nspec,nkpmem)
!=======================================================================


!   zero stress
    stress(1:3,1:3) = 0.0d0

    allocate(Gx1Gx2divabsG(nrplwv_global))

!   loop over kpoints
    do nkp = 1,nkprun 

      if (.not.ldonkp(nkp)) cycle
 
      vkb1 = 0.0d0
      vkb2 = 0.0d0
      vkb  = 0.0d0

      call  wfswap_get(cptwfp,ndcbyt,nkpeff,&
#include                "apply_h_args.h"
                      )
      nplw = nplwkp(nkp)
      do n = 1,nplw
         gkabs(n) = sum(dnlg(n,1:3,nkp)**2)
      enddo

      becp0(1:nkb,1:nbands) = becp(1:nkb,1:nbands,nkp)

!     ylm 
      call ylmr2(lqx**2,0,nplw,nrplwv_global,dnlg(1,1,nkp), & 
               gkabs,nrplwv_global,(lqx)**2,ylm)

!     f(G)
      call calcvq(nplw,nrplwv_global,nspec,gkabs,vq,0)    

!     df(G)/dG
      call calcvq(nplw,nrplwv_global,nspec,gkabs,dvq,1)    

!     loop over elements in upper part of x1x2
!     do index = 1,6 
      index = 1
      do x1 = 1,3
       do x2 = 1,3

!        setup dbeta(G)/de = df(G)/de Ylm(G) + f(g)*dYlm(G)/de 
!        in vkb 

         Gx1Gx2divabsG(1) = 0.57735027d0
         nstart = 1
         if (gkabs(1).lt.0.0000001) then 
           nstart = 2
         endif
         do n = nstart,nplw
            Gx1Gx2divabsG(n) = dnlg(n,x1,nkp)* & 
                             dnlg(n,x2,nkp)/gkabs(n)
         enddo 

!        df(G)/de = df(G)/de*G_x1*G_x2/|G|
         do np = 1,nspec 
           do nb = 1,nbeta(np) 
             vq1(1:nplw,nb,np) = dvq(1:nplw,nb,np)*Gx1Gx2divabsG(1:nplw)
           enddo 
         enddo

!        vkb = df(G)/de*ylm(G)
         call dvus_bessel(nplw,nrplwv_global,nrplwv,nspec,nkbp,nhm, &  
                     vq1,vkb1(1,1,1,nkpeff),ylm,nh,indv,nhtol,    & 
                     nhtom,volc,vkbreal & 
#ifdef PARAL
                    ,&
#include             PARAL_ARGS
#endif
                    )

         call dylmr2(lqx**2,0,nplw,nrplwv_global,dnlg(1,1,nkp),gkabs,nrplwv_global, & 
                (lqx)**2,dylm, Gx1Gx2divabsG, x1,x2)  


         call dvus_bessel(nplw,nrplwv_global,nrplwv,nspec,nkbp,nhm, &
                     vq,vkb2(1,1,1,nkpeff),dylm,nh,indv,nhtol,     & 
                     nhtom,volc,vkbreal & 
#ifdef PARAL
                    ,&
#include             PARAL_ARGS
#endif
                    )


!        vkb = vkb + vkb1
         vkb(1:nrplwv,1:nkbpmaxatom,1:nspec,nkpeff) = & 
              vkb1(1:nrplwv,1:nkbpmaxatom,1:nspec,nkpeff)  + & 
              vkb2(1:nrplwv,1:nkbpmaxatom,1:nspec,nkpeff)
 
!        loop over bands 
         do nb = 1,nbands
            ! use cal_bec to multiply make sum(G) (dbeta(G)/de)*(exp(-iGR)*psi(G))
            call cal_bec(nplwv,nrplwv,nrplwv_global,nplw,nkp,nb, & 
                       nions,nspec,nionsp,nbands,nkprun,     & 
                       dnlg(1,1,nkp),posion,dirc,volc,       &
                       ngxs,ngys,ngzs,ipwpad(1,nkp),nkpmem   & 
#ifdef PARAL
                       ,&
#include               PARAL_ARGS
#endif

                       ,timer,reci_psi=cptwfp(1,nb,nkpeff))


            if (x1.eq.x2) then 
              becp(1:nkb,nb,nkp) = becp(1:nkb,nb,nkp) - 0.5d0*becp0(1:nkb,nb) 
            endif
         enddo
         dbecp(1:nkb,1:nbands,index) = becp(1:nkb,1:nbands,nkp) 

         index = index + 1
       enddo ! x1 
      enddo  ! x2

      do nb = 1,nbands

        wght = occupation(nb,nkp)*wtkpt(nkp)
        do jkb = 1, nkb
          nhjkb = nkbtonh(jkb)
          na = nkbtona(jkb)
          it = ityp(na)
          nhjkbm = nh(it)
          jkb1 = jkb - nhjkb
          ! now the sums sum_m (d_nm-e_a qnm) {<m|a>, <m|iG|a>}

          stress_local(1:3,1:3) = 0.0d0 
          index = 1
          do x1 = 1,3
           do x2 = 1,3
             ps = 0.0d0
             pss= 0.0d0
             do j = 1,nhjkbm
                dnm_m_eqnm = deeq(nhjkb,j,na,kspin(nkp))-eigen(nb,nkp)*qq(nhjkb,j,it)
                ps = ps + becp0(jkb1+j,nb    )  * dnm_m_eqnm
                pss= pss+ dbecp(jkb1+j,nb,index)* dnm_m_eqnm
             enddo
             ! 111 format(1x,a20,1x,i2,1x,i2,1x,i2,2(f12.8,1x))
             ! write(6,111) 'pss ',x1,x2,nb,conjg(dbecp(jkb,nb,index))*ps + conjg(becp0(jkb,nb    ))  *pss

             stress(x1,x2) = stress(x1,x2) &
                  + wght*dble(conjg(dbecp(jkb,nb,index))*ps & 
                            + conjg(becp0(jkb,nb    ))  *pss)

             stress_local(x1,x2) = stress_local(x1,x2) &
                  + wght*dble(conjg(dbecp(jkb,nb,index))*ps & 
                            + conjg(becp0(jkb,nb    ))  *pss)

             ! write(6,*) 'stress_local ',x1,x2,nkp,nb
             ! call write_stress(stress_local,'stress_local') 

             index = index + 1
           enddo  ! x1
          enddo   ! x2
        enddo     ! jkb

      enddo ! ibnd = 1,nbands

    enddo     ! nkp = 1,nkprun

!   do index = 1,6 
!     stress(x2_index(index),x1_index(index)) = stress(x1_index(index),x2_index(index))
!   enddo

    stress(1:3,1:3) = stress(1:3,1:3)/volc

    call symmetrize_stress(stress)

#ifdef PARAL
!   sum stress contributions for all k-points
    call mssum (stress, stress,3,3,3,&
#include PARAL_ARGS
         ,nconso,timer )
#endif PARAL

    call write_stress(stress,'nonlocal') 
    stress_all%nonlocal = stress 

    end subroutine nonlocal_stress


!======================================================================================
  subroutine us_stress(&
                 nplwv,nrplwv,nrplwv_global,nions,nspec, &
                 nionsp,nbands, nspin,nkprun,kspin,wtkpt,ldonkp,&
                 posion,dirc,volc, cveff, & 
                 ngx,ngy,ngz, & 
                 nconso &
#ifdef PARAL
                 ,&
#include PARAL_ARGS
#endif
                 ,timer)

!======================================================================================
! 
!     For the US pseudopotential calculate the contribution to the stress 
!     for  the Dnm dependence on Qnm
!
!                                                                       I I
!       rho_x1x2 = sum(G) Veff(G)* sum(i,nm,I) dQnm(G)/du_x1x2 <psi|beta> <beta|psi> 
!
!
!     1.  Q_nm(G) = sum(LM) c(LM,nm) Ylm(G)  Q_L(nm)  
!         c(LM,nm) is Clebsch-Gordon coefficients
! 
!          I 
!     2.  Q_nm(G)  = Qnm(G)  exp(-i G R ) 
!
!----------------------------------------------------------------------
      use van_us_data_module
      implicit   none 
      integer,intent(in)    ::  nplwv,nrplwv,nrplwv_global,nions,nspec,nionsp(nspec)
      integer,intent(in)    ::  nbands,nspin,nkprun,kspin(nkprun)
      real*8,intent(in)     ::  wtkpt(nkprun),posion(3,nions,nspec),dirc(3,3)
      logical*4, intent(in) ::  ldonkp(nkprun)    
      real*8,intent(in)     ::  volc
      complex*16,intent(in) ::  cveff(nplwv,nspin)
      integer,intent(in)    ::  ngx,ngy,ngz
      integer,intent(in)    ::  nconso
      real*8,intent(inout)  ::  timer(*)
#ifdef PARAL
#include      PARAL_DECL
#endif

!     locals 
      integer    :: i,index,ng,ispin,m,x1,x2,nkp
      complex*16 :: qg(ngdens_max),ps,auxfft(nplwv)
      real*8     :: str(6),gx1gx2divgabs(ngdens_max),stress(3,3)
      complex*16 :: aux(ngdens_max,nspin)
      real*8     :: rinplw
      

!-----------------------------------------------------------------------
!     Get Fourier transform of effective potential cveff* (in aux)
      rinplw=1.0d0/nplwv*dble(nspin)
      do ispin = 1,nspin
        do i=1, nplwv
          auxfft(i) = cveff(i,ispin)*rinplw
        enddo
        call fft3d(auxfft,ngx,ngy,ngz,-1)
        do m = 1,ngdens_max
          aux(m,ispin) = auxfft(ipwpadG(m,0))
        enddo
      enddo
!-----------------------------------------------------------------------


      str(1:6) = 0.0d0    
      
      do nkp = 1,nkprun

       if (.not.ldonkp(nkp)) cycle
       do index = 1,6

         qg(1:ngdens_max) = 0.0d0
         ! get contrbution from term: dYlm/du*Qnm
         call calcusdens(&
            nplwv,nrplwv,nrplwv_global,nions,nspec,&
            nionsp,nbands, nkprun,nkp,qg,&
            posion,dirc,volc,.true.,index,nconso & 
#ifdef PARAL
            ,&
#include PARAL_ARGS
#endif
            ,timer)

         ps = 0.0d0
         do ng = 2,ngdens_max
            ps = ps + conjg(aux(ng,kspin(nkp)))*qg(ng) 
         enddo
         str(index) = str(index) + dble(ps)
           
       enddo 
      enddo

!     setup d qrad/dG 
      call calcqr(ngdens,ngldim,ngl,nspec,volc,gg(1+goffs),qrad,1,&
                  ivjv2index,maxindex)

      do nkp = 1,nkprun

       if (.not.ldonkp(nkp)) cycle
       do index=1,6 
 
         x1 = x1_index(index)       
         x2 = x2_index(index)       

         ! get contrbution from term: Ylm*dQnm/du
         qg(1:ngdens_max) = 0.0d0
         call calcusdens(&
              nplwv,nrplwv,nrplwv_global,nions,nspec,&
              nionsp,nbands, nkprun,nkp, qg,&
              posion,dirc,volc,.false.,index,nconso & 
#ifdef PARAL
             ,&
#include PARAL_ARGS
#endif
             ,timer)

         gx1gx2divgabs(1) = 0.0d0
         do ng = 2,ngdens_max
           gx1gx2divgabs(ng) = g(ng,x1)*g(ng,x2)/gg(ng)
         enddo

         ps = 0.0d0
         do ng = 2,ngdens_max 
           ps = ps + gx1gx2divgabs(ng)*qg(ng)*conjg(aux(ng,kspin(nkp)))
         enddo 
         str(index) = str(index) + dble(ps)
      
       enddo        
      enddo

      do index=1,6
        stress(x1_index(index),x2_index(index)) = str(index)
        stress(x2_index(index),x1_index(index)) = str(index)
      enddo

      stress(1:3,1:3) = stress(1:3,1:3)/volc/dble(nspin)

#ifdef PARAL
!     sum stress contributions for all k-points
      call mssum (stress, stress,3,3,3,&
#include PARAL_ARGS
                 ,nconso,timer )
#endif PARAL

      call symmetrize_stress(stress)

      call write_stress(stress,'us_Qnm')
      stress_all%us_Qnm = stress

      end subroutine us_stress


!======================================================================================
    subroutine write_stress(stress,text) 
!======================================================================================
!   write stress to ascii file
!======================================================================================
    use run_context
    implicit none  

    real*8, intent(in)       :: stress(:,:) 
    character*(*),intent(in) :: text

!   locals 
    integer :: i,j 

    do i = 1,3 
        write(nconso,10) text,i,(stress(i,j),j=1,3)
    enddo 
10  format(1x,'stress:',a20,1x,i1,1x,3(f12.8,1x))

    call uflush(nconso)

    end subroutine write_stress


!======================================================================================
    subroutine symmetrize_stress(stress)
!======================================================================================
!   use point group operations on stress components from IBZ k-points
!======================================================================================
    real*8, intent(inout) :: stress(3,3)

!   locals 
    integer :: i,j,numsym
    real*8  :: sym_stress(3,3) 

    if (.not.allocated(point_group_op)) return

    numsym = size(array=point_group_op,dim=3)

!   divide by the number of group operations 
    stress(1:3,1:3) = stress(1:3,1:3)/dble(numsym) 

    sym_stress(1:3,1:3) = 0.0d0 
    do i = 1,3 
      do j = 1,3 

!        sum over rotations
         do nsym = 1,numsym 

           do k1 = 1,3 
             do k2 = 1,3 
                sym_stress(i,j) = sym_stress(i,j) + & 
                  point_group_op(i,k1,nsym)* & 
                       point_group_op(j,k2,nsym)*stress(k1,k2)
             enddo 
           enddo 
         enddo 
      enddo 
    enddo 
    stress(1:3,1:3) = sym_stress(1:3,1:3)

    end subroutine symmetrize_stress

 
!======================================================================================
    subroutine get_total_stress(lmastr, iunit)
!======================================================================================
!   sum stress components
!======================================================================================
    use run_context
    use netcdfinterface
    implicit none
    logical*4, intent(in) :: lmastr
!   locals 
    integer ncid,status 
    integer, intent(in)       ::  iunit 

    stress_all%total = stress_all%kinetic + stress_all%hartree + stress_all%xc       & 
                     + stress_all%xc_cc                                              & 
                     + stress_all%localpsp + stress_all%alphaZ + stress_all%nonlocal & 
                     + stress_all%ion_ion  + stress_all%us_Qnm + stress_all%xc_gga

    call symmetrize_stress(stress_all%total)

    call write_stress(stress_all%total,'total') 

    if (.not.lmastr) return ! only master nodes write to netcdffile 

! - define stress variable in netcdf 

    status = nf_open(netCDF_output_filename, NF_WRITE, ncid )
    if (status /= nf_noerr) &
        call abort_calc(nconso,  &
             "get_total_stress->nf_open: error")                         
    status = nfput(ncid,'TotalStress',stress_all%total(1:3,1:3), & 
                   dim_name1='dim3',dim_name2='dim3')                                

    if ((status/=nfif_OK).and.(status/=nfif_varexist_butOKdef)) & 
        call abort_calc(nconso,  &
             "get_total_stress->nfput: error")      

    status = nf_close(ncid)

 
    end subroutine get_total_stress

    end module stress_module



 
