#include "definitions.h"

      module us_hpsi_module

      contains 

!-----------------------------------------------------------------------
      subroutine usvnlpsi( &
                           spsi,nplwkp,lhpsi, lspsi,lcalbec,nbn,&
#include                  "apply_h_args.h"
                          ,timer,reci_psi,reci_vpsi, real_psi,real_vpsi)
!-----------------------------------------------------------------------
!     Compute V_NL (US pseudo-potential) times wavefunction psi 
!     and  overlab-matrx S times wavefunction psi.
!     The  becp (<psi|beta>) is calculated by call to cal_bec. 
! 
!     if lhpsi = false then H|psi> is not calculated
!     if lspsi = false then S|psi> is not calculated
!
!     input  : 
!        deeq  : Dnm
!        vkb   : beta(G)           
!        psi   : |psi> 
!        nplwkp: actual dimension of psi
!        skb   : sum Ri exp(-i (G+k) Ri)
!        kspin : tells which spin to use 
!
!     output :  
!        becp  : <psi|beta>
!        vpsi(G) = sum(n,m,i) Dnm beta_n(G) exp(-i (G+k) Ri) <psi|beta_m>
!        spsi(G) = sum(n,m,i) Qnm beta_n(G) exp(-i (G+k) Ri) <psi|beta_m>
! 
!-----------------------------------------------------------------------
      use non_local_projectors
      use run_context
      use van_us_data_module
      implicit none


#     include "apply_h_decl.h"
      integer      nplwkp
      complex*WF_PRECISION, optional :: reci_psi(*)
      complex*WF_PRECISION, optional :: reci_vpsi(*)
      complex*16          , optional :: real_psi(:),real_vpsi(*)
      

      complex*WF_PRECISION spsi(*) 
      complex*16   spsi1(nrplwv)
      logical*4    lhpsi,lspsi,lcalbec
      integer      nbn
      real*8       timer(*)


!     locals
      real*8   psr(nkbpmaxatom,nions),psc(nkbpmaxatom,nions)
      real*8   ps2(nkbpmaxatom,2,nions),temp2(nrplwv,2,nions)
      complex*16 ps(nkb),temp(nplwkp),alpha,beta
      complex*16 vpsi1(nrplwv),vpsi2(nrplwv)
      real*8     ac,ad,bc,bd

      integer jkb,nhjkb,nhjkbm,it,na,jkbp,i,j,jkb1,n,lda,ldc,l
      integer np,ni,iv,nproj,nsp,m,ldb
      integer nplwkp_local,nkpeff
#ifdef PARAL
      integer    nlocal(par_pw_np),offset(par_pw_np) 
#endif

#include "etime.h"
      real*8  ZERO,SMALL,rinplwv 
      complex*16   h
      complex*16, allocatable ::  spsiwork(:) ,vpsiwork(:)
      parameter(ZERO = 0.0d0,SMALL=1.0d-12) 
      external  dgemv

      call uttime(time)
      timer(TVNLWAV)=timer(TVNLWAV)-time(1)           

      call geteff(nkp,nkpmem,nconso,nkpeff) 

!     get becp for psi
      if (lcalbec) then 
        call cal_bec(nplwv,nrplwv,nrplwv_global,nplwkp,nkp,nbn, &
           nions,nspec,nionsp,nbands,nkprun,dnlg(1,1,nkp), & 
           posion,dirc,volc, ngxs,ngys,ngzs,ipwpad(1,nkp),nkpmem &
#ifdef PARAL
               ,&
#include PARAL_ARGS
#endif        
          ,timer,reci_psi,real_psi)
      endif

!--------------------------------------------------------------------
!     define nplwv  (number of planewaves)
#ifdef SERIAL
      nplwkp_local = nplwkp
#else
!     define the number of planewaves on this node (nlocal)
      call par_defwfk (nrplwv,nrplwv_global,nplwkp,nlocal,offset,&
                         nplwkp_local,&
#include PARAL_ARGS
        , nconso)
#endif
!--------------------------------------------------------------------


      if (lhpsi) then 

      call uttime(time)
      timer(THPSI)=timer(THPSI)-time(1)

!     zero vpsi      
      vpsi1(1:nrplwv) = dcmplx(0.0d0,0.0d0)
      vpsi2(1:nrplwv) = dcmplx(0.0d0,0.0d0)


!     multiply on projectors
      do jkb = 1, nkb
            nhjkb = nkbtonh(jkb)
            it = ityp(nkbtona(jkb))
            na = nkbtona(jkb)
            nhjkbm = nh(it)
            jkb1 = jkb - nhjkb

!           calculate sum_m d_nm bec_m
            ps(jkb) = dcmplx(ZERO,ZERO) 
            do j = 1,nhjkbm
               ps(jkb) = ps(jkb) + &
                 becp(jkb1+j,nbn,nkp)*deeq(nhjkb,j,na,kspin(nkp))
            enddo
      enddo

      if (luse_rs_proj) then 
        ! use rs projectors, add in real space to vpsi 
        ! do jkb = 1,nkb 
        ! na = nkbtona(jkb)
        !  do n = 1,ngridpts(na) 
        !    real_vpsi(index_rs2fft(n,na))=&
        !     real_vpsi(index_rs2fft(n,na))+vkb_rs(n,jkb)*ps(jkb)
        !  enddo 
        !enddo 

        nproj = 1
        na    = 1
        lda = size(vkb_rs(:,1))
        allocate(vpsiwork(lda))
        do nsp = 1,nspec
         do ni = 1,nionsp(nsp)

             l   = ngridpts(na)
             m   = nh(nsp)

             ldb = nh(nsp)
             n   = 1

             alpha = (1.0d0,0.0d0)
             beta  = (0.0d0,0.0d0)
             ldc = lda
             call zgemm('N','N',l,n,m,alpha,vkb_rs(1,nproj),lda,&
                        ps(nproj),ldb,beta,vpsiwork,ldc)

             ! vpsiwork(:) = 0.0d0
             ! do n= 1,l 
             !   do i = 1,m 
             !      vpsiwork(n) = vpsiwork(n) + vkb_rs(n,nproj+i-1)*&
             !                    ps(nproj+i-1) 
             !   enddo 
             ! enddo

             do n = 1,ngridpts(na) 
                real_vpsi(index_rs2fft(n,na))=&
                  real_vpsi(index_rs2fft(n,na)) + vpsiwork(n)
             enddo 
             na = na + 1
             nproj = nproj + nh(nsp)
         enddo ! ni
        enddo  ! nsp 
        deallocate(vpsiwork)
                                   
      else 
        ! reciprocal projectors

        na = 1
        do np = 1,nspec
          do ni = 1,nionsp(np)
            do iv = 1,nh(np)
              jkb = nkbc(na,iv)
              if (vkbreal(iv,np)) then 
!                (r  + i c)
!                  psr(iv,ni) = dreal(ps(jkb))
!                  psc(iv,ni) = dimag(ps(jkb))
                   ps2(iv,1,ni) = dreal(ps(jkb))
                   ps2(iv,2,ni) = dimag(ps(jkb))
              else
!                (-c + i r)
!                psr(iv,ni) = -dimag(ps(jkb))
!                psc(iv,ni) =  dreal(ps(jkb))
                 ps2(iv,1,ni) = -dimag(ps(jkb))
                 ps2(iv,2,ni) =  dreal(ps(jkb))
              endif
            enddo
            na = na + 1
          enddo
          na = na - nionsp(np)

!         multiby with nionsp(np)*nh(np) projectors for this atom
!         The real and the imaginary. part calc. separately.
          call dgemm('N','N',nplwkp_local, 2*nionsp(np), nh(np), 1.0d0,&
                vkb(1,1,np,nkpeff), nrplwv, &
                ps2,         nkbpmaxatom, &
                0.0d0,&
                temp2,       nrplwv )

          call uttime(time)
          timer(THPSI2)=timer(THPSI2)-time(1)
          do ni = 1,nionsp(np)
            do i=1,nplwkp_local
             vpsi1(i)= vpsi1(i)+skb(i,na,nkpeff)*&
                 dcmplx(temp2(i,1,ni),temp2(i,2,ni))
            enddo
            na = na +1
          enddo
           call uttime(time)
           timer(THPSI2)=timer(THPSI2)+time(1)
!          na = na - nionsp(np)
!
!          call dgemm('N','N',nplwkp_local, nionsp(np), nh(np), 1.0d0,
!     &          vkb(1,1,np,nkpeff), nrplwv, 
!     &          psc,         nkbpmaxatom, 
!     &          0.0d0,
!     &          tempr,       nrplwv)
!
!          call uttime(time)
!          timer(THPSI2)=timer(THPSI2)-time(1)
!          do ni = 1,nionsp(np)
!             do i=1,nplwkp_local
!              vpsi2(i)= vpsi2(i)+skb(i,na)*tempr(i,ni)
!             enddo 
!             na = na +1
!          enddo
!          call uttime(time)
!          timer(THPSI2)=timer(THPSI2)+time(1)

        enddo  ! nspec

!       now collect vpsi1,vpsi2 to vpsi 
        do i=1,nplwkp_local
!         ac = dreal(vpsi1(i))
!         ad = dimag(vpsi1(i))
!         bc = dreal(vpsi2(i))
!         bd = dimag(vpsi2(i))
          reci_vpsi(i)=  reci_vpsi(i) + vpsi1(i)
        enddo
 
      endif ! luse_rs_proj

      call uttime(time)
      timer(THPSI)=timer(THPSI)+time(1)

      endif ! if lhpsi

!     S|psi>
      if (lspsi) then  

      call uttime(time)
      timer(THPSI1)=timer(THPSI1)-time(1)

      do jkb = 1, nkb
            nhjkb = nkbtonh(jkb)
            it = ityp(nkbtona(jkb))
            na = nkbtona(jkb)
            nhjkbm = nh(it)
            jkb1 = jkb - nhjkb
!           calculate sum_m d_nm bec_m
            ps(jkb) = dcmplx(ZERO,ZERO)
            do j = 1,nhjkbm
               ps(jkb) = ps(jkb) + becp(jkb1+j,nbn,nkp)*&
                    qq(nhjkb,j,it)
            enddo
      enddo

      if (luse_rs_proj) then
        ! transform psi to realspace
        allocate(spsiwork(ngxs*ngys*ngzs))
        if (present(real_psi)) then 
          spsiwork(:)=real_psi(:)
        else 
          spsiwork(:)= 0.0d0
        endif

        ! do jkb = 1,nkb
        !   na = nkbtona(jkb)
        !   do n = 1,ngridpts(na)
        !     spsiwork(index_rs2fft(n,na)) =&
        !       spsiwork(index_rs2fft(n,na)) + vkb_rs(n,jkb)*ps(jkb)
        !   enddo
        ! enddo

        nproj = 1
        na    = 1
        lda = size(vkb_rs(:,1))
        allocate(vpsiwork(lda))
        do nsp = 1,nspec
         do ni = 1,nionsp(nsp)
 
             l   = ngridpts(na)
             m   = nh(nsp)
 
             ldb = nh(nsp)
             n   = 1
 
             alpha = (1.0d0,0.0d0)
             beta  = (0.0d0,0.0d0)
             ldc = lda
             call zgemm('N','N',l,n,m,alpha,vkb_rs(1,nproj),lda,&
                        ps(nproj),ldb,beta,vpsiwork,ldc)
 
             do n = 1,ngridpts(na)
                spsiwork(index_rs2fft(n,na))=&
                  spsiwork(index_rs2fft(n,na)) + vpsiwork(n)
             enddo
             na = na + 1
             nproj = nproj + nh(nsp)
         enddo ! ni
        enddo  ! nsp
        deallocate(vpsiwork)                     

        ! transform spsiwork to reciprocal space
        rinplwv = 1.0d0/dble(ngxs*ngys*ngzs)
        call fft3d(spsiwork,ngxs,ngys,ngzs,-1)
        if (present(real_psi)) then 
          do  i=1,nplwkp
            spsi(i) = spsiwork(ipwpad(i,nkp))*rinplwv 
          enddo
        else
          do  i=1,nplwkp
            spsi(i) = spsiwork(ipwpad(i,nkp))*rinplwv + reci_psi(i)
          enddo
        endif
        deallocate(spsiwork)
      else                                                             

!     Init S|psi> to |psi>
      do i = 1,nplwkp_local
        spsi1(i) = dcmplx(reci_psi(i))
      enddo                                   

      na = 1
      do np = 1,nspec
        do ni = 1,nionsp(np)
          do iv = 1,nh(np)
            jkb = nkbc(na,iv)
            if (vkbreal(iv,np)) then
!              (r  + i c)
!               psr(iv,ni) = dreal(ps(jkb))
!               psc(iv,ni) = dimag(ps(jkb))
                ps2(iv,1,ni) = dreal(ps(jkb))
                ps2(iv,2,ni) = dimag(ps(jkb))
            else
!              (-c + i r)
!              psr(iv,ni) = -dimag(ps(jkb))
!              psc(iv,ni) =  dreal(ps(jkb))
               ps2(iv,1,ni) = -dimag(ps(jkb))
               ps2(iv,2,ni) =  dreal(ps(jkb))
            endif
          enddo
          na = na + 1
        enddo
        na = na - nionsp(np)

!       The real and the imaginary part calc. separately.
!       multiby with nh(it) projectors for this atom
        call dgemm('N','N',nplwkp_local, 2*nionsp(np), nh(np), 1.0d0,&
                vkb(1,1,np,nkpeff), nrplwv,&
                ps2,         nkbpmaxatom,&
                0.0d0,&
                temp2,       nrplwv  )
!       call dgemm('N','N',nplwkp_local, nionsp(np), nh(np), 1.0d0,
!    &          vkb(1,1,np), nrplwv,
!    &          psc,         nkbpmaxatom,
!    &          0.0d0,
!    &          tempc,       nrplwv)


        do ni = 1,nionsp(np)
           do i=1,nplwkp_local
            spsi1(i)= spsi1(i)+skb(i,na,nkpeff)*&
               dcmplx(temp2(i,1,ni),temp2(i,2,ni))
           enddo
           na = na +1
        enddo

      enddo  ! nspec

      do i = 1,nplwkp_local
        spsi(i) = dcmplx(spsi1(i)) 
      enddo

      endif   ! luse_rs_proj

      call uttime(time)
      timer(THPSI1)=timer(THPSI1)+time(1)

      endif   ! if lspsi

      call uttime(time)
      timer(TVNLWAV)=timer(TVNLWAV)+time(1)

      end subroutine usvnlpsi

!-----------------------------------------------------------------------
      subroutine cal_bec(nplwv,nrplwv,nrplwv_global,nplwkp,nkp,nbn, & 
                nions, nspec,nionsp,nbands,nkprun,dnlg,posion,dirc, &
                volc,ngxs,ngys,ngzs,ipwpad,nkpmem & 
#ifdef PARAL
                 ,&
#include PARAL_ARGS
#endif
               ,timer,reci_psi,real_psi)
      use van_us_data_module

!-----------------------------------------------------------------------
!     Calculate the matrix element becp between the wavefunction in
!     cptwfp  and the beta-projectors : 
!                                               
!                       --    
!                       \                   kpoint
!        <beta | psi>=  /   cptwfp(G)  beta(G) exp(-i G R) 
!                       --                  nbn
!                        
!
!     The beta(G) functions (in vkb) are first calculated by call 
!     to cal_vkb.
!
!     For parallel version  :
!       vkb=beta(G) and skb (the structure factor) is distributed out on nodes              
!
! 
!-----------------------------------------------------------------------
      use run_context
      use non_local_projectors
#ifdef PARAL
      use par_functions_module
#endif PARAL
      implicit none

#ifdef PARAL
#include PARAL_DECL
#endif

      integer    nplwv,nrplwv,nrplwv_global,nplwkp,nkp,nbn
      integer    nions,nspec,nionsp(nspec)
      integer    nbands,nkprun
      real*8     posion(3,nions,nspec),dirc(3,3),volc
      integer    ngxs,ngys,ngzs,ipwpad(nrplwv),nkpmem
      real*8     timer(*)
#ifdef SERIAL
      real*8     dnlg (nrplwv,3)
#else
      real*8     dnlg (nrplwv_global,3)
      integer    nlocal(par_pw_np),offset(par_pw_np)
#endif
      complex*WF_PRECISION, optional ::  reci_psi(nrplwv)
      complex*16          , optional ::  real_psi(:)
      complex*16 ZDOTC
      external   ZDOTC

!     local arrays (automatic) 
      real*8  a(nkbpmaxatom,nions),b(nkbpmaxatom,nions) 
      real*8  a2(nkbpmaxatom,2,nions),temp2(nrplwv,2,nions)
      complex*16,       allocatable ::    real_psi_work(:)

!     locals 
      integer     jkbp,i,it,na,jkb,mu,ni,iv,np
      complex*16  h
#include "etime.h"
      integer     nplwkp_local,nkpeff

      call uttime(time)
      timer(TCALBEC)=timer(TCALBEC)-time(1)

      call geteff(nkp,nkpmem,nconso,nkpeff)

      if (luse_rs_proj) then 
!       use real space projectors 
        if (present(reci_psi)) then 
!           convert reciprocal wavefunction in reci_psi to real space
            allocate(real_psi_work(ngxs*ngys*ngzs))
            real_psi_work(:) = 0.0
            do i = 1,nplwkp
              real_psi_work(ipwpad(i)) = reci_psi(i)
            enddo
            call fft3d(real_psi_work,ngxs,ngys,ngzs,1)
            call cal_rs_becp(real_psi_work,nrplwv,1,nplwkp,ngxs,ngys,&
                       ngzs,nkb,nkbmax,nkbtona,nh,ipwpad,nions,nspec,&
                       nionsp,becp(1,nbn,nkp),timer) 
            deallocate(real_psi_work)
        else
            call cal_rs_becp(real_psi,nrplwv,1,nplwkp,ngxs,ngys,ngzs,&
                         nkb,nkbmax,nkbtona,nh,ipwpad,nions,nspec,&
                         nionsp,becp(1,nbn,nkp),timer) 
        endif
!        do i = 1,16 
!          write(*,10) 'becp = ',i,becp(i,nbn)
!        enddo 
!10      format(1x,a10,i2,1x,2(f14.8,1x))
!        stop
        call uttime(time)
        timer(TCALBEC)=timer(TCALBEC)+time(1)
        return
      endif

!--------------------------------------------------------------------
!     define nplwv  (number of planewaves)
#ifdef SERIAL
      nplwkp_local = nplwkp
#else
!     define the number of planewaves on this node (nlocal)
      call par_defwfk (nrplwv,nrplwv_global,nplwkp,nlocal,offset,&
                         nplwkp_local,&
#include PARAL_ARGS
        , nconso)
#endif

!--------------------------------------------------------------------
      na = 1
      do np = 1,nspec

        do ni = 1,nionsp(np)
          call uttime(time)
          timer(TCALBEC1)=timer(TCALBEC1)-time(1)
          do i=1,nplwkp_local
            h      = conjg(skb(i,na,nkpeff))*reci_psi(i)
!           tempr(i,ni)= dreal(h)
!           tempc(i,ni)= dimag(h)
            temp2(i,1,ni)= dreal(h)
            temp2(i,2,ni)= dimag(h)
          enddo
          na = na + 1
          call uttime(time)
          timer(TCALBEC1)=timer(TCALBEC1)+time(1)
        enddo 
        na = na - nionsp(np)

!        multiply with nh*nionsp(np) projectors for this type of atom
         call dgemm('T','N', nh(np), 2*nionsp(np), nplwkp_local,1.0d0,&
                 vkb(1,1,np,nkpeff), nrplwv,&
                 temp2,       nrplwv,&
                 0.0d0,&
                 a2,           nkbpmaxatom )
!        call dgemm('T','N', nh(np), nionsp(np), nplwkp_local,1.0d0,
!    &           vkb(1,1,np,nkpeff), nrplwv,
!    &           tempc,       nrplwv,
!    &           0.0d0,
!    &           b,           nkbpmaxatom )

        do ni = 1,nionsp(np)
          do iv = 1,nh(np) 
             jkb=nkbc(na,iv)
             if (vkbreal(iv,np))  then
               becp(jkb,nbn,nkp) = dcmplx(a2(iv,1,ni),a2(iv,2,ni))
             else
               becp(jkb,nbn,nkp) = dcmplx(a2(iv,2,ni),-a2(iv,1,ni))
             endif
          enddo
          na = na + 1
        enddo 

      enddo  ! np = 1,nspec

      call uttime(time)
      timer(TCALBEC)=timer(TCALBEC)+time(1)


! -----------------------------------------------------------------
#ifdef DEBUG1
      do jkbp = 1, nkbp
          it=nkptoit(jkbp)
          do na = 1,nions
            if (ityp(na).eq.it) then
              jkb=nkbc(na,jkbp)
              write(*,100) 'becp ',na,jkb,jkbp,becp(jkb,nbn,nkp)
100           format(1x,a6,1x,3(i3,1x),2f16.9)
            endif
          enddo
      enddo
#endif

#ifdef PARAL
!     Sum up the betas from the different processors
      call par_sum_complex('A',becp(1,nbn,nkp), becp(1,nbn,nkp),&
                  nkbmax,nkbmax,1,&
#include PARAL_ARGS
        ,timer )
#endif                        

      end subroutine cal_bec

!
!
!------------------------------------------------------------------
      subroutine cal_vkb(nplwv,nrplwv, nrplwv_global, nplwkp, nkp, & 
                         nions, nspec, nionsp,nkpmem,              & 
                         nbands, nkprun,dnlg,posion,dirc,volc,     & 
                         ngxs,ngys,ngzs,ecut,idebug,recc           & 
#ifdef PARAL
                         ,                                         & 
#include                 PARAL_ARGS
#endif
                         )   
      use van_us_data_module
 
!
!    1. Sort the k+g array dnlgk;  result in gk and gkabs
!    2. Calculate the structure constant (skb) 
!    3. Fourier transform the beta function (vq) 
!    4. Multiply with the spherical harmonics and return in vkb 
! 
!       vkb for all k-point is in the swap_file vkb_swap
!     
!-----------------------------------------------------------------------
      use run_context
      use non_local_projectors
      implicit none
 
      integer    nplwv,nrplwv,nrplwv_global,nplwkp, nkp
      integer    nions,nspec,nionsp(nspec),nkpmem
      real*8     posion(3,nions,nspec),dirc(3,3),volc
      real*8     dnlg (nrplwv_global,3)
      integer    nbands,nkprun
      integer    ngxs,ngys,ngzs,idebug
      real*8     ecut,recc(3,3)

      include 'readvan.h'  

!     locals 
      real*8 ylm(nrplwv_global      ,lqx*lqx)   !  work array hold spherical harmonics
      real*8 vq(nrplwv_global,nbrx,nspec)       !  work array to hold Fourier transform of betar(r) 
      real*8  x,y,z,arg,h
      integer nat,ik,nsp,mu,ndcbyt,i,j,np,nplwkp_local,offs
      integer nkpeff,file_index
      parameter(ndcbyt=16,h=0.0d0)
#ifdef PARAL
#include          PARAL_DECL 
      integer    nlocal(par_pw_np),offset(par_pw_np)
#endif       

      call geteff(nkp,nkpmem,nconso,nkpeff)

!     set the logical variable luse_rs_proj
      call set_luse_rs_proj()
      if (luse_rs_proj) then 
        call init_rs_nl(ecut,betar,r,rab,ndm,nbrx,npsx,&
                 nbeta,nkb,nh,lll,kkbeta,rc,nspec,idebug,&
                 recc,dirc,volc,ngxs,ngys,ngzs,&
                 nions,nionsp,posion )

        return 
      endif

!     first allocate vkb and skb
      if (.not.allocated(vkb)) &
                   allocate(vkb(nrplwv,nkbpmaxatom,nspec,nkpmem))
      if (.not.allocated(skb)) allocate(skb(nrplwv,nions,nkpmem))

      vkb = 0.0d0 
      skb = 0.0d0

#ifdef SERIAL
      nplwkp_local = nplwkp
      offs = 0
#else
!     For parallel job set offset for skb and vkb (full lenght arrays)
!     and define the number of planewaves on this node (nlocal)
!
      call par_defwfk (nrplwv,nrplwv_global,nplwkp,nlocal,offset,&
                         nplwkp_local,&
#include PARAL_ARGS
        , nconso)
      offs = offset(par_process+1)
#endif                  

!     find the indeces of k+g vectors at the given k-point
!     call gk_sort(dnlg(1,1),nrplwv,psgmax,        
!    $       nplwkp,gk,gkabs,q,index)                 
      do ik=1,nplwkp                
          gkabs(ik) = dnlg(ik,1)**2+dnlg(ik,2)**2+dnlg(ik,3)**2
      end do                                                
                
!     setup the structure constant (skb)
      nat = 1               
      do nsp = 1,nspec
        do mu=1, nionsp(nsp)
          x=posion(1,mu,nsp)*dirc(1,1)+&
            posion(2,mu,nsp)*dirc(2,1)+posion(3,mu,nsp)*dirc(3,1)
          y=posion(1,mu,nsp)*dirc(1,2)+                           &
            posion(2,mu,nsp)*dirc(2,2)+posion(3,mu,nsp)*dirc(3,2)
          z=posion(1,mu,nsp)*dirc(1,3)+                           &
            posion(2,mu,nsp)*dirc(2,3)+posion(3,mu,nsp)*dirc(3,3)
                                                
          do ik=1,nplwkp_local
             arg = (dnlg(ik+offs,1)*x + dnlg(ik+offs,2)*y + &
                    dnlg(ik+offs,3)*z)
             skb(ik,nat,nkpeff) = dcmplx(cos(arg),-sin(arg))      
          end do                                     
          nat = nat + 1
        enddo
      enddo  
               
      call ylmr2(lqx**2,0,nplwkp,nrplwv_global,dnlg,gkabs,&
                 nrplwv_global,(lqx)**2,ylm)                         
                                        
!     Fourier transform the beta-functions and return in vq
      call calcvq(nplwkp,nrplwv_global,nspec,gkabs,vq,0) 

!     multiply with the spherical harmonics and return in vkb
      call dvus_bessel(nplwkp,nrplwv_global,nrplwv,nspec,nkbp,nhm,&
        vq,vkb(1,1,1,nkpeff),ylm,nh,indv,nhtol,nhtom,volc,vkbreal&
#ifdef PARAL
                         ,&
#include                 PARAL_ARGS
#endif
                      ) 

!     write vkb and skb to the file vkb_swap
      if (nkpmem.eq.number_kpoints_per_process) return 

      file_index = int((nkp-1)/nkpmem)+1 
      file_index = int(mod(nkp-1,number_kpoints_per_process)) + 1


!     (nhm = nkbpmaxatom*nspec)
      open(UNIT=32,file=vkb_swap_filename,FORM='UNFORMATTED',&
                    access='direct', recl=ndcbyt*nrplwv)
      do np = 1,nspec
        do j = 1,nkbpmaxatom
         write(32,rec=(nhm+nions)*(file_index-1)+(np-1)*nkbpmaxatom+j)&
          (vkb(i,j,np,nkpeff),i=1,nrplwv)
        enddo 
      enddo
      do j = 1,nions
         write(32,rec=(nhm+nions)*(file_index-1)+j+nhm)&
             (skb(i,j,nkpeff),i=1,nrplwv)
      enddo
      close(32)

      end subroutine cal_vkb

!-----------------------------------------------------------------------
      subroutine add_vnl_eS_psi(psi,nplwkp,eigen,nbn,vpsi,&
#include      "apply_h_args.h"
              ,timer)    
!-----------------------------------------------------------------------
!     Compute (V_NL - eigen*S) times wavefunction in psi, and return 
!     result in vpsi.
!     The  becp (<psi|beta>) is assumed to be defined
!     The same operation as in usvnlpsi, except that 
!     ps = (Dnm- eigen*qnm) is used 
!-----------------------------------------------------------------------
      use non_local_projectors
      use run_context
      use van_us_data_module
      implicit none

#     include "apply_h_decl.h"
      complex*WF_PRECISION, intent(in)  ::  psi(*)
      complex*WF_PRECISION, intent(out) ::  vpsi(*)
      real*8              , intent(in)  ::  eigen
      integer             , intent(in)  ::  nplwkp,nbn
      real*8       timer(*)

!     locals
      real*8   psr(nkbpmaxatom,nions),psc(nkbpmaxatom,nions)
      real*8   ps2(nkbpmaxatom,2,nions),temp2(nrplwv,2,nions)
      complex*16 ps(nkb),ps_vnl(nkb),ps_s(nkb)
      complex*16 temp(nplwkp),alpha,beta
      complex*16 vpsi1(nrplwv),vpsi2(nrplwv)
      real*8     ac,ad,bc,bd

      integer jkb,nhjkb,nhjkbm,it,na,jkbp,i,j,jkb1,n,lda,ldc,l
      integer np,ni,iv,nproj,nsp,m,ldb
      integer nplwkp_local,nkpeff
#ifdef PARAL
      integer    nlocal(par_pw_np),offset(par_pw_np) 
#endif


#include "etime.h"
      real*8  ZERO,SMALL,rinplwv 
      complex*16   h
      complex*16, allocatable ::  spsiwork(:) ,vpsiwork(:)
      parameter(ZERO = 0.0d0,SMALL=1.0d-12) 
      external  dgemv

      call uttime(time)
      timer(TVNLWAV)=timer(TVNLWAV)-time(1)           

      call geteff(nkp,nkpmem,nconso,nkpeff) 

!--------------------------------------------------------------------
!     define nplwv  (number of planewaves)
#ifdef SERIAL
      nplwkp_local = nplwkp
#else
!     define the number of planewaves on this node (nlocal)
      call par_defwfk (nrplwv,nrplwv_global,nplwkp,nlocal,offset,&
                         nplwkp_local,&
#include PARAL_ARGS
        , nconso)
#endif
!--------------------------------------------------------------------

!     zero vpsi      
      vpsi1(1:nrplwv) = dcmplx(0.0d0,0.0d0)
      vpsi2(1:nrplwv) = dcmplx(0.0d0,0.0d0)

!     multiply on projectors
      do jkb = 1, nkb
            nhjkb = nkbtonh(jkb)
            it = ityp(nkbtona(jkb))
            na = nkbtona(jkb)
            nhjkbm = nh(it)
            jkb1 = jkb - nhjkb

!           calculate sum_m d_nm bec_m
            ps_s(jkb)   = dcmplx(ZERO,ZERO) 
            ps_vnl(jkb) = dcmplx(ZERO,ZERO) 
            do j = 1,nhjkbm
!              V_NL part 
               ps_vnl(jkb) = ps_vnl(jkb) + &
                 becp(jkb1+j,nbn,nkp)*deeq(nhjkb,j,na,kspin(nkp))
!              S part
               ps_s(jkb) = ps_s(jkb) + becp(jkb1+j,nbn,nkp)*&
                           qq(nhjkb,j,it)
            enddo
            ps(jkb) = ps_vnl(jkb) - eigen*ps_s(jkb)
      enddo

      if (luse_rs_proj) then 
        ! use rs projectors, add in real space to vpsi 
        ! do jkb = 1,nkb 
        ! na = nkbtona(jkb)
        !  do n = 1,ngridpts(na) 
        !    real_vpsi(index_rs2fft(n,na))=&
        !     real_vpsi(index_rs2fft(n,na))+vkb_rs(n,jkb)*ps(jkb)
        !  enddo 
        !enddo 

        nproj = 1
        na    = 1
        lda = size(vkb_rs(:,1))
        allocate(vpsiwork(lda))
        do nsp = 1,nspec
         do ni = 1,nionsp(nsp)

             l   = ngridpts(na)
             m   = nh(nsp)

             ldb = nh(nsp)
             n   = 1

             alpha = (1.0d0,0.0d0)
             beta  = (0.0d0,0.0d0)
             ldc = lda
             call zgemm('N','N',l,n,m,alpha,vkb_rs(1,nproj),lda,&
                        ps(nproj),ldb,beta,vpsiwork,ldc)

             ! vpsiwork(:) = 0.0d0
             ! do n= 1,l 
             !   do i = 1,m 
             !      vpsiwork(n) = vpsiwork(n) + vkb_rs(n,nproj+i-1)*&
             !                    ps(nproj+i-1) 
             !   enddo 
             ! enddo

             do n = 1,ngridpts(na) 
                vpsi(index_rs2fft(n,na))=&
                  vpsi(index_rs2fft(n,na)) + vpsiwork(n)
             enddo 
             na = na + 1
             nproj = nproj + nh(nsp)
         enddo ! ni
        enddo  ! nsp 
        deallocate(vpsiwork)
                                   
      else 
        ! reciprocal projectors

        na = 1
        do np = 1,nspec
          do ni = 1,nionsp(np)
            do iv = 1,nh(np)
              jkb = nkbc(na,iv)
              if (vkbreal(iv,np)) then 
!                (r  + i c)
!                 psr(iv,ni) = dreal(ps(jkb))
!                 psc(iv,ni) = dimag(ps(jkb))
                  ps2(iv,1,ni) = dreal(ps(jkb))
                  ps2(iv,2,ni) = dimag(ps(jkb))
              else
!                (-c + i r)
!                psr(iv,ni) = -dimag(ps(jkb))
!                psc(iv,ni) =  dreal(ps(jkb))
                 ps2(iv,1,ni) = -dimag(ps(jkb))
                 ps2(iv,2,ni) =  dreal(ps(jkb))
              endif
            enddo
            na = na + 1
          enddo
          na = na - nionsp(np)

!         multiby with nionsp(np)*nh(np) projectors for this atom
!         The real and imaginary part calc. separately.
          call dgemm('N','N',nplwkp_local, 2*nionsp(np), nh(np), 1.0d0,&
                vkb(1,1,np,nkpeff), nrplwv, &
                ps2,         nkbpmaxatom, &
                0.0d0,&
                temp2,       nrplwv )

          do ni = 1,nionsp(np)
            do i=1,nplwkp_local
             vpsi1(i)= vpsi1(i)+skb(i,na,nkpeff)*&
                    dcmplx(temp2(i,1,ni),temp2(i,2,ni))
            enddo
            na = na +1
          enddo
!         na = na - nionsp(np)

!         call dgemm('N','N',nplwkp_local, nionsp(np), nh(np), 1.0d0,
!    &          vkb(1,1,np,nkpeff), nrplwv, 
!    &          psc,         nkbpmaxatom, 
!    &          0.0d0,
!    &          tempr,       nrplwv)

!         do ni = 1,nionsp(np)
!            do i=1,nplwkp_local
!             vpsi2(i)= vpsi2(i)+skb(i,na)*tempr(i,ni)
!            enddo 
!            na = na +1
!         enddo

        enddo  ! nspec

!       now collect vpsi1,vpsi2 to vpsi 
        do i=1,nplwkp_local
!         ac = dreal(vpsi1(i))
!         ad = dimag(vpsi1(i))
!         bc = dreal(vpsi2(i))
!         bd = dimag(vpsi2(i))
          vpsi(i)=  vpsi(i) + vpsi1(i) - eigen*psi(i)
        enddo
 
      endif ! luse_rs_proj

      call uttime(time)
      timer(TVNLWAV)=timer(TVNLWAV)+time(1)

      end subroutine add_vnl_eS_psi


      end module us_hpsi_module
