#include "definitions.h"


       subroutine resminid(nconso)
       write(nconso,*) '@(#)resmin.F	1.18 11/22/98'
       return
       end

! resmin - diagonalisation of the Hamiltonian using a blocked power series
! method  / Lennart B 961002

! cveff(r)    ... the real space total kohn-sham potential
! cdir    .... empty
! cptwfp  .... wave functions for current k-point
! wfkine  .... the kinetic energy for bands of this k-point
! eigen(nbands)  .... empty
! ndiapb ..... number of times all bands are updated (suggested value 1)
! nbsub ...... block size in the diagonalisation
! norder ..... the highest power of H considered in each block update
! cptwfl ..... work area for residual vectors
! damwrk ..... general work area (see below)
 
! out values

! cptwfp  .... updated wave functions for current k-point
! eigen(nbands)  .... eigen values for this iteration

      subroutine resmin (lgenev,ndiapb, nbsub, norder, nplwkp,&
        eigen, cptwfp, wfkine, vnl, &
        sizham,deriv,&
        idebug, noffs0, resmindim,&
#       include "apply_h_args.h"
        , nconso, timer)

      implicit none

#     include "apply_h_decl.h"

      logical*4 lgenev
!     integer*4
      integer ndiapb,nplwkp,nconso
      integer nbsub, noffs0, norder
      integer idebug,resmindim

!     complex*WF_PRECISION
      complex*WF_PRECISION cptwfp(nrplwv,nbands)

!     real*8
      real*8 vnl(nbands), wfkine(nbands)
      real*8 eigen(nbands)
      real*8 sizham, deriv
      real*8 timer(*)

! local variables

      integer nn, n, dim, m, nb, nit
      real*8 temp, sum
#include "etime.h"
! automatic arrays
      real*8 damwrk(resmindim)
      complex*WF_PRECISION cdir(nrplwv)
      real*8 wrke(2*nbands,5)
      complex*WF_PRECISION cptwfl(nrplwv,nbsub*norder)

      dim = nbsub*(norder+1)

! loop until the orbitals has been found with sufficient accuracy

      do 10 nit=1,ndiapb

! Loop over blocks
! noffs0 varies from 0 to nbsub-1, and should guarantee that different
! blocks are used in each call.

         if (nbsub.ge.nbands) noffs0 = 0

         do 20 nn = 1-noffs0, nbands, nbsub

            n = max(nn,1)
            nb = min(nbsub, nbands-n+1, nbsub+nn-1)
! operate on the orbitals within the block
            call diag_block(lgenev,cptwfp,cptwfl,nplwkp,&
!             hmat       heig                 heigva
              damwrk(1), damwrk(1+2*dim*dim), damwrk(1+4*dim*dim),&
!             hsub
              damwrk(1+4*dim*dim+dim),&
!             gamma                                    cresi
              damwrk(1+4*dim*dim+dim+4*nbands*nbands), cdir,&
!             work
              damwrk(1+6*dim*dim+dim+4*nbands*nbands+2*nbands*dim),&
              wfkine,vnl,wrke,nb,norder,n,nconso,&
#             include "apply_h_args.h"
              , timer)

! next block

 20      continue

! Perform a subspace diagonalisation if the block size is less than
! nbands.
         if (nbsub .lt. nbands) then
!           NOTE: subdia() assumes that the matrix elements have
!                 been precalculated in array hsub /OHN
            call uttime(time)
            timer(TSUBDIA)=timer(TSUBDIA)-time(1)
            call subdia(cptwfp,&
!                  hsub
                   damwrk(1+4*dim*dim+dim),nbands,nrplwv,nplwkp,nconso,&
                   sizham,eigen,&
!                  subeig
                   damwrk(1+4*dim*dim+dim+2*nbands*nbands),&
!                  work
                   damwrk(1+6*dim*dim+dim+4*nbands*nbands+2*nbands*dim))
            call uttime(time)
            timer(TSUBDIA)=timer(TSUBDIA)+time(1)
         else
! Otherwise the hsub matrix is already diagonal, so the diagonal elements
! are the new KS eigenvalues.
! Note that the matrix elements are complex*16, but damwrk is declared
! as real*8, so the index calculation looks really ugly.
            do 25 nn = 1, nbands
!                          hsub(nn,nn)
               eigen(nn) = damwrk((nn-1)*2*nbands+nn*2-1+4*dim*dim+dim)
 25         continue
         endif

! next iteration
         noffs0 = noffs0 + 1
         if (noffs0.ge.nbsub) noffs0 = 0

 10   continue

! Redundant (?) orthogonalisation
!     call orsp(nbands,nrplwv,nplwkp,cptwfp,cwor12,cwork1,nplwv)

! Find the kinetic energy.

      do 30 nn=1,nbands
         sum = 0.0d0
         do 40 m=1,nplwkp
            temp = dble(cptwfp(m,nn)*conjg(cptwfp(m,nn)))
            sum=sum+pwkine(m,nkp)*temp
 40      continue
         wfkine(nn)=sum
 30   continue   
      
      return
      end


      subroutine diag_block(lgenev,cptwfp,cptwfl,nplwkp,&
         hmat,heig,heigva,hsub,&
         gamma,cresi,work,&
         wfkine,vnl,wrke,nb,m,n,nconso,&
#        include "apply_h_args.h"
         , timer)

      implicit none

#     include "apply_h_decl.h"

! nb .......  Number of bands in this block (may vary)
! m ........  Order of power series expansion (at least 1)
! n ........  First band in block
! cptwfp ...  Wave functions for all bands
! cptwfl ...  Orthonormalized and preconditioned residual vectors (work area)
! hmat .....  Hamiltonian matrix elements (work area)
! heig .....  Eigenvectors of Hamiltonian matrix (work area)
! heigva ...  Corresponding eigenvalues (work area)
! hsub .....  Hamiltonian subspace elements (in out)
! gamma ....  Overlap between new orbitals and old orbitals/residuals (work)
! cresi ....  Residual vector (work area)
! work .....  Work area for various subroutines. Its size should be at least
!             max(2*m*nb, 4*nbands*(nbands+1), 4*(m+1)*nb*((m+1)*nb+1)) elements

! Total work area size in doubles, excluding cptwfl and cresi:
! In the following dim = nbsub*(norder+1)
!
!               size            offset
! hmat:         2*dim*dim       0
! heig:         2*dim*dim       2*dim*dim
! heigva:       dim             4*dim*dim
! hsub:         2*nbands*nbands 4*dim*dim+dim
! subeig:       2*nbands*nbands 4*dim*dim+dim+2*nbands*nbands
! gamma:        2*nbands*dim    4*dim*dim+dim+4*nbands*nbands
! work:         see below       6*dim*dim+dim+4*nbands*nbands+2*nbands*dim

! The size of work is max(4*nbands*(nbands+1), 4*dim*(dim+1))

! The window program should calculate dim = nbsub*(norder+1) and
! declare damwrk large enough to hold all fields above.     

!     integer
      logical*4  lgenev
      integer nb, m, n, nplwkp, nconso

!     complex*WF_PRECISION 
      complex*WF_PRECISION cptwfp(nrplwv,nbands)
      complex*WF_PRECISION cptwfl(nrplwv,n:n+nb-1,m)
      complex*WF_PRECISION cresi(nrplwv)
!     complex*16 
      complex*16 hmat(n:n+nb-1,0:m,n:n+nb-1,0:m)
      complex*16 Smat(n:n+nb-1,0:m,n:n+nb-1,0:m)
      complex*16 heig(n:n+nb-1,0:m,n:n+nb-1,0:m)
      complex*16 hsub(nbands,nbands)
      complex*16 gamma(nbands,n:n+nb-1,0:m)

!     real*8
      real*8 heigva(n:n+nb-1,0:m)
! The scalar product of the input wave function i and the 
! output wavefunction p will be stored in gamma(p,i,0).
! The scalar product of precond. residual vector cptwfl(.,i,k) and
! the output wavefunction p will be stored in gamma(p,i,k).
! It is assumed that n<=i<n+nb and p<n. The elements are used in
! the construction of hsub.
      real*8 timer(*)
      complex*16 work(nbands,*)
      real*8 wfkine(nbands), vnl(nbands),wrke(nbands)

! automatic variables (full lenght if parallel)
#ifdef PARAL
      complex*WF_PRECISION spsi(nrplwv_global),hpsi(nrplwv_global)
      complex*WF_PRECISION Scresi(nrplwv_global)
#else
      complex*WF_PRECISION spsi(nrplwv),hpsi(nrplwv),Scresi(nrplwv)
#endif

! local variables
      integer i,j,k,kk,nn,p,qi,nprev,imax,nouter,nc1,nc,ii,jj,ib,jb
      integer info,nbnd
      complex*16 cdum,sdum, zone, zzero
      complex*WF_PRECISION alpha
      real*R_PRECISION rnorm, NORM2
      real*8 size_psi,eapp
      external NORM2
      real*8 anorm,vnltmp
      complex*16 anormc
#include "etime.h"

      vnltmp = 0.0d0

! Loop over orders

      do 10 k = 0, m

! Loop over block elements

         do 20 j = n, n+nb-1

! Calculate residual vector
! First calculate H|psi> (cresi)
      
            if (k .eq. 0) then
!              First order (k=0): input vector is cptwfp(:,j)
               call H_TIMES_B(cptwfp(1,j),cresi,Scresi,nplwkp,&
                       1,j,.false.,&
#                include "apply_h_args.h"
                 ,timer)
!              Save the calculated band kinetic energy for preconditioning
               wfkine(j) = wkintmp
               vnl(j) = vnltmp
               if (.not.lgenev) then 
                 do nn = 1,nplwkp
                    Scresi(nn) = cptwfp(nn,j)
                 enddo
               endif
            else
!              Order k>1: input vector is cptwfl(:,j,k)
               call H_TIMES_B(cptwfl(1,j,k),cresi,Scresi,nplwkp,&
                       1,j,.false.,&
#                include "apply_h_args.h"
                ,timer)
               if (.not.lgenev) then 
                 do nn = 1,nplwkp
                    Scresi(nn) = cptwfl(nn,j,k)
                 enddo
               endif
            endif

            call uttime(time)

! Form overlap
            call spoverl(cresi, cptwfp, nrplwv,      &
                         nplwkp, gamma(1,j,k), nbands)

! H|psi> and S|psi> are now both defined 
! Form the residual H|psi> -eapp S|psi> (in cptwfl)
            if (k.lt.m) then 
!              Copy residual to cptwfl(:,j,k+1)
               do 30 nn = 1, nplwkp
                  cptwfl(nn,j,k+1) = cresi(nn)
30             continue
               if (k.eq.0) then 
                 call get_residual(nrplwv,nplwkp,&
                  cptwfl(1,j,k+1),Scresi,cptwfp(1,j),eapp,lgenev)
               else 
                 call get_residual(nrplwv,nplwkp,&
                  cptwfl(1,j,k+1),Scresi,cptwfl(1,j,k),eapp,lgenev)
               endif
            endif 

            if (k .lt. m) then

! Precondition the residual

               wkintmp = wfkine(j)
               call precon (nplwkp, cptwfl(1,j,k+1), cptwfl(1,j,k+1),&
                 heigva(j,k),&
#                include "apply_h_args.h"
                 , timer, info)

            endif

! Calculate a column in the Hamiltonian matrix. Only consider the
! upper triangle.

            do 70 kk = 0,k
               if (kk .eq. k) then
                  imax = j
               else
                  imax = n+nb-1
               endif
               do 80 i = n,imax
                  cdum = (0.0d0,0.0d0)
                  sdum = (0.0d0,0.0d0)
                  if (kk.eq.0) then
! < psi_i | H | psi_j > (k=0)  or < psi_i | H | phi^k_j > (k>0)
                     do 90 nn=1,nplwkp
                        cdum=cdum+conjg(cptwfp(nn,i))* cresi(nn)
                        sdum=sdum+conjg(cptwfp(nn,i))*Scresi(nn)
 90                  continue
                  else
! < phi^kk_i | H | phi^k_j >                    
                     do 110 nn=1,nplwkp
                        cdum=cdum+conjg(cptwfl(nn,i,kk))* cresi(nn)
                        sdum=sdum+conjg(cptwfl(nn,i,kk))*Scresi(nn)
 110                 continue
                  endif
                  hmat(i,kk,j,k) = cdum
                  hmat(j,k,i,kk) = conjg(cdum)
                  Smat(i,kk,j,k) = sdum
                  Smat(j,k,i,kk) = conjg(sdum)
 80            continue

 70         continue
            call uttime(time)


! Continue with the next band/residual within the block
 20      continue 

! Continue with next order
 10   continue

      call uttime(time)
      timer(TUPDWF)=timer(TUPDWF)-time(1)

!=====================================================================
! Write matrix to be diagonalized
#ifdef DEBUG4
      write(nconso,*) '     --- H Re ----      '
      do k = 0, m
!       Loop over block elements
        do j = n, n+nb-1
         write(nconso,100) ((dble(hmat(i,kk,j,k)),i=n,n+nb-1),kk=0,m)
        enddo
      enddo
      write(nconso,*) '     --- S real ----      '
      do k = 0, m
!       Loop over block elements
        do j = n, n+nb-1 
         write(nconso,100) ((dble(Smat(i,kk,j,k)),i=n,n+nb-1),kk=0,m)
        enddo
      enddo
      write(nconso,*) '     --- S imag ----      '
      do k = 0, m
!       Loop over block elements
        do j = n, n+nb-1 
         write(nconso,100)((dimag(Smat(i,kk,j,k)),i=n,n+nb-1),kk=0,m)
        enddo
      enddo
100   format(1x,28f12.8)
#endif DEBUG4
!======================================================================


! Find eigenvalues and eigenvectors of hmat
      call cdiaghg(nb*(m+1),hmat,Smat,nb*(m+1),heigva,heig )

! Calculate matrix elements in hsub (used later in the subspace 
! diagonalisation.) The matrix elements can be calculated from
! the gamma overlaps and the eigenvectors heig, so we do not
! need to construct any new orbitals yet.

! only if nbsub.lt.nbands
      if (nb.lt.nbands) then 

        do 120 j = n,n+nb-1

         do 130 p = 1, n-1

! < psi_p | H | psi_j >  (psi_p belongs to a block above us)
            cdum = (0.0d0,0.0d0)
            do 140 k = 0, m
               do 150 i = n,n+nb-1
                  cdum = cdum + heig(i,k,j,0)*gamma(p,i,k)
 150           continue
 140        continue
            hsub(p,j) = cdum
            hsub(j,p) = conjg(cdum)
 130     continue

         do 160 i = n,n+nb-1
! < psi_i | H | psi_j >  (psi_i and psi_j belongs to current block)
            cdum = (0.0d0,0.0d0)
            do 170 k = 0, m
               do 180 kk = 0, m
                  do 190 p = n, n+nb-1
                     do 200 qi = n, n+nb-1
                        cdum = cdum + conjg(heig(p,k,i,0))*&
                               heig(qi,kk,j,0)*hmat(p,k,qi,kk)
 200                 continue
 190              continue
 180           continue
 170        continue
            hsub(i,j) = cdum
 160     continue

! continue with next column i hsub
 120  continue

      else   
!       return eigenvalues in diagonal of hsub
        do j = 1,nbands
          hsub(j,j) = heigva(j,0)
        enddo 
      endif   ! (nb.lt.nbands) 

!     Find new wave functions:

!     Transform cptwfp = cptwfp * heig + cptwfl * heig
!     The way scalar products are defined here,
!     the eigenvectors must NOT be complex conjugated.

      if (nb.eq.1) then
!        use an optimized routine for the blocksize=1 case
         call rotate(cptwfp(1,n),cptwfl,m,heig,nplwkp,nrplwv)

      else
!        otherwise use the 'czgemm' routine:

!        Employ blocking (blocksize=nbands) in order to use only a limited
!        working array (good for cache machines, too).
!        The real*8 work working array should have size
!        at least 2*nbands*nbsub.
         nouter = nplwkp / nbands
         if (nouter * nbands .lt. nplwkp) nouter = nouter + 1
         zzero = (0.0d0, 0.0d0)
         zone  = (1.0d0, 0.0d0)

         do 240 i = 1, nouter  

!           Offset
            nc1 = (i - 1) * nbands
!           Number of elements
            nc  = nbands
!           Remainder case:
            if (nc1 + nc .gt. nplwkp) nc = nplwkp - nc1

!           work = matmul(cptwfp, heig)
            call czgemm ('N', 'N', nc, nb, nb,&
              zone, cptwfp(nc1+1,n), nrplwv,&
              heig(n,0,n,0), (m+1)*nb, zzero, work, nbands)
!           Copy work back into cptwfp
            do jj = 1, nb
              do ii = 1, nc
                cptwfp(nc1+ii,n+jj-1) = work(ii,jj)
              end do
            end do
!           Add contribution from residuals cptwfl
            do kk = 1, m
!             work = matmul(cptwfl, heig)
              call czgemm ('N', 'N', nc, nb, nb, &
                zone, cptwfl(nc1+1,n,kk), nrplwv,&
                heig(n,kk,n,0), (m+1)*nb, zzero, work, nbands)
!             cptwfp = cptwfp + matmul(cptwfl, heig)
              do jj = 1, nb
                do ii = 1, nc
                  cptwfp(nc1+ii,n+jj-1) = &
                  cptwfp(nc1+ii,n+jj-1) + work(ii,jj)
                end do
              end do
            end do

 240     continue
 9999    continue
      endif

#ifdef DEBUG2
!     Check the magnitude of residuals H*psi - lambda*psi
      do 300 j = n, n + nb - 1
        call H_TIMES_B(cptwfp(1,j),cresi,spsi,nplwkp,lgenev,j,&
#         include "apply_h_args.h"
         ,timer)
!       Residual = H*cptwfp(:,j) - heigva(j,0)*cptwfp(:,j)
        alpha = dcmplx( - heigva(j,0), 0.0d0)
        call BLAS_AXPY (n, alpha, spsi, 1, cresi, 1)
!       Compute 2-norm of residual vector cresi
        rnorm = NORM2 (n, cresi, 1)
        write(nconso,204) j, heigva(j,0), rnorm, rnorm/abs(heigva(j,0))
204     format (' RESMIN: heigva(', i3, ')= ',&
                f10.6, ' residual norm=', 2f12.8)
300   continue
      call uflush(nconso)
#endif

      call uttime(time)
      timer(TUPDWF)=timer(TUPDWF)+time(1)

      return
      end

      subroutine rotate(cptwfp,cptwfl,nit,heig,nplwkp,nrplwv)

      implicit none
      integer nit, nplwkp, nrplwv
      complex*WF_PRECISION cptwfp(nrplwv),cptwfl(nrplwv,nit)
      complex*16 heig(0:nit)

!     Local vars
      integer i, m

! This subroutine rotates a wave function in the space spanned by itself
! and a number of residual vectors.

      complex*WF_PRECISION c
      c = heig(0)
      call BLAS_SCAL (nplwkp, c, cptwfp, 1)
      do 10 i = 1, nit
        c = heig(i)
        call BLAS_AXPY (nplwkp, c, cptwfl(1,i), 1, cptwfp, 1)
10    continue

      return
      end


      subroutine sporthwf (carray, cptwfp, nrplwv, nplwkp, overlap,&
                           wrke, nbands, timer)

!     Orthogonalize carray to wavefunction cptwfp
!     Overlaps are stored in overlap
!     wrke is a work area needed in the BLAS calls (the same as overlap,
!     but with less precision)

      implicit none
      integer nrplwv, nplwkp, nbands
      complex*WF_PRECISION cptwfp(*)
      complex*WF_PRECISION carray(nrplwv)
      complex*WF_PRECISION wrke(nbands)
      complex*16 overlap(nbands) 
      real*8 timer(*)

!     Local variables
#include "etime.h"
      integer nb
      complex*WF_PRECISION alpha, beta

      call uttime(time)
      timer(TORTH)=timer(TORTH)-time(1)
!     Using BLAS (Basic Linear Algebra Subroutines)

      do 100 nb = 1, nbands
        wrke(nb) = (0.0, 0.0)
100   continue
      alpha  = (1.0, 0.0)
      beta   = (0.0, 0.0)
        
!     wrke = conjg(transpose(cptwfp)) * carray
      call BLAS_GEMV ('C', &
        nplwkp, nbands, alpha, cptwfp, nrplwv,&
        carray, 1, beta, wrke, 1)

!     carray = carray - cptwfp * wrke
      alpha  = (-1.0, 0.0)
      beta   = (1.0, 0.0)
      call BLAS_GEMV ('N',&
        nplwkp, nbands, alpha, cptwfp, nrplwv,&
        wrke, 1, beta, carray, 1)
      do 110 nb = 1, nbands
         overlap(nb) = wrke(nb)
 110  continue

      call uttime(time)
      timer(TORTH)=timer(TORTH)+time(1)

      return
      end


      subroutine spoverl(cresi, cptwfp, nrplwv, nplwkp, overlap, nbands)
    
! The same as sportwf, but only calculate the overlaps (don't orthogonalise)
      implicit none
      integer nrplwv, nplwkp, nbands
      complex*WF_PRECISION cresi(nrplwv), cptwfp(nrplwv,nbands)
      complex*16 overlap(nbands)
      complex*16 zdotc
      complex*8 cdotc
      external cdotc, zdotc

      integer nb, m
      complex*16 coverl

      do 200 nb=1, nbands
         overlap(nb) = BLAS_DOTC(nplwkp, cptwfp(1,nb), 1, cresi, 1)
 200  continue

      return
      end


      subroutine diagma(matrix,eigvec,eigval,dim,nconso)

!     Diagonalise a hermitian matrix

      implicit none
      integer dim, nconso
      complex*16 matrix(dim,dim), eigvec(dim,dim)
      real*8 eigval(dim)
      real*8 work(4*dim*dim+4*dim)

!     Local vars
      integer dim2, i, j, ierr

      dim2 = dim*dim

! Separate real and imaginary parts. Also initialise the work area for
! the eigenvectors.

      do 10 j = 1, dim
         do 20 i = 1, dim
            work(i+(j-1)*dim) = dble(matrix(i,j))
            work(i+(j-1)*dim+dim2) = dimag(matrix(i,j))
            work(2*dim2+4*dim+(j-1)*dim+i) = 0.0d0
            work(3*dim2+4*dim+(j-1)*dim+i) = 0.0d0
 20      continue
         work(2*dim2+4*dim+(j-1)*dim+j) = 1.0d0
 10   continue

      call htridi(dim,dim,work(1),work(dim2+1),eigval,&
!                 nm  n   ar      ai           d
                  work(2*dim2+1),work(2*dim2+dim+1),&
!                 e                  e2
                  work(2*dim2+2*dim+1))
!                 tau

      call imtql2(dim,dim,eigval,work(2*dim2+1),&
!                 nm  n   d       e
                  work(2*dim2+4*dim+1),ierr)
!                 z 
      if (ierr.ne.0) then
         write(nconso,*) ' Error in the resmin diagonalisation:',ierr
      else
         call htribk(dim,dim,work(1),work(dim2+1),&
!                    nm  n   ar      ai
                     work(2*dim2+2*dim+1),dim,work(2*dim2+4*dim+1),&
!                    tau                  m   zr
                     work(3*dim2+4*dim+1))
!                    zi
      endif

! Form output matrix

      do 30 j = 1, dim
         do 40 i = 1, dim
            eigvec(i,j) = dcmplx(work(2*dim2+4*dim+i+(j-1)*dim),&
                                 work(3*dim2+4*dim+i+(j-1)*dim))
 40      continue
 30   continue

      return
      end


      subroutine subdia(cptwfp,hsub,nbands,nrplwv,nplwkp,nconso,&
                   sizham,eigen,subeig,work)

! subdia performs a subspace rotation of all bands. The matrix elements
! of the Hamiltonian are input in the array hsub.

      implicit none

      integer nbands, nrplwv, nplwkp, nconso
      complex*WF_PRECISION cptwfp(nrplwv,nbands)
      complex*16 hsub(nbands,nbands), subeig(nbands,nbands)
      real*8 sizham, eigen(nbands)

!     Local vars
      complex*16 work(nbands,nbands)
      complex*16 zone, zzero
      integer i, j, nouter, nc1, nc, ii, jj

! Calculate norm of off-diagonal matrix elements

      sizham = 0.0d0
      do 10 j = 1, nbands
         do 20 i = 1, j-1
            sizham = sizham+conjg(hsub(i,j))*hsub(i,j)
 20      continue
 10   continue
44    format(a5,1x,20(f8.4,1x))

! Find eigenvalues and eigenvectors
      call diagma(hsub,subeig,eigen,nbands,nconso)

! Find new wave functions

! Transform cptwfp = cptwfp*subeig. Employ blocking (blocksize=nbands)
! in order to use only a limited working array.
      nouter = nplwkp / nbands
      if (nouter * nbands .lt. nplwkp) nouter = nouter + 1
      zzero = (0.0d0, 0.0d0)
      zone  = (1.0d0, 0.0d0)

      do 50 i = 1, nouter

! offset
         nc1 = (i-1) * nbands
! Number of elements
         nc = nbands
! Remainder case
         if (nc1 + nc .gt. nplwkp) nc = nplwkp - nc1
! Transform this block
!           work = matmul(cptwfp, heig)
            call czgemm ('N', 'N', nc, nbands, nbands,&
              zone, cptwfp(nc1+1,1), nrplwv,&
              subeig, nbands, zzero, work, nbands)
            do jj = 1, nbands
              do ii = 1, nc
                cptwfp(nc1+ii,jj) = work(ii,jj)
              end do
            end do

 50   continue

      return
      end

      subroutine czgemm (transa, transb, m, n, k, alpha, a, lda,&
        b, ldb, beta, c, ldc)

!     Emulate BLAS subroutine zgemm, but for mixed precision

!     For the purpose of simplicity assume the following restrictions:
!     transa=transb='N'
!     alpha=1.0, beta=0.0

      implicit none
      character*1 transa, transb
      integer m, n, k, lda, ldb, ldc
!     a is (m,k), b is (k,n), c is (m,n):
      complex*WF_PRECISION a(lda,k)
      complex*16 b(ldb,n), c(ldc,n)
      complex*16 alpha, beta

!     Local vars
      complex*16 btmp
      integer m1, n1, k1
#ifdef IBM
      real*8 h1r,h1i,h2r,h2i,h3r,h3i,h4r,h4i
      integer m0
      m0 = mod(m,4)
#endif IBM

!     Multiply the complex*WF_PRECISION by complex*16 matrices

      do 290 n1 = 1, n

#ifdef IBM

!       This matrix-multiply coding has been optimized for IBM POWER2 CPUs
!       with the IBM XLF version 3.2 Fortran compiler.  The technique is
!       to unroll the inner loop, and convert explicitly complex to
!       real arithmetic (stupid optimizer).  This code gets 90-95%
!       of peak performance, provided the array-blocks fit inside cache.
!       /OHN, 13-Sep-1996.

!       First the remainder part:
        do 210 m1 = 1, m0
          h1r=0.0d0
          h1i=0.0d0
          do 200 k1 = 1, k
            h1r=h1r +  real(a(m1  ,k1))* dreal(b(k1,n1))&
                    -  aimag(a(m1  ,k1))* dimag(b(k1,n1))
            h1i=h1i +  aimag(a(m1  ,k1))* dreal(b(k1,n1))&
                    +  real(a(m1  ,k1))* dimag(b(k1,n1))
200       continue
          c(m1,n1) = dcmplx(h1r,h1i)
210     continue

!       Now the inner loop unrolled by 4:
        do 230 m1 = m0 + 1, m, 4
          h1r=0.0d0
          h1i=0.0d0
          h2r=0.0d0
          h2i=0.0d0
          h3r=0.0d0
          h3i=0.0d0
          h4r=0.0d0
          h4i=0.0d0
          do 220 k1 = 1, k
            h1r=h1r +  real(a(m1  ,k1))* dreal(b(k1,n1))&
                    -  aimag(a(m1  ,k1))* dimag(b(k1,n1))
            h1i=h1i +  aimag(a(m1  ,k1))* dreal(b(k1,n1))&
                    +  real(a(m1  ,k1))* dimag(b(k1,n1))
            h2r=h2r +  real(a(m1+1,k1))* dreal(b(k1,n1))&
                    -  aimag(a(m1+1,k1))* dimag(b(k1,n1))
            h2i=h2i +  aimag(a(m1+1,k1))* dreal(b(k1,n1))&
                    +  real(a(m1+1,k1))* dimag(b(k1,n1))
            h3r=h3r +  real(a(m1+2,k1))* dreal(b(k1,n1))&
                    -  aimag(a(m1+2,k1))* dimag(b(k1,n1))
            h3i=h3i +  aimag(a(m1+2,k1))* dreal(b(k1,n1))&
                    +  real(a(m1+2,k1))* dimag(b(k1,n1))
            h4r=h4r +  real(a(m1+3,k1))* dreal(b(k1,n1))&
                    -  aimag(a(m1+3,k1))* dimag(b(k1,n1))
            h4i=h4i +  aimag(a(m1+3,k1))* dreal(b(k1,n1))&
                    +  real(a(m1+3,k1))* dimag(b(k1,n1))
220       continue
          c(m1+0,n1) = dcmplx(h1r,h1i)
          c(m1+1,n1) = dcmplx(h2r,h2i)
          c(m1+2,n1) = dcmplx(h3r,h3i)
          c(m1+3,n1) = dcmplx(h4r,h4i)
230     continue

#else

!       Zero the workspace
        do 200 m1 = 1, m
          c(m1,n1) = (0.0d0, 0.0d0)
200     continue
        do 220 k1 = 1, k
          btmp = b(k1,n1)
          do 210 m1 = 1, m
            c(m1,n1) = c(m1,n1) + a(m1,k1) * btmp
210       continue
220     continue
#endif IBM

290   continue

      return
      end
