#include "definitions.h"
!-------------------------------------------------------------------

      subroutine time_start(timer,idx)
      implicit none
      real*8 timer(*)
      integer idx
#include "etime.h"
      call uttime(time)
      timer(idx) = timer(idx) - time(1)
      return
      end

!-------------------------------------------------------------------

      subroutine time_stop(timer,idx)
      implicit none
      real*8 timer(*)
      integer idx
#include "etime.h"
      call uttime(time)
      timer(idx) = timer(idx) + time(1)
      return
      end

!-------------------------------------------------------------------

      subroutine disp_timers(timer,nout)

      implicit none
      integer nout
      real*8 timer

      call print_t (nout, '-Form_BHB ', TSPROD, timer,1)
      call print_t (nout, '-vec_rotat', TVROT,   timer, 1)
      call print_t (nout, '-TPARTRANS', TPARTRANS,   timer, 1)
      call print_t (nout, '-TPARSUM',   TPARSUM,   timer, 1)

      return
      end
!-------------------------------------------------------------------

      subroutine vec_rotate (b, n,nlength,nsub, nold, u, nmax, timer)

!     Rotate vectors  b(:,1:nsub) = b(:,1:nold)*u(1:nold,1:nsub)
!     Use blocking in b, in order to employ a minimal working space,
!     which hopefully can reside in cache.
!     The working space is necessary because array b is overwritten
!     on output.
!        b(1:nlength,nmax)  dimension b(n,nmax)

      implicit none
      integer n,nmax,nlength,nsub,nold
!     Arrays in the "large" space:
      complex*C_PRECISION b(n,nmax)
!     Arrays in the sub-space:
      complex*16 u(nmax,nmax)
      real*8 timer(*)

!     Local variables
      integer BLOCKSIZE, niter, nn1, nn, i, j
!     Make sure that work(BLOCKSIZE,nsub) fits inside cache:
      parameter (BLOCKSIZE = 128)
      complex*C_PRECISION work(BLOCKSIZE,nsub)
#include "etime.h"
      complex*16 cone, czero
      data cone, czero /(1.0d0,0.0d0), (0.0d0,0.0d0)/

      call uttime(time)
      timer(TBLAS) = timer(TBLAS) - time(1)
      timer(TVROT) = timer(TVROT) - time(1)

!     Number of times to iterate over a block
      niter = nlength / BLOCKSIZE
      if (niter * BLOCKSIZE .lt. nlength) niter = niter + 1

      nn1 = 1
      nn = BLOCKSIZE
      do i = 1, niter
        nn = min (BLOCKSIZE, nlength - nn1 + 1)
!       b(:,1:nsub) = b(:,1:nold)*u(1:nold,1:nsub)
        call CZC_GEMM ('N', 'N', nn, nsub, nold, cone, b(nn1,1), n,&
          u, nmax, czero, work, BLOCKSIZE)
!       Copy back work(:,1:nsub) into b(:,1:nsub)
        do j = 1, nsub
          call BLAS_COPY (nn, work(1,j), 1, b(nn1,j), 1)
        end do
        nn1 = nn1 + BLOCKSIZE
      end do

      call uttime(time)
      timer(TBLAS) = timer(TBLAS) + time(1)
      timer(TVROT) = timer(TVROT) + time(1)

      return
      end


      subroutine vec_rotate_dcmplx(b, n, nsub, nold, u, nmax, timer)
 
!     Rotate vectors  b(:,1:nsub) = b(:,1:nold)*u(1:nold,1:nsub)
!     Use blocking in b, in order to employ a minimal working space,
!     which hopefully can reside in cache.
!     The working space is necessary because array b is overwritten
!     on output.
 
      implicit none
      integer n, nmax, nsub, nold
!     Arrays in the "large" space:
      complex*16 b(n,nmax)
!     Arrays in the sub-space:
      complex*16 u(nmax,nmax)
      real*8 timer(*)
 
!     Local variables
      integer BLOCKSIZE, niter, nn1, nn, i, j
!     Make sure that work(BLOCKSIZE,nsub) fits inside cache:
      parameter (BLOCKSIZE = 128)
      complex*16 work(BLOCKSIZE,nsub)
#include "etime.h"
      complex*16 cone, czero
      data cone, czero /(1.0d0,0.0d0), (0.0d0,0.0d0)/
 
      call uttime(time)
      timer(TBLAS) = timer(TBLAS) - time(1)
      timer(TVROT) = timer(TVROT) - time(1)
 
!     Number of times to iterate over a block
      niter = n / BLOCKSIZE
      if (niter * BLOCKSIZE .lt. n) niter = niter + 1
 
      nn1 = 1
      nn = BLOCKSIZE
      do i = 1, niter
        nn = min (BLOCKSIZE, n - nn1 + 1)
!       b(:,1:nsub) = b(:,1:nold)*u(1:nold,1:nsub)
        call ZGEMM ('N', 'N', nn, nsub, nold, cone, b(nn1,1), n,&
          u, nmax, czero, work, BLOCKSIZE)
!       Copy back work(:,1:nsub) into b(:,1:nsub)
        do j = 1, nsub
          call ZCOPY (nn, work(1,j), 1, b(nn1,j), 1)
        end do
        nn1 = nn1 + BLOCKSIZE
      end do
 
      call uttime(time)
      timer(TBLAS) = timer(TBLAS) + time(1)
      timer(TVROT) = timer(TVROT) + time(1)
 
      return
      end                                              

!-------------------------------------------------------------------

      subroutine symprod1(n, ne, nmax, b, hb, bhb, ldhb, timer)

!     Calculation of the matrix product B'HB (and B'SB)
!     exploiting the fact that the result is symmetric.
!     Uses standard (non-optimized) BLAS calls.
!
!     Input :
!     n ........ leading dimension of b
!     ne ....... current dimension of bhb (or bsb)
!     nmax ..... leading dimension of bhb (or bsb)
!     ldhb ..... leading dimension of hb

!     NB This routine is only called when nsub=ne

      implicit none
      integer nout
      integer n,ne, nmax, ldhb, alpha
      integer j, p, q, beta
      
      complex*C_PRECISION b(n,nmax), hb(ldhb,nmax)
      complex*16 bhb(nmax, nmax)
      real*8 timer(*)

      complex*16 cone, cmone, czero, ctemp
      data cone, czero /(1.0d0,0.0d0), (0.0d0,0.0d0)/

!     Block size parameter
      parameter (alpha=4)

!     ne/alpha should be rounded downards to nearest integer
      beta=ne/alpha

      call time_start(timer,TBLAS)
      do j=1, beta
         p=(j-1)*alpha + 1
         q=alpha*j
         call CCZ_GEMM('C','N',ne-p+1,q-p+1,n,cone,b(1,p),n,&
              hb(1,p),n,czero, bhb(p,p),nmax)
      end do

!     Finish the reminder, if any
      if (beta*alpha .ne. ne) then
         p=beta*alpha+1
         call CCZ_GEMM('C','N',ne-p+1,ne-p+1,n,cone,b(1,p),n,&
              hb(1,p),n,czero, bhb(p,p),nmax)
      end if
      call time_stop(timer,TBLAS)

      return
      end
