!----------------------------------------------------------------------
!       This file contains various subroutines for MPI setup,
!       communication and timing in Fortran
!
!       Last update : Nov 16, 2000.
!----------------------------------------------------------------------

module par_functions_module

! module data 
! comm imclude all processes, however does not have to be identical to 
! MPI_COMM_WORLD
  integer comm_world 


! define a maximum block size for use in MPI_ALLREDUCE operations
! par_sum_complex,par_sum_double and mssum
! #define MPI_MAX_BLOCK 32000
#define MPI_MAX_BLOCK 16000

#include "definitions.h"

#if C_PRECISION == 8
#define PAR_CMPL MPI_COMPLEX
#else
#define PAR_CMPL MPI_DOUBLE_COMPLEX
#endif

#define PARAL_ARGS "parallel_args.h"
#define PARAL_DECL "parallel_decl.h"


contains 

!     Start up the parallel environment
!     return lmastr=TRUE if this is the master node

      subroutine par_start(&
#include PARAL_ARGS
       	)

      implicit none

      include 'mpif.h'

!     Parameters:
!
!     IERROR ... the FORTRAN return code
      integer IERROR

#include PARAL_DECL

!     comm is define as the group to include all processes
      comm=MPI_COMM_WORLD
      comm_world = comm

!     start up MPI
      call MPI_INIT(IERROR)

!     determine total number of processes
      call MPI_COMM_SIZE(comm,par_np,IERROR)

!     determine process rank
      call MPI_COMM_RANK(comm,par_process,IERROR)

      end subroutine par_start

!----------------------------------------------------------------------

      subroutine par_end()
!     End the parallel environment

!     Parameters:
!        IERROR ... the FORTRAN return code

      implicit none

      integer IERROR
      
!     shut down MPI
      call MPI_FINALIZE(IERROR)
     
      end subroutine par_end

!-------------------------------------------------------------------------

      subroutine par_getwf (cptwfp, ldc, iband, nbands, nblock,&
#include PARAL_ARGS
        , cwork, exists, timer, nconso)

!     The array cptwfp is distributed across processors.
!     Transpose cptwfp so that band (column) no "iband" is returned
!     in full in the array "cwork".  The transpose occurs for blocks

!     The logical "exists" is TRUE if band "iband" is on this processor,
!     FALSE otherwise, so the processor knows if it will treat this band.

!     It is assumed that this subroutine is called inside a DO-loop
!     on all processors:
!     DO iband = 1, nbands
!       CALL par_getwf (..., iband, nbands, ..., exists)
!       IF (exists) THEN
!         (Do something on column "iband" contained in "cwork")
!       ENDIF
!       . . .
!     END DO
!
!     The layout of data (cptwfp) for a 4 node job looks like: 

!                                 col_no = 2
!                                   |
!                                  \ /
!                       node1     node2    node3     node4
!                     ---------------------------------------
!                     |        | #####  |         |         |   
!           band1     |        | #####  |         |         |  <--- block_no = 1   
!                     |        | #####  |         |         |   
!                     ---------------------------------------
!                     |        | #####  |         |         |   
!           band2     |        | #####  |         |         |   
!                     |        | #####  |         |         |   
!                     ---------------------------------------
!                     |        | #####  |         |         |   
!           band3     |        | #####  |         |         |   
!                     |        | #####  |         |         |   
!                     ---------------------------------------
!                     |        | #####  |         |         |   
!           band4     |        | #####  |         |         |   
!                     |        | #####  |         |         |   
!                     ---------------------------------------
!                     |        |        |         |         |   
!           band5     |        |        |         |         |  <--- block_no = 2  
!                     |        |        |         |         |   
!                     ---------------------------------------
!
!            
!           After one call of par_getcolumn for node2 cwork on the node looks like 
!
!                node1               node2              node3               node4
!           -----------------   -----------------   -----------------   -----------------
!           |   |###|   |   |   |   |###|   |   |   |   |###|   |   |   |   |###|   |   | 
!           -----------------   -----------------   -----------------   -----------------  
!
               
!     input:
!       cptwfp ..  matrix containing parts of all columns
!	ldc .....  leading dimension of cptwfp 
!       iband ...  number on column to be returned in cwork
!       nbands ..  number of columns in cptwfp to be treated
!       nblock ..  number of columns in block to be treated. If
!                  nblock<=0 the subroutine decides a blockingsize
!       exists ..  true if column nr iband is returned on this processor

!     output:
!       cwork ...  array containing column nr iband in matrix cptwfp


      implicit none

      integer nrplwv, iband, nbands, nblock,nconso,ldc
      logical*4 exists
      real*8 timer(*)
#include PARAL_DECL
      complex*WF_PRECISION cptwfp(ldc,nbands), cwork(*)

!     Local variables
      integer block_size, colsreq, no_blocks, col_no, block_no, icol
      integer offset,dummy(1),irank

      if (par_pw_np.eq.1) then 
        exists = .true.
        cwork(1:ldc) = cptwfp(:,iband)
        return 
      endif

!     Blocking size block_size
      if (nblock .gt. 0) then
        block_size = nblock
      else
!       Blocking size = number of processors par_pw_np (cannot exceed nbands)
        block_size = MIN (par_pw_np, nbands)
      endif

!     Total number of blocks
      no_blocks = (nbands - 1) / block_size + 1
!     The block-number treated in this iteration of the outer loop
      block_no = (iband - 1) / block_size + 1
!     The column number within the present block
      col_no = MOD (iband - 1, block_size) + 1

      call par_rank_pw(irank,&
#include PARAL_ARGS
       ,nconso) 

!     Before call find out how many columns in the current block 
!     are left to be requested
      colsreq = block_size
      if (nbands - block_no*block_size .lt. 0) then
         colsreq = MOD (nbands, block_size)
      endif	

!     ** exit if column iband is not supposed to be
!     ** returned on this processor, except if you are in
!     ** the last block. In that case, keep on going to
!     ** make sure all processors participate in the
!     ** communication 
!     write(nconso,*) 'par_getwf irank = ',irank
!     write(nconso,*) 'par_getwf colsreq = ',colsreq
!     write(nconso,*) 'par_getwf blocksize = ',block_size
!     write(nconso,*) 'par_getwf iband = ',iband
!     write(nconso,*) 'par_getwf nbands = ',nbands
!     write(nconso,*) 'par_getwf col_no = ',col_no
!     call uflush(nconso)
            
      if ((irank.gt.colsreq).and.(irank.le.block_size)) then
         if (iband.lt.nbands) then
            exists = .false.
            return
         end if
      else
         if (col_no.ne.irank) then
            exists = .false.
            return
         end if
      end if



!     ** Do the 'transpose' **
!     ** get an entire column of cptwfp **

!     The (column) offset for cptwfp depends on the current block number
      offset = block_size * (block_no - 1) 

!      write(nconso,*) irank, 'par_getwf foer: iband, offset, colsreq,
!     &n_global:  ',iband,offset,colsreq,n_global
!      call uflush(nconso)

      call par_getcolumn (cptwfp(1, 1+offset), ldc, cwork,&
        exists, n_global,colsreq,.false.,dummy,&
#include PARAL_ARGS
        ,timer,nconso)

!      write(nconso,*) irank, 'par_getwf: iband, offset, exists:  '
!     &,iband,offset,exists
!      call uflush(nconso)

      end subroutine par_getwf

! --------------------------------------------------------------------------

      subroutine par_band_exists_on_this_proc(iband,nbands,nblock,&
#include PARAL_ARGS
         ,exists, nconso)
 
!     The logical "exists" is TRUE if band "iband" should be on this processor,
!     FALSE otherwise, so the processor knows if it will treat this band.          
!     (The same logic is in par_getwf)
!     The layout of bands on processors can then work together with par_getback

!
!     input:
!       iband ...  number on column to be returned in cwork
!       nbands ..  number of columns to be treated
!       nblock ..  number of columns in block to be treated. If
!                  nblock<=0 the subroutine decides a blockingsize
 
!     output:
!       exists ..  true if column nr iband should exists on this processor
 
      implicit none
 
      integer iband, nbands, nblock,nconso
      logical*4 exists
#include PARAL_DECL
 
!     Local variables
      integer block_size, colsreq, no_blocks, col_no, block_no
      integer irank
 
!     Blocking size block_size
      if (nblock .gt. 0) then
        block_size = nblock
      else
!       Blocking size = number of processors par_pw_np (cannot exceed nbands)
        block_size = MIN (par_pw_np, nbands)
      endif
 
!     Total number of blocks
      no_blocks = (nbands - 1) / block_size + 1
!     The block-number treated in this iteration of the outer loop
      block_no = (iband - 1) / block_size + 1
!     The column number within the present block
      col_no = MOD (iband - 1, block_size) + 1
 
      call par_rank_pw(irank,&
#include PARAL_ARGS
       ,nconso)
 
!     Before call find out how many columns in the current block
!     are left to be requested
      colsreq = block_size
      if (nbands - block_no*block_size .lt. 0) then
         colsreq = MOD (nbands, block_size)
      endif
 
!     ** exit if column iband is not supposed to be
!     ** returned on this processor, except if you are in
!     ** the last block. In that case, keep on going to
!     ** make sure all processors participate in the
!     ** communication

      if ((irank.gt.colsreq).and.(irank.le.block_size)) then
         if (iband.lt.nbands) then
            exists = .false.
            return
         end if
      else
         if (col_no.ne.irank) then
            exists = .false.
            return
         end if
      end if      

      exists = (irank.le.colsreq)

      end subroutine par_band_exists_on_this_proc    
 


!---------------------------------------------------------------------------

      subroutine par_splitcols(SENDBUF,lds,RECVBUF,ldr,rows,cols,&
          source,&
#include PARAL_ARGS
          ,timer,nconso)  

!     Split a 2D matrix horizontally, i.e. each process will receive
!     a part of all columns from ONE process source.
!     The data must be stored in column-major order (Fortran)

!     Input:
!        SENDBUF ..  matrix to be split up (on the source node)
!        lds ......  leading dimension of matrix SENDBUF
!        ldr ......  leading dimension of matrix RECVBUF
!        rows .....  rows in matrix SENDBUF
!        cols .....  columns in matrix SENDBUF
!        source ...  the processor doing the splitting
!     Output:
!        RECVBUF ..  matrix containing a part of all columns

      implicit none

      include 'mpif.h'

#include PARAL_DECL
      integer lds,rows,cols,source,ldr,i
      complex*C_PRECISION SENDBUF(lds,*)
      complex*C_PRECISION RECVBUF(ldr,*)

      real*8 timer(*)
      integer nconso
      integer SENDCOUNTS(par_pw_np), DISPLS(par_pw_np), RECVCOUNT
      integer IERROR

      call time_start(timer, TPAR)
      call par_defwfkp(ldr, rows, SENDCOUNTS, DISPLS, RECVCOUNT,&
#include PARAL_ARGS
          ,nconso )

!     send one column at a time
      do i=1,cols
         call MPI_SCATTERV(SENDBUF(1,i),SENDCOUNTS,DISPLS,&
                          PAR_CMPL,&
                          RECVBUF(1,i),RECVCOUNT,&
                          PAR_CMPL,&
                          source,par_comm_pw,IERROR)

      end do
      call time_stop(timer, TPAR)

      end subroutine par_splitcols


!------------------------------------------------------------------------


      subroutine par_getback (copyfrom, iband, nbands, nblock,&
#include PARAL_ARGS
        , copyto, ldto, exists, timer, nconso)

!     It is assumed that this subroutine is called inside a DO-loop
!     on all processors:
!
!     DO iband = 1, nbands
!       CALL par_getwf (..., iband, nbands, ..., exists)
!       IF (exists) THEN
!         (Do something on column "iband" contained in "copyto")
!       ENDIF
!       CALL par_getback(..., iband, nbands, ..., exists)
!     END DO

!     The programs assume the splitting defined in par_defwkp, and it
!     therefore need no information regarding the rows.

!     input:
!       copyfrom:  column to be split up
!       iband:     number on column to be returned in copyto
!       nbands:    number of columns to be places in copyfrom
!       nblock:    number of columns in block to be treated. If
!                  nblock<=0 the subroutine decides a blockingsize
!       ldto:       leading dimension of copyto
!       exists:    true if column nr iband is returned on this processor

!     output:
!       copyto:    matrix to place the columns in

      implicit none

!      include 'mpif.h'

      integer ldto,rows, iband, nbands, nblock,nconso
      complex*WF_PRECISION copyto(ldto,nbands), copyfrom(*)
      logical*4 exists
      real*8 timer(*)
#include PARAL_DECL

!     Local variables
      integer block_size, colsreq, no_blocks, col_no, block_no, icol
      integer offset,dummy(1),irank

      integer SENDCOUNTS(par_pw_np), RDISPLS(par_pw_np)
      integer RECVCOUNTS(par_pw_np)
      integer SDISPLS(par_pw_np), RECVCOUNT, i
      integer IERROR
 
!     Blocking size block_size
      if (nblock .gt. 0) then
        block_size = nblock
      else
!       Blocking size = number of processors par_pw_np (cannot exceed nbands)
        block_size = MIN (par_pw_np, nbands)
      endif

!     Total number of blocks
      no_blocks = (nbands - 1) / block_size + 1
!     The block-number treated in this iteration of the outer loop
      block_no = (iband - 1) / block_size + 1
!     The column number within the present block
      col_no = MOD (iband - 1, block_size) + 1

      call par_rank_pw(irank,&
#include PARAL_ARGS
        ,nconso) 

!     Before call find out how many columns in the current block 
!     are left to be requested
      colsreq = block_size
      if (nbands - block_no*block_size .lt. 0) then
         colsreq = MOD (nbands, block_size)
      endif	

!     ** exit if column iband is not supposed to be
!     ** returned on this processor, except if you are in
!     ** the last block. In that case, keep on going to
!     ** make sure all processors participate in the
!     ** communication 
            
      if ((irank.gt.colsreq).and.(irank.le.block_size)) then
         if (iband.lt.nbands) then
            exists = .false.
            return
         end if
      else
         if (col_no.ne.irank) then
            exists = .false.
            return
         end if
      end if

      offset = block_size * (block_no - 1)

      call par_getrows(copyfrom, copyto(1,1+offset), ldto, &
        n_global,colsreq,.false.,dummy,&
#include PARAL_ARGS
        ,timer,nconso)
 
      end subroutine par_getback


!--------------------------------------------------------------------

      subroutine par_getrows(SENDBUF, RECVBUF, ldr, rows, cols,&
           reqflag, reqcols,&
#include PARAL_ARGS
           , timer, nconso )

!     'Transpose' a matrix (collect rows)

!     Input :
!        SENDBUF ..  matrix to be split up (on the source node)
!        ldr ......  leading dimension of RECVBUF
!        rows .....  number of rows in matrix SENDBUF
!        cols .....  number of columns in matrix SENDBUF
!        reqflag ..  if FALSE then columns 1...cols are split up
!                    if TRUE then reqcols is used as a index to the columns
!        reqcols ..  list of columns to be split up. It is not tested
!                    if the column numbers are valid - this is the
!                    responsibility of the caller.
!     Output:
!        RECVBUF ..  Matrix with the columns split up on a number of processors
!
!     The number of columns must be less than or equal to the
!     number of processors.
!     The data must be stored in column-major order (Fortran)
!     It is assumed that the leading dimension of SENDBUF is 'rows'

      implicit none
      
      include 'mpif.h'

#include PARAL_DECL

      integer ldr
      logical iscol, reqflag
      complex*C_PRECISION SENDBUF(*)
!     SENDBUF has dim. rows*1
      complex*C_PRECISION RECVBUF(ldr,*)
    
      integer rows, cols, reqcols(cols), nconso
!      integer bl, 
      real*8 timer(*)

      integer SENDCOUNTS(par_pw_np), RDISPLS(par_pw_np)
      integer RECVCOUNTS(par_pw_np)
      integer SDISPLS(par_pw_np), RECVCOUNT, i
      integer IERROR
      integer irank


      if (cols .gt. par_pw_np) then
!     Number of columns exceeds the number of processors
         write (nconso,*) 'par_getrows: Too many columns'
         write (nconso,*) 'Columns=',cols, ' Processors=',par_pw_np
!         call clexit(nconso)
      end if

      call par_rank_pw(irank,&
#include PARAL_ARGS
        ,nconso)

      call timer_start(timer, TPARTRANS)
      call par_defwfkp(rows,rows, SENDCOUNTS, SDISPLS,&
           RECVCOUNT,&
#include PARAL_ARGS
           ,nconso)
     
      iscol=.true.
 
!     set up recvcounts and displacements
      do i=1,cols
         if (reqflag) then
            RDISPLS(i)=(reqcols(i)-1)*ldr
         else
            RDISPLS(i)=(i-1)*ldr
         end if

!         RDISPLS(i)    = (i-1)*ldr
         RECVCOUNTS(i) = RECVCOUNT
      end do
      if (cols.lt.par_pw_np) then
         do i=cols+1,par_pw_np
!     this is a dummy variable in this case
            RDISPLS(i)    = RDISPLS(cols)+RECVCOUNT
            RECVCOUNTS(i) = 0
         end do
      end if
      

!     in this case the process is not going to send anything
      if ((irank.gt.cols).and.(irank.le.par_pw_np)) then  
         iscol=.false.
         do i=1,par_pw_np
            SENDCOUNTS=0
         end do
      end if
      
      call MPI_ALLTOALLV(SENDBUF,SENDCOUNTS,&
           SDISPLS,PAR_CMPL,&
           RECVBUF,RECVCOUNTS,RDISPLS,&
           PAR_CMPL,&
           par_comm_pw,IERROR)

      call timer_stop(timer, TPARTRANS)
      
      end subroutine par_getrows
  
      
!------------------------------------------------------------------
      subroutine par_getcolumn(SENDBUF,lds,RECVBUF,iscol,rows,cols,&
           reqflag, reqcols,&
#include PARAL_ARGS
           , timer, nconso)

!     'Transpose' a matrix (collect columns)

!     Input :
!        SENDBUF ..  matrix containing parts of all columns
!	 lds ......  leading dimension af SENDBUF
!        rows .....  number of rows in matrix RECVBUF
!        cols .....  requested number of columns in matrix RECVBUF
!        reqflag ..  if FALSE then columns 1...cols are returned
!                    if TRUE then reqcols is used as a index to the columns
!        reqcols ..  list of columns to be returned. It is not tested
!                    if the column numbers are valid - this is the
!                    responsibility of the caller.
!     Output:
!        iscol ....  TRUE if a column is returned, otherwise FALSE
!        RECVBUF ..  one (possibly no) complete column on each processor
!
!     The number of columns must be less than or equal to the
!     number of processors.
!     The data must be stored in column-major order (Fortran)

      implicit none
      
      include 'mpif.h'

#include PARAL_DECL

      integer lds
      logical iscol, reqflag
      complex*C_PRECISION SENDBUF(lds,*)
      complex*C_PRECISION RECVBUF(*)
!     RECVBUF has dim. rows*1
      integer rows,cols, reqcols(cols), nconso
      integer bl,remainder,sum,i,colidx
      real*8 timer(*)
      integer SENDCOUNT, DISPLS(par_pw_np), RECVCOUNTS(par_pw_np)
      integer IERROR

      if (cols .gt. par_pw_np) then
!     Number of columns exceeds the number of processors
         write (nconso,*) 'par_getcolumn: Too many columns'
         write (nconso,*) 'Columns=',cols, ' Processors=',par_pw_np
         call clexit(nconso)
      else

         call time_start(timer, TPAR)
         call par_defwfkp(rows, rows, RECVCOUNTS, DISPLS,&
              SENDCOUNT,&
#include PARAL_ARGS
             ,nconso )

!      write(nconso,*) 'par_getcolumn lds rows ',par_pw_np,lds,rows,
!     &  DISPLS(1),DISPLS(2),RECVCOUNTS(1),
!     &  RECVCOUNTS(2),SENDCOUNT,par_process

!     Find out if a column will be returned on the calling process
         iscol= ((par_process+1) .le. cols)

!     collect one column on each process
         do i=0,cols-1
            if (reqflag) then
               colidx=reqcols(i+1)
            else
               colidx=i+1
            end if
            call MPI_GATHERV(SENDBUF(1,colidx),&
                 SENDCOUNT,PAR_CMPL,&
                 RECVBUF,RECVCOUNTS,DISPLS,&
                 PAR_CMPL,&
                 i,par_comm_pw,IERROR)
            if (IERROR.ne.MPI_SUCCESS) then 
              write(nconso,*) 'par_getcolumn: MPI_GATHERV error ',&
                               IERROR
              call clexit(nconso) 
            endif
         end do
         call time_stop(timer, TPAR)
      end if

      end subroutine par_getcolumn

!-----------------------------------------------------------------

      subroutine par_sum_double(lflag,in,out,ld,M,N,&
#include PARAL_ARGS
           ,timer)

!     Reduce sum for a general double precision matrix (real)
!     lflag .. only the lower triangular is treated if lflag='L'
!     ld ..... leading dimension of matrices IN and OUT
!     M ...... number of rows
!     N ...... number of columns

      implicit none
      include 'mpif.h'

#include PARAL_DECL

      character*1 lflag
      real*8     in(*),out(*)
      real*8 timer(*)
      integer ld,M,N,i
      integer IERROR
      integer block_size,istart
      integer imin,imax,M1

      logical lsame
      external lsame
      real*8   ::  work(MPI_MAX_BLOCK)

      call time_start(timer, TPAR)

      if (par_pw_np.eq.1) return

!     get block size
      block_size = min(M,MPI_MAX_BLOCK)

      imax = 0
!     loop over blocks
10    continue
      imin = imax
      imax = imin + block_size
      if (imax.ge.M) then
       imax = M
      endif
      M1 = imax-imin


!     LSAME is a LAPACK auxilary routine to check if two characters
!     are the same
      if (.not. lsame(lflag,'L')) then
         do i=0,N-1
            call dcopy(M1,in(i*ld+1+imin),1,work,1)
            call MPI_ALLREDUCE(work,out(i*ld+1+imin),M1,&
                  REAL8,MPI_SUM,par_comm_pw,IERROR)
         end do

      else
!     Matrix is lower triangular
         do i=0,N-1
            call dcopy(M1-i,in(i*ld+i+1+imin),1,work,1)
            call MPI_ALLREDUCE(work,out(i*ld+i+1+imin),M1-i,&
                   REAL8,MPI_SUM,par_comm_pw,IERROR)
         end do

      end if

      if (imax.lt.M) goto 10

      call time_stop(timer, TPAR)

      end subroutine par_sum_double

!-----------------------------------------------------------------

      subroutine par_sum_complex(lflag,in,out,ld,M,N,&
#include PARAL_ARGS
           ,timer)

!     Reduce sum for a general double precision matrix (complex)
!     lflag .. only the lower triangular is treated if lflag='L'
!     ld ..... leading dimension of matrices IN and OUT
!     M ...... number of rows
!     N ...... number of columns   

      implicit none
      include 'mpif.h'

#include PARAL_DECL

      character*1 lflag
      complex*16 in(*),out(*)
      real*8 timer(*)
      integer ld,M,N,i
      integer IERROR
      integer block_size,istart
      integer imin,imax,M1

      logical lsame
      external lsame
      complex*16, allocatable ::  work(:)

      call time_start(timer, TPAR)

      if (par_pw_np.eq.1) return

!     get block size
      block_size = min(M,MPI_MAX_BLOCK)

      allocate(work(block_size))

      imax = 0
!     loop over blocks 
10    continue
      imin = imax 
      imax = imin + block_size 
      if (imax.ge.M) then 
       imax = M 
      endif
      M1 = imax-imin


!     LSAME is a LAPACK auxilary routine to check if two characters
!     are the same
      if (.not. lsame(lflag,'L')) then
         do i=0,N-1
            call zcopy(M1,in(i*ld+1+imin),1,work,1)
            call MPI_ALLREDUCE(work,out(i*ld+1+imin),M1,&
                  MPI_DOUBLE_COMPLEX,MPI_SUM,par_comm_pw,IERROR)
         end do

      else
!     Matrix is lower triangular
         do i=0,N-1
            call zcopy(M1-i,in(i*ld+i+1+imin),1,work,1)
            call MPI_ALLREDUCE(work,out(i*ld+i+1+imin),M1-i,&
                   MPI_DOUBLE_COMPLEX,MPI_SUM,par_comm_pw,IERROR)
         end do

      end if

      if (imax.lt.M) goto 10

      call time_stop(timer, TPAR)

      deallocate(work)

      end subroutine par_sum_complex

!-----------------------------------------------------------------

      subroutine wait_all(nconso) 
      implicit none
      include 'mpif.h'
!     implement a wait on all process 
      integer nconso
      integer work1(1),work2(1),IERROR
         work1(1) = 1
         work2(1) = 1
         call MPI_ALLREDUCE(work1,work2,1,&
              MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,IERROR)

      end subroutine wait_all



      subroutine mssum(in,out,ld,M,N,&
#include PARAL_ARGS
           ,nconso,timer)

!     Reduce sum for a general double precision matrix (real)
!     Equivalent to par_sum_double, however using the kp group.    
!     lflag .. only the lower triangular is treated if lflag='L'
!     ld ..... leading dimension of matrices IN and OUT
!     M ...... number of rows
!     N ...... number of columns

      implicit none
      include 'mpif.h'

#include PARAL_DECL

      real*8     in(*),out(*)
      real*8 timer(*)
      integer ld,M,N,i
      integer nconso
      integer IERROR
      integer block_size,istart
      integer imin,imax,M1

      logical lsame
      external lsame
      real*8    , allocatable ::  work(:) 

      call time_start(timer, TPAR)

!     get block size
      block_size = min(M,MPI_MAX_BLOCK)

      allocate(work(block_size))

      imax = 0
!     loop over blocks
10    continue
      imin = imax
      imax = imin + block_size
      if (imax.ge.M) then
       imax = M
      endif
      M1 = imax-imin


      do i=0,N-1
         call dcopy(M1,in(i*ld+1+imin),1,work,1)
         call MPI_ALLREDUCE(work,out(i*ld+1+imin),M1,&
              REAL8,&
              MPI_SUM,par_comm_kp,IERROR)
      end do

      if (imax.lt.M) goto 10

      call time_stop(timer, TPAR)

      deallocate(work)

      end subroutine mssum

!-----------------------------------------------------------------
      subroutine smwf (nconso,idslav,nkp,cptwfp,cptwfp_fl,wfdim,&
             nplwkp, nrplwv, nrplwv_global, nbands, nkprun,&
             ldonkp,lmastr,&
#include PARAL_ARGS
         ,timer)

!     Send one wavefunction (all bands of 1 k-point) from slave
!     (denoted by idslav) to master.
!     The arrays have the last dimension (nkprun) removed

      implicit none
#include "ms.h"
#include PARAL_DECL

      integer nconso, idslav, nkp
      integer nplwkp,nrplwv,nrplwv_global,nbands,wfdim
      complex*WF_PRECISION cptwfp(nrplwv,nbands)
      complex*WF_PRECISION cptwfp_fl(wfdim,nbands)
      integer    nkprun
      logical*4  ldonkp(nkprun),lmastr
      real*8  timer(*)

!     Local variables
      integer bufid, info, nOK, nnkp, nn, sender,m,idummy(1)
      complex*WF_PRECISION cwork(wfdim)
      logical*4  exists
     
!     Loop over bands (+ initial data) since a complete wavefunction
!     may exceed the available buffer space (32 MB/msg in PVMe 1.3).
!     Need to do handshaking inside loop, in order not to overflow
!     buffers (PARMACS on VPP-500, 12-Dec-1995; also on SP2 thin-nodes,
!     24-Jan-1996).
!     For each plw. group a entire wavefunction for one band is collected 
!     on node 0 for each plw. group  and this is then send to the master. 

! #define DEBUG 1
#ifdef DEBUG
      write(nconso,*) 'smwf nrplwv   ',nrplwv
      write(nconso,*) 'smwf idslav   ',idslav
      write(nconso,*) 'smwf par_pw_np',par_pw_np
      write(nconso,*) 'smwf wfdim    ',wfdim
      write(nconso,*) 'smwf nkp      ',nkp
      write(nconso,*) 'smwf nplwkp   ',nplwkp
      write(nconso,*) 'smwf lmastr   ',lmastr
      write(nconso,*) 'smwf ldonkp   ',ldonkp(nkp)
      call uflush(nconso)
#endif

      do 200 nn=1,nbands

        if (ldonkp(nkp)) then 
!         Get the entire wavefunction from the other processors on node 0
          if (par_pw_np.gt.1) then
            call par_getcolumn(cptwfp(1,nn),nrplwv,cwork,&
                 exists,nplwkp,1,.false.,idummy,&
#include         PARAL_ARGS
                ,timer,nconso)
            if (.not. exists) goto 200
!             wavefunction for band nn is now in the array cwork
#ifdef DEBUG
              write(nconso,*) 'full wf exist for band nn = ',nn
              call uflush(nconso)
#endif
              do m = 1,nrplwv_global
                cptwfp_fl(m,nn) = cwork(m)
              end do
          endif
        endif  ! ldonkp

!       node0 in world group should not send to itself
        if ((lmastr).and.(ldonkp(nkp))) goto 200

!       node0 on other plw. group should send if it handles this 
!       k-point
        if ((.not.lmastr).and.(.not.ldonkp(nkp))) goto 200 


!         Master-slave handshaking
!         Master is node 0 on world group. 
!         idslav is node 0 on this plw. group (kphost(nkp))
          call mssendok (nconso, idslav)

          call smstart (nconso, idslav, MSG_WF, 'wavefunction')

          nnkp = nkp
          sender = idslav
          call smpack(sender,MSG_WF,&
               INTEGER4,nnkp,1,nOK)
          call MESS_PASS_ERR (nconso, 'smwf: smpack', nOK)
!         Check k-point
          if (nnkp .ne. nkp) then
            write (nconso,*) 'PAR: smkinl: Bad nnkp=', nnkp,&
            ' should be=', nkp
            call clexit (nconso)
          endif
          if (par_pw_np.eq.1) then 
           call smpack (sender, MSG_WF,&
#if WF_PRECISION == 8
            COMPLEX8,&
#else
            COMPLEX16,&
#endif
             cptwfp(1,nn), nrplwv, nOK)
          else
           call smpack (sender, MSG_WF,&
#if WF_PRECISION == 8
            COMPLEX8,&
#else
            COMPLEX16,&
#endif
              cptwfp_fl(1,nn), wfdim, nOK)
          endif
          call MESS_PASS_ERR (nconso, 'smwf: smpack', nOK)

200   continue

      end subroutine smwf


!-----------------------------------------------------------------

!     function for time measurement

      double precision function par_gettime()

!     ON RETURN:
!
!     gettime_par: time instance measured in seconds

      implicit none

      include 'mpif.h'

!     get time from MPI function
      par_gettime = MPI_WTIME()
     
      end function par_gettime

!----------------------------------------------------------------------

      subroutine par_rank_world (irank,&
#include PARAL_ARGS
        ,nconso)

      implicit none
      include 'mpif.h'
      integer irank,nconso
#include PARAL_DECL
      integer ierr

!     Return the rank of this processor in the world group (comm) between 1 and the
!     number of processors:  par_rank = [1...par_np]

!     Distributed arrays are laid out consequtively on processors
!     according to their rank (1, 2, ..., par_np).

!     determine process rank
      call MPI_COMM_RANK(comm,irank,ierr)
      if (ierr.ne.MPI_SUCCESS) then
         write (nconso,*) &
           'MPI_COMM_RANK returned (par_rank_world)',ierr
         call clexit(nconso)
      endif

      irank = irank + 1

      end subroutine par_rank_world

!----------------------------------------------------------------------

      subroutine par_rank_pw (irank,&
#include PARAL_ARGS
        ,nconso)
 
      implicit none
      include 'mpif.h'
      integer irank,nconso
#include PARAL_DECL
      integer ierr
 
!     Return the rank of this processor in the pw group (comm) between 1 and the
!     number of processors:  par_rank = [1...par_pw_np]
 
!     Distributed arrays are laid out consequtively on processors
!     according to their rank (1, 2, ..., par_np).
 
!     determine process rank
      call MPI_COMM_RANK(par_comm_pw,irank,ierr)
      if (ierr.ne.MPI_SUCCESS) then
         write (nconso,*) &
           'MPI_COMM_RANK returned (par_rank_pw)',ierr
         call clexit(nconso)
      endif
 
      irank = irank + 1

      end subroutine par_rank_pw


!----------------------------------------------------------------------

      subroutine par_defwfkp (nrplwv, nplwkp, nplocal, offsets, num,&
#include PARAL_ARGS
           , nconso)

!     Define how many plane waves of a given k-point are located on
!     each processor.

!     Input:  nrplwv ..... Size of the LOCAL wavefunction array.
!             nplwkp ..... The LOCAL number of plane waves
!     Output: nplocal .... Array of LOCAL numbers of plane waves.
!             offsets .... Offsets of LOCAL data into the global array.
!             num ........ Receive/send count (used in subproj)
!             The number of elements in nplocal and offset is "par_pw_np", 
!             the number of processors.

!     The local number of plane waves must be <= nrplwv (array size).
!     The global number of plane waves is the sum of local ones,
!     and the value is returned in "n_global" (in PARARGS).

      implicit none

      integer nplwkp, nrplwv, nplocal(*), offsets(*)
      integer num, nconso
#include PARAL_DECL

!     Local variables
      integer i, remainder, sum

!     Distribute nplwkp plane waves evenly across processors
      do i = 1, par_pw_np
        nplocal(i) = nplwkp / par_pw_np
      end do
      remainder = MOD (nplwkp, par_pw_np)
      do i = 1, remainder
        nplocal(i) = nplocal(i) + 1
      end do

!     Calculate the offsets of local arrays into the global array
      sum = 0
      do i = 1, par_pw_np
        offsets(i) = sum
        sum = sum + nplocal(i)
      end do

      num=nplocal(par_process+1)

!     Check that we don't exceed the local array size
      if (num.gt. nrplwv) then
        write (nconso,*) par_process+1,'par_defwfk: ERROR: exceeding local arrays:', num, nrplwv
!        call clexit (nconso)
      endif

      end subroutine par_defwfkp

!----------------------------------------------------------------------
      subroutine par_group_create(par_comm,group_mems,proc_per_group,&
          nconso)
!     Creates a new MPI communicator corresponding to the specified
!     processors.
!
!     input
!       group_mems:       array of processors ids for new group
!       proc_per_group:   number of members in new group
!       nconso:           unit for error output
!     output
!       par_comm:         if current processor is a member of 
!                         created group comm is set to group id.
!

      implicit none

      include 'mpif.h'

      integer group_no, proc_per_group, nconso
      integer group_mems(proc_per_group)
      integer par_comm

      integer mpi_group_mems(proc_per_group)

      integer i,world_group,new_group,ierr,new_comm

!     MPI number groups from 0 to np-1
      do i=1,proc_per_group
         mpi_group_mems(i)=group_mems(i)-1
!        write(*,*) 'mpi_group_create: ', i,mpi_group_mems(i)
      end do

!     Get group corresponding to world
      call MPI_COMM_GROUP(comm_world,world_group,ierr)
      if (ierr.ne.MPI_SUCCESS) then
         write (nconso,*) 'MPI_COMM_GROUP returned ',ierr
         call clexit(nconso)
      endif

!     Create MPI group
      call MPI_GROUP_INCL(world_group,proc_per_group,&
           mpi_group_mems, new_group,ierr)
      if (ierr.ne.MPI_SUCCESS) then
         write (nconso,*) 'MPI_GROUP_INCL returned ',ierr
         call clexit(nconso)
      endif

!     Create MPI communicator
      call MPI_COMM_CREATE(comm_world,new_group,new_comm,ierr)
      if (ierr.ne.MPI_SUCCESS) then
         write (nconso,*) 'MPI_COMM_CREATE returned ',ierr
         call clexit(nconso)
      endif

      if (new_comm.ne.MPI_COMM_NULL) then
!     processor is in group
!     par_comm is communicator stored in PARAL_ARGS
!     par_comm is only changed if processor is a member of current group
         par_comm=new_comm
      end if

!     Delete MPI group
      call MPI_GROUP_FREE(new_group,ierr)
      if (ierr.ne.MPI_SUCCESS) then
         write (nconso,*) 'MPI_GROUP_FREE returned ',ierr
         call clexit(nconso)
      endif

      end subroutine par_group_create

      subroutine mspack_integer_scalar(nconso, dest, msgtag, type, scalar, n, info)

!     MPI:     Master-to-slave send/receive

      implicit none
#include "ms.h"

      integer nconso, dest, msgtag, type, n, info
      integer scalar

!     Local vars
      integer nbytes, sender, actag, aclen
      integer array(1)
      integer status(MPI_STATUS_SIZE)

      array(1) = scalar

      if (dest.eq.ANY) then
        call MPI_Bcast(array, n, type, matid, comm_world, info)
        call MPI_Barrier(comm_world,info)
      else
        if (lmaster) then
          call MPI_Send(array, n, type, dest, msgtag, comm_world,info)
          call MESS_PASS_ERR (nconso, 'mspack: SEND', info)
        else if (lslave) then
          call MPI_Recv(array, n, type, matid, msgtag, comm_world,status,info)
          call MESS_PASS_ERR (nconso, 'mspack: RECEIVE', info)
        endif
      endif

      scalar = array(1)

      end subroutine mspack_integer_scalar

      subroutine mspack_double_scalar(nconso, dest, msgtag, type, scalar, n, info)

!     MPI:     Master-to-slave send/receive

      implicit none
#include "ms.h"

      integer nconso, dest, msgtag, type, n, info
      real*8 scalar

!     Local vars
      integer nbytes, sender, actag, aclen
      real*8 array(1)
      integer status(MPI_STATUS_SIZE)

      array(1) = scalar

      if (dest.eq.ANY) then
        call MPI_Bcast(array, n, type, matid, comm_world, info)
        call MPI_Barrier(comm_world,info)
      else
        if (lmaster) then
          call MPI_Send(array, n, type, dest, msgtag, comm_world,info)
          call MESS_PASS_ERR (nconso, 'mspack: SEND', info)
        else if (lslave) then
          call MPI_Recv(array, n, type, matid, msgtag, comm_world,status,info)
          call MESS_PASS_ERR (nconso, 'mspack: RECEIVE', info)
        endif
      endif

      scalar = array(1)

      end subroutine mspack_double_scalar


      subroutine mspack_logical_scalar(nconso, dest, msgtag, type, scalar, n, info)

!     MPI:     Master-to-slave send/receive

      implicit none
#include "ms.h"

      integer nconso, dest, msgtag, type, n, info
      logical*4 scalar

!     Local vars
      integer nbytes, sender, actag, aclen
      logical*4 array(1)
      integer status(MPI_STATUS_SIZE)

      array(1) = scalar

      if (dest.eq.ANY) then
        call MPI_Bcast(array, n, type, matid, comm_world, info)
        call MPI_Barrier(comm_world,info)
      else
        if (lmaster) then
          call MPI_Send(array, n, type, dest, msgtag, comm_world,info)
          call MESS_PASS_ERR (nconso, 'mspack: SEND', info)
        else if (lslave) then
          call MPI_Recv(array, n, type, matid, msgtag, comm_world,status,info)
          call MESS_PASS_ERR (nconso, 'mspack: RECEIVE', info)
        endif
      endif

      scalar = array(1)

      end subroutine mspack_logical_scalar


      subroutine mspack_integer_array(nconso, dest, msgtag, type, array, n, info)

!     MPI:     Master-to-slave send/receive

      implicit none
#include "ms.h"

      integer nconso, dest, msgtag, type, n, info
      integer array(*)

!     Local vars
      integer nbytes, sender, actag, aclen

      integer status(MPI_STATUS_SIZE)

      if (dest.eq.ANY) then
        call MPI_Bcast(array, n, type, matid, comm_world, info)
        call MPI_Barrier(comm_world,info)
      else
        if (lmaster) then
          call MPI_Send(array, n, type, dest, msgtag, comm_world,info)
          call MESS_PASS_ERR (nconso, 'mspack: SEND', info)
        else if (lslave) then
          call MPI_Recv(array, n, type, matid, msgtag, comm_world,status,info)
          call MESS_PASS_ERR (nconso, 'mspack: RECEIVE', info)
        endif
      endif

      end subroutine mspack_integer_array

      subroutine mspack_double_array(nconso, dest, msgtag, type, array, n, info)

!     MPI:     Master-to-slave send/receive

      implicit none
#include "ms.h"

      integer nconso, dest, msgtag, type, n, info
      real*8 array(*)

!     Local vars
      integer nbytes, sender, actag, aclen

      integer status(MPI_STATUS_SIZE)

      if (dest.eq.ANY) then
        call MPI_Bcast(array, n, type, matid, comm_world, info)
        call MPI_Barrier(comm_world,info)
      else
        if (lmaster) then
          call MPI_Send(array, n, type, dest, msgtag, comm_world,info)
          call MESS_PASS_ERR (nconso, 'mspack: SEND', info)
        else if (lslave) then
          call MPI_Recv(array, n, type, matid, msgtag, comm_world,status,info)
          call MESS_PASS_ERR (nconso, 'mspack: RECEIVE', info)
        endif
      endif

      end subroutine mspack_double_array

      subroutine mspack_logical_array(nconso, dest, msgtag, type, array, n, info)

!     MPI:     Master-to-slave send/receive

      implicit none
#include "ms.h"

      integer nconso, dest, msgtag, type, n, info
      logical*4 array(*)

!     Local vars
      integer nbytes, sender, actag, aclen

      integer status(MPI_STATUS_SIZE)

      if (dest.eq.ANY) then
        call MPI_Bcast(array, n, type, matid, comm_world, info)
        call MPI_Barrier(comm_world,info)
      else
        if (lmaster) then
          call MPI_Send(array, n, type, dest, msgtag, comm_world,info)
          call MESS_PASS_ERR (nconso, 'mspack: SEND', info)
        else if (lslave) then
          call MPI_Recv(array, n, type, matid, msgtag, comm_world,status,info)
          call MESS_PASS_ERR (nconso, 'mspack: RECEIVE', info)
        endif
      endif

      end subroutine mspack_logical_array

      subroutine mspack_character_string(nconso, dest, msgtag, type, array, n, info)

!     MPI:     Master-to-slave send/receive

      implicit none
#include "ms.h"

      integer nconso, dest, msgtag, type, n, info
      character*(*) array(*)

!     Local vars
      integer nbytes, sender, actag, aclen

      integer status(MPI_STATUS_SIZE)

      if (dest.eq.ANY) then
        call MPI_Bcast(array, n, type, matid, comm_world, info)
        call MPI_Barrier(comm_world,info)
      else
        if (lmaster) then
          call MPI_Send(array, n, type, dest, msgtag, comm_world,info)
          call MESS_PASS_ERR (nconso, 'mspack: SEND', info)
        else if (lslave) then
          call MPI_Recv(array, n, type, matid, msgtag, comm_world,status,info)
          call MESS_PASS_ERR (nconso, 'mspack: RECEIVE', info)
        endif
      endif

      end subroutine mspack_character_string


! -------------------------------------------------------------------------------
!  subroutine sub_divide_word_group read the netCDF variable 
!  MPIControl. 
!  If this variable is set the comm group, initially contaning mpi_world_group, 
!  is subdivided in MpiControl.NumberOfGroups. 
!--------------------------------------------------------------------------------
      subroutine sub_divide_worldgroup(comm,par_np,par_process)
      use run_context
      use netcdfinterface
      
      implicit none
      integer, intent(inout) :: comm,par_np,par_process 

!     locals 
      integer              :: proc_per_group,number_of_groups,group_no 
      integer              :: first_process,last_process, group_size
      integer, allocatable :: group_members(:)
      integer              :: status,n,ncid
      character            :: cdummy

      ! not supported (only master reads nc file should 
      !  implemented first) 
      return 

      status =  nf_open(netCDF_input_filename, NF_NOWRITE, ncid)
      if (status /= nf_noerr) call abort_calc(nconso, &
                 "sub_divide_worldgroup -> nf_open : error opening nc-file")

      status = nfget(ncid, "MPIControl", cdummy)
      if (status == nfif_OK) then              
         status = nfget(ncid, "MPIControl%NumberOfGroups", n)
         if (status == nfif_OK) then           ! Dynamics%Method is defined
            number_of_groups = n
            if (n.lt.1) then  
               write(*,*) 'Illegal value of MPIControl%NumberOfGroups'
               return 
            endif 
            if (n.eq.1) return 
               
            write(*,*) 'Dividing world group in ',n,' groups'
            call uflush(6)
         else 
            return 
         endif 
      else 
         return 
      endif 

      if (mod(par_np,number_of_groups).ne.0) then 
        write(6,*) 'MPIControl: The total number of processors must be a multiple of NumberOfGroups' 
        call clexit(6) 
      endif

      call MPI_Comm_size(comm,par_np,status)
      write(*,*) 'old par_process ',par_process,' par_np = ',par_np
      call uflush(6)

      proc_per_group = par_np/number_of_groups

      ! find group for this process 
      group_no = par_process/proc_per_group
      write(*,*) 'par_process:',par_process,' my group no is ',group_no
      call uflush(6)

      ! define group members for this group group_no 
      first_process = group_no*proc_per_group
      last_process  = min(par_np,first_process+proc_per_group) 

      group_size = last_process - first_process
      allocate(group_members(group_size))

      do n = 1,group_size
 
       group_members(n) = n + first_process 

      enddo 
      write(*,*) 'par_process:',par_process,' my group members are ',group_members(:) 
      call uflush(6)

      call par_group_create(comm,group_members,group_size,6)
      call MPI_Comm_size(comm,par_np,status)
      call MPI_Comm_rank(comm,par_process,status)
      comm_world = comm
      write(*,*) 'new par_process:',par_process,' par_np = ',par_np
      call uflush(6)

      call change_input_output_filenames(group_no)

      call uflush(6)

      end subroutine sub_divide_worldgroup




end module par_functions_module

