#include "definitions.h"
      subroutine par_defwf (nrplwv,nrplwv_global,&
#include PARAL_ARGS
        )

!     Determine the size of the local (per-processor) slice of a
!     single wavefunction band.
!
!     Input:  nrplwv_global ... Global size of a wavefunction array
!                               (max. number of plane waves).
!             par_pw_np ....... Divisor used for the split; declared by
!                               PARAL_DECL.  par_distribute sets it to
!                               the number of processors per plane-wave
!                               group.
!     Output: nrplwv .......... Array size of the partial wavefunction
!                               on the local processor.
!
!     Every processor gets the same slice size, and the slices together
!     must be able to hold the whole global array:
!
!     Processor:      0          1        ...   (par_pw_np-1)
!     Array:     |  nrplwv  |  nrplwv  |  ...  |  nrplwv  |
!     Global:    |           nrplwv_global                |
!
!     For nrplwv_global >= 1 the result is nrplwv_global / par_pw_np
!     rounded up to the next integer.

      implicit none
      integer :: nrplwv, nrplwv_global
#include PARAL_DECL

!     Local variables
      integer :: whole_slices_below

!     Integer (truncating) division of "one less than the global size",
!     then add one: this rounds up without overshooting when the global
!     size is an exact multiple of par_pw_np.
      whole_slices_below = (nrplwv_global - 1) / par_pw_np
      nrplwv = whole_slices_below + 1

      end subroutine par_defwf

!----------------------------------------------------------------------

      subroutine par_defwfk (nrplwv, nrplwv_global,nplwkp, nplocal, &
                             offsets, num,&
#include PARAL_ARGS
           , nconso)

!     Split the plane waves of one k-point into par_pw_np contiguous
!     pieces, one per processor.
!
!     Input:  nrplwv ......... Size of the LOCAL wavefunction array.
!             nrplwv_global .. Not referenced in this routine.
!             nplwkp ......... Number of plane waves to distribute.
!             nconso ......... Output unit for the error message.
!     Output: nplocal(i) ..... Number of plane waves given to
!                              processor i (i = 1..par_pw_np).
!             offsets(i) ..... Zero-based start of processor i's piece
!                              in the global array (sum of the counts
!                              of all preceding processors).
!             num ............ The count belonging to this processor,
!                              i.e. nplocal(par_process+1).
!
!     The split is as even as possible: every processor receives
!     nplwkp/par_pw_np plane waves and the first MOD(nplwkp,par_pw_np)
!     processors receive one more.  Processor 1 therefore always holds
!     the largest piece, so only nplocal(1) has to be compared with the
!     local array size nrplwv.  If it does not fit, an error is written
!     to nconso and clexit is called.

      implicit none

      integer :: nplwkp, nrplwv, nrplwv_global
      integer :: nplocal(*), offsets(*)
      integer :: num, nconso
#include PARAL_DECL

!     Local variables
      integer :: iproc, base_count, n_extra, running_total

      base_count = nplwkp / par_pw_np
      n_extra    = mod (nplwkp, par_pw_np)

!     Counts and offsets in a single sweep over the processors
      running_total = 0
      do iproc = 1, par_pw_np
        if (iproc .le. n_extra) then
          nplocal(iproc) = base_count + 1
        else
          nplocal(iproc) = base_count
        endif
        offsets(iproc) = running_total
        running_total  = running_total + nplocal(iproc)
      end do

!     The largest local piece must fit into the local array
      if (nplocal(1) .gt. nrplwv) then
        write (nconso,*) 'par_defwfk: ERROR: exceeding local arrays:',&
          nplocal(1), nrplwv
        call clexit (nconso)
      endif

!     par_process is zero-based, the arrays are one-based
      num = nplocal(par_process+1)

      end subroutine par_defwfk
 
!--------------------------------------------------------------------------

      subroutine par_distribute (nkprun, nkpmem, kphost, ldonkp,&
#include PARAL_ARGS
        ,idebug)

!     Define the distribution of k-points to groups of processors.
!     The plane waves of a given k-point may be parallelized across
!     several processors, hence the processors are divided into
!     "groups".
!
!     Two types of groups are defined :
!
!     plane-wave groups :
!       A plane-wave group treats all plane waves for one k-point
!
!     k-point group :
!       A k-point group treats all k-points for a number of plane-waves.
!
!     E.g., 6 processors divided into 2 groups treat 4 k-points:
!
!     plw. Grp-no:   Planewaves:  1 .. n1     n1+1..n2    n2+1..N
!
!           1      K-point 1  |     1     |     2     |     3     |
!           1      K-point 2  |     1     |     2     |     3     |
!           2      K-point 3  |     4     |     5     |     6     |
!           2      K-point 4  |     4     |     5     |     6     |
!
!           kp group no             1           2           3
!
!     In this example each plane-wave group holds 2 k-points:
!         nodes 1 2 3 : plw. group no 1
!         nodes 4 5 6 : plw. group no 2
!
!         nodes 1 4   : k-point group no 1
!         nodes 2 5   : k-point group no 2
!         nodes 3 6   : k-point group no 3
!
!     Using the charge density as an example, an mpi_allreduce on
!     plw. group 1 is first used to add the charge density on nodes
!     1, 2 and 3 for k-point 1, and likewise for k-point 2.  Summing
!     the contributions of k-points 1 and 2 on each node then leaves
!     nodes 1 2 3 all holding the total charge density for k-points
!     1 and 2.
!
!     The same procedure is followed on plw. group 2, so that nodes
!     4 5 6 hold the total charge density for k-points 3 and 4.
!
!     Finally an mpi_allreduce on kp groups 1 2 3 adds the charge
!     density for k-points 1 and 2 to that for k-points 3 and 4, after
!     which all nodes hold the total charge density.
!
!     Input:  nkprun ......... The number of k-points.
!             idebug ......... Values > 0 enable extra diagnostics
!                              on unit nconso.
!     In/out: nkpmem ......... If > 1 on entry it is overwritten with
!                              the number of k-points per plw. group;
!                              otherwise left unchanged.
!     Output: kphost(k) ...... World rank (1-based) of the first
!                              processor of the plw. group that owns
!                              k-point k.
!             ldonkp(k) ...... .TRUE. if k-point k belongs to the plw.
!                              group of the calling processor.
!             par_comm_pw .... Passed to par_group_create for each
!                              plw. group.
!             par_comm_kp .... Passed to par_group_create for each
!                              kp group.
!             par_pw_np ...... Set to the number of processors per
!                              plw. group.
!             par_process .... Set to the rank returned by par_rank_pw
!                              minus one.
!             number_kpoints_per_process
!                              Set to the number of k-points per plw.
!                              group.
!
!     The routine calls clexit if par_np <= 1, if the group count
!     from par_match does not divide par_np, or if the calling
!     processor's world rank is not covered by any plw. group.

      use run_context
      use par_functions_module
      implicit none
      integer nkprun,nkpmem,kphost(nkprun),idebug
      logical*4 ldonkp(nkprun)
#include PARAL_DECL

!     Local vars
!     my_groupno ..... plw. group number of this processor (0 = not
!                      yet determined)
      integer my_groupno
      integer i, k, group_no, irank,nkp,world_rank
      integer par_pw_groups, proc_per_pw_group, kpts_per_pw_group
      integer par_kp_groups, proc_per_kp_group
!     general array to hold group members
      integer group_mems(par_np)

      do nkp =1,nkprun
        ldonkp(nkp) = .FALSE.
      enddo

      write(nconso,*) 'Parallel :   --  parallel configuration -- ' 
!     Number of processors is "par_np"
      if (par_np .le. 1) then
        write (nconso, *) ' Parallel : no. processors too small: ',&
          par_np
        call clexit (nconso)
      endif

!     Compute an optimal number of processor-groups:
      call par_match (par_np, nkprun, par_pw_groups)

      if (par_pw_groups.le.0.or.mod (par_np, par_pw_groups).ne.0) then
!       The number of processors MUST be a multiple of the number of
!       groups, otherwise we give up !
        write (nconso, *)&
        ' Parallel : GIVE UP: par_groups does not divide par_np:',&
          par_pw_groups, par_np
        call clexit (nconso)
      endif

!     Print out information on selected distribution

      proc_per_pw_group = par_np / par_pw_groups
      kpts_per_pw_group = nkprun / par_pw_groups

!     set nkpmem to the number of k-points per processor if
!     nkpmem > 1
      if (nkpmem>1) nkpmem = kpts_per_pw_group

      number_kpoints_per_process = kpts_per_pw_group

      write (nconso,100) par_np, par_pw_groups
100   format (' Parallel : There are ', i2,&
        ' processors divided into ', i2, ' groups')

      write (nconso,110) proc_per_pw_group
      write (nconso,111) kpts_per_pw_group
110   format(' Parallel : Processors per group         : ',i2) 
111   format(' Parallel : k-points per processor group : ',i2)

      write (nconso,120) proc_per_pw_group
120   format (' Parallel : Each k-point is parallelized over ', &
         i2,' processors')

!     World rank of this processor.  It is compared below with the
!     1..par_np processor IDs stored in group_mems, so it is assumed
!     to be 1-based.
      call par_rank_world (irank,&
#include PARAL_ARGS
        ,nconso)

!     Create par_pw_groups plw. groups of processors and assign k-points

      my_groupno = 0
      do group_no = 1, par_pw_groups
!       Select proc_per_pw_group processors as members of this group:
!       the group_mems array contains processor-IDs (1..par_np)
        do i = 1, proc_per_pw_group
          group_mems(i) = (group_no - 1) * proc_per_pw_group + i
          if (irank .eq. group_mems(i)) my_groupno = group_no
        end do

!       Create a plw. group
        call par_group_create (par_comm_pw,&
                       group_mems, proc_per_pw_group,nconso)

!       Assign k-points to this processor group
!       kphost is an array of world rank numbers of the first process
!       (process 0) of the plw. group for the k-point.
        do k = 1, kpts_per_pw_group
         kphost((group_no-1)*kpts_per_pw_group+k) = &
            (group_no - 1) * proc_per_pw_group + 1
        end do
      end do

!     my_groupno is used to index kphost/ldonkp below; refuse to go on
!     with an undefined value if this rank was not found in any group.
      if (my_groupno .le. 0) then
        write (nconso, *)&
        ' Parallel : GIVE UP: rank is not in any plw. group:',&
          irank, par_np
        call clexit (nconso)
      endif

!     write out information for plw. groups created
      if (idebug.gt.0) then
        write(nconso,*) 'par_distribute: Distribution of k-points: ' 
      endif
      do k = 1,kpts_per_pw_group
!         Only the k-points of this processor's own group are examined
          group_no = my_groupno
          nkp = (group_no-1)*kpts_per_pw_group+k
!         find out if this process (irank) handles this k-point:
!         the group's members are the proc_per_pw_group consecutive
!         ranks starting at kphost(nkp), so the upper bound is
!         exclusive.
          if ((irank.ge.kphost(nkp)).and.&
              (irank.lt.(kphost(nkp)+proc_per_pw_group))) then
            if (idebug.gt.0) then
              write(nconso,130) nkp, my_groupno, irank
130           format('par_distribute: k-point ',i3,' on group ',i3,&
                     ' node = ',i2)
            endif
            ldonkp(nkp) = .TRUE.
          endif
      enddo

!     create kp groups: one per position inside a plw. group, each
!     containing the processor at that position in every plw. group
      par_kp_groups     = proc_per_pw_group
      proc_per_kp_group = par_np/par_kp_groups

      if (idebug.gt.0) then
        write(nconso,*) 'par_distribute: Number of kp groups ',&
                       par_kp_groups
        write(nconso,*) 'par_distribute:  proc. per kp-group ',&
                                  proc_per_kp_group
      endif

      do group_no = 1,par_kp_groups
        do i = 1,proc_per_kp_group
          group_mems(i) = (i-1)*par_kp_groups + group_no
          if (idebug.gt.0) then
            write(nconso,*) 'par_distribute: kp group ',group_no,i,&
                                      group_mems(i)
          endif
        enddo
!       create group
        call par_group_create (par_comm_kp,&
                      group_mems, proc_per_kp_group,nconso)
      enddo

      par_pw_np = proc_per_pw_group

!     determine process rank par_process (PARAL_ARGS)
!     defined as the rank in the pw group.  The previous value of
!     par_process is kept in world_rank for the debug output only.
      world_rank = par_process
      call par_rank_pw (irank,&
#include PARAL_ARGS
        ,nconso)
      par_process = irank - 1

      if (idebug.gt.0) then
        write(nconso,*) 'par_distribute: rank in word group ',&
         world_rank,&
       ' rank in pw group ',par_process,(ldonkp(nkp),nkp=1,nkprun)
      endif

!     kphost
      if (idebug.gt.0) then
        do nkp = 1,nkprun
          write(nconso,*) 'par_distribute: kphost ',nkp,kphost(nkp)
        enddo
      endif
      write(nconso,*) ' '
      call uflush(nconso)

      end subroutine par_distribute

!----------------------------------------------------------------------------


      subroutine par_match (par_np, nkprun, par_groups)

!     Compute the number of processor groups used to distribute
!     k-points: the Greatest Common Divisor (GCD) of the number of
!     processors and the number of k-points.
!
!     Input:  par_np ....... Number of processors.
!             nkprun ....... Number of k-points.
!     Output: par_groups ... GCD(par_np, nkprun) when both inputs are
!                            >= 1; 0 otherwise, so that a caller can
!                            detect that no valid grouping exists.
!
!     The GCD is the largest group count that divides both the
!     processors and the k-points evenly, i.e. it gives the strongest
!     preference to parallelization across k-points, which requires
!     much less communication than parallelization across plane waves.
!     If memory savings are required instead, parallelization across
!     plane waves should be preferred, which means a SMALLER value of
!     par_groups; par_groups = 1 gives the maximum memory savings
!     (see the commented-out assignment at the end).

      implicit none
      integer, intent(in)  :: par_np, nkprun
      integer, intent(out) :: par_groups

!     Local vars
      integer :: a, b, r

!     Without at least one processor and one k-point there is no
!     common divisor; always return a defined value.
      if (par_np .lt. 1 .or. nkprun .lt. 1) then
        par_groups = 0
        return
      endif

!     Euclid's algorithm
      a = par_np
      b = nkprun
      do while (b .ne. 0)
        r = mod (a, b)
        a = b
        b = r
      end do

      par_groups = a
!     Maximum memory savings:
!     par_groups = 1

      end subroutine par_match
