      subroutine cczgemm2 (transa, transb, m, n, k, alpha, a, lda,
     &  b, ldb, beta, c, ldc)

c     Emulate BLAS subroutine zgemm, but for mixed precision
c     This code is tuned for IBM POWER2, otherwise cczgemm is called.

      implicit none
      character*1 transa, transb
      integer m, n, k, lda, ldb, ldc
c     a is (m,k), b is (k,n), c is (m,n):
      complex*8 a(lda,k), b(ldc,n)
      complex*16 c(ldb,n)
      complex*16 alpha, beta

#ifdef IBM

c     Local vars
      logical lsame
      external lsame
      complex*16 btmp
      integer m1, n1, k1
      real*8 h1r,h1i,h2r,h2i,h3r,h3i,h4r,h4i
      integer m0
      m0 = mod(m,4)

c     For the purpose of simplicity assume the following restrictions:
c     transa='C', transb='N'
c     alpha=1.0, beta=0.0

      if (lsame(transa,'N') .or. lsame(transb,'C') .or.
     &  alpha .ne. (1.0d0,0.0d0) .or. beta .ne. (0.0d0,0.0d0)) then
c       Call the generic subroutine
        call cczgemm (transa, transb, m, n, k, alpha, a, lda,
     &    b, ldb, beta, c, ldc)
        return
      endif

c     Multiply the complex*WF_PRECISION by complex*16 matrices

      do 290 n1 = 1, n

c       This matrix-multiply coding has been optimized for IBM POWER2 CPUs
c       with the IBM XLF version 3.2 Fortran compiler.  The technique is
c       to unroll the inner loop, and convert explicitly complex to
c       real arithmetic (stupid optimizer).  This code gets 90-95%
c       of peak performance, provided the array-blocks fit inside cache.
c       /OHN, 13-Sep-1996.

c       First the remainder part:
        do 210 m1 = 1, m0
          h1r=0.0d0
          h1i=0.0d0
          do 200 k1 = 1, k
            h1r=h1r +  real(a(k1,m1  ))* real(b(k1,n1))
     &              +  imag(a(k1,m1  ))* imag(b(k1,n1))
            h1i=h1i -  imag(a(k1,m1  ))* real(b(k1,n1))
     &              +  real(a(k1,m1  ))* imag(b(k1,n1))
200       continue
          c(m1,n1) = dcmplx(h1r,h1i)
210     continue

c       Now the inner loop unrolled by 4:
        do 230 m1 = m0 + 1, m, 4
          h1r=0.0d0
          h1i=0.0d0
          h2r=0.0d0
          h2i=0.0d0
          h3r=0.0d0
          h3i=0.0d0
          h4r=0.0d0
          h4i=0.0d0
          do 220 k1 = 1, k
            h1r=h1r +  real(a(k1,m1  ))* real(b(k1,n1))
     &              +  imag(a(k1,m1  ))* imag(b(k1,n1))
            h1i=h1i -  imag(a(k1,m1  ))* real(b(k1,n1))
     &              +  real(a(k1,m1  ))* imag(b(k1,n1))
            h2r=h2r +  real(a(k1,m1+1))* real(b(k1,n1))
     &              +  imag(a(k1,m1+1))* imag(b(k1,n1))
            h2i=h2i -  imag(a(k1,m1+1))* real(b(k1,n1))
     &              +  real(a(k1,m1+1))* imag(b(k1,n1))
            h3r=h3r +  real(a(k1,m1+2))* real(b(k1,n1))
     &              +  imag(a(k1,m1+2))* imag(b(k1,n1))
            h3i=h3i -  imag(a(k1,m1+2))* real(b(k1,n1))
     &              +  real(a(k1,m1+2))* imag(b(k1,n1))
            h4r=h4r +  real(a(k1,m1+3))* real(b(k1,n1))
     &              +  imag(a(k1,m1+3))* imag(b(k1,n1))
            h4i=h4i -  imag(a(k1,m1+3))* real(b(k1,n1))
     &              +  real(a(k1,m1+3))* imag(b(k1,n1))
220       continue
          c(m1+0,n1) = dcmplx(h1r,h1i)
          c(m1+1,n1) = dcmplx(h2r,h2i)
          c(m1+2,n1) = dcmplx(h3r,h3i)
          c(m1+3,n1) = dcmplx(h4r,h4i)
230     continue

290   continue

#else
c     Non-IBM machines: Call the generic subroutine
      call cczgemm (transa, transb, m, n, k, alpha, a, lda,
     &  b, ldb, beta, c, ldc)
#endif

      return
      end
