/********************************************************************

"THIS SOFTWARE IS SUBJECT TO COPYRIGHT PROTECTION AND IS OFFERED ONLY
PURSUANT TO THE 3DFX FXT1 GENERAL PUBLIC LICENSE. A COPY OF THIS
LICENSE MAY BE OBTAINED FROM THE DISTRIBUTOR OR BY CONTACTING 3DFX
INTERACTIVE INC.  

TO THE EXTENT PERMITTED BY APPLICABLE LAW, THERE IS NO WARRANTY FOR
THIS PROGRAM. THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THIS
PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR
IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK
AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD
THIS PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY
SERVICING, REPAIR OR CORRECTION.

IN NO EVENT, UNLESS REQUIRED BY APPLICABLE LAW, WILL 3DFX INTERACTIVE,
INC., OR ANY OTHER COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
AND/OR REDISTRIBUTE THIS PROGRAM OR DERIVATIVE WORKS AS PERMITTED
ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL,
INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR
INABILITY TO USE THIS PROGRAM OR DERIVATIVE WORKS (INCLUDING BUT NOT
LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES
SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THIS PROGRAM OR
DERIVATIVE WORKS TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH
HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
DAMAGES.

USE, DUPLICATION OR DISCLOSURE BY THE GOVERNMENT IS SUBJECT TO
RESTRICTIONS AS SET FORTH IN SUBDIVISION (C)(1)(II) OF THE RIGHTS IN
TECHNICAL DATA AND COMPUTER SOFTWARE CLAUSE AT DFARS 252.227-7013,
AND/OR IN SIMILAR OR SUCCESSOR CLAUSES IN THE FAR, DOD OR NASA FAR
SUPPLEMENT. UNPUBLISHED RIGHTS RESERVED UNDER THE COPYRIGHT LAWS OF
THE UNITED STATES.  COPYRIGHT 3DFX INTERACTIVE, INC. 1999, ALL RIGHTS
RESERVED"

********************************************************************/

/*
** COMMONPB.C
**
** 9/02/99 rufus@3dfx.com
** Original release cleanup.
*/

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#ifdef __MSC__
#include <io.h>
#endif
#include <string.h>
#include <math.h>

#include "3dfxpb.h"
#include "fximgpb.h"

#include <assert.h>
#include "comppb.h"

/*
 * Build the 4-entry palette used by one 4x4 sub-block.
 *
 * alpha == 0: four opaque entries; entry i is, per color channel,
 *             ((3 - i) * lo + i * hi + 1) / 3.
 * alpha != 0: three opaque entries; entry i is ((2 - i) * lo + i * hi) / 2,
 *             and entry 3 is set to zero in all four channels.
 *
 * colors[i][0..2] receive the color channels, colors[i][3] the alpha value
 * (255 for every interpolated entry).
 */
void
FillPallette4(int alpha, FxI32 colors[4][4], FxI32 lo[3], FxI32 hi[3])
{
  int entry, c;
  int steps = (alpha == 0) ? 3 : 2;   /* number of intervals between lo and hi */
  int bias  = (alpha == 0) ? 1 : 0;   /* rounding term used by the 4-color form */

  for (entry = 0; entry <= steps; entry++) {
    for (c = 0; c < 3; c++) {
      colors[entry][c] =
        ((steps - entry) * lo[c] + entry * hi[c] + bias) / steps;
    }
    colors[entry][3] = 255;
  }

  if (alpha != 0) {
    /* the last entry is the all-zero texel */
    for (c = 0; c < 4; c++) {
      colors[3][c] = 0;
    }
  }
}

/*
 * Build the 8-entry palette used by a block with 3-bit texel indices.
 *
 * Entries 0..6 are opaque (alpha 255) and step from lo to hi: entry i is,
 * per color channel, ((6 - i) * lo + i * hi + 2) / 6.  Entry 7 is zero in
 * all four channels.
 */
void
FillPallette8(FxI32 colors[8][4], FxI32 lo[3], FxI32 hi[3])
{
  int entry, c;

  for (c = 0; c < 3; c++) {
    for (entry = 0; entry < 7; entry++) {
      colors[entry][c] = ((6 - entry) * lo[c] + entry * hi[c] + 2) / 6;
    }
    colors[7][c] = 0;
  }

  for (entry = 0; entry < 7; entry++) {
    colors[entry][3] = 255;
  }
  colors[7][3] = 0;
}

/*
 * Bit offsets into a CCBlock, indexed by 4x4 sub-block number (0 or 1):
 *   colorBitsMIXED - first color bit of the sub-block in a CC_MIXED block
 *   texelBitsMIXED - first texel-index bit of the sub-block in a CC_MIXED block
 *   texelBitsALPHA - first texel-index bit of the sub-block in a CC_ALPHA block
 */
int colorBitsMIXED[2] = {CC_MIXED_COLOR_0, CC_MIXED_COLOR_2};
int texelBitsMIXED[2] = {CC_MIXED_TEXELS_LO, CC_MIXED_TEXELS_HI};
int texelBitsALPHA[2] = {CC_ALPHA_TEXELS_LO, CC_ALPHA_TEXELS_HI};

/* Left shift that places channel 0..3 of a texel inside a packed pixel. */
int channelShift[4] = {0, 8, 16, 24};

/*
 * Signature shared by the block decoders in this file: decode blockIn into
 * pixelsOut (an image with `width` pixels per row) at position (x, y).
 */
typedef void (*DecompFun)(FxU32 width, int x, int y,
                          FxI32 *pixelsOut,
                          CCBlock blockIn);

/*
 * Decode a CC_MIXED block into the 8x4 pixel region of pixelsOut whose
 * top-left corner is (x, y); width is the number of pixels per image row.
 *
 * The block is handled as two 4x4 sub-blocks (columns 0-3, then 4-7).
 * Each sub-block has two endpoint colors stored with 5 bits per channel,
 * one green-LSB bit, and sixteen 2-bit texel indices into the palette
 * produced by FillPallette4().  The block-wide alpha bit is passed on to
 * FillPallette4() and selects the palette layout.
 *
 * Pixels are updated channel by channel using channelShift[].  The
 * arithmetic is done in FxU32 because shifting 255 (or a channel value
 * with bit 7 set) left by 24 in a signed int is undefined behaviour.
 */
void
Decomp4x8CC_Mixed(FxU32 width, int x, int y,
                  FxI32 *pixelsOut,
                  CCBlock blockIn)
{
  int i, j, block, chan;
  int bx;                 /* x offset of the current sub-block: 0 or 4 */
  int glsb;
  int alpha;
  FxI32 lo[3], hi[3];
  FxI32 colors[4][4];
  int index;

  alpha = CC_GETBIT(blockIn, CC_MIXED_ALPHA);

  for (block = 0, bx = 0; block < 2; block++, bx += 4) {

    if (block == 0) {
      glsb = CC_GETBIT(blockIn, CC_MIXED_GLSB_0);
    } else {
      glsb = CC_GETBIT(blockIn, CC_MIXED_GLSB_1);
    }

    /* Gather each 5-bit endpoint channel into bits 3..7, then copy its
       top three bits into bits 0..2. */
    for (chan = 0; chan < 3; chan++) {
      lo[chan] = hi[chan] = 0;
      for (i = 0; i < 5; i++) {
        lo[chan] |=
          CC_GETBIT(blockIn, colorBitsMIXED[block] + 5 * chan + 15 + i)
            << (i + 3);
        hi[chan] |=
          CC_GETBIT(blockIn, colorBitsMIXED[block] + 5 * chan +  0 + i)
            << (i + 3);
      }
      lo[chan] = lo[chan] | (lo[chan] >> 5);
      hi[chan] = hi[chan] | (hi[chan] >> 5);
    }

    if (alpha == 0) {
      /* lo green becomes 6 bits wide: the extra bit is glsb xor the
         second bit of the sub-block's texel field. */
      lo[1] &= 0xf8;
      lo[1] |= (glsb ^ CC_GETBIT(blockIn, texelBitsMIXED[block] + 1)) << 2;
      lo[1] |= lo[1] >> 6;
    }

    /* hi green is always 6 bits wide, with glsb as the extra bit */
    hi[1] &= 0xf8;
    hi[1] |= glsb << 2;
    hi[1] |= hi[1] >> 6;

    FillPallette4(alpha, colors, lo, hi);

    for (i = 0; i < 4; i++) {
      for (j = 0; j < 4; j++) {
        FxI32 *dst = &pixelsOut[(y + i) * width + x + bx + j];
        FxU32 pixel = (FxU32)*dst;

        index =
          (CC_GETBIT(blockIn,
                     texelBitsMIXED[block] + 8 * i + 2 * j + 0) << 0) |
          (CC_GETBIT(blockIn,
                     texelBitsMIXED[block] + 8 * i + 2 * j + 1) << 1);

        for (chan = 0; chan < 4; chan++) {
          /* clear channel, then or in the palette value */
          pixel &= ~((FxU32)255 << channelShift[chan]);
          pixel |= (FxU32)colors[index][chan] << channelShift[chan];
        }
        *dst = (FxI32)pixel;
      }
    }
  }
}

/*
 * Decode a CC_HI block into the 8x4 pixel region of pixelsOut whose
 * top-left corner is (x, y); width is the number of pixels per image row.
 *
 * The block has two endpoint colors stored with 5 bits per channel
 * (CC_HI_COLOR_0 and CC_HI_COLOR_1) and one 3-bit index per texel into
 * the palette produced by FillPallette8().  Index bits for columns 0-3
 * start at CC_HI_TEXELS, those for columns 4-7 start 48 bits later; each
 * row of four texels occupies 12 bits.
 *
 * Pixel updates are done in FxU32: shifting into bit 31 of a signed int
 * is undefined behaviour.
 */
void
Decomp4x8CC_HI(FxU32 width, int x, int y,
               FxI32 *pixelsOut,
               CCBlock blockIn)
{
  int i, j, chan;
  FxI32 lo[3], hi[3];
  FxI32 colors[8][4];
  int index;

  /* Gather each 5-bit endpoint channel into bits 3..7, then copy its top
     three bits into bits 0..2. */
  for (chan = 0; chan < 3; chan++) {
    lo[chan] = hi[chan] = 0;
    for (i = 0; i < 5; i++) {
      lo[chan] |=
        CC_GETBIT(blockIn, CC_HI_COLOR_0 + 5 * chan + i)
          << (i + 3);
      hi[chan] |=
        CC_GETBIT(blockIn, CC_HI_COLOR_1 + 5 * chan + i)
          << (i + 3);
    }
    lo[chan] = lo[chan] | (lo[chan] >> 5);
    hi[chan] = hi[chan] | (hi[chan] >> 5);
  }

  FillPallette8(colors, lo, hi);

  for (i = 0; i < 4; i++) {
    for (j = 0; j < 8; j++) {
      int blockOffset = (j >= 4) ? 48 : 0;
      int texOffset = 12 * i + 3 * (j & 3);
      int offset = CC_HI_TEXELS + blockOffset + texOffset;
      FxI32 *dst = &pixelsOut[(y + i) * width + x + j];
      FxU32 pixel = (FxU32)*dst;

      index =
        (CC_GETBIT(blockIn, offset + 0) << 0) |
        (CC_GETBIT(blockIn, offset + 1) << 1) |
        (CC_GETBIT(blockIn, offset + 2) << 2);

      for (chan = 0; chan < 4; chan++) {
        /* clear channel, then or in the palette value */
        pixel &= ~((FxU32)255 << channelShift[chan]);
        pixel |= (FxU32)colors[index][chan] << channelShift[chan];
      }
      *dst = (FxI32)pixel;
    }
  }
}

/*
 * Decode a CC_CHROMA block into the 8x4 pixel region of pixelsOut whose
 * top-left corner is (x, y); width is the number of pixels per image row.
 *
 * The block stores four colors with 5 bits per channel
 * (CC_CHROMA_COLOR_0..3), all decoded with alpha 255, and one 2-bit index
 * per texel selecting among them.  Index bits for columns 0-3 start at
 * CC_CHROMA_TEXELS, those for columns 4-7 start 32 bits later; each row
 * of four texels occupies 8 bits.
 *
 * Pixel updates are done in FxU32: shifting into bit 31 of a signed int
 * is undefined behaviour.
 */
void
Decomp4x8CC_Chroma(FxU32 width, int x, int y,
                   FxI32 *pixelsOut,
                   CCBlock blockIn)
{
  int i, j, chan;
  int index;
  FxI32 colorVectors[4][4];

  for (chan = 0; chan < 3; chan++) {
    for (i = 0; i < 4; i++) {
      colorVectors[i][chan] = 0;
    }
    /* gather the 5 stored bits of each color into bits 3..7 */
    for (i = 0; i < 5; i++) {
      colorVectors[0][chan] |=
        CC_GETBIT(blockIn,
                  CC_CHROMA_COLOR_0 + 5 * chan + i) << (i + 3);
      colorVectors[1][chan] |=
        CC_GETBIT(blockIn,
                  CC_CHROMA_COLOR_1 + 5 * chan + i) << (i + 3);
      colorVectors[2][chan] |=
        CC_GETBIT(blockIn,
                  CC_CHROMA_COLOR_2 + 5 * chan + i) << (i + 3);
      colorVectors[3][chan] |=
        CC_GETBIT(blockIn,
                  CC_CHROMA_COLOR_3 + 5 * chan + i) << (i + 3);
    }
    /* copy the top three bits into bits 0..2 */
    for (i = 0; i < 4; i++) {
      colorVectors[i][chan] |= colorVectors[i][chan] >> 5;
    }
  }

  /* every color is opaque */
  for (i = 0; i < 4; i++) {
    colorVectors[i][3] = 255;
  }

  for (i = 0; i < 4; i++) {
    for (j = 0; j < 8; j++) {
      int blockOffset = (j >= 4) ? 32 : 0;
      int texOffset = 8 * i + 2 * (j & 3);
      int offset = CC_CHROMA_TEXELS + blockOffset + texOffset;
      FxI32 *dst = &pixelsOut[(y + i) * width + x + j];
      FxU32 pixel = (FxU32)*dst;

      index =
        (CC_GETBIT(blockIn, offset + 0) << 0) |
        (CC_GETBIT(blockIn, offset + 1) << 1);

      for (chan = 0; chan < 4; chan++) {
        /* clear channel, then or in the color value */
        pixel &= ~((FxU32)255 << channelShift[chan]);
        pixel |= (FxU32)colorVectors[index][chan] << channelShift[chan];
      }
      *dst = (FxI32)pixel;
    }
  }
}

/*
 * Decode a CC_ALPHA block into the 8x4 pixel region of pixelsOut whose
 * top-left corner is (x, y); width is the number of pixels per image row.
 *
 * The block stores three RGBA vectors with 5 bits per channel (colors at
 * CC_ALPHA_COLOR_0..2, alphas at CC_ALPHA_ALPHA_0..2) and one 2-bit index
 * per texel.
 *
 * lerp == 0: the palette is the three vectors followed by an all-zero
 *            entry, shared by all 32 texels.
 * lerp != 0: each 4x4 sub-block gets its own 4-entry palette,
 *            ((3 - i) * from + i * vector1 + 1) / 3 in every channel,
 *            where `from` is vector 0 for columns 0-3 and vector 2 for
 *            columns 4-7.
 *
 * Pixel updates are done in FxU32: shifting into bit 31 of a signed int
 * is undefined behaviour.
 */
void
Decomp4x8CC_Alpha(FxU32 width, int x, int y,
                  FxI32 *pixelsOut,
                  CCBlock blockIn)
{
  int i, j, block, chan;
  int lerp, bx;
  FxI32 colors[4][4];
  int index;
  FxI32 colorVectors[3][4];

  lerp = CC_GETBIT(blockIn, CC_ALPHA_LERP);

  for (chan = 0; chan < 4; chan++) {
    for (i = 0; i < 3; i++) {
      colorVectors[i][chan] = 0;
    }
  }

  for (chan = 0; chan < 4; chan++) {
    if (chan < 3) {
      /* color channels */
      for (i = 0; i < 5; i++) {
        colorVectors[0][chan] |=
          CC_GETBIT(blockIn, CC_ALPHA_COLOR_0 + 5*chan+i) << (i + 3);
        colorVectors[1][chan] |=
          CC_GETBIT(blockIn, CC_ALPHA_COLOR_1 + 5*chan+i) << (i + 3);
        colorVectors[2][chan] |=
          CC_GETBIT(blockIn, CC_ALPHA_COLOR_2 + 5*chan+i) << (i + 3);
      }
    } else {
      /* alpha channel is stored separately from the colors */
      for (i = 0; i < 5; i++) {
        colorVectors[0][3] |=
          CC_GETBIT(blockIn, CC_ALPHA_ALPHA_0 + i) << (i + 3);
        colorVectors[1][3] |=
          CC_GETBIT(blockIn, CC_ALPHA_ALPHA_1 + i) << (i + 3);
        colorVectors[2][3] |=
          CC_GETBIT(blockIn, CC_ALPHA_ALPHA_2 + i) << (i + 3);
      }
    }
    /* copy the top three bits into bits 0..2 */
    for (i = 0; i < 3; i++) {
      colorVectors[i][chan] |= colorVectors[i][chan] >> 5;
    }
  }

  if (lerp == 0) {
    for (i = 0; i < 4; i++) {
      for (chan = 0; chan < 4; chan++) {
        if (i != 3) colors[i][chan] = colorVectors[i][chan];
        else colors[i][chan] = 0;
      }
    }

    for (i = 0; i < 4; i++) {
      for (j = 0; j < 8; j++) {
        int blockOffset = (j >= 4)? 32 : 0;
        int texOffset = 8 * i + 2 * (j & 3);
        int offset = CC_ALPHA_TEXELS + blockOffset + texOffset;
        FxI32 *dst = &pixelsOut[(y + i) * width + x + j];
        FxU32 pixel = (FxU32)*dst;

        index = (CC_GETBIT(blockIn, offset + 0) << 0) |
          (CC_GETBIT(blockIn, offset + 1) << 1);

        for (chan = 0; chan < 4; chan++) {
          /* clear channel, then or in the palette value */
          pixel &= ~((FxU32)255 << channelShift[chan]);
          pixel |= (FxU32)colors[index][chan] << channelShift[chan];
        }
        *dst = (FxI32)pixel;
      }
    }
  } else {
    for (block = 0, bx = 0; block < 2; block++, bx += 4) {
      /* sub-block 0 interpolates from vector 0, sub-block 1 from vector 2;
         both interpolate towards vector 1 */
      FxI32 *from = (block == 0) ? colorVectors[0] : colorVectors[2];

      for (i = 0; i < 4; i++) {
        for (chan = 0; chan < 4; chan++) {
          colors[i][chan] =
            ((3 - i) * from[chan] +
             i * colorVectors[1][chan] + 1) / 3;
        }
      }

      for (i = 0; i < 4; i++) {
        for (j = 0; j < 4; j++) {
          int blockOffset = texelBitsALPHA[block];
          int texOffset = 8 * i + 2 * j;
          int offset = blockOffset + texOffset;
          FxI32 *dst = &pixelsOut[(y + i) * width + x + bx + j];
          FxU32 pixel = (FxU32)*dst;

          index = (CC_GETBIT(blockIn, offset + 0) << 0) |
            (CC_GETBIT(blockIn, offset + 1) << 1);

          for (chan = 0; chan < 4; chan++) {
            /* clear channel, then or in the palette value */
            pixel &= ~((FxU32)255 << channelShift[chan]);
            pixel |= (FxU32)colors[index][chan] << channelShift[chan];
          }
          *dst = (FxI32)pixel;
        }
      }
    }
  }
}

/*
 * Decoder dispatch table used by DecodeCCBlock():
 *   [0] mixed, [1] hi, [2] chroma, [3] alpha, followed by a 0 terminator.
 */
DecompFun DecompressionFunctions[] =
{
  Decomp4x8CC_Mixed,
  Decomp4x8CC_HI,
  Decomp4x8CC_Chroma,
  Decomp4x8CC_Alpha,
  0};

/*
 * Decode one compressed block into pixelsOut at (x, y).
 *
 * The decoder is chosen from DecompressionFunctions[] by the mode bits:
 *   CC_MIXED_MODE set                       -> [0] mixed
 *   else CC_HI_MODE_0 clear                 -> [1] hi
 *   else CC_CHROMA_MODE_0 clear             -> [2] chroma
 *   else                                    -> [3] alpha
 */
void
DecodeCCBlock(FxU32 width, int x, int y,
              FxI32 *pixelsOut,
              CCBlock blockIn)
{
  int mixedBit  = CC_GETBIT(blockIn, CC_MIXED_MODE);
  int hiBit     = CC_GETBIT(blockIn, CC_HI_MODE_0);
  int chromaBit = CC_GETBIT(blockIn, CC_CHROMA_MODE_0);
  int decoder;

  if (mixedBit) {
    decoder = 0;            /* mixed mode = "1" */
  } else if (hiBit == 0) {
    decoder = 1;            /* cc-high mode = "00" */
  } else if (chromaBit == 0) {
    decoder = 2;            /* cc-chroma mode = "010" */
  } else {
    decoder = 3;            /* alpha mode = "011" */
  }

  DecompressionFunctions[decoder](width, x, y, pixelsOut, blockIn);
}

/* decode section */

/*
 * Read compressed blocks from inf and decode them into info->any.data.
 *
 * Blocks are read in row-major order, one per 8x4 pixel tile; only tiles
 * that fit entirely inside info->any.width x info->any.height are
 * processed.  info->any.data is treated as an array of FxI32 pixels with
 * info->any.width pixels per row.
 *
 * If the stream ends (or a read error occurs) before all tiles have been
 * read, decoding stops at that tile and the remaining pixels are left
 * untouched, rather than decoding an unread block.  A diagnostic is
 * written to stderr when verbose is non-zero.
 */
void
decode4x8(ImgInfo *info, FILE *inf, int verbose)
{
  FxU32 x, y, xend, yend;
  int w;
  FxI32 *pixels;
  CCBlock ccblock;

  xend = info->any.width;
  yend = info->any.height;

  pixels = (FxI32 *)info->any.data;
  w = info->any.width;

  for (y = 0; y + 3 < yend; y += 4) {
    for (x = 0; x + 7 < xend; x += 8) {
      if (fread(&ccblock, sizeof(ccblock), 1, inf) != 1) {
        /* short read: ccblock does not hold a valid block */
        if (verbose) {
          fprintf(stderr, "decode4x8: short read at x=%lu, y=%lu\n",
                  (unsigned long)x, (unsigned long)y);
        }
        return;
      }
      DecodeCCBlock(w, x, y, pixels, ccblock);
    }
  }
}

/* encode section */

/* Extract one 8-bit field of a packed value: bits 16-23, 8-15 and 0-7. */
#define RED(x)   (((x) >> 16) & 0xff)
#define GREEN(x) (((x) >> 8) & 0xff)
#define BLUE(x)  ((x) & 0xff)

/*
 * Compute the per-channel minimum and maximum over n pixels.
 *
 * minPix starts at 255 and maxPix at 0 for every channel, so with n == 0
 * those starting values are returned unchanged.
 */
void
findMinMax(int n,
           FxI32 pix[][4],
           FxI32 minPix[4],
           FxI32 maxPix[4]
           )
{
  int c, p;

  for (c = 0; c < 4; c++) {
    minPix[c] = 255;
    maxPix[c] = 0;
  }

  for (p = 0; p < n; p++) {
    const FxI32 *texel = pix[p];
    for (c = 0; c < 4; c++) {
      FxI32 v = texel[c];
      if (v < minPix[c]) {
        minPix[c] = v;
      }
      if (v > maxPix[c]) {
        maxPix[c] = v;
      }
    }
  }
}

/* Squared distance between two 4-channel colors: sum of (a[c] - b[c])^2. */
FxI32
cdiff2(FxI32 *a, FxI32 *b)
{
  FxI32 sum = 0;
  int c = 0;

  while (c < 4) {
    FxI32 delta = a[c] - b[c];
    sum += delta * delta;
    c++;
  }
  return sum;
}


/*
 * Sum of squared differences between blocks a and b on one channel,
 * taken over n pixels.  The sum is accumulated as an FxI32 and returned
 * as a double; no square root or averaging is applied.
 */
double
rmsErrorBlockChannel(int n, int chan, FxI32 a[][4], FxI32 b[][4])
{
  FxI32 sum = 0;
  int p;

  for (p = 0; p < n; p++) {
    FxI32 delta = a[p][chan] - b[p][chan];
    sum += delta * delta;
  }
  return sum;
}

/*
 * Sum of squared differences between blocks a and b over n pixels and
 * all four channels (the per-channel sums from rmsErrorBlockChannel()).
 */
double
rmsErrorBlock(int n, FxI32 a[][4], FxI32 b[][4])
{
  double total = 0;
  int chan = 0;

  while (chan < 4) {
    total += rmsErrorBlockChannel(n, chan, a, b);
    chan++;
  }
  return total;
}

/*
 * Sum of squared per-byte differences between images a and b over the
 * 8x4 pixel region whose top-left corner is (x, y).  w is the number of
 * pixels per row; every pixel is compared as four 8-bit fields.
 */
double
rmsImgBlock(FxU32 w, int x, int y,
            FxI32 *a, FxI32 *b)
{
  double total = 0.0;
  int half, row, col, bit;

  /* the region is walked as two 4x4 halves, left then right */
  for (half = 0; half < 2; half++) {
    int xoff = 4 * half;
    for (row = 0; row < 4; row++) {
      for (col = 0; col < 4; col++) {
        int index = (y + row) * w + x + col + xoff;
        double texel = 0;
        for (bit = 0; bit < 32; bit += 8) {
          double d = ((a[index] >> bit) & 0xff) - ((b[index] >> bit) & 0xff);
          texel += d * d;
        }
        total += texel;
      }
    }
  }
  return total;
}

/*
 * Order channels 0..2 by decreasing variance over the n pixels of a.
 * On return sortedChans[0] is the channel with the largest variance.
 * Channels with equal variance keep their relative position as produced
 * by the fixed compare sequence (0,1), (0,2), (1,2).
 */
void
sort3ChannelsByVariance(int n, FxI32 a[][4], int sortedChans[3])
{
  double variance[3];
  double teenth = 1.0 / n;
  int chan, first, second;

  for (chan = 0; chan < 3; chan++) {
    FxI32 sx = 0, sx2 = 0;
    int i;

    for (i = 0; i < n; i++) {
      FxI32 t = a[i][chan];
      sx += t;
      sx2 += t * t;
    }
    sortedChans[chan] = chan;
    /* E[x^2] - E[x]^2 */
    variance[chan] = sx2 * teenth - sx * sx * teenth * teenth;
  }

  /* compare-exchange so that larger variances move to the front */
  for (first = 0; first < 2; first++) {
    for (second = first + 1; second < 3; second++) {
      if (variance[first] < variance[second]) {
        double tv = variance[first];
        int tc = sortedChans[first];

        variance[first] = variance[second];
        variance[second] = tv;
        sortedChans[first] = sortedChans[second];
        sortedChans[second] = tc;
      }
    }
  }
}

/*
 * Order channels 0..3 by decreasing variance over the n pixels of a.
 * On return sortedChans[0] is the channel with the largest variance.
 * The compare sequence is (0,1), (0,2), (0,3), (1,2), (1,3), (2,3).
 */
void
sort4ChannelsByVariance(int n, FxI32 a[][4], int sortedChans[4])
{
  double variance[4];
  double teenth = 1.0 / n;
  int chan, first, second;

  for (chan = 0; chan < 4; chan++) {
    FxI32 sx = 0, sx2 = 0;
    int i;

    for (i = 0; i < n; i++) {
      FxI32 t = a[i][chan];
      sx += t;
      sx2 += t * t;
    }
    sortedChans[chan] = chan;
    /* E[x^2] - E[x]^2 */
    variance[chan] = sx2 * teenth - sx * sx * teenth * teenth;
  }

  /* compare-exchange so that larger variances move to the front */
  for (first = 0; first < 3; first++) {
    for (second = first + 1; second < 4; second++) {
      if (variance[first] < variance[second]) {
        double tv = variance[first];
        int tc = sortedChans[first];

        variance[first] = variance[second];
        variance[second] = tv;
        sortedChans[first] = sortedChans[second];
        sortedChans[second] = tc;
      }
    }
  }
}

/*
 * Expand the quantized endpoints Lo/Hi to full 8-bit values in lo/hi by
 * copying each value's top bits into its low bits.
 *
 * Channels 0 and 2 must have only their top 5 bits set and are extended
 * with a shift of 5.  Channel 1 (green): Hi may use its top 6 bits and is
 * extended with a shift of 6; Lo is treated the same way when alpha == 0,
 * and as a 5-bit value otherwise.  The bit-width constraints are checked
 * with assert().
 */
void
extendLoHi4(int alpha, 
            FxI32 Lo[3], FxI32 Hi[3],
            FxI32 lo[3], FxI32 hi[3])
{
  int chan;

  for (chan = 0; chan < 3; chan++) {
    if (chan == 1) {
      /* green */
      if (alpha != 0) {
        assert(!(Lo[chan] & ~0xf8));
        lo[chan] = Lo[chan] | (Lo[chan] >> 5);
      } else {
        assert(!(Lo[chan] & ~0xfc));
        lo[chan] = Lo[chan] | (Lo[chan] >> 6);
      }
      assert(!(Hi[chan] & ~0xfc));
      hi[chan] = Hi[chan] | (Hi[chan] >> 6);
      continue;
    }

    /* not green */
    assert(!(Lo[chan] & ~0xf8));
    assert(!(Hi[chan] & ~0xf8));

    lo[chan] = Lo[chan] | (Lo[chan] >> 5);
    hi[chan] = Hi[chan] | (Hi[chan] >> 5);
  }
}


/*
 * Find the best 2-bit palette index for each of the 16 pixels of a 4x4
 * sub-block, judging by the active channel only.
 *
 * alpha         - palette layout, as for FillPallette4().
 * activeChannel - channel on which the squared error is measured.
 * Lo, Hi        - quantized endpoints (see extendLoHi4()).
 * pixIn         - the 16 source pixels.
 * outIndex      - receives one index per pixel.
 *
 * Pixels whose alpha (pixIn[i][3]) is 0 always get index 3.  All other
 * pixels get the entry with the smallest squared difference on the active
 * channel.  When alpha != 0, palette entry 3 is the all-zero texel
 * written by FillPallette4() rather than an interpolated color, so such
 * pixels are matched against entries 0..2 only; otherwise a dark opaque
 * pixel could be mapped onto the zero-alpha entry.  assignIndices8()
 * excludes its zero entry 7 in the same way.
 */
void
assignIndices4(int alpha, int activeChannel,
               FxI32 Lo[3], FxI32 Hi[3],
               FxI32 pixIn[16][4],
               FxI32 *outIndex)
{
  int i, j, chan;
  int numCandidates;              /* palette entries usable by opaque pixels */
  double e2, bestE2;              /* squared error on the active channel */
  FxI32 lo[3], hi[3];
  FxI32 colors[4][4];

  extendLoHi4(alpha, Lo, Hi, lo, hi);

  chan = activeChannel;

  FillPallette4(alpha, colors, lo, hi);

  numCandidates = (alpha == 0) ? 4 : 3;

  for (i = 0; i < 16; i++) {
    if (pixIn[i][3] == 0) {
      outIndex[i] = 3;
    } else {
      bestE2 = 1e30;
      outIndex[i] = -1;

      for (j = 0; j < numCandidates; j++) {
        double t;
        t = pixIn[i][chan] - colors[j][chan];
        e2 = t * t;

        if (e2 < bestE2) {
          bestE2 = e2;
          outIndex[i] = j;
        }
      }
    }
    assert(outIndex[i] != -1);
  }
}

/*
 * Expand 16 palette indices into pixels: pixOut[i] becomes entry
 * inIndex[i] of the palette that FillPallette4() builds from the
 * endpoints Lo/Hi (extended by extendLoHi4()) for the given alpha mode.
 */
void
assignColors4(int alpha, FxI32 Lo[3], FxI32 Hi[3],
              FxI32 pixOut[16][4],
              FxI32 *inIndex)
{
  FxI32 lo[3], hi[3];
  FxI32 colors[4][4];
  int p;

  extendLoHi4(alpha, Lo, Hi, lo, hi);
  FillPallette4(alpha, colors, lo, hi);

  for (p = 0; p < 16; p++) {
    const FxI32 *entry = colors[inIndex[p]];
    pixOut[p][0] = entry[0];
    pixOut[p][1] = entry[1];
    pixOut[p][2] = entry[2];
    pixOut[p][3] = entry[3];
  }
}

/*
 * Find the best 3-bit palette index for each of 32 pixels, judging by the
 * active channel only.
 *
 * Only Lo[activeChannel] and Hi[activeChannel] are read; both must have
 * just their top 5 bits set.  Pixels whose alpha (pixIn[i][3]) is 0 get
 * index 7; every other pixel gets the entry among 0..6 of the
 * FillPallette8() palette with the smallest squared difference on the
 * active channel (the first such entry on ties).
 */
void
assignIndices8(int activeChannel,
               FxI32 Lo[3], FxI32 Hi[3],
               FxI32 pixIn[32][4],
               FxI32 *outIndex)
{
  int chan = activeChannel;
  int i, j;
  FxI32 lo[3] = {0, 0, 0};
  FxI32 hi[3] = {0, 0, 0};
  FxI32 colors[8][4];

  assert(!(Lo[chan] & ~0xf8));
  assert(!(Hi[chan] & ~0xf8));

  /* the inactive channels stay zero; only the active one is extended */
  lo[chan] = Lo[chan] | (Lo[chan] >> 5);
  hi[chan] = Hi[chan] | (Hi[chan] >> 5);

  FillPallette8(colors, lo, hi);

  for (i = 0; i < 32; i++) {
    if (pixIn[i][3] != 0) {
      FxI32 bestE2 = 1 << 30;     /* smallest squared error so far */
      FxI32 best = -1;

      for (j = 0; j < 7; j++) {
        FxI32 t = pixIn[i][chan] - colors[j][chan];
        FxI32 e2 = t * t;
        if (e2 < bestE2) {
          bestE2 = e2;
          best = j;
        }
      }
      outIndex[i] = best;
    } else {
      outIndex[i] = 7;
    }
    assert(outIndex[i] != -1);
  }
}

/*
 * Expand 32 palette indices into pixels: pixOut[i] becomes entry
 * inIndex[i] of the FillPallette8() palette built from Lo/Hi.  Each
 * endpoint channel must have only its top 5 bits set; it is extended to
 * 8 bits by copying its top three bits into the low bits.
 */
void
assignColors8(FxI32 Lo[3], FxI32 Hi[3],
              FxI32 pixOut[32][4],
              FxI32 *inIndex)
{
  FxI32 lo[3], hi[3];
  FxI32 colors[8][4];
  int chan, p;

  for (chan = 0; chan < 3; chan++) {
    assert(!(Lo[chan] & ~0xf8));
    assert(!(Hi[chan] & ~0xf8));

    lo[chan] = Lo[chan] | (Lo[chan] >> 5);
    hi[chan] = Hi[chan] | (Hi[chan] >> 5);
  }

  FillPallette8(colors, lo, hi);

  for (p = 0; p < 32; p++) {
    const FxI32 *entry = colors[inIndex[p]];
    for (chan = 0; chan < 4; chan++) {
      pixOut[p][chan] = entry[chan];
    }
  }
}

/*
 * Find the best 2-bit indices for a 4x8 block coded as two 4x4 halves
 * that share the Mid endpoint, judging by the active channel only.
 *
 * The left half (pixels with i % 8 < 4) uses the palette interpolated
 * from Lo to Mid and writes to outIndex_0; the right half uses the
 * palette interpolated from Hi to Mid and writes to outIndex_1.  Both
 * output arrays are filled sequentially in pixel order, 16 entries each.
 * This matches the palettes assignColors4x2() uses when expanding the
 * indices again.
 *
 * Only element activeChannel of Lo, Hi and Mid is read; each must have
 * just its top 5 bits set.
 *
 * Fixes relative to the earlier version:
 *  - Mid is extended from Mid[chan]; previously the replicated low bits
 *    were taken from the zero-initialised local copy and so were lost.
 *  - the right half is matched against its own palette (Hi..Mid) instead
 *    of the left half's palette.
 */
void
assignIndices4x2(int activeChannel,
                 FxI32 Lo[4], FxI32 Hi[4], FxI32 Mid[4],
                 FxI32 pixIn[32][4],
                 FxI32 *outIndex_0, FxI32 *outIndex_1)
{
  int i, j, chan;
  FxI32 e2, bestE2;               /* squared error on the active channel */
  FxI32 lo, hi, mid;
  FxI32 colors_0[4][4];           /* left half:  Lo -> Mid */
  FxI32 colors_1[4][4];           /* right half: Hi -> Mid */
  int i1 = 0, i2 = 0;

  chan = activeChannel;

  assert(!(Lo[chan] & ~0xf8));
  assert(!(Hi[chan] & ~0xf8));
  assert(!(Mid[chan] & ~0xf8));

  lo = Lo[chan] | (Lo[chan] >> 5);
  hi = Hi[chan] | (Hi[chan] >> 5);
  mid = Mid[chan] | (Mid[chan] >> 5);

  /* only the active channel of each palette entry is needed */
  for (i = 0; i < 4; i++) {
    colors_0[i][chan] = ((3-i)*lo+i*mid+1)/3;
    colors_1[i][chan] = ((3-i)*hi+i*mid+1)/3;
  }

  for (i = 0; i < 32; i++) {
    FxI32 (*palette)[4];
    FxI32 *out;

    if (i%8 < 4) {
      palette = colors_0;
      out = &outIndex_0[i1++];
    } else {
      palette = colors_1;
      out = &outIndex_1[i2++];
    }

    bestE2 = 1 << 30;
    *out = -1;
    for (j = 0; j < 4; j++) {
      FxI32 t;
      t = pixIn[i][chan] - palette[j][chan];
      e2 = t * t;
      if (e2 < bestE2) {
        bestE2 = e2;
        *out = j;
      }
    }
    assert(*out != -1);
  }
}

/*
 * Expand the indices of a 4x8 block coded as two 4x4 halves into pixels.
 *
 * Pixels with i % 8 < 4 take their color from the palette interpolated
 * from Lo to Mid, using the next unread entry of inIndex_0; the other
 * pixels use the palette interpolated from Hi to Mid and inIndex_1.
 * All four channels of Lo, Hi and Mid must have only their top 5 bits
 * set.
 */
void
assignColors4x2(FxI32 Lo[4], FxI32 Hi[4], FxI32 Mid[4],
                FxI32 pixOut[32][4],
                FxI32 *inIndex_0, FxI32 *inIndex_1)
{
  FxI32 lo[4], hi[4], mid[4];
  FxI32 colors_0[4][4], colors_1[4][4];
  int next0 = 0, next1 = 0;       /* read positions in inIndex_0 / inIndex_1 */
  int chan, step, p;

  for (chan = 0; chan < 4; chan++) {
    assert(!(Lo[chan] & ~0xf8));
    assert(!(Hi[chan] & ~0xf8));
    assert(!(Mid[chan] & ~0xf8));

    lo[chan] = Lo[chan] | (Lo[chan] >> 5);
    hi[chan] = Hi[chan] | (Hi[chan] >> 5);
    mid[chan] = Mid[chan] | (Mid[chan] >> 5);
  }

  for (chan = 0; chan < 4; chan++) {
    for (step = 0; step < 4; step++) {
      colors_0[step][chan] = ((3-step)*lo[chan]+step*mid[chan]+1)/3;
      colors_1[step][chan] = ((3-step)*hi[chan]+step*mid[chan]+1)/3;
    }
  }

  for (p = 0; p < 32; p++) {
    const FxI32 *entry;

    if (p % 8 < 4) {
      entry = colors_0[inIndex_0[next0++]];
    } else {
      entry = colors_1[inIndex_1[next1++]];
    }
    for (chan = 0; chan < 4; chan++) {
      pixOut[p][chan] = entry[chan];
    }
  }
}

/*
 * Signature of the block encoders in this file (e.g. Encomp4x8CC_Chroma):
 * encode the block at (x, y) of pixelsIn into *blockOut, write the
 * resulting pixels to pixelsOut, and return an error measure.
 */
typedef double (*CompFun)(int flag,
                          FxU32 width, int x, int y,
                          FxI32 *pixelsIn, FxI32 *pixelsOut,
                          CCBlock *blockOut);

/* cc-chroma:
 *   Encodes one 4x8 texel block (32 texels) as four colors with 5 bits
 *   per channel plus a 2-bit color index per texel.
 *
 *   The colors are found with a generalized-Lloyd style iteration:
 *
 *     seed every color vector that currently has no texels assigned
 *     (initially all four) from input texel i, where i is the vector's
 *     own number, reduced to 5 bits per channel;
 *
 *     assign each texel to its nearest vector (cdiff2, all four
 *     channels) and sum the squared errors;
 *
 *     stop once the summed error has failed to improve on the best
 *     value seen for several consecutive passes, emitting the vectors
 *     and indices of the current pass;
 *
 *     otherwise replace each vector that has texels with the centroid of
 *     those texels, reduced to 5 bits per channel, and repeat.
 *
 *   Every vector has alpha 255, so source alpha contributes to the error
 *   but is not encoded.
 *
 *   flag      - not used.
 *   width     - pixels per row of pixelsIn and pixelsOut.
 *   x, y      - top-left corner of the block.
 *   pixelsIn  - source image; channels are unpacked with channelShift[].
 *   pixelsOut - receives the block's pixels as they will decode.
 *   blockOut  - receives the encoded block.
 *
 *   Returns the summed squared error between the source texels and the
 *   emitted colors.
 */

double
Encomp4x8CC_Chroma(int flag,
                   FxU32 width, int x, int y,
                   FxI32 *pixelsIn, FxI32 *pixelsOut,
                   CCBlock *blockOut)
{
  FxI32 extractedPixels[32][4];

  int chan, i, j, k;

  FxI32 colorVectors[4][4];
  FxI32 cvCount[4]; /* num pixels closest to this vector. */
  FxI32 sameCount;  /* passes without improvement since the best error. */

  FxI32 createdPixels[32][4], createdPixelIndices[32];

  double bestRMS, sumRMS, rms, lastRMS;

  (void)flag;

  CCCLEAR(*blockOut);
  CC_SETBIT(*blockOut, CC_CHROMA_MODE_2, 0); /* chroma mode */
  CC_SETBIT(*blockOut, CC_CHROMA_MODE_1, 1); /* chroma mode */
  CC_SETBIT(*blockOut, CC_CHROMA_MODE_0, 0); /* chroma mode */

  /* unpack the 32 source texels into separate channels */
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 8; j++) {
      for (chan = 0; chan < 4; chan++) {
        extractedPixels[(i << 3) + j][chan] =
          (pixelsIn[(y + i) * width + x + j] >> channelShift[chan]) & 255;
      }
    }
  }

  for (i = 0; i < 4; i++) {
    cvCount[i] = 0;
  }

  lastRMS = 1e30;
  sameCount = 0;
  while (1) {
    /* assign unassigned color vectors (same color ok) */
    for (i = 0; i < 4; i++) {
      if (!cvCount[i]) {
        for (chan = 0; chan < 3; chan++) {
          colorVectors[i][chan] = extractedPixels[i][chan];
          colorVectors[i][chan] &= 0xf8;
          colorVectors[i][chan] |= colorVectors[i][chan] >> 5;
        }
        colorVectors[i][3] = 255;
      }
    }

    /* resort colors: nearest vector for every texel */
    sumRMS = 0;
    for (i = 0; i < 32; i++) {
      bestRMS = 1e30;
      for (j = 0; j < 4; j++) {
        rms = cdiff2(colorVectors[j], extractedPixels[i]);
        if (rms < bestRMS) {
          bestRMS = rms;
          createdPixelIndices[i] = j;
        }
      }
      sumRMS += bestRMS;
    }

    if ((sumRMS >= lastRMS) && (sameCount++ > 5)) {
      /* done, fill created indices and compute rms */

      /* store the top 5 bits of every color channel */
      for (chan = 0; chan < 3; chan++) {
        for (i = 0; i < 5; i++) {
          CC_SETBIT(*blockOut,
                    CC_CHROMA_COLOR_0 + 5 * chan + i,
                    (colorVectors[0][chan] >> (i + 3)) & 1);
          CC_SETBIT(*blockOut,
                    CC_CHROMA_COLOR_1 + 5 * chan + i,
                    (colorVectors[1][chan] >> (i + 3)) & 1);
          CC_SETBIT(*blockOut,
                    CC_CHROMA_COLOR_2 + 5 * chan + i,
                    (colorVectors[2][chan] >> (i + 3)) & 1);
          CC_SETBIT(*blockOut,
                    CC_CHROMA_COLOR_3 + 5 * chan + i,
                    (colorVectors[3][chan] >> (i + 3)) & 1);
        }
      }

      /* fill in output colors */
      for (i = 0; i < 32; i++) {
        for (chan = 0; chan < 4; chan++) {
          createdPixels[i][chan] = colorVectors[createdPixelIndices[i]][chan];
        }
      }
      for (i = 0; i < 4; i++) {
        for (j = 0; j < 8; j++) {
          /* pack in unsigned arithmetic: shifting 255 left by 24 in a
             signed int is undefined behaviour */
          FxU32 packed = 0;
          for (chan = 0; chan < 4; chan++) {
            packed |=
              (FxU32)createdPixels[(i << 3) + j][chan] << channelShift[chan];
          }
          pixelsOut[(y + i) * width + x + j] = (FxI32)packed;

          /* two index bits per texel; columns 4-7 start 32 bits later */
          for (k = 0; k < 2; k++) {
            int blockOffset = (j >= 4) ? 32 : 0;
            int texOffset = 8 * i + 2 * (j & 3);
            int offset = CC_CHROMA_TEXELS + blockOffset + texOffset;
            CC_SETBIT(*blockOut, offset + k,
                      (createdPixelIndices[8 * i + j] >> k) & 1);
          }
        }
      }
      {
        /* sanity check: the error recomputed from the output must match */
        double t = rmsErrorBlock(32, extractedPixels, createdPixels);
        assert(t == sumRMS);
        (void)t;
      }
      return sumRMS;
    }

    if (sumRMS < lastRMS) {
      sameCount = 0;
      lastRMS = sumRMS;
    }

    /* replace color vectors with centroid of colors sorted to them. */
    for (i = 0; i < 4; i++) {
      for (chan = 0; chan < 3; chan++) {
        colorVectors[i][chan] = 0;
      }
      cvCount[i] = 0;
    }
    for (i = 0; i < 32; i++) {
      for (chan = 0; chan < 3; chan++) {
        colorVectors[createdPixelIndices[i]][chan] +=
          extractedPixels[i][chan];
      }
      cvCount[createdPixelIndices[i]]++;
    }
    for (i = 0; i < 4; i++) {
      for (chan = 0; chan < 3; chan++) {
        if (cvCount[i]) {
          colorVectors[i][chan] /= cvCount[i];

          /* adjust to 555 precision. */
          colorVectors[i][chan] &= 0xf8;
          colorVectors[i][chan] |= colorVectors[i][chan] >> 5;
        }
      }
    }
  }
}

/*
 * find the channel with maximum variance
 * find the best vectors and indices for that channel
 * find the best vectors for other channels 
 */

double
Encomp4x8CC_HI(int flag,
               FxU32 width, int x, int y,
               FxI32 *pixelsIn, FxI32 *pixelsOut,
               CCBlock *blockOut)
{
  static int shift[4] = {0, 8, 16, 24};
  static int range = 32;
  
  int chan, i, j, k;
  FxI32 extractedPixels[32][4];
  FxI32 createdPixels[32][4], createdPixelIndices[32];
  FxI32 pix[4], minPix[4], maxPix[4];
  
  int sortedChans[4];
  
  FxI32
    lo[4], loMin[4], loMax[4],
    hiMin[4], hiMax[4], hi[4],
    bestLo[4], bestHi[4];
  
  double bestRMS, rms;
  
  CCCLEAR(*blockOut);
  CC_SETBIT(*blockOut, CC_HI_MODE_1, 0); /* hi mode */
  CC_SETBIT(*blockOut, CC_HI_MODE_0, 0); /* hi mode */
  
  for (chan = 0; chan < 3; chan++) {
    minPix[chan] = 255; maxPix[chan] = 0;
  }
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 8; j++) {
      for (chan = 0; chan < 4; chan++) {
        pix[chan] = pixelsIn[(y + i) * width + x + j];
        pix[chan] = (pix[chan] >> channelShift[chan]) & 255;
        extractedPixels[(i << 3) + j][chan] = pix[chan];
        if (pix[chan] < minPix[chan])
          minPix[chan] = pix[chan];
        if (pix[chan] > maxPix[chan])
          maxPix[chan] = pix[chan];
      }
    }
  }
  
  for (chan = 0; chan < 3; chan++) {
    loMin[chan] = MAX(0, minPix[chan] - range) & 0xf8;
    loMax[chan] = MIN(255, minPix[chan] + range) & 0xf8;
    hiMin[chan] = MAX(0, maxPix[chan] - range) & 0xf8;
    hiMax[chan] = MIN(255, maxPix[chan] + range) & 0xf8;
  }
  
  /* do max variance channel first.
  ** this channel chooses indices. Other channels choose 
  ** lo & hi only. 
  */
  
  sort3ChannelsByVariance(32, extractedPixels, sortedChans);
  
  /* make sure there is no garbage in other channels */
  for (chan = 0; chan < 3; chan++) {
    bestLo[chan] = lo[chan] = bestHi[chan] = hi[chan] = 0;
  }
  chan = sortedChans[0];
  bestRMS = 1e20;
  for (lo[chan] = loMin[chan]; lo[chan] < hiMax[chan]; lo[chan] += 8) {
    for (hi[chan] = loMin[chan]; hi[chan] < hiMax[chan]; hi[chan] += 8) {
      assignIndices8(chan, lo, hi, extractedPixels, createdPixelIndices);
      assignColors8(lo, hi, createdPixels, createdPixelIndices);
      rms = rmsErrorBlockChannel(32, chan, extractedPixels, createdPixels);
      if (rms < bestRMS) {
        bestRMS = rms;
        bestLo[chan] = lo[chan];
        bestHi[chan] = hi[chan];
      }
    }
  }
  
  /* recrate best indices */
  assignIndices8(chan, bestLo, bestHi, extractedPixels, createdPixelIndices);
  
  /* now do other channels, assigning lo and hi but not indices */
  for (i = 1; i < 3; i++) {
    bestRMS = 1e20;
    chan = sortedChans[i];
    for (lo[chan] = loMin[chan]; lo[chan] < hiMax[chan]; lo[chan] += 8) {
      for (hi[chan] = loMin[chan]; hi[chan] < hiMax[chan]; hi[chan] += 8) {
        assignColors8(lo, hi, createdPixels, createdPixelIndices);
        rms = rmsErrorBlockChannel(32, chan, extractedPixels, createdPixels);
        if (rms < bestRMS) {
          bestRMS = rms;
          bestLo[chan] = lo[chan];
          bestHi[chan] = hi[chan];
        }
      }
    }
    assert(bestRMS < 1e20);
  }
  
  for (chan = 0; chan < 3; chan++) {
    for (i = 0; i < 5; i++) {
      CC_SETBIT(*blockOut,
                CC_HI_COLOR_0 + 5 * chan + i,
                (bestLo[chan] >> (i + 3)) & 1);
      CC_SETBIT(*blockOut,
                CC_HI_COLOR_1 + 5 * chan + i,
                (bestHi[chan] >> (i + 3)) & 1);
    } 
  }
  
  assignColors8(bestLo, bestHi,   createdPixels, createdPixelIndices);
  bestRMS = rmsErrorBlock(32, extractedPixels, createdPixels);    
  
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 8; j++) {
      pixelsOut[(y + i) * width + x + j] =
        (createdPixels[(i << 3) + j][0] << channelShift[0]) |
        (createdPixels[(i << 3) + j][1] << channelShift[1]) |
        (createdPixels[(i << 3) + j][2] << channelShift[2]) |
        (createdPixels[(i << 3) + j][3] << channelShift[3]);
      for (k = 0; k < 3; k++) {
        int blockOffset = (j >= 4) ? 48 : 0;
        int texOffset = 12 * i + 3 * (j & 3);
        int offset = CC_HI_TEXELS + blockOffset + texOffset;
        CC_SETBIT(*blockOut, offset + k,
                  (createdPixelIndices[8 * i + j] >> k) & 1);
      }
    }
  }
  
  return bestRMS;
}

/*
 * Encode one 4x8 texel block in CC_MIXED mode.
 *
 * The block is processed as two independent 4x4 halves, left then right.
 * For each half:
 *   - the 16 texels are split into four 8-bit channels;
 *   - the channel that sort3ChannelsByVariance() ranks first is searched
 *     exhaustively for the lo/hi endpoint pair with the smallest error in
 *     that channel; this search also fixes the per-texel indices;
 *   - the other two colour channels are searched for lo/hi with those
 *     indices left unchanged;
 *   - the endpoints (top 5 bits of each channel), one green LSB bit and
 *     the 2-bit indices are packed into *blockOut, and the reconstructed
 *     texels are written to pixelsOut.
 *
 * flag       stored in the CC_MIXED_ALPHA bit and forwarded to
 *            assignIndices4()/assignColors4().
 * width      row pitch of pixelsIn and pixelsOut, in pixels.
 * x, y       top-left corner of the 4x8 block.
 * pixelsIn   source image, one FxI32 per pixel.
 * pixelsOut  receives the reconstructed texels at the same coordinates.
 * blockOut   receives the packed block.
 *
 * Returns the sum of rmsErrorBlock() over the two halves.
 */
double
Encomp4x8CC_Mixed(int flag,
                  FxU32 width, int x, int y,
                  FxI32 *pixelsIn, FxI32 *pixelsOut,
                  CCBlock *blockOut)
{
  /* margin added around the half's min/max when bounding the search */
  static int range = 32;

  int chan, i, j;
  int block;
  int loStep, hiStep;
  FxI32 extractedPixels[16][4];
  FxI32 createdPixels[16][4];
  /* Same element type as the index arrays the sibling encoders hand to
  ** their assign* helpers; the previous int array viewed through a
  ** (long *) cast overran wherever long is wider than int.
  */
  FxI32 createdPixelIndices[16];
  int sortedChans[3];
  FxI32 lo[3], hi[3], bestLo[3], bestHi[3];
  FxI32 loMin[3], hiMax[3];
  FxI32 minPix[4], maxPix[4];
  double sumRMS, bestRMS, rms;
  int glsb, reverse;

  CCCLEAR(*blockOut);
  CC_SETBIT(*blockOut, CC_MIXED_MODE, 1);  /* mixed mode */
  CC_SETBIT(*blockOut, CC_MIXED_ALPHA, flag);

  sumRMS = 0.0;
  for (block = 0; block < 2; block++, x += 4) {
    /* split the 16 texels of this half into 8-bit channels */
    for (i = 0; i < 4; i++) {
      for (j = 0; j < 4; j++) {
        FxI32 texel = pixelsIn[(y + i) * width + x + j];
        for (chan = 0; chan < 4; chan++) {
          extractedPixels[(i << 2) + j][chan] =
            (texel >> channelShift[chan]) & 255;
        }
      }
    }

    findMinMax(16, extractedPixels, minPix, maxPix);

    /* Search bounds per colour channel.  Green keeps one extra bit of
    ** precision (mask 0xfc) when flag is 0; everything else is
    ** truncated to 5 bits (mask 0xf8).
    */
    for (chan = 0; chan < 3; chan++) {
      FxI32 mask = ((chan == 1) && (flag == 0)) ? 0xfc : 0xf8;
      loMin[chan] = MAX(0, minPix[chan] - range) & mask;
      hiMax[chan] = MIN(255, maxPix[chan] + range) & mask;
    }

    /* max variance channel comes first.
    ** it assigns indices. Other channels find the best hi and lo
    ** matching these indices.
    */
    sort3ChannelsByVariance(16, extractedPixels, sortedChans);

    /* make them all zero */
    for (chan = 0; chan < 3; chan++) {
      bestLo[chan] = lo[chan] = bestHi[chan] = hi[chan] = 0;
    }

    /* First channel: search endpoints and indices together.
    ** NOTE(review): for green the hi endpoint always steps by 4 while
    ** the lo endpoint steps by 4 only when flag is 0 - confirm whether
    ** that asymmetry is intended.
    */
    chan = sortedChans[0];
    loStep = ((chan == 1) && (flag == 0)) ? 4 : 8;
    hiStep = (chan != 1) ? 8 : 4;
    bestRMS = 1e20;
    for (lo[chan] = loMin[chan]; lo[chan] < hiMax[chan]; lo[chan] += loStep) {
      for (hi[chan] = loMin[chan]; hi[chan] < hiMax[chan]; hi[chan] += hiStep) {
        assignIndices4(flag, chan, lo, hi, extractedPixels, createdPixelIndices);
        assignColors4(flag, lo, hi, createdPixels, createdPixelIndices);
        rms = rmsErrorBlockChannel(16, chan, extractedPixels, createdPixels);
        if (rms < bestRMS) {
          bestRMS = rms;
          bestLo[chan] = lo[chan];
          bestHi[chan] = hi[chan];
        }
      }
    }

    /* the search overwrote the indices on every trial; rebuild the ones
    ** that belong to the winning endpoints.
    */
    assignIndices4(flag, chan, bestLo, bestHi, extractedPixels, createdPixelIndices);

    lo[chan] = bestLo[chan];
    hi[chan] = bestHi[chan];

    /* remaining channels: endpoints only, indices stay fixed */
    for (i = 1; i < 3; i++) {
      chan = sortedChans[i];
      loStep = ((chan == 1) && (flag == 0)) ? 4 : 8;
      hiStep = (chan != 1) ? 8 : 4;
      bestRMS = 1e20;
      for (lo[chan] = loMin[chan]; lo[chan] < hiMax[chan]; lo[chan] += loStep) {
        for (hi[chan] = loMin[chan]; hi[chan] < hiMax[chan]; hi[chan] += hiStep) {
          assignColors4(flag, lo, hi, createdPixels, createdPixelIndices);
          rms = rmsErrorBlockChannel(16, chan, extractedPixels, createdPixels);
          if (rms < bestRMS) {
            bestRMS = rms;
            bestLo[chan] = lo[chan];
            bestHi[chan] = hi[chan];
          }
        }
      }
      assert(bestRMS < 1e20);
    }

    /* error of the half over all channels with the chosen endpoints */
    assignColors4(flag, bestLo, bestHi, createdPixels, createdPixelIndices);
    bestRMS = rmsErrorBlock(16, extractedPixels, createdPixels);
    sumRMS += bestRMS;

    /*
    ** set output block blockOut
    **
    ** With flag 0, if bit 2 of (lo green XOR hi green) disagrees with
    ** bit 1 of the first texel's index, exchange the two endpoints and
    ** complement every index.  Presumably this makes the dropped green
    ** bit of the other endpoint recoverable by the decoder - confirm
    ** against DecodeCCBlock.
    */
    reverse = (flag == 0) &&
      ((((bestLo[1] ^ bestHi[1]) >> 2) & 1) !=
       ((createdPixelIndices[0] >> 1) & 1));

    if (reverse) {
      FxI32 t;
      for (chan = 0; chan < 3; chan++) {
        t = bestHi[chan];
        bestHi[chan] = bestLo[chan];
        bestLo[chan] = t;
      }
      for (i = 0; i < 16; i++) {
        createdPixelIndices[i] = 3 - createdPixelIndices[i];
      }
    }

    /* bit 2 of the hi green endpoint is stored separately */
    glsb = (bestHi[1] & 4) >> 2;
    if (block == 0) {
      CC_SETBIT(*blockOut, CC_MIXED_GLSB_0, glsb);
    } else {
      CC_SETBIT(*blockOut, CC_MIXED_GLSB_1, glsb);
    }

    /* top five bits of each endpoint channel: hi at +0, lo at +15 */
    for (chan = 0; chan < 3; chan++) {
      for (i = 0; i < 5; i++) {
        CC_SETBIT(*blockOut, colorBitsMIXED[block] + 5 * chan + 15 + i,
                  (bestLo[chan] >> (i + 3)) & 1);
        CC_SETBIT(*blockOut, colorBitsMIXED[block] + 5 * chan + 0 + i,
                  (bestHi[chan] >> (i + 3)) & 1);
      }
    }

    /* rebuild the texels after a possible endpoint/index reversal */
    assignColors4(flag, bestLo, bestHi, createdPixels, createdPixelIndices);

    for (i = 0; i < 4; i++) {
      for (j = 0; j < 4; j++) {
        int t = (i << 2) + j;
        pixelsOut[(y + i) * width + x + j] =
          (createdPixels[t][0] << channelShift[0]) |
          (createdPixels[t][1] << channelShift[1]) |
          (createdPixels[t][2] << channelShift[2]) |
          (createdPixels[t][3] << channelShift[3]);
        CC_SETBIT(*blockOut, texelBitsMIXED[block] + 8 * i + 2 * j + 0,
                  (createdPixelIndices[4 * i + j] >> 0) & 1);
        CC_SETBIT(*blockOut, texelBitsMIXED[block] + 8 * i + 2 * j + 1,
                  (createdPixelIndices[4 * i + j] >> 1) & 1);
      }
    }
  }

  return sumRMS;
}

/*
 * CC_ALPHA encoder, CC_ALPHA_LERP = 0 variant.
 *
 * Picks three RGBA vectors (5 bits per channel) by iterative clustering
 * of the block's 32 texels; a fourth vector, index 3, is always all
 * zeroes and is not stored.  Each texel gets the 2-bit index of its
 * nearest vector according to cdiff2().
 *
 * Writes the packed block to *blockOut and the reconstructed texels to
 * pixelsOut; returns the summed cdiff2() error of the final assignment.
 */
static double
Encomp4x8CC_AlphaVQ(FxU32 width, int x, int y,
                    FxI32 *pixelsIn, FxI32 *pixelsOut,
                    CCBlock *blockOut)
{
  int chan, i, j, k;

  FxI32 extractedPixels[32][4];
  FxI32 colorVectors[4][4];
  FxI32 cvCount[4]; /* num pixels closest to this vector. */
  FxI32 sameCount;  /* let it loop n times with same RMS without giving up. */

  FxI32 createdPixels[32][4], createdPixelIndices[32];

  double bestRMS, sumRMS, rms, lastRMS;

  CCCLEAR(*blockOut);
  CC_SETBIT(*blockOut, CC_ALPHA_MODE_2, 0); /* alpha mode */
  CC_SETBIT(*blockOut, CC_ALPHA_MODE_1, 1); /* alpha mode */
  CC_SETBIT(*blockOut, CC_ALPHA_MODE_0, 1); /* alpha mode */
  CC_SETBIT(*blockOut, CC_ALPHA_LERP,   0); /* lerp = 0 */

  /* split the 32 texels into 8-bit channels */
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 8; j++) {
      FxI32 texel = pixelsIn[(y + i) * width + x + j];
      for (chan = 0; chan < 4; chan++) {
        extractedPixels[(i << 3) + j][chan] =
          (texel >> channelShift[chan]) & 255;
      }
    }
  }

  for (i = 0; i < 4; i++) {
    cvCount[i] = 0;
  }

  lastRMS = 1e30;
  sameCount = 0;
  while (1) {
    /* (re)seed every stored vector that attracted no texels with the
    ** texel of the same number, truncated to 5 bits and replicated
    ** (same color ok)
    */
    for (i = 0; i < 3; i++) {
      if (!cvCount[i]) {
        for (chan = 0; chan < 4; chan++) {
          colorVectors[i][chan] = extractedPixels[i][chan];
          colorVectors[i][chan] &= 0xf8;
          colorVectors[i][chan] |= colorVectors[i][chan] >> 5;
        }
      }
    }
    /* vector 3 is fixed at zero; reset it each round because the
    ** centroid accumulation below also adds into it
    */
    for (chan = 0; chan < 4; chan++) {
      colorVectors[3][chan] = 0;
    }

    /* resort colors: nearest vector for every texel */
    sumRMS = 0;
    for (i = 0; i < 32; i++) {
      bestRMS = 1e30;
      for (j = 0; j < 4; j++) {
        rms = cdiff2(colorVectors[j], extractedPixels[i]);
        if (rms < bestRMS) {
          bestRMS = rms;
          createdPixelIndices[i] = j;
        }
      }
      sumRMS += bestRMS;
    }

    /* stop once the error has failed to improve on the best seen so
    ** far for several rounds in a row
    */
    if ((sumRMS >= lastRMS) && (sameCount++ > 5)) {
      break;
    }

    if (sumRMS < lastRMS) {
      sameCount = 0;
      lastRMS = sumRMS;
    }

    /* replace color vectors with centroid of colors sorted to them. */
    for (i = 0; i < 3; i++) {
      for (chan = 0; chan < 4; chan++) {
        colorVectors[i][chan] = 0;
      }
      cvCount[i] = 0;
    }
    for (i = 0; i < 32; i++) {
      for (chan = 0; chan < 4; chan++) {
        colorVectors[createdPixelIndices[i]][chan] +=
          extractedPixels[i][chan];
      }
      cvCount[createdPixelIndices[i]]++;
    }
    for (i = 0; i < 3; i++) {
      for (chan = 0; chan < 4; chan++) {
        if (cvCount[i]) {
          colorVectors[i][chan] /= cvCount[i];

          /* adjust to 555 precision. */
          colorVectors[i][chan] &= 0xf8;
          colorVectors[i][chan] |= colorVectors[i][chan] >> 5;
        }
      }
    }
  }

  /* done: store the top five bits of every channel of the three
  ** vectors; colour channels and alpha live in separate bit fields
  */
  for (chan = 0; chan < 4; chan++) {
    for (i = 0; i < 5; i++) {
      if (chan != 3) {
        CC_SETBIT(*blockOut,
                  CC_ALPHA_COLOR_0 + 5 * chan + i,
                  (colorVectors[0][chan] >> (i + 3)) & 1);
        CC_SETBIT(*blockOut,
                  CC_ALPHA_COLOR_1 + 5 * chan + i,
                  (colorVectors[1][chan] >> (i + 3)) & 1);
        CC_SETBIT(*blockOut,
                  CC_ALPHA_COLOR_2 + 5 * chan + i,
                  (colorVectors[2][chan] >> (i + 3)) & 1);
      } else {
        CC_SETBIT(*blockOut,
                  CC_ALPHA_ALPHA_0 + i,
                  (colorVectors[0][chan] >> (i + 3)) & 1);
        CC_SETBIT(*blockOut,
                  CC_ALPHA_ALPHA_1 + i,
                  (colorVectors[1][chan] >> (i + 3)) & 1);
        CC_SETBIT(*blockOut,
                  CC_ALPHA_ALPHA_2 + i,
                  (colorVectors[2][chan] >> (i + 3)) & 1);
      }
    }
  }

  /* fill in output colors */
  for (i = 0; i < 32; i++) {
    for (chan = 0; chan < 4; chan++) {
      createdPixels[i][chan] = colorVectors[createdPixelIndices[i]][chan];
    }
  }
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 8; j++) {
      /* left 4x4 half uses texel bits 0..31, right half 32..63 */
      int blockOffset = (j >= 4) ? 32 : 0;
      int texOffset = 8 * i + 2 * (j & 3);
      int offset = CC_ALPHA_TEXELS + blockOffset + texOffset;

      pixelsOut[(y + i) * width + x + j] =
        (createdPixels[(i << 3) + j][0] << channelShift[0]) |
        (createdPixels[(i << 3) + j][1] << channelShift[1]) |
        (createdPixels[(i << 3) + j][2] << channelShift[2]) |
        (createdPixels[(i << 3) + j][3] << channelShift[3]);
      for (k = 0; k < 2; k++) {
        CC_SETBIT(*blockOut, offset + k,
                  (createdPixelIndices[8 * i + j] >> k) & 1);
      }
    }
  }
  {
    /* cross-check the accumulated error against rmsErrorBlock() */
    double t = rmsErrorBlock(32, extractedPixels, createdPixels);
    assert(t == sumRMS);
  }
  return sumRMS;
}

/*
 * CC_ALPHA encoder, CC_ALPHA_LERP = 1 variant.
 *
 * Searches three RGBA endpoints (lo, mid, hi; 5 bits per channel).  The
 * channel ranked first by sort4ChannelsByVariance() is searched together
 * with the texel indices; the other three channels are then searched
 * with those indices fixed.  Indices are kept in two 16-entry arrays,
 * one per 4x4 half of the block.
 *
 * Writes the packed block to *blockOut and the reconstructed texels to
 * pixelsOut; returns rmsErrorBlock() for the whole 4x8 block.
 */
static double
Encomp4x8CC_AlphaLerp(FxU32 width, int x, int y,
                      FxI32 *pixelsIn, FxI32 *pixelsOut,
                      CCBlock *blockOut)
{
  /* margin added around the block's min/max when bounding the search */
  static int range = 32;

  int chan, i, j, k;
  FxI32 extractedPixels[32][4];
  FxI32 createdPixels[32][4], createdPixelIndices_0[16], createdPixelIndices_1[16];
  FxI32 pix[4], minPix[4], maxPix[4];

  int sortedChans[4];

  FxI32
    lo[4], loMin[4],
    hi[4], hiMax[4],
    mid[4], midMin[4], midMax[4],
    bestLo[4], bestHi[4], bestMid[4];

  double bestRMS, rms;

  CCCLEAR(*blockOut);
  CC_SETBIT(*blockOut, CC_ALPHA_MODE_2, 0); /* alpha mode */
  CC_SETBIT(*blockOut, CC_ALPHA_MODE_1, 1); /* alpha mode */
  CC_SETBIT(*blockOut, CC_ALPHA_MODE_0, 1); /* alpha mode */
  CC_SETBIT(*blockOut, CC_ALPHA_LERP,   1); /* lerp = 1 */

  /* split the texels into channels and track per-channel min/max */
  for (chan = 0; chan < 4; chan++) {
    minPix[chan] = 255; maxPix[chan] = 0;
  }
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 8; j++) {
      for (chan = 0; chan < 4; chan++) {
        pix[chan] = pixelsIn[(y + i) * width + x + j];
        pix[chan] = (pix[chan] >> channelShift[chan]) & 255;
        extractedPixels[(i << 3) + j][chan] = pix[chan];
        if (pix[chan] < minPix[chan])
          minPix[chan] = pix[chan];
        if (pix[chan] > maxPix[chan])
          maxPix[chan] = pix[chan];
      }
    }
  }

  /* Search bounds, truncated to 5-bit precision.
  ** NOTE(review): the mid bounds are derived from maxPix, exactly like
  ** the upper bound of hi - confirm that this is the intended window.
  */
  for (chan = 0; chan < 4; chan++) {
    loMin[chan] = MAX(0, minPix[chan] - range) & 0xf8;
    hiMax[chan] = MIN(255, maxPix[chan] + range) & 0xf8;
    midMin[chan] = MAX(0, maxPix[chan] - range) & 0xf8;
    midMax[chan] = MIN(255, maxPix[chan] + range) & 0xf8;
  }

  /* do max variance channel first.
  ** this channel chooses indices. Other channels choose
  ** lo, mid & hi only.
  */
  sort4ChannelsByVariance(32, extractedPixels, sortedChans);

  /* make sure there is no garbage in other channels */
  for (chan = 0; chan < 4; chan++) {
    bestLo[chan] = lo[chan] = bestHi[chan] = hi[chan] = bestMid[chan] = mid[chan] = 0;
  }
  chan = sortedChans[0];
  bestRMS = 1e20;
  for (lo[chan] = loMin[chan]; lo[chan] < hiMax[chan]; lo[chan] += 8) {
    for (hi[chan] = loMin[chan]; hi[chan] < hiMax[chan]; hi[chan] += 8) {
      for (mid[chan] = midMin[chan]; mid[chan] < midMax[chan]; mid[chan] += 8) {
        assignIndices4x2(chan, lo, hi, mid, extractedPixels, createdPixelIndices_0, createdPixelIndices_1);
        assignColors4x2(lo, hi, mid, createdPixels, createdPixelIndices_0, createdPixelIndices_1);
        rms = rmsErrorBlockChannel(32, chan, extractedPixels, createdPixels);
        if (rms < bestRMS) {
          bestRMS = rms;
          bestLo[chan] = lo[chan];
          bestHi[chan] = hi[chan];
          bestMid[chan] = mid[chan];
        }
      }
    }
  }

  /* recreate best indices */
  assignIndices4x2(chan, bestLo, bestHi, bestMid, extractedPixels, createdPixelIndices_0, createdPixelIndices_1);

  /* now do other channels, assigning lo, mid and hi but not indices */
  for (i = 1; i < 4; i++) {
    bestRMS = 1e20;
    chan = sortedChans[i];
    for (lo[chan] = loMin[chan]; lo[chan] < hiMax[chan]; lo[chan] += 8) {
      for (hi[chan] = loMin[chan]; hi[chan] < hiMax[chan]; hi[chan] += 8) {
        for (mid[chan] = midMin[chan]; mid[chan] < midMax[chan]; mid[chan] += 8) {
          assignColors4x2(lo, hi, mid, createdPixels, createdPixelIndices_0, createdPixelIndices_1);
          rms = rmsErrorBlockChannel(32, chan, extractedPixels, createdPixels);
          if (rms < bestRMS) {
            bestRMS = rms;
            bestLo[chan] = lo[chan];
            bestHi[chan] = hi[chan];
            bestMid[chan] = mid[chan];
          }
        }
      }
    }
    assert(bestRMS < 1e20);
  }

  /* store the top five bits of each endpoint channel: slot 0 = lo,
  ** slot 1 = mid, slot 2 = hi; colour and alpha use separate fields
  */
  for (chan = 0; chan < 4; chan++) {
    for (i = 0; i < 5; i++) {
      if (chan != 3) {
        CC_SETBIT(*blockOut,
                  CC_ALPHA_COLOR_0 + 5 * chan + i,
                  (bestLo[chan] >> (i + 3)) & 1);
        CC_SETBIT(*blockOut,
                  CC_ALPHA_COLOR_1 + 5 * chan + i,
                  (bestMid[chan] >> (i + 3)) & 1);
        CC_SETBIT(*blockOut,
                  CC_ALPHA_COLOR_2 + 5 * chan + i,
                  (bestHi[chan] >> (i + 3)) & 1);
      } else {
        CC_SETBIT(*blockOut,
                  CC_ALPHA_ALPHA_0 + i,
                  (bestLo[chan] >> (i + 3)) & 1);
        CC_SETBIT(*blockOut,
                  CC_ALPHA_ALPHA_1 + i,
                  (bestMid[chan] >> (i + 3)) & 1);
        CC_SETBIT(*blockOut,
                  CC_ALPHA_ALPHA_2 + i,
                  (bestHi[chan] >> (i + 3)) & 1);
      }
    }
  }

  assignColors4x2(bestLo, bestHi, bestMid, createdPixels, createdPixelIndices_0, createdPixelIndices_1);
  bestRMS = rmsErrorBlock(32, extractedPixels, createdPixels);

  /* fill in output colors */
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 8; j++) {
      /* left 4x4 half uses texel bits 0..31, right half 32..63 */
      int blockOffset = (j >= 4) ? 32 : 0;
      int texOffset = 8 * i + 2 * (j & 3);
      int offset = CC_ALPHA_TEXELS + blockOffset + texOffset;
      FxI32 texelIndex = (j < 4)
        ? createdPixelIndices_0[4 * i + j]
        : createdPixelIndices_1[4 * i + (j - 4)];

      pixelsOut[(y + i) * width + x + j] =
        (createdPixels[(i << 3) + j][0] << channelShift[0]) |
        (createdPixels[(i << 3) + j][1] << channelShift[1]) |
        (createdPixels[(i << 3) + j][2] << channelShift[2]) |
        (createdPixels[(i << 3) + j][3] << channelShift[3]);
      for (k = 0; k < 2; k++) {
        CC_SETBIT(*blockOut, offset + k, (texelIndex >> k) & 1);
      }
    }
  }

  return bestRMS;
}

/*
 * Encode one 4x8 texel block in CC_ALPHA mode.
 *
 * flag       0 selects the non-interpolated variant (CC_ALPHA_LERP = 0),
 *            any other value the interpolated one (CC_ALPHA_LERP = 1).
 * width      row pitch of pixelsIn and pixelsOut, in pixels.
 * x, y       top-left corner of the 4x8 block.
 * pixelsIn   source image, one FxI32 per pixel.
 * pixelsOut  receives the reconstructed texels at the same coordinates.
 * blockOut   receives the packed block.
 *
 * Returns the error figure of the chosen encoding.
 */
double  Encomp4x8CC_Alpha(int flag,
                          FxU32 width, int x, int y,
                          FxI32 *pixelsIn, FxI32 *pixelsOut,
                          CCBlock *blockOut)
{
  if (flag == 0) {
    return Encomp4x8CC_AlphaVQ(width, x, y, pixelsIn, pixelsOut, blockOut);
  }
  return Encomp4x8CC_AlphaLerp(width, x, y, pixelsIn, pixelsOut, blockOut);
}

/*
 * Block encoders tried by encode4x8(), terminated by a null entry.
 * An encoder's position in this table is its method index: encode4x8()
 * compares it against ForceMethod and DisallowMethod and uses it to
 * index CFhistogram.  encode4x8() calls entries 1 and 2 once with
 * flag 0, and entries 0 and 3 twice (flag 0 and flag 1), keeping
 * whichever call reports the smaller error.
 */
CompFun CompressionFunctions[] = 
{
  Encomp4x8CC_Mixed, 
  Encomp4x8CC_HI,
  Encomp4x8CC_Chroma,
  Encomp4x8CC_Alpha,
  0};

/* display names, listed in the same order as CompressionFunctions */
char *CFnames[4] = {"CC-MIXED", "CC-HI", "CC-CHROMA", "CC-ALPHA"};

int CFhistogram[4];             /* histogram of each mode selected */

/* running totals updated by encode4x8(): blocks visited, and the sum of
 * each block's best error divided by 4*8*4 */
int numBlocks;
double totalRMS;

/* method selection for encode4x8(): when ForceMethod is >= 0 only that
 * method index is tried; a method index equal to DisallowMethod is never
 * tried.  -1 leaves the respective restriction off. */
int ForceMethod = -1;
int DisallowMethod = -1;


/*
 * Encode an image as a sequence of 4x8 blocks.
 *
 * Every complete 4x8 block is visited in row-major order (partial blocks
 * at the right and bottom edges are not visited).  For each block every
 * eligible entry of CompressionFunctions is tried and the result with
 * the smallest reported error is handed to callback.  The chosen block
 * is then decoded again with DecodeCCBlock() and asserted to reproduce
 * both the encoder's own reconstruction and its error figure.
 *
 * info      source image; info->any.data is read as one FxI32 per pixel.
 * callback  called once per block with the packed block and its size.
 * verbose   currently unused.
 *
 * Updates numBlocks, totalRMS and CFhistogram.
 *
 * Returns width * height of the image.  If a work buffer cannot be
 * allocated, or ForceMethod/DisallowMethod leave no eligible method,
 * the assertions fire; in builds where assertions are compiled out the
 * function releases its buffers and returns 0 instead.
 */
int 
encode4x8(ImgInfo *info, FXT1CallbackFunc callback , int verbose)
{
  FxU32 x, y, xend, yend;
  int w;
  int i, j, k;
  int bestI;
  int allocOk;
  int result = 0;
  FxI32 *data;
  CCBlock ccblock[4], ccblock_0, ccblock_1;
  double rms[4], rms_0, rms_1, bestRMS;
  FxI32 *extraData[4], *extraData_0, *extraData_1;
  ImgInfo info2;

  (void) verbose;

  /* info2 receives the decoded image used for self-verification */
  info2.any.width = info->any.width;
  info2.any.height = info->any.height;
  info2.any.sizeInBytes = sizeof(FxI32) * info2.any.width * info2.any.height;
  info2.any.data = (unsigned char *) calloc(info2.any.sizeInBytes >> 2, 4);

  /* extraData[m] holds the reconstruction produced by method m;
  ** extraData_0/_1 hold the two flag variants of a two-variant method
  */
  for (i = 0; i < 4; i++) {
    extraData[i] = (FxI32 *) calloc(info->any.width * info->any.height, sizeof(FxI32));
  }
  extraData_0 = (FxI32 *) calloc(info->any.width * info->any.height, sizeof(FxI32));
  extraData_1 = (FxI32 *) calloc(info->any.width * info->any.height, sizeof(FxI32));

  allocOk = info2.any.data && extraData[0] && extraData[1] &&
    extraData[2] && extraData[3] && extraData_0 && extraData_1;
  assert(allocOk);
  if (!allocOk) {
    goto cleanup;
  }

  xend = info->any.width;
  yend = info->any.height;

  data = (FxI32 *)info->any.data;
  w = info->any.width;

  for (y = 0; y + 3 < yend; y += 4) {
    for (x = 0; x + 7 < xend; x += 8) {
      numBlocks++;
      bestRMS = 1e30;
      bestI = -1;

      for (i = 0; CompressionFunctions[i]; i++) {
        if ((ForceMethod >= 0 && ForceMethod != i) || (DisallowMethod == i)) {
          /* not eligible: can never beat bestRMS, nothing to copy */
          rms[i] = 1e30;
          continue;
        }

        if ((i == 1) || (i == 2)) {
          /* single-variant methods */
          rms[i] = CompressionFunctions[i](0, w, x, y, data, extraData[i], &ccblock[i]);
          assert(rms[i] >= 0);
        } else {
          /* two-variant methods: run both flags, keep the better one
          ** (flag 1 wins ties)
          */
          FxI32 *winnerData;
          CCBlock *winnerBlock;

          rms_0 = CompressionFunctions[i](0, w, x, y, data, extraData_0, &ccblock_0);
          assert(rms_0 >= 0);
          rms_1 = CompressionFunctions[i](1, w, x, y, data, extraData_1, &ccblock_1);
          assert(rms_1 >= 0);

          if (rms_0 < rms_1) {
            winnerData = extraData_0;
            winnerBlock = &ccblock_0;
            rms[i] = rms_0;
          } else {
            winnerData = extraData_1;
            winnerBlock = &ccblock_1;
            rms[i] = rms_1;
          }

          for (k = 0; k < 4; k++) {
            for (j = 0; j < 8; j++) {
              extraData[i][(y + k) * w + x + j] = winnerData[(y + k) * w + x + j];
            }
          }
          for (k = 0; k < 4; k++) {
            ccblock[i].w[k] = winnerBlock->w[k];
          }
        }

        if (rms[i] < bestRMS) {
          bestRMS = rms[i];
          bestI = i;
        }
      }

      assert(bestRMS != 1e30);
      if (bestI < 0) {
        /* no method was eligible; never index the tables with bestI */
        goto cleanup;
      }

      totalRMS += bestRMS/(4*8*4);
      CFhistogram[bestI]++;

      callback(&ccblock[bestI], sizeof(ccblock[bestI]));

      /* decode what was just emitted and compare it with the encoder's
      ** own reconstruction and error figure
      */
      DecodeCCBlock(w, x, y, (FxI32 *)info2.any.data, ccblock[bestI]);
      {
        double trms =
          rmsImgBlock(w, x, y,
                      (FxI32 *)info->any.data,
                      (FxI32 *)info2.any.data);

        for (k = 0; k < 4; k++) {
          for (j = 0; j < 8; j++) {
            int index = (y + k) * w + x + j;
            FxI32 a, b;
            a = extraData[bestI][index] & 0xffffffff;
            b = ((FxI32 *)info2.any.data)[index] & 0xffffffff;
            assert(a == b);
          }
        }
        /* make sure this is the case!! */
        assert(trms == rms[bestI]);
      }
    }
  }

  result = xend * yend;

cleanup:
  /* free up buffers (free(NULL) is a no-op) */
  free(extraData_1);
  free(extraData_0);
  free(extraData[3]);
  free(extraData[2]);
  free(extraData[1]);
  free(extraData[0]);
  free(info2.any.data);

  return result;
}



