/*
   file mds_crc16.c
   created Jul 23 2005 by Yann GUIDON (whygee@f-cpu.org)
   version Oct 20 2005 : awful bugs found
   version Oct 22 : nearing completion
   version Oct 24 : validation of a reference algo
   version Oct 25 : optional '~', 64-bit version,
            pseudo-simultaneous-multithread...
   version Oct 30 : ported to Alpha
   version Nov 2: ported and tried on other archs
   version Nov 7: added CRC16_byte_multi

     Description :
   Low-level 16-bit CRC routines for the MDS multitrack container

     Rationale :
   16-bit code is "slower" on protected-mode x86
   (due to the infamous 0x66 opcode prefix)
   but should use half the memory footprint, leaving
   some more room for useful data (512 vs 1K bytes)

     Warning :
   - Various optimisations are used and abused...
   - Most macros are... nasty !!!
   - There is also "some endianness" in optimised versions

   For background & technical explanations, read Ross N. Williams'
    "A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS"
    http://www.ross.net/crc/crcpaper.html

   If you find problems, send me a patch !

Standalone validity test :
  gcc -g -Wall -DSTANDALONE_TEST_CRC16 -o mds_crc16 mds_crc16.c
         (should not display any error or warning message except for printf)
  ./mds_crc16

Compile for speed :
  gcc -Os -fomit-frame-pointer -march={pentium|....}

See the other files for stress tests.

Options:
  #define CRC16_NEGATE : the returned CRC is negated (used only for stress tests)

If you want to re-generate the LUT :
  #define CRC16_SMALL_FOOTPRINT : does not include the (large) constant LUT
  #define DISPLAY_TABLE : as you guess, displays the generated LUT

You can also override some "autotuned" parameters
  #define CRC16 { CRC16_byte | CRC16_byte_multi | CRC16_block_multi | CRC16_block64 | CRC16_block |... }
  #define TYPE_CRC16 { U32 | U16 }

When forcing CRC16_byte_multi or CRC16_block_multi, don't
forget to set  -DCRC16_MULTI=2 .
This should be fixed one day but i don't know how.
*/

#ifndef MDS_CRC16
#define MDS_CRC16

/*--------------
   Data sizes
--------------*/

#include <endian.h>

/* Shorthand names for the integer widths used by the CRC routines. */
#define U8  unsigned char
#define U16 unsigned short int
#ifndef U32  /* may be predefined by external means */
 /* "long" is 64 bits wide on Alpha and on every other LP64 target
    (x86-64 included), so "int" has to be used there to obtain a 32-bit
    type: the word-at-a-time routines load through a U32 pointer and then
    consume exactly 4 bytes per load. */
 #if defined (__alpha) || defined(__LP64__) || defined(_LP64) \
  || defined(__amd64__) || defined (__x86_64__)
  #define U32 unsigned int
 #else
  #define U32 unsigned long int
 #endif
#endif
#define U64 unsigned long long int
/* Cast applied to a pointer before testing its low-order alignment bits. */
#define PTR_CAST (long)  /* types.h says that sizeof(long)=width of the address bus */

/* Unless otherwise requested, this code tries to match the CPU with the best routine.
   CRC16 expands to the name of the selected routine; it can be forced
   from the command line with -DCRC16=<routine name>. */
#ifndef CRC16
 #if __BYTE_ORDER != __LITTLE_ENDIAN
  /* the word-at-a-time routines need a little-endian byte layout */
  #define CRC16 CRC16_byte
  #warning INFO: CRC16_byte chosen (if too inefficient, try -DCRC16=CRC16_byte_multi -DCRC16_MULTI=2)
 #else
/* 64-bit platform ? */
  #if defined (__alpha) || defined(__amd64__) || defined (__x86_64__)
   #define CRC16 CRC16_block64
   #warning INFO: CRC16_block64 chosen
/* then it should be 32-bit */
  #else
   /* "not_defined_at_all" keeps this branch disabled unless it is defined on purpose */
   #if defined(__i686__) && defined(__athlon) && defined(not_defined_at_all)
    #define CRC16 CRC16_block_multi
    #define CRC16_MULTI 2
    #warning INFO: CRC16_block_multi chosen
   #else
    #define CRC16 CRC16_block /* basic 32-bit code */
    #warning INFO: CRC16_block chosen
   #endif
  #endif
 #endif
#endif

/**************************************************
                Some definitions

    Poly = 16,15,2,0
    CRC16 standard:  (1)-1000-0000-0000-0101 = 0x8005
    init=0xFFFF

 **************************************************/

/* Generator polynomial (terms 16,15,2,0) with the x^16 term left implicit. */
#define CRC16_poly  0x8005
/* Value the CRC register starts from before any byte is processed. */
#define CRC16_seed  0xFFFF

/**************************************************
                Sizes
 **************************************************/

#ifndef TYPE_CRC16  /* so it can be redefined by external means */
 /* 32-bit x86 gets 32-bit table entries and register, everything else the
    compact 16-bit form.  The reserved-namespace spellings are tested as
    well because plain "i386" is not predefined when the compiler runs in
    a strict ISO mode (-ansi, -std=c99, ...). */
 #if defined(i386) || defined(__i386) || defined(__i386__)
  #define TYPE_CRC16 U32
 #else
  #define TYPE_CRC16 U16
 #endif
#endif

TYPE_CRC16 crc16_val; /* the CRC "register" used by the CRC16_step* macros */

/* C can't return 2 values at once so we go through global variables.
 It's not reentrant but this kind of ultra-heavy computation should only
 be performed by a monothreaded background task.
 The *_multi routines store the CRC of their first buffer in crc_res_1
 and the CRC of their second buffer in crc_res_2. */
U16 crc_res_1, crc_res_2;


/**************************************************
            CRC lookup table construction
 **************************************************/

#ifndef CRC16_SMALL_FOOTPRINT

/* Precomputed lookup table, one entry per possible input byte.
   Each entry is the CRC16_poly remainder of (index << 8), stored with its
   two bytes swapped, i.e. the values create_CRC16_LUT() computes when
   CRC16_SMALL_FOOTPRINT is defined. */
TYPE_CRC16 CRC16_LUT[256]= {
  0x0000, 0x0580, 0x0F80, 0x0A00, 0x1B80, 0x1E00, 0x1400, 0x1180,
  0x3380, 0x3600, 0x3C00, 0x3980, 0x2800, 0x2D80, 0x2780, 0x2200,
  0x6380, 0x6600, 0x6C00, 0x6980, 0x7800, 0x7D80, 0x7780, 0x7200,
  0x5000, 0x5580, 0x5F80, 0x5A00, 0x4B80, 0x4E00, 0x4400, 0x4180,
  0xC380, 0xC600, 0xCC00, 0xC980, 0xD800, 0xDD80, 0xD780, 0xD200,
  0xF000, 0xF580, 0xFF80, 0xFA00, 0xEB80, 0xEE00, 0xE400, 0xE180,
  0xA000, 0xA580, 0xAF80, 0xAA00, 0xBB80, 0xBE00, 0xB400, 0xB180,
  0x9380, 0x9600, 0x9C00, 0x9980, 0x8800, 0x8D80, 0x8780, 0x8200,
  0x8381, 0x8601, 0x8C01, 0x8981, 0x9801, 0x9D81, 0x9781, 0x9201,
  0xB001, 0xB581, 0xBF81, 0xBA01, 0xAB81, 0xAE01, 0xA401, 0xA181,
  0xE001, 0xE581, 0xEF81, 0xEA01, 0xFB81, 0xFE01, 0xF401, 0xF181,
  0xD381, 0xD601, 0xDC01, 0xD981, 0xC801, 0xCD81, 0xC781, 0xC201,
  0x4001, 0x4581, 0x4F81, 0x4A01, 0x5B81, 0x5E01, 0x5401, 0x5181,
  0x7381, 0x7601, 0x7C01, 0x7981, 0x6801, 0x6D81, 0x6781, 0x6201,
  0x2381, 0x2601, 0x2C01, 0x2981, 0x3801, 0x3D81, 0x3781, 0x3201,
  0x1001, 0x1581, 0x1F81, 0x1A01, 0x0B81, 0x0E01, 0x0401, 0x0181,
  0x0383, 0x0603, 0x0C03, 0x0983, 0x1803, 0x1D83, 0x1783, 0x1203,
  0x3003, 0x3583, 0x3F83, 0x3A03, 0x2B83, 0x2E03, 0x2403, 0x2183,
  0x6003, 0x6583, 0x6F83, 0x6A03, 0x7B83, 0x7E03, 0x7403, 0x7183,
  0x5383, 0x5603, 0x5C03, 0x5983, 0x4803, 0x4D83, 0x4783, 0x4203,
  0xC003, 0xC583, 0xCF83, 0xCA03, 0xDB83, 0xDE03, 0xD403, 0xD183,
  0xF383, 0xF603, 0xFC03, 0xF983, 0xE803, 0xED83, 0xE783, 0xE203,
  0xA383, 0xA603, 0xAC03, 0xA983, 0xB803, 0xBD83, 0xB783, 0xB203,
  0x9003, 0x9583, 0x9F83, 0x9A03, 0x8B83, 0x8E03, 0x8403, 0x8183,
  0x8002, 0x8582, 0x8F82, 0x8A02, 0x9B82, 0x9E02, 0x9402, 0x9182,
  0xB382, 0xB602, 0xBC02, 0xB982, 0xA802, 0xAD82, 0xA782, 0xA202,
  0xE382, 0xE602, 0xEC02, 0xE982, 0xF802, 0xFD82, 0xF782, 0xF202,
  0xD002, 0xD582, 0xDF82, 0xDA02, 0xCB82, 0xCE02, 0xC402, 0xC182,
  0x4382, 0x4602, 0x4C02, 0x4982, 0x5802, 0x5D82, 0x5782, 0x5202,
  0x7002, 0x7582, 0x7F82, 0x7A02, 0x6B82, 0x6E02, 0x6402, 0x6182,
  0x2002, 0x2582, 0x2F82, 0x2A02, 0x3B82, 0x3E02, 0x3402, 0x3182,
  0x1382, 0x1602, 0x1C02, 0x1982, 0x0802, 0x0D82, 0x0782, 0x0202
};

#else

/* Run-time generated lookup table (small-footprint build). */
TYPE_CRC16 CRC16_LUT[256];

/*
 * Fills CRC16_LUT. Must be called once, when the program starts, before
 * any CRC routine is used.
 *
 * For every byte value the remainder of (value << 8) by CRC16_poly is
 * computed bit by bit in a 16-bit register, then stored byte-swapped.
 */
void create_CRC16_LUT() {
  int index;

  for (index = 0; index < 256; index++) {
    U16 reg = index << 8;
    int bits_left = 8;

    while (bits_left-- > 0) {
      U16 shifted = reg << 1;  /* the bit leaving the top is dropped here */

      reg = (reg & 0x8000) ? (shifted ^ CRC16_poly) : shifted;
    }

    /* byteswap */
    CRC16_LUT[index] = ((reg << 8) | (reg >> 8)) & 65535;
  }
}

#endif

/**************************************************
           A reference CRC16 algorithm
 **************************************************/

/*
 * Bit-by-bit reference implementation, used to validate the table-driven
 * routines.
 *
 *   p     : first byte of the message
 *   count : number of bytes to process
 *
 * Returns the CRC with its two bytes swapped (and XORed with 0xFFFF when
 * CRC16_NEGATE is defined).
 */
U16 CRC16_reference(U8 *p, unsigned int count){
  /* Layout of the working word :
       bits 16 and up : "overflow" area, bit 16 is the one tested after a shift
       bits 8-15      : where each message byte is inserted
       bits 0-7       : the rest of the CRC area
  */
  U32 acc = CRC16_seed;
  U32 swapped;
  U8 *end = p + count;
  unsigned int bit;

  while (p != end) {
    acc ^= (*p) << 8;  /* insertion */
    p++;
    for (bit = 8; bit != 0; bit--) {
      acc <<= 1;
      if (acc & 0x10000)
        acc ^= CRC16_poly;
    }
  }

  /* the return value MUST be byteswapped
    so it matches the "reflected" algo */
  swapped = ((acc >> 8) & 255) | ((acc & 255) << 8);
#ifdef CRC16_NEGATE
  swapped ^= 0xFFFF;
#endif
  return swapped;
}

/**************************************************
    And now...
    A rather weird collection of CRC16 routines
 **************************************************/

/* The three macros below update the global register crc16_val through
   CRC16_LUT.  Each expansion already ends its last statement, so call
   sites need no trailing semicolon.  The argument is parenthesised in the
   expansion so that an expression such as (lo | hi << 8) can be passed
   safely. */

/* One byte : msg_in is an unsigned char (U8) */
#define CRC16_step(msg_in) \
   crc16_val = CRC16_LUT[(crc16_val & 255) ^ (msg_in)] ^ (crc16_val >> 8);

/* Two bytes : msg_in is an unsigned short int (U16), first byte in the
   low-order bits.  Expands to several statements: wrap the call in braces
   when it is the body of an if/for/while. */
#define CRC16_stepX2(msg_in) \
   crc16_val ^= (msg_in); \
   crc16_val = CRC16_LUT[crc16_val & 255] ^ (crc16_val >> 8); \
   crc16_val = CRC16_LUT[crc16_val & 255] ^ (crc16_val >> 8);

/* Four bytes : msg_in is an unsigned long int (U32), first byte in the
   low-order bits.  A wider temporary holds the not-yet-consumed bytes. */
#define CRC16_stepX4(msg_in) \
   { \
     unsigned long int t; \
     t = crc16_val ^ (msg_in); \
             t = CRC16_LUT[t & 255] ^ (t >> 8); \
             t = CRC16_LUT[t & 255] ^ (t >> 8); \
             t = CRC16_LUT[t & 255] ^ (t >> 8); \
     crc16_val = CRC16_LUT[t & 255] ^ (t >> 8); \
   }

/*----------------------------------
   Fail-safe for non-LE machines
----------------------------------*/

/*
 * Plain byte-at-a-time table-driven CRC16.
 *
 *   p     : first byte of the message
 *   count : number of bytes to process
 *
 * Returns the CRC register (XORed with 0xFFFF when CRC16_NEGATE is defined).
 */
U16 CRC16_byte(U8 *p, int count) {
  TYPE_CRC16 crc;

  for (crc = CRC16_seed; count != 0; count--) {
    U8 in = *p;

    p++;
    crc = CRC16_LUT[(crc & 255) ^ in] ^ (crc >> 8);
  }

#ifdef CRC16_NEGATE
  crc ^= 0xFFFF;
#endif
  return crc;
}

/* same but with 2 simultaneous computations : */
/*
 * Byte-at-a-time CRC16 of two independent buffers, interleaved.
 *
 *   p, count1 : first buffer and its length in bytes
 *   q, count2 : second buffer and its length in bytes
 *
 * The results are left in the globals crc_res_1 (buffer p) and
 * crc_res_2 (buffer q), XORed with 0xFFFF when CRC16_NEGATE is defined.
 */
void CRC16_byte_multi(U8 *p, int count1, U8 *q, int count2) {
  TYPE_CRC16 crc1=CRC16_seed, crc2=CRC16_seed;
  /* number of bytes both buffers have in common */
  int common = (count2 < count1) ? count2 : count1;

  count1 -= common;
  count2 -= common;

  /* both streams advance in lock-step over the common part */
  for (; common != 0; common--) {
    crc1 = CRC16_LUT[(crc1 & 255) ^ *p] ^ (crc1 >> 8);
    crc2 = CRC16_LUT[(crc2 & 255) ^ *q] ^ (crc2 >> 8);
    p++;
    q++;
  }

  /* at most one of the two leftovers is non-empty */
  for (; count1 != 0; count1--) {
    crc1 = CRC16_LUT[(crc1 & 255) ^ *p] ^ (crc1 >> 8);
    p++;
  }

  for (; count2 != 0; count2--) {
    crc2 = CRC16_LUT[(crc2 & 255) ^ *q] ^ (crc2 >> 8);
    q++;
  }

#ifdef CRC16_NEGATE
  crc_res_1=(U16)(crc1 ^ 0xFFFF);
  crc_res_2=(U16)(crc2 ^ 0xFFFF);
#else
  crc_res_1=(U16)crc1;
  crc_res_2=(U16)crc2;
#endif
}

/*---------------------------------
 This one is ok for classic x86 machines
---------------------------------*/
/*
 * Word-at-a-time CRC16 of the `count` bytes starting at `p`.
 *
 * Multi-byte loads are XORed into t and then consumed with one table
 * lookup per byte, low-order byte first, so the byte stored at the lowest
 * address has to land in the low-order bits of the loaded word
 * (little-endian layout).
 *
 * Returns the CRC narrowed to U16, XORed with 0xFFFF when CRC16_NEGATE
 * is defined.
 *
 * NOTE(review): every U32 load is followed by exactly 4 lookups, which
 * presumes U32 is 4 bytes wide - confirm for the target ABI.
 */
U16 CRC16_block(U8 *p, int count) {
  U32 t=CRC16_seed;

  /* buffers shorter than 8 bytes go straight to the byte-wise epilogue */
  if (count >= 8) {
/* alignment prologue : consumes 1, 2 and/or 4 bytes (7 at most) so that
   the low 3 bits of p are clear when the main loop starts */

    if ( PTR_CAST p & 1  )  {
      /* t still equals CRC16_seed here, hence the constant */
      t = CRC16_LUT[ (0xFF & CRC16_seed) ^ (*p) ] ^ (CRC16_seed >> 8); 
      p++;
      count--;
    }

    if ( PTR_CAST p & 2 ) {
      /* the p / count updates do not depend on t; they are merely
         interleaved with the lookups */
      t ^= *(U16 *)p;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      p+=2;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      count-=2;
    }

    if ( PTR_CAST p & 4 ) {
      t ^= *(U32 *)p;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      count-=4;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      p+=4;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);
    }

/* Loop : 8 bytes per iteration, as two word loads of 4 lookups each */
    while (count>=8) {
      t ^= *(U32 *)p;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      count -= 8;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);

      t ^= (*(U32 *)(p+4));
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      p+=8;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);
    }
  }

/* epilogue: fewer than 8 bytes are left at this point */
/* unknown pointer alignment => byte version */

  if (count & 4) {
    t = CRC16_LUT[(t & 255) ^ * p    ] ^ (t >> 8);
    t = CRC16_LUT[(t & 255) ^ *(p+1) ] ^ (t >> 8);
    t = CRC16_LUT[(t & 255) ^ *(p+2) ] ^ (t >> 8);
    t = CRC16_LUT[(t & 255) ^ *(p+3) ] ^ (t >> 8);
    p+=4;
    count -= 4;
  }

  if (count & 2) {
    t = CRC16_LUT[(t & 255) ^ * p    ] ^ (t >> 8);
    t = CRC16_LUT[(t & 255) ^ *(p+1) ] ^ (t >> 8);
    p+=2;
    count -= 2;
  }
  
 /* here, count==0 or ==1 by design, or else there is a problem */

  if (count & 1)
    t = CRC16_LUT[(t & 255) ^  *p    ] ^ (t >> 8);

#ifdef CRC16_NEGATE
  return t ^ 0xFFFF;
#else
  return t;
#endif
}

/*-------------------------------------
 This one is for x86-64 or ALPHA
 (it compiles into bloated code on x86-32)
-------------------------------------*/
/*
 * Same scheme as the 32-bit block routine, but the main loop fetches
 * 8 bytes with a single U64 load.
 *
 *   p     : first byte of the message
 *   count : number of bytes to process
 *
 * Loaded words are consumed low-order byte first, so the byte stored at
 * the lowest address has to land in the low-order bits (little-endian
 * layout).  Returns the CRC narrowed to U16, XORed with 0xFFFF when
 * CRC16_NEGATE is defined.
 *
 * NOTE(review): the U32 load of the prologue is followed by exactly
 * 4 lookups, which presumes U32 is 4 bytes wide - confirm for the
 * target ABI.
 */
U16 CRC16_block64(U8 *p, int count) {
  U64 t=CRC16_seed;

  /* buffers shorter than 8 bytes go straight to the byte-wise epilogue */
  if (count >= 8) {
/* alignment prologue : consumes 1, 2 and/or 4 bytes (7 at most) so that
   the low 3 bits of p are clear when the main loop starts */

    if ( PTR_CAST p & 1  )  {
      /* t still equals CRC16_seed here, hence the constant */
      t = CRC16_LUT[ (0xFF & CRC16_seed) ^ (*p) ] ^ (CRC16_seed >> 8); 
      p++;
      count--;
    }

    if ( PTR_CAST p & 2 ) {
      /* the p / count updates do not depend on t; they are merely
         interleaved with the lookups */
      t ^= *(U16 *)p;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      p+=2;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      count-=2;
    }

    if ( PTR_CAST p & 4 ) {
      t ^= *(U32 *)p;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      count-=4;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      p+=4;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);
    }

/* Loop : one 8-byte load followed by 8 lookups per iteration */
    while (count>=8) {
      t ^= *(U64 *)p; /* 64 bits at a time ! */
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      count -= 8;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      p+=8;
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);
      t = CRC16_LUT[t & 255] ^ (t >> 8);
    }
  }

/* epilogue: fewer than 8 bytes are left at this point */
/* unknown pointer alignment => byte version */

  if (count & 4) {
    t = CRC16_LUT[(t & 255) ^ * p    ] ^ (t >> 8);
    t = CRC16_LUT[(t & 255) ^ *(p+1) ] ^ (t >> 8);
    t = CRC16_LUT[(t & 255) ^ *(p+2) ] ^ (t >> 8);
    t = CRC16_LUT[(t & 255) ^ *(p+3) ] ^ (t >> 8);
    p+=4;
    count -= 4;
  }

  if (count & 2) {
    t = CRC16_LUT[(t & 255) ^ * p    ] ^ (t >> 8);
    t = CRC16_LUT[(t & 255) ^ *(p+1) ] ^ (t >> 8);
    p+=2;
    count -= 2;
  }
  
 /* here, count==0 or ==1 by design, or else there is a problem */

  if (count & 1)
    t = CRC16_LUT[(t & 255) ^  *p    ] ^ (t >> 8);

#ifdef CRC16_NEGATE
  return t ^ 0xFFFF;
#else
  return t;
#endif
}

/*---------------------------------
 This one is ok for ppro+ machines
---------------------------------*/

/*
 * Word-at-a-time CRC16 of two independent buffers, with the two
 * computations interleaved statement by statement.
 *
 *   p, count1 : first buffer and its length in bytes
 *   q, count2 : second buffer and its length in bytes
 *
 * The results are left in the globals crc_res_1 (buffer p) and
 * crc_res_2 (buffer q), XORed with 0xFFFF when CRC16_NEGATE is defined.
 *
 * Loaded words are consumed low-order byte first, so the byte stored at
 * the lowest address has to land in the low-order bits (little-endian
 * layout), and every U32 load is followed by exactly 4 lookups (U32 has
 * to be 4 bytes wide).
 */
void CRC16_block_multi(U8 *p, int count1, U8 *q, int count2) {
  U32 crc1=CRC16_seed, crc2=CRC16_seed;

/* first alignment prologue : consumes 1, 2 and/or 4 bytes (7 at most)
   so that the low 3 bits of p are clear; skipped for short buffers */
  if (count1 >= 8) {
    if ( PTR_CAST p & 1  )  {
      /* crc1 still equals CRC16_seed here, hence the constant */
      crc1 = CRC16_LUT[ (0xFF & CRC16_seed) ^ (*p) ] ^ (CRC16_seed >> 8);
      p++;
      count1--;
    }

    if ( PTR_CAST p & 2 ) {
      crc1 ^= *(U16 *)p;
      crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
      p+=2;
      crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
      count1-=2;
    }

    if ( PTR_CAST p & 4 ) {
      crc1 ^= *(U32 *)p;
      crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
      count1-=4;
      crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
      p+=4;
      crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
      crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    }
  }

/* second alignment prologue : same thing for q */
  if (count2 >= 8) {
    if ( PTR_CAST q & 1  )  {
      crc2 = CRC16_LUT[ (0xFF & CRC16_seed) ^ (*q) ] ^ (CRC16_seed >> 8);
      q++;
      count2--;
    }

    if ( PTR_CAST q & 2 ) {
      crc2 ^= *(U16 *)q;
      crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
      q+=2;
      crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
      count2-=2;
    }

    if ( PTR_CAST q & 4 ) {
      crc2 ^= *(U32 *)q;
      crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
      count2-=4;
      crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
      q+=4;
      crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
      crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    }
  }

/* Loops : 8 bytes of each buffer per iteration, for as long as both
   buffers still hold at least 8 bytes */
  while ((count1 & ~7) && (count2 & ~7)) {
    crc1 ^= *(U32 *)p;
    crc2 ^= *(U32 *)q;
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    count1 -= 8;
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    count2 -= 8;
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);

    crc1 ^= *(U32 *)(p+4);
    crc2 ^= *(U32 *)(q+4);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    p+=8;
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    q+=8;
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
  }

/* the remainder (if the counts are different)
   Either one or the other will be run at once,
   so some gotos should short-circuit this. */
  while (count1 & ~7) {
    crc1 ^= *(U32 *)p;
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    count1 -= 8;
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);

    crc1 ^= *(U32 *)(p+4);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    p+=8;
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
    crc1 = CRC16_LUT[crc1 & 255] ^ (crc1 >> 8);
  }

  while (count2 & ~7) {
    crc2 ^= *(U32 *)q;
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    count2 -= 8;
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);

    crc2 ^= *(U32 *)(q+4);
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    q+=8;
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
    crc2 = CRC16_LUT[crc2 & 255] ^ (crc2 >> 8);
  }

/* epilogue: fewer than 8 bytes are left in each buffer */
/* unknown pointer alignment => byte version */
  if (count1 & 4) {
    crc1 = CRC16_LUT[(crc1 & 255) ^ * p    ] ^ (crc1 >> 8);
    crc1 = CRC16_LUT[(crc1 & 255) ^ *(p+1) ] ^ (crc1 >> 8);
    crc1 = CRC16_LUT[(crc1 & 255) ^ *(p+2) ] ^ (crc1 >> 8);
    crc1 = CRC16_LUT[(crc1 & 255) ^ *(p+3) ] ^ (crc1 >> 8);
    p+=4;
    count1 -= 4;
  }

  if (count2 & 4) {
    crc2 = CRC16_LUT[(crc2 & 255) ^ * q    ] ^ (crc2 >> 8);
    crc2 = CRC16_LUT[(crc2 & 255) ^ *(q+1) ] ^ (crc2 >> 8);
    crc2 = CRC16_LUT[(crc2 & 255) ^ *(q+2) ] ^ (crc2 >> 8);
    crc2 = CRC16_LUT[(crc2 & 255) ^ *(q+3) ] ^ (crc2 >> 8);
    q+=4;
    count2 -= 4;
  }

  if (count1 & 2) {
    crc1 = CRC16_LUT[(crc1 & 255) ^ * p    ] ^ (crc1 >> 8);
    crc1 = CRC16_LUT[(crc1 & 255) ^ *(p+1) ] ^ (crc1 >> 8);
    p+=2;
    count1 -= 2;
  }

  if (count2 & 2) {
    crc2 = CRC16_LUT[(crc2 & 255) ^ * q    ] ^ (crc2 >> 8);
    crc2 = CRC16_LUT[(crc2 & 255) ^ *(q+1) ] ^ (crc2 >> 8);
    /* these two bytes came from the second buffer: it is q that must
       advance (advancing p here would make both final odd-byte steps
       below read the wrong byte) */
    q+=2;
    count2 -= 2;
  }

  /* here, each count is 0 or 1 */
  if (count1 & 1)
    crc1 = CRC16_LUT[(crc1 & 255) ^  *p    ] ^ (crc1 >> 8);

  if (count2 & 1)
    crc2 = CRC16_LUT[(crc2 & 255) ^  *q    ] ^ (crc2 >> 8);

#ifdef CRC16_NEGATE
  crc1 ^= 0xFFFF;
  crc2 ^= 0xFFFF;
#endif
  crc_res_1=(U16)crc1;
  crc_res_2=(U16)crc2;
}


/**************************************************
              Some standalone tests
 **************************************************/

#ifdef STANDALONE_TEST_CRC16

#include <stdio.h>

#ifdef DISPLAY_TABLE
/*
 * Prints CRC16_LUT on stdout as C source (32 rows of 8 entries), ready to
 * be pasted back as the constant table.
 */
void display_table() {
  unsigned int i, j, k=0;
  printf("TYPE_CRC16 CRC16_LUT[256]= {\n");
  for (i=0; i<32; i++) {
    printf("  ");
    for (j=0; j<8; j++) {
      /* TYPE_CRC16 may be wider than what a "%hX" argument is allowed to
         be, so keep the low 16 bits and pass a plain unsigned int:
         the printed text is the same. */
      printf("0x%04X, ", (unsigned int)(CRC16_LUT[k++] & 0xFFFF));
    }
    printf("\n");
  }
  printf("};\n");
}
#endif

/* Test vector shared by all the standalone tests : 36 characters.
   The terminating NUL is never fed to a CRC routine. */
unsigned char a[]="azertyuiopqsdfghjklmwxcvbn1234567890";

/*
 * Standalone self-test.  Checks the sizes of the U* types, then prints the
 * CRCs computed by every routine over sub-ranges of the test vector so the
 * rows can be compared with each other.
 * Returns 0, or -1 when one of the U* types does not have the expected size.
 *
 * Every printf argument is converted to the type its conversion expects
 * (sizeof yields a size_t, crc16_val may be wider than int).
 */
int main(void) {
  unsigned int i, j, u;

  printf("U8=%d, U16=%d, U32=%d, U64=%d\n",
     (int)sizeof(U8), (int)sizeof(U16), (int)sizeof(U32), (int)sizeof(U64));

  if (sizeof(U8) != 1) {
    printf("Error ! Size of U8 is not 1\n");
    return -1;
  }

  if (sizeof(U16) != 2) {
    printf("Error ! Size of U16 is not 2\n");
    return -1;
  }

  if (sizeof(U32) != 4) {
    printf("Error ! Size of U32 is not 4\n");
    return -1;
  }

  if (sizeof(U64) != 8) {
    printf("Error ! Size of U64 is not 8\n");
    return -1;
  }

#ifdef CRC16_SMALL_FOOTPRINT
  create_CRC16_LUT();
#ifdef DISPLAY_TABLE
  display_table();
#endif
#endif

/* LUT generation tests against a reference bit-wise CRC */
  printf("\n\n TEST 1\n");
  for (j=0; j<35; j++) {

    /* running CRC of a[j..35], printed after each byte */
    crc16_val = CRC16_seed;
    printf("\n\n version 1x: %u\n       ",j);
    for (i=j; i<36; i++) {
      CRC16_step(a[i])
#ifdef CRC16_NEGATE
      printf(" 0x%04X", (unsigned int)(crc16_val ^ 0xFFFF));
#else
      printf(" 0x%04X", (unsigned int)crc16_val);
#endif
    }

    /* the rows below start with the CRC of an empty range */
    printf("\n reference version :\n");
    for (i=j; i<=36; i++) {
      printf(" 0x%04X",CRC16_reference(a+j, i-j));
    }

    printf("\n version CRC16_byte:\n");
    for (i=j; i<=36; i++) {
      printf(" 0x%04X",CRC16_byte(a+j, i-j));
    }

  }

#if __BYTE_ORDER == __LITTLE_ENDIAN

/* block alignment tests: */
  printf("\n\n TEST 2\n");
  for (j=0; j<35; j++) {

    crc16_val = CRC16_seed;
    printf("\n\n version 1x: %u\n       ",j);
    for (i=j; i<36; i++) {
      CRC16_step(a[i])
#ifdef CRC16_NEGATE
      printf(" 0x%04X", (unsigned int)(crc16_val ^ 0xFFFF));
#else
      printf(" 0x%04X", (unsigned int)crc16_val);
#endif
    }

    printf("\n version 32 bits:\n");
    for (i=j; i<=36; i++) {
      printf(" 0x%04X",CRC16_block(a+j, i-j));
    }

    printf("\n version 64 bits:\n");
    for (i=j; i<=36; i++) {
      printf(" 0x%04X",CRC16_block64(a+j, i-j));
    }

  }

/* same but worse : */
  printf("\n\n TEST 3\n");

  for (j=0; j<18; j++) {
    crc16_val = CRC16_seed;
    printf("\n\n version 1x: %u\n       ",j);
    for (i=j; i<18; i++) {
      CRC16_step(a[i])
#ifdef CRC16_NEGATE
      printf(" 0x%04X", (unsigned int)(crc16_val ^ 0xFFFF));
#else
      printf(" 0x%04X", (unsigned int)crc16_val);
#endif
    }

    /* only the result of the first buffer (crc_res_1) is displayed */
    printf("\n CRC16_block_multi:\n");
    for (i=j; i<=18; i++) {
      CRC16_block_multi(a+j, i-j, a+i, j);
      printf(" 0x%04X",crc_res_1);
    }

    printf("\n CRC16_byte_multi:\n");
    for (i=j; i<=18; i++) {
      CRC16_byte_multi(a+j, i-j, a+i, j);
      printf(" 0x%04X",crc_res_1);
    }

  }

#endif /*  __LITTLE_ENDIAN */


/* 1X, 2X & 4X comparison : */
  printf("\n\n TEST 4\n");

  crc16_val = CRC16_seed;
  printf("\n version 1x (bis):\n       ");
  for (i=0; i<36; i++) {
    CRC16_step(a[i])
#ifdef CRC16_NEGATE
    printf(" 0x%04X", (unsigned int)(crc16_val ^ 0xFFFF));
#else
    printf(" 0x%04X", (unsigned int)crc16_val);
#endif
  }

  printf("\n version CRC16_byte:\n");

  for (i=0; i<=36; i++) {
    printf(" 0x%04X", CRC16_byte(a, i));
  }

  printf("\n version 2x:\n       ");
  crc16_val = CRC16_seed;

  for (i=0; i<36; i+=2) {
    /* the word is assembled by hand, first byte in the low-order bits */
    u= a[i] | (a[i+1]<<8) ;
    CRC16_stepX2( u )
#ifdef CRC16_NEGATE
    printf("        0x%04X", (unsigned int)(crc16_val ^ 0xFFFF));
#else
    printf("        0x%04X", (unsigned int)crc16_val);
#endif
  }

  printf("\n version 4x:\n       ");
  crc16_val = CRC16_seed;

  for (i=0; i<36; i+=4) {
    u= a[i] | (a[i+1]<<8) | (a[i+2]<<16) | (a[i+3]<<24) ;
    CRC16_stepX4( u )
#ifdef CRC16_NEGATE
    printf("                      0x%04X", (unsigned int)(crc16_val ^ 0xFFFF));
#else
    printf("                      0x%04X", (unsigned int)crc16_val);
#endif
  }

  printf("\n\n");
  return 0;
}

#endif /* STANDALONE_TEST_CRC16 */

#endif /* MDS_CRC16 */
