/*

DCT1D_01.c
version Tue Jan  1 08:45:57 GMT 2002
whygee@f-cpu.org

1-D 8-bin DCT for a "baseline" JPEG compressor.

originally cut and pasted from : sbcci_DCT2D.pdf
"Pipelined Fast 2-D DCT Architecture for JPEG Image Compression"
Luciano Volcan Agostini  <agostini@inf.ufrgs.br>
Ivan Saraiva Silva       <ivan@dimap.ufrn.br>
Sergio Bampi             <bampi@inf.ufrgs.br>

Table 2 : Bit width differences between the two
1-D DCT architectures
Pipeline  First 1-D DCT   Second 1-D DCT
 Stage      bit width        bit width
  1             9               13
  2            10               14
  3            11               15
  4            11               15
  5            12               15
  6            12               15

=> the data are in 1Q15 format and left-aligned
at the beginning.

 m1 = cos(4pi/16)
 m2 = cos(6pi/16)
 m3 = cos(2pi/16) - cos(6pi/16)
 m4 = cos(2pi/16)+ cos(6pi/16)


Optimisation : part 1 = variable name inference

A lot of work was already done so it is easy to
short-circuit temporary variables. Each level
(a to S) is an individual layer so the verification
range is reduced.

b2 bug corrected on Tue Dec 16 06:37:45 CET 2003

*/

/* Working variables, one declaration per step: the letter tells which step
   assigns the value (b = Step 1, c = Step 2, d = Step 3, e = Step 4,
   f = Step 5).  Indices that are absent (c7, d2, d5..d8, e0, e1, e5, e8,
   f0, f1) are values forwarded unchanged from an earlier step, so the
   earlier variable is used directly instead. */
sample b0, b1, b2, b3, b4, b5, b6, b7;
sample c0, c1, c2, c3, c4, c5, c6;
sample d0, d1, d3, d4;
sample e2, e3, e4, e6, e7;
sample f2, f3, f4, f5, f6, f7;

/* Step 1 */
b0 = a0 + a7;
b1 = a1 + a6;
b2 = a3 - a4; /* corrected */
b3 = a1 - a6;
b4 = a2 + a5;
b5 = a3 + a4;
b6 = a2 - a5;
b7 = a0 - a7;

/* Step 2 */
c0 = b0 + b5;
c1 = b1 - b4;
c2 = b2 + b6;
c3 = b1 + b4;
c4 = b0 - b5;
c5 = b3 + b7;
c6 = b3 + b6;
/*c7 = b7;*/

/* Step 3: third layer of additions.  Only four new values are needed. */
d0 = c0 + c3;   /* output as S0 in Step 6 */
d1 = c0 - c3;   /* output as S4 in Step 6 */
d3 = c1 + c4;   /* multiplied by m1 in Step 4 */
d4 = c2 - c5;   /* multiplied by m2 in Step 4 */

/* Everything else is carried forward under its existing name:
     d2 -> c2,  d5 -> c4,  d6 -> c5,  d7 -> c6,  d8 -> b7 (was c7). */

/* Step 4 */
/*e0 = d0;*/
/*e1 = d1;*/
e2 = m3 * /*d2 =*/ c2;
e3 = m1 * /*d7 =*/ c6;
e4 = m4 * /*d6 =*/ c5;
/* e5 = d5 =c4; */
e6 = m1 * d3;
e7 = m2 * d4;
/*e8 = d8;*/

/* Step 5: additions that follow the multiplications.  c4 stands in for e5
   and b7 for e8; d0 and d1 (f0 and f1) need no work in this step. */
f2 = c4 + e6;   /* c4 in place of e5 */
f3 = c4 - e6;

f4 = b7 + e3;   /* b7 in place of e8 */
f5 = b7 - e3;

f6 = e2 + e7;   /* e7 is shared by f6 and f7 */
f7 = e4 + e7;

/* Step 6 */
S0 = d0 /* = f0*/;
S1 = f4 + f7;
S2 = f2;
S3 = f5 - f6;
S4 = d1 /* = f1 */;
S5 = f5 + f6;
S6 = f3;
S7 = f4 - f7;