Move MDCT reconstruction window code over to the new trig function.  This improves accuracy significantly and slightly reduces code size.  Codec SNR now appears to be limited by truncation to 16 bits.  Comparison to the MS decoder gives > 91 dB of agreement, and a lower RMS error versus the source wav than the MS decoder.  Additionally, move one commonly accessed table into IRAM.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@13813 a1c6a512-1295-4272-9138-f99709370657
diff --git a/apps/codecs/libwma/wmadec.h b/apps/codecs/libwma/wmadec.h
index 0de5079..e3e5d38 100644
--- a/apps/codecs/libwma/wmadec.h
+++ b/apps/codecs/libwma/wmadec.h
@@ -131,7 +131,7 @@
     fixed32 coefs[MAX_CHANNELS][BLOCK_MAX_SIZE];
     MDCTContext mdct_ctx[BLOCK_NB_SIZES];
     fixed32 *windows[BLOCK_NB_SIZES];
-    FFTComplex mdct_tmp[BLOCK_MAX_SIZE]; /* temporary storage for imdct */
+    FFTComplex *mdct_tmp; /* temporary storage for imdct */
     /* output buffer for one frame and the last for IMDCT windowing */
     fixed32 frame_out[MAX_CHANNELS][BLOCK_MAX_SIZE * 2];
     /* last frame info */
diff --git a/apps/codecs/libwma/wmadeci.c b/apps/codecs/libwma/wmadeci.c
index 1857e6a..bb9b96a 100644
--- a/apps/codecs/libwma/wmadeci.c
+++ b/apps/codecs/libwma/wmadeci.c
@@ -114,7 +114,9 @@
 fixed32 tsin0[1024], tsin1[512], tsin2[256], tsin3[128], tsin4[64];
 
 FFTComplex *exparray[5];                                    //these are the fft lookup tables
+
 uint16_t *revarray[5];
+
 FFTComplex  exptab0[512] IBSS_ATTR;//, exptab1[256], exptab2[128], exptab3[64], exptab4[32];    //folded these in!
 uint16_t revtab0[1024], revtab1[512], revtab2[256], revtab3[128], revtab4[64];
 
@@ -122,6 +124,7 @@
 
 uint16_t runtab0[1336], runtab1[1336], levtab0[1336], levtab1[1336];                //these could be made smaller since only one can be 1336
 
+FFTComplex mdct_tmp[BLOCK_MAX_SIZE] IBSS_ATTR; 			/* temporary storage for imdct */
 
 //may also be too large by ~ 1KB each?
 static VLC_TYPE vlcbuf1[6144][2];
@@ -1080,6 +1083,7 @@
     exparray[0] = exptab0; //exparray[1] = exptab1; exparray[2] = exptab2; exparray[3] = exptab3; exparray[4] = exptab4;
     revarray[0]=revtab0; revarray[1]=revtab1; revarray[2]=revtab2; revarray[3]=revtab3; revarray[4]=revtab4;
 
+	s->mdct_tmp = mdct_tmp; /* temporary storage for imdct */
     for(i = 0; i < s->nb_block_sizes; ++i)
     {
         ff_mdct_init(&s->mdct_ctx[i], s->frame_len_bits - i + 1, 1);
@@ -1108,12 +1112,13 @@
 
         //fixed32 n2 = itofix32(n<<1);        //2x the window length
         //alpha = fixdiv32(M_PI_F, n2);        //PI / (2x Window length) == PI<<(s->frame_len_bits - i+1)
-        //printf("two values of alpha %16.10lf %16.10lf\n", fixtof64(alpha), fixtof64(M_PI_F>>(s->frame_len_bits - i+1)));
-        alpha = M_PI_F>>(s->frame_len_bits - i+1);
+
+        //alpha = M_PI_F>>(s->frame_len_bits - i+1);
+        alpha = (1<<15)>>(s->frame_len_bits - i+1);	/* this calculates 0.5/(2*n) */
         for(j=0;j<n;++j)
         {
             fixed32 j2 = itofix32(j) + 0x8000;
-            window[j] = fixsin32(fixmul32(j2,alpha));        //alpha between 0 and pi/2
+             window[j] = fsincos(fixmul32(j2,alpha)<<16, 0);        //alpha between 0 and pi/2
 
         }
         //printf("created window\n");
@@ -1192,43 +1197,7 @@
     return 0;
 }
 
-#if 0
-/* interpolate values for a bigger or smaller block. The block must
-   have multiple sizes */
-static void interpolate_array(fixed32 *scale, int old_size, int new_size)
-{
-    int i, j, jincr, k;
-    fixed32 v;
 
-
-
-    if (new_size > old_size)
-    {
-        jincr = new_size / old_size;
-        j = new_size;
-        for(i = old_size - 1; i >=0; --i)
-        {
-            v = scale[i];
-            k = jincr;
-            do
-            {
-                scale[--j] = v;
-            }
-            while (--k);
-        }
-    }
-    else if (new_size < old_size)
-    {
-        j = 0;
-        jincr = old_size / new_size;
-        for(i = 0; i < new_size; ++i)
-        {
-            scale[i] = scale[j];
-            j += jincr;
-        }
-    }
-}
-#endif
 /* compute x^-0.25 with an exponent and mantissa table. We use linear
    interpolation to reduce the mantissa table size at a small speed
    expense (linear interpolation approximately doubles the number of
@@ -1958,9 +1927,9 @@
 }
 
 int wma_decode_superframe(WMADecodeContext* s,
-                                 void *data,
+                                 void *data,	/*output*/
                                  int *data_size,
-                                 uint8_t *buf,
+                                 uint8_t *buf,	/*input*/
                                  int buf_size)
 {
     //WMADecodeContext *s = avctx->priv_data;
diff --git a/apps/codecs/libwma/wmafixed.c b/apps/codecs/libwma/wmafixed.c
index 7c38009..3a902dd 100644
--- a/apps/codecs/libwma/wmafixed.c
+++ b/apps/codecs/libwma/wmafixed.c
@@ -68,7 +68,7 @@
     return (fixed32)temp;
 }
 
-
+#endif
 /*
 	Special fixmul32 that does a 16.16 x 1.31 multiply that returns a 16.16 value.
 	this is needed because the fft constants are all normalized to be less then 1
@@ -76,7 +76,7 @@
 
 
 */
-
+#ifndef CPU_ARM
 fixed32 fixmul32b(fixed32 x, fixed32 y)
 {
     fixed64 temp;
@@ -88,10 +88,10 @@
 
     return (fixed32)temp;
 }
-
 #endif
 
 
+
 /*
 	Not performance senstitive code here
 
@@ -275,6 +275,7 @@
 
 */
 
+#if 0
 fixed32 fixsin32(fixed32 x)
 {
 
@@ -325,3 +326,4 @@
 {
     return fixsin32(x - (M_PI_F>>1))*-1;
 }
+#endif