A small improvement to Vorbis block synthesis.
Added myself to the list of contributors.


git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6664 a1c6a512-1295-4272-9138-f99709370657
diff --git a/apps/codecs/Tremor/asm_mcf5249.h b/apps/codecs/Tremor/asm_mcf5249.h
index 811148a..9844cc0 100644
--- a/apps/codecs/Tremor/asm_mcf5249.h
+++ b/apps/codecs/Tremor/asm_mcf5249.h
@@ -21,6 +21,9 @@
 
 #if CONFIG_CPU == MCF5249 && !defined(SIMULATOR)
 
+/* GCC variable attribute: align the object to 16 bytes (empty on other builds) */
+#define LINE_ATTR   __attribute__ ((aligned (16)))
+
 #ifndef _V_WIDE_MATH
 #define _V_WIDE_MATH
 
@@ -107,15 +110,14 @@
 }
 
 
-
-
-#if 1 /* Canonical definition */
+#if 1
+/* canonical definition */
 #define XPROD32(_a, _b, _t, _v, _x, _y)         \
   { (_x)=MULT32(_a,_t)+MULT32(_b,_v);           \
     (_y)=MULT32(_b,_t)-MULT32(_a,_v); }
 #else
-/* Thom Johansen suggestion; this could loose the lsb by overflow
-   but does it matter in practice? */
+/* Thom Johansen's suggestion; this could lose the LSB by overflow.
+   Does it matter in practice? */
 #define XPROD32(_a, _b, _t, _v, _x, _y)     \
   asm volatile ("mac.l %[a], %[t], %%acc0;" \
                 "mac.l %[b], %[v], %%acc0;" \
@@ -129,14 +131,82 @@
                 : [a] "r" (_a), [b] "r" (_b), \
                   [t] "r" (_t), [v] "r" (_v) \
                 : "cc");
-#endif 
+#endif
 
 
-/* asm versions of vector multiplication for window.c */
+/* asm versions of vector operations for block.c, window.c */
 /* assumes MAC is initialized & accumulators cleared */
 static inline 
+void mcf5249_vect_add(ogg_int32_t *x, ogg_int32_t *y, int n)
+{
+  /* x[i] += y[i] for n elements.  Head loop: step x to a 16-byte boundary */
+  while(n>0 && (int)x%16) {
+    *x++ += *y++;
+    n--;
+  }
+  asm volatile ("bra 1f;"                     /* test n before first pass */
+                "0:"                          /* loop start */
+                "movem.l (%[x]), %%d0-%%d3;"  /* fetch 4 words of x and y */
+                "movem.l (%[y]), %%a0-%%a3;"
+                /* add */
+                "add.l %%a0, %%d0;"
+                "add.l %%a1, %%d1;"
+                "add.l %%a2, %%d2;"
+                "add.l %%a3, %%d3;"
+                /* store and advance */
+                "movem.l %%d0-%%d3, (%[x]);"
+                "lea.l (4*4, %[x]), %[x];"
+                "lea.l (4*4, %[y]), %[y];"
+                "subq.l #4, %[n];"     /* done 4 elements */
+                "1: cmpi.l #4, %[n];"  /* loop while n >= 4 */
+                "bge 0b;"
+                : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
+                : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
+                    "cc", "memory");
+  /* add the remaining (fewer than 4) elements */
+  while (n>0) {
+    *x++ += *y++;
+    n--;
+  }
+}
+
+static inline
+void mcf5249_vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n)
+{
+  /* x[i] = y[i] for n elements (x is the destination); align x to 16 bytes */
+  while(n>0 && (int)x%16) {
+    *x++ = *y++;
+    n--;
+  }
+  asm volatile ("bra 1f;"                               /* test n first */
+                "0:"                                    /* loop start */
+                "movem.l (%[y]), %%d0-%%d3;"            /* fetch values */
+                "movem.l %%d0-%%d3, (%[x]);"            /* store */
+                "lea.l (4*4, %[x]), %[x];"              /* advance */
+                "lea.l (4*4, %[y]), %[y];"
+                "subq.l #4, %[n];"                      /* done 4 elements */
+                "1: cmpi.l #4, %[n];"                   /* loop while n >= 4 */
+                "bge 0b;"
+                : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
+                : : "%d0", "%d1", "%d2", "%d3", "cc", "memory");
+  /* copy the remaining (fewer than 4) elements */
+  while (n>0) {
+    *x++ = *y++;
+    n--;
+  }
+}
+
+
+static inline 
 void mcf5249_vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
 {
+  /* ensure data is aligned to 16-bytes */
+  while(n>0 && (int)data%16) {
+    *data = MULT31(*data, *window);
+    data++;
+    window++;
+    n--;
+  }
   asm volatile ("movem.l (%[d]), %%d0-%%d3;"  /* loop start */
                 "movem.l (%[w]), %%a0-%%a3;"  /* pre-fetch registers */
                 "lea.l (4*4, %[w]), %[w];"
@@ -184,6 +254,13 @@
 static inline 
 void mcf5249_vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
 {
+  /* ensure at least data is aligned to 16-bytes */
+  while(n>0 && (int)data%16) {
+    *data = MULT31(*data, *window);
+    data++;
+    window--;
+    n--;
+  }
   asm volatile ("lea.l (-3*4, %[w]), %[w];"     /* loop start */
                 "movem.l (%[d]), %%d0-%%d3;"    /* pre-fetch registers */
                 "movem.l (%[w]), %%a0-%%a3;"
@@ -232,6 +309,11 @@
 static inline 
 void mcf5249_vect_zero(ogg_int32_t *ptr, int n)
 {
+  /* ensure ptr is aligned to 16-bytes */
+  while(n>0 && (int)ptr%16) {
+    *ptr++ = 0;
+    n--;
+  }
   asm volatile ("clr.l %%d0;"
                 "clr.l %%d1;"
                 "clr.l %%d2;"
@@ -241,23 +323,16 @@
                 "bra 1f;"
                 "0: movem.l %%d0-%%d3, (%[ptr]);"
                 "lea (4*4, %[ptr]), %[ptr];"
-                "subq.l #4, %[n];"
+                "subq.l #4, %[n];"  /* done 4 elements */
                 "1: bgt 0b;"
-                /* remaing elements */
-                "tst.l %[n];"
-                "beq 1f;"      /* n=0 */
-                "clr.l (%[ptr])+;"
-                "subq.l #1, %[n];"
-                "beq 1f;"     /* n=1 */
-                "clr.l (%[ptr])+;"
-                "subq.l #1, %[n];"
-                "beq 1f;"     /* n=2 */
-                /* otherwise n = 3 */
-                "clr.l (%[ptr])+;"
-                "1:"
                 : [n] "+d" (n), [ptr] "+a" (ptr)
                 :
                 : "%d0","%d1","%d2","%d3","cc","memory");
+  /* clear remaining elements */
+  while(n>0) {
+    *ptr++ = 0;
+    n--;
+  }
 }
 
 #endif
@@ -272,4 +347,6 @@
 }
 
 #endif
+#else
+#define LINE_ATTR
 #endif
diff --git a/apps/codecs/Tremor/block.c b/apps/codecs/Tremor/block.c
index 6f88fb8..f51622b 100644
--- a/apps/codecs/Tremor/block.c
+++ b/apps/codecs/Tremor/block.c
@@ -70,8 +70,8 @@
                           |   |   |endSr
                           |   |beginSr
                           |   |endSl
-			  |beginSl
-			  |beginW
+                          |beginSl
+                          |beginW
 */
 
 /* block abstraction setup *********************************************/
@@ -173,10 +173,8 @@
   v->pcm=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcm));
   v->pcmret=(ogg_int32_t **)_ogg_malloc(vi->channels*sizeof(*v->pcmret));
 
-  // pbv: allow for extra padding for windowing
   for(i=0;i<vi->channels;i++)
     v->pcm[i]=(ogg_int32_t *)_ogg_calloc(v->pcm_storage,sizeof(*v->pcm[i]));
-    // v->pcm[i]=(ogg_int32_t *)_ogg_calloc(v->pcm_storage,sizeof(*v->pcm[i]));
     
 
   /* all 1 (large block) or 0 (small block) */
@@ -190,7 +188,7 @@
     int mapnum=ci->mode_param[i]->mapping;
     int maptype=ci->map_type[mapnum];
     b->mode[i]=_mapping_P[maptype]->look(v,ci->mode_param[i],
-					 ci->map_param[mapnum]);
+                                         ci->map_param[mapnum]);
   }
   return(0);
 }
@@ -231,7 +229,7 @@
 
     if(v->pcm){
       for(i=0;i<vi->channels;i++)
-	if(v->pcm[i])_ogg_free(v->pcm[i]);
+        if(v->pcm[i])_ogg_free(v->pcm[i]);
       _ogg_free(v->pcm);
       if(v->pcmret)_ogg_free(v->pcmret);
     }
@@ -239,9 +237,9 @@
     /* free mode lookups; these are actually vorbis_look_mapping structs */
     if(ci){
       for(i=0;i<ci->modes;i++){
-	int mapnum=ci->mode_param[i]->mapping;
-	int maptype=ci->map_type[mapnum];
-	if(b && b->mode)_mapping_P[maptype]->free_look(b->mode[i]);
+        int mapnum=ci->mode_param[i]->mapping;
+        int maptype=ci->map_type[mapnum];
+        if(b && b->mode)_mapping_P[maptype]->free_look(b->mode[i]);
       }
     }
 
@@ -262,7 +260,11 @@
   vorbis_info *vi=v->vi;
   codec_setup_info *ci=(codec_setup_info *)vi->codec_setup;
   private_state *b=v->backend_state;
+#if CONFIG_CPU == MCF5249
+  int j;
+#else
   int i,j;
+#endif
 
   if(v->pcm_current>v->pcm_returned  && v->pcm_returned!=-1)return(OV_EINVAL);
 
@@ -304,43 +306,64 @@
     for(j=0;j<vi->channels;j++){
       /* the overlap/add section */
       if(v->lW){
-	if(v->W){
-	  /* large/large */
-	  ogg_int32_t *pcm=v->pcm[j]+prevCenter;
-	  ogg_int32_t *p=vb->pcm[j];
-	  for(i=0;i<n1;i++)
-	    pcm[i]+=p[i];
-	}else{
-	  /* large/small */
-	  ogg_int32_t *pcm=v->pcm[j]+prevCenter+n1/2-n0/2;
-	  ogg_int32_t *p=vb->pcm[j];
-	  for(i=0;i<n0;i++)
-	    pcm[i]+=p[i];
-	}
+        if(v->W){
+          /* large/large */
+          ogg_int32_t *pcm=v->pcm[j]+prevCenter;
+          ogg_int32_t *p=vb->pcm[j];
+#if CONFIG_CPU == MCF5249
+          mcf5249_vect_add(pcm, p, n1);
+#else
+          for(i=0;i<n1;i++)
+            pcm[i]+=p[i]; 
+#endif
+        }else{
+          /* large/small */
+          ogg_int32_t *pcm=v->pcm[j]+prevCenter+n1/2-n0/2;
+          ogg_int32_t *p=vb->pcm[j];
+#if CONFIG_CPU == MCF5249
+          mcf5249_vect_add(pcm, p, n0);
+#else
+          for(i=0;i<n0;i++)
+            pcm[i]+=p[i];
+#endif
+        }
       }else{
-	if(v->W){
-	  /* small/large */
-	  ogg_int32_t *pcm=v->pcm[j]+prevCenter;
-	  ogg_int32_t *p=vb->pcm[j]+n1/2-n0/2;
-	  for(i=0;i<n0;i++)
-	    pcm[i]+=p[i];
-	  for(;i<n1/2+n0/2;i++)
-	    pcm[i]=p[i];
-	}else{
-	  /* small/small */
-	  ogg_int32_t *pcm=v->pcm[j]+prevCenter;
-	  ogg_int32_t *p=vb->pcm[j];
-	  for(i=0;i<n0;i++)
-	    pcm[i]+=p[i];
-	}
+        if(v->W){
+          /* small/large */
+          ogg_int32_t *pcm=v->pcm[j]+prevCenter;
+          ogg_int32_t *p=vb->pcm[j]+n1/2-n0/2;
+#if CONFIG_CPU == MCF5249
+          mcf5249_vect_add(pcm, p, n0);
+          mcf5249_vect_copy(&pcm[n0], &p[n0], n1/2-n0/2);
+#else
+          for(i=0;i<n0;i++)
+            pcm[i]+=p[i];
+          for(;i<n1/2+n0/2;i++)
+            pcm[i]=p[i];
+#endif
+        }else{
+          /* small/small */
+          ogg_int32_t *pcm=v->pcm[j]+prevCenter;
+          ogg_int32_t *p=vb->pcm[j];
+#if CONFIG_CPU == MCF5249
+          mcf5249_vect_add(pcm, p, n0);
+#else 
+          for(i=0;i<n0;i++)
+            pcm[i]+=p[i];
+#endif
+        }
       }
       
       /* the copy section */
       {
-	ogg_int32_t *pcm=v->pcm[j]+thisCenter;
-	ogg_int32_t *p=vb->pcm[j]+n;
-	for(i=0;i<n;i++)
-	  pcm[i]=p[i];
+        ogg_int32_t *pcm=v->pcm[j]+thisCenter;
+        ogg_int32_t *p=vb->pcm[j]+n;
+#if CONFIG_CPU == MCF5249
+        mcf5249_vect_copy(pcm, p, n);
+#else
+        for(i=0;i<n;i++)
+          pcm[i]=p[i];
+#endif
       }
     }
     
@@ -359,8 +382,8 @@
     }else{
       v->pcm_returned=prevCenter;
       v->pcm_current=prevCenter+
-	ci->blocksizes[v->lW]/4+
-	ci->blocksizes[v->W]/4;
+        ci->blocksizes[v->lW]/4+
+        ci->blocksizes[v->W]/4;
     }
 
   }
@@ -389,23 +412,23 @@
       
       /* is this a short page? */
       if(b->sample_count>v->granulepos){
-	/* corner case; if this is both the first and last audio page,
-	   then spec says the end is cut, not beginning */
-	if(vb->eofflag){
-	  /* trim the end */
-	  /* no preceeding granulepos; assume we started at zero (we'd
-	     have to in a short single-page stream) */
-	  /* granulepos could be -1 due to a seek, but that would result
-	     in a long coun`t, not short count */
-	  
-	  v->pcm_current-=(b->sample_count-v->granulepos);
-	}else{
-	  /* trim the beginning */
-	  v->pcm_returned+=(b->sample_count-v->granulepos);
-	  if(v->pcm_returned>v->pcm_current)
-	    v->pcm_returned=v->pcm_current;
-	}
-	
+        /* corner case; if this is both the first and last audio page,
+           then spec says the end is cut, not beginning */
+        if(vb->eofflag){
+          /* trim the end */
+          /* no preceding granulepos; assume we started at zero (we'd
+             have to in a short single-page stream) */
+          /* granulepos could be -1 due to a seek, but that would result
+             in a long count, not short count */
+          
+          v->pcm_current-=(b->sample_count-v->granulepos);
+        }else{
+          /* trim the beginning */
+          v->pcm_returned+=(b->sample_count-v->granulepos);
+          if(v->pcm_returned>v->pcm_current)
+            v->pcm_returned=v->pcm_current;
+        }
+        
       }
       
     }
@@ -414,16 +437,16 @@
     if(vb->granulepos!=-1 && v->granulepos!=vb->granulepos){
       
       if(v->granulepos>vb->granulepos){
-	long extra=v->granulepos-vb->granulepos;
-	
-	if(extra)
-	  if(vb->eofflag){
-	    /* partial last frame.  Strip the extra samples off */
-	    v->pcm_current-=extra;
-	  } /* else {Shouldn't happen *unless* the bitstream is out of
-	       spec.  Either way, believe the bitstream } */
+        long extra=v->granulepos-vb->granulepos;
+        
+        if(extra)
+          if(vb->eofflag){
+            /* partial last frame.  Strip the extra samples off */
+            v->pcm_current-=extra;
+          } /* else {Shouldn't happen *unless* the bitstream is out of
+               spec.  Either way, believe the bitstream } */
       } /* else {Shouldn't happen *unless* the bitstream is out of
-	   spec.  Either way, believe the bitstream } */
+           spec.  Either way, believe the bitstream } */
       v->granulepos=vb->granulepos;
     }
   }
@@ -441,7 +464,7 @@
     if(pcm){
       int i;
       for(i=0;i<vi->channels;i++)
-	v->pcmret[i]=v->pcm[i]+v->pcm_returned;
+        v->pcmret[i]=v->pcm[i]+v->pcm_returned;
       *pcm=v->pcmret;
     }
     return(v->pcm_current-v->pcm_returned);
diff --git a/apps/codecs/Tremor/mapping0.c b/apps/codecs/Tremor/mapping0.c
index c53383d..6154f5d 100644
--- a/apps/codecs/Tremor/mapping0.c
+++ b/apps/codecs/Tremor/mapping0.c
@@ -202,10 +202,6 @@
   int   nonzero[CHANNELS];
   void *floormemo[CHANNELS];
 
-  /* test for too many channels; 
-     (maybe this is can be checked at the stream level?) */
-  if (vi->channels > CHANNELS) return (-1);
-
   /* time domain information decode (note that applying the
      information would have to happen later; we'll probably add a
      function entry to the harness for that later */
@@ -286,13 +282,14 @@
   //_analysis_output("residue",seq+j,vb->pcm[j],-8,n/2,0,0);
 
   /* compute and apply spectral envelope */
+#if 0
   for(i=0;i<vi->channels;i++){
     ogg_int32_t *pcm=vb->pcm[i];
     int submap=info->chmuxlist[i];
     look->floor_func[submap]->
       inverse2(vb,look->floor_look[submap],floormemo[i],pcm);
   } 
-
+#endif
   //for(j=0;j<vi->channels;j++)
   //_analysis_output("mdct",seq+j,vb->pcm[j],-24,n/2,0,1);
 
@@ -301,8 +298,11 @@
 
   for(i=0;i<vi->channels;i++){
     ogg_int32_t *pcm=vb->pcm[i];
+    int submap=info->chmuxlist[i];
     
-      if(nonzero[i]) {
+    if(nonzero[i]) {
+	look->floor_func[submap]->
+	  inverse2(vb,look->floor_look[submap],floormemo[i],pcm);
         mdct_backward(n, pcm, pcm);
         /* window the data */
         _vorbis_apply_window(pcm,b->window,ci->blocksizes,vb->lW,vb->W,vb->nW);
diff --git a/apps/codecs/Tremor/mdct.c b/apps/codecs/Tremor/mdct.c
index 27a340b..9bdfdce 100644
--- a/apps/codecs/Tremor/mdct.c
+++ b/apps/codecs/Tremor/mdct.c
@@ -341,10 +341,6 @@
   int shift;
   int step;
 
-#if CONFIG_CPU == MCF5249
-  /* mcf5249_init_mac(); */  /* should be redundant */
-#endif
-
   for (shift=6;!(n&(1<<shift));shift++);
   shift=13-shift;
   step=2<<shift;
diff --git a/apps/codecs/Tremor/synthesis.c b/apps/codecs/Tremor/synthesis.c
index db178e7..d01a7aa 100644
--- a/apps/codecs/Tremor/synthesis.c
+++ b/apps/codecs/Tremor/synthesis.c
@@ -33,7 +33,7 @@
 #define CHANNELS          2          
 
 static ogg_int32_t *ipcm_vect[CHANNELS] IDATA_ATTR;
-static ogg_int32_t ipcm_buff[CHANNELS*IRAM_PCM_END] IDATA_ATTR;
+static ogg_int32_t ipcm_buff[CHANNELS*IRAM_PCM_END] IDATA_ATTR LINE_ATTR;
 
 int vorbis_synthesis(vorbis_block *vb,ogg_packet *op,int decodep){
   vorbis_dsp_state     *vd=vb->vd;
@@ -73,10 +73,10 @@
   vb->sequence=op->packetno-3; /* first block is third packet */
   vb->eofflag=op->e_o_s;
 
-  if(decodep){
+  if(decodep && vi->channels<=CHANNELS){
     /* alloc pcm passback storage */
     vb->pcmend=ci->blocksizes[vb->W];
-    if (vi->channels <= CHANNELS && vb->pcmend<=IRAM_PCM_END) { 
+    if (vb->pcmend<=IRAM_PCM_END) { 
       /* use statically allocated iram buffer */
       vb->pcm = ipcm_vect;
       for(i=0; i<CHANNELS; i++)
diff --git a/apps/codecs/Tremor/window_lookup.h b/apps/codecs/Tremor/window_lookup.h
index 71a413b..64350d8 100644
--- a/apps/codecs/Tremor/window_lookup.h
+++ b/apps/codecs/Tremor/window_lookup.h
@@ -32,7 +32,7 @@
   X(0x7fdd78a5), X(0x7ff6ec6d), X(0x7ffed0e9), X(0x7ffffc3f),
 };
 
-static LOOKUP_T vwin128[64] IDATA_ATTR = {
+static LOOKUP_T vwin128[64] IDATA_ATTR LINE_ATTR = {
   X(0x0007c04d), X(0x0045bb89), X(0x00c18b87), X(0x017ae294),
   X(0x02714a4e), X(0x03a4217a), X(0x05129952), X(0x06bbb24f),
   X(0x089e38a1), X(0x0ab8c073), X(0x0d09a228), X(0x0f8ef6bd),
@@ -51,7 +51,7 @@
   X(0x7ffdcf39), X(0x7fff6dac), X(0x7fffed01), X(0x7fffffc4),
 };
 
-static LOOKUP_T vwin256[128] IDATA_ATTR = {
+static LOOKUP_T vwin256[128] IDATA_ATTR LINE_ATTR = {
   X(0x0001f018), X(0x00117066), X(0x00306e9e), X(0x005ee5f1),
   X(0x009ccf26), X(0x00ea208b), X(0x0146cdea), X(0x01b2c87f),
   X(0x022dfedf), X(0x02b85ced), X(0x0351cbbd), X(0x03fa317f),
@@ -86,7 +86,7 @@
   X(0x7fffdcd2), X(0x7ffff6d6), X(0x7ffffed0), X(0x7ffffffc),
 };
 
-static LOOKUP_T vwin512[256] IDATA_ATTR  = {
+static LOOKUP_T vwin512[256] IDATA_ATTR LINE_ATTR  = {
   X(0x00007c06), X(0x00045c32), X(0x000c1c62), X(0x0017bc4c),
   X(0x00273b7a), X(0x003a9955), X(0x0051d51c), X(0x006cede7),
   X(0x008be2a9), X(0x00aeb22a), X(0x00d55b0d), X(0x00ffdbcc),
@@ -284,7 +284,7 @@
   X(0x7fffffdd), X(0x7ffffff7), X(0x7fffffff), X(0x7fffffff),
 };
 
-static LOOKUP_T vwin2048[1024] IDATA_ATTR = {
+static LOOKUP_T vwin2048[1024] IDATA_ATTR LINE_ATTR = {
   X(0x000007c0), X(0x000045c4), X(0x0000c1ca), X(0x00017bd3),
   X(0x000273de), X(0x0003a9eb), X(0x00051df9), X(0x0006d007),
   X(0x0008c014), X(0x000aee1e), X(0x000d5a25), X(0x00100428),
diff --git a/docs/CREDITS b/docs/CREDITS
index 0167b9d..d531078 100644
--- a/docs/CREDITS
+++ b/docs/CREDITS
@@ -115,3 +115,4 @@
 David Bryant
 Martin Arver
 Alexander Spyridakis
+Pedro Baltazar Vasconcelos