SPC codec: enable echo on ColdFire CPU. Make a couple of small general optimizations. Pre-swap some data when running the DSP on big-endian targets.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@12410 a1c6a512-1295-4272-9138-f99709370657
diff --git a/apps/codecs/spc.c b/apps/codecs/spc.c
index 86b9c0c..87b5972 100644
--- a/apps/codecs/spc.c
+++ b/apps/codecs/spc.c
@@ -51,9 +51,14 @@
     /* Disable gaussian interpolation */
     #define SPC_NOINTERP 1
 
+#ifndef CPU_COLDFIRE
     /* Disable echo processing */
     #define SPC_NOECHO 1
 #else
+    /* Enable echo processing */
+    #define SPC_NOECHO 0
+#endif
+#else
     /* Don't cache BRR waves */
     #define SPC_BRRCACHE 0 
     
@@ -100,6 +105,8 @@
 
 #define GET_LE16( addr )        get_le16( addr )
 #define SET_LE16( addr, data )  set_le16( addr, data )
+#define INT16A( addr ) (*(uint16_t*) (addr))
+#define INT16SA( addr ) (*(int16_t*) (addr))
 
 #ifdef ROCKBOX_LITTLE_ENDIAN
     #define GET_LE16A( addr )       (*(uint16_t*) (addr))
@@ -794,6 +801,10 @@
 {
     memcpy( spc_emu.cycle_table, cycle_table, sizeof cycle_table );
 
+#ifdef CPU_COLDFIRE
+    coldfire_set_macsr(EMAC_SATURATE);
+#endif
+
     do
     {
         DEBUGF("SPC: next_track\n");
diff --git a/apps/codecs/spc/Spc_Dsp.h b/apps/codecs/spc/Spc_Dsp.h
index 0cf55de..4d64b24 100644
--- a/apps/codecs/spc/Spc_Dsp.h
+++ b/apps/codecs/spc/Spc_Dsp.h
@@ -107,6 +107,19 @@
 
 enum { fir_buf_half = 8 };
 
+#ifdef CPU_COLDFIRE
+/* global because of the large alignment requirement for hardware masking -
+ * L-R interleaved 16-bit samples for easy loading and mac.w use.
+ */
+enum
+{
+    fir_buf_size = fir_buf_half * sizeof ( int32_t ),
+    fir_buf_mask = ~fir_buf_size
+};
+int32_t fir_buf[fir_buf_half]
+    __attribute__ ((aligned (fir_buf_size*2))) IBSS_ATTR;
+#endif /* CPU_COLDFIRE */
+
 struct Spc_Dsp
 {
     union
@@ -122,11 +135,21 @@
     int noise_count;
     uint16_t noise; /* also read as int16_t */
     
+#ifdef CPU_COLDFIRE
+    /* circularly hardware masked address */
+    int32_t *fir_ptr;
+    /* wrapped address just behind current position -
+       allows mac.w to increment and mask fir_ptr */
+    int32_t *last_fir_ptr;
+    /* copy of echo FIR constants as int16_t for use with mac.w */
+    int16_t fir_coeff[voice_count];
+#else
     /* fir_buf [i + 8] == fir_buf [i], to avoid wrap checking in FIR code */
     int fir_pos; /* (0 to 7) */
     int fir_buf [fir_buf_half * 2] [2];
     /* copy of echo FIR constants as int, for faster access */
     int fir_coeff [voice_count]; 
+#endif
     
     struct voice_t voice_state [voice_count];
     
@@ -149,7 +172,6 @@
     this->echo_pos    = 0;
     this->noise_count = 0;
     this->noise       = 2;
-    this->fir_pos     = 0;
     
     this->r.g.flags   = 0xE0; /* reset, mute, echo off */
     this->r.g.key_ons = 0;
@@ -169,8 +191,16 @@
         for ( i = 0; i < 256; i++ )
             this->wave_entry [i].start_addr = -1;
     #endif
-    
+
+#ifdef CPU_COLDFIRE
+    this->fir_ptr      = fir_buf;
+    this->last_fir_ptr = &fir_buf [7];
+    memset( fir_buf, 0, sizeof fir_buf );
+#else
+    this->fir_pos = 0;
     memset( this->fir_buf, 0, sizeof this->fir_buf );
+#endif
+
     assert( offsetof (struct globals_t,unused9 [2]) == register_count );
     assert( sizeof (this->r.voice) == register_count );
 }
@@ -394,7 +424,7 @@
         voice->envx         = 0;
         voice->env_mode     = state_attack;
         voice->env_timer    = env_rate_init; /* TODO: inaccurate? */
-        unsigned start_addr = GET_LE16A( sd [raw_voice->waveform].start );
+        unsigned start_addr = GET_LE16A(sd [raw_voice->waveform].start);
         #if !SPC_BRRCACHE
         {
             voice->addr = RAM + start_addr;
@@ -442,7 +472,7 @@
     EXIT_TIMER(cpu);
     ENTER_TIMER(dsp);
 #endif
-    
+
     /* Here we check for keys on/off.  Docs say that successive writes
        to KON/KOF must be separated by at least 2 Ts periods or risk
        being neglected.  Therefore DSP only looks at these during an
@@ -479,16 +509,42 @@
     
     struct src_dir const* const sd =
         (struct src_dir*) &RAM [this->r.g.wave_page * 0x100];
+
+    #ifdef ROCKBOX_BIG_ENDIAN
+        /* Convert endianness before entering the loops - these values
+           are used a lot */
+        const uint32_t rates[voice_count] =
+        {
+            GET_LE16A( this->r.voice[0].rate ) & 0x3FFF,
+            GET_LE16A( this->r.voice[1].rate ) & 0x3FFF,
+            GET_LE16A( this->r.voice[2].rate ) & 0x3FFF,
+            GET_LE16A( this->r.voice[3].rate ) & 0x3FFF,
+            GET_LE16A( this->r.voice[4].rate ) & 0x3FFF,
+            GET_LE16A( this->r.voice[5].rate ) & 0x3FFF,
+            GET_LE16A( this->r.voice[6].rate ) & 0x3FFF,
+            GET_LE16A( this->r.voice[7].rate ) & 0x3FFF,
+        };
+        #define VOICE_RATE(x) *(x)
+        #define IF_RBE(...) __VA_ARGS__
+    #ifdef CPU_COLDFIRE
+        /* Initialize mask register with the buffer address mask */
+        asm ("move.l %[m], %%mask" : : [m]"i"(fir_buf_mask));
+        const int echo_delay_mask = (this->r.g.echo_delay & 15) * 0x800 - 1;
+        const int echo_page       = this->r.g.echo_page * 0x100;
+    #endif /* CPU_COLDFIRE */
+    #else
+        #define VOICE_RATE(x) (INT16A(raw_voice->rate) & 0x3FFF)
+        #define IF_RBE(...)
+    #endif /* ROCKBOX_BIG_ENDIAN */
     
 #if !SPC_NOINTERP
     int const slow_gaussian = (this->r.g.pitch_mods >> 1) |
         this->r.g.noise_enables;
 #endif
     /* (g.flags & 0x40) ? 30 : 14 */
-    int const global_muting = ((this->r.g.flags & 0x40) >> 2) + 14; 
-    
-    int const global_vol_0 = this->r.g.volume_0;
-    int const global_vol_1 = this->r.g.volume_1;
+    int const global_muting = ((this->r.g.flags & 0x40) >> 2) + 14 - 8; 
+    int const global_vol_0  = this->r.g.volume_0;
+    int const global_vol_1  = this->r.g.volume_1;
     
     /* each rate divides exactly into 0x7800 without remainder */
     int const env_rate_init = 0x7800;
@@ -525,7 +581,8 @@
         struct raw_voice_t * raw_voice = this->r.voice;
         struct voice_t* voice = this->voice_state;
         int vbit = 1;
-        for ( ; vbit < 0x100; vbit <<= 1, ++voice, ++raw_voice )
+        IF_RBE( const uint32_t* vr = rates; )
+        for ( ; vbit < 0x100; vbit <<= 1, ++voice, ++raw_voice IF_RBE( , ++vr ) )
         {
             /* pregen involves checking keyon, etc */
 #if 0
@@ -816,7 +873,7 @@
             #endif
 
             /* Get rate (with possible modulation) */
-            int rate = GET_LE16A( raw_voice->rate ) & 0x3FFF;
+            int rate = VOICE_RATE(vr);
             if ( this->r.g.pitch_mods & vbit )
                 rate = (rate * (prev_outx + 32768)) >> 15;
 
@@ -918,19 +975,20 @@
             {
                 uint32_t f = voice->position;
                 int32_t y1;
+
                 asm (
-              "move.l     %[f], %[y0]               \n" /* separate fraction */
-              "and.l      #0xfff, %[f]              \n" /* and whole parts   */
-              "lsr.l      %[sh], %[y0]              \n"
-              "move.l     2(%[s], %[y0].l*2), %[y1] \n" /* load two samples  */
-              "move.l     %[y1], %[y0]              \n" /* separate samples  */
-              "ext.l      %[y1]                     \n" /* y0=s[1], y1=s[2]  */
-              "swap       %[y0]                     \n"
-              "ext.l      %[y0]                     \n"
-              "sub.l      %[y0], %[y1]              \n" /* diff = y1 - y0    */
-              "muls.l     %[f], %[y1]               \n" /* y0 += f*diff      */
-              "asr.l      %[sh], %[y1]              \n"
-              "add.l      %[y1], %[y0]              \n"
+              "move.l     %[f], %[y0]               \r\n" /* separate fraction */
+              "and.l      #0xfff, %[f]              \r\n" /* and whole parts   */
+              "lsr.l      %[sh], %[y0]              \r\n"
+              "move.l     2(%[s], %[y0].l*2), %[y1] \r\n" /* load two samples  */
+              "move.l     %[y1], %[y0]              \r\n" /* separate samples  */
+              "ext.l      %[y1]                     \r\n" /* y0=s[1], y1=s[2]  */
+              "swap       %[y0]                     \r\n"
+              "ext.l      %[y0]                     \r\n"
+              "sub.l      %[y0], %[y1]              \r\n" /* diff = y1 - y0    */
+              "muls.l     %[f], %[y1]               \r\n" /* y0 += f*diff      */
+              "asr.l      %[sh], %[y1]              \r\n"
+              "add.l      %[y1], %[y0]              \r\n"
               : [f]"+&d"(f), [y0]"=&d"(output), [y1]"=&d"(y1)
               : [s]"a"(voice->samples), [sh]"r"(12)
                     );
@@ -1020,6 +1078,100 @@
         /* end of voice loop */
         
     #if !SPC_NOECHO
+    #ifdef CPU_COLDFIRE
+        /* Read feedback from echo buffer */
+        int echo_pos = this->echo_pos;
+        uint8_t* const echo_ptr = RAM + ((echo_page + echo_pos) & 0xFFFF);
+        echo_pos = (echo_pos + 4) & echo_delay_mask;
+        this->echo_pos = echo_pos;
+        int fb = swap_odd_even32(*(int32_t *)echo_ptr);
+        int out_0, out_1;
+
+        /* Keep last 8 samples */
+        *this->last_fir_ptr = fb;
+        this->last_fir_ptr  = this->fir_ptr;
+
+        /* Apply echo FIR filter to output - circular buffer is hardware
+           incremented and masked; FIR coefficients and buffer history are
+           loaded in parallel with multiply accumulate operations. Apply
+           scale factor to do hardware clipping later. */
+        int _0, _1, _2;
+        asm (
+        "move.l                           (%[fir_c])  , %[_2]         \r\n"
+        "mac.w      %[fb]u, %[_2]u, <<,   (%[fir_p])+&, %[_0], %%acc0 \r\n"
+        "mac.w      %[fb]l, %[_2]u, <<,   (%[fir_p])& , %[_1], %%acc1 \r\n"
+        "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
+        "mac.w      %[_0]l, %[_2]l, <<,  4(%[fir_c])  , %[_2], %%acc1 \r\n"
+        "mac.w      %[_1]u, %[_2]u, <<,  4(%[fir_p])& , %[_0], %%acc0 \r\n"
+        "mac.w      %[_1]l, %[_2]u, <<,  8(%[fir_p])& , %[_1], %%acc1 \r\n"
+        "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
+        "mac.w      %[_0]l, %[_2]l, <<,  8(%[fir_c])  , %[_2], %%acc1 \r\n"
+        "mac.w      %[_1]u, %[_2]u, <<, 12(%[fir_p])& , %[_0], %%acc0 \r\n"
+        "mac.w      %[_1]l, %[_2]u, <<, 16(%[fir_p])& , %[_1], %%acc1 \r\n"
+        "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
+        "mac.w      %[_0]l, %[_2]l, <<, 12(%[fir_c])  , %[_2], %%acc1 \r\n"
+        "mac.w      %[_1]u, %[_2]u, <<, 20(%[fir_p])& , %[_0], %%acc0 \r\n"
+        "mac.w      %[_1]l, %[_2]u, <<                       , %%acc1 \r\n"
+        "mac.w      %[_0]u, %[_2]l, <<                       , %%acc0 \r\n"
+        "mac.w      %[_0]l, %[_2]l, <<                       , %%acc1 \r\n"
+        "movclr.l   %%acc0, %[out_0]                                  \r\n"
+        "movclr.l   %%acc1, %[out_1]                                  \r\n"
+        : [_0]"=&r"(_0), [_1]"=&r"(_1), [_2]"=&r"(_2),
+          [fir_p]"+a"(this->fir_ptr),
+          [out_0]"=r"(out_0), [out_1]"=r"(out_1)
+        : [fir_c]"a"(this->fir_coeff), [fb]"r"(fb)
+        );
+
+        /* Generate output */
+        asm (
+        "mac.l      %[chans_0], %[gv_0]    , %%acc2 \r\n"
+        "mac.l      %[chans_1], %[gv_1]    , %%acc3 \r\n"
+        "mac.l      %[ev_0],   %[out_0], >>, %%acc2 \r\n"
+        "mac.l      %[ev_1],   %[out_1], >>, %%acc3 \r\n"
+        :
+        : [chans_0]"r"(chans_0), [gv_0]"r"(global_vol_0),
+          [ev_0]"r"((int)this->r.g.echo_volume_0),
+          [chans_1]"r"(chans_1), [gv_1]"r"(global_vol_1),
+          [ev_1]"r"((int)this->r.g.echo_volume_1),
+          [out_0]"r"(out_0), [out_1]"r"(out_1)
+        );
+
+        /* Feedback into echo buffer */
+        if ( !(this->r.g.flags & 0x20) )
+        {
+            asm (
+            "lsl.l      %[sh], %[e0]                \r\n"
+            "move.l     %[e0], %%acc0               \r\n"
+            "mac.l      %[out_0], %[ef], <<, %%acc0 \r\n"
+            "lsl.l      %[sh], %[e1]                \r\n"
+            "move.l     %[e1], %%acc1               \r\n"
+            "mac.l      %[out_1], %[ef], <<, %%acc1 \r\n"
+            "movclr.l   %%acc0, %[e0]               \r\n"
+            "movclr.l   %%acc1, %[e1]               \r\n"
+            "swap       %[e1]                       \r\n"
+            "move.w     %[e1], %[e0]                \r\n"
+            : [e0]"+&d"(echo_0), [e1]"+&d"(echo_1)
+            : [out_0]"r"(out_0), [out_1]"r"(out_1),
+              [ef]"r"((int)this->r.g.echo_feedback),
+              [sh]"d"(9)
+            );
+            *(int32_t *)echo_ptr = swap_odd_even32(echo_0);
+        }
+
+        /* Output final samples */
+        asm (
+        "movclr.l   %%acc2, %[out_0] \r\n"
+        "movclr.l   %%acc3, %[out_1] \r\n"
+        "asr.l      %[gm],  %[out_0] \r\n"
+        "asr.l      %[gm],  %[out_1] \r\n"
+        : [out_0]"=&d"(out_0), [out_1]"=&d"(out_1)
+        : [gm]"d"(global_muting)
+        );
+
+        out_buf [             0] = out_0;
+        out_buf [WAV_CHUNK_SIZE] = out_1;
+        out_buf ++;
+    #else /* !CPU_COLDFIRE */
         /* Read feedback from echo buffer */
         int echo_pos = this->echo_pos;
         uint8_t* const echo_ptr = RAM +
@@ -1061,10 +1213,8 @@
                     >> global_muting;
         int amp_1 = (chans_1 * global_vol_1 + fb_1 * this->r.g.echo_volume_1)
                     >> global_muting;
-        CLAMP16( amp_0, amp_0 );
-        out_buf [0] = amp_0 * (1 << 8);
-        CLAMP16( amp_1, amp_1 );
-        out_buf [WAV_CHUNK_SIZE] = amp_1 * (1 << 8);
+        out_buf [             0] = amp_0;
+        out_buf [WAV_CHUNK_SIZE] = amp_1;
         out_buf ++;
         
         /* Feedback into echo buffer */
@@ -1077,14 +1227,13 @@
             CLAMP16( e1, e1 );
             SET_LE16A( echo_ptr + 2, e1 );
         }
+    #endif /* CPU_COLDFIRE */
     #else
-        /* Generate output */
+        /* Generate output  */
         int amp_0 = (chans_0 * global_vol_0) >> global_muting;
         int amp_1 = (chans_1 * global_vol_1) >> global_muting;
-        CLAMP16( amp_0, amp_0 );
-        out_buf [0] = amp_0 * (1 << 8);
-        CLAMP16( amp_1, amp_1 );
-        out_buf [WAV_CHUNK_SIZE] = amp_1 * (1 << 8);
+        out_buf [             0] = amp_0;
+        out_buf [WAV_CHUNK_SIZE] = amp_1;
         out_buf ++;
     #endif
     }