SPC codec on Coldfire: Move movclrs into the light and out of the long dark shadow cast by emac latency as much as possible. Put in a faster interpolation routine (emac saves the day... again). Add comments about what's going on.
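In plain C, the new routine is a 12-bit fixed-point linear interpolation between two adjacent decoded samples. A minimal sketch of what the EMAC sequence below computes (the helper name and standalone form are illustrative, not the codec's actual code):

    #include <stdint.h>

    /* position: whole-sample index in the upper bits, 12-bit fraction
       in the lower bits, matching the layout the asm separates. */
    static inline int32_t spc_lerp(const int16_t *samples, uint32_t position)
    {
        int32_t f  = position & 0xfff;   /* fractional part */
        uint32_t i = position >> 12;     /* whole part      */
        int32_t y0 = samples[i + 1];     /* upper half of the 32-bit load */
        int32_t y1 = samples[i + 2];     /* lower half of the 32-bit load */
        /* output = y0 + f*y1 - f*y0, i.e. y0 + f*(y1 - y0) */
        return y0 + ((f * y1 - f * y0) >> 12);
    }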
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@12558 a1c6a512-1295-4272-9138-f99709370657
diff --git a/apps/codecs/spc/Spc_Dsp.h b/apps/codecs/spc/Spc_Dsp.h
index 6b530a7..fdcd37f 100644
--- a/apps/codecs/spc/Spc_Dsp.h
+++ b/apps/codecs/spc/Spc_Dsp.h
@@ -974,23 +974,35 @@
if ( (this->r.g.noise_enables & vbit) == 0 )
{
uint32_t f = voice->position;
- int32_t y1;
+ int32_t y0;
+ /**
+ * Formula (fastest found so far of MANY):
+ * output = y0 + f*y1 - f*y0
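+ * (i.e. y0 + f*(y1 - y0), where f is the 12-bit fraction;
+ * splitting the multiply lets both products use the emac)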
+ */
asm volatile (
- "move.l %[f], %[y0] \r\n" /* separate fraction */
- "and.l #0xfff, %[f] \r\n" /* and whole parts */
- "lsr.l %[sh], %[y0] \r\n"
- "move.l 2(%[s], %[y0].l*2), %[y1] \r\n" /* load two samples */
- "move.l %[y1], %[y0] \r\n" /* separate samples */
- "ext.l %[y1] \r\n" /* y0=s[1], y1=s[2] */
- "swap %[y0] \r\n"
- "ext.l %[y0] \r\n"
- "sub.l %[y0], %[y1] \r\n" /* diff = y1 - y0 */
- "muls.l %[f], %[y1] \r\n" /* y0 += f*diff */
- "asr.l %[sh], %[y1] \r\n"
- "add.l %[y1], %[y0] \r\n"
- : [f]"+&d"(f), [y0]"=&d"(output), [y1]"=&d"(y1)
- : [s]"a"(voice->samples), [sh]"r"(12)
+ /* separate fractional and whole parts */
+ "move.l %[f], %[y1] \r\n"
+ "and.l #0xfff, %[f] \r\n"
+ "lsr.l %[sh], %[y1] \r\n"
+ /* load samples y0 (upper) & y1 (lower) */
+ "move.l 2(%[s], %[y1].l*2), %[y1] \r\n"
+ /* %acc0 = f*y1 */
+ "mac.w %[f]l, %[y1]l, %%acc0 \r\n"
+ /* msac.w is ~2% boostier (costs more CPU boost), so negate f and mac.w instead */
+ "neg.l %[f] \r\n"
+ /* %acc0 -= f*y0 */
+ "mac.w %[f]l, %[y1]u, %%acc0 \r\n"
+ /* separate out y0 and sign extend */
+ "swap %[y1] \r\n"
+ "movea.w %[y1], %[y0] \r\n"
+ /* fetch result, scale down and add y0 */
+ "movclr.l %%acc0, %[y1] \r\n"
+ /* output = y0 + (result >> 12) */
+ "asr.l %[sh], %[y1] \r\n"
+ "add.l %[y0], %[y1] \r\n"
+ : [f]"+&d"(f), [y0]"=&a"(y0), [y1]"=&d"(output)
+ : [s]"a"(voice->samples), [sh]"d"(12)
);
}
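For reference, the feedback path in the hunk below folds Rockbox's swap_odd_even32 directly into the asm. A C sketch of the operation being inlined (mask-based form matching the and/eor/shift sequence; the real macro lives elsewhere in the codec):

    #include <stdint.h>

    /* Swap the two bytes within each 16-bit half of a 32-bit word,
       i.e. endian-swap a pair of packed 16-bit samples at once. */
    static inline uint32_t swap_odd_even32(uint32_t x)
    {
        uint32_t t = x & 0x00ff00ff;    /* low byte of each half    */
        x ^= t;                         /* high byte of each half   */
        return (t << 8) | (x >> 8);     /* recombine, bytes swapped */
    }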
@@ -1093,9 +1105,13 @@
*this->last_fir_ptr = fb;
this->last_fir_ptr = this->fir_ptr;
- /* Apply echo FIR filter to output - circular buffer is hardware
- incremented and masked; FIR coefficients and buffer history are
- loaded in parallel with multiply accumulate operations. */
+ /* Apply the echo FIR filter to samples read from the echo buffer -
+ the circular buffer is hardware incremented and masked; FIR
+ coefficients and buffer history are loaded in parallel with the
+ multiply-accumulate operations. Samples are shifted left by one
+ here and once again when calculating feedback, leaving values
+ justified to bit 31 to ease the endian swap, interleaving and
+ clamping before the result is placed in the program's echo buffer. */
int _0, _1, _2;
asm volatile (
"move.l (%[fir_c]) , %[_2] \r\n"
@@ -1115,53 +1131,68 @@
"mac.w %[_1]l, %[_2]u, << , %%acc1 \r\n"
"mac.w %[_0]u, %[_2]l, << , %%acc0 \r\n"
"mac.w %[_0]l, %[_2]l, << , %%acc1 \r\n"
- "movclr.l %%acc0, %[out_0] \r\n"
- "movclr.l %%acc1, %[out_1] \r\n"
: [_0]"=&r"(_0), [_1]"=&r"(_1), [_2]"=&r"(_2),
- [fir_p]"+a"(this->fir_ptr),
- [out_0]"=r"(out_0), [out_1]"=r"(out_1)
+ [fir_p]"+a"(this->fir_ptr)
: [fir_c]"a"(this->fir_coeff), [fb]"r"(fb)
);
/* Generate output */
asm volatile (
+ /* fetch filter results to eliminate stalls */
+ "movclr.l %%acc0, %[out_0] \r\n"
+ "movclr.l %%acc1, %[out_1] \r\n"
+ /* apply global volume */
"mac.l %[chans_0], %[gv_0] , %%acc2 \r\n"
"mac.l %[chans_1], %[gv_1] , %%acc3 \r\n"
+ /* apply echo volume and add to final output */
"mac.l %[ev_0], %[out_0], >>, %%acc2 \r\n"
"mac.l %[ev_1], %[out_1], >>, %%acc3 \r\n"
- :
+ : [out_0]"=&r"(out_0), [out_1]"=&r"(out_1)
: [chans_0]"r"(chans_0), [gv_0]"r"(global_vol_0),
[ev_0]"r"((int)this->r.g.echo_volume_0),
[chans_1]"r"(chans_1), [gv_1]"r"(global_vol_1),
- [ev_1]"r"((int)this->r.g.echo_volume_1),
- [out_0]"r"(out_0), [out_1]"r"(out_1)
+ [ev_1]"r"((int)this->r.g.echo_volume_1)
);
/* Feedback into echo buffer */
if ( !(this->r.g.flags & 0x20) )
{
asm volatile (
- "mac.l %[sh], %[e0] , %%acc0 \r\n"
- "mac.l %[out_0], %[ef], <<, %%acc0 \r\n"
+ /* scale echo voices; saturate if overflow */
"mac.l %[sh], %[e1] , %%acc1 \r\n"
+ "mac.l %[sh], %[e0] , %%acc0 \r\n"
+ /* add scaled output from FIR filter */
"mac.l %[out_1], %[ef], <<, %%acc1 \r\n"
- "movclr.l %%acc0, %[e0] \r\n"
+ "mac.l %[out_0], %[ef], <<, %%acc0 \r\n"
+ /* swap and fetch feedback results - this is just
+ swap_odd_even32 interleaved between the macs and
+ movclrs to mitigate stall issues */
+ "move.l #0x00ff00ff, %[sh] \r\n"
"movclr.l %%acc1, %[e1] \r\n"
"swap %[e1] \r\n"
+ "movclr.l %%acc0, %[e0] \r\n"
"move.w %[e1], %[e0] \r\n"
+ "and.l %[e0], %[sh] \r\n"
+ "eor.l %[sh], %[e0] \r\n"
+ "lsl.l #8, %[sh] \r\n"
+ "lsr.l #8, %[e0] \r\n"
+ "or.l %[sh], %[e0] \r\n"
+ /* save final feedback into echo buffer */
+ "move.l %[e0], (%[echo_ptr]) \r\n"
: [e0]"+&d"(echo_0), [e1]"+&d"(echo_1)
: [out_0]"r"(out_0), [out_1]"r"(out_1),
[ef]"r"((int)this->r.g.echo_feedback),
- [sh]"r"(1 << 9)
+ [echo_ptr]"a"((int32_t *)echo_ptr),
+ [sh]"d"(1 << 9)
);
-
- *(int32_t *)echo_ptr = swap_odd_even32(echo_0);
}
/* Output final samples */
asm volatile (
+ /* fetch output saved in %acc2 and %acc3 */
"movclr.l %%acc2, %[out_0] \r\n"
"movclr.l %%acc3, %[out_1] \r\n"
+ /* scale down by the global_muting shift */
"asr.l %[gm], %[out_0] \r\n"
"asr.l %[gm], %[out_1] \r\n"
: [out_0]"=&d"(out_0), [out_1]"=&d"(out_1)