| /*************************************************************************** |
| * __________ __ ___. |
| * Open \______ \ ____ ____ | | _\_ |__ _______ ___ |
| * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / |
| * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < |
| * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ |
| * \/ \/ \/ \/ \/ |
| * $Id$ |
| * |
| * Copyright (C) 2006 Thom Johansen |
| * Copyright (C) 2007, 2012 Michael Sevakis |
| * Copyright (C) 2010 Bertrik Sikken |
| * |
| * This program is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU General Public License |
| * as published by the Free Software Foundation; either version 2 |
| * of the License, or (at your option) any later version. |
| * |
| * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY |
| * KIND, either express or implied. |
| * |
| ****************************************************************************/ |
| #include "rbcodecconfig.h" |
| |
| /**************************************************************************** |
| * void pga_process(struct dsp_proc_entry *this, struct dsp_buffer **buf_p) |
| * |
| * Multiplies every sample of every channel of *buf_p in place by the gain |
| * word found at the start of this->data, using the EMAC. |
| * |
| * Channels are walked from the last one down to the first. The sample loop |
| * is software-pipelined: each mac.l also fetches the next input, so the |
| * read pointer (%a2) runs one sample ahead of the write pointer (%a3) and |
| * one longword beyond the final sample is loaded but never used. |
| * |
| * NOTE(review): one result per channel is always stored, even if |
| * buf->remcount were 0 - presumably callers guarantee remcount >= 1. |
| */ |
| .section .text |
| .align 2 |
| .global pga_process |
| pga_process: |
| | input: 4(sp) = this, 8(sp) = buf_p |
| movem.l 4(%sp), %a0-%a1 | %a0 = this, %a1 = buf_p |
| move.l (%a0), %a0 | %a0 = this->data = &pga_data |
| move.l (%a0), %a0 | %a0 = data->gain |
| move.l (%a1), %a1 | %a1 = buf = *buf_p |
| lea.l -20(%sp), %sp | save registers |
| movem.l %d2-%d4/%a2-%a3, (%sp) | |
| clr.l %d1 | %d1 = buf->format.num_channels |
| move.b 17(%a1), %d1 | |
| 10: | channel loop | |
| move.l (%a1), %d0 | %d0 = buf->remcount |
| move.l (%a1, %d1.l*4), %a2 | %a2 = s = buf->p32[ch-1] |
| move.l %a2, %a3 | %a3 = d = s |
| move.l (%a2)+, %d2 | %d2 = *s++, |
| mac.l %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1) |
| subq.l #1, %d0 | --count > 0 ? : effectively n++ |
| ble.b 30f | loop done | no? finish up |
| 20: | loop | |
| move.l %accext01, %d4 | fetch S(n-1)[7:0] |
| movclr.l %acc0, %d3 | fetch S(n-1)[40:8] in %d3[31:0] |
| asl.l #8, %d3 | *d++ = (S(n-1)[40:8] << 8) | S(n-1)[7:0] |
| mac.l %a0, %d2, (%a2)+, %d2, %acc0 | %acc0 = S(n)*gain, load S(n+1) |
| move.b %d4, %d3 | |
| move.l %d3, (%a3)+ | |
| subq.l #1, %d0 | --count > 0 ? : effectively n++ |
| bgt.b 20b | loop | yes? do more samples |
| 30: | loop done | |
| | drain the pipeline: store the result of the last sample |
| move.l %accext01, %d4 | fetch S(n-1)[7:0] |
| movclr.l %acc0, %d3 | fetch S(n-1)[40:8] in %d3[31:0] |
| asl.l #8, %d3 | *d = (S(n-1)[40:8] << 8) | S(n-1)[7:0] |
| move.b %d4, %d3 | |
| move.l %d3, (%a3) | |
| subq.l #1, %d1 | next channel |
| bgt.b 10b | channel loop | |
| movem.l (%sp), %d2-%d4/%a2-%a3 | restore registers |
| lea.l 20(%sp), %sp | cleanup stack |
| rts | |
| .size pga_process, .-pga_process |
| |
| /**************************************************************************** |
| * void crossfeed_process(struct dsp_proc_entry *this, |
| * struct dsp_buffer **buf_p) |
| * |
| * Stereo crossfeed. Each output channel is its own input scaled by 'gain' |
| * plus a delayed, first-order filtered copy of the opposite channel: |
| * y[n] = b0*d[n] + b1*d[n - 1] + a1*y[n - 1] |
| * where d[] is the delay line output of the opposite channel. |
| * |
| * State as read from this->data (byte offsets): |
| * 0x00 gain, 0x04 b0, 0x08 b1, 0x0c a1, 0x10..0x1c history[0..3], |
| * 0x20 delay line index, 0x24 index_max, 0x28 start of the delay line. |
| * The delay line holds interleaved left/right input samples. History and |
| * index are written back to the state on exit. |
| * |
| * The loop is pipelined: the outputs for sample n are stored at the top of |
| * the iteration for sample n + 1, which is why the loop is entered at |
| * .cfp_loop_start and the last pair is stored after the loop. |
| */ |
| .section .text |
| .align 2 |
| .global crossfeed_process |
| crossfeed_process: |
| | input: 4(sp) = this, 8(sp) = buf_p |
| lea.l -44(%sp), %sp | |
| movem.l %d2-%d7/%a2-%a6, (%sp) | save all regs |
| movem.l 48(%sp), %a1/%a4 | %a1 = this, %a4 = buf_p |
| move.l (%a4), %a4 | %a4 = buf = *buf_p |
| movem.l (%a4), %d0/%a4-%a5 | %d0 = buf->remcount, %a4 = buf->p32[0], |
| | %a5 = buf->p32[1] |
| move.l (%a1), %a6 | %a6 = state = &crossfeed_state |
| movem.l (%a6), %d1-%d6/%a0-%a3 | %d1 = gain, %d2-%d4 = coefs, |
| | %d5..%d6 = history[0..1], |
| | %a0..%a1 = history[2..3], |
| | %a2 = index, %a3 = index_max |
| lea.l 0x28(%a6), %a6 | %a6 = state->delay |
| move.l %a6, -(%sp) | push state->delay |
| bra.b .cfp_loop_start |
| /* Register usage in loop: |
| * %d0 = count, %d1 = direct gain, %d2..%d4 = b0, b1, a1 (filter coefs), |
| * %d5..%d6 = history[0..1], %d7 = scratch |
| * %a0..%a1 = history[2..3], %a2 = index, %a3 = index_max, |
| * %a4 = buf[0], %a5 = buf[1], %a6 = scratch |
| */ |
| .cfp_loop: |
| movclr.l %acc0, %d7 | write outputs |
| move.l %d7, (%a4)+ | . |
| movclr.l %acc1, %a6 | . |
| move.l %a6, (%a5)+ | . |
| .cfp_loop_start: |
| mac.l %d3, %d5, (%a2)+, %d5, %acc1 | %acc1 = b1*dl[n - 1], %d5 = dl[n] |
| mac.l %d2, %d5 , %acc1 | %acc1 += b0*dl[n] |
| mac.l %d4, %d6, (%a4), %d7, %acc1 | %acc1 += a1*y_l[n - 1], %d7 = x_l[n] |
| mac.l %d3, %a0, (%a2)+, %a0, %acc0 | %acc0 = b1*dr[n - 1], %a0 = dr[n] |
| mac.l %d2, %a0 , %acc0 | %acc0 += b0*dr[n] (b0 is %d2 for both channels) |
| mac.l %d4, %a1, (%a5), %a6, %acc0 | %acc0 += a1*y_r[n - 1], %a6 = x_r[n] |
| movem.l %d7/%a6, -8(%a2) | save x_l[n] and x_r[n] to delay line |
| move.l %acc1, %d6 | get filtered delayed left sample (y_l[n]) |
| move.l %acc0, %a1 | get filtered delayed right sample (y_r[n]) |
| mac.l %d1, %d7, %acc0 | %acc0 = gain*x_l[n] + y_r[n] |
| mac.l %d1, %a6, %acc1 | %acc1 = gain*x_r[n] + y_l[n] |
| |
| | tpf.w consumes the 2-byte move below as its extension word, so the |
| | index is reloaded only when the branch to 1: is taken |
| cmp.l %a3, %a2 | wrap index if past end |
| bhs.b 1f | |
| tpf.w | trap the buffer wrap |
| 1: | ...fwd taken branches more costly |
| move.l (%sp), %a2 | 2b | wrap it up |
| |
| subq.l #1, %d0 | --count > 0 ? |
| bgt.b .cfp_loop | yes? do more |
| |
| movclr.l %acc0, %d7 | write last outputs |
| move.l %d7, (%a4) | . |
| movclr.l %acc1, %a6 | . |
| move.l %a6, (%a5) | . |
| |
| move.l (%sp)+, %a6 | pop state->delay |
| movem.l %d5-%d6/%a0-%a2, -0x18(%a6) | save history, index |
| movem.l (%sp), %d2-%d7/%a2-%a6 | restore all regs |
| lea.l 44(%sp), %sp | |
| rts | |
| .size crossfeed_process, .-crossfeed_process |
| |
| /**************************************************************************** |
| * void crossfeed_meier_process(struct dsp_proc_entry *this, |
| * struct dsp_buffer **buf_p) |
| * |
| * Meier-style crossfeed, processed in place on both channels of *buf_p. |
| * Per sample: |
| * lout = l + vcl, rout = r + vcr |
| * common = coef2*vdiff |
| * vcl -= coef1*vcl + common |
| * vcr -= coef1*vcr - common |
| * vdiff = lout - rout |
| * vcl, vcr, vdiff, coef1, coef2 are five consecutive longwords starting at |
| * offset 4 of this->data; the first three are written back on exit. |
| * |
| * NOTE(review): unlike the sibling routines, no .align directive precedes |
| * this entry point. |
| */ |
| .section .text |
| .global crossfeed_meier_process |
| crossfeed_meier_process: |
| | input: 4(sp) = this, 8(sp) = buf_p |
| movem.l 4(%sp), %a0-%a1 | %a0 = this, %a1 = buf_p |
| lea.l -24(%sp), %sp | save non-volatiles |
| movem.l %d2-%d6/%a2, (%sp) | . |
| move.l (%a0), %a0 | %a0 = this->data = &crossfeed_state |
| move.l (%a1), %a1 | %a1 = buf = *buf_p |
| movem.l 4(%a0), %d1-%d5 | %d1 = vcl, %d2 = vcr, %d3 = vdiff, |
| | %d4 = coef1, %d5 = coef2 |
| movem.l (%a1), %d0/%a1-%a2 | %d0 = count = buf->remcount |
| | %a1 = p32[0], %a2 = p32[1] |
| | Register usage in loop: |
| | %d0 = count, %d1 = vcl, %d2 = vcr, %d3 = vdiff/lout, |
| | %d4 = coef1, %d5 = coef2, %d6 = rout/scratch |
| | %a1 = p32[0], %a2 = p32[1] |
| .cfmp_loop: |
| mac.l %d5, %d3, %acc0 | %acc0 = common = coef2*vdiff |
| move.l %acc0, %acc1 | copy common |
| mac.l %d4, %d1, (%a1), %d3, %acc0 | %acc0 += coef1*vcl, %d3 = lout |
| msac.l %d4, %d2, (%a2), %d6, %acc1 | %acc1 -= coef1*vcr, %d6 = rout |
| add.l %d1, %d3 | lout += vcl |
| add.l %d2, %d6 | rout += vcr |
| move.l %d3, (%a1)+ | store left channel, pos inc |
| move.l %d6, (%a2)+ | store right channel, pos inc |
| sub.l %d6, %d3 | vdiff = lout - rout |
| movclr.l %acc0, %d6 | %d6 = fetch res1 in s0.31 |
| sub.l %d6, %d1 | vcl -= res1 |
| movclr.l %acc1, %d6 | %d6 = fetch -res2 in s0.31 |
| add.l %d6, %d2 | vcr += -res2 |
| subq.l #1, %d0 | count-- |
| bgt .cfmp_loop | more samples? |
| | |
| movem.l %d1-%d3, 4(%a0) | save vcl, vcr, vdiff |
| movem.l (%sp), %d2-%d6/%a2 | restore non-volatiles |
| lea.l 24(%sp), %sp | . |
| rts | |
| .size crossfeed_meier_process, .-crossfeed_meier_process |
| |
| /**************************************************************************** |
| * int resample_hermite(struct resample_data *data, struct dsp_buffer *src, |
| * struct dsp_buffer *dst) |
| * |
| * Resamples each channel of src into dst with cubic Hermite interpolation, |
| * evaluated in Horner form as ((c3*frac + c2)*frac + c1)*frac + x2. |
| * Positions are 16.16 fixed point: data->delta is the step per output |
| * sample and data->phase the running position. Separate loops handle |
| * delta < 1.0 (upsampling) and delta >= 1.0 (downsampling). |
| * |
| * At most 0x8000 source samples are considered per call. The last three |
| * input samples of each channel are kept in the history that starts at |
| * data + 8 (12 bytes per channel) so interpolation can span calls. |
| * |
| * On exit dst->remcount holds the number of samples written, data->phase |
| * is rebased by the consumed position, and that position is left in %d0 |
| * as the return value. |
| * |
| * Stack temporaries: 0(sp) = ch, 4(sp) = h (current history pointer). |
| * The two computed jumps below depend on the exact byte size of every |
| * instruction in their tables (see the "Nb" size annotations). |
| */ |
| .section .text |
| .align 2 |
| .global resample_hermite |
| resample_hermite: |
| | input: 4(sp) = data, 8(sp) = src, 12(sp) = dst |
| lea.l -52(%sp), %sp | save non-volatiles, allocate temps |
| movem.l %d2-%d7/%a2-%a6, 8(%sp) | |
| movem.l 56(%sp), %a0-%a2 | %a0 = data |
| | %a1 = src |
| | %a2 = dst |
| clr.l %d5 | %d5 = ch = src->format.num_channels |
| move.b 17(%a1), %d5 | |
| lea.l 8(%a0), %a5 | %a5 = h = history[ch] |
| moveq.l #16, %d7 | %d7 = shift val |
| .hrs_channel_loop: | |
| movem.l %d5/%a5, (%sp) | store ch, h |
| movem.l (%a0), %d1-%d2 | %d1 = delta = data->delta, |
| | %d2 = phase = data->phase |
| move.l (%a1), %d3 | %d3 = srcrem = src->remcount |
| move.l 12(%a2), %d4 | %d4 = dstrem = dst->bufcount |
| |
| cmp.l #0x8000, %d3 | %d3 = MIN(srcrem, 0x8000) |
| ble.b 1f | |
| move.l #0x8000, %d3 | |
| 1: | |
| |
| move.l (%a1, %d5.l*4), %a1 | %a1 = s = src->p32[ch-1] |
| move.l (%a2, %d5.l*4), %a2 | %a2 = d = dst->p32[ch-1] |
| |
| move.l %d2, %d0 | %d0 = pos = phase >> 16 |
| lsr.l %d7, %d0 | |
| |
| cmp.l %d3, %d0 | pos = MIN(pos, srcrem) |
| ble.b 1f | |
| move.l %d3, %d0 | |
| 1: |
| |
| lea.l (%a1, %d0.l*4), %a1 | %a1 = &s[pos] |
| |
| | load x3..x1 from history and/or source; for pos < 3 jump into a table |
| | of three 12-byte entries indexed by pos |
| cmp.l #3, %d0 | |
| bge.b 1f | |
| move.l %d0, %a0 | |
| lea.l (%a0, %a0.l*2), %a0 | |
| jmp 2(%pc, %a0.l*4) | 4b | |
| | 0 |
| movem.l (%a5), %a3-%a5 | 4b | x3..x1 = h[0]..h[2] |
| bra.b 2f | 2b | |
| .dcb.w 3,0 | 6b | filler |
| | 1 |
| movem.l 4(%a5), %a3-%a4 | 6b | x3..x2 = h[1]..h[2] |
| move.l -4(%a1), %a5 | 4b | x1 = s[0] |
| bra.b 2f | 2b | |
| | 2 |
| move.l 8(%a5), %a3 | 4b | x3 = h[2] |
| movem.l -8(%a1), %a4-%a5 | 6b | x2..x1 = s[0]..s[1] |
| bra.b 2f | 2b | |
| 1: | 3 + |
| movem.l -12(%a1), %a3-%a5 | x3...x1 = s[pos-3]..s[pos-1] |
| 2: |
| |
| cmp.l %d3, %d0 | pos past end? |
| bge.w .hrs_channel_done | |
| |
| cmp.l #0x10000, %d1 | delta >= 1.0? |
| bhs.w .hrs_dsstart | yes? downsampling |
| | |
| /** Upsampling **/ | |
| sub.l %d3, %d0 | %d0 = pos - srcrem = -dte |
| lsl.l %d7, %d1 | move delta to bits 30..15 |
| lsr.l #1, %d1 | |
| lsl.l %d7, %d2 | move phase to bits 30..15 |
| lsr.l #1, %d2 | |
| | |
| | Register usage in loop: |
| | d0 = -dte, d1 = delta, d2 = phase, d3 = srcrem, d4 = dstrem |
| | d5 = scratch, d6 = c3, d7 = scratch |
| | a0 = c2, a1 = &s[pos], a2 = d, |
| | a3 = x3, a4 = x2, a5 = x1, a6 = x0 |
| | |
| | Try to avoid overflow as much as possible and at the same time preserve |
| | accuracy. Same formulas apply to downsampling but registers and |
| | instruction order differ due to specific constraints. |
| | c1 = -0.5*x3 + 0.5*x1 |
| | = 0.5*(x1 - x3) <-- |
| | |
| | v = x1 - x2, -v = x2 - x1 |
| | c2 = x3 - 2.5*x2 + 2*x1 - 0.5*x0 |
| | = x3 + 2*(x1 - x2) - 0.5*(x0 + x2) |
| | = x3 + 2*v - 0.5*(x0 + x2) <-- |
| | |
| | c3 = -0.5*x3 + 1.5*x2 - 1.5*x1 + 0.5*x0 |
| | = 0.5*x0 - 0.5*x3 + 0.5*(x2 - x1) + (x2 - x1) |
| | = 0.5*(x0 - x3 - v) - v <-- |
| | |
| .hrs_usloop_carry: |
| move.l (%a1)+, %a6 | %a6 = s[pos] |
| |
| move.l %a5, %d5 | v |
| sub.l %a4, %d5 | |
| |
| move.l %a6, %d6 | c3 |
| sub.l %a3, %d6 | |
| sub.l %d5, %d6 | |
| asr.l #1, %d6 | |
| sub.l %d5, %d6 | |
| |
| lea.l (%a3, %d5.l*2), %a0 | c2 |
| move.l %a6, %d5 | |
| add.l %a4, %d5 | |
| asr.l #1, %d5 | |
| sub.l %d5, %a0 | |
| |
| .hrs_usloop_frac: |
| move.l %a0, %acc0 | %acc0 = frac * c3 + c2 |
| mac.l %d2, %d6, %acc0 | |
| |
| move.l %a5, %d5 | c1 |
| sub.l %a3, %d5 | |
| asr.l #1, %d5 | |
| |
| movclr.l %acc0, %d7 | %acc1 = frac * acc + c1 |
| move.l %d5, %acc1 | |
| mac.l %d2, %d7, %acc1 | |
| |
| move.l %a4, %acc0 | %acc0 = frac * acc + x2 |
| movclr.l %acc1, %d5 | |
| mac.l %d2, %d5, %acc0 | |
| |
| subq.l #1, %d4 | dstrem <= 0? |
| ble.b .hrs_usfull | yes? stop |
| |
| movclr.l %acc0, %d5 | *d++ = d5 = result |
| move.l %d5, (%a2)+ | |
| |
| | phase sits in bits 30..15, so a carry into the next source sample |
| | shows up as the sign bit |
| add.l %d1, %d2 | phase += delta |
| bpl.b .hrs_usloop_frac | load next values? |
| |
| move.l %a4, %a3 | x3 = x2 |
| move.l %a5, %a4 | x2 = x1 |
| move.l %a6, %a5 | x1 = x0 |
| |
| bclr.l #31, %d2 | clear sign bit |
| addq.l #1, %d0 | dte > 0? |
| bmi.b .hrs_usloop_carry | yes? continue resampling |
| bra.b .hrs_usdone |
| |
| .hrs_usfull: |
| movclr.l %acc0, %d5 | *d = d5 = result (destination now full) |
| move.l %d5, (%a2) | |
| |
| add.l %d1, %d2 | do missed phase increment |
| bpl.b .hrs_usdone | was sign bit set? |
| |
| move.l %a4, %a3 | do missed history update |
| move.l %a5, %a4 | |
| move.l %a6, %a5 | |
| |
| addq.l #1, %d0 | do missed dte decrement |
| |
| .hrs_usdone: |
| moveq.l #16, %d7 | restore shift |
| lsl.l #1, %d2 | frac -> phase |
| add.l %d3, %d0 | %d0 = -dte + srcrem = pos |
| or.l %d0, %d2 | restore phase |
| swap.w %d2 | |
| |
| bra.w .hrs_channel_done | |
| |
| /** Downsampling **/ |
| | |
| | Register usage in loop: |
| | d0 = pos, d1 = delta, d2 = phase, d3 = srcrem, d4 = dstrem |
| | d5 = scratch, d6 = scratch, d7 = 16 (shift value) |
| | a0 = scratch, a1 = &s[pos], a2 = d, |
| | a3 = x3, a4 = x2, a5 = x1, a6 = x0 |
| | |
| .hrs_dsloop: |
| movclr.l %acc0, %d5 | *d++ = acc |
| move.l %d5, (%a2)+ | |
| |
| | advance the source pointer by the number of samples stepped and refill |
| | x3..x1; steps of 1..3 jump into a table of 10-byte entries below |
| sub.l %d0, %a0 | %a0 = -shift = last_pos - pos |
| move.l %a0, %d5 | |
| asl.l #2, %d5 | -shift -> -bytes |
| sub.l %d5, %a1 | %a1 = s = s - -bytes |
| cmp.l #-4, %a0 | >= 4? |
| ble.b 1f | |
| add.l %d5, %a0 | %a0 = 5 * -shift |
| jmp 40(%pc, %a0.l*2) | 4b | |
| 1: | +4 + |
| movem.l -12(%a1), %a3-%a5 | 6b | x3..x1 = s[pos-3]..s[pos-1] |
| bra.b 1f | 2b | |
| | +3 |
| move.l %a6, %a3 | 2b | x3 = x0 |
| movem.l -8(%a1), %a4-%a5 | 6b | x2..x1 = s[pos-2]..s[pos-1] |
| bra.b 1f | 2b | 10 |
| | +2 |
| move.l %a5, %a3 | 2b | x3 = x1 |
| move.l %a6, %a4 | 2b | x2 = x0 |
| move.l -4(%a1), %a5 | 4b | x1 = s[pos-1] |
| bra.b 1f | 2b | 10 |
| | +1 |
| move.l %a4, %a3 | 2b | x3 = x2 | expected loop destination |
| move.l %a5, %a4 | 2b | x2 = x1 |
| move.l %a6, %a5 | 2b | x1 = x0 |
| 1: |
| |
| subq.l #1, %d4 | 2b | dstrem <= 0? |
| ble.b .hrs_channel_done | 2b | yes? stop |
| cmp.l %d3, %d0 | pos reached srcrem? |
| bge.b .hrs_channel_done | yes? stop |
| |
| .hrs_dsstart: |
| move.l (%a1), %a6 | %a6 = s[pos] |
| move.l %a5, %d5 | v |
| sub.l %a4, %d5 | |
| |
| move.l %a6, %d6 | c3 |
| sub.l %a3, %d6 | |
| sub.l %d5, %d6 | |
| asr.l #1, %d6 | |
| sub.l %d5, %d6 | |
| |
| lea.l (%a3, %d5.l*2), %a0 | c2 |
| move.l %a6, %d5 | |
| add.l %a4, %d5 | |
| asr.l #1, %d5 | |
| sub.l %d5, %a0 | |
| |
| move.l %d2, %d5 | phase -> frac |
| lsl.l %d7, %d5 | |
| lsr.l #1, %d5 | |
| |
| move.l %a0, %acc0 | %acc0 = frac * c3 + c2 |
| mac.l %d5, %d6, %acc0 | |
| |
| move.l %a5, %d6 | c1 |
| sub.l %a3, %d6 | |
| asr.l #1, %d6 | |
| |
| movclr.l %acc0, %a0 | %acc1 = frac * acc + c1 |
| move.l %d6, %acc1 | |
| mac.l %d5, %a0, %acc1 | |
| |
| move.l %d0, %a0 | %a0 = last_pos |
| add.l %d1, %d2 | phase += delta |
| move.l %d2, %d0 | pos = phase >> 16 |
| lsr.l %d7, %d0 | |
| |
| movclr.l %acc1, %d6 | %acc0 = frac * acc + x2 |
| move.l %a4, %acc0 | |
| mac.l %d5, %d6, %acc0 | |
| |
| cmp.l %d3, %d0 | %d0 = MIN(pos, srcrem) |
| ble.w .hrs_dsloop | |
| move.l %d3, %d0 | |
| bra.w .hrs_dsloop | |
| |
| .hrs_channel_done: | |
| movem.l (%sp), %d5/%a0 | restore ch, h |
| movem.l %a3-%a5, (%a0) | h[0..2] = x3..x1 |
| lea.l 12(%a0), %a5 | h++ |
| movem.l 56(%sp), %a0-%a2 | load data, src, dst |
| subq.l #1, %d5 | ch > 0? |
| bgt.w .hrs_channel_loop | yes? process next channel |
| |
| | %d0, %d2 and %d4 still hold pos, phase and dstrem of the last channel |
| move.l 12(%a2), %d1 | %d1 = dst->bufcount |
| sub.l %d4, %d1 | written = dst->bufcount - dstrem |
| move.l %d1, (%a2) | dst->remcount = written |
| move.l %d0, %d1 | wrap phase to position in next frame |
| lsl.l %d7, %d1 | data->phase = phase - (pos << 16) |
| sub.l %d1, %d2 | |
| move.l %d2, 4(%a0) | |
| movem.l 8(%sp), %d2-%d7/%a2-%a6 | restore non-volatiles |
| lea.l 52(%sp), %sp | cleanup stack |
| rts | buh-bye; return value %d0 = pos |
| |
| .size resample_hermite, .-resample_hermite |
| |
| /**************************************************************************** |
| * void channel_mode_proc_mono(struct dsp_proc_entry *this, |
| * struct dsp_buffer **buf_p) |
| * |
| * Mix left and right channels 50/50 into a center channel. |
| * |
| * Works in place: both p32[0] and p32[1] of *buf_p receive l/2 + r/2. |
| * 'this' is not used. The loop is pipelined, so each mac.l also loads the |
| * next input and one longword past the end of each channel is read but |
| * never used. |
| */ |
| .section .text |
| .align 2 |
| .global channel_mode_proc_mono |
| channel_mode_proc_mono: |
| | input: 4(sp) = this, 8(sp) = buf_p |
| move.l 8(%sp), %a0 | %a0 = buf_p |
| move.l (%a0), %a0 | %a0 = buf = *buf_p |
| lea.l -20(%sp), %sp | save registers |
| movem.l %d2-%d4/%a2-%a3, (%sp) | |
| movem.l (%a0), %d0/%a0-%a1 | %d0 = buf->remcount, %a0 = buf->p32[0], |
| | %a1 = buf->p32[1] |
| move.l %a0, %a2 | use separate dst pointers since read |
| move.l %a1, %a3 | pointers run one ahead of write |
| move.l #0x40000000, %d3 | %d3 = 0.5 |
| move.l (%a0)+, %d1 | prime the input registers |
| move.l (%a1)+, %d2 | |
| mac.l %d1, %d3, (%a0)+, %d1, %acc0 | %acc0 = l/2, load next l |
| mac.l %d2, %d3, (%a1)+, %d2, %acc0 | %acc0 += r/2, load next r |
| subq.l #1, %d0 | |
| ble.s 20f | loop done | |
| 10: | loop | |
| movclr.l %acc0, %d4 | L = R = l/2 + r/2 |
| mac.l %d1, %d3, (%a0)+, %d1, %acc0 | |
| mac.l %d2, %d3, (%a1)+, %d2, %acc0 | |
| move.l %d4, (%a2)+ | output to original buffer |
| move.l %d4, (%a3)+ | |
| subq.l #1, %d0 | |
| bgt.s 10b | loop | |
| 20: | loop done | |
| movclr.l %acc0, %d4 | output last sample |
| move.l %d4, (%a2) | |
| move.l %d4, (%a3) | |
| movem.l (%sp), %d2-%d4/%a2-%a3 | restore registers |
| lea.l 20(%sp), %sp | cleanup |
| rts | |
| .size channel_mode_proc_mono, .-channel_mode_proc_mono |
| |
| /**************************************************************************** |
| * void channel_mode_proc_custom(struct dsp_proc_entry *this, |
| * struct dsp_buffer **buf_p) |
| * |
| * Apply stereo width (narrowing/expanding) effect. |
| * |
| * Works in place on both channels of *buf_p: |
| * L = l*sw_gain + r*sw_cross |
| * R = r*sw_gain + l*sw_cross |
| * sw_gain and sw_cross are the first two longwords of this->data. |
| * The loop is pipelined, so each channel is read one longword past its |
| * last sample; that value is never used. |
| */ |
| .section .text |
| .align 2 |
| .global channel_mode_proc_custom |
| channel_mode_proc_custom: |
| | input: 4(sp) = this, 8(sp) = buf_p |
| lea.l -28(%sp), %sp | save registers |
| movem.l %d2-%d6/%a2-%a3, (%sp) | |
| movem.l 32(%sp), %a0-%a1 | %a0 = this, %a1 = buf_p |
| move.l (%a1), %a1 | %a1 = buf = *buf_p |
| move.l (%a0), %a2 | %a2 = this->data = &channel_mode_data |
| movem.l (%a1), %d0/%a0-%a1 | %d0 = buf->remcount, %a0 = buf->p32[0], |
| | %a1 = buf->p32[1] |
| movem.l (%a2), %d3-%d4 | %d3 = sw_gain, %d4 = sw_cross |
| move.l %a0, %a2 | use separate dst pointers since read |
| move.l %a1, %a3 | pointers run one ahead of write |
| move.l (%a0)+, %d1 | prime the input registers |
| move.l (%a1)+, %d2 | |
| mac.l %d1, %d3 , %acc0 | L = l*gain + r*cross |
| mac.l %d1, %d4, (%a0)+, %d1, %acc1 | R = r*gain + l*cross |
| mac.l %d2, %d4 , %acc0 | |
| mac.l %d2, %d3, (%a1)+, %d2, %acc1 | |
| subq.l #1, %d0 | |
| ble.b 20f | loop done | |
| 10: | loop | |
| movclr.l %acc0, %d5 | fetch previous L and R results |
| movclr.l %acc1, %d6 | |
| mac.l %d1, %d3 , %acc0 | L = l*gain + r*cross |
| mac.l %d1, %d4, (%a0)+, %d1, %acc1 | R = r*gain + l*cross |
| mac.l %d2, %d4 , %acc0 | |
| mac.l %d2, %d3, (%a1)+, %d2, %acc1 | |
| move.l %d5, (%a2)+ | |
| move.l %d6, (%a3)+ | |
| subq.l #1, %d0 | |
| bgt.s 10b | loop | |
| 20: | loop done | |
| movclr.l %acc0, %d5 | output last sample |
| movclr.l %acc1, %d6 | |
| move.l %d5, (%a2) | |
| move.l %d6, (%a3) | |
| movem.l (%sp), %d2-%d6/%a2-%a3 | restore registers |
| lea.l 28(%sp), %sp | cleanup |
| rts | |
| .size channel_mode_proc_custom, .-channel_mode_proc_custom |
| |
| /**************************************************************************** |
| * void channel_mode_proc_karaoke(struct dsp_proc_entry *this, |
| * struct dsp_buffer **buf_p) |
| * |
| * Separate channels into side channels. |
| * |
| * Works in place on *buf_p: L = l/2 - r/2 and R = -L, which removes |
| * whatever is common to both inputs. 'this' is not used. The loop is |
| * pipelined, so each channel is read one longword past its last sample; |
| * that value is never used. |
| */ |
| .section .text |
| .align 2 |
| .global channel_mode_proc_karaoke |
| channel_mode_proc_karaoke: |
| | input: 4(sp) = this, 8(sp) = buf_p |
| move.l 8(%sp), %a0 | %a0 = buf_p |
| move.l (%a0), %a0 | %a0 = buf = *buf_p |
| lea.l -20(%sp), %sp | save registers |
| movem.l %d2-%d4/%a2-%a3, (%sp) | |
| movem.l (%a0), %d0/%a0-%a1 | %d0 = buf->remcount, %a0 = buf->p32[0], |
| | %a1 = buf->p32[1] |
| move.l %a0, %a2 | use separate dst pointers since read |
| move.l %a1, %a3 | pointers run one ahead of write |
| move.l #0x40000000, %d3 | %d3 = 0.5 |
| move.l (%a0)+, %d1 | prime the input registers |
| move.l (%a1)+, %d2 | |
| mac.l %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2 |
| msac.l %d2, %d3, (%a1)+, %d2, %acc0 | |
| subq.l #1, %d0 | |
| ble.b 20f | loop done | |
| 10: | loop | |
| movclr.l %acc0, %d4 | fetch previous L result |
| mac.l %d1, %d3, (%a0)+, %d1, %acc0 | L = l/2 - r/2 |
| msac.l %d2, %d3, (%a1)+, %d2, %acc0 | |
| move.l %d4, (%a2)+ | |
| neg.l %d4 | R = -L = -(l/2 - r/2) = r/2 - l/2 |
| move.l %d4, (%a3)+ | |
| subq.l #1, %d0 | |
| bgt.s 10b | loop | |
| 20: | loop done | |
| movclr.l %acc0, %d4 | output last sample |
| move.l %d4, (%a2) | |
| neg.l %d4 | R = -L = -(l/2 - r/2) = r/2 - l/2 |
| move.l %d4, (%a3) | |
| movem.l (%sp), %d2-%d4/%a2-%a3 | restore registers |
| lea.l 20(%sp), %sp | cleanup |
| rts | |
| .size channel_mode_proc_karaoke, .-channel_mode_proc_karaoke |
| |
| /**************************************************************************** |
| * void filter_process(struct dsp_filter *f, int32_t *buf[], int count, |
| * unsigned int channels) |
| * |
| * Runs a direct form 1 biquad over 'count' samples of each of 'channels' |
| * buffers in buf[], in place. |
| * |
| * Layout of *f as used here (byte offsets): 0..19 five coefficients |
| * (b0, b1, b2, a1, a2), 20.. per-channel history of four longwords |
| * (x[i-1], x[i-2], y[i-1], y[i-2]), 52 shift count (one byte). |
| * The 'buf' and 'channels' argument slots on the stack are modified while |
| * iterating. |
| * |
| * define HIGH_PRECISION as '1' to make filtering calculate lower bits after |
| * shifting. without this, "shift" - 1 of the lower bits will be lost here. |
| * |
| * NOTE(review): the HIGH_PRECISION path is currently compiled out. As |
| * written, move.b only replaces the low byte of %d4 (the upper bytes keep |
| * whatever %d4 held before), and the or.l leaves the combined value in |
| * %d4 while %d2 is what gets stored - verify before enabling. |
| */ |
| #define HIGH_PRECISION 0 |
| .text |
| .global filter_process |
| filter_process: |
| | input: 4(sp) = f, 8(sp) = buf, 12(sp) = count, 16(sp) = channels |
| lea.l -44(%sp), %sp | save clobbered regs |
| #if HIGH_PRECISION |
| movem.l %d2-%d7/%a2-%a6, (%sp) | . |
| #else |
| movem.l %d2-%d6/%a2-%a6, (%sp) | |
| #endif |
| move.l 48(%sp), %a5 | fetch filter structure address |
| clr.l %d6 | load shift count |
| move.b 52(%a5), %d6 | . |
| subq.l #1, %d6 | EMAC gives us one free shift |
| #if HIGH_PRECISION |
| moveq.l #8, %d7 |
| sub.l %d6, %d7 | shift for lower part of accumulator |
| #endif |
| movem.l (%a5), %a0-%a4 | load coefs |
| lea.l 20(%a5), %a5 | point to filter history |
| |
| 10: | channel loop |
| move.l 52(%sp), %a6 | load input channel pointer |
| addq.l #4, 52(%sp) | point x to next channel |
| move.l (%a6), %a6 | |
| move.l 56(%sp), %d5 | number of samples |
| movem.l (%a5), %d0-%d3 | load filter history |
| |
| | d0-d3 = history, d4 = temp, d5 = sample count, d6 = upper shift amount, |
| | d7 = lower shift amount,a0-a4 = coefs, a5 = history pointer, a6 = buf[ch] |
| 20: | loop |
| | Direct form 1 filtering code. We assume DSP has put EMAC in frac mode. |
| | y[n] = b0*x[i] + b1*x[i - 1] + b2*x[i - 2] + a1*y[i - 1] + a2*y[i - 2], |
| | where y[] is output and x[] is input. This is performed out of order |
| | to do parallel load of input value. |
| mac.l %a2, %d1, %acc0 | acc = b2*x[i - 2] |
| move.l %d0, %d1 | fix input history |
| mac.l %a1, %d0, (%a6), %d0, %acc0 | acc += b1*x[i - 1], x[i] -> d0 |
| mac.l %a0, %d0, %acc0 | acc += b0*x[i] |
| mac.l %a3, %d2, %acc0 | acc += a1*y[i - 1] |
| mac.l %a4, %d3, %acc0 | acc += a2*y[i - 2] |
| move.l %d2, %d3 | fix output history |
| #if HIGH_PRECISION |
| move.l %accext01, %d2 | fetch lower part of accumulator |
| move.b %d2, %d4 | copy low byte into %d4 (upper bytes untouched) |
| lsr.l %d7, %d4 | shift lower bits |
| #endif |
| movclr.l %acc0, %d2 | fetch upper part of result |
| asl.l %d6, %d2 | restore fixed point format |
| #if HIGH_PRECISION |
| or.l %d2, %d4 | combine lower and upper parts (result in %d4) |
| #endif |
| move.l %d2, (%a6)+ | save result |
| subq.l #1, %d5 | are we done with this channel? |
| bgt 20b | loop |
| |
| movem.l %d0-%d3, (%a5) | save history back to struct |
| lea.l 16(%a5), %a5 | point to next channel's history |
| subq.l #1, 60(%sp) | any channels left? |
| bhi 10b | channel loop |
| |
| #if HIGH_PRECISION |
| movem.l (%sp), %d2-%d7/%a2-%a6 |
| #else |
| movem.l (%sp), %d2-%d6/%a2-%a6 |
| #endif |
| lea.l 44(%sp), %sp |
| rts |
| .size filter_process, .-filter_process |
| |
| /**************************************************************************** |
| * void sample_output_stereo(struct sample_io_data *this, |
| * struct dsp_buffer *src, |
| * struct dsp_buffer *dst) |
| * |
| * Converts the two 32-bit source channels to interleaved 16-bit stereo in |
| * dst->p16out. Each sample is multiplied by (1 << (16 - output_scale)), |
| * with 0x8000 preloaded in the accumulator as a rounding term, and the |
| * upper 16 bits of the result are kept: L in the high word and R in the |
| * low word of each output longword. |
| * |
| * The number of output frames is read from the first longword of 'this'. |
| * MACSR is saved on entry, set to 0x80 for the conversion, and restored |
| * before returning. |
| * |
| * Framework based on the ubiquitous Rockbox line transfer logic for |
| * Coldfire CPUs. |
| * |
| * Does emac clamping and scaling (which proved faster than the usual |
| * checks and branches - even single test clamping) and writes using |
| * line burst transfers. Also better than writing a single L-R pair per |
| * loop but a good deal more code. |
| * |
| * Attempting bursting during reads is rather futile since the source and |
| * destination alignments rarely agree and too much complication will |
| * slow us up. The parallel loads seem to do a bit better at least until |
| * a pcm buffer can always give line aligned chunk and then aligning the |
| * dest can then imply the source is aligned if the source buffers are. |
| * For now longword alignment is assumed of both the source and dest. |
| * |
| */ |
| .section .text |
| .align 2 |
| .global sample_output_stereo |
| sample_output_stereo: |
| | input: 4(sp) = this, 8(sp) = src, 12(sp) = dst |
| lea.l -48(%sp), %sp | save registers |
| move.l %macsr, %d1 | do it now as at many lines will |
| movem.l %d1-%d7/%a2-%a6, (%sp) | be the far more common condition |
| move.l #0x80, %macsr | put emac unit in signed int mode |
| movem.l 52(%sp), %a0-%a2 | %a0 = this, %a1 = src, %a2 = dst |
| move.l (%a0), %a0 | %a0 = this->outcount |
| move.l 4(%a2), %a4 | %a4 = dst->p16out |
| lea.l (%a4, %a0.l*4), %a0 | %a0 = count -> end address |
| movem.l 4(%a1), %a2-%a3 | %a2 = src->p32[0], %a3 = src->p32[1] |
| clr.l %d1 | %a1 = multiplier: (1 << (16 - scale)) |
| move.b 19(%a1), %d1 | %d1 = src->format.output_scale |
| sub.l #16, %d1 | |
| neg.l %d1 | |
| moveq.l #1, %d0 | |
| asl.l %d1, %d0 | |
| move.l %d0, %a1 | |
| move.l #0x8000, %a6 | %a6 = rounding term |
| moveq.l #28, %d0 | %d0 = second line bound |
| add.l %a4, %d0 | |
| and.l #0xfffffff0, %d0 | |
| cmp.l %a0, %d0 | at least a full line? |
| bhi.w 40f | long loop 1 start | no? do as trailing longwords |
| sub.l #16, %d0 | %d0 = first line bound |
| cmp.l %a4, %d0 | any leading longwords? |
| bls.b 20f | line loop start | no? start line loop |
| 10: | long loop 0 | |
| move.l (%a2)+, %d1 | read longword from L and R |
| move.l %a6, %acc0 | |
| move.l %acc0, %acc1 | |
| mac.l %d1, %a1, (%a3)+, %d2, %acc0 | shift L to high word |
| mac.l %d2, %a1, %acc1 | shift R to high word |
| movclr.l %acc0, %d1 | get possibly saturated results |
| movclr.l %acc1, %d2 | |
| swap.w %d2 | move R to low word |
| move.w %d2, %d1 | interleave MS 16 bits of each |
| move.l %d1, (%a4)+ | ...and write both |
| cmp.l %a4, %d0 | |
| bhi.b 10b | long loop 0 | |
| 20: | line loop start | |
| lea.l -12(%a0), %a5 | %a5 = at or just before last line bound |
| 30: | line loop | |
| move.l (%a3)+, %d4 | get next 4 R samples and scale |
| move.l %a6, %acc0 | preload rounding term in all four accs |
| move.l %acc0, %acc1 | |
| move.l %acc1, %acc2 | |
| move.l %acc2, %acc3 | |
| mac.l %d4, %a1, (%a3)+, %d5, %acc0 | with saturation |
| mac.l %d5, %a1, (%a3)+, %d6, %acc1 | |
| mac.l %d6, %a1, (%a3)+, %d7, %acc2 | |
| mac.l %d7, %a1, (%a2)+, %d0, %acc3 | ...and load first L sample |
| lea.l 16(%a4), %a4 | increment dest here, mitigate stalls |
| movclr.l %acc0, %d4 | obtain R results |
| movclr.l %acc1, %d5 | |
| movclr.l %acc2, %d6 | |
| movclr.l %acc3, %d7 | |
| move.l %a6, %acc0 | |
| move.l %acc0, %acc1 | |
| move.l %acc1, %acc2 | |
| move.l %acc2, %acc3 | |
| mac.l %d0, %a1, (%a2)+, %d1, %acc0 | get next 4 L samples and scale |
| mac.l %d1, %a1, (%a2)+, %d2, %acc1 | with saturation |
| mac.l %d2, %a1, (%a2)+, %d3, %acc2 | |
| mac.l %d3, %a1 , %acc3 | |
| swap.w %d4 | a) interleave most significant... |
| swap.w %d5 | |
| swap.w %d6 | |
| swap.w %d7 | |
| movclr.l %acc0, %d0 | obtain L results |
| movclr.l %acc1, %d1 | |
| movclr.l %acc2, %d2 | |
| movclr.l %acc3, %d3 | |
| move.w %d4, %d0 | a) ... 16 bits of L and R |
| move.w %d5, %d1 | |
| move.w %d6, %d2 | |
| move.w %d7, %d3 | |
| movem.l %d0-%d3, -16(%a4) | write four stereo samples |
| cmp.l %a4, %a5 | |
| bhi.b 30b | line loop | |
| 40: | long loop 1 start | |
| cmp.l %a4, %a0 | any longwords left? |
| bls.b 60f | output end | no? stop |
| 50: | long loop 1 | |
| move.l (%a2)+, %d1 | handle trailing longwords |
| move.l %a6, %acc0 | |
| move.l %acc0, %acc1 | |
| mac.l %d1, %a1, (%a3)+, %d2, %acc0 | the same way as leading ones |
| mac.l %d2, %a1, %acc1 | |
| movclr.l %acc0, %d1 | |
| movclr.l %acc1, %d2 | |
| swap.w %d2 | |
| move.w %d2, %d1 | |
| move.l %d1, (%a4)+ | |
| cmp.l %a4, %a0 | |
| bhi.b 50b | long loop 1 |
| 60: | output end | |
| movem.l (%sp), %d1-%d7/%a2-%a6 | restore registers |
| move.l %d1, %macsr | restore caller's emac mode |
| lea.l 48(%sp), %sp | cleanup |
| rts | |
| .size sample_output_stereo, .-sample_output_stereo |
| |
| /**************************************************************************** |
| * void sample_output_mono(struct sample_io_data *this, |
| * struct dsp_buffer *src, |
| * struct dsp_buffer *dst) |
| * |
| * Same treatment as sample_output_stereo but for one channel. |
| * |
| * Only src->p32[0] is read. Each scaled and rounded 16-bit result is |
| * duplicated into both halves of the output longword, so dst->p16out |
| * still receives interleaved L/R frames. MACSR is saved on entry, set to |
| * 0x80 for the conversion, and restored before returning. |
| */ |
| .section .text |
| .align 2 |
| .global sample_output_mono |
| sample_output_mono: |
| | input: 4(sp) = this, 8(sp) = src, 12(sp) = dst |
| lea.l -32(%sp), %sp | save registers |
| move.l %macsr, %d1 | do it now as at many lines will |
| movem.l %d1-%d5/%a2-%a4, (%sp) | be the far more common condition |
| move.l #0x80, %macsr | put emac unit in signed int mode |
| movem.l 36(%sp), %a0-%a2 | %a0 = this, %a1 = src, %a2 = dst |
| move.l (%a0), %a0 | %a0 = this->outcount |
| move.l 4(%a2), %a3 | %a3 = dst->p16out |
| movem.l 4(%a1), %a2 | %a2 = src->p32[0] |
| lea.l (%a3, %a0.l*4), %a0 | %a0 = count -> end address |
| clr.l %d1 | %d5 = multiplier: (1 << (16 - scale)) |
| move.b 19(%a1), %d1 | %d1 = src->format.output_scale |
| sub.l #16, %d1 | |
| neg.l %d1 | |
| moveq.l #1, %d5 | |
| asl.l %d1, %d5 | |
| move.l #0x8000, %a4 | %a4 = rounding term |
| moveq.l #28, %d0 | %d0 = second line bound |
| add.l %a3, %d0 | |
| and.l #0xfffffff0, %d0 | |
| cmp.l %a0, %d0 | at least a full line? |
| bhi.w 40f | long loop 1 start | no? do as trailing longwords |
| sub.l #16, %d0 | %d0 = first line bound |
| cmp.l %a3, %d0 | any leading longwords? |
| bls.b 20f | line loop start | no? start line loop |
| 10: | long loop 0 | |
| move.l (%a2)+, %d1 | read longword from the single channel |
| move.l %a4, %acc0 | |
| mac.l %d1, %d5, %acc0 | shift sample to high word |
| movclr.l %acc0, %d1 | get possibly saturated results |
| move.l %d1, %d2 | |
| swap.w %d2 | copy high word into low word |
| move.w %d2, %d1 | duplicate single channel into |
| move.l %d1, (%a3)+ | L and R |
| cmp.l %a3, %d0 | |
| bhi.b 10b | long loop 0 | |
| 20: | line loop start | |
| lea.l -12(%a0), %a1 | %a1 = at or just before last line bound |
| 30: | line loop | |
| move.l (%a2)+, %d0 | get next 4 samples and scale |
| move.l %a4, %acc0 | preload rounding term in all four accs |
| move.l %acc0, %acc1 | |
| move.l %acc1, %acc2 | |
| move.l %acc2, %acc3 | |
| mac.l %d0, %d5, (%a2)+, %d1, %acc0 | with saturation |
| mac.l %d1, %d5, (%a2)+, %d2, %acc1 | |
| mac.l %d2, %d5, (%a2)+, %d3, %acc2 | |
| mac.l %d3, %d5 , %acc3 | |
| lea.l 16(%a3), %a3 | increment dest here, mitigate stalls |
| movclr.l %acc0, %d0 | obtain results |
| movclr.l %acc1, %d1 | |
| movclr.l %acc2, %d2 | |
| movclr.l %acc3, %d3 | |
| move.l %d0, %d4 | duplicate single channel |
| swap.w %d4 | into L and R |
| move.w %d4, %d0 | |
| move.l %d1, %d4 | |
| swap.w %d4 | |
| move.w %d4, %d1 | |
| move.l %d2, %d4 | |
| swap.w %d4 | |
| move.w %d4, %d2 | |
| move.l %d3, %d4 | |
| swap.w %d4 | |
| move.w %d4, %d3 | |
| movem.l %d0-%d3, -16(%a3) | write four stereo samples |
| cmp.l %a3, %a1 | |
| bhi.b 30b | line loop | |
| 40: | long loop 1 start | |
| cmp.l %a3, %a0 | any longwords left? |
| bls.b 60f | output end | no? stop |
| 50: | long loop 1 | |
| move.l (%a2)+, %d1 | handle trailing longwords |
| move.l %a4, %acc0 | |
| mac.l %d1, %d5, %acc0 | the same way as leading ones |
| movclr.l %acc0, %d1 | |
| move.l %d1, %d2 | |
| swap.w %d2 | |
| move.w %d2, %d1 | |
| move.l %d1, (%a3)+ | |
| cmp.l %a3, %a0 | |
| bhi.b 50b | long loop 1 | |
| 60: | output end | |
| movem.l (%sp), %d1-%d5/%a2-%a4 | restore registers |
| move.l %d1, %macsr | restore caller's emac mode |
| lea.l 32(%sp), %sp | cleanup |
| rts | |
| .size sample_output_mono, .-sample_output_mono |