blob: 884a0ac90f44d031bc8ee24dfce8b18111f7e557 [file] [log] [blame]
/***************************************************************************
 *             __________               __   ___.
 *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
 *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
 *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
 *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
 *                     \/            \/     \/    \/            \/
 * $Id$
 *
 * Copyright (C) 2005 by David Bryant
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
 * KIND, either express or implied.
 *
 ****************************************************************************/
21
/* This is an assembly optimized version of the following WavPack function:
 *
 * void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp,
 *                                       long *buffer, long sample_count);
 *
 * It performs a single pass of stereo decorrelation on the provided buffer.
 * Note that this version of the function requires that the 8 previous stereo
 * samples are visible and correct. In other words, it ignores the "samples_*"
 * fields in the decorr_pass structure and gets the history data directly
 * from the buffer. It does, however, return the appropriate history samples
 * to the decorr_pass structure before returning.
 *
 * This is written to work on a MCF5249 processor, or any processor based on
 * the ColdFire V2 core with an EMAC unit. The EMAC is perfectly suited for
 * the "apply_weight" function of WavPack decorrelation because it provides
 * the required 40-bit product. The fractional rounding mode of the EMAC is not
 * configurable and uses "round to even" while WavPack uses "round to larger",
 * so the rounding has to be done manually.
 */
41
|------------------------------------------------------------------------------
| Entry point: save registers, load and pre-scale the pass parameters, set up
| the EMAC, then branch to the handler for dpp->term.
|
| Stack arguments (offsets are taken after the 44-byte register save area):
|   44+4(%sp)  = dpp            44+8(%sp) = buffer      44+12(%sp) = sample_count
|
| Register state handed to every term handler:
|   a1 = bptr (start of buffer)     d3 = dpp->weight_A << 17
|   a2 = dpp                        d4 = dpp->weight_B << 17
|   a3 = dpp->delta << 17           d5 = eptr = bptr + sample_count * 8
|   d6 =  1024 << 17                d7 = -1024 << 17
|   macsr = 0x20                    acc1 = 0x00 0000 80 (rounding constant)
|
| A sample_count of zero returns immediately without touching dpp or the EMAC.
|------------------------------------------------------------------------------

        .text
        .align  2
        .global decorr_stereo_pass_cont_mcf5249

decorr_stereo_pass_cont_mcf5249:

        lea     (-44, %sp), %sp         | 11 registers * 4 bytes
        movem.l %d2-%d7/%a2-%a6, (%sp)
        move.l  44+4(%sp), %a2          | a2 = dpp
        move.l  44+8(%sp), %a1          | a1 = bptr
        move.w  2(%a2), %a3             | a3 = dpp->delta (word load into an
                                        |  address register sign extends)
        move.w  4(%a2), %d3             | d3 = dpp->weight_A (sign extended)
        ext.l   %d3
        move.w  6(%a2), %d4             | d4 = dpp->weight_B (sign extended)
        ext.l   %d4
        move.l  44+12(%sp), %d0         | d0 = sample_count (move sets Z)
        jbeq    return_only             | if zero, nothing to do

        lsl.l   #3, %d0                 | d5 = eptr = bptr + sample_count * 8
        move.l  %d0, %d5
        add.l   %a1, %d5

        moveq.l #17, %d0                | left shift weights & delta 17 places
        asl.l   %d0, %d3
        asl.l   %d0, %d4
        move.l  %a3, %d1                | shifts need a data register, so
        asl.l   %d0, %d1                |  delta takes a detour through d1
        move.l  %d1, %a3

        moveq.l #0x20, %d6
        move.l  %d6, %macsr             | set fractional mode for MAC

        | Writing ACCext01 replaces only the extension bytes of acc0/acc1.
        | Zero the 32-bit body of acc1 first so the rounding constant does
        | not inherit whatever a previous EMAC user left in the accumulator.
        moveq.l #0, %d6
        move.l  %d6, %acc1
        move.l  #0x800000, %accext01    | acc1 = 0x00 0000 80 (for rounding)

        move.l  #1024<<17, %d6          | d6 & d7 are weight clipping limits
        move.l  #-1024<<17, %d7         | (only used by negative terms)

        move.w  (%a2), %d0              | d0 = dpp->term (sign extended)
        ext.l   %d0
        cmp.l   #17, %d0
        jbeq    term_17                 | term = 17
        cmp.l   #18, %d0
        jbeq    term_18                 | term = 18
        addq.l  #1, %d0                 | count up towards zero: the first
        jbeq    term_minus_1            |  increment hits zero for term = -1,
        addq.l  #1, %d0
        jbeq    term_minus_2            |  the second for term = -2,
        addq.l  #1, %d0
        jbeq    term_minus_3            |  the third for term = -3
        jbra    term_default            | anything else: default term = 1 - 8
91
|------------------------------------------------------------------------------
| Handler for term = 17
|
| Every longword is adjusted using the two earlier longwords that sit 8 and
| 16 bytes before it:   d0 = 2 * bptr[-8 bytes] - bptr[-16 bytes]
| Longwords alternate between weight_A (d3) and weight_B (d4).
|
|   a1 = bptr                       d0 = decorrelation value, then scratch
|   a2 = dpp                        d1 = longword as read from the buffer
|   a3 = dpp->delta << 17           d2 = longword as written back
|   d3 = dpp->weight_A << 17        d4 = dpp->weight_B << 17
|   d5 = eptr
|   macsr = 0x20                    acc1 = 0x00 0000 80
|------------------------------------------------------------------------------

term_17:
        move.l  -8(%a1), %d0            | d0 = 2 * (long at -8) - (long at -16)
        add.l   %d0, %d0
        sub.l   -16(%a1), %d0
        beq     .Lt17_a_zero            | zero: leave this longword untouched
        move.l  %acc1, %acc0            | start acc0 from the rounding constant
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = current longword
        beq     .Lt17_a_store           | zero sample: weight_A is not adapted
        eor.l   %d1, %d0                | N is set when the signs differ
        bge     .Lt17_a_up
        sub.l   %a3, %d3                | signs differ: take delta off twice,
        sub.l   %a3, %d3                |  the add below nets weight_A -= delta
.Lt17_a_up:
        add.l   %a3, %d3                | signs agree: weight_A += delta

.Lt17_a_store:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2
        move.l  %d2, (%a1)+             | write back sum, advance one longword

.Lt17_b:
        move.l  -8(%a1), %d0            | same computation for the next longword
        add.l   %d0, %d0
        sub.l   -16(%a1), %d0
        beq     .Lt17_b_zero            | zero: leave this longword untouched
        move.l  %acc1, %acc0
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  (%a1), %d1
        beq     .Lt17_b_store           | zero sample: weight_B is not adapted
        eor.l   %d1, %d0
        bge     .Lt17_b_up
        sub.l   %a3, %d4                | signs differ: nets weight_B -= delta
        sub.l   %a3, %d4
.Lt17_b_up:
        add.l   %a3, %d4                | signs agree: weight_B += delta

.Lt17_b_store:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2
        move.l  %d2, (%a1)+             | write back sum, advance one longword

.Lt17_next:
        cmp.l   %a1, %d5                | keep going while bptr < eptr
        jbhi    term_17
        bra     term_17_18_finish       | exit through common path

.Lt17_a_zero:
        addq.l  #4, %a1                 | skip the first longword of the pair
        bra     .Lt17_b

.Lt17_b_zero:
        addq.l  #4, %a1                 | skip the second longword of the pair
        bra     .Lt17_next
152
|------------------------------------------------------------------------------
| Handler for term = 18
|
| Every longword is adjusted using the two earlier longwords that sit 8 and
| 16 bytes before it:   d0 = (3 * bptr[-8 bytes] - bptr[-16 bytes]) >> 1
| Longwords alternate between weight_A (d3) and weight_B (d4).
|
|   a0 = scratch for the * 3        d0 = decorrelation value, then scratch
|   a1 = bptr                       d1 = longword as read from the buffer
|   a2 = dpp                        d2 = longword as written back
|   a3 = dpp->delta << 17           d3 = dpp->weight_A << 17
|   d5 = eptr                       d4 = dpp->weight_B << 17
|   macsr = 0x20                    acc1 = 0x00 0000 80
|------------------------------------------------------------------------------

term_18:
        move.l  -8(%a1), %a0            | a0 = long at -8 ...
        lea     (%a0,%a0.l*2), %a0      | ... times three
        move.l  %a0, %d0
        sub.l   -16(%a1), %d0           | minus the long at -16
        asr.l   #1, %d0                 | halved (arithmetic shift)
        beq     .Lt18_a_zero            | zero: leave this longword untouched
        move.l  %acc1, %acc0            | start acc0 from the rounding constant
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = current longword
        beq     .Lt18_a_store           | zero sample: weight_A is not adapted
        eor.l   %d1, %d0                | N is set when the signs differ
        bge     .Lt18_a_up
        sub.l   %a3, %d3                | signs differ: take delta off twice,
        sub.l   %a3, %d3                |  the add below nets weight_A -= delta
.Lt18_a_up:
        add.l   %a3, %d3                | signs agree: weight_A += delta

.Lt18_a_store:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2
        move.l  %d2, (%a1)+             | write back sum, advance one longword

.Lt18_b:
        move.l  -8(%a1), %a0            | same computation for the next longword
        lea     (%a0,%a0.l*2), %a0
        move.l  %a0, %d0
        sub.l   -16(%a1), %d0
        asr.l   #1, %d0
        beq     .Lt18_b_zero            | zero: leave this longword untouched
        move.l  %acc1, %acc0
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  (%a1), %d1
        beq     .Lt18_b_store           | zero sample: weight_B is not adapted
        eor.l   %d1, %d0
        bge     .Lt18_b_up
        sub.l   %a3, %d4                | signs differ: nets weight_B -= delta
        sub.l   %a3, %d4
.Lt18_b_up:
        add.l   %a3, %d4                | signs agree: weight_B += delta

.Lt18_b_store:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2
        move.l  %d2, (%a1)+             | write back sum, advance one longword

.Lt18_next:
        cmp.l   %a1, %d5                | keep going while bptr < eptr
        jbhi    term_18
        bra     term_17_18_finish       | exit through common path

.Lt18_a_zero:
        addq.l  #4, %a1                 | skip the first longword of the pair
        bra     .Lt18_b

.Lt18_b_zero:
        addq.l  #4, %a1                 | skip the second longword of the pair
        bra     .Lt18_next

| Shared exit for terms 17 and 18: a1 = eptr here, so the last four longwords
| of the buffer are copied back into the history slots of the pass structure.

term_17_18_finish:
        move.l  -4(%a1), 40(%a2)        | dpp->samples_B [0] = last longword
        move.l  -8(%a1), 8(%a2)         | dpp->samples_A [0] = one before that
        move.l  -12(%a1), 44(%a2)       | dpp->samples_B [1]
        move.l  -16(%a1), 12(%a2)       | dpp->samples_A [1]
        jbra    finish_up
224
|------------------------------------------------------------------------------
| Handler for the default terms (i.e. 1 - 8)
|
| The decorrelation value for each longword is simply the longword that lies
| term * 8 bytes earlier in the buffer; tptr (a0) trails bptr (a1) by that
| distance and both advance together.
|
|   a0 = tptr                       d0 = tptr [0], then scratch
|   a1 = bptr                       d1 = longword as read from the buffer
|   a2 = dpp                        d2 = longword as written back
|   a3 = dpp->delta << 17           d3 = dpp->weight_A << 17
|   d5 = eptr                       d4 = dpp->weight_B << 17
|   macsr = 0x20                    acc1 = 0x00 0000 80
|------------------------------------------------------------------------------

term_default:
        move.w  (%a2), %d0              | a0 = a1 - (dpp->term * 8)
        ext.l   %d0
        lsl.l   #3, %d0
        move.l  %a1, %a0
        sub.l   %d0, %a0

term_default_loop:
        move.l  (%a0)+, %d0             | d0 = tptr [0], tptr advances
        beq     .Ltd_a_zero             | zero: leave this longword untouched
        move.l  %acc1, %acc0            | start acc0 from the rounding constant
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = current longword
        beq     .Ltd_a_store            | zero sample: weight_A is not adapted
        eor.l   %d1, %d0                | N is set when the signs differ
        bge     .Ltd_a_up
        sub.l   %a3, %d3                | signs differ: take delta off twice,
        sub.l   %a3, %d3                |  the add below nets weight_A -= delta
.Ltd_a_up:
        add.l   %a3, %d3                | signs agree: weight_A += delta

.Ltd_a_store:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2
        move.l  %d2, (%a1)+             | write back sum, advance one longword

.Ltd_b:
        move.l  (%a0)+, %d0             | d0 = tptr [0], tptr advances
        beq     .Ltd_b_zero             | zero: leave this longword untouched
        move.l  %acc1, %acc0
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  (%a1), %d1
        beq     .Ltd_b_store            | zero sample: weight_B is not adapted
        eor.l   %d1, %d0
        bge     .Ltd_b_up
        sub.l   %a3, %d4                | signs differ: nets weight_B -= delta
        sub.l   %a3, %d4
.Ltd_b_up:
        add.l   %a3, %d4                | signs agree: weight_B += delta

.Ltd_b_store:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2
        move.l  %d2, (%a1)+             | write back sum, advance one longword

.Ltd_next:
        cmp.l   %a1, %d5                | keep going while bptr < eptr
        jbhi    term_default_loop

        | Copy the final 8 longword pairs back into the history arrays,
        | walking backwards from eptr.  The slot index starts at term - 1
        | and is decremented modulo 8 for every pair.
        move.w  (%a2), %d0              | d0 = term (only the low 3 bits of
                                        |  the running index survive the mask)
        moveq.l #8, %d1                 | d1 = pairs left to copy

.Ltd_save:
        subq.l  #1, %d0                 | back up & mask index
        and.l   #7, %d0
        move.l  -(%a1), 40(%a2,%d0.l*4) | store dpp->samples_B [d0]
        move.l  -(%a1), 8(%a2,%d0.l*4)  | store dpp->samples_A [d0]
        subq.l  #1, %d1
        jbne    .Ltd_save
        jbra    finish_up

.Ltd_a_zero:
        addq.l  #4, %a1                 | skip the first longword of the pair
        bra     .Ltd_b

.Ltd_b_zero:
        addq.l  #4, %a1                 | skip the second longword of the pair
        bra     .Ltd_next
297
298
|------------------------------------------------------------------------------
| Handler for term = -1
|
| Within each pair the first longword is adjusted from the longword directly
| before it, and the second longword is adjusted from the (already updated)
| first one.  Weights are clamped to the range held in d7 .. d6.
|
|   a1 = bptr                       d0 = decorrelation value / new 1st long
|   a2 = dpp                        d1 = longword as read from the buffer
|   a3 = dpp->delta << 17           d2 = new 2nd longword
|   d3 = dpp->weight_A << 17        d4 = dpp->weight_B << 17
|   d5 = eptr                       d6 = 1024 << 17,  d7 = -1024 << 17
|   macsr = 0x20                    acc1 = 0x00 0000 80
|------------------------------------------------------------------------------

term_minus_1:
        move.l  -4(%a1), %d0            | d0 = longword just before the pair
        beq     .Lm1_a_zero             | zero: first longword stays as it is
        move.l  %acc1, %acc0            | start acc0 from the rounding constant
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = first longword of the pair
        beq     .Lm1_a_store            | zero sample: weight_A is not adapted
        eor.l   %d1, %d0                | N is set when the signs differ
        bge     .Lm1_a_up
        sub.l   %a3, %d3                | signs differ: weight_A -= delta
        cmp.l   %d7, %d3
        bge     .Lm1_a_store            | still at or above the lower limit
        move.l  %d7, %d3                | clamp to -1024 << 17
        bra     .Lm1_a_store

.Lm1_a_up:
        add.l   %a3, %d3                | signs agree: weight_A += delta
        cmp.l   %d6, %d3
        ble     .Lm1_a_store            | still at or below the upper limit
        move.l  %d6, %d3                | clamp to 1024 << 17

.Lm1_a_store:
        move.l  %acc0, %d0              | d0 = rounded product
        add.l   %d1, %d0
        move.l  %d0, (%a1)+             | write back; d0 now feeds the 2nd long
        beq     .Lm1_b_skip             | new value is zero: skip the 2nd long

.Lm1_b:
        move.l  %acc1, %acc0
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  (%a1), %d1              | d1 = second longword of the pair
        beq     .Lm1_b_store            | zero sample: weight_B is not adapted
        eor.l   %d1, %d0
        bge     .Lm1_b_up
        sub.l   %a3, %d4                | signs differ: weight_B -= delta
        cmp.l   %d7, %d4
        bge     .Lm1_b_store            | still at or above the lower limit
        move.l  %d7, %d4                | clamp to -1024 << 17
        bra     .Lm1_b_store

.Lm1_b_up:
        add.l   %a3, %d4                | signs agree: weight_B += delta
        cmp.l   %d6, %d4
        ble     .Lm1_b_store            | still at or below the upper limit
        move.l  %d6, %d4                | clamp to 1024 << 17

.Lm1_b_store:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2
        move.l  %d2, (%a1)+             | write back second longword

.Lm1_next:
        cmp.l   %a1, %d5                | keep going while bptr < eptr
        jbhi    term_minus_1
        move.l  -4(%a1), 8(%a2)         | dpp->samples_A [0] = last longword
        jbra    finish_up

.Lm1_a_zero:
        move.l  (%a1)+, %d0             | first longword is used unmodified as
        bne     .Lm1_b                  |  the value for the second one

.Lm1_b_skip:
        addq.l  #4, %a1                 | value is zero: skip the second longword
        bra     .Lm1_next
371
372
|------------------------------------------------------------------------------
| Handler for term = -2
|
| Within each pair the SECOND longword is handled first, using the longword
| 8 bytes before the pair start (weight_B).  The first longword is then
| adjusted from the updated second one (weight_A).  bptr only advances at the
| end of the pair.  Weights are clamped to the range held in d7 .. d6.
|
|   a1 = bptr                       d0 = decorrelation value / new 2nd long
|   a2 = dpp                        d1 = longword as read from the buffer
|   a3 = dpp->delta << 17           d2 = new 1st longword
|   d3 = dpp->weight_A << 17        d4 = dpp->weight_B << 17
|   d5 = eptr                       d6 = 1024 << 17,  d7 = -1024 << 17
|   macsr = 0x20                    acc1 = 0x00 0000 80
|------------------------------------------------------------------------------

term_minus_2:
        move.l  -8(%a1), %d0            | d0 = longword 8 bytes before the pair
        beq     .Lm2_b_zero             | zero: second longword stays as it is
        move.l  %acc1, %acc0            | start acc0 from the rounding constant
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  4(%a1), %d1             | d1 = second longword of the pair
        beq     .Lm2_b_store            | zero sample: weight_B is not adapted
        eor.l   %d1, %d0                | N is set when the signs differ
        bge     .Lm2_b_up
        sub.l   %a3, %d4                | signs differ: weight_B -= delta
        cmp.l   %d7, %d4
        bge     .Lm2_b_store            | still at or above the lower limit
        move.l  %d7, %d4                | clamp to -1024 << 17
        bra     .Lm2_b_store

.Lm2_b_up:
        add.l   %a3, %d4                | signs agree: weight_B += delta
        cmp.l   %d6, %d4
        ble     .Lm2_b_store            | still at or below the upper limit
        move.l  %d6, %d4                | clamp to 1024 << 17

.Lm2_b_store:
        move.l  %acc0, %d0              | d0 = rounded product
        add.l   %d1, %d0
        move.l  %d0, 4(%a1)             | write back second longword
        beq     .Lm2_next               | new value is zero: skip the 1st long

.Lm2_a:
        move.l  %acc1, %acc0
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = first longword of the pair
        beq     .Lm2_a_store            | zero sample: weight_A is not adapted
        eor.l   %d1, %d0
        bge     .Lm2_a_up
        sub.l   %a3, %d3                | signs differ: weight_A -= delta
        cmp.l   %d7, %d3
        bge     .Lm2_a_store            | still at or above the lower limit
        move.l  %d7, %d3                | clamp to -1024 << 17
        bra     .Lm2_a_store

.Lm2_a_up:
        add.l   %a3, %d3                | signs agree: weight_A += delta
        cmp.l   %d6, %d3
        ble     .Lm2_a_store            | still at or below the upper limit
        move.l  %d6, %d3                | clamp to 1024 << 17

.Lm2_a_store:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2
        move.l  %d2, (%a1)              | write back first longword

.Lm2_next:
        addq.l  #8, %a1                 | step over the whole pair
        cmp.l   %a1, %d5                | keep going while bptr < eptr
        jbhi    term_minus_2
        move.l  -8(%a1), 40(%a2)        | dpp->samples_B [0] = first longword
                                        |  of the last pair (8 bytes below eptr)
        jbra    finish_up

.Lm2_b_zero:
        move.l  4(%a1), %d0             | second longword is used unmodified as
        beq     .Lm2_next               |  the value for the first one; if it
        bra     .Lm2_a                  |  is zero the whole pair is skipped
444
445
|------------------------------------------------------------------------------
| Handler for term = -3
|
| Both longwords of a pair are adjusted from the previous pair, crosswise:
| the first longword uses the previous pair's second longword, and the
| second longword uses the previous pair's first longword.  Weights are
| clamped to the range held in d7 .. d6.
|
|   a1 = bptr                       d0 = decorrelation value, then scratch
|   a2 = dpp                        d1 = longword as read from the buffer
|   a3 = dpp->delta << 17           d2 = longword as written back
|   d3 = dpp->weight_A << 17        d4 = dpp->weight_B << 17
|   d5 = eptr                       d6 = 1024 << 17,  d7 = -1024 << 17
|   macsr = 0x20                    acc1 = 0x00 0000 80
|------------------------------------------------------------------------------

term_minus_3:
        move.l  -4(%a1), %d0            | d0 = longword just before the pair
        beq     .Lm3_a_zero             | zero: first longword stays as it is
        move.l  %acc1, %acc0            | start acc0 from the rounding constant
        asl.l   #4, %d0
        mac.l   %d0, %d3, %acc0         | acc0 += (d0 << 4) * weight_A
        move.l  (%a1), %d1              | d1 = first longword of the pair
        beq     .Lm3_a_store            | zero sample: weight_A is not adapted
        eor.l   %d1, %d0                | N is set when the signs differ
        bge     .Lm3_a_up
        sub.l   %a3, %d3                | signs differ: weight_A -= delta
        cmp.l   %d7, %d3
        bge     .Lm3_a_store            | still at or above the lower limit
        move.l  %d7, %d3                | clamp to -1024 << 17
        bra     .Lm3_a_store

.Lm3_a_up:
        add.l   %a3, %d3                | signs agree: weight_A += delta
        cmp.l   %d6, %d3
        ble     .Lm3_a_store            | still at or below the upper limit
        move.l  %d6, %d3                | clamp to 1024 << 17

.Lm3_a_store:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2
        move.l  %d2, (%a1)+             | write back first longword, advance

.Lm3_b:
        move.l  -12(%a1), %d0           | a1 already moved on by 4, so this is
                                        |  the longword 8 bytes before the pair
        beq     .Lm3_b_zero             | zero: second longword stays as it is
        move.l  %acc1, %acc0
        asl.l   #4, %d0
        mac.l   %d0, %d4, %acc0         | acc0 += (d0 << 4) * weight_B
        move.l  (%a1), %d1              | d1 = second longword of the pair
        beq     .Lm3_b_store            | zero sample: weight_B is not adapted
        eor.l   %d1, %d0
        bge     .Lm3_b_up
        sub.l   %a3, %d4                | signs differ: weight_B -= delta
        cmp.l   %d7, %d4
        bge     .Lm3_b_store            | still at or above the lower limit
        move.l  %d7, %d4                | clamp to -1024 << 17
        bra     .Lm3_b_store

.Lm3_b_up:
        add.l   %a3, %d4                | signs agree: weight_B += delta
        cmp.l   %d6, %d4
        ble     .Lm3_b_store            | still at or below the upper limit
        move.l  %d6, %d4                | clamp to 1024 << 17

.Lm3_b_store:
        move.l  %acc0, %d2              | d2 = rounded product
        add.l   %d1, %d2
        move.l  %d2, (%a1)+             | write back second longword, advance

.Lm3_next:
        cmp.l   %a1, %d5                | keep going while bptr < eptr
        jbhi    term_minus_3
        move.l  -4(%a1), 8(%a2)         | dpp->samples_A [0] = last longword
        move.l  -8(%a1), 40(%a2)        | dpp->samples_B [0] = one before that
        jbra    finish_up

.Lm3_a_zero:
        addq.l  #4, %a1                 | skip the first longword of the pair
        bra     .Lm3_b

.Lm3_b_zero:
        addq.l  #4, %a1                 | skip the second longword of the pair
        bra     .Lm3_next
520
| Common exit.  finish_up is reached from every term handler with the adapted
| weights still scaled by 17 bits in d3 / d4; return_only is also the target
| of the early-out taken when sample_count is zero.

finish_up:
        moveq.l #17, %d0                | undo the 17-bit scaling
        asr.l   %d0, %d3
        move.w  %d3, 4(%a2)             | dpp->weight_A = low word of d3
        asr.l   %d0, %d4
        move.w  %d4, 6(%a2)             | dpp->weight_B = low word of d4

        moveq.l #0, %d0                 | leave both accumulators used here
        move.l  %d0, %acc0              |  cleared on the way out
        move.l  %d0, %acc1

return_only:
        movem.l (%sp), %d2-%d7/%a2-%a6  | restore the 11 saved registers
        lea     (44,%sp), %sp           | release the save area
        rts