| /*************************************************************************** |
| * __________ __ ___. |
| * Open \______ \ ____ ____ | | _\_ |__ _______ ___ |
| * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / |
| * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < |
| * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ |
| * \/ \/ \/ \/ \/ |
| * $Id$ |
| * |
| * Copyright (C) 2006 by Thom Johansen |
| * |
| * This program is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU General Public License |
| * as published by the Free Software Foundation; either version 2 |
| * of the License, or (at your option) any later version. |
| * |
| * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY |
| * KIND, either express or implied. |
| * |
| ****************************************************************************/ |
| |
| #include "config.h" |
| |
| /* The following is an assembler optimised version of the LPC filtering |
| routines needed for FLAC decoding. It is optimised for use with ARM |
| processors. |
| All LPC filtering up to order 9 is done in specially optimised unrolled |
| loops, while every order above this is handled by a slower default routine. |
| */ |
| #ifdef USE_IRAM |
| .section .icode,"ax",%progbits |
| #else |
| .text |
| #endif |
| .global lpc_decode_arm |
| lpc_decode_arm: |
| stmdb sp!, { r4-r11, lr } |
| ldr r4, [sp, #36] |
| /* r0 = blocksize, r1 = qlevel, r2 = pred_order |
| r3 = data, r4 = coeffs |
| */ |
| |
| /* the data pointer always lags behind history pointer by 'pred_order' |
| samples. since we have one loop for each order, we can hard code this |
| and free a register by not saving data pointer. |
| */ |
| sub r3, r3, r2, lsl #2 @ r3 = history |
| cmp r0, #0 @ no samples to process |
| beq .exit |
| cmp r2, #9 @ check if order is too high for unrolled loops |
| addls pc, pc, r2, lsl #2 @ jump to our unrolled decode loop if it exists |
| @ jumptable: |
| b .default @ order too high, go to default routine |
| b .exit @ zero order filter isn't possible, exit function |
| b .order1 |
| b .order2 |
| b .order3 |
| b .order4 |
| b .order5 |
| b .order6 |
| b .order7 |
| b .order8 |
| |
| @ last jump table entry coincides with target, so leave it out |
| .order9: |
| ldmia r4, { r5-r12, r14 } @ fetch coefs |
| .loop9: |
| ldr r4, [r3], #4 @ load first history sample |
| mul r2, r4, r14 @ multiply with last coef |
| ldr r4, [r3], #4 @ rinse and repeat while accumulating sum in r2 |
| mla r2, r4, r12, r2 |
| ldr r4, [r3], #4 |
| mla r2, r4, r11, r2 |
| ldr r4, [r3], #4 |
| mla r2, r4, r10, r2 |
| ldr r4, [r3], #4 |
| mla r2, r4, r9, r2 |
| ldr r4, [r3], #4 |
| mla r2, r4, r8, r2 |
| ldr r4, [r3], #4 |
| mla r2, r4, r7, r2 |
| ldr r4, [r3], #4 |
| mla r2, r4, r6, r2 |
| ldr r4, [r3], #4 |
| mla r2, r4, r5, r2 |
| ldr r4, [r3] @ r4 = residual |
| add r2, r4, r2, asr r1 @ shift sum by qlevel bits and add residual |
| str r2, [r3], #-8*4 @ save result and wrap history pointer back |
| subs r0, r0, #1 @ check if we're done |
| bne .loop9 @ nope, jump back |
| b .exit |
| |
| .order8: |
| ldmia r4, { r5-r12 } |
| .loop8: |
| @ we have more registers to spare here, so start block reading |
| ldmia r3!, { r4, r14 } |
| mul r2, r4, r12 |
| mla r2, r14, r11, r2 |
| ldmia r3!, { r4, r14 } |
| mla r2, r4, r10, r2 |
| mla r2, r14, r9, r2 |
| ldmia r3!, { r4, r14 } |
| mla r2, r4, r8, r2 |
| mla r2, r14, r7, r2 |
| ldmia r3!, { r4, r14 } |
| mla r2, r4, r6, r2 |
| mla r2, r14, r5, r2 |
| ldr r4, [r3] |
| add r2, r4, r2, asr r1 |
| str r2, [r3], #-7*4 |
| subs r0, r0, #1 |
| bne .loop8 |
| b .exit |
| |
| .order7: |
| ldmia r4, { r5-r11 } |
| .loop7: |
| ldmia r3!, { r4, r12, r14 } |
| mul r2, r4, r11 |
| mla r2, r12, r10, r2 |
| mla r2, r14, r9, r2 |
| ldmia r3!, { r4, r12, r14 } |
| mla r2, r4, r8, r2 |
| mla r2, r12, r7, r2 |
| mla r2, r14, r6, r2 |
| ldr r4, [r3], #4 |
| mla r2, r4, r5, r2 |
| ldr r4, [r3] |
| add r2, r4, r2, asr r1 |
| str r2, [r3], #-6*4 |
| subs r0, r0, #1 |
| bne .loop7 |
| b .exit |
| |
| .order6: |
| ldmia r4, { r5-r10 } |
| .loop6: |
| ldmia r3!, { r4, r11-r12, r14 } |
| mul r2, r4, r10 |
| mla r2, r11, r9, r2 |
| mla r2, r12, r8, r2 |
| mla r2, r14, r7, r2 |
| ldmia r3!, { r4, r11 } |
| mla r2, r4, r6, r2 |
| mla r2, r11, r5, r2 |
| ldr r4, [r3] |
| add r2, r4, r2, asr r1 |
| str r2, [r3], #-5*4 |
| subs r0, r0, #1 |
| bne .loop6 |
| b .exit |
| |
| .order5: |
| ldmia r4, { r5-r9 } |
| .loop5: |
| ldmia r3!, { r4, r10-r12, r14 } |
| mul r2, r4, r9 |
| mla r2, r10, r8, r2 |
| mla r2, r11, r7, r2 |
| mla r2, r12, r6, r2 |
| mla r2, r14, r5, r2 |
| ldr r4, [r3] |
| add r2, r4, r2, asr r1 |
| str r2, [r3], #-4*4 |
| subs r0, r0, #1 |
| bne .loop5 |
| b .exit |
| |
| .order4: |
| ldmia r4, { r5-r8 } |
| .loop4: |
| ldmia r3!, { r4, r11-r12, r14 } |
| mul r2, r4, r8 |
| mla r2, r11, r7, r2 |
| mla r2, r12, r6, r2 |
| mla r2, r14, r5, r2 |
| ldr r4, [r3] |
| add r2, r4, r2, asr r1 |
| str r2, [r3], #-3*4 |
| subs r0, r0, #1 |
| bne .loop4 |
| b .exit |
| |
| .order3: |
| ldmia r4, { r5-r7 } |
| .loop3: |
| ldmia r3!, { r4, r12, r14 } |
| mul r2, r4, r7 |
| mla r2, r12, r6, r2 |
| mla r2, r14, r5, r2 |
| ldr r4, [r3] |
| add r2, r4, r2, asr r1 |
| str r2, [r3], #-2*4 |
| subs r0, r0, #1 |
| bne .loop3 |
| b .exit |
| |
| .order2: |
| ldmia r4, { r5-r6 } |
| .loop2: |
| ldmia r3!, { r4, r14 } |
| mul r2, r4, r6 |
| mla r2, r14, r5, r2 |
| ldr r4, [r3] |
| add r2, r4, r2, asr r1 |
| str r2, [r3], #-1*4 |
| subs r0, r0, #1 |
| bne .loop2 |
| b .exit |
| |
| .order1: |
| ldr r5, [r4] @ load the one coef we need |
| ldr r4, [r3], #4 @ load one history sample, r3 now points to residual |
| .loop1: |
| mul r2, r4, r5 @ multiply coef by history sample |
| ldr r4, [r3] @ load residual |
| add r4, r4, r2, asr r1 @ add result to residual |
| str r4, [r3], #4 @ place r3 at next residual, we already have |
| subs r0, r0, #1 @ the current sample in r4 for the next iteration |
| bne .loop1 |
| b .exit |
| |
| .default: |
| /* we do the filtering in an unrolled by 4 loop as far as we can, and then |
| do the rest by jump table. */ |
| add r5, r4, r2, lsl #2 @ need to start in the other end of coefs |
| mov r7, r2, lsr #2 @ r7 = coefs/4 |
| mov r14, #0 @ init accumulator |
| .dloop1: |
| ldmdb r5!, { r8-r11 } |
| ldmia r3!, { r6, r12 } |
| mla r14, r6, r11, r14 |
| mla r14, r12, r10, r14 |
| ldmia r3!, { r6, r12 } |
| mla r14, r6, r9, r14 |
| mla r14, r12, r8, r14 |
| subs r7, r7, #1 |
| bne .dloop1 |
| |
| and r7, r2, #3 @ get remaining samples to be filtered |
| add pc, pc, r7, lsl #2 @ jump into accumulator chain |
| @ jumptable: |
| b .dsave @ padding |
| b .dsave |
| b .oneleft |
| b .twoleft |
| @ implicit .threeleft |
| ldr r12, [r5, #-4]! |
| ldr r8, [r3], #4 |
| mla r14, r12, r8, r14 |
| .twoleft: |
| ldr r12, [r5, #-4]! |
| ldr r8, [r3], #4 |
| mla r14, r12, r8, r14 |
| .oneleft: |
| ldr r12, [r5, #-4]! |
| ldr r8, [r3], #4 |
| mla r14, r12, r8, r14 |
| |
| .dsave: |
| ldr r12, [r3] @ load residual |
| add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual |
| str r14, [r3], #4 @ store result |
| sub r3, r3, r2, lsl #2 @ and wrap history pointer back to next first pos |
| subs r0, r0, #1 @ are we done? |
| bne .default @ no, prepare for next sample |
| |
| .exit: |
| ldmia sp!, { r4-r11, pc } |
| |