blob: d46843ff123f840bed469b7f2d42807ae87ce4d5 [file] [log] [blame]
/***************************************************************************
* __________ __ ___.
* Open \______ \ ____ ____ | | _\_ |__ _______ ___
* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
* \/ \/ \/ \/ \/
* $Id$
*
* JPEG assembly IDCT
*
* Copyright (C) 2009 Andrew Mahone
*
* asm versions of the C IDCT algorithms used with jpeg_load.c
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
* KIND, either express or implied.
*
****************************************************************************/
#include "config.h"
/* Everything below is emitted into the code section; on ARM targets
   ".align 2" requests 2^2 = 4-byte alignment. */
.section .text
.align 2
/* Export each IDCT entry point and tag its symbol as a function. */
.global jpeg_idct1h
.type jpeg_idct1h, %function
.global jpeg_idct2v
.type jpeg_idct2v, %function
.global jpeg_idct2h
.type jpeg_idct2h, %function
.global jpeg_idct4v
.type jpeg_idct4v, %function
.global jpeg_idct4h
.type jpeg_idct4h, %function
jpeg_idct1h:
/* One-point horizontal pass: emits one clamped byte per workspace row.
   In:  r0 = workspace row pointer (one signed 16-bit value is read from the
             start of each row; rows are 16 bytes apart)
        r1 = output byte pointer
        r2 = workspace end pointer (loop runs while r0 < r2, unsigned)
        r3 = byte step added to r1 after each row
   Clobbers r0, r1, ip (r12) and the flags; no stack is used.
   4112 = 128 * 32 + 16, so adding it ahead of the >> 5 gives a +128 offset
   with rounding.  It is added as two immediates because, in the common case
   of a single pass through the loop, the extra add should be cheaper than
   saving registers to the stack and loading the value 4112. */
.Lidct1h_row:
    ldrsh   ip, [r0]                /* ip = first coefficient of the row */
    add     ip, ip, #16
    add     ip, ip, #4096           /* ip += 4112 in total */
#if ARM_ARCH < 6
    mov     ip, ip, asr #5
    /* Unsigned compare: every value outside 0..255, negatives included, is
       "hi".  ~(ip >> 31) then has a low byte of 0x00 for negative values and
       0xff for positive ones, which is all that strb stores. */
    cmp     ip, #255
    mvnhi   ip, ip, asr #31
#else
    usat    ip, #8, ip, asr #5      /* shift and saturate to 0..255 at once */
#endif
    strb    ip, [r1]
    add     r1, r1, r3              /* next output position */
    add     r0, r0, #16             /* next workspace row */
    cmp     r0, r2
    blo     .Lidct1h_row
    bx      lr
    .size   jpeg_idct1h, .-jpeg_idct1h
jpeg_idct2v:
/* Two-point vertical pass, performed in place on the workspace.
   In:  r0 = workspace pointer
        r1 = end pointer (loop runs while r0 < r1, unsigned)
   Each pass loads one 32-bit word (two adjacent 16-bit columns) from row 0
   at [r0] and from row 1 at [r0, #16], stores the per-halfword sum back to
   row 0 and the per-halfword difference (row 0 - row 1) back to row 1, and
   then advances r0 by 4 bytes. */
#if ARM_ARCH < 6
/* No partitioned arithmetic before ARMv6, so SWAR tricks fake it with
   ordinary 32-bit add and sub.  This is slightly faster than loading two
   values per register and using shifts and strh, and needs fewer fixups than
   splitting the values, calculating, and merging.
   The only cross-talk between the halves is the carry/borrow out of bit 15:
     - for the sum, bit 15 of both operands is cleared so the low half cannot
       carry into the high half;
     - for the difference, bit 15 of the minuend is set and bit 15 of the
       subtrahend is cleared so the low half cannot borrow from the high half.
   The true bit 15 is then patched in with eor. */
    stmfd   sp!, { r4, lr }
.Lidct2v_swar:
    ldr     r2, [r0]                /* r2 = row 0 pair */
    ldr     r3, [r0, #16]           /* r3 = row 1 pair */
    eor     ip, r2, r3
    and     ip, ip, #0x8000         /* ip = bit 15 of (row0 ^ row1) */
    bic     r4, r2, #0x8000
    bic     r3, r3, #0x8000
    add     r4, r4, r3              /* carry-free sum of both halves */
    eor     r4, r4, ip              /* restore bit 15 of the low sum */
    orr     r2, r2, #0x8000
    sub     r2, r2, r3              /* borrow-free difference of both halves */
    eor     r2, r2, #0x8000         /* undo the forced bit 15 ... */
    eor     r2, r2, ip              /* ... and fold in the operands' bit 15 */
    str     r4, [r0]
    str     r2, [r0, #16]
    add     r0, r0, #4
    cmp     r0, r1
    blo     .Lidct2v_swar
    ldmfd   sp!, { r4, pc }
#else
/* ARMv6 offers partitioned adds and subtracts, used here to handle two
   columns per pass without any fixups or stack use. */
.Lidct2v_simd:
    ldr     r2, [r0]                /* r2 = row 0 pair */
    ldr     r3, [r0, #16]           /* r3 = row 1 pair */
    sadd16  ip, r2, r3              /* ip = row0 + row1, per halfword */
    ssub16  r2, r2, r3              /* r2 = row0 - row1, per halfword */
    str     ip, [r0]
    str     r2, [r0, #16]
    add     r0, r0, #4
    cmp     r0, r1
    blo     .Lidct2v_simd
    bx      lr
#endif
    .size   jpeg_idct2v, .-jpeg_idct2v
jpeg_idct2h:
/* Two-point horizontal pass: turns each workspace row into two clamped
   output bytes.
   In:  r0 = workspace row pointer (signed 16-bit inputs d0, d1 at byte
             offsets 0 and 2; rows are 16 bytes apart)
        r1 = output pointer; the second byte goes to offset 4 when
             HAVE_LCD_COLOR is defined and to offset 1 otherwise
        r2 = workspace end pointer (loop runs while r0 < r2, unsigned)
        r3 = byte step added to r1 after each row
   Per row: out0 = clamp((d0 + 4112 + d1) >> 5),
            out1 = clamp((d0 + 4112 - d1) >> 5), clamped to 0..255.
   4112 = 128 * 32 + 16, i.e. a +128 offset plus rounding for the >> 5. */
#if ARM_ARCH < 6
/* Using LDR and shifts here would cost two more ops, and is no faster as
   the results cannot be stored merged. */
    stmfd   sp!, { r4-r5, lr }
    ldr     lr, =4112
.Lidct2h_row:
    ldrsh   ip, [r0]                /* ip = d0 */
    ldrsh   r4, [r0, #2]            /* r4 = d1 */
    add     ip, ip, lr              /* ip = d0 + 4112 */
    add     r5, ip, r4              /* r5 = sum */
    sub     r4, ip, r4              /* r4 = difference */
    /* Clamp each result: the unsigned compare flags everything outside
       0..255, and ~(x >> 31) has a low byte of 0x00 for negative x and 0xff
       for positive x. */
    mov     r5, r5, asr #5
    cmp     r5, #255
    mvnhi   r5, r5, asr #31
    mov     r4, r4, asr #5
    cmp     r4, #255
    mvnhi   r4, r4, asr #31
    strb    r5, [r1]
#ifdef HAVE_LCD_COLOR
    strb    r4, [r1, #4]
#else
    strb    r4, [r1, #1]
#endif
    add     r0, r0, #16
    add     r1, r1, r3
    cmp     r0, r2
    blo     .Lidct2h_row
    ldmfd   sp!, { r4-r5, pc }
#else
    stmfd   sp!, { r4, lr }
    ldr     lr, =4112               /* top halfword is 0, so only d0 is biased */
.Lidct2h_row_v6:
    ldr     ip, [r0]                /* ip = d1:d0 (top:bottom halfword) */
    sadd16  ip, ip, lr              /* bottom = d0 + 4112, top = d1 */
    saddsubx ip, ip, ip             /* top = top + bottom, bottom = bottom - top */
    usat    r4, #8, ip, asr #21     /* r4 = clamp(sum >> 5): 16 to reach the top half + 5 */
    sxth    ip, ip                  /* ip = difference, sign-extended */
    usat    ip, #8, ip, asr #5      /* ip = clamp(difference >> 5) */
    strb    r4, [r1]
#ifdef HAVE_LCD_COLOR
    strb    ip, [r1, #4]
#else
    strb    ip, [r1, #1]
#endif
    add     r0, r0, #16
    add     r1, r1, r3
    cmp     r0, r2
    blo     .Lidct2h_row_v6
    ldmfd   sp!, { r4, pc }
#endif
    .size   jpeg_idct2h, .-jpeg_idct2h
jpeg_idct4v:
/* Four-point vertical pass, performed in place on the workspace.
   In:  r0 = workspace pointer
        r1 = end pointer (loop runs while r0 < r1, unsigned)
   For each 16-bit column the inputs d0..d3 are read from byte offsets 0, 16,
   32 and 48 and replaced by
       o0 = ((d0 + d2) << 2) + (tmp2 >> 11)    -> offset 0
       o1 = ((d0 - d2) << 2) + (tmp0 >> 11)    -> offset 16
       o2 = ((d0 - d2) << 2) - (tmp0 >> 11)    -> offset 32
       o3 = ((d0 + d2) << 2) - (tmp2 >> 11)    -> offset 48
   with  z1   = (d1 + d3) * 4433 + 1024   (1024 rounds the >> 11)
         tmp2 = z1 + d1 * 6270
         tmp0 = z1 - d3 * 15137
   ARM_ARCH selects one of three implementations of these formulas; the first
   two handle one column per pass, the ARMv6 one handles two. */
#if ARM_ARCH < 5
/* No 16-bit multiplies here: 4433 is built from shifts and adds, and mla is
   used for the other two constants. */
stmdb sp!, { r4-r7, lr }
ldr r14, =-15137
ldr r12, =6270
1:
ldrsh r4, [r0, #32]
ldrsh r2, [r0]
ldrsh r5, [r0, #48]
ldrsh r3, [r0, #16]
add r6, r2, r4 /* r6 = tmp10 >> 2 = d0 + d2 */
sub r2, r2, r4 /* r2 = tmp12 >> 2= d0 - d2 */
add r4, r3, r5 /* r4 = z1 = d1 + d3 */
/* 4433 = 143 * 31: r7 = 9 * z1, r4 = 144 * z1 - z1 = 143 * z1,
   then r4 = 32 * r4 - r4 */
add r7, r4, r4, lsl #3
rsb r4, r4, r7, lsl #4
rsb r4, r4, r4, lsl #5 /* z1 *= 4433 */
add r4, r4, #1024 /* rounding term for the asr #11 below */
mla r3, r12, r3, r4 /* r3 = tmp2 = z1 + z2 * 6270 */
mla r5, r14, r5, r4 /* r5 = tmp0 = z1 - z3 * 15137 */
mov r6, r6, lsl #2 /* r6 <<= 2 */
mov r2, r2, lsl #2 /* r2 <<= 2 */
add r7, r6, r3, asr #11 /* r7 = o0 */
sub r3, r6, r3, asr #11 /* r3 = o3 */
add r6, r2, r5, asr #11 /* r6 = o1 */
sub r2, r2, r5, asr #11 /* r2 = o2 */
strh r7, [r0]
strh r3, [r0, #48]
strh r6, [r0, #16]
strh r2, [r0, #32]
add r0, r0, #2 /* next column */
cmp r0, r1
bcc 1b
ldmia sp!, { r4-r7, pc }
#elif ARM_ARCH < 6
/* ARMv5: signed 16x16 multiply-accumulate.  r12 = 3302955134 = 0xC4DF187E
   packs both remaining constants: top halfword 0xC4DF = -15137, bottom
   halfword 0x187E = 6270. */
stmdb sp!, { r4-r8, lr }
ldr r8, =1024
ldr r14, =4433
ldr r12, =3302955134
1:
ldrsh r5, [r0, #48]
ldrsh r3, [r0, #16]
ldrsh r4, [r0, #32]
ldrsh r2, [r0]
add r6, r3, r5 /* r6 = z1 = d1 + d3 */
add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
smlabb r6, r14, r6, r8 /* z1 = z1 * 4433 + 1024 (rounding term) */
sub r2, r2, r4 /* r2 = tmp12 >> 2= d0 - d2 */
smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
mov r7, r7, lsl #2
mov r2, r2, lsl #2
add r4, r7, r3, asr #11 /* r4 = o0 */
sub r7, r7, r3, asr #11 /* r7 = o3 */
add r3, r2, r5, asr #11 /* r3 = o1 */
sub r2, r2, r5, asr #11 /* r2 = o2 */
strh r4, [r0]
strh r7, [r0, #48]
strh r3, [r0, #16]
strh r2, [r0, #32]
add r0, r0, #2 /* next column */
cmp r0, r1
bcc 1b
ldmia sp!, { r4-r8, pc }
#else
/* ARMv6: each 32-bit load holds two adjacent columns (column 0 in the bottom
   halfword, column 1 in the top), so r0 advances by 4 bytes per pass.
   r2 = rounding term, r3 = 4433, r12 = -15137:6270 packed as above. */
stmdb sp!, { r4-r10, lr }
ldr r2, =1024
ldr r3, =4433
ldr r12, =3302955134
1:
ldr r6, [r0, #32]
ldr r4, [r0]
ldr r7, [r0, #48]
ldr r5, [r0, #16]
/* this part is being done in parallel on two columns */
sadd16 r8, r4, r6 /* r8 = d0 + d2 */
ssub16 r4, r4, r6 /* r4 = d0 - d2 */
sadd16 r6, r5, r7 /* r6 = d1 + d3 */
/* there is no parallel shift operation, but we can fake it with bic
and lsl: clearing the top two bits of the bottom halfword first means the
32-bit lsl #2 below shifts zeros, not data, into the top halfword */
bic r8, r8, #0xc000
bic r4, r4, #0xc000
/* multiplication expands values beyond 16 bits, so this part needs to be
split. the values will be merged below so that the rest of the addition
can be done in parallel */
smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
mov r8, r8, lsl #2 /* complete the parallel shift started */
mov r4, r4, lsl #2 /* with the earlier bic instructions */
/* tmp2 are in r10, r5; tmp0 are in r14, r6 */
/* tmp10, tmp12 are in r8, r4 */
mov r10, r10, asr #11
mov r14, r14, asr #11
/* pkhbt keeps the bottom halfword of the first operand and takes the top
   halfword of (second operand << 5), i.e. bits 26..11 of the column 1
   value, which is the low 16 bits of that value >> 11 */
pkhbt r5, r10, r5, lsl #5 /* parallel tmp2 */
pkhbt r6, r14, r6, lsl #5 /* parallel tmp0 */
sadd16 r10, r8, r5 /* o0 */
ssub16 r5, r8, r5 /* o3 */
sadd16 r14, r4, r6 /* o1 */
ssub16 r6, r4, r6 /* o2 */
str r10, [r0]
str r5, [r0, #48]
str r14, [r0, #16]
str r6, [r0, #32]
add r0, r0, #4 /* next pair of columns */
cmp r0, r1
bcc 1b
ldmia sp!, { r4-r10, pc }
#endif
.size jpeg_idct4v, .-jpeg_idct4v
jpeg_idct4h:
/* Four-point horizontal pass: turns each workspace row into four clamped
   output bytes.
   In:  r0 = workspace row pointer (signed 16-bit inputs d0..d3 at byte
             offsets 0, 2, 4, 6; rows are 16 bytes apart)
        r1 = output pointer; the four bytes go to offsets 0, 4, 8, 12 when
             HAVE_LCD_COLOR is defined and to 0, 1, 2, 3 otherwise
        r2 = workspace end pointer (loop runs while r0 < r2, unsigned)
        r3 = byte step added to r1 after each row
   Per row:
       e    = d0 + 4112     (4112 = 128 * 32 + 16: a +128 offset plus
                             rounding once the final >> 18 has undone the
                             << 13, leaving a net >> 5)
       z1   = (d1 + d3) * 4433
       tmp2 = z1 + d1 * 6270
       tmp0 = z1 - d3 * 15137
       o0 = clamp((((e + d2) << 13) + tmp2) >> 18)
       o1 = clamp((((e - d2) << 13) + tmp0) >> 18)
       o2 = clamp((((e - d2) << 13) - tmp0) >> 18)
       o3 = clamp((((e + d2) << 13) - tmp2) >> 18)
   where clamp() limits to 0..255.  ARM_ARCH selects one of three
   implementations of these formulas. */
#if ARM_ARCH < 5
/* No 16-bit multiplies here: 4433 is built from shifts and adds, and mla is
   used for the other two constants. */
    stmdb  sp!, { r4-r10, lr }
    ldr    r10, =-15137
    ldr    r14, =4112
    ldr    r12, =6270
1:
    ldrsh  r4,  [r0]
    ldrsh  r6,  [r0, #4]
    ldrsh  r7,  [r0, #6]
    ldrsh  r5,  [r0, #2]
    add    r4,  r4,  r14           /* r4 = e = d0 + 4112 */
    add    r8,  r4,  r6            /* r8 = tmp10 >> 13 = e + d2 */
    sub    r4,  r4,  r6            /* r4 = tmp12 >> 13 = e - d2 */
    add    r6,  r5,  r7            /* r6 = z1 = d1 + d3 */
    /* 4433 = 143 * 31: r9 = 9 * z1, r6 = 144 * z1 - z1, then r6 = 31 * r6 */
    add    r9,  r6,  r6,  lsl #3
    rsb    r6,  r6,  r9,  lsl #4
    rsb    r6,  r6,  r6,  lsl #5   /* z1 *= 4433 */
    mla    r7,  r10, r7,  r6       /* r7 = tmp0 = z1 - d3 * 15137 */
    mla    r5,  r12, r5,  r6       /* r5 = tmp2 = z1 + d1 * 6270 */
    add    r9,  r5,  r8,  lsl #13  /* r9 = o0 */
    rsb    r5,  r5,  r8,  lsl #13  /* r5 = o3 */
    add    r8,  r7,  r4,  lsl #13  /* r8 = o1 */
    rsb    r4,  r7,  r4,  lsl #13  /* r4 = o2 */
    mov    r9,  r9,  asr #18
    mov    r8,  r8,  asr #18
    mov    r4,  r4,  asr #18
    mov    r5,  r5,  asr #18
    /* Clamp: the unsigned compare flags everything outside 0..255, and
       ~(x >> 31) has a low byte of 0x00 for negative x, 0xff otherwise. */
    cmp    r9,  #255
    mvnhi  r9,  r9,  asr #31
    cmp    r8,  #255
    mvnhi  r8,  r8,  asr #31
    cmp    r4,  #255
    mvnhi  r4,  r4,  asr #31
    cmp    r5,  #255
    mvnhi  r5,  r5,  asr #31
#ifdef HAVE_LCD_COLOR
    strb   r9,  [r1]
    strb   r8,  [r1, #4]
    strb   r4,  [r1, #8]
    strb   r5,  [r1, #12]
#else
    strb   r9,  [r1]
    strb   r8,  [r1, #1]
    strb   r4,  [r1, #2]
    strb   r5,  [r1, #3]
#endif
    add    r0,  r0,  #16
    add    r1,  r1,  r3
    cmp    r0,  r2
    bcc    1b
    ldmia  sp!, { r4-r10, pc }
#elif ARM_ARCH < 6
/* ARMv5: signed 16x16 multiplies.  r12 = 3302955134 = 0xC4DF187E packs two
   constants: top halfword 0xC4DF = -15137, bottom halfword 0x187E = 6270. */
    stmdb  sp!, { r4-r10, lr }
    ldr    r10, =4433
    ldr    r14, =4112
    ldr    r12, =3302955134
1:
    ldrsh  r7,  [r0, #6]
    ldrsh  r5,  [r0, #2]
    ldrsh  r4,  [r0]
    ldrsh  r6,  [r0, #4]
    add    r8,  r5,  r7            /* r8 = z1 = d1 + d3 */
    add    r4,  r4,  r14           /* r4 = e = d0 + 4112 */
    smulbb r8,  r10, r8            /* z1 *= 4433 */
    add    r9,  r4,  r6            /* r9 = tmp10 >> 13 = e + d2 */
    smlabb r5,  r12, r5,  r8       /* r5 = tmp2 = z1 + d1 * 6270 */
    smlatb r7,  r12, r7,  r8       /* r7 = tmp0 = z1 - d3 * 15137 */
    /* tmp12 must come from e in r4; r5 already holds tmp2 at this point */
    sub    r4,  r4,  r6            /* r4 = tmp12 >> 13 = e - d2 */
    add    r6,  r5,  r9,  lsl #13  /* r6 = o0 */
    rsb    r9,  r5,  r9,  lsl #13  /* r9 = o3 */
    add    r5,  r7,  r4,  lsl #13  /* r5 = o1 */
    rsb    r4,  r7,  r4,  lsl #13  /* r4 = o2 */
    mov    r6,  r6,  asr #18
    mov    r5,  r5,  asr #18
    mov    r4,  r4,  asr #18
    mov    r9,  r9,  asr #18
    /* Clamp to 0..255, same trick as the ARMv4 variant */
    cmp    r6,  #255
    mvnhi  r6,  r6,  asr #31
    cmp    r5,  #255
    mvnhi  r5,  r5,  asr #31
    cmp    r4,  #255
    mvnhi  r4,  r4,  asr #31
    cmp    r9,  #255
    mvnhi  r9,  r9,  asr #31
#ifdef HAVE_LCD_COLOR
    strb   r6,  [r1]
    strb   r5,  [r1, #4]
    strb   r4,  [r1, #8]
    strb   r9,  [r1, #12]
#else
    strb   r6,  [r1]
    strb   r5,  [r1, #1]
    strb   r4,  [r1, #2]
    strb   r9,  [r1, #3]
#endif
    add    r0,  r0,  #16
    add    r1,  r1,  r3
    cmp    r0,  r2
    bcc    1b
    ldmia  sp!, { r4-r10, pc }
#else
/* ARMv6: the row is loaded as two words, r4 = d1:d0 and r5 = d3:d2
   (top:bottom halfword), so the sums and differences are formed with
   partitioned arithmetic and usat does the shift and clamp in one step. */
    stmdb  sp!, { r4-r9, lr }
    ldr    r9,  =4433
    ldr    r14, =4112              /* top halfword is 0: only d0 is biased */
    ldr    r12, =3302955134        /* -15137:6270 packed, see above */
1:
    ldmia  r0,  { r4-r5 }
    sadd16 r4,  r4,  r14           /* r4lo = e = d0 + 4112, r4hi = d1 */
    sadd16 r6,  r4,  r5            /* r6lo = e + d2, r6hi = d1 + d3 */
    ssub16 r7,  r4,  r5            /* r7lo = e - d2 */
    smulbt r8,  r9,  r6            /* r8 = z1 = (d1 + d3) * 4433 */
    sxth   r6,  r6                 /* r6 = tmp10 >> 13, sign-extended */
    smlabt r4,  r12, r4,  r8       /* r4 = tmp2 = z1 + d1 * 6270 */
    smlatt r5,  r12, r5,  r8       /* r5 = tmp0 = z1 - d3 * 15137 */
    sxth   r7,  r7                 /* r7 = tmp12 >> 13, sign-extended */
    add    r8,  r4,  r6,  lsl #13  /* r8 = o0 */
    rsb    r6,  r4,  r6,  lsl #13  /* r6 = o3 */
    add    r4,  r5,  r7,  lsl #13  /* r4 = o1 */
    rsb    r5,  r5,  r7,  lsl #13  /* r5 = o2 */
    usat   r8,  #8,  r8,  asr #18
    usat   r6,  #8,  r6,  asr #18
    usat   r4,  #8,  r4,  asr #18
    usat   r5,  #8,  r5,  asr #18
#ifdef HAVE_LCD_COLOR
    strb   r8,  [r1]
    strb   r6,  [r1, #12]
    strb   r4,  [r1, #4]
    strb   r5,  [r1, #8]
#else
    strb   r8,  [r1]
    strb   r6,  [r1, #3]
    strb   r4,  [r1, #1]
    strb   r5,  [r1, #2]
#endif
    add    r0,  r0,  #16
    add    r1,  r1,  r3
    cmp    r0,  r2
    bcc    1b
    ldmia  sp!, { r4-r9, pc }
#endif
    .size jpeg_idct4h, .-jpeg_idct4h