Add 2-point and 1-point JPEG IDCT routines in ARM assembly, remove the comment in jpeg_load.c about inline asm, and change the loop conditions to be a bit safer in case bad values are passed.
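
The loop-condition change swaps an exact-equality exit test (teq/bne) for an
unsigned compare (cmp/bcc), so a pointer that somehow steps past its end
address still terminates the loop. A rough C-level picture of the change (a
sketch, not the generated code):

    /* before: spins forever if ws ever overshoots end without equaling it */
    do { /* ... */ } while (ws != end);

    /* after: exits as soon as ws reaches or passes end */
    do { /* ... */ } while (ws < end);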
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21349 a1c6a512-1295-4272-9138-f99709370657
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
index 2ef868e..d46843f 100644
--- a/apps/recorder/jpeg_idct_arm.S
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -25,11 +25,140 @@
.section .text
.align 2
+ .global jpeg_idct1h
+ .type jpeg_idct1h, %function
+ .global jpeg_idct2v
+ .type jpeg_idct2v, %function
+ .global jpeg_idct2h
+ .type jpeg_idct2h, %function
.global jpeg_idct4v
.type jpeg_idct4v, %function
.global jpeg_idct4h
.type jpeg_idct4h, %function
+jpeg_idct1h:
+/* In the common case of one pass through the loop, the extra add should be
+ cheaper than saving registers to the stack and loading the value 4112. */
+1:
+ ldrsh r12, [r0]
+ add r12, r12, #4096
+ add r12, r12, #16
+#if ARM_ARCH < 6
+ mov r12, r12, asr #5
+ cmp r12, #255
+ mvnhi r12, r12, asr #31
+#else
+ usat r12, #8, r12, asr #5
+#endif
+ strb r12, [r1]
+ add r0, r0, #16
+ add r1, r1, r3
+ cmp r0, r2
+ bcc 1b
+ bx lr
+ .size jpeg_idct1h, .-jpeg_idct1h
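+
+/* A minimal C sketch of what the loop above computes per output row, assuming
+   the same layout as the C fallback in jpeg_load.c (4112 = (128 << 5) + 16,
+   i.e. the +128 level shift plus rounding applied before the >> 5 descale):
+
+       for (; ws < end; ws += 8, out += rowstep)
+       {
+           int v = (ws[0] + 4112) >> 5;
+           *out = (v < 0) ? 0 : (v > 255) ? 255 : v;  saturate to [0, 255]
+       }
+*/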
+
+jpeg_idct2v:
+#if ARM_ARCH < 6
+/* Use SWAR (SIMD within a register) tricks to fake a partitioned add and
+ subtract. This is slightly faster than loading two values in each register
+ and using shifts and strh, and requires fewer fixup operations than
+ splitting the values, calculating, and merging.
+*/
+ stmdb sp!, { r4, lr }
+1:
+ ldr r2, [r0]
+ ldr r3, [r0, #16]
+ eor r12, r2, r3
+ and r12, r12, #0x8000
+ bic r3, r3, #0x8000
+ bic r4, r2, #0x8000
+ add r4, r4, r3
+ eor r4, r4, r12
+ orr r2, r2, #0x8000
+ sub r2, r2, r3
+ eor r2, r2, r12
+ eor r2, r2, #0x8000
+ str r4, [r0]
+ str r2, [r0, #16]
+ add r0, r0, #4
+ cmp r0, r1
+ bcc 1b
+ ldmia sp!, { r4, pc }
+#else
+/* ARMv6 offers partitioned adds and subtracts, used here to process two
+ columns per iteration.
+*/
+1:
+ ldr r2, [r0]
+ ldr r3, [r0, #16]
+ sadd16 r12, r2, r3
+ ssub16 r2, r2, r3
+ str r12, [r0]
+ str r2, [r0, #16]
+ add r0, r0, #4
+ cmp r0, r1
+ bcc 1b
+ bx lr
+#endif
+ .size jpeg_idct2v, .-jpeg_idct2v
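+
+/* Rough C model of the SWAR halfword trick used in the pre-ARMv6 path above
+   (illustrative only, with hypothetical helper names):
+
+       packed add: clear bit 15 of each operand so the low halfword cannot
+       carry into the high one, add, then restore bit 15 from x ^ y
+
+       uint32_t swar_add16(uint32_t x, uint32_t y)
+       {
+           uint32_t sign = (x ^ y) & 0x8000;
+           return ((x & ~0x8000u) + (y & ~0x8000u)) ^ sign;
+       }
+
+       packed subtract: force bit 15 of x so the low halfword cannot borrow
+       from the high one, subtract, then fix bit 15 back up
+
+       uint32_t swar_sub16(uint32_t x, uint32_t y)
+       {
+           uint32_t sign = (x ^ y) & 0x8000;
+           return (((x | 0x8000u) - (y & ~0x8000u)) ^ sign) ^ 0x8000u;
+       }
+*/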
+
+jpeg_idct2h:
+#if ARM_ARCH < 6
+/* Using LDR and shifts here would cost two more ops, and is no faster since
+ the results cannot be stored merged.
+*/
+ stmdb sp!, { r4-r5, lr }
+ ldr r14, =4112
+1:
+ ldrsh r12, [r0]
+ ldrsh r4, [r0, #2]
+ add r12, r12, r14
+ add r5, r12, r4
+ sub r4, r12, r4
+ mov r5, r5, asr #5
+ mov r4, r4, asr #5
+ cmp r5, #255
+ mvnhi r5, r5, asr #31
+ cmp r4, #255
+ mvnhi r4, r4, asr #31
+#ifdef HAVE_LCD_COLOR
+ strb r5, [r1]
+ strb r4, [r1, #4]
+#else
+ strb r5, [r1]
+ strb r4, [r1, #1]
+#endif
+ add r0, r0, #16
+ add r1, r1, r3
+ cmp r0, r2
+ bcc 1b
+ ldmia sp!, { r4-r5, pc }
+#else
+ stmdb sp!, { r4, lr }
+ ldr r14, =4112
+1:
+ ldr r12, [r0]
+ sadd16 r12, r12, r14
+ saddsubx r12, r12, r12
+ usat r4, #8, r12, asr #21
+ sxth r12, r12
+ usat r12, #8, r12, asr #5
+#ifdef HAVE_LCD_COLOR
+ strb r4, [r1]
+ strb r12, [r1, #4]
+#else
+ strb r4, [r1]
+ strb r12, [r1, #1]
+#endif
+ add r0, r0, #16
+ add r1, r1, r3
+ cmp r0, r2
+ bcc 1b
+ ldmia sp!, { r4, pc }
+#endif
+ .size jpeg_idct2h, .-jpeg_idct2h
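+
+/* Hedged C outline of the per-row work in both variants above; clamp8() is a
+   hypothetical helper that saturates to [0, 255], and HAVE_LCD_COLOR only
+   changes the distance between the two output samples:
+
+       int tmp = ws[0] + 4112;               DC plus level shift and rounding
+       out[0] = clamp8((tmp + ws[1]) >> 5);
+       out[1] = clamp8((tmp - ws[1]) >> 5);  stored at offset 4 for color LCDs
+       ws += 8; out += rowstep;              advance to the next row
+*/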
+
jpeg_idct4v:
#if ARM_ARCH < 5
stmdb sp!, { r4-r7, lr }
@@ -60,8 +189,8 @@
strh r6, [r0, #16]
strh r2, [r0, #32]
add r0, r0, #2
- teq r0, r1
- bne 1b
+ cmp r0, r1
+ bcc 1b
ldmia sp!, { r4-r7, pc }
#elif ARM_ARCH < 6
stmdb sp!, { r4-r8, lr }
@@ -90,8 +219,8 @@
strh r3, [r0, #16]
strh r2, [r0, #32]
add r0, r0, #2
- teq r0, r1
- bne 1b
+ cmp r0, r1
+ bcc 1b
ldmia sp!, { r4-r8, pc }
#else
stmdb sp!, { r4-r10, lr }
@@ -192,8 +321,8 @@
#endif
add r0, r0, #16
add r1, r1, r3
- teq r0, r2
- bne 1b
+ cmp r0, r2
+ bcc 1b
ldmia sp!, { r4-r10, pc }
#elif ARM_ARCH < 6
stmdb sp!, { r4-r10, lr }
@@ -241,8 +370,8 @@
#endif
add r0, r0, #16
add r1, r1, r3
- teq r0, r2
- bne 1b
+ cmp r0, r2
+ bcc 1b
ldmia sp!, { r4-r10, pc }
#else
stmdb sp!, { r4-r9, lr }
@@ -280,8 +409,8 @@
#endif
add r0, r0, #16
add r1, r1, r3
- teq r0, r2
- bne 1b
+ cmp r0, r2
+ bcc 1b
ldmia sp!, { r4-r9, pc }
#endif
.size jpeg_idct4h, .-jpeg_idct4h
diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c
index f2b3b4b..fa2df5b 100644
--- a/apps/recorder/jpeg_load.c
+++ b/apps/recorder/jpeg_load.c
@@ -270,11 +270,7 @@
#define BUFAC 227
#define COMPONENT_SHIFT 15
-/* Some of the below have inline ASM optimizations of the loop contents. To
- make comparison with the C versions easier, the C variable names are used
- in comments whenever intermediate values are labeled.
-*/
-
+#ifndef CPU_ARM
/* horizontal-pass 1-point IDCT */
static void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
@@ -312,7 +308,6 @@
}
}
-#ifndef CPU_ARM
/* vertical-pass 4-point IDCT */
static void jpeg_idct4v(int16_t *ws, int16_t *end)
{
@@ -388,6 +383,9 @@
}
}
#else
+extern void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
+extern void jpeg_idct2v(int16_t *ws, int16_t *end);
+extern void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
extern void jpeg_idct4v(int16_t *ws, int16_t *end);
extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
#endif