Core JPEG decoder improvements:
For the larger (8- and 16-point) vertical IDCTs, transpose the coefficients while decoding them so that the vertical IDCT can read in rows rather than columns. This improves speed a bit for these sizes, even with the C IDCT.
Remove inline ARM asm, replacing it with an external file containing pure asm IDCT functions.
Add jpeg_ prefix to JPEG IDCT functions since some of them will now be visible globally.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21345 a1c6a512-1295-4272-9138-f99709370657
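
A rough sketch of why the transposed storage helps: the decoder's workspace holds the block as rows of int16_t, so the vertical pass normally reads each input column with a stride of 8 halfwords while the horizontal pass reads sequentially. Storing the coefficients already transposed while decoding (via the alternate zag[] ordering added in jpeg_load.c) lets the vertical pass read with stride 1 and write into a second workspace laid out for the horizontal pass, which is what the V_IN_ST/V_OUT macros below select. The following minimal C illustration uses made-up names and a placeholder operation in place of the real butterflies; it only contrasts the two access patterns and is not the decoder's code:

    #include <stdint.h>
    #include <stdio.h>

    #define N 8

    /* vertical pass over a natural-order block: each read/write strides by N */
    static void column_pass_natural(int16_t ws[N * N])
    {
        for (int col = 0; col < N; col++)
            for (int row = 0; row < N; row++)
                ws[row * N + col] += 1;        /* placeholder for the butterfly */
    }

    /* vertical pass when the block was stored transposed while decoding:
     * reads are now sequential (stride 1), results go into a second workspace
     * in natural order so the horizontal pass can still walk rows */
    static void column_pass_transposed(const int16_t ws[N * N], int16_t ws2[N * N])
    {
        for (int col = 0; col < N; col++)
            for (int row = 0; row < N; row++)
                ws2[row * N + col] = ws[col * N + row] + 1;
    }

    int main(void)
    {
        int16_t a[N * N] = {0}, b[N * N] = {0}, out[N * N] = {0};
        column_pass_natural(a);
        column_pass_transposed(b, out);
        /* both variants compute the same values; only the memory order differs */
        printf("%d %d\n", a[7 * N + 3], out[7 * N + 3]);
        return 0;
    }
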
diff --git a/apps/SOURCES b/apps/SOURCES
index 527b0b2..4caf32d 100644
--- a/apps/SOURCES
+++ b/apps/SOURCES
@@ -104,6 +104,9 @@
#endif
#ifdef HAVE_JPEG
recorder/jpeg_load.c
+#ifdef CPU_ARM
+recorder/jpeg_idct_arm.S
+#endif
#endif
#ifdef HAVE_ALBUMART
recorder/albumart.c
diff --git a/apps/plugins/lib/SOURCES b/apps/plugins/lib/SOURCES
index 7211109..2ed38c4 100644
--- a/apps/plugins/lib/SOURCES
+++ b/apps/plugins/lib/SOURCES
@@ -27,6 +27,9 @@
profile_plugin.c
#endif
#ifdef HAVE_LCD_BITMAP
+#ifdef CPU_ARM
+pluginlib_jpeg_idct_arm.S
+#endif
pluginlib_jpeg_mem.c
pluginlib_resize.c
#ifndef HAVE_JPEG
diff --git a/apps/plugins/lib/pluginlib_jpeg_idct_arm.S b/apps/plugins/lib/pluginlib_jpeg_idct_arm.S
new file mode 100644
index 0000000..5e6149d
--- /dev/null
+++ b/apps/plugins/lib/pluginlib_jpeg_idct_arm.S
@@ -0,0 +1,24 @@
+/***************************************************************************
+* __________ __ ___.
+* Open \______ \ ____ ____ | | _\_ |__ _______ ___
+* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+* \/ \/ \/ \/ \/
+* $Id$
+*
+* Copyright (C) 2009 by Andrew Mahone
+*
+* This is a wrapper for the core jpeg_idct_arm.S
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License
+* as published by the Free Software Foundation; either version 2
+* of the License, or (at your option) any later version.
+*
+* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+* KIND, either express or implied.
+*
+****************************************************************************/
+
+#include "recorder/jpeg_idct_arm.S"
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
new file mode 100644
index 0000000..2ef868e
--- /dev/null
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -0,0 +1,287 @@
+/***************************************************************************
+* __________ __ ___.
+* Open \______ \ ____ ____ | | _\_ |__ _______ ___
+* Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
+* Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
+* Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
+* \/ \/ \/ \/ \/
+* $Id$
+*
+* JPEG assembly IDCT
+*
+* Copyright (C) 2009 by Andrew Mahone
+* asm versions of the C IDCT algorithms used in jpeg_load.c
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License
+* as published by the Free Software Foundation; either version 2
+* of the License, or (at your option) any later version.
+*
+* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+* KIND, either express or implied.
+*
+****************************************************************************/
+#include "config.h"
+
+ .section .text
+ .align 2
+ .global jpeg_idct4v
+ .type jpeg_idct4v, %function
+ .global jpeg_idct4h
+ .type jpeg_idct4h, %function
+
+jpeg_idct4v:
+#if ARM_ARCH < 5
+ stmdb sp!, { r4-r7, lr }
+ ldr r14, =-15137
+ ldr r12, =6270
+1:
+ ldrsh r4, [r0, #32]
+ ldrsh r2, [r0]
+ ldrsh r5, [r0, #48]
+ ldrsh r3, [r0, #16]
+ add r6, r2, r4 /* r6 = tmp10 >> 2 = d0 + d2 */
+ sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
+ add r4, r3, r5 /* r4 = z1 = d1 + d3 */
+ add r7, r4, r4, lsl #3
+ rsb r4, r4, r7, lsl #4
+ rsb r4, r4, r4, lsl #5 /* z1 *= 4433 */
+ add r4, r4, #1024
+ mla r3, r12, r3, r4 /* r3 = tmp2 = z1 + z2 * 6270 */
+ mla r5, r14, r5, r4 /* r5 = tmp0 = z1 - z3 * 15137 */
+ mov r6, r6, lsl #2 /* r6 <<= 2 */
+ mov r2, r2, lsl #2 /* r2 <<= 2 */
+ add r7, r6, r3, asr #11 /* r7 = o0 */
+ sub r3, r6, r3, asr #11 /* r3 = o3 */
+ add r6, r2, r5, asr #11 /* r6 = o1 */
+ sub r2, r2, r5, asr #11 /* r2 = o2 */
+ strh r7, [r0]
+ strh r3, [r0, #48]
+ strh r6, [r0, #16]
+ strh r2, [r0, #32]
+ add r0, r0, #2
+ teq r0, r1
+ bne 1b
+ ldmia sp!, { r4-r7, pc }
+#elif ARM_ARCH < 6
+ stmdb sp!, { r4-r8, lr }
+ ldr r8, =1024
+ ldr r14, =4433
+ ldr r12, =3302955134
+1:
+ ldrsh r5, [r0, #48]
+ ldrsh r3, [r0, #16]
+ ldrsh r4, [r0, #32]
+ ldrsh r2, [r0]
+ add r6, r3, r5 /* r6 = z1 = d1 + d3 */
+ add r7, r2, r4 /* r7 = tmp10 >> 2 = d0 + d2 */
+ smlabb r6, r14, r6, r8 /* z1 *= 4433 */
+ sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
+ smlabb r3, r12, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
+ smlatb r5, r12, r5, r6 /* r5 = tmp0 = z1 - z3 * 15137 */
+ mov r7, r7, lsl #2
+ mov r2, r2, lsl #2
+ add r4, r7, r3, asr #11 /* r4 = o0 */
+ sub r7, r7, r3, asr #11 /* r7 = o3 */
+ add r3, r2, r5, asr #11 /* r3 = o1 */
+ sub r2, r2, r5, asr #11 /* r2 = o2 */
+ strh r4, [r0]
+ strh r7, [r0, #48]
+ strh r3, [r0, #16]
+ strh r2, [r0, #32]
+ add r0, r0, #2
+ teq r0, r1
+ bne 1b
+ ldmia sp!, { r4-r8, pc }
+#else
+ stmdb sp!, { r4-r10, lr }
+ ldr r2, =1024
+ ldr r3, =4433
+ ldr r12, =3302955134
+1:
+ ldr r6, [r0, #32]
+ ldr r4, [r0]
+ ldr r7, [r0, #48]
+ ldr r5, [r0, #16]
+ /* this part is being done in parallel on two columns */
+ sadd16 r8, r4, r6 /* r8 = d0 + d2 */
+ ssub16 r4, r4, r6 /* r4 = d0 - d2 */
+ sadd16 r6, r5, r7 /* r6 = d1 + d3 */
+ /* there is no parallel shift operation, but we can fake it with bic
+ and lsl */
+ bic r8, r8, #0xc000
+ bic r4, r4, #0xc000
+ /* multiplication expands values beyond 16 bits, so this part needs to be
+ split. the values will be merged below so that the rest of the addition
+ can be done in parallel */
+ smlabb r9, r3, r6, r2 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
+ smlabt r6, r3, r6, r2 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
+ smlabb r10, r12, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
+ smlatb r14, r12, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
+ smlabt r5, r12, r5, r6 /* r5 = tmp2[1] */
+ smlatt r6, r12, r7, r6 /* r6 = tmp0[1] */
+ mov r8, r8, lsl #2 /* complete the parallel shift started */
+ mov r4, r4, lsl #2 /* with the earlier bic instructions */
+ /* tmp2 are in r10, r5; tmp0 are in r14, r6 */
+ /* tmp10, tmp12 are in r4, r8 */
+ mov r10, r10, asr #11
+ mov r14, r14, asr #11
+ pkhbt r5, r10, r5, lsl #5 /* parallel tmp2 */
+ pkhbt r6, r14, r6, lsl #5 /* parallel tmp0 */
+ sadd16 r10, r8, r5 /* d0 */
+ ssub16 r5, r8, r5 /* d3 */
+ sadd16 r14, r4, r6 /* d1 */
+ ssub16 r6, r4, r6 /* d2 */
+ str r10, [r0]
+ str r5, [r0, #48]
+ str r14, [r0, #16]
+ str r6, [r0, #32]
+ add r0, r0, #4
+ cmp r0, r1
+ bcc 1b
+ ldmia sp!, { r4-r10, pc }
+#endif
+ .size jpeg_idct4v, .-jpeg_idct4v
+
+jpeg_idct4h:
+#if ARM_ARCH < 5
+ stmdb sp!, { r4-r10, lr }
+ ldr r10, =-15137
+ ldr r14, =4112
+ ldr r12, =6270
+1:
+ ldrsh r4, [r0]
+ ldrsh r6, [r0, #4]
+ ldrsh r7, [r0, #6]
+ ldrsh r5, [r0, #2]
+ add r4, r4, r14
+ add r8, r4, r6 /* r8 = tmp10 >> 13 = d0 + d2 */
+ sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
+ add r6, r5, r7 /* r6 = z1 = d1 + d3 */
+ add r9, r6, r6, lsl #3
+ rsb r6, r6, r9, lsl #4
+ rsb r6, r6, r6, lsl #5 /* z1 *= 4433 */
+ mla r7, r10, r7, r6 /* r7 = tmp0 = z1 - z3 * 15137 */
+ mla r5, r12, r5, r6 /* r5 = tmp2 = z1 + z2 * 6270 */
+ add r9, r5, r8, lsl #13 /* r9 = o0 */
+ rsb r5, r5, r8, lsl #13 /* r5 = o3 */
+ add r8, r7, r4, lsl #13 /* r8 = o1 */
+ rsb r4, r7, r4, lsl #13 /* r4 = o2 */
+ mov r9, r9, asr #18
+ mov r8, r8, asr #18
+ mov r4, r4, asr #18
+ mov r5, r5, asr #18
+ cmp r9, #255
+ mvnhi r9, r9, asr #31
+ cmp r8, #255
+ mvnhi r8, r8, asr #31
+ cmp r4, #255
+ mvnhi r4, r4, asr #31
+ cmp r5, #255
+ mvnhi r5, r5, asr #31
+#ifdef HAVE_LCD_COLOR
+ strb r9, [r1]
+ strb r8, [r1, #4]
+ strb r4, [r1, #8]
+ strb r5, [r1, #12]
+#else
+ strb r9, [r1]
+ strb r8, [r1, #1]
+ strb r4, [r1, #2]
+ strb r5, [r1, #3]
+#endif
+ add r0, r0, #16
+ add r1, r1, r3
+ teq r0, r2
+ bne 1b
+ ldmia sp!, { r4-r10, pc }
+#elif ARM_ARCH < 6
+ stmdb sp!, { r4-r10, lr }
+ ldr r10, =4433
+ ldr r14, =4112
+ ldr r12, =3302955134
+1:
+ ldrsh r7, [r0, #6]
+ ldrsh r5, [r0, #2]
+ ldrsh r4, [r0]
+ ldrsh r6, [r0, #4]
+ add r8, r5, r7 /* r8 = z1 = d1 + d3 */
+ add r4, r4, r14
+ smulbb r8, r10, r8 /* z1 *= 4433 */
+ add r9, r4, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
+ smlabb r5, r12, r5, r8 /* r5 = tmp2 = z1 + z2 * 6270 */
+ smlatb r7, r12, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
+ sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
+ add r6, r5, r9, lsl #13 /* r6 = o0 */
+ rsb r9, r5, r9, lsl #13 /* r9 = o3 */
+ add r5, r7, r4, lsl #13 /* r5 = o1 */
+ rsb r4, r7, r4, lsl #13 /* r4 = o2 */
+ mov r6, r6, asr #18
+ mov r5, r5, asr #18
+ mov r4, r4, asr #18
+ mov r9, r9, asr #18
+ cmp r6, #255
+ mvnhi r6, r6, asr #31
+ cmp r5, #255
+ mvnhi r5, r5, asr #31
+ cmp r4, #255
+ mvnhi r4, r4, asr #31
+ cmp r9, #255
+ mvnhi r9, r9, asr #31
+#ifdef HAVE_LCD_COLOR
+ strb r6, [r1]
+ strb r5, [r1, #4]
+ strb r4, [r1, #8]
+ strb r9, [r1, #12]
+#else
+ strb r6, [r1]
+ strb r5, [r1, #1]
+ strb r4, [r1, #2]
+ strb r9, [r1, #3]
+#endif
+ add r0, r0, #16
+ add r1, r1, r3
+ teq r0, r2
+ bne 1b
+ ldmia sp!, { r4-r10, pc }
+#else
+ stmdb sp!, { r4-r9, lr }
+ ldr r9, =4433
+ ldr r14, =4112
+ ldr r12, =3302955134
+1:
+ ldmia r0, { r4-r5 }
+ sadd16 r4, r4, r14
+ sadd16 r6, r4, r5 /* r6lo = d0 + d2, r6hi = d1 + d3 */
+ ssub16 r7, r4, r5 /* r7lo = d0 - d2 */
+ smulbt r8, r9, r6
+ sxth r6, r6
+ smlabt r4, r12, r4, r8 /* r4 = tmp2 = z1 + z2 * 6270 */
+ smlatt r5, r12, r5, r8 /* r5 = tmp0 = z1 - z3 * 15137 */
+ sxth r7, r7
+ add r8, r4, r6, lsl #13 /* r8 = o0 */
+ rsb r6, r4, r6, lsl #13 /* r6 = o3 */
+ add r4, r5, r7, lsl #13 /* r4 = o1 */
+ rsb r5, r5, r7, lsl #13 /* r5 = o2 */
+ usat r8, #8, r8, asr #18
+ usat r6, #8, r6, asr #18
+ usat r4, #8, r4, asr #18
+ usat r5, #8, r5, asr #18
+#ifdef HAVE_LCD_COLOR
+ strb r8, [r1]
+ strb r6, [r1, #12]
+ strb r4, [r1, #4]
+ strb r5, [r1, #8]
+#else
+ strb r8, [r1]
+ strb r6, [r1, #3]
+ strb r4, [r1, #1]
+ strb r5, [r1, #2]
+#endif
+ add r0, r0, #16
+ add r1, r1, r3
+ teq r0, r2
+ bne 1b
+ ldmia sp!, { r4-r9, pc }
+#endif
+ .size jpeg_idct4h, .-jpeg_idct4h
diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c
index dc8bb33..f2b3b4b 100644
--- a/apps/recorder/jpeg_load.c
+++ b/apps/recorder/jpeg_load.c
@@ -31,6 +31,7 @@
#include "debug.h"
#include "jpeg_load.h"
/*#define JPEG_BS_DEBUG*/
+#define ROCKBOX_DEBUG_JPEG
/* for portability of below JPEG code */
#define MEMSET(p,v,c) memset(p,v,c)
#define MEMCPY(d,s,c) memcpy(d,s,c)
@@ -49,7 +50,23 @@
#else
typedef uint8_t jpeg_pix_t;
#endif
+#define JPEG_IDCT_TRANSPOSE
#define JPEG_PIX_SZ (sizeof(jpeg_pix_t))
+#ifdef HAVE_LCD_COLOR
+#define COLOR_EXTRA_IDCT_WS 64
+#else
+#define COLOR_EXTRA_IDCT_WS 0
+#endif
+#ifdef JPEG_IDCT_TRANSPOSE
+#define V_OUT(n) ws2[8*n]
+#define V_IN_ST 1
+#define TRANSPOSE_EXTRA_IDCT_WS 64
+#else
+#define V_OUT(n) ws[8*n]
+#define V_IN_ST 8
+#define TRANSPOSE_EXTRA_IDCT_WS 0
+#endif
+#define IDCT_WS_SIZE (64 + TRANSPOSE_EXTRA_IDCT_WS + COLOR_EXTRA_IDCT_WS)
/* This can't be in jpeg_load.h because plugin.h includes it, and it conflicts
* with the definition in jpeg_decoder.h
@@ -259,7 +276,7 @@
*/
/* horizontal-pass 1-point IDCT */
-static void idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
+static void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
for (; ws < end; ws += 8)
{
@@ -269,19 +286,19 @@
}
/* vertical-pass 2-point IDCT */
-static void idct2v(int16_t *ws, int16_t *end)
+static void jpeg_idct2v(int16_t *ws, int16_t *end)
{
for (; ws < end; ws++)
{
- int tmp1 = ws[0];
- int tmp2 = ws[8];
- ws[0] = tmp1 + tmp2;
- ws[8] = tmp1 - tmp2;
+ int tmp1 = ws[0*8];
+ int tmp2 = ws[1*8];
+ ws[0*8] = tmp1 + tmp2;
+ ws[1*8] = tmp1 - tmp2;
}
}
/* horizontal-pass 2-point IDCT */
-static void idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
+static void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
for (; ws < end; ws += 8, out += rowstep)
{
@@ -295,69 +312,12 @@
}
}
+#ifndef CPU_ARM
/* vertical-pass 4-point IDCT */
-static void idct4v(int16_t *ws, int16_t *end)
+static void jpeg_idct4v(int16_t *ws, int16_t *end)
{
for (; ws < end; ws++)
{
-#if defined(CPU_ARM)
- int t0, t1, t2, t3, t4;
-#if ARM_ARCH <= 4
- int t5;
-#endif
- asm volatile(
- "ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[8*0]) */
- "ldrsh %[t1], [%[ws], #32]\n\t" /* t1 = tmp2 (ws[8*2]) */
- "ldrsh %[t2], [%[ws], #16]\n\t" /* t2 = z2 (ws[8*1]) */
- "add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 2
- (tmp0 + tmp2) */
- "sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 2
- (tmp0 - tmp2) */
- "ldrsh %[t3], [%[ws], #48]\n\t" /* t3 = z3 (ws[8*3] */
- "add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */
-#if ARM_ARCH > 4
- "smulbb %[t4], %[c1], %[t4]\n\t"
- "add %[t4], %[t4], #1024\n\t" /* t4 = z1 */
- "smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
- "smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
- "mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */
- "mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */
-#else
- "add %[t5], %[t4], %[t4], lsl #3\n\t"
- "rsb %[t4], %[t4], %[t5], lsl #4\n\t"
- "rsb %[t4], %[t4], %[t4], lsl #5\n\t"
- "add %[t4], %[t4], #1024\n\t" /*z1*/
- "mla %[t3], %[c2], %[t3], %[t4]\n\t"
- "mla %[t2], %[c3], %[t2], %[t4]\n\t"
- "mov %[t3], %[t3], asr #11\n\t" /* t3 = tmp0 */
- "mov %[t2], %[t2], asr #11\n\t" /* t2 = tmp2 */
-#endif
- "add %[t4], %[t2], %[t0], lsl #2\n\t" /* t4 = tmp10 + tmp2 */
- "rsb %[t0], %[t2], %[t0], lsl #2\n\t" /* t0 = tmp10 - tmp2 */
- "add %[t2], %[t3], %[t1], lsl #2\n\t" /* t2 = tmp12 + tmp0 */
- "rsb %[t3], %[t3], %[t1], lsl #2\n\t" /* t3 = tmp12 - tmp0 */
- "strh %[t4], [%[ws]]\n\t"
- "strh %[t0], [%[ws], #48]\n\t"
- "strh %[t2], [%[ws], #16]\n\t"
- "strh %[t3], [%[ws], #32]\n\t"
- : [t0] "=&r" (t0),
- [t1] "=&r" (t1),
- [t2] "=&r" (t2),
- [t3] "=&r" (t3),
- [t4] "=&r" (t4)
-#if ARM_ARCH <= 4
- ,[t5] "=&r" (t5)
-#endif
- : [ws] "r" (ws),
-#if ARM_ARCH > 4
- [c1] "r" (FIX_0_541196100),
- [c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
-#else
- [c2] "r" (-FIX_1_847759065),
- [c3] "r" (FIX_0_765366865)
-#endif
- );
-#else
int tmp0, tmp2, tmp10, tmp12;
int z1, z2, z3;
/* Even part */
@@ -382,93 +342,18 @@
CONST_BITS-PASS1_BITS);
/* Final output stage */
-
ws[8*0] = (int) (tmp10 + tmp2);
ws[8*3] = (int) (tmp10 - tmp2);
ws[8*1] = (int) (tmp12 + tmp0);
ws[8*2] = (int) (tmp12 - tmp0);
-#endif
}
}
/* horizontal-pass 4-point IDCT */
-static void idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
+static void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
for (; ws < end; out += rowstep, ws += 8)
{
-#if defined(CPU_ARM)
- int t0, t1, t2, t3, t4;
-#if ARM_ARCH <= 4
- int t5;
-#endif
- asm volatile(
- "ldrsh %[t4], [%[ws]]\n\t" /* t4 = tmp0 (ws[0]) */
- "ldrsh %[t1], [%[ws], #4]\n\t" /* t1 = tmp2 (ws[2]) */
- "add %[t4], %[t4], #16\n\t" /* add rounding to DC */
- "add %[t4], %[t4], #4096\n\t" /* pre-add offset */
- "ldrsh %[t2], [%[ws], #2]\n\t" /* t2 = z2 (ws[1]) */
- "add %[t0], %[t4], %[t1]\n\t" /* t0 = tmp10 >> 13
- (tmp0 + tmp2) */
- "sub %[t1], %[t4], %[t1]\n\t" /* t1 = tmp12 >> 13
- (tmp0 - tmp2) */
- "ldrsh %[t3], [%[ws], #6]\n\t" /* t3 = z3 (ws[3] */
- "add %[t4], %[t2], %[t3]\n\t" /* t4 = z2 + z3 */
-#if ARM_ARCH > 4
- "smulbb %[t4], %[c1], %[t4]\n\t"
- "smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
- "smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
-#else
- "add %[t5], %[t4], %[t4], lsl #3\n\t"
- "rsb %[t4], %[t4], %[t5], lsl #4\n\t"
- "rsb %[t4], %[t4], %[t4], lsl #5\n\t" /* t4 = z1 */
- "mla %[t3], %[c2], %[t3], %[t4]\n\t"
- "mla %[t2], %[c3], %[t2], %[t4]\n\t"
-#endif
- "add %[t4], %[t2], %[t0], lsl #13\n\t" /* t4 = tmp10 + tmp2 */
- "rsb %[t0], %[t2], %[t0], lsl #13\n\t" /* t0 = tmp10 - tmp2 */
- "add %[t2], %[t3], %[t1], lsl #13\n\t" /* t2 = tmp12 + tmp0 */
- "rsb %[t3], %[t3], %[t1], lsl #13\n\t" /* t3 = tmp12 - tmp0 */
- "mov %[t4], %[t4], asr #18\n\t" /* descale results */
- "mov %[t0], %[t0], asr #18\n\t"
- "mov %[t2], %[t2], asr #18\n\t"
- "mov %[t3], %[t3], asr #18\n\t"
- "cmp %[t4], #255\n\t" /* range limit results */
- "mvnhi %[t4], %[t4], asr #31\n\t"
- "cmp %[t0], #255\n\t"
- "mvnhi %[t0], %[t0], asr #31\n\t"
- "cmp %[t2], #255\n\t"
- "mvnhi %[t2], %[t2], asr #31\n\t"
- "cmp %[t3], #255\n\t"
- "mvnhi %[t3], %[t3], asr #31\n\t"
- "cmp %[t4], #255\n\t"
- "mvnhi %[t4], %[t4], asr #31\n\t"
- "strb %[t4], [%[out]]\n\t"
- "strb %[t0], [%[out], %[o3]]\n\t"
- "strb %[t2], [%[out], %[o1]]\n\t"
- "strb %[t3], [%[out], %[o2]]\n\t"
- : [t0] "=&r" (t0),
- [t1] "=&r" (t1),
- [t2] "=&r" (t2),
- [t3] "=&r" (t3),
- [t4] "=&r" (t4)
-#if ARM_ARCH <= 4
-
- ,[t5] "=&r" (t5)
-#endif
- : [ws] "r" (ws),
- [out] "r" (out),
- [o1] "i" (JPEG_PIX_SZ),
- [o2] "i" (JPEG_PIX_SZ*2),
- [o3] "i" (JPEG_PIX_SZ*3),
-#if ARM_ARCH > 4
- [c1] "r" (FIX_0_541196100),
- [c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
-#else
- [c2] "r" (-FIX_1_847759065),
- [c3] "r" (FIX_0_765366865)
-#endif
- );
-#else
int tmp0, tmp2, tmp10, tmp12;
int z1, z2, z3;
/* Even part */
@@ -500,18 +385,27 @@
DS_OUT));
out[JPEG_PIX_SZ*2] = range_limit((int) RIGHT_SHIFT(tmp12 - tmp0,
DS_OUT));
-#endif
}
}
+#else
+extern void jpeg_idct4v(int16_t *ws, int16_t *end);
+extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
+#endif
/* vertical-pass 8-point IDCT */
-static void idct8v(int16_t *ws, int16_t *end)
+static void jpeg_idct8v(int16_t *ws, int16_t *end)
{
long tmp0, tmp1, tmp2, tmp3;
long tmp10, tmp11, tmp12, tmp13;
long z1, z2, z3, z4, z5;
+#ifdef JPEG_IDCT_TRANSPOSE
+ int16_t *ws2 = ws + 64;
+ for (; ws < end; ws += 8, ws2++)
+ {
+#else
for (; ws < end; ws++)
{
+#endif
/* Due to quantization, we will usually find that many of the input
* coefficients are zero, especially the AC terms. We can exploit this
* by short-circuiting the IDCT calculation for any column in which all
@@ -520,30 +414,30 @@
* With typical images and quantization tables, half or more of the
* column DCT calculations can be simplified this way.
*/
- if ((ws[8*1] | ws[8*2] | ws[8*3]
- | ws[8*4] | ws[8*5] | ws[8*6] | ws[8*7]) == 0)
+ if ((ws[V_IN_ST*1] | ws[V_IN_ST*2] | ws[V_IN_ST*3]
+ | ws[V_IN_ST*4] | ws[V_IN_ST*5] | ws[V_IN_ST*6] | ws[V_IN_ST*7]) == 0)
{
/* AC terms all zero */
- int dcval = ws[8*0] << PASS1_BITS;
+ int dcval = ws[V_IN_ST*0] << PASS1_BITS;
- ws[8*0] = ws[8*1] = ws[8*2] = ws[8*3] = ws[8*4]
- = ws[8*5] = ws[8*6] = ws[8*7] = dcval;
+ V_OUT(0) = V_OUT(1) = V_OUT(2) = V_OUT(3) = V_OUT(4) = V_OUT(5) =
+ V_OUT(6) = V_OUT(7) = dcval;
continue;
}
/* Even part: reverse the even part of the forward DCT. */
/* The rotator is sqrt(2)*c(-6). */
- z2 = ws[8*2];
- z3 = ws[8*6];
+ z2 = ws[V_IN_ST*2];
+ z3 = ws[V_IN_ST*6];
z1 = MULTIPLY16(z2 + z3, FIX_0_541196100);
tmp2 = z1 + MULTIPLY16(z3, - FIX_1_847759065);
tmp3 = z1 + MULTIPLY16(z2, FIX_0_765366865);
- z2 = ws[8*0] << CONST_BITS;
+ z2 = ws[V_IN_ST*0] << CONST_BITS;
z2 += ONE << (CONST_BITS - PASS1_BITS - 1);
- z3 = ws[8*4] << CONST_BITS;
+ z3 = ws[V_IN_ST*4] << CONST_BITS;
tmp0 = (z2 + z3);
tmp1 = (z2 - z3);
@@ -556,10 +450,10 @@
/* Odd part per figure 8; the matrix is unitary and hence its
transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. */
- tmp0 = ws[8*7];
- tmp1 = ws[8*5];
- tmp2 = ws[8*3];
- tmp3 = ws[8*1];
+ tmp0 = ws[V_IN_ST*7];
+ tmp1 = ws[V_IN_ST*5];
+ tmp2 = ws[V_IN_ST*3];
+ tmp3 = ws[V_IN_ST*1];
z1 = tmp0 + tmp3;
z2 = tmp1 + tmp2;
@@ -586,19 +480,19 @@
/* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
- ws[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
- ws[8*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
- ws[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
- ws[8*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
- ws[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
- ws[8*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
- ws[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
- ws[8*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+ V_OUT(0) = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+ V_OUT(7) = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+ V_OUT(1) = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+ V_OUT(6) = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+ V_OUT(2) = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+ V_OUT(5) = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+ V_OUT(3) = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+ V_OUT(4) = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
}
}
/* horizontal-pass 8-point IDCT */
-static void idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
+static void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
long tmp0, tmp1, tmp2, tmp3;
long tmp10, tmp11, tmp12, tmp13;
@@ -709,20 +603,26 @@
#ifdef HAVE_LCD_COLOR
/* vertical-pass 16-point IDCT */
-static void idct16v(int16_t *ws, int16_t *end)
+static void jpeg_idct16v(int16_t *ws, int16_t *end)
{
long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
long z1, z2, z3, z4;
+#ifdef JPEG_IDCT_TRANSPOSE
+ int16_t *ws2 = ws + 64;
+ for (; ws < end; ws += 8, ws2++)
+ {
+#else
for (; ws < end; ws++)
{
+#endif
/* Even part */
- tmp0 = ws[8*0] << CONST_BITS;
+ tmp0 = ws[V_IN_ST*0] << CONST_BITS;
/* Add fudge factor here for final descale. */
tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
- z1 = ws[8*4];
+ z1 = ws[V_IN_ST*4];
tmp1 = MULTIPLY(z1, FIX(1.306562965)); /* c4[16] = c2[8] */
tmp2 = MULTIPLY(z1, FIX_0_541196100); /* c12[16] = c6[8] */
@@ -731,8 +631,8 @@
tmp12 = tmp0 + tmp2;
tmp13 = tmp0 - tmp2;
- z1 = ws[8*2];
- z2 = ws[8*6];
+ z1 = ws[V_IN_ST*2];
+ z2 = ws[V_IN_ST*6];
z3 = z1 - z2;
z4 = MULTIPLY(z3, FIX(0.275899379)); /* c14[16] = c7[8] */
z3 = MULTIPLY(z3, FIX(1.387039845)); /* c2[16] = c1[8] */
@@ -757,10 +657,10 @@
/* Odd part */
- z1 = ws[8*1];
- z2 = ws[8*3];
- z3 = ws[8*5];
- z4 = ws[8*7];
+ z1 = ws[V_IN_ST*1];
+ z2 = ws[V_IN_ST*3];
+ z3 = ws[V_IN_ST*5];
+ z4 = ws[V_IN_ST*7];
tmp11 = z1 + z3;
@@ -795,27 +695,27 @@
tmp11 += z2;
/* Final output stage */
- ws[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
- ws[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
- ws[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
- ws[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
- ws[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
- ws[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
- ws[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
- ws[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
- ws[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
- ws[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
- ws[8*5] = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
- ws[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
- ws[8*6] = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
- ws[8*9] = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
- ws[8*7] = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
- ws[8*8] = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+ V_OUT(0) = (int) RIGHT_SHIFT(tmp20 + tmp0, CONST_BITS-PASS1_BITS);
+ V_OUT(15) = (int) RIGHT_SHIFT(tmp20 - tmp0, CONST_BITS-PASS1_BITS);
+ V_OUT(1) = (int) RIGHT_SHIFT(tmp21 + tmp1, CONST_BITS-PASS1_BITS);
+ V_OUT(14) = (int) RIGHT_SHIFT(tmp21 - tmp1, CONST_BITS-PASS1_BITS);
+ V_OUT(2) = (int) RIGHT_SHIFT(tmp22 + tmp2, CONST_BITS-PASS1_BITS);
+ V_OUT(13) = (int) RIGHT_SHIFT(tmp22 - tmp2, CONST_BITS-PASS1_BITS);
+ V_OUT(3) = (int) RIGHT_SHIFT(tmp23 + tmp3, CONST_BITS-PASS1_BITS);
+ V_OUT(12) = (int) RIGHT_SHIFT(tmp23 - tmp3, CONST_BITS-PASS1_BITS);
+ V_OUT(4) = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
+ V_OUT(11) = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
+ V_OUT(5) = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
+ V_OUT(10) = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
+ V_OUT(6) = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
+ V_OUT(9) = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
+ V_OUT(7) = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
+ V_OUT(8) = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
}
}
/* horizontal-pass 16-point IDCT */
-static void idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
+static void jpeg_idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
{
long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -946,12 +846,12 @@
};
struct idct_entry idct_tbl[] = {
- { PASS1_BITS, NULL, idct1h },
- { PASS1_BITS, idct2v, idct2h },
- { 0, idct4v, idct4h },
- { 0, idct8v, idct8h },
+ { PASS1_BITS, NULL, jpeg_idct1h },
+ { PASS1_BITS, jpeg_idct2v, jpeg_idct2h },
+ { 0, jpeg_idct4v, jpeg_idct4h },
+ { 0, jpeg_idct8v, jpeg_idct8h },
#ifdef HAVE_LCD_COLOR
- { 0, idct16v, idct16h },
+ { 0, jpeg_idct16v, jpeg_idct16h },
#endif
};
@@ -1468,21 +1368,27 @@
}
-/* zag[i] is the natural-order position of the i'th element of zigzag order.
- * If the incoming data is corrupted, decode_mcu could attempt to
- * reference values beyond the end of the array. To avoid a wild store,
- * we put some extra zeroes after the real entries.
- */
+/* zag[i] is the natural-order position of the i'th element of zigzag order. */
static const unsigned char zag[] =
{
- 0, 1, 8, 16, 9, 2, 3, 10,
- 17, 24, 32, 25, 18, 11, 4, 5,
- 12, 19, 26, 33, 40, 48, 41, 34,
- 27, 20, 13, 6, 7, 14, 21, 28,
- 35, 42, 49, 56, 57, 50, 43, 36,
- 29, 22, 15, 23, 30, 37, 44, 51,
- 58, 59, 52, 45, 38, 31, 39, 46,
- 53, 60, 61, 54, 47, 55, 62, 63,
+#ifdef JPEG_IDCT_TRANSPOSE
+ 0, 8, 1, 2, 9, 16, 24, 17,
+ 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20,
+ 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36,
+ 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53,
+ 46, 39, 47, 54, 61, 62, 55, 63,
+#endif
+ 0, 1, 8, 16, 9, 2, 3, 10,
+ 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34,
+ 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36,
+ 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46,
+ 53, 60, 61, 54, 47, 55, 62, 63,
};
/* zig[i] is the zig-zag order position of the i'th element of natural
@@ -1898,17 +1804,20 @@
store_offs[p_jpeg->store_pos[1]] = JPEG_PIX_SZ << p_jpeg->h_scale[0];
store_offs[p_jpeg->store_pos[2]] = b_width << p_jpeg->v_scale[0];
store_offs[p_jpeg->store_pos[3]] = store_offs[1] + store_offs[2];
-
- int16_t block[128]; /* decoded DCT coefficients */
+ /* decoded DCT coefficients */
+ int16_t block[IDCT_WS_SIZE] __attribute__((aligned(8)));
for (x = 0; x < p_jpeg->x_mbl; x++)
{
int blkn;
for (blkn = 0; blkn < p_jpeg->blocks; blkn++)
{
- int k = 1; /* coefficient index */
- int s, r; /* huffman values */
int ci = p_jpeg->mcu_membership[blkn]; /* component index */
int ti = p_jpeg->tab_membership[blkn]; /* table index */
+#ifdef JPEG_IDCT_TRANSPOSE
+ bool transpose = p_jpeg->v_scale[!!ci] > 2;
+#endif
+ int k = 1; /* coefficient index */
+ int s, r; /* huffman values */
struct derived_tbl* dctbl = &p_jpeg->dc_derived_tbls[ti];
struct derived_tbl* actbl = &p_jpeg->ac_derived_tbls[ti];
@@ -1948,7 +1857,11 @@
r = get_bits(p_jpeg, s);
r = HUFF_EXTEND(r, s);
r = MULTIPLY16(r, p_jpeg->quanttable[!!ci][k]);
+#ifdef JPEG_IDCT_TRANSPOSE
+ block[zag[transpose ? k : k + 64]] = r ;
+#else
block[zag[k]] = r ;
+#endif
}
else
{
@@ -1988,10 +1901,19 @@
int idct_rows = BIT_N(p_jpeg->v_scale[!!ci]);
unsigned char *b_out = out + (ci ? ci : store_offs[blkn]);
if (idct_tbl[p_jpeg->v_scale[!!ci]].v_idct)
+#ifdef JPEG_IDCT_TRANSPOSE
+ idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
+ transpose ? block + 8 * idct_cols
+ : block + idct_cols);
+ int16_t * h_block = transpose ? block + 64 : block;
+ idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(h_block, b_out,
+ h_block + idct_rows * 8, b_width);
+#else
idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
block + idct_cols);
idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(block, b_out,
block + idct_rows * 8, b_width);
+#endif
}
} /* for blkn */
/* don't starve other threads while an MCU row decodes */
@@ -2048,7 +1970,6 @@
{
int fd, ret;
fd = open(filename, O_RDONLY);
-
JDEBUGF("read_jpeg_file: filename: %s buffer len: %d cformat: %p\n",
filename, maxsize, cformat);
/* Exit if file opening failed */
@@ -2181,14 +2102,22 @@
int decode_h = BIT_N(p_jpeg->v_scale[0]) - 1;
src_dim.width = (p_jpeg->x_size << p_jpeg->h_scale[0]) >> 3;
src_dim.height = (p_jpeg->y_size << p_jpeg->v_scale[0]) >> 3;
- p_jpeg->zero_need[0] = (decode_h << 3) + decode_w;
- p_jpeg->k_need[0] = zig[p_jpeg->zero_need[0]];
+#ifdef JPEG_IDCT_TRANSPOSE
+ if (p_jpeg->v_scale[0] > 2)
+ p_jpeg->zero_need[0] = (decode_w << 3) + decode_h;
+ else
+#endif
+ p_jpeg->zero_need[0] = (decode_h << 3) + decode_w;
+ p_jpeg->k_need[0] = zig[(decode_h << 3) + decode_w];
JDEBUGF("need luma components to %d\n", p_jpeg->k_need[0]);
#ifdef HAVE_LCD_COLOR
decode_w = BIT_N(MIN(p_jpeg->h_scale[1],3)) - 1;
decode_h = BIT_N(MIN(p_jpeg->v_scale[1],3)) - 1;
- p_jpeg->zero_need[1] = (decode_h << 3) + decode_w;
- p_jpeg->k_need[1] = zig[p_jpeg->zero_need[1]];
+ if (p_jpeg->v_scale[1] > 2)
+ p_jpeg->zero_need[1] = (decode_w << 3) + decode_h;
+ else
+ p_jpeg->zero_need[1] = (decode_h << 3) + decode_w;
+ p_jpeg->k_need[1] = zig[(decode_h << 3) + decode_w];
JDEBUGF("need chroma components to %d\n", p_jpeg->k_need[1]);
#endif
if (cformat)