Core JPEG decoder improvements:
For >8-point vertical IDCT, transpose the coefficients while decoding them, so that the vertical IDCT can read in rows rather than columns. This improves speed a bit for this size even using the C IDCT.
Remove inline ARM asm, replacing it with an external file containing pure asm IDCT functions.
Add jpeg_ prefix to JPEG IDCT functions since some of them will now be visible globally.


git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21345 a1c6a512-1295-4272-9138-f99709370657
diff --git a/apps/SOURCES b/apps/SOURCES
index 527b0b2..4caf32d 100644
--- a/apps/SOURCES
+++ b/apps/SOURCES
@@ -104,6 +104,9 @@
 #endif
 #ifdef HAVE_JPEG
 recorder/jpeg_load.c
+#ifdef CPU_ARM
+recorder/jpeg_idct_arm.S
+#endif
 #endif
 #ifdef HAVE_ALBUMART
 recorder/albumart.c
diff --git a/apps/plugins/lib/SOURCES b/apps/plugins/lib/SOURCES
index 7211109..2ed38c4 100644
--- a/apps/plugins/lib/SOURCES
+++ b/apps/plugins/lib/SOURCES
@@ -27,6 +27,9 @@
 profile_plugin.c
 #endif
 #ifdef HAVE_LCD_BITMAP
+#ifdef CPU_ARM
+pluginlib_jpeg_idct_arm.S
+#endif
 pluginlib_jpeg_mem.c
 pluginlib_resize.c
 #ifndef HAVE_JPEG
diff --git a/apps/plugins/lib/pluginlib_jpeg_idct_arm.S b/apps/plugins/lib/pluginlib_jpeg_idct_arm.S
new file mode 100644
index 0000000..5e6149d
--- /dev/null
+++ b/apps/plugins/lib/pluginlib_jpeg_idct_arm.S
@@ -0,0 +1,24 @@
+/***************************************************************************
+*             __________               __   ___.
+*   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+*   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+*   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+*   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+*                     \/            \/     \/    \/            \/
+* $Id$
+*
+* Copyright (C) 2009 by Andrew Mahone
+*
+* This is a wrapper for the core jpeg_idct_arm.S
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License
+* as published by the Free Software Foundation; either version 2
+* of the License, or (at your option) any later version.
+*
+* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+* KIND, either express or implied.
+*
+****************************************************************************/
+
+#include "recorder/jpeg_idct_arm.S"
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
new file mode 100644
index 0000000..2ef868e
--- /dev/null
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -0,0 +1,287 @@
+/***************************************************************************
+*             __________               __   ___.
+*   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+*   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+*   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+*   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+*                     \/            \/     \/    \/            \/
+* $Id$
+*
+* JPEG assembly IDCT
+*
+* Copyright (C) 2009 by Andrew Mahone
+*                       asm versions of the C IDCT algorithms used in jpeg_load.c
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License
+* as published by the Free Software Foundation; either version 2
+* of the License, or (at your option) any later version.
+*
+* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+* KIND, either express or implied.
+*
+****************************************************************************/
+#include "config.h"
+
+    .section .text
+    .align   2
+    .global jpeg_idct4v
+    .type   jpeg_idct4v, %function
+    .global jpeg_idct4h
+    .type   jpeg_idct4h, %function
+
+jpeg_idct4v:
+#if ARM_ARCH < 5
+    stmdb  sp!, { r4-r7, lr }
+    ldr    r14, =-15137
+    ldr    r12, =6270
+1:
+    ldrsh  r4,  [r0, #32]
+    ldrsh  r2,  [r0]
+    ldrsh  r5,  [r0, #48]
+    ldrsh  r3,  [r0, #16]
+    add    r6,  r2,  r4            /* r6 = tmp10 >> 2 = d0 + d2 */
+    sub    r2,  r2,  r4            /* r2 = tmp12 >> 2 = d0 - d2 */
+    add    r4,  r3,  r5            /* r4 = z1 = d1 + d3 */
+    add    r7,  r4,  r4,  lsl #3   /* r7 = z1 * 9 */
+    rsb    r4,  r4,  r7,  lsl #4   /* r4 = z1 * 143 */
+    rsb    r4,  r4,  r4,  lsl #5   /* z1 *= 4433 (143 * 31) */
+    add    r4,  r4,  #1024         /* z1 += 1024 (rounding term) */
+    mla    r3,  r12, r3,  r4       /* r3 = tmp2 = z1 + z2 * 6270 */
+    mla    r5,  r14, r5,  r4       /* r5 = tmp0 = z1 - z3 * 15137 */
+    mov    r6,  r6,  lsl #2        /* r6 <<= 2 */
+    mov    r2,  r2,  lsl #2        /* r2 <<= 2 */
+    add    r7,  r6,  r3,  asr #11   /* r7 = o0 */
+    sub    r3,  r6,  r3,  asr #11   /* r3 = o3 */
+    add    r6,  r2,  r5,  asr #11   /* r6 = o1 */
+    sub    r2,  r2,  r5,  asr #11   /* r2 = o2 */
+    strh   r7,  [r0]
+    strh   r3,  [r0, #48]
+    strh   r6,  [r0, #16]
+    strh   r2,  [r0, #32]
+    add    r0,  r0,  #2
+    teq    r0,  r1
+    bne    1b
+    ldmia  sp!, { r4-r7, pc }
+#elif ARM_ARCH < 6
+    stmdb sp!, { r4-r8, lr }
+    ldr    r8,  =1024
+    ldr    r14, =4433
+    ldr    r12, =3302955134        /* -15137 (top half) : 6270 (bottom half) */
+1:
+    ldrsh  r5,  [r0, #48]
+    ldrsh  r3,  [r0, #16]
+    ldrsh  r4,  [r0, #32]
+    ldrsh  r2,  [r0]
+    add    r6,  r3,  r5            /* r6 = z1 = d1 + d3 */
+    add    r7,  r2,  r4            /* r7 = tmp10 >> 2 = d0 + d2 */
+    smlabb r6, r14, r6,  r8        /* r6 = z1 * 4433 + 1024 (round) */
+    sub    r2,  r2,  r4            /* r2 = tmp12 >> 2 = d0 - d2 */
+    smlabb r3,  r12, r3,  r6       /* r3 = tmp2 = z1 + z2 * 6270 */
+    smlatb r5,  r12, r5,  r6       /* r5 = tmp0 = z1 - z3 * 15137 */
+    mov    r7,  r7,  lsl #2
+    mov    r2,  r2,  lsl #2
+    add    r4,  r7,  r3,  asr #11   /* r4 = o0 */
+    sub    r7,  r7,  r3,  asr #11   /* r7 = o3 */
+    add    r3,  r2,  r5,  asr #11   /* r3 = o1 */
+    sub    r2,  r2,  r5,  asr #11   /* r2 = o2 */
+    strh   r4,  [r0]
+    strh   r7,  [r0, #48]
+    strh   r3,  [r0, #16]
+    strh   r2,  [r0, #32]
+    add    r0,  r0,  #2
+    teq    r0,  r1
+    bne    1b
+    ldmia sp!, { r4-r8, pc }
+#else
+    stmdb  sp!, { r4-r10, lr }
+    ldr    r2,  =1024
+    ldr    r3,  =4433
+    ldr    r12, =3302955134        /* -15137 (top half) : 6270 (bottom half) */
+1:
+    ldr    r6,  [r0, #32]
+    ldr    r4,  [r0]
+    ldr    r7,  [r0, #48]
+    ldr    r5,  [r0, #16]
+    /* this part is being done in parallel on two columns */
+    sadd16 r8,  r4,  r6            /* r8 = d0 + d2 */
+    ssub16 r4,  r4,  r6            /* r4 = d0 - d2 */
+    sadd16 r6,  r5,  r7            /* r6 = d1 + d3 */
+    /* there is no parallel shift operation, but we can fake it with bic
+       and lsl */
+    bic    r8,  r8,  #0xc000
+    bic    r4,  r4,  #0xc000
+    /* multiplication expands values beyond 16 bits, so this part needs to be
+       split. the values will be merged below so that the rest of the addition
+       can be done in parallel */
+    smlabb r9,  r3,  r6,  r2       /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
+    smlabt r6,  r3,  r6,  r2       /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
+    smlabb r10, r12, r5,  r9       /* r10 = tmp2[0] = z1 + d1 * 6270 */
+    smlatb r14, r12, r7,  r9       /* r14 = tmp0[0] = z1 - d3 * 15137 */
+    smlabt r5,  r12, r5,  r6       /* r5  = tmp2[1] */
+    smlatt r6,  r12, r7,  r6       /* r6  = tmp0[1] */
+    mov    r8,  r8,  lsl #2        /* complete the parallel shift started */
+    mov    r4,  r4,  lsl #2        /* with the earlier bic instructions */
+    /* tmp2 are in r10, r5; tmp0 are in r14, r6 */
+    /* tmp10, tmp12 are in r8, r4 */
+    mov    r10, r10, asr #11
+    mov    r14, r14, asr #11
+    pkhbt  r5,  r10, r5,  lsl #5 /* parallel tmp2 */
+    pkhbt  r6,  r14, r6,  lsl #5 /* parallel tmp0 */
+    sadd16 r10, r8,  r5 /* d0 */
+    ssub16 r5,  r8,  r5 /* d3 */
+    sadd16 r14, r4,  r6 /* d1 */
+    ssub16 r6,  r4,  r6 /* d2 */
+    str    r10, [r0]
+    str    r5,  [r0, #48]
+    str    r14, [r0, #16]
+    str    r6,  [r0, #32]
+    add    r0,  r0,  #4
+    cmp    r0,  r1
+    bcc    1b
+    ldmia  sp!, { r4-r10, pc }
+#endif
+    .size jpeg_idct4v, .-jpeg_idct4v
+
+jpeg_idct4h:
+#if ARM_ARCH < 5
+    stmdb  sp!, { r4-r10, lr }
+    ldr    r10, =-15137
+    ldr    r14, =4112
+    ldr    r12, =6270
+1:
+    ldrsh  r4,  [r0]
+    ldrsh  r6,  [r0, #4]
+    ldrsh  r7,  [r0, #6]
+    ldrsh  r5,  [r0, #2]
+    add    r4,  r4,  r14           /* add rounding + offset (4112) to DC */
+    add    r8,  r4,  r6            /* r8 = tmp10 >> 13 = d0 + d2 */
+    sub    r4,  r4,  r6            /* r4 = tmp12 >> 13 = d0 - d2 */
+    add    r6,  r5,  r7            /* r6 = z1 = d1 + d3 */
+    add    r9,  r6,  r6,  lsl #3   /* r9 = z1 * 9 */
+    rsb    r6,  r6,  r9,  lsl #4   /* r6 = z1 * 143 */
+    rsb    r6,  r6,  r6,  lsl #5   /* z1 *= 4433 (143 * 31) */
+    mla    r7,  r10, r7,  r6       /* r7 = tmp0 = z1 - z3 * 15137 */
+    mla    r5,  r12, r5,  r6       /* r5 = tmp2 = z1 + z2 * 6270 */
+    add    r9,  r5,  r8,  lsl #13  /* r9 = o0 */
+    rsb    r5,  r5,  r8,  lsl #13  /* r5 = o3 */
+    add    r8,  r7,  r4,  lsl #13  /* r8 = o1 */
+    rsb    r4,  r7,  r4,  lsl #13  /* r4 = o2 */
+    mov    r9,  r9,  asr #18       /* descale results */
+    mov    r8,  r8,  asr #18
+    mov    r4,  r4,  asr #18
+    mov    r5,  r5,  asr #18
+    cmp    r9,  #255               /* range limit results to 0..255 */
+    mvnhi  r9,  r9,  asr #31
+    cmp    r8,  #255
+    mvnhi  r8,  r8,  asr #31
+    cmp    r4,  #255
+    mvnhi  r4,  r4,  asr #31
+    cmp    r5,  #255
+    mvnhi  r5,  r5,  asr #31
+#ifdef HAVE_LCD_COLOR
+    strb   r9,  [r1]
+    strb   r8,  [r1, #4]
+    strb   r4,  [r1, #8]
+    strb   r5,  [r1, #12]
+#else
+    strb   r9,  [r1]
+    strb   r8,  [r1, #1]
+    strb   r4,  [r1, #2]
+    strb   r5,  [r1, #3]
+#endif
+    add    r0,  r0,  #16
+    add    r1,  r1,  r3
+    teq    r0,  r2
+    bne    1b
+    ldmia sp!, { r4-r10, pc }
+#elif ARM_ARCH < 6
+    stmdb  sp!, { r4-r10, lr }
+    ldr    r10, =4433
+    ldr    r14, =4112
+    ldr    r12, =3302955134         /* -15137 (top half) : 6270 (bottom half) */
+1:
+    ldrsh  r7,  [r0, #6]
+    ldrsh  r5,  [r0, #2]
+    ldrsh  r4,  [r0]
+    ldrsh  r6,  [r0, #4]
+    add    r8,  r5,  r7             /* r8 = z1 = d1 + d3 */
+    add    r4,  r4,  r14            /* add rounding + offset (4112) to DC */
+    smulbb r8, r10, r8              /* z1 *= 4433 */
+    add    r9,  r4,  r6             /* r9 = tmp10 >> 13 = d0 + d2 */
+    smlabb r5,  r12, r5,  r8        /* r5 = tmp2 = z1 + z2 * 6270 */
+    smlatb r7,  r12, r7,  r8        /* r7 = tmp0 = z1 - z3 * 15137 */
+    sub    r4,  r4,  r6             /* r4 = tmp12 >> 13 = d0 - d2 */
+    add    r6,  r5,  r9,  lsl #13   /* r6 = o0 */
+    rsb    r9,  r5,  r9,  lsl #13   /* r9 = o3 */
+    add    r5,  r7,  r4,  lsl #13   /* r5 = o1 */
+    rsb    r4,  r7,  r4,  lsl #13   /* r4 = o2 */
+    mov    r6,  r6,  asr #18        /* descale results */
+    mov    r5,  r5,  asr #18
+    mov    r4,  r4,  asr #18
+    mov    r9,  r9,  asr #18
+    cmp    r6,  #255                /* range limit results to 0..255 */
+    mvnhi  r6,  r6,  asr #31
+    cmp    r5,  #255
+    mvnhi  r5,  r5,  asr #31
+    cmp    r4,  #255
+    mvnhi  r4,  r4,  asr #31
+    cmp    r9,  #255
+    mvnhi  r9,  r9,  asr #31
+#ifdef HAVE_LCD_COLOR
+    strb   r6,  [r1]
+    strb   r5,  [r1, #4]
+    strb   r4,  [r1, #8]
+    strb   r9,  [r1, #12]
+#else
+    strb   r6,  [r1]
+    strb   r5,  [r1, #1]
+    strb   r4,  [r1, #2]
+    strb   r9,  [r1, #3]
+#endif
+    add    r0,  r0,  #16
+    add    r1,  r1,  r3
+    teq    r0,  r2
+    bne    1b
+    ldmia sp!, { r4-r10, pc }
+#else
+    stmdb sp!, { r4-r9, lr }
+    ldr    r9,  =4433
+    ldr    r14, =4112
+    ldr    r12, =3302955134         /* -15137 (top half) : 6270 (bottom half) */
+1:
+    ldmia  r0,  { r4-r5 }           /* r4 = d1:d0, r5 = d3:d2 */
+    sadd16 r4,  r4,  r14            /* d0 += 4112 (rounding + offset) */
+    sadd16 r6,  r4,  r5             /* r6lo = d0 + d2, r6hi = d1 + d3 */
+    ssub16 r7,  r4,  r5             /* r7lo = d0 - d2 */
+    smulbt r8,  r9,  r6             /* r8 = z1 = (d1 + d3) * 4433 */
+    sxth   r6,  r6                  /* r6 = tmp10 >> 13 = d0 + d2 */
+    smlabt r4,  r12, r4,  r8        /* r4 = tmp2 = z1 + z2 * 6270 */
+    smlatt r5,  r12, r5,  r8        /* r5 = tmp0 = z1 - z3 * 15137 */
+    sxth   r7,  r7                  /* r7 = tmp12 >> 13 = d0 - d2 */
+    add    r8,  r4,  r6,  lsl #13   /* r8 = o0 */
+    rsb    r6,  r4,  r6,  lsl #13   /* r6 = o3 */
+    add    r4,  r5,  r7,  lsl #13   /* r4 = o1 */
+    rsb    r5,  r5,  r7,  lsl #13   /* r5 = o2 */
+    usat  r8,  #8,  r8,  asr #18    /* descale + clamp to 0..255 */
+    usat  r6,  #8,  r6,  asr #18
+    usat  r4,  #8,  r4,  asr #18
+    usat  r5,  #8,  r5,  asr #18
+#ifdef HAVE_LCD_COLOR
+    strb   r8,  [r1]
+    strb   r6,  [r1, #12]
+    strb   r4,  [r1, #4]
+    strb   r5,  [r1, #8]
+#else
+    strb   r8,  [r1]
+    strb   r6,  [r1, #3]
+    strb   r4,  [r1, #1]
+    strb   r5,  [r1, #2]
+#endif
+    add   r0,  r0,  #16
+    add   r1,  r1,  r3
+    teq   r0,  r2
+    bne   1b
+    ldmia sp!, { r4-r9, pc }
+#endif
+    .size jpeg_idct4h, .-jpeg_idct4h
diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c
index dc8bb33..f2b3b4b 100644
--- a/apps/recorder/jpeg_load.c
+++ b/apps/recorder/jpeg_load.c
@@ -31,6 +31,7 @@
 #include "debug.h"
 #include "jpeg_load.h"
 /*#define JPEG_BS_DEBUG*/
+#define ROCKBOX_DEBUG_JPEG
 /* for portability of below JPEG code */
 #define MEMSET(p,v,c) memset(p,v,c)
 #define MEMCPY(d,s,c) memcpy(d,s,c)
@@ -49,7 +50,23 @@
 #else
 typedef uint8_t jpeg_pix_t;
 #endif
+#define JPEG_IDCT_TRANSPOSE
 #define JPEG_PIX_SZ (sizeof(jpeg_pix_t))
+#ifdef HAVE_LCD_COLOR
+#define COLOR_EXTRA_IDCT_WS 64
+#else
+#define COLOR_EXTRA_IDCT_WS 0
+#endif
+#ifdef JPEG_IDCT_TRANSPOSE
+#define V_OUT(n) ws2[8*n]
+#define V_IN_ST 1
+#define TRANSPOSE_EXTRA_IDCT_WS 64
+#else
+#define V_OUT(n) ws[8*n]
+#define V_IN_ST 8
+#define TRANSPOSE_EXTRA_IDCT_WS 0
+#endif
+#define IDCT_WS_SIZE (64 + TRANSPOSE_EXTRA_IDCT_WS + COLOR_EXTRA_IDCT_WS)
 
 /* This can't be in jpeg_load.h because plugin.h includes it, and it conflicts
  * with the definition in jpeg_decoder.h
@@ -259,7 +276,7 @@
 */
 
 /* horizontal-pass 1-point IDCT */
-static void idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
+static void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
 {
     for (; ws < end; ws += 8)
     {
@@ -269,19 +286,19 @@
 }
 
 /* vertical-pass 2-point IDCT */
-static void idct2v(int16_t *ws, int16_t *end)
+static void jpeg_idct2v(int16_t *ws, int16_t *end)
 {
     for (; ws < end; ws++)
     {
-        int tmp1 = ws[0];
-        int tmp2 = ws[8];
-        ws[0] = tmp1 + tmp2;
-        ws[8] = tmp1 - tmp2;
+        int tmp1 = ws[0*8];
+        int tmp2 = ws[1*8];
+        ws[0*8] = tmp1 + tmp2;
+        ws[1*8] = tmp1 - tmp2;
     }
 }
 
 /* horizontal-pass 2-point IDCT */
-static void idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
+static void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
 {
     for (; ws < end; ws += 8, out += rowstep)
     {
@@ -295,69 +312,12 @@
     }
 }
 
+#ifndef CPU_ARM
 /* vertical-pass 4-point IDCT */
-static void idct4v(int16_t *ws, int16_t *end)
+static void jpeg_idct4v(int16_t *ws, int16_t *end)
 {
     for (; ws < end; ws++)
     {
-#if defined(CPU_ARM)
-        int t0, t1, t2, t3, t4;
-#if ARM_ARCH <= 4
-        int t5;
-#endif
-        asm volatile(
-            "ldrsh %[t4], [%[ws]]\n\t"         /* t4 = tmp0 (ws[8*0]) */
-            "ldrsh %[t1], [%[ws], #32]\n\t"    /* t1 = tmp2 (ws[8*2]) */
-            "ldrsh %[t2], [%[ws], #16]\n\t"    /* t2 = z2 (ws[8*1]) */
-            "add   %[t0], %[t4], %[t1]\n\t"    /* t0 = tmp10 >> 2
-                                                  (tmp0 + tmp2) */
-            "sub   %[t1], %[t4], %[t1]\n\t"    /* t1 = tmp12 >> 2
-                                                  (tmp0 - tmp2) */
-            "ldrsh %[t3], [%[ws], #48]\n\t"    /* t3 = z3 (ws[8*3] */
-            "add   %[t4], %[t2], %[t3]\n\t"    /* t4 = z2 + z3 */
-#if ARM_ARCH > 4
-            "smulbb %[t4], %[c1], %[t4]\n\t"
-            "add    %[t4], %[t4], #1024\n\t"   /* t4 = z1 */
-            "smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
-            "smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
-            "mov   %[t3], %[t3], asr #11\n\t"  /* t3 = tmp0 */
-            "mov   %[t2], %[t2], asr #11\n\t"  /* t2 = tmp2 */
-#else
-            "add   %[t5], %[t4], %[t4], lsl #3\n\t"
-            "rsb   %[t4], %[t4], %[t5], lsl #4\n\t"
-            "rsb   %[t4], %[t4], %[t4], lsl #5\n\t"
-            "add   %[t4], %[t4], #1024\n\t" /*z1*/
-            "mla   %[t3], %[c2], %[t3], %[t4]\n\t"
-            "mla   %[t2], %[c3], %[t2], %[t4]\n\t"
-            "mov   %[t3], %[t3], asr #11\n\t"  /* t3 = tmp0 */
-            "mov   %[t2], %[t2], asr #11\n\t"  /* t2 = tmp2 */
-#endif
-            "add   %[t4], %[t2], %[t0], lsl #2\n\t" /* t4 = tmp10 + tmp2 */
-            "rsb   %[t0], %[t2], %[t0], lsl #2\n\t" /* t0 = tmp10 - tmp2 */
-            "add   %[t2], %[t3], %[t1], lsl #2\n\t" /* t2 = tmp12 + tmp0 */
-            "rsb   %[t3], %[t3], %[t1], lsl #2\n\t" /* t3 = tmp12 - tmp0 */
-            "strh  %[t4], [%[ws]]\n\t"
-            "strh  %[t0], [%[ws], #48]\n\t"
-            "strh  %[t2], [%[ws], #16]\n\t"
-            "strh  %[t3], [%[ws], #32]\n\t"
-            : [t0] "=&r" (t0),
-              [t1] "=&r" (t1),
-              [t2] "=&r" (t2),
-              [t3] "=&r" (t3),
-              [t4] "=&r" (t4)
-#if ARM_ARCH <= 4
-              ,[t5] "=&r" (t5)
-#endif
-            : [ws] "r" (ws),
-#if ARM_ARCH > 4
-              [c1]   "r" (FIX_0_541196100),
-              [c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
-#else
-              [c2] "r" (-FIX_1_847759065),
-              [c3]  "r" (FIX_0_765366865)
-#endif
-        );
-#else
         int tmp0, tmp2, tmp10, tmp12;
         int z1, z2, z3;
         /* Even part */
@@ -382,93 +342,18 @@
             CONST_BITS-PASS1_BITS);
 
         /* Final output stage */
-
         ws[8*0] = (int) (tmp10 + tmp2);
         ws[8*3] = (int) (tmp10 - tmp2);
         ws[8*1] = (int) (tmp12 + tmp0);
         ws[8*2] = (int) (tmp12 - tmp0);
-#endif
     }
 }
 
 /* horizontal-pass 4-point IDCT */
-static void idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
+static void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
 {
     for (; ws < end; out += rowstep, ws += 8)
     {
-#if defined(CPU_ARM)
-        int t0, t1, t2, t3, t4;
-#if ARM_ARCH <= 4
-        int t5;
-#endif
-        asm volatile(
-            "ldrsh %[t4], [%[ws]]\n\t"               /* t4 = tmp0 (ws[0]) */
-            "ldrsh %[t1], [%[ws], #4]\n\t"           /* t1 = tmp2 (ws[2]) */
-            "add   %[t4], %[t4], #16\n\t"            /* add rounding to DC */
-            "add   %[t4], %[t4], #4096\n\t"          /* pre-add offset */
-            "ldrsh %[t2], [%[ws], #2]\n\t"           /* t2 = z2 (ws[1]) */
-            "add   %[t0], %[t4], %[t1]\n\t"          /* t0 = tmp10 >> 13
-                                                        (tmp0 + tmp2) */
-            "sub   %[t1], %[t4], %[t1]\n\t"          /* t1 = tmp12 >> 13
-                                                        (tmp0 - tmp2) */
-            "ldrsh %[t3], [%[ws], #6]\n\t"           /* t3 = z3 (ws[3] */
-            "add   %[t4], %[t2], %[t3]\n\t"          /* t4 = z2 + z3 */
-#if ARM_ARCH > 4
-            "smulbb %[t4], %[c1], %[t4]\n\t"
-            "smlatb %[t3], %[c2c3], %[t3], %[t4]\n\t"
-            "smlabb %[t2], %[c2c3], %[t2], %[t4]\n\t"
-#else
-            "add   %[t5], %[t4], %[t4], lsl #3\n\t"
-            "rsb   %[t4], %[t4], %[t5], lsl #4\n\t"
-            "rsb   %[t4], %[t4], %[t4], lsl #5\n\t"  /* t4 = z1 */
-            "mla   %[t3], %[c2], %[t3], %[t4]\n\t"
-            "mla   %[t2], %[c3], %[t2], %[t4]\n\t"
-#endif
-            "add   %[t4], %[t2], %[t0], lsl #13\n\t" /* t4 = tmp10 + tmp2 */
-            "rsb   %[t0], %[t2], %[t0], lsl #13\n\t" /* t0 = tmp10 - tmp2 */
-            "add   %[t2], %[t3], %[t1], lsl #13\n\t" /* t2 = tmp12 + tmp0 */
-            "rsb   %[t3], %[t3], %[t1], lsl #13\n\t" /* t3 = tmp12 - tmp0 */
-            "mov   %[t4], %[t4], asr #18\n\t"        /* descale results */
-            "mov   %[t0], %[t0], asr #18\n\t"
-            "mov   %[t2], %[t2], asr #18\n\t"
-            "mov   %[t3], %[t3], asr #18\n\t"
-            "cmp   %[t4], #255\n\t"                  /* range limit results */
-            "mvnhi %[t4], %[t4], asr #31\n\t"
-            "cmp   %[t0], #255\n\t"
-            "mvnhi %[t0], %[t0], asr #31\n\t"
-            "cmp   %[t2], #255\n\t"
-            "mvnhi %[t2], %[t2], asr #31\n\t"
-            "cmp   %[t3], #255\n\t"
-            "mvnhi %[t3], %[t3], asr #31\n\t"
-            "cmp   %[t4], #255\n\t"
-            "mvnhi %[t4], %[t4], asr #31\n\t"
-            "strb  %[t4], [%[out]]\n\t"
-            "strb  %[t0], [%[out], %[o3]]\n\t"
-            "strb  %[t2], [%[out], %[o1]]\n\t"
-            "strb  %[t3], [%[out], %[o2]]\n\t"
-            : [t0] "=&r" (t0),
-              [t1] "=&r" (t1),
-              [t2] "=&r" (t2),
-              [t3] "=&r" (t3),
-              [t4] "=&r" (t4)
-#if ARM_ARCH <= 4
-
-              ,[t5] "=&r" (t5)
-#endif
-            : [ws]  "r" (ws),
-              [out] "r" (out),
-              [o1]  "i" (JPEG_PIX_SZ),
-              [o2]  "i" (JPEG_PIX_SZ*2),
-              [o3]  "i" (JPEG_PIX_SZ*3),
-#if ARM_ARCH > 4
-              [c1]   "r" (FIX_0_541196100),
-              [c2c3] "r" (((-FIX_1_847759065)<<16)|FIX_0_765366865)
-#else
-              [c2] "r" (-FIX_1_847759065),
-              [c3] "r" (FIX_0_765366865)
-#endif
-        );
-#else
         int tmp0, tmp2, tmp10, tmp12;
         int z1, z2, z3;
         /* Even part */
@@ -500,18 +385,27 @@
             DS_OUT));
         out[JPEG_PIX_SZ*2] = range_limit((int) RIGHT_SHIFT(tmp12 - tmp0,
             DS_OUT));
-#endif
     }
 }
+#else
+extern void jpeg_idct4v(int16_t *ws, int16_t *end);
+extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
+#endif
 
 /* vertical-pass 8-point IDCT */
-static void idct8v(int16_t *ws, int16_t *end)
+static void jpeg_idct8v(int16_t *ws, int16_t *end)
 {
     long tmp0, tmp1, tmp2, tmp3;
     long tmp10, tmp11, tmp12, tmp13;
     long z1, z2, z3, z4, z5;
+#ifdef JPEG_IDCT_TRANSPOSE
+    int16_t *ws2 = ws + 64;
+    for (; ws < end; ws += 8, ws2++)
+    {
+#else
     for (; ws < end; ws++)
     {
+#endif
     /* Due to quantization, we will usually find that many of the input
     * coefficients are zero, especially the AC terms.  We can exploit this
     * by short-circuiting the IDCT calculation for any column in which all
@@ -520,30 +414,30 @@
     * With typical images and quantization tables, half or more of the
     * column DCT calculations can be simplified this way.
     */
-        if ((ws[8*1] | ws[8*2] | ws[8*3]
-           | ws[8*4] | ws[8*5] | ws[8*6] | ws[8*7]) == 0)
+        if ((ws[V_IN_ST*1] | ws[V_IN_ST*2] | ws[V_IN_ST*3]
+           | ws[V_IN_ST*4] | ws[V_IN_ST*5] | ws[V_IN_ST*6] | ws[V_IN_ST*7]) == 0)
         {
             /* AC terms all zero */
-            int dcval = ws[8*0] << PASS1_BITS;
+            int dcval = ws[V_IN_ST*0] << PASS1_BITS;
 
-            ws[8*0] = ws[8*1] = ws[8*2] = ws[8*3] = ws[8*4]
-                       = ws[8*5] = ws[8*6] = ws[8*7] = dcval;
+            V_OUT(0) = V_OUT(1) = V_OUT(2) = V_OUT(3) = V_OUT(4) = V_OUT(5) =
+                       V_OUT(6) = V_OUT(7) = dcval;
             continue;
         }
 
         /* Even part: reverse the even part of the forward DCT. */
         /* The rotator is sqrt(2)*c(-6). */
 
-        z2 = ws[8*2];
-        z3 = ws[8*6];
+        z2 = ws[V_IN_ST*2];
+        z3 = ws[V_IN_ST*6];
 
         z1 = MULTIPLY16(z2 + z3, FIX_0_541196100);
         tmp2 = z1 + MULTIPLY16(z3, - FIX_1_847759065);
         tmp3 = z1 + MULTIPLY16(z2, FIX_0_765366865);
 
-        z2 = ws[8*0] << CONST_BITS;
+        z2 = ws[V_IN_ST*0] << CONST_BITS;
         z2 += ONE << (CONST_BITS - PASS1_BITS - 1);
-        z3 = ws[8*4] << CONST_BITS;
+        z3 = ws[V_IN_ST*4] << CONST_BITS;
 
         tmp0 = (z2 + z3);
         tmp1 = (z2 - z3);
@@ -556,10 +450,10 @@
         /* Odd part per figure 8; the matrix is unitary and hence its
            transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively. */
 
-        tmp0 = ws[8*7];
-        tmp1 = ws[8*5];
-        tmp2 = ws[8*3];
-        tmp3 = ws[8*1];
+        tmp0 = ws[V_IN_ST*7];
+        tmp1 = ws[V_IN_ST*5];
+        tmp2 = ws[V_IN_ST*3];
+        tmp3 = ws[V_IN_ST*1];
 
         z1 = tmp0 + tmp3;
         z2 = tmp1 + tmp2;
@@ -586,19 +480,19 @@
 
         /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-        ws[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
-        ws[8*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
-        ws[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
-        ws[8*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
-        ws[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
-        ws[8*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
-        ws[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
-        ws[8*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+        V_OUT(0) = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+        V_OUT(7) = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+        V_OUT(1) = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+        V_OUT(6) = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+        V_OUT(2) = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+        V_OUT(5) = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+        V_OUT(3) = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+        V_OUT(4) = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
     }
 }
 
 /* horizontal-pass 8-point IDCT */
-static void idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
+static void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
 {
     long tmp0, tmp1, tmp2, tmp3;
     long tmp10, tmp11, tmp12, tmp13;
@@ -709,20 +603,26 @@
 
 #ifdef HAVE_LCD_COLOR
 /* vertical-pass 16-point IDCT */
-static void idct16v(int16_t *ws, int16_t *end)
+static void jpeg_idct16v(int16_t *ws, int16_t *end)
 {
     long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
     long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
     long z1, z2, z3, z4;
+#ifdef JPEG_IDCT_TRANSPOSE
+    int16_t *ws2 = ws + 64;
+    for (; ws < end; ws += 8, ws2++)
+    {
+#else
     for (; ws < end; ws++)
     {
+#endif
         /* Even part */
 
-        tmp0 = ws[8*0] << CONST_BITS;
+        tmp0 = ws[V_IN_ST*0] << CONST_BITS;
         /* Add fudge factor here for final descale. */
         tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
 
-        z1 = ws[8*4];
+        z1 = ws[V_IN_ST*4];
         tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
         tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
 
@@ -731,8 +631,8 @@
         tmp12 = tmp0 + tmp2;
         tmp13 = tmp0 - tmp2;
 
-        z1 = ws[8*2];
-        z2 = ws[8*6];
+        z1 = ws[V_IN_ST*2];
+        z2 = ws[V_IN_ST*6];
         z3 = z1 - z2;
         z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
         z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
@@ -757,10 +657,10 @@
 
         /* Odd part */
 
-        z1 = ws[8*1];
-        z2 = ws[8*3];
-        z3 = ws[8*5];
-        z4 = ws[8*7];
+        z1 = ws[V_IN_ST*1];
+        z2 = ws[V_IN_ST*3];
+        z3 = ws[V_IN_ST*5];
+        z4 = ws[V_IN_ST*7];
 
         tmp11 = z1 + z3;
 
@@ -795,27 +695,27 @@
         tmp11 += z2;
 
         /* Final output stage */
-        ws[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
-        ws[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
-        ws[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
-        ws[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
-        ws[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
-        ws[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
-        ws[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
-        ws[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
-        ws[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
-        ws[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
-        ws[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
-        ws[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
-        ws[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
-        ws[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
-        ws[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
-        ws[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+        V_OUT(0)  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
+        V_OUT(15) = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
+        V_OUT(1)  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
+        V_OUT(14) = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
+        V_OUT(2)  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
+        V_OUT(13) = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
+        V_OUT(3)  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
+        V_OUT(12) = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
+        V_OUT(4)  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
+        V_OUT(11) = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
+        V_OUT(5)  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
+        V_OUT(10) = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
+        V_OUT(6)  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
+        V_OUT(9)  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
+        V_OUT(7)  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
+        V_OUT(8)  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
     }
 }
 
 /* horizontal-pass 16-point IDCT */
-static void idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
+static void jpeg_idct16h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep)
 {
     long tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
     long tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -946,12 +846,12 @@
 };
 
 struct idct_entry idct_tbl[] = {
-    { PASS1_BITS, NULL, idct1h },
-    { PASS1_BITS, idct2v, idct2h },
-    { 0, idct4v, idct4h },
-    { 0, idct8v, idct8h },
+    { PASS1_BITS, NULL, jpeg_idct1h },
+    { PASS1_BITS, jpeg_idct2v, jpeg_idct2h },
+    { 0, jpeg_idct4v, jpeg_idct4h },
+    { 0, jpeg_idct8v, jpeg_idct8h },
 #ifdef HAVE_LCD_COLOR
-    { 0, idct16v, idct16h },
+    { 0, jpeg_idct16v, jpeg_idct16h },
 #endif
 };
 
@@ -1468,21 +1368,27 @@
 }
 
 
-/* zag[i] is the natural-order position of the i'th element of zigzag order.
- * If the incoming data is corrupted, decode_mcu could attempt to
- * reference values beyond the end of the array.  To avoid a wild store,
- * we put some extra zeroes after the real entries.
- */
+/* zag[i] is the natural-order position of the i'th element of zigzag order. */
 static const unsigned char zag[] =
 {
-     0,  1,  8, 16,  9,  2,  3, 10,
-    17, 24, 32, 25, 18, 11,  4,  5,
-    12, 19, 26, 33, 40, 48, 41, 34,
-    27, 20, 13,  6,  7, 14, 21, 28,
-    35, 42, 49, 56, 57, 50, 43, 36,
-    29, 22, 15, 23, 30, 37, 44, 51,
-    58, 59, 52, 45, 38, 31, 39, 46,
-    53, 60, 61, 54, 47, 55, 62, 63,
+#ifdef JPEG_IDCT_TRANSPOSE
+      0,   8,   1,   2,   9,  16,  24,  17,
+     10,   3,   4,  11,  18,  25,  32,  40,
+     33,  26,  19,  12,   5,   6,  13,  20,
+     27,  34,  41,  48,  56,  49,  42,  35,
+     28,  21,  14,   7,  15,  22,  29,  36,
+     43,  50,  57,  58,  51,  44,  37,  30,
+     23,  31,  38,  45,  52,  59,  60,  53,
+     46,  39,  47,  54,  61,  62,  55,  63,
+#endif
+      0,   1,   8,  16,   9,   2,   3,  10,
+     17,  24,  32,  25,  18,  11,   4,   5,
+     12,  19,  26,  33,  40,  48,  41,  34,
+     27,  20,  13,   6,   7,  14,  21,  28,
+     35,  42,  49,  56,  57,  50,  43,  36,
+     29,  22,  15,  23,  30,  37,  44,  51,
+     58,  59,  52,  45,  38,  31,  39,  46,
+     53,  60,  61,  54,  47,  55,  62,  63,
 };
 
 /* zig[i] is the the zig-zag order position of the i'th element of natural
@@ -1898,17 +1804,20 @@
         store_offs[p_jpeg->store_pos[1]] = JPEG_PIX_SZ << p_jpeg->h_scale[0];
         store_offs[p_jpeg->store_pos[2]] = b_width << p_jpeg->v_scale[0];
         store_offs[p_jpeg->store_pos[3]] = store_offs[1] + store_offs[2];
-
-        int16_t block[128]; /* decoded DCT coefficients */
+        /* decoded DCT coefficients */
+        int16_t block[IDCT_WS_SIZE] __attribute__((aligned(8)));
         for (x = 0; x < p_jpeg->x_mbl; x++)
         {
             int blkn;
             for (blkn = 0; blkn < p_jpeg->blocks; blkn++)
             {
-                int k = 1; /* coefficient index */
-                int s, r; /* huffman values */
                 int ci = p_jpeg->mcu_membership[blkn]; /* component index */
                 int ti = p_jpeg->tab_membership[blkn]; /* table index */
+#ifdef JPEG_IDCT_TRANSPOSE
+                bool transpose = p_jpeg->v_scale[!!ci] > 2;
+#endif
+                int k = 1; /* coefficient index */
+                int s, r; /* huffman values */
                 struct derived_tbl* dctbl = &p_jpeg->dc_derived_tbls[ti];
                 struct derived_tbl* actbl = &p_jpeg->ac_derived_tbls[ti];
 
@@ -1948,7 +1857,11 @@
                             r = get_bits(p_jpeg, s);
                             r = HUFF_EXTEND(r, s);
                             r = MULTIPLY16(r, p_jpeg->quanttable[!!ci][k]);
+#ifdef JPEG_IDCT_TRANSPOSE
+                            block[zag[transpose ? k : k + 64]] = r ;
+#else
                             block[zag[k]] = r ;
+#endif
                         }
                         else
                         {
@@ -1988,10 +1901,19 @@
                     int idct_rows = BIT_N(p_jpeg->v_scale[!!ci]);
                     unsigned char *b_out = out + (ci ? ci : store_offs[blkn]);
                     if (idct_tbl[p_jpeg->v_scale[!!ci]].v_idct)
+#ifdef JPEG_IDCT_TRANSPOSE
+                        idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
+                            transpose ? block + 8 * idct_cols
+                                      : block + idct_cols);
+                    uint16_t * h_block = transpose ? block + 64 : block;
+                    idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(h_block, b_out,
+                        h_block + idct_rows * 8, b_width);
+#else
                         idct_tbl[p_jpeg->v_scale[!!ci]].v_idct(block,
                             block + idct_cols);
                     idct_tbl[p_jpeg->h_scale[!!ci]].h_idct(block, b_out,
                         block + idct_rows * 8, b_width);
+#endif
                 }
             } /* for blkn */
             /* don't starve other threads while an MCU row decodes */
@@ -2048,7 +1970,6 @@
 {
     int fd, ret;
     fd = open(filename, O_RDONLY);
-
     JDEBUGF("read_jpeg_file: filename: %s buffer len: %d cformat: %p\n",
         filename, maxsize, cformat);
     /* Exit if file opening failed */
@@ -2181,14 +2102,22 @@
     int decode_h = BIT_N(p_jpeg->v_scale[0]) - 1;
     src_dim.width = (p_jpeg->x_size << p_jpeg->h_scale[0]) >> 3;
     src_dim.height = (p_jpeg->y_size << p_jpeg->v_scale[0]) >> 3;
-    p_jpeg->zero_need[0] = (decode_h << 3) + decode_w;
-    p_jpeg->k_need[0] = zig[p_jpeg->zero_need[0]];
+#ifdef JPEG_IDCT_TRANSPOSE
+    if (p_jpeg->v_scale[0] > 2)
+        p_jpeg->zero_need[0] = (decode_w << 3) + decode_h;
+    else
+#endif
+        p_jpeg->zero_need[0] = (decode_h << 3) + decode_w;
+    p_jpeg->k_need[0] = zig[(decode_h << 3) + decode_w];
     JDEBUGF("need luma components to %d\n", p_jpeg->k_need[0]);
 #ifdef HAVE_LCD_COLOR
     decode_w = BIT_N(MIN(p_jpeg->h_scale[1],3)) - 1;
     decode_h = BIT_N(MIN(p_jpeg->v_scale[1],3)) - 1;
-    p_jpeg->zero_need[1] = (decode_h << 3) + decode_w;
-    p_jpeg->k_need[1] =  zig[p_jpeg->zero_need[1]];
+    if (p_jpeg->v_scale[1] > 2)
+        p_jpeg->zero_need[1] = (decode_w << 3) + decode_h;
+    else
+        p_jpeg->zero_need[1] = (decode_h << 3) + decode_w;
+    p_jpeg->k_need[1] = zig[(decode_h << 3) + decode_w];
     JDEBUGF("need chroma components to %d\n", p_jpeg->k_need[1]);
 #endif
     if (cformat)