ARM assembly 8-point IDCT, both passes. No ARMv5/ARMv6-specific optimizations yet, aside from using usat to clamp the final output on ARMv6.


git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21526 a1c6a512-1295-4272-9138-f99709370657
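
For reference, here is a rough C sketch of the fixed-point math the new vertical
pass implements. It uses the standard 13-bit libjpeg "islow" constants, which are
the same values the assembly loads; the function and variable names are
illustrative only and are not taken from jpeg_load.c. The horizontal pass runs
the identical butterfly, but biases the DC term by 4112 on input and finishes
with an 18-bit descale and a clamp to 0..255 (usat on ARMv6).

#include <stdint.h>

/* Sketch of one column of the vertical pass: d[] holds the eight
 * coefficients of the column consecutively (as the assembly loads them),
 * out[] is written with a stride of eight int16_t.  Arithmetic right
 * shift on negative values is assumed, as in libjpeg. */
static void idct8v_column(const int16_t *d, int16_t *out)
{
    /* Even part: d0, d2, d4, d6 */
    long z2 = d[2], z3 = d[6];
    long z1 = (z2 + z3) * 4433;             /*  FIX(0.541196100) */
    long tmp2 = z1 - z3 * 15137;            /*  FIX(1.847759065) */
    long tmp3 = z1 + z2 * 6270;             /*  FIX(0.765366865) */
    long z4 = ((long)d[0] << 13) + 1024;    /*  DC, plus rounding for >>11 */
    long z5 = (long)d[4] << 13;
    long tmp0  = z4 + z5,     tmp1  = z4 - z5;
    long tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
    long tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;

    /* Odd part: d1, d3, d5, d7 */
    long t0 = d[7], t1 = d[5], t2 = d[3], t3 = d[1];
    z1 = (t0 + t3) * -7373;                 /* -FIX(0.899976223) */
    z2 = (t1 + t2) * -20995;                /* -FIX(2.562915447) */
    z5 = ((t0 + t2) + (t1 + t3)) * 9633;    /*  FIX(1.175875602) */
    z3 = z5 - (t0 + t2) * 16069;            /* -FIX(1.961570560) */
    z4 = z5 - (t1 + t3) * 3196;             /* -FIX(0.390180644) */
    t0 = t0 * 2446  + z1 + z3;              /*  FIX(0.298631336) */
    t1 = t1 * 16819 + z2 + z4;              /*  FIX(2.053119869) */
    t2 = t2 * 25172 + z2 + z3;              /*  FIX(3.072711026) */
    t3 = t3 * 12299 + z1 + z4;              /*  FIX(1.501321110) */

    /* Descale by CONST_BITS - PASS1_BITS = 11 and write the column */
    out[0*8] = (tmp10 + t3) >> 11;  out[7*8] = (tmp10 - t3) >> 11;
    out[1*8] = (tmp11 + t2) >> 11;  out[6*8] = (tmp11 - t2) >> 11;
    out[2*8] = (tmp12 + t1) >> 11;  out[5*8] = (tmp12 - t1) >> 11;
    out[3*8] = (tmp13 + t0) >> 11;  out[4*8] = (tmp13 - t0) >> 11;
}
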
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
index b9c94e5..01b08c4 100644
--- a/apps/recorder/jpeg_idct_arm.S
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -35,6 +35,10 @@
     .type   jpeg_idct4v, %function
     .global jpeg_idct4h
     .type   jpeg_idct4h, %function
+    .global jpeg_idct8v
+    .type   jpeg_idct8v, %function
+    .global jpeg_idct8h
+    .type   jpeg_idct8h, %function
 
 jpeg_idct1h:
 /* In the common case of one pass through the loop, the extra add should be
@@ -414,3 +418,264 @@
     ldmia sp!, { r4-r9, pc }
 #endif
     .size jpeg_idct4h, .-jpeg_idct4h
+
+jpeg_idct8v:
+    stmdb  sp!, { r4-r11, lr }
+    add    r2,  r0,  #128
+1:
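+    /* Load one column: d0..d7, two coefficients per word (d0 in the low
+       half of r4).  If all AC terms are zero, just store the DC value,
+       scaled up by PASS1_BITS (<< 2), down the whole output column;
+       otherwise branch to the full 8-point IDCT at 2:. */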
+    ldmia  r0!, { r4-r7 }
+    mov    r8,  r4,  lsl #16
+    orrs   r9,  r6,  r7
+    orreqs r9,  r5,  r4, lsr #16
+    bne    2f
+    mov    r8,  r8,  asr #14
+    strh   r8,  [r2]
+    strh   r8,  [r2, #16]
+    strh   r8,  [r2, #32]
+    strh   r8,  [r2, #48]
+    strh   r8,  [r2, #64]
+    strh   r8,  [r2, #80]
+    strh   r8,  [r2, #96]
+    strh   r8,  [r2, #112]
+    cmp    r0,  r1
+    add    r2,  r2,  #2
+    bcc    1b
+    ldmia  sp!, { r4-r11, pc }
+2:
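+    /* Full IDCT: even part first (d0, d2, d4, d6 -> tmp10..tmp13), then
+       the odd part (d1, d3, d5, d7), using the 13-bit libjpeg constants;
+       results are descaled by CONST_BITS - PASS1_BITS = 11. */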
+    ldr    r14, =4433
+    ldr    r12, =-15137
+    mov    r10, r5,  lsl #16
+    mov    r11, r7,  lsl #16
+    mov    r10, r10, asr #16        /* r10 = z2 = d2 */
+    mov    r11, r11, asr #16        /* r11 = z3 = d6 */
+    add    r8,  r8,  #8192
+    add    r9,  r10, r11
+    mov    r8,  r8,  asr #3         /* r8  = z4 = (d0 << 13) + 1024 */
+    mul    r9,  r14, r9             /* r9  = z1 = (z2 + z3) * 4433 */
+    ldr    r14, =6270
+    mla    r11, r12, r11, r9         /* r11 = tmp2 = z1 - z3 * 15137 */
+    mla    r10, r14, r10, r9         /* r10 = tmp3 = z1 + z2 * 6270 */
+    mov    r9,  r6,  lsl #16         /* r9  = z5 << 3 = d4 << 16 */
+    add    r12, r8,  r9,  asr #3     /* r12 = tmp0 = z4 + z5 */
+    sub    r14, r8,  r9,  asr #3     /* r14 = tmp1 = z4 - z5 */
+    add    r8,  r12, r10             /* r8  = tmp10 = tmp0 + tmp3 */
+    sub    r9,  r12, r10             /* r9  = tmp13 = tmp0 - tmp3 */
+    add    r10, r14, r11             /* r10 = tmp11 = tmp1 + tmp2 */
+    sub    r11, r14, r11             /* r11 = tmp12 = tmp1 - tmp2 */
+    stmdb  sp,  { r8-r11 }           /* tmp10 tmp13 tmp11 tmp12 */
+    mov    r4,  r4,  asr #16         /* r4  = tmp3 = d1 */
+    mov    r5,  r5,  asr #16         /* r5  = tmp2 = d3 */
+    mov    r6,  r6,  asr #16         /* r6  = tmp1 = d5 */
+    mov    r7,  r7,  asr #16         /* r7  = tmp0 = d7 */
+    ldr    r10, =9633
+    ldr    r11, =-16069
+    add    r12, r5,  r7              /* r12 = z3 = tmp0 + tmp2 */
+    add    r14, r4,  r6              /* r14 = z4 = tmp1 + tmp3 */
+    add    r9,  r12, r14             /* r9  = z3 + z4 */
+    mul    r9,  r10, r9              /* r9  = z5 = (z3 + z4) * 9633 */
+    ldr    r10, =-3196
+    mla    r12, r11, r12, r9         /* r12 = z3 = z5 - z3 * 16069 */
+    ldr    r11, =-7373
+    mla    r14, r10, r14, r9         /* r14 = z4 = z5 - z4 * 3196 */
+    ldr    r10, =2446
+    add    r9,  r4,  r7              /* r9  = tmp0 + tmp3 */
+    mla    r8,  r11, r9,  r12        /* r8  = z1 + z3 */
+    mla    r9,  r11, r9,  r14        /* r9  = z1 + z4 */
+    ldr    r11, =12299
+    mla    r7,  r10, r7,  r8         /* r7  = tmp0 = z1 + z3 + tmp0 * 2446 */
+    ldr    r10, =-20995
+    mla    r4,  r11, r4,  r9         /* r4  = tmp3 = z1 + z4 + tmp3 * 12299 */
+    ldr    r11, =25172
+    add    r9,  r5,  r6              /* r9  = tmp1 + tmp2 */
+    mla    r12, r10, r9, r12         /* r12 = z2 + z3 */
+    mla    r14, r10, r9, r14         /* r14 = z2 + z4 */
+    ldr    r10, =16819
+    mla    r5,  r11, r5, r12         /* r5  = tmp2 = z2 + z3 + tmp2 * 25172 */
+    mla    r6,  r10, r6, r14         /* r6  = tmp1 = z2 + z4 + tmp1 * 16819 */
+    ldmdb  sp,  { r8-r11 }
+    add    r12, r8,  r4              /* o0 */
+    sub    r14, r8,  r4              /* o7 */
+    add    r8,  r9,  r7              /* o3 */
+    sub    r9,  r9,  r7              /* o4 */
+    add    r4,  r10, r5              /* o1 */
+    sub    r5,  r10, r5              /* o6 */
+    add    r10, r11, r6              /* o2 */
+    sub    r11, r11, r6              /* o5 */
+    /* output in order: r12 r4  r10 r8  r9  r11 r5  r14 */
+    mov    r12, r12, asr #11
+    mov    r4,  r4,  asr #11
+    mov    r10, r10, asr #11
+    mov    r8,  r8,  asr #11
+    mov    r9,  r9,  asr #11
+    mov    r11, r11, asr #11
+    mov    r5,  r5,  asr #11
+    mov    r14, r14, asr #11
+    strh   r12, [r2]
+    strh   r4,  [r2, #16]
+    strh   r10, [r2, #32]
+    strh   r8,  [r2, #48]
+    strh   r9,  [r2, #64]
+    strh   r11, [r2, #80]
+    strh   r5,  [r2, #96]
+    strh   r14, [r2, #112]
+    cmp    r0,  r1
+    add    r2,  r2,  #2
+    bcc    1b
+    ldmia  sp!, { r4-r11, pc }
+    .size jpeg_idct8v, .-jpeg_idct8v
+
+jpeg_idct8h:
+    stmdb  sp!, { r4-r11, lr }
+1:
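+    /* Load one row of the workspace.  If all AC terms are zero, descale
+       the biased DC value (the 4112 adds the +128 level shift and the
+       rounding for >> 5), clamp it to 0..255 and store it across the row
+       (pixel stride 4 bytes with HAVE_LCD_COLOR, 1 byte otherwise).
+       Non-zero AC terms branch to the full IDCT at 2:. */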
+    ldmia  r0!, { r4-r7 }
+    ldr    r14, =4112
+    mov    r8,  r4,  lsl #16
+    add    r8,  r8,  r14, lsl #16
+    orrs   r9,  r6,  r7
+    orreqs r9,  r5,  r4, lsr #16
+    bne    2f
+    mov    r8,  r8,  asr #21
+    cmp    r8,  #255
+    mvnhi  r8,  r8,  asr #31
+#ifdef HAVE_LCD_COLOR
+    strb   r8,  [r1]
+    strb   r8,  [r1, #4]
+    strb   r8,  [r1, #8]
+    strb   r8,  [r1, #12]
+    strb   r8,  [r1, #16]
+    strb   r8,  [r1, #20]
+    strb   r8,  [r1, #24]
+    strb   r8,  [r1, #28]
+#else
+    strb   r8,  [r1]
+    strb   r8,  [r1, #1]
+    strb   r8,  [r1, #2]
+    strb   r8,  [r1, #3]
+    strb   r8,  [r1, #4]
+    strb   r8,  [r1, #5]
+    strb   r8,  [r1, #6]
+    strb   r8,  [r1, #7]
+#endif
+    add    r1,  r1,  r3
+    cmp    r0,  r2
+    bcc    1b
+    ldmia  sp!, { r4-r11, pc }
+2:
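+    /* Full IDCT: same even/odd structure as jpeg_idct8v above; only the
+       input bias (d0 + 4112) and the final descale by 18 bits with a
+       clamp to 0..255 differ. */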
+    ldr    r14, =4433
+    ldr    r12, =-15137
+    mov    r10, r5,  lsl #16
+    mov    r11, r7,  lsl #16
+    mov    r10, r10, asr #16        /* r10 = z2 = d2 */
+    mov    r11, r11, asr #16        /* r11 = z3 = d6 */
+    add    r9,  r10, r11
+    mov    r8,  r8,  asr #3         /* r8  = z4 = (d0 + 4112) << 13 */
+    mul    r9,  r14, r9             /* r9  = z1 = (z2 + z3) * 4433 */
+    ldr    r14, =6270
+    mla    r11, r12, r11, r9         /* r11 = tmp2 = z1 - z3 * 15137 */
+    mla    r10, r14, r10, r9         /* r10 = tmp3 = z1 + z2 * 6270 */
+    mov    r9,  r6,  lsl #16         /* r9  = z5 << 3 = d4 << 16 */
+    add    r12, r8,  r9,  asr #3     /* r12 = tmp0 = z4 + z5 */
+    sub    r14, r8,  r9,  asr #3     /* r14 = tmp1 = z4 - z5 */
+    add    r8,  r12, r10             /* r8  = tmp10 = tmp0 + tmp3 */
+    sub    r9,  r12, r10             /* r9  = tmp13 = tmp0 - tmp3 */
+    add    r10, r14, r11             /* r10 = tmp11 = tmp1 + tmp2 */
+    sub    r11, r14, r11             /* r11 = tmp12 = tmp1 - tmp2 */
+    stmdb  sp,  { r8-r11 }           /* tmp10 tmp13 tmp11 tmp12 */
+    mov    r4,  r4,  asr #16         /* r4  = tmp3 = d1 */
+    mov    r5,  r5,  asr #16         /* r5  = tmp2 = d3 */
+    mov    r6,  r6,  asr #16         /* r6  = tmp1 = d5 */
+    mov    r7,  r7,  asr #16         /* r7  = tmp0 = d7 */
+    ldr    r10, =9633
+    ldr    r11, =-16069
+    add    r12, r5,  r7              /* r12 = z3 = tmp0 + tmp2 */
+    add    r14, r4,  r6              /* r14 = z4 = tmp1 + tmp3 */
+    add    r9,  r12, r14             /* r9  = z3 + z4 */
+    mul    r9,  r10, r9              /* r9  = z5 = (z3 + z4) * 9633 */
+    ldr    r10, =-3196
+    mla    r12, r11, r12, r9         /* r12 = z3 = z5 - z3 * 16069 */
+    ldr    r11, =-7373
+    mla    r14, r10, r14, r9         /* r14 = z4 = z5 - z4 * 3196 */
+    ldr    r10, =2446
+    add    r9,  r4,  r7              /* r9  = tmp0 + tmp3 */
+    mla    r8,  r11, r9,  r12        /* r8  = z1 + z3 */
+    mla    r9,  r11, r9,  r14        /* r9  = z1 + z4 */
+    ldr    r11, =12299
+    mla    r7,  r10, r7,  r8         /* r7  = tmp0 = z1 + z3 + tmp0 * 2446 */
+    ldr    r10, =-20995
+    mla    r4,  r11, r4,  r9         /* r4  = tmp3 = z1 + z4 + tmp3 * 12299 */
+    ldr    r11, =25172
+    add    r9,  r5,  r6              /* r9  = tmp1 + tmp2 */
+    mla    r12, r10, r9, r12         /* r12 = z2 + z3 */
+    mla    r14, r10, r9, r14         /* r14 = z2 + z4 */
+    ldr    r10, =16819
+    mla    r5,  r11, r5, r12         /* r5  = tmp2 = z2 + z3 + tmp2 * 25172 */
+    mla    r6,  r10, r6, r14         /* r6  = tmp1 = z2 + z4 + tmp1 * 16819 */
+    ldmdb  sp,  { r8-r11 }
+    add    r12, r8,  r4              /* o0 */
+    sub    r14, r8,  r4              /* o7 */
+    add    r8,  r9,  r7              /* o3 */
+    sub    r9,  r9,  r7              /* o4 */
+    add    r4,  r10, r5              /* o1 */
+    sub    r5,  r10, r5              /* o6 */
+    add    r10, r11, r6              /* o2 */
+    sub    r11, r11, r6              /* o5 */
+    /* output in order: r12 r4  r10 r8  r9  r11 r5  r14 */
+#if ARM_ARCH < 6
+    mov    r12, r12, asr #18
+    cmp    r12, #255
+    mvnhi  r12, r12, asr #31
+    mov    r4,  r4,  asr #18
+    cmp    r4,  #255
+    mvnhi  r4,  r4,  asr #31
+    mov    r10, r10, asr #18
+    cmp    r10, #255
+    mvnhi  r10, r10, asr #31
+    mov    r8,  r8,  asr #18
+    cmp    r8,  #255
+    mvnhi  r8,  r8,  asr #31
+    mov    r9,  r9,  asr #18
+    cmp    r9,  #255
+    mvnhi  r9,  r9,  asr #31
+    mov    r11, r11, asr #18
+    cmp    r11, #255
+    mvnhi  r11, r11, asr #31
+    mov    r5,  r5,  asr #18
+    cmp    r5,  #255
+    mvnhi  r5,  r5,  asr #31
+    mov    r14, r14, asr #18
+    cmp    r14, #255
+    mvnhi  r14, r14, asr #31
+#else
+    usat   r12, #8,  r12, asr #18
+    usat   r4,  #8,  r4,  asr #18
+    usat   r10, #8,  r10, asr #18
+    usat   r8,  #8,  r8,  asr #18
+    usat   r9,  #8,  r9,  asr #18
+    usat   r11, #8,  r11, asr #18
+    usat   r5,  #8,  r5,  asr #18
+    usat   r14, #8,  r14, asr #18
+#endif
+#ifdef HAVE_LCD_COLOR
+    strb   r12, [r1]
+    strb   r4,  [r1, #4]
+    strb   r10, [r1, #8]
+    strb   r8,  [r1, #12]
+    strb   r9,  [r1, #16]
+    strb   r11, [r1, #20]
+    strb   r5,  [r1, #24]
+    strb   r14, [r1, #28]
+#else
+    strb   r12, [r1]
+    strb   r4,  [r1, #1]
+    strb   r10, [r1, #2]
+    strb   r8,  [r1, #3]
+    strb   r9,  [r1, #4]
+    strb   r11, [r1, #5]
+    strb   r5,  [r1, #6]
+    strb   r14, [r1, #7]
+#endif
+    add    r1,  r1,  r3
+    cmp    r0,  r2
+    bcc    1b
+    ldmia  sp!, { r4-r11, pc }
+    .size jpeg_idct8h, .-jpeg_idct8h
diff --git a/apps/recorder/jpeg_load.c b/apps/recorder/jpeg_load.c
index fa2df5b..5ffa4a5 100644
--- a/apps/recorder/jpeg_load.c
+++ b/apps/recorder/jpeg_load.c
@@ -382,13 +382,6 @@
             DS_OUT));
     }
 }
-#else
-extern void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
-extern void jpeg_idct2v(int16_t *ws, int16_t *end);
-extern void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
-extern void jpeg_idct4v(int16_t *ws, int16_t *end);
-extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
-#endif
 
 /* vertical-pass 8-point IDCT */
 static void jpeg_idct8v(int16_t *ws, int16_t *end)
@@ -599,6 +592,16 @@
     }
 }
 
+#else
+extern void jpeg_idct1h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
+extern void jpeg_idct2v(int16_t *ws, int16_t *end);
+extern void jpeg_idct2h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
+extern void jpeg_idct4v(int16_t *ws, int16_t *end);
+extern void jpeg_idct4h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
+extern void jpeg_idct8v(int16_t *ws, int16_t *end);
+extern void jpeg_idct8h(int16_t *ws, unsigned char *out, int16_t *end, int rowstep);
+#endif
+
 #ifdef HAVE_LCD_COLOR
 /* vertical-pass 16-point IDCT */
 static void jpeg_idct16v(int16_t *ws, int16_t *end)