Scaler optimizations:
On SH, use 8.24 fixed-point C math for the final division in the scaler.

On ColdFire, use 8.32 fixed-point math via the EMAC.

On other architectures, use 8.32 fixed-point C math.

In pictureflow, use shift-and-add to divide by 255 when adjusting the scale factors.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@19802 a1c6a512-1295-4272-9138-f99709370657
diff --git a/apps/plugins/pictureflow.c b/apps/plugins/pictureflow.c
index 2c59e1a..d7ba8b9 100644
--- a/apps/plugins/pictureflow.c
+++ b/apps/plugins/pictureflow.c
@@ -397,8 +397,8 @@
     return fsin(iangle + (IANGLE_MAX >> 2));
 }
 
-#define RB_DIV ((31ULL << 32) / 255 + 1)
-#define G_DIV ((63ULL << 32) / 255 + 1)
+#define DIV255(val) ((((((val)>>8)+(val))>>8)+(val))>>8) /* shift-and-add ~val/255 */
+#define SCALE_VAL(val,out) DIV255((val) * (out) + 127) /* ~val*out/255, +127 to round */
 
 static void output_row_transposed(uint32_t row, void * row_in,
                                        struct scaler_context *ctx)
@@ -408,19 +408,19 @@
 #ifdef USEGSLIB
     uint32_t *qp = (uint32_t*)row_in;
     for (; dest < end; dest += ctx->bm->height)
-        *dest = ((*qp++) + ctx->round) * (uint64_t)ctx->divisor >> 32;
+        *dest = SC_MUL((*qp++) + ctx->round, ctx->divisor);
 #else
     struct uint32_rgb *qp = (struct uint32_rgb*)row_in;
-    uint32_t rb_mul = ctx->divisor * (uint64_t)RB_DIV >> 32,
-             rb_rnd = ctx->round * (uint64_t)RB_DIV >> 32,
-             g_mul = ctx->divisor * (uint64_t)G_DIV >> 32,
-             g_rnd = ctx->round * (uint64_t)G_DIV >> 32;
-             int r, g, b;
+    uint32_t rb_mul = SCALE_VAL(ctx->divisor, 31), /* x 31/255 for r, b */
+             rb_rnd = SCALE_VAL(ctx->round, 31),
+             g_mul = SCALE_VAL(ctx->divisor, 63),  /* x 63/255 for g */
+             g_rnd = SCALE_VAL(ctx->round, 63);
+    int r, g, b;
     for (; dest < end; dest += ctx->bm->height)
     {
-        r = (qp->r + rb_rnd) * (uint64_t)rb_mul >> 32;
-        g = (qp->g + g_rnd) * (uint64_t)g_mul >> 32;
-        b = (qp->b + rb_rnd) * (uint64_t)rb_mul >> 32;
+        r = SC_MUL(qp->r + rb_rnd, rb_mul);
+        g = SC_MUL(qp->g + g_rnd, g_mul);
+        b = SC_MUL(qp->b + rb_rnd, rb_mul);
         qp++;
         *dest = LCD_RGBPACK_LCD(r,g,b);
     }
diff --git a/apps/recorder/resize.c b/apps/recorder/resize.c
index f934511..235e071 100644
--- a/apps/recorder/resize.c
+++ b/apps/recorder/resize.c
@@ -244,7 +244,7 @@
     /* Set up rounding and scale factors */
     ctx->divisor *= ctx->src->height;
     ctx->round = ctx->divisor >> 1;
-    ctx->divisor = ((ctx->divisor - 1 + 0x80000000U) / ctx->divisor) << 1;
+    ctx->divisor = (((ctx->divisor >> 1) + SC_NUM) / ctx->divisor) << SC_FIX;
     mul = 0;
     oy = rset->rowstart;
     oye = 0;
@@ -442,7 +442,7 @@
     /* Set up scale and rounding factors, the divisor is bm->height - 1 */
     ctx->divisor *= (ctx->bm->height - 1);
     ctx->round = ctx->divisor >> 1;
-    ctx->divisor = ((ctx->divisor - 1 + 0x80000000U) / ctx->divisor) << 1;
+    ctx->divisor = (((ctx->divisor >> 1) + SC_NUM) / ctx->divisor) << SC_FIX;
     /* Set up our two temp buffers. The names are generic because they'll be
        swapped each time a new input row is read
     */
@@ -531,8 +531,7 @@
                 for (col = 0; col < ctx->bm->width; col++) {
                     if (ctx->dither)
                         delta = DITHERXDY(col,dy);
-                    bright = ((*qp++) + ctx->round) *
-                             (uint64_t)ctx->divisor >> 32;
+                    bright = SC_MUL((*qp++) + ctx->round,ctx->divisor);
                     bright = (3 * bright + (bright >> 6) + delta) >> 8;
                     data |= (~bright & 3) << shift;
                     shift -= 2;
@@ -555,8 +554,7 @@
                 for (col = 0; col < ctx->bm->width; col++) {
                     if (ctx->dither)
                         delta = DITHERXDY(col,dy);
-                    bright = ((*qp++) + ctx->round) *
-                             (uint64_t)ctx->divisor >> 32;
+                    bright = SC_MUL((*qp++) + ctx->round, ctx->divisor);
                     bright = (3 * bright + (bright >> 6) + delta) >> 8;
                     *dest++ |= (~bright & 3) << shift;
                 }
@@ -571,8 +569,7 @@
                 for (col = 0; col < ctx->bm->width; col++) {
                     if (ctx->dither)
                         delta = DITHERXDY(col,dy);
-                    bright = ((*qp++) + ctx->round) *
-                             (uint64_t)ctx->divisor >> 32;
+                    bright = SC_MUL((*qp++) + ctx->round, ctx->divisor);
                     bright = (3 * bright + (bright >> 6) + delta) >> 8;
                     *dest++ |= vi_pattern[bright] << shift;
                 }
@@ -588,9 +585,9 @@
                     if (ctx->dither)
                         delta = DITHERXDY(col,dy);
                     q0 = *qp++;
-                    r = (q0.r + ctx->round) * (uint64_t)ctx->divisor >> 32;
-                    g = (q0.g + ctx->round) * (uint64_t)ctx->divisor >> 32;
-                    b = (q0.b + ctx->round) * (uint64_t)ctx->divisor >> 32;
+                    r = SC_MUL(q0.r + ctx->round, ctx->divisor);
+                    g = SC_MUL(q0.g + ctx->round, ctx->divisor);
+                    b = SC_MUL(q0.b + ctx->round, ctx->divisor);
                     r = (31 * r + (r >> 3) + delta) >> 8;
                     g = (63 * g + (g >> 2) + delta) >> 8;
                     b = (31 * b + (b >> 3) + delta) >> 8;
@@ -680,6 +677,7 @@
         scale_h_linear_setup(&ctx);
     }
 #endif
+    SC_MUL_INIT;
 #ifdef HAVE_UPSCALER
     if (sh > dh)
 #endif
@@ -688,6 +686,7 @@
     else
         ret = scale_v_linear(rset, &ctx);
 #endif
+    SC_MUL_END;
 #ifdef HAVE_ADJUSTABLE_CPU_FREQ
     cpu_boost(false);
 #endif
diff --git a/apps/recorder/resize.h b/apps/recorder/resize.h
index f8328e3..e1237c4 100644
--- a/apps/recorder/resize.h
+++ b/apps/recorder/resize.h
@@ -43,6 +43,51 @@
 #define MAX_SC_STACK_ALLOC 0
 #define HAVE_UPSCALER 1
 
+#if defined(CPU_COLDFIRE) /* multiply via the EMAC (mac.l) */
+#define SC_NUM 0x80000000U /* same value as the SC_SHIFT == 32 branch below */
+#define SC_MUL_INIT \
+    unsigned long macsr_st = coldfire_get_macsr(); \
+    coldfire_set_macsr(0);
+#define SC_MUL_END coldfire_set_macsr(macsr_st); /* restore the saved MACSR */
+#define SC_MUL(x, y) \
+({ \
+    unsigned long t; \
+    asm ("mac.l    %[a], %[b], %%acc0\n\t" \
+         "move.l %%accext01, %[t]\n\t" \
+         "move.l #0, %%acc0\n\t" \
+         : [t] "=r" (t) : [a] "r" (x), [b] "r" (y)); \
+    t; \
+})
+#elif defined(CPU_SH) /* SH: 24 fraction bits, plain C multiply */
+#define SC_SHIFT 24
+#endif
+
+#ifndef SC_SHIFT /* default: 32 fraction bits */
+#define SC_SHIFT 32
+#endif
+
+#if SC_SHIFT == 24
+#define SC_NUM 0x1000000U /* 1 << 24 */
+#define SC_FIX 0 /* divisor setup shifts the quotient left by SC_FIX */
+
+#ifndef SC_MUL /* no arch-specific SC_MUL defined above */
+#define SC_MUL(x, y) ((x) * (y) >> 24) /* product with fraction bits dropped */
+#define SC_MUL_INIT /* nothing to set up */
+#define SC_MUL_END /* nothing to restore */
+#endif /* SC_MUL */
+
+#else /* SC_SHIFT == 32 */
+#define SC_NUM 0x80000000U /* 1 << 31, quotient doubled via SC_FIX */
+#define SC_FIX 1
+
+#ifndef SC_MUL /* no arch-specific SC_MUL defined above */
+#define SC_MUL(x, y) ((x) * (uint64_t)(y) >> 32) /* 64-bit product */
+#define SC_MUL_INIT /* nothing to set up */
+#define SC_MUL_END /* nothing to restore */
+#endif /* SC_MUL */
+
+#endif /* SC_SHIFT */
+
 struct img_part {
     int len;
 #if !defined(HAVE_LCD_COLOR)