Use pre-multiplication in scaler to save one multiply per color component on ARM and Coldfire, at the cost of an extra add/shift in the horizontal scaler to reduce values to a workable range. SH-1 retains the same basic math, as
the use of 16x16->32 hardware multiplication in the earlier scaler stages saves more than would be gained by removing the 32x32->40 multiply used to descale the output.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@21091 a1c6a512-1295-4272-9138-f99709370657
diff --git a/apps/plugins/bench_scaler.c b/apps/plugins/bench_scaler.c
index c24807d..246271d 100644
--- a/apps/plugins/bench_scaler.c
+++ b/apps/plugins/bench_scaler.c
@@ -49,8 +49,8 @@
 #else
     uint32_t *lim = in + ctx->bm->width;
 #endif
-    for (; in < lim; in++)
-        output = SC_MUL(*in + ctx->round, ctx->divisor);
+    while (in < lim)
+        output = SC_OUT(*in++, ctx);
     return;
 }
 
diff --git a/apps/plugins/lib/grey_draw.c b/apps/plugins/lib/grey_draw.c
index 6315ad9..c1e6376 100644
--- a/apps/plugins/lib/grey_draw.c
+++ b/apps/plugins/lib/grey_draw.c
@@ -733,7 +733,7 @@
     uint32_t *qp = (uint32_t*)row_in;
     uint8_t *dest = (uint8_t*)ctx->bm->data + ctx->bm->width * row;
     for (col = 0; col < ctx->bm->width; col++)
-        *dest++ = SC_MUL((*qp++) + ctx->round,ctx->divisor);
+        *dest++ = SC_OUT(*qp++, ctx);
 }
 
 static unsigned int get_size_grey(struct bitmap *bm)
diff --git a/apps/plugins/pictureflow/pictureflow.c b/apps/plugins/pictureflow/pictureflow.c
index a1ad3d2..bbe2541 100644
--- a/apps/plugins/pictureflow/pictureflow.c
+++ b/apps/plugins/pictureflow/pictureflow.c
@@ -592,25 +592,12 @@
     return fsin(iangle + (IANGLE_MAX >> 2));
 }
 
-static inline uint32_t div255(uint32_t val)
+static inline unsigned scale_val(unsigned val, unsigned bits)
 {
-    return ((((val >> 8) + val) >> 8) + val) >> 8;
+    val = val * ((1 << bits) - 1);
+    return ((val >> 8) + val + 128) >> 8;
 }
 
-#define SCALE_VAL(val,out) div255((val) * (out) + 127)
-#define SCALE_VAL32(val, out) \
-({ \
-    uint32_t val__ = (val) * (out); \
-    val__ = ((((val__ >> 8) + val__) >> 8) + val__ + 128) >> 8; \
-    val__; \
-})
-#define SCALE_VAL8(val, out) \
-({ \
-    unsigned val__ = (val) * (out); \
-    val__ = ((val__ >> 8) + val__ + 128) >> 8; \
-    val__; \
-})
-
 static void output_row_8_transposed(uint32_t row, void * row_in,
                                        struct scaler_context *ctx)
 {
@@ -625,9 +612,9 @@
     unsigned r, g, b;
     for (; dest < end; dest += ctx->bm->height)
     {
-        r = SCALE_VAL8(qp->red, 31);
-        g = SCALE_VAL8(qp->green, 63);
-        b = SCALE_VAL8((qp++)->blue, 31);
+        r = scale_val(qp->red, 5);
+        g = scale_val(qp->green, 6);
+        b = scale_val((qp++)->blue, 5);
         *dest = LCD_RGBPACK_LCD(r,g,b);
     }
 #endif
@@ -641,19 +628,15 @@
 #ifdef USEGSLIB
     uint32_t *qp = (uint32_t*)row_in;
     for (; dest < end; dest += ctx->bm->height)
-        *dest = SC_MUL((*qp++) + ctx->round, ctx->divisor);
+        *dest = SC_OUT(*qp++, ctx);
 #else
     struct uint32_rgb *qp = (struct uint32_rgb*)row_in;
-    uint32_t rb_mul = SCALE_VAL32(ctx->divisor, 31),
-             rb_rnd = SCALE_VAL32(ctx->round, 31),
-             g_mul = SCALE_VAL32(ctx->divisor, 63),
-             g_rnd = SCALE_VAL32(ctx->round, 63);
     int r, g, b;
     for (; dest < end; dest += ctx->bm->height)
     {
-        r = SC_MUL(qp->r + rb_rnd, rb_mul);
-        g = SC_MUL(qp->g + g_rnd, g_mul);
-        b = SC_MUL(qp->b + rb_rnd, rb_mul);
+        r = scale_val(SC_OUT(qp->r, ctx), 5);
+        g = scale_val(SC_OUT(qp->g, ctx), 6);
+        b = scale_val(SC_OUT(qp->b, ctx), 5);
         qp++;
         *dest = LCD_RGBPACK_LCD(r,g,b);
     }
@@ -670,14 +653,14 @@
     for (; dest < end; dest += ctx->bm->height)
     {
         unsigned r, g, b, y, u, v;
-        y = SC_MUL(qp->b + ctx->round, ctx->divisor);
-        u = SC_MUL(qp->g + ctx->round, ctx->divisor);
-        v = SC_MUL(qp->r + ctx->round, ctx->divisor);
+        y = SC_OUT(qp->b, ctx);
+        u = SC_OUT(qp->g, ctx);
+        v = SC_OUT(qp->r, ctx);
         qp++;
         yuv_to_rgb(y, u, v, &r, &g, &b);
-        r = (31 * r + (r >> 3) + 127) >> 8;
-        g = (63 * g + (g >> 2) + 127) >> 8;
-        b = (31 * b + (b >> 3) + 127) >> 8;
+        r = scale_val(r, 5);
+        g = scale_val(g, 6);
+        b = scale_val(b, 5);
         *dest = LCD_RGBPACK_LCD(r, g, b);
     }
 }
diff --git a/apps/recorder/resize.c b/apps/recorder/resize.c
index 1e9210e..3a0ad8d 100644
--- a/apps/recorder/resize.c
+++ b/apps/recorder/resize.c
@@ -131,20 +131,45 @@
         return false; \
 }
 
-/* Set up rounding and scale factors for horizontal area scaler */
-static inline void scale_h_area_setup(struct scaler_context *ctx)
+#if defined(CPU_COLDFIRE)
+#define MAC(op1, op2, num) \
+    asm volatile( \
+        "mac.l %0, %1, %%acc" #num \
+        : \
+        : "%d" (op1), "d" (op2)\
+    )
+#define MAC_OUT(dest, num) \
+    asm volatile( \
+        "movclr.l %%acc" #num ", %0" \
+        : "=d" (dest) \
+    )
+#elif defined(CPU_SH)
+/* calculate the 32-bit product of signed 16-bit op1 and op2 */
+static inline int32_t mul_s16_s16(int16_t op1, int16_t op2)
 {
-/* sum is output value * src->width */
-    SDEBUGF("scale_h_area_setup\n");
-    ctx->divisor = ctx->src->width;
+    return (int32_t)(op1 * op2);
 }
 
+/* calculate the 32-bit product of unsigned 16-bit op1 and op2 */
+static inline uint32_t mul_u16_u16(uint16_t op1, uint16_t op2)
+{
+    return (uint32_t)(op1 * op2);
+}
+#endif
+
 /* horizontal area average scaler */
 static bool scale_h_area(void *out_line_ptr,
                          struct scaler_context *ctx, bool accum)
 {
     SDEBUGF("scale_h_area\n");
     unsigned int ix, ox, oxe, mul;
+#if defined(CPU_SH) || defined (TEST_SH_MATH)
+    const uint32_t h_i_val = ctx->src->width,
+                   h_o_val = ctx->bm->width;
+#else
+    const uint32_t h_i_val = ctx->h_i_val,
+                   h_o_val = ctx->h_o_val;
+#endif
 #ifdef HAVE_LCD_COLOR
     struct uint32_rgb rgbvalacc = { 0, 0, 0 },
                       rgbvaltmp = { 0, 0, 0 },
@@ -161,31 +186,57 @@
     yield();
     for (ix = 0; ix < (unsigned int)ctx->src->width; ix++)
     {
-        oxe += ctx->bm->width;
+        oxe += h_o_val;
         /* end of current area has been reached */
         /* fill buffer if needed */
         FILL_BUF(part,ctx->store_part,ctx->args);
 #ifdef HAVE_LCD_COLOR
-        if (oxe >= (unsigned int)ctx->src->width)
+        if (oxe >= h_i_val)
         {
             /* "reset" error, which now represents partial coverage of next
                pixel by the next area
             */
-            oxe -= ctx->src->width;
+            oxe -= h_i_val;
 
+#if defined(CPU_COLDFIRE)
+/* Coldfire EMAC math */
             /* add saved partial pixel from start of area */
-            rgbvalacc.r = rgbvalacc.r * ctx->bm->width + rgbvaltmp.r * mul;
-            rgbvalacc.g = rgbvalacc.g * ctx->bm->width + rgbvaltmp.g * mul;
-            rgbvalacc.b = rgbvalacc.b * ctx->bm->width + rgbvaltmp.b * mul;
+            MAC(rgbvalacc.r, h_o_val, 0);
+            MAC(rgbvalacc.g, h_o_val, 1);
+            MAC(rgbvalacc.b, h_o_val, 2);
+            MAC(rgbvaltmp.r, mul, 0);
+            MAC(rgbvaltmp.g, mul, 1);
+            MAC(rgbvaltmp.b, mul, 2);
+            /* get new pixel , then add its partial coverage to this area */
+            mul = h_o_val - oxe;
+            rgbvaltmp.r = part->buf->red;
+            rgbvaltmp.g = part->buf->green;
+            rgbvaltmp.b = part->buf->blue;
+            MAC(rgbvaltmp.r, mul, 0);
+            MAC(rgbvaltmp.g, mul, 1);
+            MAC(rgbvaltmp.b, mul, 2);
+            MAC_OUT(rgbvalacc.r, 0);
+            MAC_OUT(rgbvalacc.g, 1);
+            MAC_OUT(rgbvalacc.b, 2);
+#else
+/* generic C math */
+            /* add saved partial pixel from start of area */
+            rgbvalacc.r = rgbvalacc.r * h_o_val + rgbvaltmp.r * mul;
+            rgbvalacc.g = rgbvalacc.g * h_o_val + rgbvaltmp.g * mul;
+            rgbvalacc.b = rgbvalacc.b * h_o_val + rgbvaltmp.b * mul;
 
             /* get new pixel , then add its partial coverage to this area */
             rgbvaltmp.r = part->buf->red;
             rgbvaltmp.g = part->buf->green;
             rgbvaltmp.b = part->buf->blue;
-            mul = ctx->bm->width - oxe;
+            mul = h_o_val - oxe;
             rgbvalacc.r += rgbvaltmp.r * mul;
             rgbvalacc.g += rgbvaltmp.g * mul;
             rgbvalacc.b += rgbvaltmp.b * mul;
+#endif /* CPU */
+            rgbvalacc.r = (rgbvalacc.r + (1 << 21)) >> 22;
+            rgbvalacc.g = (rgbvalacc.g + (1 << 21)) >> 22;
+            rgbvalacc.b = (rgbvalacc.b + (1 << 21)) >> 22;
             /* store or accumulate to output row */
             if (accum)
             {
@@ -200,7 +251,7 @@
             rgbvalacc.r = 0;
             rgbvalacc.g = 0;
             rgbvalacc.b = 0;
-            mul = ctx->bm->width - mul;
+            mul = oxe;
             ox += 1;
         /* inside an area */
         } else {
@@ -210,21 +261,45 @@
             rgbvalacc.b += part->buf->blue;
         }
 #else
-        if (oxe >= (unsigned int)ctx->src->width)
+        if (oxe >= h_i_val)
         {
             /* "reset" error, which now represents partial coverage of next
                pixel by the next area
             */
-            oxe -= ctx->src->width;
-
+            oxe -= h_i_val;
+#if defined(CPU_COLDFIRE)
+/* Coldfire EMAC math */
             /* add saved partial pixel from start of area */
-            acc = MULUQ(acc, ctx->bm->width) + MULUQ(tmp, mul);
+            MAC(acc, h_o_val, 0);
+            MAC(tmp, mul, 0);
+            /* get new pixel , then add its partial coverage to this area */
+            tmp = *(part->buf);
+            mul = h_o_val - oxe;
+            MAC(tmp, mul, 0);
+            MAC_OUT(acc, 0);
+#elif defined(CPU_SH)
+/* SH-1 16x16->32 math */
+            /* add saved partial pixel from start of area */
+            acc = mul_u16_u16(acc, h_o_val) + mul_u16_u16(tmp, mul);
 
             /* get new pixel , then add its partial coverage to this area */
             tmp = *(part->buf);
-            mul = ctx->bm->width - oxe;
-            acc += MULUQ(tmp, mul);
+            mul = h_o_val - oxe;
+            acc += mul_u16_u16(tmp, mul);
+#else
+/* generic C math */
+            /* add saved partial pixel from start of area */
+            acc = (acc * h_o_val) + (tmp * mul);
+
+            /* get new pixel , then add its partial coverage to this area */
+            tmp = *(part->buf);
+            mul = h_o_val - oxe;
+            acc += tmp * mul;
+#endif /* CPU */
+#if !(defined(CPU_SH) || defined(TEST_SH_MATH))
             /* round, divide, and either store or accumulate to output row */
+            acc = (acc + (1 << 21)) >> 22;
+#endif
             if (accum)
             {
                 acc += out_line[ox];
@@ -232,7 +307,7 @@
             out_line[ox] = acc;
             /* reset accumulator */
             acc = 0;
-            mul = ctx->bm->width - mul;
+            mul = oxe;
             ox += 1;
         /* inside an area */
         } else {
@@ -249,56 +324,56 @@
 /* vertical area average scaler */
 static inline bool scale_v_area(struct rowset *rset, struct scaler_context *ctx)
 {
-    uint32_t mul, x, oy, iy, oye;
+    uint32_t mul, oy, iy, oye;
+#if defined(CPU_SH) || defined (TEST_SH_MATH)
+    const uint32_t v_i_val = ctx->src->height,
+                   v_o_val = ctx->bm->height;
+#else
+    const uint32_t v_i_val = ctx->v_i_val,
+                   v_o_val = ctx->v_o_val;
+#endif
 
     /* Set up rounding and scale factors */
-    ctx->divisor *= ctx->src->height;
-    ctx->round = ctx->divisor >> 1;
-    ctx->divisor = 1 + (-((ctx->divisor + 1) >> 1)) / ctx->divisor;
     mul = 0;
     oy = rset->rowstart;
     oye = 0;
 #ifdef HAVE_LCD_COLOR
     uint32_t *rowacc = (uint32_t *) ctx->buf,
-             *rowtmp = rowacc + 3 * ctx->bm->width;
+             *rowtmp = rowacc + 3 * ctx->bm->width,
+             *rowacc_px, *rowtmp_px;
     memset((void *)ctx->buf, 0, ctx->bm->width * 2 * sizeof(struct uint32_rgb));
 #else
     uint32_t *rowacc = (uint32_t *) ctx->buf,
-             *rowtmp = rowacc + ctx->bm->width;
+             *rowtmp = rowacc + ctx->bm->width,
+             *rowacc_px, *rowtmp_px;
     memset((void *)ctx->buf, 0, ctx->bm->width * 2 * sizeof(uint32_t));
 #endif
     SDEBUGF("scale_v_area\n");
     /* zero the accumulator and temp rows */
     for (iy = 0; iy < (unsigned int)ctx->src->height; iy++)
     {
-        oye += ctx->bm->height;
+        oye += v_o_val;
         /* end of current area has been reached */
-        if (oye >= (unsigned int)ctx->src->height)
+        if (oye >= v_i_val)
         {
             /* "reset" error, which now represents partial coverage of the next
                row by the next area
             */
-            oye -= ctx->src->height;
+            oye -= v_i_val;
             /* add stored partial row to accumulator */
-#ifdef HAVE_LCD_COLOR
-            for (x = 0; x < 3 * (unsigned int)ctx->bm->width; x++)
-#else
-            for (x = 0; x < (unsigned int)ctx->bm->width; x++)
-#endif
-                rowacc[x] = rowacc[x] * ctx->bm->height + mul * rowtmp[x];
+            for(rowacc_px = rowacc, rowtmp_px = rowtmp; rowacc_px != rowtmp;
+                rowacc_px++, rowtmp_px++)
+                *rowacc_px = *rowacc_px * v_o_val + *rowtmp_px * mul;
             /* store new scaled row in temp row */
             if(!ctx->h_scaler(rowtmp, ctx, false))
                 return false;
             /* add partial coverage by new row to this area, then round and
                scale to final value
             */
-            mul = ctx->bm->height - oye;
-#ifdef HAVE_LCD_COLOR
-            for (x = 0; x < 3 * (unsigned int)ctx->bm->width; x++)
-#else
-            for (x = 0; x < (unsigned int)ctx->bm->width; x++)
-#endif
-                rowacc[x] += mul * rowtmp[x];
+            mul = v_o_val - oye;
+            for(rowacc_px = rowacc, rowtmp_px = rowtmp; rowacc_px != rowtmp;
+                rowacc_px++, rowtmp_px++)
+                *rowacc_px += mul * *rowtmp_px;
             ctx->output_row(oy, (void*)rowacc, ctx);
             /* clear accumulator row, store partial coverage for next row */
 #ifdef HAVE_LCD_COLOR
@@ -319,20 +394,18 @@
 }
 
 #ifdef HAVE_UPSCALER
-/* Set up rounding and scale factors for the horizontal scaler. The divisor
-   is bm->width - 1, so that the first and last pixels in the row align
-   exactly between input and output
-*/
-static inline void scale_h_linear_setup(struct scaler_context *ctx)
-{
-    ctx->divisor = ctx->bm->width - 1;
-}
-
 /* horizontal linear scaler */
 static bool scale_h_linear(void *out_line_ptr, struct scaler_context *ctx,
                            bool accum)
 {
     unsigned int ix, ox, ixe;
+#if defined(CPU_SH) || defined (TEST_SH_MATH)
+    const uint32_t h_i_val = ctx->src->width - 1,
+                   h_o_val = ctx->bm->width - 1;
+#else
+    const uint32_t h_i_val = ctx->h_i_val,
+                   h_o_val = ctx->h_o_val;
+#endif
     /* type x = x is an ugly hack for hiding an unitialized data warning. The
        values are conditionally initialized before use, but other values are
        set such that this will occur before these are used.
@@ -348,27 +421,35 @@
     FILL_BUF_INIT(part,ctx->store_part,ctx->args);
     ix = 0;
     /* The error is set so that values are initialized on the first pass. */
-    ixe = ctx->bm->width - 1;
+    ixe = h_o_val;
     /* give other tasks a chance to run */
     yield();
     for (ox = 0; ox < (uint32_t)ctx->bm->width; ox++)
     {
 #ifdef HAVE_LCD_COLOR
-        if (ixe >= ((uint32_t)ctx->bm->width - 1))
+        if (ixe >= h_o_val)
         {
             /* Store the new "current" pixel value in rgbval, and the color
                step value in rgbinc.
             */
-            ixe -= (ctx->bm->width - 1);
+            ixe -= h_o_val;
             rgbinc.r = -(part->buf->red);
             rgbinc.g = -(part->buf->green);
             rgbinc.b = -(part->buf->blue);
-            rgbval.r = (part->buf->red) * (ctx->bm->width - 1);
-            rgbval.g = (part->buf->green) * (ctx->bm->width - 1);
-            rgbval.b = (part->buf->blue) * (ctx->bm->width - 1);
+#if defined(CPU_COLDFIRE)
+/* Coldfire EMAC math */
+            MAC(part->buf->red, h_o_val, 0);
+            MAC(part->buf->green, h_o_val, 1);
+            MAC(part->buf->blue, h_o_val, 2);
+#else
+/* generic C math */
+            rgbval.r = (part->buf->red) * h_o_val;
+            rgbval.g = (part->buf->green) * h_o_val;
+            rgbval.b = (part->buf->blue) * h_o_val;
+#endif /* CPU */
             ix += 1;
             /* If this wasn't the last pixel, add the next one to rgbinc. */
-            if (ix < (uint32_t)ctx->src->width) {
+            if (LIKELY(ix < (uint32_t)ctx->src->width)) {
                 part->buf++;
                 part->len--;
                 /* Fetch new pixels if needed */
@@ -379,14 +460,28 @@
                 /* Add a partial step to rgbval, in this pixel isn't precisely
                    aligned with the new source pixel
                 */
+#if defined(CPU_COLDFIRE)
+/* Coldfire EMAC math */
+                MAC(rgbinc.r, ixe, 0);
+                MAC(rgbinc.g, ixe, 1);
+                MAC(rgbinc.b, ixe, 2);
+#else
+/* generic C math */
                 rgbval.r += rgbinc.r * ixe;
                 rgbval.g += rgbinc.g * ixe;
                 rgbval.b += rgbinc.b * ixe;
+#endif
             }
-            /* Now multiple the color increment to its proper value */
-            rgbinc.r *= ctx->src->width - 1;
-            rgbinc.g *= ctx->src->width - 1;
-            rgbinc.b *= ctx->src->width - 1;
+#if defined(CPU_COLDFIRE)
+/* get final EMAC result out of ACC registers */
+            MAC_OUT(rgbval.r, 0);
+            MAC_OUT(rgbval.g, 1);
+            MAC_OUT(rgbval.b, 2);
+#endif
+            /* Now multiply the color increment to its proper value */
+            rgbinc.r *= h_i_val;
+            rgbinc.g *= h_i_val;
+            rgbinc.b *= h_i_val;
         } else {
             rgbval.r += rgbinc.r;
             rgbval.g += rgbinc.g;
@@ -395,27 +490,36 @@
         /* round and scale values, and accumulate or store to output */
         if (accum)
         {
-            out_line[ox].r += rgbval.r;
-            out_line[ox].g += rgbval.g;
-            out_line[ox].b += rgbval.b;
+            out_line[ox].r += (rgbval.r + (1 << 21)) >> 22;
+            out_line[ox].g += (rgbval.g + (1 << 21)) >> 22;
+            out_line[ox].b += (rgbval.b + (1 << 21)) >> 22;
         } else {
-            out_line[ox].r = rgbval.r;
-            out_line[ox].g = rgbval.g;
-            out_line[ox].b = rgbval.b;
+            out_line[ox].r = (rgbval.r + (1 << 21)) >> 22;
+            out_line[ox].g = (rgbval.g + (1 << 21)) >> 22;
+            out_line[ox].b = (rgbval.b + (1 << 21)) >> 22;
         }
 #else
-        if (ixe >= ((uint32_t)ctx->bm->width - 1))
+        if (ixe >= h_o_val)
         {
             /* Store the new "current" pixel value in rgbval, and the color
                step value in rgbinc.
             */
-            ixe -= (ctx->bm->width - 1);
+            ixe -= h_o_val;
             val = *(part->buf);
             inc = -val;
-            val = MULUQ(val, ctx->bm->width - 1);
+#if defined(CPU_COLDFIRE)
+/* Coldfire EMAC math */
+            MAC(val, h_o_val, 0);
+#elif defined(CPU_SH)
+/* SH-1 16x16->32 math */
+            val = mul_u16_u16(val, h_o_val);
+#else
+/* generic C math */
+            val = val * h_o_val;
+#endif
             ix += 1;
             /* If this wasn't the last pixel, add the next one to rgbinc. */
-            if (ix < (uint32_t)ctx->src->width) {
+            if (LIKELY(ix < (uint32_t)ctx->src->width)) {
                 part->buf++;
                 part->len--;
                 /* Fetch new pixels if needed */
@@ -424,12 +528,40 @@
                 /* Add a partial step to rgbval, in this pixel isn't precisely
                    aligned with the new source pixel
                 */
-                val += MULQ(inc, ixe);
+#if defined(CPU_COLDFIRE)
+/* Coldfire EMAC math */
+                MAC(inc, ixe, 0);
+#elif defined(CPU_SH)
+/* SH-1 16x16->32 math */
+                val += mul_s16_s16(inc, ixe);
+#else
+/* generic C math */
+                val += inc * ixe;
+#endif
             }
+#if defined(CPU_COLDFIRE)
+/* get final EMAC result out of ACC register */
+            MAC_OUT(val, 0);
+#endif
             /* Now multiply the color increment to its proper value */
-            inc = MULQ(inc, ctx->src->width - 1);
+#if defined(CPU_SH)
+/* SH-1 16x16->32 math */
+            inc = mul_s16_s16(inc, h_i_val);
+#else
+/* generic C math */
+            inc *= h_i_val;
+#endif
         } else
             val += inc;
+#if !(defined(CPU_SH) || defined(TEST_SH_MATH))
+        /* round and scale values, and accumulate or store to output */
+        if (accum)
+        {
+            out_line[ox] += (val + (1 << 21)) >> 22;
+        } else {
+            out_line[ox] = (val + (1 << 21)) >> 22;
+        }
+#else
         /* round and scale values, and accumulate or store to output */
         if (accum)
         {
@@ -438,7 +570,8 @@
             out_line[ox] = val;
         }
 #endif
-        ixe += ctx->src->width - 1;
+#endif
+        ixe += h_i_val;
     }
     return true;
 }
@@ -447,71 +580,66 @@
 static inline bool scale_v_linear(struct rowset *rset,
                                   struct scaler_context *ctx)
 {
-    uint32_t mul, x, iy, iye;
+    uint32_t mul, iy, iye;
     int32_t oy;
-    /* Set up scale and rounding factors, the divisor is bm->height - 1 */
-    ctx->divisor *= (ctx->bm->height - 1);
-    ctx->round = ctx->divisor >> 1;
-    ctx->divisor = 1 + (-((ctx->divisor + 1) >> 1)) / ctx->divisor;
-    /* Set up our two temp buffers. The names are generic because they'll be
-       swapped each time a new input row is read
+#if defined(CPU_SH) || defined (TEST_SH_MATH)
+    const uint32_t v_i_val = ctx->src->height - 1,
+                   v_o_val = ctx->bm->height - 1;
+#else
+    const uint32_t v_i_val = ctx->v_i_val,
+                   v_o_val = ctx->v_o_val;
+#endif
+    /* Set up our buffers, to store the increment and current value for each
+       column, and one temp buffer used to read in new rows.
     */
 #ifdef HAVE_LCD_COLOR
     uint32_t *rowinc = (uint32_t *)(ctx->buf),
              *rowval = rowinc + 3 * ctx->bm->width,
-             *rowtmp = rowval + 3 * ctx->bm->width;
+             *rowtmp = rowval + 3 * ctx->bm->width,
 #else
     uint32_t *rowinc = (uint32_t *)(ctx->buf),
              *rowval = rowinc + ctx->bm->width,
-             *rowtmp = rowval + ctx->bm->width;
+             *rowtmp = rowval + ctx->bm->width,
 #endif
+             *rowinc_px, *rowval_px, *rowtmp_px;
 
     SDEBUGF("scale_v_linear\n");
     mul = 0;
     iy = 0;
-    iye = ctx->bm->height - 1;
+    iye = v_o_val;
     /* get first scaled row in rowtmp */
     if(!ctx->h_scaler((void*)rowtmp, ctx, false))
         return false;
     for (oy = rset->rowstart; oy != rset->rowstop; oy += rset->rowstep)
     {
-        if (iye >= (uint32_t)ctx->bm->height - 1)
+        if (iye >= v_o_val)
         {
-            iye -= ctx->bm->height - 1;
+            iye -= v_o_val;
             iy += 1;
-#ifdef HAVE_LCD_COLOR
-            for (x = 0; x < 3 * (uint32_t)ctx->bm->width; x++)
-#else
-            for (x = 0; x < (uint32_t)ctx->bm->width; x++)
-#endif
+            for(rowinc_px = rowinc, rowtmp_px = rowtmp, rowval_px = rowval;
+                rowinc_px < rowval; rowinc_px++, rowtmp_px++, rowval_px++)
             {
-                rowinc[x] = -rowtmp[x];
-                rowval[x] = rowtmp[x] * (ctx->bm->height - 1);
+                *rowinc_px = -*rowtmp_px;
+                *rowval_px = *rowtmp_px * v_o_val;
             }
             if (iy < (uint32_t)ctx->src->height)
             {
                 if (!ctx->h_scaler((void*)rowtmp, ctx, false))
                     return false;
-#ifdef HAVE_LCD_COLOR
-                for (x = 0; x < 3 * (uint32_t)ctx->bm->width; x++)
-#else
-                for (x = 0; x < (uint32_t)ctx->bm->width; x++)
-#endif
+                for(rowinc_px = rowinc, rowtmp_px = rowtmp, rowval_px = rowval;
+                    rowinc_px < rowval; rowinc_px++, rowtmp_px++, rowval_px++)
                 {
-                    rowinc[x] += rowtmp[x];
-                    rowval[x] += rowinc[x] * iye;
-                    rowinc[x] *= ctx->src->height - 1;
+                    *rowinc_px += *rowtmp_px;
+                    *rowval_px += *rowinc_px * iye;
+                    *rowinc_px *= v_i_val;
                 }
             }
         } else
-#ifdef HAVE_LCD_COLOR
-            for (x = 0; x < 3 * (uint32_t)ctx->bm->width; x++)
-#else
-            for (x = 0; x < (uint32_t)ctx->bm->width; x++)
-#endif
-                    rowval[x] += rowinc[x];
+            for(rowinc_px = rowinc, rowval_px = rowval; rowinc_px < rowval;
+                rowinc_px++, rowval_px++)
+                *rowval_px += *rowinc_px;
         ctx->output_row(oy, (void*)rowval, ctx);
-        iye += ctx->src->height - 1;
+        iye += v_i_val;
     }
     return true;
 }
@@ -533,9 +661,9 @@
     for (col = 0; col < ctx->bm->width; col++) {
         if (ctx->dither)
             delta = DITHERXDY(col,dy);
-        y = SC_MUL(qp->b + ctx->round, ctx->divisor);
-        u = SC_MUL(qp->g + ctx->round, ctx->divisor);
-        v = SC_MUL(qp->r + ctx->round, ctx->divisor);
+        y = SC_OUT(qp->b, ctx);
+        u = SC_OUT(qp->g, ctx);
+        v = SC_OUT(qp->r, ctx);
         qp++;
         yuv_to_rgb(y, u, v, &r, &g, &b);
         r = (31 * r + (r >> 3) + delta) >> 8;
@@ -571,7 +699,7 @@
                 for (col = 0; col < ctx->bm->width; col++) {
                     if (ctx->dither)
                         delta = DITHERXDY(col,dy);
-                    bright = SC_MUL((*qp++) + ctx->round,ctx->divisor);
+                    bright = SC_OUT(*qp++, ctx);
                     bright = (3 * bright + (bright >> 6) + delta) >> 8;
                     data |= (~bright & 3) << shift;
                     shift -= 2;
@@ -594,7 +722,7 @@
                 for (col = 0; col < ctx->bm->width; col++) {
                     if (ctx->dither)
                         delta = DITHERXDY(col,dy);
-                    bright = SC_MUL((*qp++) + ctx->round, ctx->divisor);
+                    bright = SC_OUT(*qp++, ctx);
                     bright = (3 * bright + (bright >> 6) + delta) >> 8;
                     *dest++ |= (~bright & 3) << shift;
                 }
@@ -609,7 +737,7 @@
                 for (col = 0; col < ctx->bm->width; col++) {
                     if (ctx->dither)
                         delta = DITHERXDY(col,dy);
-                    bright = SC_MUL((*qp++) + ctx->round, ctx->divisor);
+                    bright = SC_OUT(*qp++, ctx);
                     bright = (3 * bright + (bright >> 6) + delta) >> 8;
                     *dest++ |= vi_pattern[bright] << shift;
                 }
@@ -625,9 +753,9 @@
                     if (ctx->dither)
                         delta = DITHERXDY(col,dy);
                     q0 = *qp++;
-                    r = SC_MUL(q0.r + ctx->round, ctx->divisor);
-                    g = SC_MUL(q0.g + ctx->round, ctx->divisor);
-                    b = SC_MUL(q0.b + ctx->round, ctx->divisor);
+                    r = SC_OUT(q0.r, ctx);
+                    g = SC_OUT(q0.g, ctx);
+                    b = SC_OUT(q0.b, ctx);
                     r = (31 * r + (r >> 3) + delta) >> 8;
                     g = (63 * g + (g >> 2) + delta) >> 8;
                     b = (31 * b + (b >> 3) + delta) >> 8;
@@ -664,13 +792,10 @@
                    struct img_part* (*store_part)(void *args),
                    void *args)
 {
-
-#ifdef HAVE_UPSCALER
     const int sw = src->width;
     const int sh = src->height;
     const int dw = bm->width;
     const int dh = bm->height;
-#endif
     int ret;
 #ifdef HAVE_LCD_COLOR
     unsigned int needed = sizeof(struct uint32_rgb) * 3 * bm->width;
@@ -721,6 +846,9 @@
     ctx.bm = bm;
     ctx.src = src;
     ctx.dither = dither;
+#if defined(CPU_SH) || defined (TEST_SH_MATH)
+    uint32_t div;
+#endif
 #if !defined(PLUGIN)
 #if defined(HAVE_LCD_COLOR) && defined(HAVE_JPEG)
     ctx.output_row = format_index ? output_row_32_native_fromyuv
@@ -740,23 +868,56 @@
     {
 #endif
         ctx.h_scaler = scale_h_area;
-        scale_h_area_setup(&ctx);
+#if defined(CPU_SH) || defined (TEST_SH_MATH)
+        div = sw;
+#else
+        uint32_t h_div = (1U << 24) / sw;
+        ctx.h_i_val = sw * h_div;
+        ctx.h_o_val = dw * h_div;
+#endif
 #ifdef HAVE_UPSCALER
     } else {
         ctx.h_scaler = scale_h_linear;
-        scale_h_linear_setup(&ctx);
+#if defined(CPU_SH) || defined (TEST_SH_MATH)
+        div = dw - 1;
+#else
+        uint32_t h_div = (1U << 24) / (dw - 1);
+        ctx.h_i_val = (sw - 1) * h_div;
+        ctx.h_o_val = (dw - 1) * h_div;
+#endif
     }
 #endif
-    SC_MUL_INIT;
+#ifdef CPU_COLDFIRE
+    coldfire_set_macsr(EMAC_UNSIGNED);
+#endif
 #ifdef HAVE_UPSCALER
     if (sh > dh)
 #endif
+    {
+#if defined(CPU_SH) || defined (TEST_SH_MATH)
+        div *= sh;
+        ctx.recip = ((uint32_t)(-div)) / div + 1;
+#else
+        uint32_t v_div = (1U << 22) / sh;
+        ctx.v_i_val = sh * v_div;
+        ctx.v_o_val = dh * v_div;
+#endif
         ret = scale_v_area(rset, &ctx);
+    }
 #ifdef HAVE_UPSCALER
     else
-        ret = scale_v_linear(rset, &ctx);
+    {
+#if defined(CPU_SH) || defined (TEST_SH_MATH)
+        div *= dh - 1;
+        ctx.recip = ((uint32_t)(-div)) / div + 1;
+#else
+        uint32_t v_div = (1U << 22) / dh;
+        ctx.v_i_val = (sh - 1) * v_div;
+        ctx.v_o_val = (dh - 1) * v_div;
 #endif
-    SC_MUL_END;
+        ret = scale_v_linear(rset, &ctx);
+    }
+#endif
 #ifdef HAVE_ADJUSTABLE_CPU_FREQ
     cpu_boost(false);
 #endif
diff --git a/apps/recorder/resize.h b/apps/recorder/resize.h
index 2964fcd..ef32066 100644
--- a/apps/recorder/resize.h
+++ b/apps/recorder/resize.h
@@ -43,67 +43,61 @@
 #define MAX_SC_STACK_ALLOC 0
 #define HAVE_UPSCALER 1
 
-#if defined(CPU_COLDFIRE)
-#define SC_MUL_INIT \
-    unsigned long macsr_st = coldfire_get_macsr(); \
-    coldfire_set_macsr(EMAC_UNSIGNED);
-#define SC_MUL_END coldfire_set_macsr(macsr_st);
-#define SC_MUL(x, y) \
-({ \
-    unsigned long t; \
-    asm ("mac.l    %[a], %[b], %%acc0\n\t" \
-         "move.l %%accext01, %[t]\n\t" \
-         "move.l #0, %%acc0\n\t" \
-         : [t] "=r" (t) : [a] "r" (x), [b] "r" (y)); \
-    t; \
-})
-#elif (CONFIG_CPU == SH7034)
-/* multiply two unsigned 32 bit values and return the top 32 bit
- * of the 64 bit result */
-static inline unsigned sc_mul32(unsigned a, unsigned b)
+#if defined(CPU_SH)
+/* perform 32x32->40 unsigned multiply, round off and return top 8 bits */
+static inline uint32_t sc_mul_u32_rnd(uint32_t m, uint32_t n)
 {
     unsigned r, t1, t2, t3;
-
+    unsigned h = 1 << 15;
+    /* notation:
+       m = ab, n = cd
+       final result is (((a *c) << 32) + ((b * c + a * d) << 16) + b * d +
+            (1 << 31)) >> 32
+    */
     asm (
-        "swap.w  %[a], %[t1]     \n" /* t1 = ba */
-        "mulu    %[t1], %[b]     \n" /* a * d */
-        "swap.w  %[b], %[t3]     \n" /* t3 = dc */
-        "sts     macl, %[t2]     \n" /* t2 = a * d */
-        "mulu    %[t1], %[t3]    \n" /* a * c */
-        "sts     macl, %[r]      \n" /* hi = a * c */
-        "mulu    %[a], %[t3]     \n" /* b * c */
-        "clrt                    \n"
-        "sts     macl, %[t3]     \n" /* t3 = b * c */
-        "addc    %[t2], %[t3]    \n" /* t3 += t2, carry -> t2 */
-        "movt    %[t2]           \n"
-        "mulu    %[a], %[b]      \n" /* b * d */
-        "mov     %[t3], %[t1]    \n" /* t1t3 = t2t3 << 16 */
-        "xtrct   %[t2], %[t1]    \n"
-        "shll16  %[t3]           \n"
-        "sts     macl, %[t2]     \n" /* lo = b * d */
-        "clrt                    \n" /* hi.lo += t1t3 */
-        "addc    %[t3], %[t2]    \n"
-        "addc    %[t1], %[r]     \n"
+        "swap.w  %[m], %[t1]\n\t" /* t1 = ba */
+        "mulu    %[m], %[n]\n\t" /* b * d */
+        "swap.w  %[n], %[t3]\n\t" /* t3 = dc */
+        "sts     macl, %[r]\n\t" /* r = b * d */
+        "mulu    %[m], %[t3]\n\t" /* b * c */
+        "shlr16  %[r]\n\t"
+        "sts     macl, %[t2]\n\t" /* t2 = b * c */
+        "mulu    %[t1], %[t3]\n\t" /* a * c */
+        "add     %[t2], %[r]\n\t"
+        "sts     macl, %[t3]\n\t" /* t3 = a * c */
+        "mulu    %[t1], %[n]\n\t" /* a * d */
+        "shll16  %[t3]\n\t"
+        "sts     macl, %[t2]\n\t" /* t2 = a * d */
+        "add     %[t2], %[r]\n\t"
+        "add     %[t3], %[r]\n\t" /* r = ((b * d) >> 16) + (b * c + a * d) +
+                                         ((a * c) << 16) */
+        "add     %[h], %[r]\n\t" /* round result */
+        "shlr16  %[r]\n\t" /* truncate result */
         : /* outputs */
         [r] "=&r"(r),
         [t1]"=&r"(t1),
         [t2]"=&r"(t2),
         [t3]"=&r"(t3)
         : /* inputs */
-        [a] "r"  (a),
-        [b] "r"  (b)
+        [h] "r"  (h),
+        [m] "r"  (m),
+        [n] "r"  (n)
     );
     return r;
 }
-#define SC_MUL(x, y) sc_mul32(x, y)
-#define SC_MUL_INIT
-#define SC_MUL_END
+#elif defined(TEST_SH_MATH)
+static inline uint32_t sc_mul_u32_rnd(uint32_t op1, uint32_t op2)
+{
+    uint64_t tmp = (uint64_t)op1 * op2;
+    tmp += 1LU << 31;
+    tmp >>= 32;
+    return tmp;
+}   
+#else
+#define SC_OUT(n, c) (((n) + (1 << 23)) >> 24)
 #endif
-
-#ifndef SC_MUL
-#define SC_MUL(x, y) ((x) * (uint64_t)(y) >> 32)
-#define SC_MUL_INIT
-#define SC_MUL_END
+#ifndef SC_OUT
+#define SC_OUT(n, c) (sc_mul_u32_rnd(n, (c)->recip))
 #endif
 
 struct img_part {
@@ -130,8 +124,14 @@
    horizontal scaler, and row output
 */
 struct scaler_context {
-    uint32_t divisor;
-    uint32_t round;
+#if defined(CPU_SH) || defined(TEST_SH_MATH)
+    uint32_t recip;
+#else
+    uint32_t h_i_val;
+    uint32_t h_o_val;
+    uint32_t v_i_val;
+    uint32_t v_o_val;
+#endif
     struct bitmap *bm;
     struct dim *src;
     unsigned char *buf;