Port greylib blitting optimisation to MPIO HD200. ISR speedup is ~10%; further speedup should be possible by using line transfers for accessing the greylib buffers. Thanks to Marcin Bukat for testing.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@26793 a1c6a512-1295-4272-9138-f99709370657
diff --git a/firmware/target/coldfire/mpio/hd200/lcd-as-hd200.S b/firmware/target/coldfire/mpio/hd200/lcd-as-hd200.S
index 7ea0b8f..9709e21 100644
--- a/firmware/target/coldfire/mpio/hd200/lcd-as-hd200.S
+++ b/firmware/target/coldfire/mpio/hd200/lcd-as-hd200.S
@@ -125,54 +125,45 @@
     .type       lcd_grey_data,@function
 
 lcd_grey_data:
-    lea.l   (-2*4, %sp), %sp
-    movem.l  %a2-%a3, (%sp)
-    movem.l (2*4+4, %sp), %a0-%a2  /* values, phases, length */
+    lea.l   (-6*4, %sp), %sp
+    movem.l  %d2-%d5/%a2-%a3, (%sp)
+    movem.l (6*4+4, %sp), %a0-%a2  /* values, phases, length */
     add.l   %a2, %a2
     lea.l   (%a1, %a2.l*4), %a2    /* end address */
     lea.l   LCD_BASE_ADDRESS+2, %a3 /* LCD data port address */
+    moveq.l #24, %d4            /* shift count: brings bits 31..24 down to 7..0 */
+    move.l  #0x204081, %d5      /* bit shuffle factor: (1<<21)|(1<<14)|(1<<7)|1 */
+
 .ph_loop:
-    clr.l   %d1
-    move.l  (%a1), %d0          /* fetch 4 pixel phases */
-    bclr.l  #31, %d0            /* Z = !(p0 & 0x80); p0 &= ~0x80; */
-    seq.b   %d1                 /* %d1 = ........................00000000 */
-    lsl.l   #1, %d1             /* %d1 = .......................00000000. */
-    bclr.l  #23, %d0            /* Z = !(p1 & 0x80); p1 &= ~0x80; */
-    seq.b   %d1                 /* %d1 = .......................011111111 */
-    lsl.l   #1, %d1             /* %d1 = ......................011111111. */
-    bclr.l  #15, %d0            /* Z = !(p2 & 0x80); p2 &= ~0x80; */
-    seq.b   %d1                 /* %d1 = ......................0122222222 */
-    lsl.l   #1, %d1             /* %d1 = .....................0122222222. */
-    bclr.l  #7, %d0             /* Z = !(p3 & 0x80); p3 &= ~0x80; */
-    seq.b   %d1                 /* %d1 = .....................01233333333 */
-    lsl.l   #1, %d1             /* %d1 = ....................01233333333. */
-    add.l   (%a0)+, %d0         /* add 4 pixel values to the phases */
-    move.l  %d0, (%a1)+         /* store new phases, advance pointer */
+    movem.l (%a1), %d0-%d1      /* fetch 8 pixel phases */
 
-    move.l  (%a1), %d0          /* fetch 4 pixel phases */
-    bclr.l  #31, %d0            /* Z = !(p0 & 0x80); p0 &= ~0x80; */
-    seq.b   %d1                 /* %d1 = ....................012344444444 */
-    lsl.l   #1, %d1             /* %d1 = ...................012344444444. */
-    bclr.l  #23, %d0            /* Z = !(p1 & 0x80); p1 &= ~0x80; */
-    seq.b   %d1                 /* %d1 = ...................0123455555555 */
-    lsl.l   #1, %d1             /* %d1 = ..................0123455555555. */
-    bclr.l  #15, %d0            /* Z = !(p2 & 0x80); p2 &= ~0x80; */
-    seq.b   %d1                 /* %d1 = ..................01234566666666 */
-    lsl.l   #1, %d1             /* %d1 = .................01234566666666. */
-    bclr.l  #7, %d0             /* Z = !(p3 & 0x80); p3 &= ~0x80; */
-    seq.b   %d1                 /* %d1 = .................012345677777777 */
-    lsr.l   #7, %d1             /* %d1 = ........................01234567 */
-    add.l   (%a0)+, %d0         /* add 4 pixel values to the phases */
-    move.l  %d0, (%a1)+         /* store new phases, advance pointer */
+    move.l  %d0, %d2            /* working copy of first 4 phases */
+    and.l   #0x80808080, %d2    /* %d2 = 0.......1.......2.......3....... */
+    eor.l   %d2, %d0            /* clear bit 7 of each of the 4 phase bytes */
+    add.l   (%a0)+, %d0         /* add values to first 4 phases */
 
-    move.w  %d1, (%a3)          /* transfer to lcd */
-    move.w  %d1, (%a3)          /* transfer to lcd */
+    move.l  %d1, %d3            /* working copy of second 4 phases */
+    and.l   #0x80808080, %d3    /* %d3 = 4.......5.......6.......7....... */
+    eor.l   %d3, %d1            /* clear bit 7 of each of the 4 phase bytes */
+    add.l   (%a0)+, %d1         /* add values to second 4 phases */
 
+    lsr.l   #4, %d3             /* %d3 = ....4.......5.......6.......7... */
+    or.l    %d3, %d2            /* %d2 = 0...4...1...5...2...6...3...7... */
+    mulu.l  %d5, %d2            /* %d2 = 01234567123.567.23..67..3...7... */
+    not.l   %d2                 /* invert: result bit set where phase bit 7 was clear */
+    lsr.l   %d4, %d2            /* %d2 = ........................01234567 */
+
+    move.w  %d2, (%a3)          /* transfer first LCD byte */
+
+    movem.l %d0-%d1, (%a1)      /* store 8 new pixel phases */
+    addq.l  #8, %a1             /* advance phase pointer past the 8 phases */
+
+    move.w  %d2, (%a3)          /* transfer second LCD byte (same %d2 value again) */
     cmp.l   %a2, %a1
     bls.s   .ph_loop
 
-    movem.l (%sp), %a2-%a3
-    lea.l   (2*4, %sp), %sp
+    movem.l (%sp), %d2-%d5/%a2-%a3
+    lea.l   (6*4, %sp), %sp
     rts
 .grey_end:
     .size    lcd_grey_data,.grey_end-lcd_grey_data