Port greylib blitting optimisation to MPIO HD200. ISR speedup is ~10%; further speedup should be possible by using line transfers for accessing the greylib buffers. Thanks to Marcin Bukat for testing.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@26793 a1c6a512-1295-4272-9138-f99709370657
diff --git a/firmware/target/coldfire/mpio/hd200/lcd-as-hd200.S b/firmware/target/coldfire/mpio/hd200/lcd-as-hd200.S
index 7ea0b8f..9709e21 100644
--- a/firmware/target/coldfire/mpio/hd200/lcd-as-hd200.S
+++ b/firmware/target/coldfire/mpio/hd200/lcd-as-hd200.S
@@ -125,54 +125,45 @@
.type lcd_grey_data,@function
lcd_grey_data:
- lea.l (-2*4, %sp), %sp
- movem.l %a2-%a3, (%sp)
- movem.l (2*4+4, %sp), %a0-%a2 /* values, phases, length */
+ lea.l (-6*4, %sp), %sp /* reserve stack space for 6 registers */
+ movem.l %d2-%d5/%a2-%a3, (%sp) /* save the registers used below */
+ movem.l (6*4+4, %sp), %a0-%a2 /* values, phases, length */
add.l %a2, %a2
lea.l (%a1, %a2.l*4), %a2 /* end address */
lea.l LCD_BASE_ADDRESS+2, %a3 /* LCD data port address */
+ moveq.l #24, %d4 /* shift count */
+ move.l #0x204081, %d5 /* bit shuffle factor */
+
.ph_loop:
- clr.l %d1
- move.l (%a1), %d0 /* fetch 4 pixel phases */
- bclr.l #31, %d0 /* Z = !(p0 & 0x80); p0 &= ~0x80; */
- seq.b %d1 /* %d1 = ........................00000000 */
- lsl.l #1, %d1 /* %d1 = .......................00000000. */
- bclr.l #23, %d0 /* Z = !(p1 & 0x80); p1 &= ~0x80; */
- seq.b %d1 /* %d1 = .......................011111111 */
- lsl.l #1, %d1 /* %d1 = ......................011111111. */
- bclr.l #15, %d0 /* Z = !(p2 & 0x80); p2 &= ~0x80; */
- seq.b %d1 /* %d1 = ......................0122222222 */
- lsl.l #1, %d1 /* %d1 = .....................0122222222. */
- bclr.l #7, %d0 /* Z = !(p3 & 0x80); p3 &= ~0x80; */
- seq.b %d1 /* %d1 = .....................01233333333 */
- lsl.l #1, %d1 /* %d1 = ....................01233333333. */
- add.l (%a0)+, %d0 /* add 4 pixel values to the phases */
- move.l %d0, (%a1)+ /* store new phases, advance pointer */
+ movem.l (%a1), %d0-%d1 /* fetch 8 pixel phases */
- move.l (%a1), %d0 /* fetch 4 pixel phases */
- bclr.l #31, %d0 /* Z = !(p0 & 0x80); p0 &= ~0x80; */
- seq.b %d1 /* %d1 = ....................012344444444 */
- lsl.l #1, %d1 /* %d1 = ...................012344444444. */
- bclr.l #23, %d0 /* Z = !(p1 & 0x80); p1 &= ~0x80; */
- seq.b %d1 /* %d1 = ...................0123455555555 */
- lsl.l #1, %d1 /* %d1 = ..................0123455555555. */
- bclr.l #15, %d0 /* Z = !(p2 & 0x80); p2 &= ~0x80; */
- seq.b %d1 /* %d1 = ..................01234566666666 */
- lsl.l #1, %d1 /* %d1 = .................01234566666666. */
- bclr.l #7, %d0 /* Z = !(p3 & 0x80); p3 &= ~0x80; */
- seq.b %d1 /* %d1 = .................012345677777777 */
- lsr.l #7, %d1 /* %d1 = ........................01234567 */
- add.l (%a0)+, %d0 /* add 4 pixel values to the phases */
- move.l %d0, (%a1)+ /* store new phases, advance pointer */
+ move.l %d0, %d2 /* copy phases 0..3 to extract their top bits */
+ and.l #0x80808080, %d2 /* %d2 = 0.......1.......2.......3....... */
+ eor.l %d2, %d0 /* clear bit 7 of each of the first 4 phases */
+ add.l (%a0)+, %d0 /* add values to first 4 phases */
- move.w %d1, (%a3) /* transfer to lcd */
- move.w %d1, (%a3) /* transfer to lcd */
+ move.l %d1, %d3 /* copy phases 4..7 to extract their top bits */
+ and.l #0x80808080, %d3 /* %d3 = 4.......5.......6.......7....... */
+ eor.l %d3, %d1 /* clear bit 7 of each of the second 4 phases */
+ add.l (%a0)+, %d1 /* add values to second 4 phases */
+ lsr.l #4, %d3 /* %d3 = ....4.......5.......6.......7... */
+ or.l %d3, %d2 /* %d2 = 0...4...1...5...2...6...3...7... */
+ mulu.l %d5, %d2 /* %d2 = 01234567123.567.23..67..3...7... */
+ not.l %d2 /* invert all bits (bitwise complement) */
+ lsr.l %d4, %d2 /* %d2 = ........................01234567 */
+
+ move.w %d2, (%a3) /* transfer first LCD byte */
+
+ movem.l %d0-%d1, (%a1) /* store 8 new pixel phases */
+ addq.l #8, %a1 /* advance phases pointer; NOTE(review): the bls test below also loops when %a1 == end address - confirm intended */
+
+ move.w %d2, (%a3) /* transfer second LCD byte */
cmp.l %a2, %a1
bls.s .ph_loop
- movem.l (%sp), %a2-%a3
- lea.l (2*4, %sp), %sp
+ movem.l (%sp), %d2-%d5/%a2-%a3 /* restore the saved registers */
+ lea.l (6*4, %sp), %sp /* release the stack space */
rts
.grey_end:
.size lcd_grey_data,.grey_end-lcd_grey_data