Self-extractor for on-disk firmware image: UCL decompressor in SH1 assembler - less than half the size of the compiled C function, and ~45% faster.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@18904 a1c6a512-1295-4272-9138-f99709370657
diff --git a/firmware/decompressor/Makefile b/firmware/decompressor/Makefile
index 46d7afe..33b6aff 100644
--- a/firmware/decompressor/Makefile
+++ b/firmware/decompressor/Makefile
@@ -14,7 +14,8 @@
 
 LDS := link.lds
 LINKFILE = $(OBJDIR)/linkage.lds
-OBJS := $(OBJDIR)/decompressor.o $(OBJDIR)/uclimage.o $(OBJDIR)/startup.o
+OBJS := $(OBJDIR)/decompressor.o $(OBJDIR)/uclimage.o \
+        $(OBJDIR)/sh_nrv2e_d8.o $(OBJDIR)/startup.o
 CFLAGS = $(GCCOPTS)
 
 all: $(OBJDIR)/compressed.bin
@@ -25,9 +26,6 @@
 $(OBJDIR)/compressed.elf : $(OBJS) $(LINKFILE)
 	$(call PRINTS,LD $(@F))$(CC) $(GCCOPTS) -Os -nostdlib -o $@ $(OBJS) -T$(LINKFILE) -Wl,-Map,$(OBJDIR)/compressed.map
 
-$(LDS): $(OBJS)
-	
-
 $(LINKFILE): $(LDS)
 	$(call PRINTS,Build LDS file)cat $< | $(CC) -DMEMORYSIZE=$(MEMORYSIZE) $(INCLUDES) $(TARGET) $(DEFINES) -E -P $(ROMBUILD) - >$@
 
@@ -39,11 +37,15 @@
 	$(SILENT)mkdir -p $(dir $@)
 	$(call PRINTS,AS $<)$(CC) $(CFLAGS) -c $< -o $@
 
+$(OBJDIR)/sh_nrv2e_d8.o : sh_nrv2e_d8.S
+	$(SILENT)mkdir -p $(dir $@)
+	$(call PRINTS,AS $<)$(CC) $(CFLAGS) -c $< -o $@
+
 $(OBJDIR)/uclimage.o : $(OBJDIR)/uclimage.c
 	$(SILENT)mkdir -p $(dir $@)
 	$(call PRINTS,CC $(<F))$(CC) $(CFLAGS) -c $< -o $@
 
 $(OBJDIR)/uclimage.c : $(FLASHFILE)  $(TOOLSDIR)/ucl2src.pl
 	$(SILENT)mkdir -p $(dir $@)
-	$(call PRINTS,UCL2SRC)perl -s $(TOOLSDIR)/ucl2src.pl -p=$(OBJDIR)/uclimage $<
+	$(call PRINTS,UCL2SRC $(<F))perl -s $(TOOLSDIR)/ucl2src.pl -p=$(OBJDIR)/uclimage $<
 
diff --git a/firmware/decompressor/decompressor.c b/firmware/decompressor/decompressor.c
index cec82b8..11888ef 100644
--- a/firmware/decompressor/decompressor.c
+++ b/firmware/decompressor/decompressor.c
@@ -36,8 +36,8 @@
 extern void start(void);
 
 void main(void) ICODE_ATTR;
-static int ucl_nrv2e_decompress_8(const unsigned char *src, unsigned char *dst,
-                                  unsigned long *dst_len) ICODE_ATTR;
+int ucl_nrv2e_decompress_8(const unsigned char *src, unsigned char *dst,
+                           unsigned long *dst_len) ICODE_ATTR;
 
 /* Vector table */
 void (*vbr[]) (void) __attribute__ ((section (".vectors"))) =
@@ -50,71 +50,6 @@
 
 /** All subsequent functions are executed from IRAM **/
 
-/* Thinned out version of the UCL 2e decompression sourcecode
- * Original (C) Markus F.X.J Oberhumer under GNU GPL license */
-#define GETBIT(bb, src, ilen) \
-    (((bb = bb & 0x7f ? bb*2 : ((unsigned)src[ilen++]*2+1)) >> 8) & 1)
-
-static int ucl_nrv2e_decompress_8(const unsigned char *src, unsigned char *dst,
-                                  unsigned long *dst_len)
-{
-    unsigned long bb = 0;
-    unsigned ilen = 0, olen = 0, last_m_off = 1;
-
-    for (;;)
-    {
-        unsigned m_off, m_len;
-
-        while (GETBIT(bb,src,ilen))
-            dst[olen++] = src[ilen++];
-
-        m_off = 1;
-        for (;;)
-        {
-            m_off = m_off*2 + GETBIT(bb,src,ilen);
-            if (GETBIT(bb,src,ilen))
-                break;
-            m_off = (m_off-1)*2 + GETBIT(bb,src,ilen);
-        }
-        if (m_off == 2)
-        {
-            m_off = last_m_off;
-            m_len = GETBIT(bb,src,ilen);
-        }
-        else
-        {
-            m_off = (m_off-3)*256 + src[ilen++];
-            if (m_off == 0xffffffff)
-                break;
-            m_len = (m_off ^ 0xffffffff) & 1;
-            m_off >>= 1;
-            last_m_off = ++m_off;
-        }
-        if (m_len)
-            m_len = 1 + GETBIT(bb,src,ilen);
-        else if (GETBIT(bb,src,ilen))
-            m_len = 3 + GETBIT(bb,src,ilen);
-        else
-        {
-            m_len++;
-            do {
-                m_len = m_len*2 + GETBIT(bb,src,ilen);
-            } while (!GETBIT(bb,src,ilen));
-            m_len += 3;
-        }
-        m_len += (m_off > 0x500);
-        {
-            const unsigned char *m_pos;
-            m_pos = dst + olen - m_off;
-            dst[olen++] = *m_pos++;
-            do dst[olen++] = *m_pos++; while (--m_len > 0);
-        }
-    }
-    *dst_len = olen;
-
-    return ilen;
-}
-
 #define ALIGNED_IMG_SIZE ((sizeof(image) + 3) & ~3)
 /* This will never return */
 void main(void)
diff --git a/firmware/decompressor/sh_nrv2e_d8.S b/firmware/decompressor/sh_nrv2e_d8.S
new file mode 100644
index 0000000..c002911
--- /dev/null
+++ b/firmware/decompressor/sh_nrv2e_d8.S
@@ -0,0 +1,155 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2008 by Jens Arnold
+ *
+ * based on  arm_nrv2e_d8.S -- ARM decompressor for NRV2E
+ * Copyright (C) 1996-2008 Markus Franz Xaver Johannes Oberhumer
+ * Copyright (C) 1996-2008 Laszlo Molnar
+ * Copyright (C) 2000-2008 John F. Reiser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#define src  r4  /* read pointer into the compressed input */
+#define dst  r5  /* write pointer into the decompressed output */
+#define len  r6  /* overlaps 'cnt' */
+#define cnt  r6  /* overlaps 'len' while reading an offset */
+#define tmp  r7  /* scratch */
+
+#define off  r0  /* must be r0 because of indexed addressing */
+#define bits r1  /* bit buffer; the next bit is shifted out of the MSB */
+#define bitmask r2  /* 0x7fffffff: tests whether anything is left below the MSB */
+#define wrnk r3  /* -0x500  -M2_MAX_OFFSET before "wrinkle" */
+
+
+#define GETBIT             \
+    tst     bits, bitmask; /* T=1 when no bits are set below the MSB */ \
+    bf      1f;            /* T=0: buffer still holds bits, skip the refill */ \
+    bsr     get1_n2e;      /* refill; the shll below executes in its delay slot */ \
+1:                         \
+    shll    bits  /* using the delay slot on purpose */
+
+#define getnextb(reg) GETBIT; rotcl   reg  /* reg = (reg << 1) | next bit */
+#define   jnextb0     GETBIT; bf  /* followed by a label: branch if next bit is 0 */
+#define   jnextb1     GETBIT; bt  /* followed by a label: branch if next bit is 1 */
+
+    .section  .icode,"ax",@progbits
+    .align  2
+    .global _ucl_nrv2e_decompress_8
+    .type   _ucl_nrv2e_decompress_8,@function
+
+/* src_len = ucl_nrv2e_decompress_8(const unsigned char *src,  (r4)
+ *                                  unsigned char *dst,        (r5)
+ *                                  unsigned long *dst_len)    (r6)
+ * Returns bytes consumed from src in r0; stores bytes written in *dst_len. */
+
+_ucl_nrv2e_decompress_8:
+    sts.l   pr, @-r15    /* save return address, GETBIT calls get1_n2e via bsr */
+    mov     #-1, off     ! off = -1 initial condition
+    mov.l   r6, @-r15    /* save dst_len pointer (r6 is reused as len/cnt) */
+    mov     #-5, wrnk
+    mov.l   r5, @-r15    /* save original dst */
+    shll8   wrnk         ! nrv2e -M2_MAX_OFFSET
+    mov.l   r4, @-r15    /* save original src */
+    mov     #-1, bitmask
+    shlr    bitmask      ! 0x7fffffff for testing before shifting
+    bra     top_n2e
+    not     bitmask, bits ! refill next time (MSB must be set)
+
+eof_n2e:
+    mov.l   @r15+, r0    ! r0 = orig_src
+    mov.l   @r15+, r1    ! r1 = orig_dst
+    sub     r0, src      /* src = number of input bytes consumed */
+    mov.l   @r15+, r2    ! r2 = plen_dst
+    sub     r1, dst      /* dst = number of output bytes produced */
+    mov.l   dst, @r2     /* *dst_len = output byte count */
+    lds.l   @r15+, pr
+    rts
+    mov     src, r0      /* (delay slot) return the input byte count */
+
+    .align  2
+get1_n2e:  ! in: T bit set
+    mov.b   @src+, bits ! SH1 sign-extends on load
+    rotcl   bits        ! LSB = T, T = MSB
+    shll16  bits
+    rts
+    shll8   bits        /* (delay slot) 24-bit shift in total: 7 data bits + marker on top */
+
+    .align  2
+lit_n2e:
+    mov.b   @src, tmp   /* copy one literal byte from input to output */
+    add     #1, src     ! Need to fill the pipeline latency anyway
+    mov.b   tmp, @dst
+    add     #1, dst
+top_n2e:
+    jnextb1 lit_n2e     /* control bit 1: a literal follows */
+    bra     getoff_n2e
+    mov     #1, cnt     /* (delay slot) offset accumulator starts at 1 */
+
+off_n2e:
+    add     #-1, cnt
+    getnextb(cnt)
+getoff_n2e:
+    getnextb(cnt)
+    jnextb0 off_n2e     /* stop bit 0: keep accumulating offset bits */
+
+    mov     cnt, tmp
+    mov     #0, len     ! cnt and len share a reg!
+    add     #-3, tmp
+    cmp/pz  tmp
+    bf      offprev_n2e ! cnt was 2
+    mov.b   @src+, off  ! low 7+1 bits
+    shll8   tmp
+    extu.b  off, off    /* undo the sign extension done by mov.b */
+    or      tmp, off
+    not     off, off    ! off = ~off
+    tst     off, off
+    bt      eof_n2e     /* raw offset 0xffffffff is the end-of-stream marker */
+    shar    off         /* off = negative match distance, T = shifted-out LSB */
+    bt      lenlast_n2e
+    bra     lenmore_n2e
+    mov     #1, len     /* (delay slot) */
+
+offprev_n2e:
+    jnextb1 lenlast_n2e /* reuse previous off (r0 untouched on this path) */
+    mov     #1, len
+lenmore_n2e:
+    jnextb1 lenlast_n2e
+len_n2e:
+    getnextb(len)
+    jnextb0 len_n2e
+    bra     gotlen_n2e
+    add     #6-2, len   /* (delay slot) bias for the long-length encoding */
+
+lenlast_n2e:
+    getnextb(len)       ! 0,1,2,3
+    add     #2, len
+gotlen_n2e:
+    cmp/gt  off, wrnk    /* T = (wrnk > off), i.e. distance is above 0x500 */
+    movt    tmp          ! too far away, so minimum match length is 3
+    add     tmp, len
+copy_n2e:
+    add     #-1, len
+    mov.b   @(off,dst), tmp /* off is negative: read from earlier output */
+    tst     len, len
+    mov.b   tmp, @dst
+    add     #1, dst
+    bf      copy_n2e     /* loop until len reaches 0 */
+    bra     top_n2e
+    nop
+
+    .size _ucl_nrv2e_decompress_8, .-_ucl_nrv2e_decompress_8