Fixed self-extracting loader: (1) Proper startup code, ensuring the stack pointer is set to the desired location. (2) Code cleanup.


git-svn-id: svn://svn.rockbox.org/rockbox/trunk@8103 a1c6a512-1295-4272-9138-f99709370657
diff --git a/firmware/decompressor/Makefile b/firmware/decompressor/Makefile
index 99e91aa..29a7852 100644
--- a/firmware/decompressor/Makefile
+++ b/firmware/decompressor/Makefile
@@ -21,7 +21,7 @@
 
 LDS := link.lds
 LINKFILE = $(OBJDIR)/linkage.lds
-OBJS := $(OBJDIR)/decompressor.o $(OBJDIR)/rockboxucl.o
+OBJS := $(OBJDIR)/decompressor.o $(OBJDIR)/rockboxucl.o $(OBJDIR)/startup.o
 
 CFLAGS = -O -W -Wall -m1 -nostdlib -ffreestanding -Wstrict-prototypes -fomit-frame-pointer -fschedule-insns
 
diff --git a/firmware/decompressor/decompressor.c b/firmware/decompressor/decompressor.c
index 1223ff2..9cd7d59 100644
--- a/firmware/decompressor/decompressor.c
+++ b/firmware/decompressor/decompressor.c
@@ -31,7 +31,8 @@
 extern char loadaddress[], dramend[];
 
 /* Prototypes */
-void start(void)  __attribute__ ((section (".start")));
+extern void start(void);
+
 void main(void) ICODE_ATTR;
 int ucl_nrv2e_decompress_8(const unsigned char *src, unsigned char *dst,
                            unsigned long *dst_len) ICODE_ATTR;
@@ -45,22 +46,6 @@
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 };
 
-/* Inline copy function */
-static inline void longcopy(long *dst, long *dst_end, const long *src)
-                            __attribute__ ((always_inline));
-static inline void longcopy(long *dst, long *dst_end, const long *src)
-{
-    while (dst < dst_end)
-        *dst++ = *src++;
-}
-
-/* Entry point */
-void start(void)
-{
-    longcopy((long *)iramstart, (long *)iramend, (long *)iramcopy);
-    main();
-}
-
 /** All subsequent functions are executed from IRAM **/
 
 /* Thinned out version of the UCL 2e decompression sourcecode
@@ -133,18 +118,20 @@
 {
     unsigned long dst_len; /* dummy */
     unsigned long img_len = (unsigned long)(imgend - imgstart);
-
-    longcopy((long *)(dramend - img_len), (long *) dramend,
-             (long *) imgstart);
-
+    unsigned long *src = (unsigned long *)imgstart;
+    unsigned long *dst = (unsigned long *)(dramend - img_len);
+    
+    do
+        *dst++ = *src++;
+    while (dst < (unsigned long *)dramend);
+    
     ucl_nrv2e_decompress_8(dramend - img_len + UCL_HEADER,
                            loadaddress, &dst_len);
 
     asm(
         "mov.l   @%0+,r0     \n"
-        "mov.l   @%0+,r15    \n"
         "jmp     @r0         \n"
-        "nop                 \n"
+        "mov.l   @%0+,r15    \n"
         : : "r"(loadaddress) : "r0"
     );
 }
diff --git a/firmware/decompressor/startup.S b/firmware/decompressor/startup.S
new file mode 100755
index 0000000..62efef9
--- /dev/null
+++ b/firmware/decompressor/startup.S
@@ -0,0 +1,57 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2005 by Jens Arnold
+ * based on crt0.S by Linus Nielsen Feltzing
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+    .section  .start,"ax",@progbits    /* "ax": allocatable, executable */
+    .global    _start
+_start:     /* entry: copy .iram, load the stack pointer, jump to _main */
+    /* copy the .iram section */
+    mov.l   .iramcopy_k,r0      /* r0 = source pointer (_iramcopy) */
+    mov.l   .iram_k,r1          /* r1 = destination pointer (_iramstart) */
+    mov.l   .iramend_k,r2       /* r2 = destination end (_iramend) */
+    /* Note: We cannot put a PC relative load into the delay slot of a 'bra'
+       instruction (the offset would be wrong), but there is nothing else to
+       do before the loop, so the delay slot would be 'nop'. The cmp / bf
+       sequence is the same length, but more efficient. */
+    cmp/hi  r1,r2               /* T = (r2 > r1), unsigned compare */
+    bf      .noiramcopy         /* skip the copy unless end > start */
+.iramloop:
+    mov.l   @r0+,r3             /* load one long, post-increment source */
+    mov.l   r3,@r1              /* store it at the destination */
+    add     #4,r1               /* advance destination by one long */
+    cmp/hi  r1,r2               /* T = (r2 > r1): anything left to copy? */
+    bt      .iramloop           /* yes: copy the next long */
+.noiramcopy:
+
+    /* call the mainline */
+    mov.l   .main_k,r0          /* r0 = address of _main */
+    mov.l   .stackend_k,r15     /* r15 (stack pointer) = _stackend */
+    jmp     @r0                 /* plain jump: no return address is set up */
+    nop                         /* delay slot of the jmp */
+
+    .align  2                   /* constants for the PC relative loads above */
+.iramcopy_k:
+    .long   _iramcopy           /* presumably provided by the linker script */
+.iram_k:
+    .long   _iramstart
+.iramend_k:
+    .long   _iramend
+.stackend_k:
+    .long   _stackend
+.main_k:
+    .long   _main