ARM optimised memcpy/memmove from glibc.  This should give increased performance on all ARM targets, especially iPod 5G

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@12000 a1c6a512-1295-4272-9138-f99709370657
diff --git a/firmware/SOURCES b/firmware/SOURCES
index 4f8e531..3fdbdbb 100644
--- a/firmware/SOURCES
+++ b/firmware/SOURCES
@@ -248,8 +248,8 @@
 
 #elif defined(CPU_PP) || defined(CPU_ARM)
 /* CPU_PP => CPU_ARM, CPU_ARM !=> CPU_PP */
-common/memcpy.c
-common/memmove.c
+target/arm/memcpy-arm.S
+target/arm/memmove-arm.S
 common/strlen.c
 #ifndef SIMULATOR
 target/arm/memset-arm.S
diff --git a/firmware/target/arm/memcpy-arm.S b/firmware/target/arm/memcpy-arm.S
new file mode 100644
index 0000000..b8cbff0
--- /dev/null
+++ b/firmware/target/arm/memcpy-arm.S
@@ -0,0 +1,174 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 Free Software Foundation, Inc.
+ * This file was originally part of the GNU C Library
+ * Contributed to glibc by MontaVista Software, Inc. (written by Nicolas Pitre)
+ * Adapted for Rockbox by Daniel Ankers
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull            lsr
+#define push            lsl
+#else
+#define pull            lsl
+#define push            lsr
+#endif
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
+    .section    .icode,"ax",%progbits
+
+    .align      2
+    .global     memcpy
+    .type       memcpy,%function
+
+memcpy:
+		stmfd	sp!, {r0, r4, lr}
+
+		subs	r2, r2, #4
+		blt	8f
+		ands	ip, r0, #3
+		bne	9f
+		ands	ip, r1, #3
+		bne	10f
+
+1:		subs	r2, r2, #(28)
+		stmfd	sp!, {r5 - r8}
+		blt	5f
+
+2:
+3:
+4:		ldmia	r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
+		subs	r2, r2, #32
+		stmia	r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
+		bge	3b
+
+5:		ands	ip, r2, #28
+		rsb	ip, ip, #32
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f
+6:		nop
+		ldr	r3, [r1], #4
+		ldr	r4, [r1], #4
+		ldr	r5, [r1], #4
+		ldr	r6, [r1], #4
+		ldr	r7, [r1], #4
+		ldr	r8, [r1], #4
+		ldr	lr, [r1], #4
+
+		add	pc, pc, ip
+		nop
+		nop
+		str	r3, [r0], #4
+		str	r4, [r0], #4
+		str	r5, [r0], #4
+		str	r6, [r0], #4
+		str	r7, [r0], #4
+		str	r8, [r0], #4
+		str	lr, [r0], #4
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31
+		ldrneb	r3, [r1], #1
+		ldrcsb	r4, [r1], #1
+		ldrcsb	ip, [r1]
+		strneb	r3, [r0], #1
+		strcsb	r4, [r0], #1
+		strcsb	ip, [r0]
+
+		ldmfd	sp!, {r0, r4, pc}
+
+9:		rsb	ip, ip, #4
+		cmp	ip, #2
+		ldrgtb	r3, [r1], #1
+		ldrgeb	r4, [r1], #1
+		ldrb	lr, [r1], #1
+		strgtb	r3, [r0], #1
+		strgeb	r4, [r0], #1
+		subs	r2, r2, ip
+		strb	lr, [r0], #1
+		blt	8b
+		ands	ip, r1, #3
+		beq	1b
+
+10:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr	lr, [r1], #4
+		beq	17f
+		bgt	18f
+
+
+		.macro	forward_copy_shift pull push
+
+		subs	r2, r2, #28
+		blt	14f
+
+11:		stmfd	sp!, {r5 - r9}
+
+12:
+13:		ldmia	r1!, {r4, r5, r6, r7}
+		mov	r3, lr, pull #\pull
+		subs	r2, r2, #32
+		ldmia	r1!, {r8, r9, ip, lr}
+		orr	r3, r3, r4, push #\push
+		mov	r4, r4, pull #\pull
+		orr	r4, r4, r5, push #\push
+		mov	r5, r5, pull #\pull
+		orr	r5, r5, r6, push #\push
+		mov	r6, r6, pull #\pull
+		orr	r6, r6, r7, push #\push
+		mov	r7, r7, pull #\pull
+		orr	r7, r7, r8, push #\push
+		mov	r8, r8, pull #\pull
+		orr	r8, r8, r9, push #\push
+		mov	r9, r9, pull #\pull
+		orr	r9, r9, ip, push #\push
+		mov	ip, ip, pull #\pull
+		orr	ip, ip, lr, push #\push
+		stmia	r0!, {r3, r4, r5, r6, r7, r8, r9, ip}
+		bge	12b
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28
+		beq	16f
+
+15:		mov	r3, lr, pull #\pull
+		ldr	lr, [r1], #4
+		subs	ip, ip, #4
+		orr	r3, r3, lr, push #\push
+		str	r3, [r0], #4
+		bgt	15b
+
+16:		sub	r1, r1, #(\push / 8)
+		b	8b
+
+		.endm
+
+
+		forward_copy_shift	pull=8	push=24
+
+17:		forward_copy_shift	pull=16	push=16
+
+18:		forward_copy_shift	pull=24	push=8
+
diff --git a/firmware/target/arm/memmove-arm.S b/firmware/target/arm/memmove-arm.S
new file mode 100644
index 0000000..94103c0
--- /dev/null
+++ b/firmware/target/arm/memmove-arm.S
@@ -0,0 +1,188 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2006 Free Software Foundation, Inc.
+ * This file was originally part of the GNU C Library
+ * Contributed to glibc by MontaVista Software, Inc. (written by Nicolas Pitre)
+ * Adapted for Rockbox by Daniel Ankers
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+#include "config.h"
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull            lsr
+#define push            lsl
+#else
+#define pull            lsl
+#define push            lsr
+#endif
+
+		.text
+
+/*
+ * Prototype: void *memmove(void *dest, const void *src, size_t n);
+ *
+ * Note:
+ *
+ * If the memory regions don't overlap, we simply branch to memcpy which is
+ * normally a bit faster. Otherwise the copy is done going downwards.
+ */
+
+    .section    .icode,"ax",%progbits
+
+    .align      2
+    .global     memmove
+    .type       memmove,%function
+
+memmove:
+
+		subs	ip, r0, r1
+		cmphi	r2, ip
+		bls	memcpy
+
+		stmfd	sp!, {r0, r4, lr}
+		add	r1, r1, r2
+		add	r0, r0, r2
+		subs	r2, r2, #4
+		blt	8f
+		ands	ip, r0, #3
+		bne	9f
+		ands	ip, r1, #3
+		bne	10f
+
+1:		subs	r2, r2, #(28)
+		stmfd	sp!, {r5 - r8}
+		blt	5f
+
+2:
+3:
+4:		ldmdb	r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
+		subs	r2, r2, #32
+		stmdb	r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
+		bge	3b
+
+5:		ands	ip, r2, #28
+		rsb	ip, ip, #32
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f
+6:		nop
+		ldr	r3, [r1, #-4]!
+		ldr	r4, [r1, #-4]!
+		ldr	r5, [r1, #-4]!
+		ldr	r6, [r1, #-4]!
+		ldr	r7, [r1, #-4]!
+		ldr	r8, [r1, #-4]!
+		ldr	lr, [r1, #-4]!
+
+		add	pc, pc, ip
+		nop
+		nop
+		str	r3, [r0, #-4]!
+		str	r4, [r0, #-4]!
+		str	r5, [r0, #-4]!
+		str	r6, [r0, #-4]!
+		str	r7, [r0, #-4]!
+		str	r8, [r0, #-4]!
+		str	lr, [r0, #-4]!
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31
+		ldrneb	r3, [r1, #-1]!
+		ldrcsb	r4, [r1, #-1]!
+		ldrcsb	ip, [r1, #-1]
+		strneb	r3, [r0, #-1]!
+		strcsb	r4, [r0, #-1]!
+		strcsb	ip, [r0, #-1]
+		ldmfd	sp!, {r0, r4, pc}
+
+9:		cmp	ip, #2
+		ldrgtb	r3, [r1, #-1]!
+		ldrgeb	r4, [r1, #-1]!
+		ldrb	lr, [r1, #-1]!
+		strgtb	r3, [r0, #-1]!
+		strgeb	r4, [r0, #-1]!
+		subs	r2, r2, ip
+		strb	lr, [r0, #-1]!
+		blt	8b
+		ands	ip, r1, #3
+		beq	1b
+
+10:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr	r3, [r1, #0]
+		beq	17f
+		blt	18f
+
+
+		.macro	backward_copy_shift push pull
+
+		subs	r2, r2, #28
+		blt	14f
+
+11:		stmfd	sp!, {r5 - r9}
+
+12:
+13:		ldmdb   r1!, {r7, r8, r9, ip}
+		mov     lr, r3, push #\push
+		subs    r2, r2, #32
+		ldmdb   r1!, {r3, r4, r5, r6}
+		orr     lr, lr, ip, pull #\pull
+		mov     ip, ip, push #\push
+		orr     ip, ip, r9, pull #\pull
+		mov     r9, r9, push #\push
+		orr     r9, r9, r8, pull #\pull
+		mov     r8, r8, push #\push
+		orr     r8, r8, r7, pull #\pull
+		mov     r7, r7, push #\push
+		orr     r7, r7, r6, pull #\pull
+		mov     r6, r6, push #\push
+		orr     r6, r6, r5, pull #\pull
+		mov     r5, r5, push #\push
+		orr     r5, r5, r4, pull #\pull
+		mov     r4, r4, push #\push
+		orr     r4, r4, r3, pull #\pull
+		stmdb   r0!, {r4 - r9, ip, lr}
+		bge	12b
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28
+		beq	16f
+
+15:		mov     lr, r3, push #\push
+		ldr	r3, [r1, #-4]!
+		subs	ip, ip, #4
+		orr	lr, lr, r3, pull #\pull
+		str	lr, [r0, #-4]!
+		bgt	15b
+
+16:		add	r1, r1, #(\pull / 8)
+		b	8b
+
+		.endm
+
+
+		backward_copy_shift	push=8	pull=24
+
+17:		backward_copy_shift	push=16	pull=16
+
+18:		backward_copy_shift	push=24	pull=8
+
+