Add mempcpy implementation

mempcpy is a GNU extension that returns dst + size instead of dst. It is
a handy shortcut when you frequently copy strings of known size or
back-to-back blocks.

It may be called directly or, with some compiler versions, through
__builtin_mempcpy.

For ASM on native targets, it is implemented as an alternate entrypoint
to memcpy which adds minimal code and overhead.

Change-Id: I4cbb3483f6df3c1007247fe0a95fd7078737462b
diff --git a/firmware/asm/SOURCES b/firmware/asm/SOURCES
index a9293b4..ebb6951 100644
--- a/firmware/asm/SOURCES
+++ b/firmware/asm/SOURCES
@@ -7,6 +7,10 @@
 strlen.c
 #endif
 
+#if defined(WIN32) || defined(APPLICATION)
+mempcpy.c
+#endif
+
 #if (defined(SANSA_E200) || defined(GIGABEAT_F) || defined(GIGABEAT_S) || \
     defined(CREATIVE_ZVx) || defined(SANSA_CONNECT) || defined(SANSA_FUZEPLUS) || \
     defined(COWON_D2) || defined(MINI2440) || defined(SAMSUNG_YPR0) || \
diff --git a/firmware/asm/arm/memcpy.S b/firmware/asm/arm/memcpy.S
index 2a55fb5..83d4329 100644
--- a/firmware/asm/arm/memcpy.S
+++ b/firmware/asm/arm/memcpy.S
@@ -36,17 +36,25 @@
 #endif
 
 /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+/* Prototype: void *mempcpy(void *dest, const void *src, size_t n); */
 
     .section    .icode,"ax",%progbits
 
     .align      2
     .global     memcpy
     .type       memcpy,%function
+    .global     mempcpy
+    .type       mempcpy,%function
+
+mempcpy:                               /* alternate entry into memcpy; result is dest + n */
+        add     r3, r0, r2             /* r3 = r0 + r2, the value mempcpy has to return */
+        stmfd   sp!, {r3, r4, lr}      /* same frame as memcpy, with r3 stored where memcpy stores r0 */
+        b   0f                         /* join memcpy just past its own stmfd */
 
 memcpy:
         stmfd   sp!, {r0, r4, lr}
 
-        subs    r2, r2, #4
+0:      subs    r2, r2, #4
         blt 8f
         ands    ip, r0, #3
         bne 9f
diff --git a/firmware/asm/m68k/memcpy.S b/firmware/asm/m68k/memcpy.S
index 9762e31..a88ac3d 100644
--- a/firmware/asm/m68k/memcpy.S
+++ b/firmware/asm/m68k/memcpy.S
@@ -27,6 +27,8 @@
     .global memcpy
     .global __memcpy_fwd_entry
     .type   memcpy,@function
+    .global mempcpy
+    .type   mempcpy,@function
 
 /* Copies <length> bytes of data in memory from <source> to <dest>
  * This version is optimized for speed
@@ -53,6 +55,14 @@
  * long+3) it writes longwords only. Same goes for word aligned destinations
  * if FULLSPEED is undefined.
  */
+mempcpy:                    /* alternate entry into memcpy; result is Destination + Length */
+    move.l  (4,%sp),%a1     /* Destination */
+    move.l  (8,%sp),%a0     /* Source */
+    move.l  (12,%sp),%d1    /* Length */
+
+    add.l   %d1,(4,%sp)     /* retval=Destination + Length, written back over the stack argument */
+    bra.b   __memcpy_fwd_entry  /* continue in memcpy; NOTE(review): assumes its exit returns (4,%sp) - confirm */
+
 memcpy:
     move.l  (4,%sp),%a1     /* Destination */
     move.l  (8,%sp),%a0     /* Source */
diff --git a/firmware/asm/mempcpy.c b/firmware/asm/mempcpy.c
new file mode 100644
index 0000000..2b1ccec
--- /dev/null
+++ b/firmware/asm/mempcpy.c
@@ -0,0 +1,47 @@
+/*
+FUNCTION
+        <<mempcpy>>---copy memory regions and return end pointer
+
+ANSI_SYNOPSIS
+        #include <string.h>
+        void* mempcpy(void *<[out]>, const void *<[in]>, size_t <[n]>);
+
+TRAD_SYNOPSIS
+        void *mempcpy(<[out]>, <[in]>, <[n]>)
+        void *<[out]>;
+        void *<[in]>;
+        size_t <[n]>;
+
+DESCRIPTION
+        This function copies <[n]> bytes from the memory region
+        pointed to by <[in]> to the memory region pointed to by
+        <[out]>.
+
+        If the regions overlap, the behavior is undefined.
+
+RETURNS
+        <<mempcpy>> returns a pointer to the byte following the
+        last byte copied to the <[out]> region.
+
+PORTABILITY
+<<mempcpy>> is a GNU extension.
+
+<<mempcpy>> requires no supporting OS subroutines.
+
+        */
+
+#include "config.h"
+#include "_ansi.h" /* for _DEFUN */
+#include <string.h>
+
+/* Copies len0 bytes from src0 to dst0 and returns dst0 + len0. This may be
+   conjoined with memcpy in <cpu>/memcpy.S to get it nearly for free. The cast
+   avoids arithmetic on void * (presumably what _PTR is), a GNU extension. */
+_PTR
+_DEFUN (mempcpy, (dst0, src0, len0),
+        _PTR dst0 _AND
+        _CONST _PTR src0 _AND
+        size_t len0)
+{
+    return (char *)memcpy(dst0, src0, len0) + len0;
+}
diff --git a/firmware/asm/mips/memcpy.S b/firmware/asm/mips/memcpy.S
index edbf5ac..ec1625b 100644
--- a/firmware/asm/mips/memcpy.S
+++ b/firmware/asm/mips/memcpy.S
@@ -43,8 +43,16 @@
 
     .global    memcpy
     .type      memcpy, %function
+    .global    mempcpy
+    .type      mempcpy, %function
     
     .set       noreorder
+mempcpy:                                # Alternate entry into memcpy, exit value is a0 + a2
+    slti       t0, a2, 8                # Less than 8?
+    bne        t0, zero, last8          # Yes: branch to last8
+     addu      v0, a0, a2               # Delay slot: exit value = s1 + n (a0 + a2)
+    b          1f                       # No: join memcpy at its 1: label
+     xor       t0, a1, a0               # Find a0/a1 displacement (fill delay)
 
 memcpy:
     slti       t0, a2, 8                # Less than 8?
@@ -52,7 +60,8 @@
     move       v0, a0                   # Setup exit value before too late
 
     xor        t0, a1, a0               # Find a0/a1 displacement
-    andi       t0, 0x3
+
+1:  andi       t0, 0x3
     bne        t0, zero, shift          # Go handle the unaligned case
     subu       t1, zero, a1
     andi       t1, 0x3                  # a0/a1 are aligned, but are we
diff --git a/firmware/asm/sh/memcpy.S b/firmware/asm/sh/memcpy.S
index 59c5801..3d623c4 100644
--- a/firmware/asm/sh/memcpy.S
+++ b/firmware/asm/sh/memcpy.S
@@ -24,8 +24,10 @@
 
     .align      2
     .global     _memcpy
+    .global     _mempcpy
     .global     ___memcpy_fwd_entry
     .type       _memcpy,@function
+    .type       _mempcpy,@function
 
 /* Copies <length> bytes of data in memory from <source> to <dest>
  * This version is optimized for speed
@@ -51,6 +53,10 @@
  * The instruction order is devised in a way to utilize the pipelining
  * of the SH1 to the max. The routine also tries to utilize fast page mode.
  */
+_mempcpy:               /* alternate entry into memcpy; result is dest + length */
+    mov     r4,r7       /* r7 = dest; length is added in the delay slot below */
+    bra     ___memcpy_fwd_entry /* continue in the memcpy code */
+     add    r6,r7       /* (delay slot) r7 = dest + length for returning */
 
 _memcpy:
     mov     r4,r7       /* store dest for returning */
@@ -217,3 +223,5 @@
     mov     r7,r0       /* return dest start address */
 .end:
     .size   _memcpy,.end-_memcpy
+    .size   _mempcpy,.end-_mempcpy
+
diff --git a/firmware/include/string-extra.h b/firmware/include/string-extra.h
index bae250d..6a9e0c7 100644
--- a/firmware/include/string-extra.h
+++ b/firmware/include/string-extra.h
@@ -18,8 +18,8 @@
  * KIND, either express or implied.
  *
  ****************************************************************************/
-
-
+#ifndef STRING_EXTRA_H
+#define STRING_EXTRA_H
 #include <string.h>
 #include "strlcpy.h"
 #include "strlcat.h"
@@ -27,3 +27,11 @@
 #include "strcasestr.h"
 #include "strtok_r.h"
 #include "memset16.h"
+
+#if defined(WIN32) || defined(APPLICATION)
+#ifndef mempcpy
+#define mempcpy __builtin_mempcpy /* use the compiler builtin unless a mempcpy macro already exists */
+#endif /* !mempcpy */
+#endif /* WIN32 || APPLICATION */
+
+#endif /* STRING_EXTRA_H */
diff --git a/firmware/libc/include/string.h b/firmware/libc/include/string.h
index 9346611..9815c62 100644
--- a/firmware/libc/include/string.h
+++ b/firmware/libc/include/string.h
@@ -20,6 +20,7 @@
 _PTR     _EXFUN(memchr,(const _PTR, int, size_t));
 int      _EXFUN(memcmp,(const _PTR, const _PTR, size_t));
 _PTR     _EXFUN(memcpy,(_PTR, const _PTR, size_t));
+_PTR     _EXFUN(mempcpy,(_PTR, const _PTR, size_t));
 _PTR     _EXFUN(memmove,(_PTR, const _PTR, size_t));
 _PTR     _EXFUN(memset,(_PTR, int, size_t));
 char    *_EXFUN(strcat,(char *, const char *));