I was investigating the performance of an application, and was very
surprised upon profiling to discover that it was spending a large chunk of
time (something over 25%) in memmove.

To cut a long story short, it turns out that the asm version of memmove (on
i386 at least) is comically inefficient if len == 0. It runs its whole batch
of tests before eventually deciding that it's copied everything and
returning. As a quick test I plugged in the "portable" (and, one would
expect, substantially slower) memmove from
/usr/src/lib/libc/string/wmemmove.c and found myself the grateful recipient
of a noticeable speed increase in my application: the portable version
explicitly checks to see if len == 0 and returns immediately if so.

Hunting around a bit more, NetBSD have had a more highly optimised version
of our memmove/memcpy/bzero i386 code for quite some while. I've ported this
to OpenBSD, and also added an explicit check so that it returns immediately
if len == 0. As a rough idea, on the application I was looking at, the NetBSD
memmove gives a 3-4% speed increase to the application overall; the len == 0
check gives another 6% speed increase on top of that. Obviously the real-world
increases will vary depending on the application but these still seem to be
useful non-synthetic figures.


I've been running the following patch on an i386 -current desktop machine
without incident. Note that this creates a memcpy which - unlike the
current version - doesn't handle overlapping memory gracefully. I've
therefore included a tentative patch for the manpage, although of course
this patch currently only affects i386.

Comments welcome.


Laurie


--- src/lib/libc/string/memcpy.3.orig	Tue Apr 25 16:39:36 2006
+++ src/lib/libc/string/memcpy.3	Tue Apr 25 16:39:40 2006
@@ -66,12 +66,3 @@
 .Fn memcpy
 function conforms to
 .St -ansiC .
-.Sh BUGS
-In this implementation
-.Fn memcpy
-is implemented using
-.Xr bcopy 3 ,
-and therefore the buffers may overlap.
-On other systems, copying overlapping buffers may produce surprises.
-A simpler solution is to not use
-.Fn memcpy .
--- src/lib/libc/arch/i386/string/bcopy.S.orig	Sun Aug  7 12:30:16 2005
+++ src/lib/libc/arch/i386/string/bcopy.S	Tue Apr 25 16:41:47 2006
@@ -1,10 +1,12 @@
-/*	$OpenBSD: bcopy.S,v 1.5 2005/08/07 11:30:38 espie Exp $	*/
+/*  $OpenBSD$  */
+/*	$NetBSD: memcpy.S,v 1.2 2006/04/07 12:07:55 yamt Exp $	*/
 
 /*-
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from locore.s.
+ * Optimised by David Laight 2003
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -40,54 +42,91 @@
 
 #ifdef MEMCOPY
 ENTRY(memcpy)
+#define NO_OVERLAP
 #else
 #ifdef MEMMOVE
 ENTRY(memmove)
 #else
 ENTRY(bcopy)
 #endif
+
 #endif
-	pushl	%esi
-	pushl	%edi
+	push	%esi	/* %esi is the only register saved on the stack */
+	mov	%edi,%edx	/* keep caller's %edi in %edx; put back at .Ldone */
 #if defined(MEMCOPY) || defined(MEMMOVE)
+	movl	8(%esp),%edi	/* memcpy/memmove: 1st argument is the destination */
+	movl	12(%esp),%esi	/* 2nd argument is the source */
+#else
+	movl	8(%esp),%esi	/* bcopy: 1st argument is the source */
 	movl	12(%esp),%edi
-	movl	16(%esp),%esi
-	movl	%edi, %eax
+#endif
+	movl	16(%esp),%ecx	/* %ecx = 3rd argument, the byte count */
+	cmpl    $0,%ecx	/* nothing to copy? */
+	je  .Ldone	/* len == 0: return straight away */
+#if defined(NO_OVERLAP)
+	movl	%ecx,%eax	/* %eax = len, for the len & 3 tail test below */
 #else
-	movl	12(%esp),%esi
-	movl	16(%esp),%edi
+	movl	%edi,%eax
+	subl	%esi,%eax	/* %eax = dst - src */
+	cmpl	%ecx,%eax	/* unsigned (dst - src) < len: overlapping? */
+	movl	%ecx,%eax	/* %eax = len; mov leaves the cmpl flags intact */
+	jb	.Lbackwards	/* yes, must copy from the top down */
 #endif
-	movl	20(%esp),%ecx
-	movl	%ecx,%edx
-	cmpl	%esi,%edi	/* potentially overlapping? */
-	jnb	1f
 	cld			/* nope, copy forwards. */
 	shrl	$2,%ecx		/* copy by words */
 	rep
 	movsl
-	movl	%edx,%ecx
-	andl	$3,%ecx		/* any bytes left? */
-	rep
-	movsb
-	popl	%edi
-	popl	%esi
+	and	$3,%eax		/* any bytes left? */
+	jnz	.Ltrailing
+.Ldone:
+#if defined(MEMCOPY) || defined(MEMMOVE)
+	movl	8(%esp),%eax	/* %eax = 1st argument (the destination pointer) */
+#endif
+	mov	%edx,%edi	/* restore caller's %edi */
+	pop	%esi
 	ret
-1:
+
+.Ltrailing:
+	cmp	$2,%eax	/* %eax = 1, 2 or 3 leftover bytes */
+	jb	1f	/* just one byte */
+	movw	(%esi),%ax	/* two bytes in one go; flags from cmp survive */
+	movw	%ax,(%edi)
+	je	.Ldone	/* exactly 2 leftover bytes: finished */
+	movb	2(%esi),%al	/* third leftover byte */
+	movb	%al,2(%edi)
+	jmp	.Ldone
+1:	movb	(%esi),%al	/* single leftover byte */
+	movb	%al,(%edi)
+	jmp	.Ldone
+
+#if !defined(NO_OVERLAP)
+.Lbackwards:
 	addl	%ecx,%edi	/* copy backwards. */
 	addl	%ecx,%esi
+	and	$3,%eax		/* any fractional bytes? */
+	jnz	.Lback_align
+.Lback_aligned:
+	shrl	$2,%ecx	/* %ecx = number of whole 4-byte words */
+	subl	$4,%esi	/* point at the last whole word */
+	subl	$4,%edi
 	std
-	andl	$3,%ecx		/* any fractional bytes? */
-	decl	%edi
-	decl	%esi
 	rep
-	movsb
-	movl	%edx,%ecx
-	shrl	$2,%ecx
-	subl	$3,%esi
-	subl	$3,%edi
-	rep
 	movsl
-	popl	%edi
-	popl	%esi
 	cld
-	ret
+	jmp	.Ldone	/* shared exit restores %edi/%esi */
+
+.Lback_align:
+	sub	%eax,%esi	/* step back over the 1-3 odd bytes at the end */
+	sub	%eax,%edi
+	cmp	$2,%eax
+	jb	1f	/* 1 odd byte */
+	je	2f	/* exactly 2 odd bytes */
+	movb	2(%esi),%al	/* 3 odd bytes: copy the highest one first */
+	movb	%al,2(%edi)
+2:	movw	(%esi),%ax	/* then the remaining two as one word */
+	movw	%ax,(%edi)
+	jmp	.Lback_aligned
+1:	movb	(%esi),%al
+	movb	%al,(%edi)
+	jmp	.Lback_aligned
+#endif


!DSPAM:444e6f56188722013218095!