From owner-freebsd-current  Mon Apr  8 19:23:26 1996
Return-Path: owner-current
Received: (from root@localhost) by freefall.freebsd.org (8.7.3/8.7.3) id TAA25463 for current-outgoing; Mon, 8 Apr 1996 19:23:26 -0700 (PDT)
Received: from sunrise.cs.berkeley.edu (sunrise.CS.Berkeley.EDU [128.32.38.121]) by freefall.freebsd.org (8.7.3/8.7.3) with SMTP id TAA25446 for <current@FreeBSD.org>; Mon, 8 Apr 1996 19:23:19 -0700 (PDT)
Received: (from asami@localhost) by sunrise.cs.berkeley.edu (8.6.12/8.6.12) id TAA16078; Mon, 8 Apr 1996 19:25:31 -0700
Date: Mon, 8 Apr 1996 19:25:31 -0700
Message-Id: <199604090225.TAA16078@sunrise.cs.berkeley.edu>
To: terry@lambert.org
CC: paul@netcraft.co.uk, davidg@Root.COM, current@FreeBSD.org,
	nisha@cs.berkeley.edu, tege@matematik.su.se, hasty@star-gate.com
In-reply-to: <199604052314.QAA25117@phaeton.artisoft.com>
	(message from Terry Lambert on Fri, 5 Apr 1996 16:14:00 -0700 (MST))
Subject: Re: fast memory copy for large data sizes
From: asami@cs.berkeley.edu (Satoshi Asami)
Sender: owner-current@FreeBSD.org
X-Loop: FreeBSD.org
Precedence: bulk

 * I also don't see the code seriously dealing with misalignment between
 * source and target, which need to be aligned on the same boundary for
 * everything but the initial and final sub-increment sized moves.

That's a good point.  I changed our routine to fall back to the
original code if the alignment is not a multiple of 8.

Use our code all the time:

	70MB/s if multiple of 8
	42MB/s if not multiple of 8

Use original code if not multiple of 8:

	70MB/s if multiple of 8
	56MB/s if multiple of 4 but not multiple of 8
	34MB/s if not multiple of 4

This is the "rawread" test: a read/lseek loop that reads the same part
of the file/disk every time.  (I used a file and read 64K blocks, so
everything should be coming from the disk cache.)
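The test program is essentially the following read/lseek loop.  (This
is a reconstructed sketch, not the actual rawread source: the file
name, iteration count, and OFFSET knob are made up, and the timing
code that turns bytes moved / elapsed time into the MB/s figures above
is omitted.)

#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define	BLKSIZE	(64 * 1024)	/* 64K reads, as described above */
#define	NITER	10000		/* made-up iteration count */
#define	OFFSET	0		/* try 1 or 4 for the misaligned cases */

int
main(void)
{
	char *base, *buf;
	int fd, i;

	base = malloc(BLKSIZE + 8);
	fd = open("scratchfile", O_RDONLY);	/* hypothetical file name */
	if (base == NULL || fd == -1) {
		perror("setup");
		exit(1);
	}
	buf = base + OFFSET;	/* controls copyout destination alignment */
	for (i = 0; i < NITER; i++) {
		if (read(fd, buf, BLKSIZE) != BLKSIZE) {
			perror("read");
			exit(1);
		}
		/* seek back so every read hits the same cached blocks */
		if (lseek(fd, (off_t)0, SEEK_SET) == -1) {
			perror("lseek");
			exit(1);
		}
	}
	return (0);
}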
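To make the dispatch explicit: copyin/copyout only call fastmove for
moves of more than 1024 bytes, and fastmove itself bails out to the
old rep-movs code unless both addresses are multiples of 8.  As a C
predicate it would look like this (illustrative only -- the function
name is made up; the real tests are the cmpl $1024 and andl $7
sequences in the patch below):

#include <stddef.h>

/* Nonzero iff a copy would take the new FPU path. */
static int
want_fastmove(const void *src, const void *dst, size_t len)
{
	return (len > 1024 &&
	    ((unsigned long)src & 7) == 0 &&
	    ((unsigned long)dst & 7) == 0);
}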
 * Often it's better, if the alignment isn't there, to fall back to
 * the old code.

From the above, it seems like we can still win in some cases, but I
don't think further complicating the code is going to help us much,
since most of the big moves are (probably) going to be 8-byte aligned
anyway.

Satoshi

P.S. Here's the code after taking Terry's suggestions into account:

Index: support.s
===================================================================
RCS file: /usr/cvs/src/sys/i386/i386/support.s,v
retrieving revision 1.31
diff -u -r1.31 support.s
--- 1.31	1995/12/28 23:14:40
+++ support.s	1996/04/09 01:58:54
@@ -463,6 +463,14 @@
 	/* bcopy(%esi, %edi, %ebx) */
 3:
 	movl	%ebx,%ecx
+	cmpl	$1024,%ecx
+	jbe	slow_copyout
+
+	call	fastmove
+	jmp	done_copyout
+
+	ALIGN_TEXT
+slow_copyout:
 	shrl	$2,%ecx
 	cld
 	rep
@@ -510,6 +518,14 @@
 	cmpl	$VM_MAXUSER_ADDRESS,%edx
 	ja	copyin_fault
 
+	cmpl	$1024,%ecx
+	jbe	slow_copyin
+
+	call	fastmove
+	jmp	done_copyin
+
+	ALIGN_TEXT
+slow_copyin:
 	movb	%cl,%al
 	shrl	$2,%ecx			/* copy longword-wise */
 	cld
@@ -520,6 +536,8 @@
 	rep
 	movsb
 
+	ALIGN_TEXT
+done_copyin:
 	popl	%edi
 	popl	%esi
 	xorl	%eax,%eax
@@ -534,6 +552,84 @@
 	movl	_curpcb,%edx
 	movl	$0,PCB_ONFAULT(%edx)
 	movl	$EFAULT,%eax
+	ret
+
+/* fastmove(src, dst, len)
+	src in %esi
+	dst in %edi
+	len in %ecx
+	uses %eax and %edx for tmp. storage
+ */
+	ALIGN_TEXT
+fastmove:
+	cmpl	$63,%ecx
+	jbe	L57
+
+	movl	%esi,%eax
+	andl	$7,%eax			/* check if src addr is multiple of 8 */
+	jnz	L57
+
+	movl	%edi,%eax
+	andl	$7,%eax			/* check if dst addr is multiple of 8 */
+	jnz	L57
+
+	movl	%cr0,%edx
+	movl	$8, %eax		/* CR0_TS */
+	not	%eax
+	andl	%eax,%edx		/* clear CR0_TS */
+	movl	%edx,%cr0
+
+	subl	$108,%esp
+	fsave	(%esp)
+
+	ALIGN_TEXT
+L58:
+	fildq	0(%esi)
+	fildq	8(%esi)
+	fildq	16(%esi)
+	fildq	24(%esi)
+	fildq	32(%esi)
+	fildq	40(%esi)
+	fildq	48(%esi)
+	fildq	56(%esi)
+	fxch	%st(7)
+	fistpq	0(%edi)
+	fxch	%st(5)
+	fistpq	8(%edi)
+	fxch	%st(3)
+	fistpq	16(%edi)
+	fxch	%st(1)
+	fistpq	24(%edi)
+	fistpq	32(%edi)
+	fistpq	40(%edi)
+	fistpq	48(%edi)
+	fistpq	56(%edi)
+	addl	$-64,%ecx
+	addl	$64,%esi
+	addl	$64,%edi
+	cmpl	$63,%ecx
+	ja	L58
+
+	frstor	(%esp)
+	addl	$108,%esp
+
+	andl	$8,%edx
+	movl	%cr0,%eax
+	orl	%edx, %eax		/* reset CR0_TS to the original value */
+	movl	%eax,%cr0
+
+	ALIGN_TEXT
+L57:
+	movb	%cl,%al
+	shrl	$2,%ecx			/* copy longword-wise */
+	cld
+	rep
+	movsl
+	movb	%al,%cl
+	andb	$3,%cl			/* copy remaining bytes */
+	rep
+	movsb
+	ret
 
 /*
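P.P.S. For those who don't read i386 assembler, here is roughly what
fastmove does, transliterated into C (illustration only: the eight
64-bit moves stand in for the fildq/fistpq pairs -- the real code
uses the FPU's integer load/store so arbitrary bit patterns survive,
interleaves fxch to reorder the FPU stack, and has to clear CR0_TS
and fsave/frstor the FPU state around the loop, none of which has a C
equivalent; the name and the use of `unsigned long long' are made up):

#include <stddef.h>

static void
fastmove_c(char *dst, const char *src, size_t len)
{
	/* The andl $7 tests: take the block loop only if both
	   addresses are multiples of 8; otherwise (or once fewer than
	   64 bytes remain, the cmpl $63/jbe L57) fall through to the
	   tail copy. */
	if (((unsigned long)src & 7) == 0 &&
	    ((unsigned long)dst & 7) == 0) {
		while (len > 63) {		/* the L58 loop */
			const unsigned long long *s =
			    (const unsigned long long *)src;
			unsigned long long *d = (unsigned long long *)dst;

			d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
			d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
			src += 64;
			dst += 64;
			len -= 64;
		}
	}
	/* L57: a byte loop here for simplicity; the real code does
	   rep movsl plus a rep movsb for the last few bytes. */
	while (len-- > 0)
		*dst++ = *src++;
}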