From owner-freebsd-current  Mon Apr  8 19:23:26 1996
Return-Path: owner-current
Received: (from root@localhost) by freefall.freebsd.org (8.7.3/8.7.3) id TAA25463 for current-outgoing; Mon, 8 Apr 1996 19:23:26 -0700 (PDT)
Received: from sunrise.cs.berkeley.edu (sunrise.CS.Berkeley.EDU [128.32.38.121]) by freefall.freebsd.org (8.7.3/8.7.3) with SMTP id TAA25446 for <current@FreeBSD.org>; Mon, 8 Apr 1996 19:23:19 -0700 (PDT)
Received: (from asami@localhost) by sunrise.cs.berkeley.edu (8.6.12/8.6.12) id TAA16078; Mon, 8 Apr 1996 19:25:31 -0700
Date: Mon, 8 Apr 1996 19:25:31 -0700
Message-Id: <199604090225.TAA16078@sunrise.cs.berkeley.edu>
To: terry@lambert.org
CC: paul@netcraft.co.uk, davidg@Root.COM, current@FreeBSD.org,
	nisha@cs.berkeley.edu, tege@matematik.su.se, hasty@star-gate.com
In-reply-to: <199604052314.QAA25117@phaeton.artisoft.com>
	(message from Terry Lambert on Fri, 5 Apr 1996 16:14:00 -0700 (MST))
Subject: Re: fast memory copy for large data sizes
From: asami@cs.berkeley.edu (Satoshi Asami)
Sender: owner-current@FreeBSD.org
X-Loop: FreeBSD.org
Precedence: bulk

 * I also don't see the code seriously dealing with misalignment between
 * source and target, which need to be aligned on the same boundary for
 * everything but the initial and final sub-increment sized moves.

That's a good point.  I changed our routine to fall back to the
original code if the alignment is not a multiple of 8.

Use our code all the time:

	70MB/s if multiple of 8
	42MB/s if not multiple of 8

Use original code if not multiple of 8:

	70MB/s if multiple of 8
	56MB/s if multiple of 4 but not multiple of 8
	34MB/s if not multiple of 4

This is the "rawread" test: a read/lseek loop that reads the same part
of the file/disk every time.  (I used a file and read 64K blocks, so
everything should be coming from the disk cache.)
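The test program is essentially the following read/lseek loop.  (This
is a reconstructed sketch, not the actual rawread source: the file
name, iteration count, and OFFSET knob are made up, and the timing
code that turns bytes moved / elapsed time into the MB/s figures above
is omitted.)

#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define	BLKSIZE	(64 * 1024)	/* 64K reads, as described above */
#define	NITER	10000		/* made-up iteration count */
#define	OFFSET	0		/* try 1 or 4 for the misaligned cases */

int
main(void)
{
	char *base, *buf;
	int fd, i;

	base = malloc(BLKSIZE + 8);
	fd = open("scratchfile", O_RDONLY);	/* hypothetical file name */
	if (base == NULL || fd == -1) {
		perror("setup");
		exit(1);
	}
	buf = base + OFFSET;	/* controls copyout destination alignment */
	for (i = 0; i < NITER; i++) {
		if (read(fd, buf, BLKSIZE) != BLKSIZE) {
			perror("read");
			exit(1);
		}
		/* seek back so every read hits the same cached blocks */
		if (lseek(fd, (off_t)0, SEEK_SET) == -1) {
			perror("lseek");
			exit(1);
		}
	}
	return (0);
}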
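To make the dispatch explicit: copyin/copyout only call fastmove for
moves of more than 1024 bytes, and fastmove itself bails out to the
old rep-movs code unless both addresses are multiples of 8.  As a C
predicate it would look like this (illustrative only -- the function
name is made up; the real tests are the cmpl $1024 and andl $7
sequences in the patch below):

#include <stddef.h>

/* Nonzero iff a copy would take the new FPU path. */
static int
want_fastmove(const void *src, const void *dst, size_t len)
{
	return (len > 1024 &&
	    ((unsigned long)src & 7) == 0 &&
	    ((unsigned long)dst & 7) == 0);
}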
 * Often it's better, if the alignment isn't there, to fall back to
 * the old code.

From the above, it seems like we can still win in some cases, but I
don't think further complicating the code is going to help us much,
since most of the big moves are (probably) going to be 8-byte aligned
anyway.

Satoshi

P.S. Here's the code after taking Terry's suggestions into account:

Index: support.s
===================================================================
RCS file: /usr/cvs/src/sys/i386/i386/support.s,v
retrieving revision 1.31
diff -u -r1.31 support.s
--- 1.31	1995/12/28 23:14:40
+++ support.s	1996/04/09 01:58:54
@@ -463,6 +463,14 @@
 	/* bcopy(%esi, %edi, %ebx) */
 3:
 	movl	%ebx,%ecx
+	cmpl	$1024,%ecx
+	jbe	slow_copyout
+
+	call	fastmove
+	jmp	done_copyout
+
+	ALIGN_TEXT
+slow_copyout:
 	shrl	$2,%ecx
 	cld
 	rep
@@ -510,6 +518,14 @@
 	cmpl	$VM_MAXUSER_ADDRESS,%edx
 	ja	copyin_fault
 
+	cmpl	$1024,%ecx
+	jbe	slow_copyin
+
+	call	fastmove
+	jmp	done_copyin
+
+	ALIGN_TEXT
+slow_copyin:
 	movb	%cl,%al
 	shrl	$2,%ecx			/* copy longword-wise */
 	cld
@@ -520,6 +536,8 @@
 	rep
 	movsb
 
+	ALIGN_TEXT
+done_copyin:
 	popl	%edi
 	popl	%esi
 	xorl	%eax,%eax
@@ -534,6 +552,84 @@
 	movl	_curpcb,%edx
 	movl	$0,PCB_ONFAULT(%edx)
 	movl	$EFAULT,%eax
+	ret
+
+/* fastmove(src, dst, len)
+	src in %esi
+	dst in %edi
+	len in %ecx
+	uses %eax and %edx for tmp. storage
+ */
+	ALIGN_TEXT
+fastmove:
+	cmpl	$63,%ecx
+	jbe	L57
+
+	movl	%esi,%eax
+	andl	$7,%eax			/* check if src addr is multiple of 8 */
+	jnz	L57
+
+	movl	%edi,%eax
+	andl	$7,%eax			/* check if dst addr is multiple of 8 */
+	jnz	L57
+
+	movl	%cr0,%edx
+	movl	$8, %eax		/* CR0_TS */
+	not	%eax
+	andl	%eax,%edx		/* clear CR0_TS */
+	movl	%edx,%cr0
+
+	subl	$108,%esp
+	fsave	(%esp)
+
+	ALIGN_TEXT
+L58:
+	fildq	0(%esi)
+	fildq	8(%esi)
+	fildq	16(%esi)
+	fildq	24(%esi)
+	fildq	32(%esi)
+	fildq	40(%esi)
+	fildq	48(%esi)
+	fildq	56(%esi)
+	fxch	%st(7)
+	fistpq	0(%edi)
+	fxch	%st(5)
+	fistpq	8(%edi)
+	fxch	%st(3)
+	fistpq	16(%edi)
+	fxch	%st(1)
+	fistpq	24(%edi)
+	fistpq	32(%edi)
+	fistpq	40(%edi)
+	fistpq	48(%edi)
+	fistpq	56(%edi)
+	addl	$-64,%ecx
+	addl	$64,%esi
+	addl	$64,%edi
+	cmpl	$63,%ecx
+	ja	L58
+
+	frstor	(%esp)
+	addl	$108,%esp
+
+	andl	$8,%edx
+	movl	%cr0,%eax
+	orl	%edx, %eax		/* reset CR0_TS to the original value */
+	movl	%eax,%cr0
+
+	ALIGN_TEXT
+L57:
+	movb	%cl,%al
+	shrl	$2,%ecx			/* copy longword-wise */
+	cld
+	rep
+	movsl
+	movb	%al,%cl
+	andb	$3,%cl			/* copy remaining bytes */
+	rep
+	movsb
+	ret
 
 /*
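P.P.S. For those who don't read i386 assembler, here is roughly what
fastmove does, transliterated into C (illustration only: the eight
64-bit moves stand in for the fildq/fistpq pairs -- the real code
uses the FPU's integer load/store so arbitrary bit patterns survive,
interleaves fxch to reorder the FPU stack, and has to clear CR0_TS
and fsave/frstor the FPU state around the loop, none of which has a C
equivalent; the name and the use of `unsigned long long' are made up):

#include <stddef.h>

static void
fastmove_c(char *dst, const char *src, size_t len)
{
	/* The andl $7 tests: take the block loop only if both
	   addresses are multiples of 8; otherwise (or once fewer than
	   64 bytes remain, the cmpl $63/jbe L57) fall through to the
	   tail copy. */
	if (((unsigned long)src & 7) == 0 &&
	    ((unsigned long)dst & 7) == 0) {
		while (len > 63) {		/* the L58 loop */
			const unsigned long long *s =
			    (const unsigned long long *)src;
			unsigned long long *d = (unsigned long long *)dst;

			d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
			d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
			src += 64;
			dst += 64;
			len -= 64;
		}
	}
	/* L57: a byte loop here for simplicity; the real code does
	   rep movsl plus a rep movsb for the last few bytes. */
	while (len-- > 0)
		*dst++ = *src++;
}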