Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 5 Apr 1996 03:16:38 -0800 (PST)
From:      asami@cs.berkeley.edu (Satoshi Asami)
To:        davidg@root.com
Cc:        current@freebsd.org, nisha@cs.berkeley.edu, tege@matematik.su.se, hasty@rah.star-gate.com, dyson@freebsd.org
Subject:   Re: fast memory copy for large data sizes
Message-ID:  <199604051116.DAA24816@silvia.HIP.Berkeley.EDU>
In-Reply-To: <199604051021.CAA00222@Root.COM> (message from David Greenman on Fri, 05 Apr 1996 02:21:48 -0800)

next in thread | previous in thread | raw e-mail | index | archive | help
 > I have that mail, tried what was in there, but it wasn't as fast as FP 
 > copies.  Maybe I screwed up something, I'll try again tomorrow.

It wasn't much trouble so I tried it again.  Here's what I got on the
133MHz Pentium:

    size     libc             ours
      32      N/A         30.517578 MB/s
      64  61.035156 MB/s  30.517578 MB/s
     128  40.690104 MB/s  40.690104 MB/s
     256  40.690104 MB/s  34.877232 MB/s
     512  40.690104 MB/s  34.877232 MB/s
    1024  40.690104 MB/s  33.674569 MB/s
    2048  39.859694 MB/s  34.265351 MB/s
    4096  39.859694 MB/s  34.265351 MB/s
    8192  39.657360 MB/s  34.115721 MB/s
   16384  39.556962 MB/s  34.115721 MB/s
   32768  39.506953 MB/s  34.153005 MB/s
   65536  39.531942 MB/s  34.227820 MB/s
  131072  39.345294 MB/s  34.125034 MB/s
  262144  39.227993 MB/s  34.227820 MB/s
  524288  38.735668 MB/s  34.218451 MB/s
 1048576  38.224839 MB/s  34.263003 MB/s
 2097152  37.799323 MB/s  34.270635 MB/s
 4194304  37.700283 MB/s  34.283265 MB/s

Hmm.  I can't even get it to be faster than libc now.  I think I've
seen 40MB/s for large copies before, but I don't remember exactly what
I did.

Satoshi

P.S. Here's the "unrolled" routine, pretty much stolen from Torbjorn's
mail to -hackers:

	.align 2
.globl _unrolled
	.type	 _unrolled,@function
/*
 * _unrolled: copy `count' bytes from the first pointer argument to the
 * second.  Whole 32-byte blocks go through an unrolled loop of 32-bit
 * loads and stores; the remaining 0-31 bytes are copied with
 * "rep movsl" followed by "rep movsb".
 *
 * Stack arguments, as read once the frame is set up:
 *    8(%ebp)   source address        -> %esi
 *   12(%ebp)   destination address   -> %edi
 *   16(%ebp)   count, in bytes       -> %ecx
 *
 * %ebp, %edi and %esi are saved and restored.  %eax, %ecx, %edx and
 * the arithmetic flags are overwritten, and the direction flag is left
 * clear.  %eax is not loaded with any particular return value.
 *
 * Both pointers only ever advance, so the copy runs in ascending
 * address order.
 */
_unrolled:
	pushl %ebp
	movl %esp,%ebp
	/* %edi and %esi are used as the copy pointers; keep the caller's values */
	pushl %edi
	pushl %esi
	movl 8(%ebp),%esi	/* first argument: source pointer */
	movl 12(%ebp),%edi	/* second argument: destination pointer */
	movl 16(%ebp),%ecx	/* count is in bytes */

	/* %ecx = number of whole 32-byte blocks; none means only the tail runs */
	shrl $5,%ecx
	jz L54
	
	/*
	 * The value read here is discarded (%eax is overwritten at L55).
	 * NOTE(review): presumably the read exists only to pull the first
	 * destination line into the cache before it is stored to -- confirm
	 * against the target CPU's cache behaviour.
	 */
	movl (%edi),%eax	/* fetch destination cache line */

	/* Main loop: one 32-byte block per iteration, %ecx blocks remaining. */
	.align 2,0x90
L55:	movl	28(%edi),%eax	/* fetch destination cache line (last word of this block; value discarded) */
	orl	%eax,%eax	/* to make things go in pairs (leaves %eax's value unchanged) */

	movl	(%esi),%eax	/* load pairwise */
	movl	4(%esi),%edx
	movl	%eax,(%edi)	/* and store pairwise */
	movl	%edx,4(%edi)

	/* bytes 8-15 of the block */
	movl	8(%esi),%eax
	movl	12(%esi),%edx
	movl	%eax,8(%edi)
	movl	%edx,12(%edi)

	/* bytes 16-23 of the block */
	movl	16(%esi),%eax
	movl	20(%esi),%edx
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	/* bytes 24-31 of the block */
	movl	24(%esi),%eax
	movl	28(%esi),%edx
	movl	%eax,24(%edi)
	movl	%edx,28(%edi)

	addl	$32,%esi	/* update source pointer */
	addl	$32,%edi	/* update destination pointer */
	decl	%ecx		/* decr loop count */
	jnz	L55

	/*
	 * Tail: copy the count % 32 bytes left over.  The byte count is
	 * reloaded from the stack because %ecx was used up as the block
	 * counter; %edx keeps the leftover so its low two bits can be
	 * used after "rep movsl" has run %ecx down to zero.
	 */
L54:
	movl 16(%ebp),%ecx
	andl $31,%ecx
	movl %ecx,%edx
	shrl $2,%ecx		/* first copy as much as we can in words */
	/* clear the direction flag so the string moves advance %esi and %edi */
	cld
	rep
	movsl
	movl %edx,%ecx
	andl $3,%ecx		/* and then up to 3 bytes */
	rep
	movsb

	/*
	 * Epilogue: point %esp at the saved %esi (the two 4-byte pushes sit
	 * just below the saved %ebp), then pop in the reverse of push order.
	 */
	leal -8(%ebp),%esp
	popl %esi
	popl %edi
	leave
	ret
	/* end-of-function label, used only to compute the symbol size */
Lfe6:
	.size	 _unrolled,Lfe6-_unrolled




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199604051116.DAA24816>