Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 22 Mar 2001 15:01:58 +1100 (EST)
From:      Bruce Evans <bde@zeta.org.au>
To:        David Malone <dwmalone@maths.tcd.ie>
Cc:        Dag-Erling Smorgrav <des@ofug.org>, current@FreeBSD.ORG, jhb@FreeBSD.ORG, jake@FreeBSD.ORG, Ian Dowse <iedowse@maths.tcd.ie>
Subject:   Re: Interesting backtrace...
Message-ID:  <Pine.BSF.4.21.0103221351410.8482-100000@besplex.bde.org>
In-Reply-To: <20010321231601.A73422@walton.maths.tcd.ie>

next in thread | previous in thread | raw e-mail | index | archive | help
On Wed, 21 Mar 2001, David Malone wrote:

> On Mon, Mar 19, 2001 at 02:47:34PM +1100, Bruce Evans wrote:
> > > npx.c already has one "fix" for the overflow problem.  The problem
> > > may be that clocks don't work early any more.
> > 
> > It must be that microtime() doesn't work early any more.

I checked that microtime() doesn't work for more than 10 msec if it
uses the i8254.  When it doesn't work for that long, the bandwidth
test breaks down for bzero() bandwidths smaller than 100 MB/sec.  Such
bandwidths are normal for Intel i586's.  E.g., my P5/133 has a
generic_bzero() bandwidth of 87e6 bytes/sec and an i586_bzero()
bandwidth of 174e6 bytes/sec.  This is in userland with a slightly
improved i586_bzero() (39 cycles instead of 41 for the inner loop
IIRC) and with slightly improved page coloring, and a buffer size of
1MB (same as in the bandwidth test).  So, the test always breaks down
for my P5/133 if microtime() uses the i8254.  OTOH, my K6-1/233 has
bandwidths of 135e6 and 127e6 bytes/sec, respectively, so the test
never breaks down for it.

> I did a quick check, and it does seem that i586_bzero can be faster
> on the k6-2. I found it was about twice as fast for large buffers.
> This was timed in userland using the TSC. With a slightly simplified
> version of i586_bzero (I removed all the kernel specific stuff and
> had it always save the floating point state on the stack). A graph
> is at:

This is surprising.

> 	http://www.maths.tcd.ie/~dwmalone/comp/bzero-band.ps
> 
> The graph seems to peak at about 160kB/s, which seems plausible.

160kB/sec is implausible :-).  160MB/sec is plausible.  Half that
is hard to understand.  Why is it slower than my K6-1?  Ah, I
partly understand.  My K6-1 has an L2 cache size of 1MB, so the
1MB buffer size is really too small for it if write allocation
is enabled.  P5's don't have write allocation, so the buffer size
for them is not critical.  All K6's have write allocation IIRC.
With a buffer size of 2MB, the bandwidths for my K6-1/233 are
84e6 and 80e6 bytes/sec, respectively.  So 80MB/sec is plausible
and 160MB/sec is fast (it's equivalent to 320MB/sec without
write allocation).

These complications show how hard it is to write a single bandwidth
test that works for all i586's.  I think the next step (after fixing
the i586 functions) should be to reduce the buffer size significantly
and not worry about cache effects.  Cache effects benefit generic_bzero()
in the bandwidth test but they probably benefit it in normal use too.

> The code is at:
> 
> 	http://www.maths.tcd.ie/~dwmalone/comp/-time.S
> 	http://www.maths.tcd.ie/~dwmalone/comp/-time.c
> 
> (It's crude, but seemed to produce moderately OK results. You get
> occasional dips in the bandwidth due to using the TSC for timing.
> I only tried sizes which were a power of two, as well...)

I wrote not-so-crude read/write/copy/checksum userland benchmarks to
test this stuff when I helped implement the i586-optimized routines.
Here is the write benchmark.  Compile it with 'cc -aout'.

---
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>

#include <machine/cpufunc.h>

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Signature shared by every zeroing routine under test: the routine is
 * handed a buffer and a byte count and returns nothing.
 */
typedef void func_t(void *buf, size_t len);

/*
 * One row of the benchmark table.  The two strings are only ever read
 * (they are printed), and the table initializes them from string
 * literals, so they are declared pointer-to-const.
 */
struct func
{
    func_t *fn;			/* routine to verify and time */
    const char *name;		/* short label printed with each result */
    const char *description;	/* technique used; printed unless -q */
};

/*
 * Forward declarations of the routines under test.  zero0..zero9 are
 * defined as C functions wrapping inline asm further down; zeroA..zeroD
 * have no C body in this file and are provided by the `_zeroA'..`_zeroD'
 * labels in the file-scope asm blocks.
 */
static func_t zero0;
static func_t zero1;
static func_t zero2;
static func_t zero3;
static func_t zero4;
static func_t zero5;
static func_t zero6;
static func_t zero7;
static func_t zero8;
static func_t zero9;
static func_t zeroA;
static func_t zeroB;
static func_t zeroC;
static func_t zeroD;

/* Prints the option synopsis and exits. */
static void usage(void);

/* argv[0] as saved by main(); used as the prefix of diagnostics. */
static char const *progname;

/*
 * Table of routines to benchmark: { function, label, technique }.
 * The index into this table is what the -f option selects.  The last
 * row times the C library's bzero() for comparison.
 */
static struct func funcs[] =
{
    { zero0, "zero0", "stosl" },
    { zero1, "zero1", "unroll 16" },
    { zero2, "zero2", "unroll 16 preallocate" },
    { zero3, "zero3", "unroll 32" },
    { zero4, "zero4", "unroll 32 preallocate" },
    { zero5, "zero5", "unroll 64" },
    { zero6, "zero6", "unroll 64 preallocate" },
    { zero7, "zero7", "fstl" },
    { zero8, "zero8", "movl" },
    { zero9, "zero9", "unroll 8" },
    { zeroA, "zeroA", "generic_bzero" },
    { zeroB, "zeroB", "i486_bzero" },
    { zeroC, "zeroC", "i586_bzero" },
    { zeroD, "zeroD", "i686_pagezero" },
    { bzero, "zeroE", "bzero (stosl)" },
};

/* Number of rows in funcs[]. */
#define NFUNC	(sizeof(funcs) / sizeof(funcs[0]))

/*
 * Benchmark driver.
 *
 * Options:
 *   -5       also read the time stamp counter (rdtsc()) around the timed loop
 *   -f n     run only the function at index n of funcs[]
 *   -l len   bytes zeroed per call (default 4096; must be nonzero)
 *   -p       "precache": skip the cache-thrashing step before timing
 *   -q       quiet: print only the bandwidth
 *   -t tot   total number of bytes to zero per function (default 100000000)
 *
 * For each selected function the buffer is filled with 1s, zeroed once and
 * checked, optionally flushed from the cache, and then zeroed tot / len
 * times while user CPU time is measured with getrusage().
 *
 * Returns 0 on success; exits with status 1 on a usage error, allocation
 * failure, or if a function leaves a nonzero byte in the first len bytes.
 */
int main(int argc, char **argv)
{
    unsigned char *buf;
    int ch;
    int funcn;
    int funcnspecified;
    int i586;
    size_t len;
    size_t max;
    int precache;
    int quiet;
    size_t thrashbufsize;
    unsigned long long tot;

    progname = argv[0];
    funcnspecified = -1;	/* -1 means "run every function" */
    i586 = 0;
    len = 4096;
    precache = 0;
    quiet = 0;
    tot = 100000000;
    /* getopt() is specified to return -1 (not EOF) when options run out. */
    while ((ch = getopt(argc, argv, "5f:l:pqt:")) != -1)
    {
	switch (ch)
	{
	case '5':
	    i586 = 1;
	    break;
	case 'f':
	    funcnspecified = strtoul(optarg, (char **) NULL, 0);
	    if (funcnspecified < 0 || funcnspecified >= NFUNC)
		usage();
	    break;
	case 'l':
	    len = strtoul(optarg, (char **) NULL, 0);
	    break;
	case 'p':
	    precache = 1;
	    break;
	case 'q':
	    quiet = 1;
	    break;
	case 't':
	    tot = strtouq(optarg, (char **) NULL, 0);
	    break;
	default:
	    usage();
	}
    }
    if (optind != argc)
	usage();
    /* A zero length would divide by zero when computing `max' below. */
    if (len == 0)
	usage();
    /*
     * Allocate 4096 bytes of slack past len: the unrolled routines always
     * write whole chunks and so can store beyond len when len is not a
     * multiple of their chunk size.
     */
    buf = malloc(len + 4096);
    if (buf == NULL)
    {
	fprintf(stderr, "%s: malloc failed\n", progname);
	exit(1);
    }
    /* Round the total down to a whole number of len-sized calls. */
    max = tot / len;
    tot = (unsigned long long) max * len;

    for (funcn = 0; funcn < NFUNC; ++funcn)
    {
	func_t *funcp;
	struct rusage finish;
	size_t i;
	struct rusage start;
	unsigned long long tsc;
	long usec;

	if (funcnspecified != -1 && funcnspecified != funcn)
	    continue;

	/*
	 * Check the function.  As side effects, make sure that the buffer
	 * isn't a constant zero page, and leave as much of the buffer as
	 * possible in the cache to set up the `precache' case.
	 */
	memset(buf, 1, len);
	funcp = funcs[funcn].fn;
	funcp(buf, len);
#if 1
	for (i = 0; i < len; ++i)
	    if (buf[i] != '\0')
	    {
		/* i is a size_t; widen explicitly to match the format. */
		fprintf(stderr, "%s: %s failed at %lu\n",
			progname, funcs[funcn].name, (unsigned long) i);
		exit(1);
	    }
#endif

	if (!precache)
	    /*
	     * Attempt to uncache the buffer so as to provide the same
	     * uncached environment for all the functions.  Pairs of
	     * buffers of 2MB, 1MB, ..., 1 byte are copied back and forth.
	     */
	    for (thrashbufsize = 2 * 1024 * 1024; thrashbufsize != 0;
		 thrashbufsize /= 2)
	    {
		unsigned char *thrashbuf1;
		unsigned char *thrashbuf2;

		thrashbuf1 = malloc(thrashbufsize);
		thrashbuf2 = malloc(thrashbufsize);
		if (thrashbuf1 != NULL && thrashbuf2 != NULL)
		{
		    memcpy(thrashbuf2, thrashbuf1, thrashbufsize);
		    memcpy(thrashbuf1, thrashbuf2, thrashbufsize);
		}
		free(thrashbuf1);
		free(thrashbuf2);
	    }

	tsc = 0;
	getrusage(RUSAGE_SELF, &start);
	if (i586)
	    tsc = rdtsc();
#if 1
	for (i = 0; i < max; ++i)
	    funcp(buf, len);
#else
	/* Alternative timed loop that varies the alignment and length. */
	tot /= 8 * 8;
	tot *= 8 * 8;
	for (i = 0; i < max / 8 / 8; ++i)
	{
	    int j, k;

	    for (j = 0; j < 8; ++j)
		for (k = 0; k < 8; ++k)
		    funcp(buf + j, len + k);
	}
#endif
	if (i586)
	    tsc = rdtsc() - tsc;
	getrusage(RUSAGE_SELF, &finish);
	usec = 1000000 * (finish.ru_utime.tv_sec - start.ru_utime.tv_sec)
	       + finish.ru_utime.tv_usec - start.ru_utime.tv_usec;
	/*
	 * Clamp to 1us so that a run shorter than the clock resolution
	 * (usec == 0) doesn't report an infinite bandwidth.
	 */
	if (usec <= 0)
	    usec = 1;
	printf("%s: %10.0f B/s", funcs[funcn].name, tot * 1e6 / usec);
	if (!quiet)
	{
	    printf(" (%7ld us)", usec);
	    if (i586)
		printf(" (%9qd tsc)", tsc);
	    printf(" (%s)", funcs[funcn].description);
	}
	printf("\n");
    }
    free(buf);
    return 0;
}

/*
 * "stosl": zero the buffer with a single `rep; stosl'.
 *
 * buf is placed in %edi and len in %ecx by the constraints, and %eax is
 * loaded with 0.  The count is shifted right by 2 to convert bytes to
 * 32-bit words, so only len / 4 whole words are written; any trailing
 * len % 4 bytes are left untouched.  `cld' selects ascending addresses.
 * Both registers are listed as outputs because the instruction modifies
 * them.
 */
static void zero0(void *buf, size_t len)
{
    asm volatile("
	.p2align 4,0x90
	cld
	shrl $2,%1
	rep; stosl"
	: "=D" (buf), "=c" (len)
	: "0"  (buf), "1"  (len), "a" (0)
	: "memory");
}

/*
 * "unroll 16": four 32-bit stores of a zero register per iteration.
 *
 * Operands: %0 = buf (advanced by 16 each pass), %1 = len (reduced by 16
 * each pass), %4 = a register holding 0.  The loop is bottom-tested with
 * `ja', i.e. it repeats while the count was above 16 before the
 * subtraction.  Consequently one full 16-byte chunk is always written,
 * and a len that is not a multiple of 16 is effectively rounded up, so
 * the caller must provide slack after the buffer.
 */
static void zero1(void *buf, size_t len)
{
    asm volatile("
	.p2align 4,0x90
	1:
	movl %4,0(%0)
	movl %4,4(%0)
	movl %4,8(%0)
	movl %4,12(%0)
	addl $16,%0
	subl $16,%1
	ja 1b
	"
	: "=r" (buf), "=r" (len)
	: "0"  (buf), "1"  (len), "r" (0)
	: "memory");
}

/*
 * "unroll 16 preallocate": as zero1, but each iteration first loads the
 * word at the start of the 16-byte chunk into a scratch register before
 * storing to it.  The loaded value is never used; presumably the read is
 * there to pull the line into the cache before it is written (the table
 * in this file labels the technique "preallocate").
 *
 * Operands: %0 = buf, %1 = len, %2 = scratch (`preallocate', early-clobber
 * output), %5 = a register holding 0.  Like zero1, the loop always writes
 * at least one chunk and rounds len up to a multiple of 16.
 */
static void zero2(void *buf, size_t len)
{
    unsigned preallocate;

    asm volatile("
	.p2align 4,0x90
	1:
	movl (%0),%2
	movl %5,0(%0)
	movl %5,4(%0)
	movl %5,8(%0)
	movl %5,12(%0)
	addl $16,%0
	subl $16,%1
	ja 1b
	"
	: "=r" (buf), "=r" (len), "=&r" (preallocate)
	: "0"  (buf), "1"  (len), "r" (0)
	: "memory");
}

/*
 * "unroll 32": eight 32-bit stores of a zero register per iteration.
 *
 * Operands: %0 = buf (advanced by 32 each pass), %1 = len (reduced by 32
 * each pass), %4 = a register holding 0.  The bottom-tested `ja' loop
 * always writes at least one 32-byte chunk and effectively rounds len up
 * to a multiple of 32, so the buffer needs slack after it.
 */
static void zero3(void *buf, size_t len)
{
    asm volatile("
	.p2align 4,0x90
	1:
	movl %4,0(%0)
	movl %4,4(%0)
	movl %4,8(%0)
	movl %4,12(%0)
	movl %4,16(%0)
	movl %4,20(%0)
	movl %4,24(%0)
	movl %4,28(%0)
	addl $32,%0
	subl $32,%1
	ja 1b
	"
	: "=r" (buf), "=r" (len)
	: "0"  (buf), "1"  (len), "r" (0)
	: "memory");
}

/*
 * "unroll 32 preallocate": as zero3, but each iteration first loads the
 * word at the start of the 32-byte chunk into a scratch register.  The
 * loaded value is discarded; presumably the read is meant to bring the
 * line into the cache before the stores.
 *
 * Operands: %0 = buf, %1 = len, %2 = scratch (`preallocate'), %5 = a
 * register holding 0.  At least one 32-byte chunk is always written and
 * len is effectively rounded up to a multiple of 32.
 */
static void zero4(void *buf, size_t len)
{
    unsigned preallocate;

    asm volatile("
	.p2align 4,0x90
	1:
	movl (%0),%2
	movl %5,0(%0)
	movl %5,4(%0)
	movl %5,8(%0)
	movl %5,12(%0)
	movl %5,16(%0)
	movl %5,20(%0)
	movl %5,24(%0)
	movl %5,28(%0)
	addl $32,%0
	subl $32,%1
	ja 1b
	"
	: "=r" (buf), "=r" (len), "=&r" (preallocate)
	: "0"  (buf), "1"  (len), "r" (0)
	: "memory");
}

/*
 * "unroll 64": sixteen 32-bit stores of a zero register per iteration.
 *
 * Operands: %0 = buf (advanced by 64 each pass), %1 = len (reduced by 64
 * each pass), %4 = a register holding 0.  The bottom-tested `ja' loop
 * always writes at least one 64-byte chunk and effectively rounds len up
 * to a multiple of 64, so the buffer needs slack after it.
 */
static void zero5(void *buf, size_t len)
{
    asm volatile("
	.p2align 4,0x90
	1:
	movl %4,0(%0)
	movl %4,4(%0)
	movl %4,8(%0)
	movl %4,12(%0)
	movl %4,16(%0)
	movl %4,20(%0)
	movl %4,24(%0)
	movl %4,28(%0)
	movl %4,32(%0)
	movl %4,36(%0)
	movl %4,40(%0)
	movl %4,44(%0)
	movl %4,48(%0)
	movl %4,52(%0)
	movl %4,56(%0)
	movl %4,60(%0)
	addl $64,%0
	subl $64,%1
	ja 1b"
	: "=r" (buf), "=r" (len)
	: "0"  (buf), "1"  (len), "r" (0)
	: "memory");
}

/*
 * "unroll 64 preallocate": 64 bytes per iteration, handled as two 32-byte
 * halves.  Before each half is stored to, its first word is loaded into a
 * scratch register and discarded.
 *
 * Operands: %0 = buf, %1 = len, %2 = buf2 (set to buf + 32 by `leal' and
 * used to address the second half, so that %0 can be advanced early),
 * %3 = scratch for the discarded loads, %6 = a register holding 0.
 * The bottom-tested `ja' loop always writes at least one 64-byte chunk
 * and effectively rounds len up to a multiple of 64.
 */
static void zero6(void *buf, size_t len)
{
    void *buf2;
    unsigned preallocate;

    /*
     * The main loop has 11 pairs of i586 instructions with no AGI so that
     * it takes 11 cycles on i586's if all the data is in the L1 cache.
     *
     * On an ASUS P55TP4XE P133 the speeds are approx:
     *    data in L1 cache:      740,000,000 B/s
     *    data in L2 cache only:  90,000,000 B/s (highly variant)
     *    data not in any cache:  60,000,000 B/s
     * and without preallocating (function zero5) they are:
     *    data in L1 cache:       87,000,000 B/s
     *    data in L2 cache only:  87,000,000 B/s
     *    data not in any cache:  90,000,000 B/s
     *
     * Thus the instruction selection and ordering optimizations have an
     * insignificant effect if the data isn't in the L1 cache or the L2
     * cache, and preallocating is a pessimization if the data isn't in the
     * L2 cache.
     */
    asm volatile("
	.p2align 4,0x90
	1:
	movl (%0),%3
	leal 32(%0),%2
	movl %6,0(%0)
	movl %6,4(%0)
	movl %6,8(%0)
	movl %6,12(%0)
	movl %6,16(%0)
	movl %6,20(%0)
	movl %6,24(%0)
	movl %6,28(%0)
	movl (%2),%3
	addl $64,%0
	movl %6,0(%2)
	movl %6,4(%2)
	movl %6,8(%2)
	movl %6,12(%2)
	movl %6,16(%2)
	movl %6,20(%2)
	movl %6,24(%2)
	movl %6,28(%2)
	subl $64,%1
	ja 1b
	"
	: "=r" (buf), "=r" (len), "=&r" (buf2), "=&r" (preallocate)
	: "0"  (buf), "1"  (len), "r" (0)
	: "memory");
}

/*
 * "fstl": zero the buffer 8 bytes at a time through the FPU.
 *
 * `fldz' pushes 0.0 onto the FPU register stack, each iteration stores it
 * as a 64-bit value with `fstl' and advances by 8, and the final `fstp'
 * pops the value again so the FPU stack depth is unchanged on exit
 * (`%%st(0)' is the escaped form of `%st(0)' inside an extended asm
 * template).  The bottom-tested `ja' loop always writes at least one
 * 8-byte chunk and effectively rounds len up to a multiple of 8.
 *
 * NOTE(review): no FPU registers are listed as clobbered; this relies on
 * a free FPU stack slot being available at the call site -- confirm.
 */
static void zero7(void *buf, size_t len)
{
    /*
     * On a P55TP4XE P133, `fstl' goes slower than all the loop control
     * instructions put together, so unrolling would be bad.
     */
    asm volatile("
	fldz
	.p2align 4,0x90
	1:
	fstl 0(%0)
	addl $8,%0
	subl $8,%1
	ja 1b
	fstp %%st(0)"
	: "=r" (buf), "=r" (len)
	: "0"  (buf), "1"  (len)
	: "memory");
}

/*
 * "movl": one 32-bit store of an immediate zero per iteration, with no
 * unrolling.
 *
 * Operands: %0 = buf (advanced by 4 each pass), %1 = len (reduced by 4
 * each pass).  A fifth operand, a register holding 0, is declared but the
 * template never references it because the stores use `$0' directly.
 * The bottom-tested `ja' loop always writes at least one word and
 * effectively rounds len up to a multiple of 4.
 */
static void zero8(void *buf, size_t len)
{
    asm volatile("
	.p2align 4,0x90
	1:
	movl $0,0(%0)
	addl $4,%0
	subl $4,%1
	ja 1b
	"
	: "=r" (buf), "=r" (len)
	: "0"  (buf), "1"  (len), "r" (0)
	: "memory");
}

/*
 * "unroll 8": two 32-bit stores of an immediate zero per iteration.
 *
 * Operands: %0 = buf (advanced by 8 each pass), %1 = len (reduced by 8
 * each pass).  As in zero8, the register operand holding 0 is declared
 * but unused by the template.  The bottom-tested `ja' loop always writes
 * at least one 8-byte chunk and effectively rounds len up to a multiple
 * of 8.
 */
static void zero9(void *buf, size_t len)
{
    asm volatile("
	.p2align 4,0x90
	1:
	movl $0,0(%0)
	movl $0,4(%0)
	addl $8,%0
	subl $8,%1
	ja 1b
	"
	: "=r" (buf), "=r" (len)
	: "0"  (buf), "1"  (len), "r" (0)
	: "memory");
}

/*
 * zeroA ("generic_bzero" in the table): stand-alone assembly routine with
 * the func_t signature.
 *
 * %edi is saved on the stack, after which buf is read from 8(%esp) and
 * len from 12(%esp).  len / 4 words are cleared with `rep stosl', then
 * len is reloaded and the remaining len & 3 bytes are cleared with
 * `rep stosb', so exactly len bytes are written for any len.
 *
 * The label carries a leading underscore; presumably this matches the
 * a.out symbol naming the program is meant to be built with -- confirm
 * before building for another object format.
 */
asm("
	.p2align	2,0x90
_zeroA:
	pushl	%edi
	movl	8(%esp),%edi
	movl	12(%esp),%ecx
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	rep
	stosb
	popl	%edi
	ret
");

/*
 * zeroB ("i486_bzero" in the table): stand-alone assembly routine with
 * the func_t signature; buf is read from 4(%esp) and len from 8(%esp).
 *
 * The work is done in progressively smaller steps, all using stores of
 * %eax (which is zeroed on entry):
 *   label 2: while at least 64 bytes remain, sixteen word stores;
 *   label 3: while at least 16 bytes remain, four word stores;
 *   label 4: while at least 4 bytes remain, one word store;
 *   label 5: the final 0..3 bytes are handled by an indirect jump through
 *            `jtab', indexed by the remaining count, to do0..do3, which
 *            store nothing, a byte, a 16-bit word, or a word plus a byte.
 * Each loop returns immediately when the remaining count reaches exactly
 * zero.  Exactly len bytes are written.
 *
 * `jtab' is emitted in the data section; the assembler is switched back
 * to the text section afterwards.  jtab and do0..do3 are ordinary
 * (non-numeric) labels.
 */
asm("
	.p2align	2,0x90
_zeroB:
	movl	4(%esp),%edx
	movl	8(%esp),%ecx
	xorl	%eax,%eax
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b
	ret

	.p2align	4,0x90
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

	.p2align	4,0x90
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	.p2align	4,0x90
5:
	jmp	jtab(,%ecx,4)

	.p2align	4,0x90
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	.p2align	4,0x90
do2:
	movw	%ax,(%edx)
	ret

	.p2align	4,0x90
do1:
	movb	%al,(%edx)
	ret

	.p2align	4,0x90
do0:
	ret
");

/*
 * Userland stand-ins for two variables that zeroC reads by symbol name.
 *
 * npxproc: zeroC saves and restores the FPU state (fnsave/frstor) only
 * when this is nonzero; it is 0 here, so that path is not exercised
 * unless the value is changed.
 *
 * kernel_fpu_lock: byte-sized try-lock.  zeroC acquires it with
 * `sarb $1', which shifts bit 0 into the carry flag: from the free value
 * 0xfe the carry is clear and the byte becomes 0xff; from 0xff the carry
 * is set, which zeroC treats as "busy".  It is released by storing 0xfe.
 */
int npxproc = 0;
int kernel_fpu_lock = 0xfe;

/*
 * zeroC ("i586_bzero" in the table): stand-alone assembly routine with
 * the func_t signature; buf is read from 4(%esp) and len from 8(%esp).
 *
 * Path selection:
 *   - len < 176: integer path (intreg_i586_bzero).
 *   - npxproc != 0: integer path unless len >= 176+184; otherwise take
 *     the lock, reserve 108 bytes of stack and `fnsave' the FPU state.
 *   - npxproc == 0: take the lock and `fninit'.
 *   - in either FPU case, a busy lock falls back to the integer path.
 *
 * FPU path (i586_bz2): push 0.0 with `fldz', store 8 bytes at the
 * possibly unaligned start, round the pointer up to the next 8-byte
 * boundary, store 8 bytes ending at the original end of the buffer, then
 * fill the middle with aligned 8-byte `fstl' stores.  Afterwards the FPU
 * state is restored with `frstor' (npxproc != 0) or the zero is popped
 * (npxproc == 0), and the lock is released.
 *
 * Integer path: if at least 8 bytes remain, store 8 bytes, align the
 * pointer to 8 and loop storing 8 bytes at a time with immediate zeros;
 * then finish with at most one 4-byte, one 2-byte and one 1-byte store.
 *
 * The `clts' and `lmsw' instructions are commented out and runs of nops
 * sit in their place; presumably these pad for the privileged
 * instructions that cannot be executed in this userland copy -- confirm.
 * The `#' comments on the cmpl lines record the thresholds used in the
 * kernel version.
 */
asm("
_zeroC:
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	cmpl	$176,%ecx		# 112 in kernel; 104-136 without hair
	jb	intreg_i586_bzero

	cmpl	$0,_npxproc
	je	i586_bz1
	cmpl	$176+184,%ecx		# 112+184 in kernel; 320 without hair
	jb	intreg_i586_bzero
	sarb	$1,_kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
#	clts
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,_kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
#	clts
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
	fninit
i586_bz2:
	fldz

	fstl	0(%edx)
	addl	%edx,%ecx
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,_npxproc
	je	i586_bz3
	frstor	0(%esp)
	addl	$108,%esp
#	lmsw	%ax
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
	movb	$0xfe,_kernel_fpu_lock
	ret

i586_bz3:
	fstpl	%st(0)
#	lmsw	%ax
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
	movb	$0xfe,_kernel_fpu_lock
	ret

intreg_i586_bzero:
	cmpl	$8,%ecx
	jb	i586_bz4
	movl	$0,(%edx)
	movl	$0,4(%edx)
	addl	%edx,%ecx
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx
	cmpl	$8,%ecx
	jb	i586_bz4
intreg_i586_bzero_loop:
	movl	$0,0(%edx)
	movl	$0,4(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	intreg_i586_bzero_loop

	nop

i586_bz4:
	cmpl	$4,%ecx
	jb	i586_bz5
	movl	$0,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
i586_bz5:
	cmpl	$2,%ecx
	jb	i586_bz6
	movw	$0,(%edx)
	addl	$2,%edx
	subl	$2,%ecx
i586_bz6:
	cmpl	$1,%ecx
	jb	i586_bz7
	movb	$0,(%edx)
i586_bz7:
	ret
");

/*
 * zeroD1: variant of the zeroD routine that uses %edi as the pointer
 * (saved and restored around the body).  Nothing in the visible source
 * declares or references this symbol, so it is not part of the benchmark
 * table.
 *
 * buf is read from 8(%esp) and len from 12(%esp) (after the push); the
 * count is converted to 32-bit words.  As written, the `jmp 2f' at the
 * top of loop 1 unconditionally skips the "are these 32 bytes already
 * zero?" test, so every 32-byte chunk is stored to by the code at label
 * 2.  The block at label 3 (a `rep stosl' alternative) has no branch
 * targeting it and is therefore unreachable.
 *
 * The loop ends only when the word count reaches exactly zero (`jne'),
 * so len must be a nonzero multiple of 32.
 */
asm("
_zeroD1:
	pushl	%edi

	movl	8(%esp), %edi
	movl	12(%esp), %ecx
	shrl	$2, %ecx
	cld

	.p2align 2,0x90
1:
	jmp	2f
	movl	(%edi), %eax
	orl	4(%edi), %eax
	orl	8(%edi), %eax
	orl	12(%edi), %eax
	orl	16(%edi), %eax
	orl	20(%edi), %eax
	orl	24(%edi), %eax
	orl	28(%edi), %eax
	jne	2f

	addl	$32, %edi
	subl	$32/4, %ecx
	jne	1b

	popl	%edi
	ret

	.p2align 2,0x90
3:
	leal	-32/4(%ecx), %edx
	xorl	%eax, %eax
	movl	$32/4, %ecx

	rep
	stosl

	addl	%edx, %ecx
	jne	1b

	popl	%edi
	ret

	.p2align 2,0x90
2:
	movl	$0, (%edi)
	movl	$0, 4(%edi)
	movl	$0, 8(%edi)
	movl	$0, 12(%edi)
	movl	$0, 16(%edi)
	movl	$0, 20(%edi)
	movl	$0, 24(%edi)
	movl	$0, 28(%edi)

	addl	$32, %edi
	subl	$32/4, %ecx
	jne	1b

	popl	%edi
	ret
");

/*
 * zeroD ("i686_pagezero" in the table): stand-alone assembly routine with
 * the func_t signature; buf is read from 4(%esp) and len from 8(%esp),
 * and the count is converted to 32-bit words.
 *
 * For each 32-byte chunk the eight words are ORed together.  If the
 * result is zero the chunk is already clear and is skipped without being
 * written; otherwise (label 2) eight zero words are stored.  Either way
 * the pointer advances by 32 and the word count drops by 8.
 *
 * The loop ends only when the word count reaches exactly zero (`jne'),
 * so len must be a nonzero multiple of 32; any other value never hits
 * zero at the test and the loop runs past the buffer.
 */
asm("
_zeroD:
	movl	4(%esp), %edx
	movl	8(%esp), %ecx
	shrl	$2, %ecx

	.p2align 2,0x90
1:
	movl	(%edx), %eax
	orl	4(%edx), %eax
	orl	8(%edx), %eax
	orl	12(%edx), %eax
	orl	16(%edx), %eax
	orl	20(%edx), %eax
	orl	24(%edx), %eax
	orl	28(%edx), %eax
	jne	2f

	addl	$32, %edx
	subl	$32/4, %ecx
	jne	1b

	ret

	.p2align 2,0x90
2:
	movl	$0, (%edx)
	movl	$0, 4(%edx)
	movl	$0, 8(%edx)
	movl	$0, 12(%edx)
	movl	$0, 16(%edx)
	movl	$0, 20(%edx)
	movl	$0, 24(%edx)
	movl	$0, 28(%edx)

	addl	$32, %edx
	subl	$32/4, %ecx
	jne	1b

	ret
");

/*
 * Print the option synopsis to stderr, prefixed with the program name,
 * and exit with status 1.  Does not return.
 *
 * The flag list mirrors the getopt() string used by main()
 * ("5f:l:pqt:"); there is no -c option.
 */
static void usage(void)
{
    fprintf(stderr, "%s: [-5pq] [-f function] [-l length] [-t tot]\n",
	    progname);
    exit(1);
}
---

Bruce


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-current" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?Pine.BSF.4.21.0103221351410.8482-100000>