Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 2 Feb 2004 22:24:49 +1100 (EST)
From:      Bruce Evans <bde@zeta.org.au>
To:        Andy Farkas <andyf@speednet.com.au>
Cc:        John Baldwin <jhb@FreeBSD.org>
Subject:   Re: cvs commit: src/sys/i386/i386 apic_vector.s src/sys/i386/isa atpic_vector.s
Message-ID:  <20040202215735.O2387@gamplex.bde.org>
In-Reply-To: <20040202175017.W1579@gamplex.bde.org>
References:  <200401282044.i0SKi8Y6063747@repoman.freebsd.org> <20040202175017.W1579@gamplex.bde.org>

next in thread | previous in thread | raw e-mail | index | archive | help
On Mon, 2 Feb 2004, Bruce Evans wrote:

> On Mon, 2 Feb 2004, Andy Farkas wrote:
>
> > On Wed, 28 Jan 2004, John Baldwin wrote:
> >
> > >   Modified files:
> > >     sys/i386/i386        apic_vector.s
> > >     sys/i386/isa         atpic_vector.s
> > >   Log:
> > >   Optimize the i386 interrupt entry code to not reload the segment registers
> > >   if they already contain the correct kernel selectors.
> >
> > What effect on performance does this change have? It seems to be a rather
> > significant change to an important code path, or am I totally confused..?
>
> I measured it in userland and saw about -1 cycles/interrupt on an AthlonXP
> and about -22 cycles/interrupt on an old Celeron (negative means a
> pessimization).

Bah, the benchmark that gave these results was very buggy.  Here is a
better version.  Results first (for a random pattern to simulate 75% of
interrupts coming from userland):

Celeron 366:
-20 cycles/interrupt optimization

Celeron 366 with partial register stall pessimization backed out:
No significant change (measured -0.05 cycles/interrupt less)

P2 (or whatever freefall is):
Similar to Celeron.  Machine too loaded for accurate test.

AthlonXP:
+4 cycles/interrupt

Best cases for this and some other percentages:

100% user:  Celeron  0- AthlonXP  -1
 75% user:  Celeron  0+ AthlonXP  +4
 50% user:  Celeron  +2 AthlonXP  +9
 25% user:  Celeron  +9 AthlonXP +15
  0% user:  Celeron +20 AthlonXP +21

The branch prediction seems to be too good to be true.

%%%
#include <sys/types.h>
#include <machine/cpufunc.h>

#include <stdio.h>
#include <stdlib.h>

static void
testnewway(void)
{
	__asm("				\n\
KDSEL	=	0x2f			\n\
KPSEL	=	0x2f			\n\
	pushal				\n\
	pushl	%ds			\n\
	pushl	%es			\n\
	pushl	%fs			\n\
	mov	%fs,%ax			\n\
	cmp	$KPSEL,%ax		\n\
	je	1f			\n\
	mov	$KDSEL,%ax		\n\
	mov	%ax,%ds			\n\
	mov	%ax,%es			\n\
	mov	$KPSEL,%ax		\n\
	mov	%ax,%fs			\n\
1:					\n\
	popl	%fs			\n\
	popl	%es			\n\
	popl	%ds			\n\
	popal				\n\
	");
}

static void
testoldway(void)
{
	__asm("				\n\
KDSEL	=	0x2f			\n\
KPSEL	=	0x2f			\n\
	pushal				\n\
	pushl	%ds			\n\
	pushl	%es			\n\
	pushl	%fs			\n\
	movl	$KDSEL,%eax		\n\
	mov	%ax,%ds			\n\
	mov	%ax,%es			\n\
	movl	$KPSEL,%eax		\n\
	mov	%ax,%fs			\n\
1:					\n\
	popl	%fs			\n\
	popl	%es			\n\
	popl	%ds			\n\
	popal				\n\
	");
}

static void
testnewwayfix1(void)
{
	__asm("				\n\
KDSEL	=	0x2f			\n\
KPSEL	=	0x2f			\n\
	pushal				\n\
	pushl	%ds			\n\
	pushl	%es			\n\
	pushl	%fs			\n\
	mov	%fs,%ax			\n\
	cmp	$KPSEL,%ax		\n\
	je	1f			\n\
	movl	$KDSEL,%eax		\n\
	mov	%ax,%ds			\n\
	mov	%ax,%es			\n\
	movl	$KPSEL,%eax		\n\
	mov	%ax,%fs			\n\
1:					\n\
	popl	%fs			\n\
	popl	%es			\n\
	popl	%ds			\n\
	popal				\n\
	");
}

static void
testnewwayfix2(void)
{
	/*
	 * New way with a different way of avoiding the partial register
	 * stores (use lots of segment override prefixes).
	 *
	 * It seems that gcc now understands "mov %ax,%ds" like I did in
	 * 1986, but that understanding is wrong :-).  gcc leaves out the
	 * operand size prefix, but the prefix is apparently needed to
	 * actually mov from %ax.  Without it the mov is apparently from
	 * %eax and there is a partial register store if we only load %ax.
	 * gcc refuses to produce the prefix for "mov %eax,%ds".
	 *
	 * gcc understands mov's from segment registers better to general
	 * registers better.  Then it is clear that the target may be
	 * either 16 or 32 bits.  Intel now documents what happens in at
	 * least this case.  IIRC, it says that the operand size prefix
	 * works as should be expected, and the top 16 bits of the target
	 * are set to an indeterminate value.  See the commit logs for
	 * <machine/cpufunc.h> for where I shot down a pesimization of
	 * this (we avoid using the operand size prefix, so we get garbage
	 * in the top bits).
	 *
	 * This subset of the benchmark is just to demonstrate that using
	 * the operand size prefix to force 16-bit operations is just a
	 * pessimization.  It is only a small pessimization though.  Until
	 * today (2004/01/04) I didn't understand why getting the prefix
	 * wrong was such a large pessimization.  It was due to partial
	 * register stalls more that the prefix.
	 */
	__asm("				\n\
KDSEL	=	0x2f			\n\
KPSEL	=	0x2f			\n\
	pushal				\n\
	pushl	%ds			\n\
	pushl	%es			\n\
	pushl	%fs			\n\
	mov	%fs,%ax			\n\
	cmp	$KPSEL,%ax		\n\
	je	1f			\n\
	mov	$KDSEL,%ax		\n\
	.byte	0x66			\n\
	mov	%ax,%ds			\n\
	.byte	0x66			\n\
	mov	%ax,%es			\n\
	mov	$KPSEL,%ax		\n\
	.byte	0x66			\n\
	mov	%ax,%fs			\n\
1:					\n\
	popl	%fs			\n\
	popl	%es			\n\
	popl	%ds			\n\
	popal				\n\
	");
}

#define	SIZE	1000000

unsigned char state[SIZE];

static void
randfs(int i)
{
	if (state[i] != 0)
		__asm("pushl %cs; popl %fs");
	else
		__asm("pushl %ds; popl %fs");
}

int
main(void)
{
	double statetot;
	unsigned long long start;
	int i;

	statetot = 0;
	for (i = 0; i < SIZE; i++) {
#if 0
		state[i] = (random() >> 30) & 1;
#else
		/*
		 * Bias the state to simulate that the system probably
		 * spends most of its time in user mode (state[i] != 0).
		 * Guess 75% in user mode.
		 */
		state[i] = (((random() >> 28) & 3) != 0);
#endif
		statetot += state[i];
	}

	start = rdtsc();
	for (i = 0; i < SIZE; i++) {
		randfs(i);
		testnewway();
	}
	printf("%llu\n", rdtsc() - start);

	start = rdtsc();
	for (i = 0; i < SIZE; i++) {
		randfs(i);
		testoldway();
	}
	printf("%llu\n", rdtsc() - start);

	start = rdtsc();
	for (i = 0; i < SIZE; i++) {
		randfs(i);
		testnewwayfix1();
	}
	printf("%llu\n", rdtsc() - start);

	start = rdtsc();
	for (i = 0; i < SIZE; i++) {
		randfs(i);
		testnewwayfix2();
	}
	printf("%llu\n", rdtsc() - start);

	printf("state average %.3f\n", statetot / SIZE);
	return (0);
}
%%%

Bruce



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20040202215735.O2387>