Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 16 May 1996 02:39:57 -0700 (PDT)
From:      asami@cs.berkeley.edu (Satoshi Asami)
To:        bde@zeta.org.au
Cc:        bde@zeta.org.au, ccd@stampede.cs.berkeley.edu, current@freebsd.org
Subject:   Re: some more on fast bcopy
Message-ID:  <199605160939.CAA00809@silvia.HIP.Berkeley.EDU>
In-Reply-To: <199605151358.XAA22996@godzilla.zeta.org.au> (message from Bruce Evans on Wed, 15 May 1996 23:58:45 %2B1000)

next in thread | previous in thread | raw e-mail | index | archive | help
 * It should work better with a kernel mode trap handler for "FPU not
 * available" :-).  Add a T_DNA case for kernel mode in trap() by copying
 * the first 5 lines from the user mode T_DNA case (`break;' if npxdna()
 * doesn't handle the panic (this "can't happen")).

OK, I implemented it.  Will this work?

===
Index: trap.c
===================================================================
RCS file: /usr/cvs/src/sys/i386/i386/trap.c,v
retrieving revision 1.75
diff -u -r1.75 trap.c
--- trap.c	1996/03/28 05:40:57	1.75
+++ trap.c	1996/05/15 21:30:27
@@ -319,6 +319,14 @@
 			(void) trap_pfault(&frame, FALSE);
 			return;
 
+		case T_DNA:
+#if NNPX > 0
+			/* if a transparent fault (due to context switch "late") */
+			if (npxdna())
+				return;
+#endif	/* NNPX > 0 */
+			break;
+
 		case T_PROTFLT:		/* general protection fault */
 		case T_SEGNPFLT:	/* segment not present fault */
 			/*
===

Also, I found a typo in my last patch.  The "fnsave 122(%eax)" should
have been "fnsave 112(%eax)".  (That's what you get by writing
assembly language when your hamster is scratching her cage.)

Here's the (correct) patch to support.s:

===
Index: support.s
===================================================================
RCS file: /usr/cvs/src/sys/i386/i386/support.s,v
retrieving revision 1.35
diff -u -r1.35 support.s
--- support.s	1996/05/03 21:01:00	1.35
+++ support.s	1996/05/16 08:18:59
@@ -453,6 +453,16 @@
 	/* bcopy(%esi, %edi, %ebx) */
 3:
 	movl	%ebx,%ecx
+#ifdef I586_FAST_BCOPY
+	cmpl	$128,%ecx
+	jbe	slow_copyout
+
+	call	fastmove
+	jmp	done_copyout
+
+	ALIGN_TEXT
+slow_copyout:
+#endif /* I586_FAST_BCOPY */
 	shrl	$2,%ecx
 	cld
 	rep
@@ -500,6 +510,16 @@
 	cmpl	$VM_MAXUSER_ADDRESS,%edx
 	ja	copyin_fault
 
+#ifdef I586_FAST_BCOPY
+	cmpl	$128,%ecx
+	jbe	slow_copyin
+
+	call	fastmove
+	jmp	done_copyin
+
+	ALIGN_TEXT
+slow_copyin:
+#endif /* I586_FAST_BCOPY */
 	movb	%cl,%al
 	shrl	$2,%ecx				/* copy longword-wise */
 	cld
@@ -510,6 +530,10 @@
 	rep
 	movsb
 
+#ifdef I586_FAST_BCOPY
+	ALIGN_TEXT
+done_copyin:
+#endif /* I586_FAST_BCOPY */
 	popl	%edi
 	popl	%esi
 	xorl	%eax,%eax
@@ -525,6 +549,252 @@
 	movl	$0,PCB_ONFAULT(%edx)
 	movl	$EFAULT,%eax
 	ret
+
+#ifdef I586_FAST_BCOPY
+/* fastmove(src, dst, len)
+	src in %esi
+	dst in %edi
+	len in %ecx
+	uses %eax and %edx for tmp. storage
+ */
+/* 
+LC0:
+	.ascii	"npxproc == curproc\0"
+LC1:
+	.ascii	"support.s"
+ */
+	ALIGN_TEXT
+fastmove:
+	cmpl	$255,%ecx
+	jbe	fastmove_tail
+
+	testl	$7,%esi	/* check if src addr is multiple of 8 */
+	jnz	fastmove_tail
+
+	testl	$7,%edi	/* check if dst addr is multiple of 8 */
+	jnz	fastmove_tail
+
+	pushl	%ebp
+	movl	%esp,%ebp
+	subl	$176,%esp
+
+/* if (intr_nesting_level > 0) */
+	cmpb	$0,_intr_nesting_level
+	je	L6
+/* save reentrantly */
+	movl	%cr0,%edx
+	clts
+	fnsave	-176(%ebp)
+	jmp L7
+
+/* else { */
+	ALIGN_TEXT
+L6:
+/* if (npxproc != NULL) { */
+	cmpl	$0,_npxproc
+	je	L8
+/*    assert(npxproc == curproc); */
+/*	movl	_npxproc,%eax
+	cmpl	%eax,_curproc
+	je	L6b
+	pushl	LC0
+	pushl	$599
+	pushl	LC1
+	call	___assert
+	addl	$12,%esp
+L6b: */
+/*    fnsave(&curpcb->pcb_savefpu); */
+	movl	_curpcb,%eax
+	fnsave	112(%eax)
+/*   npxproc = NULL; */
+	movl	$0,_npxproc
+/* } */
+L8:
+/* now we own the FPU. */
+
+/*
+ * The process' FP state is saved in the pcb, but if we get
+ * switched, the cpu_switch() will store our FP state in the
+ * pcb.  It should be possible to avoid all the copying for
+ * this, e.g., by setting a flag to tell cpu_switch() to
+ * save the state somewhere else.
+ */
+/* tmp = curpcb->pcb_savefpu; */
+	pushl	%edi
+	pushl	%esi
+	pushl	%ecx
+	leal	-176(%ebp),%edi
+	movl	_curpcb,%esi
+	addl	$112,%esi
+	cld
+	movl	$44,%ecx
+	rep
+	movsl
+	popl	%ecx
+	popl	%esi
+	popl	%edi
+/* stop_emulating(); */
+	clts
+/* npxproc = curproc; */
+	movl	_curproc,%eax
+	movl	%eax,_npxproc
+/* } */
+L7:
+	ALIGN_TEXT
+fastmove_loop:
+	movl	32(%esi),%eax
+	movl	64(%esi),%eax
+	movl	96(%esi),%eax
+	movl	128(%esi),%eax
+	movl	160(%esi),%eax
+	movl	192(%esi),%eax
+	movl	224(%esi),%eax
+
+	cmpl	$259,%ecx
+	jbe	fastmove_tmp
+	movl	256(%esi),%eax
+
+	ALIGN_TEXT
+fastmove_tmp:
+	fildq	0(%esi)
+	fildq	8(%esi)
+	fildq	16(%esi)
+	fildq	24(%esi)
+	fildq	32(%esi)
+	fildq	40(%esi)
+	fildq	48(%esi)
+	fildq	56(%esi)
+	fxch	%st(7)
+	fistpq	0(%edi)
+	fxch	%st(5)
+	fistpq	8(%edi)
+	fxch	%st(3)
+	fistpq	16(%edi)
+	fxch	%st(1)
+	fistpq	24(%edi)
+	fistpq	32(%edi)
+	fistpq	40(%edi)
+	fistpq	48(%edi)
+	fistpq	56(%edi)
+	fildq	64(%esi)
+	fildq	72(%esi)
+	fildq	80(%esi)
+	fildq	88(%esi)
+	fildq	96(%esi)
+	fildq	104(%esi)
+	fildq	112(%esi)
+	fildq	120(%esi)
+	fxch	%st(7)
+	fistpq	64(%edi)
+	fxch	%st(5)
+	fistpq	72(%edi)
+	fxch	%st(3)
+	fistpq	80(%edi)
+	fxch	%st(1)
+	fistpq	88(%edi)
+	fistpq	96(%edi)
+	fistpq	104(%edi)
+	fistpq	112(%edi)
+	fistpq	120(%edi)
+	fildq	128(%esi)
+	fildq	136(%esi)
+	fildq	144(%esi)
+	fildq	152(%esi)
+	fildq	160(%esi)
+	fildq	168(%esi)
+	fildq	176(%esi)
+	fildq	184(%esi)
+	fxch	%st(7)
+	fistpq	128(%edi)
+	fxch	%st(5)
+	fistpq	136(%edi)
+	fxch	%st(3)
+	fistpq	144(%edi)
+	fxch	%st(1)
+	fistpq	152(%edi)
+	fistpq	160(%edi)
+	fistpq	168(%edi)
+	fistpq	176(%edi)
+	fistpq	184(%edi)
+	fildq	192(%esi)
+	fildq	200(%esi)
+	fildq	208(%esi)
+	fildq	216(%esi)
+	fildq	224(%esi)
+	fildq	232(%esi)
+	fildq	240(%esi)
+	fildq	248(%esi)
+	fxch	%st(7)
+	fistpq	192(%edi)
+	fxch	%st(5)
+	fistpq	200(%edi)
+	fxch	%st(3)
+	fistpq	208(%edi)
+	fxch	%st(1)
+	fistpq	216(%edi)
+	fistpq	224(%edi)
+	fistpq	232(%edi)
+	fistpq	240(%edi)
+	fistpq	248(%edi)
+	addl	$-256,%ecx
+	addl	$256,%esi
+	addl	$256,%edi
+	cmpl	$255,%ecx
+	ja	fastmove_loop
+	
+/* if (intr_nesting_level > 0) */
+
+	cmpb	$0,_intr_nesting_level
+	je	L9
+	
+/* Restore reentrantly. */
+	frstor	-176(%ebp)
+	movl	%edx,%cr0
+	jmp	L10
+
+/* else { */
+	ALIGN_TEXT
+L9:
+/* curpcb->pcb_savefpu = tmp; */
+	pushl	%edi
+	pushl	%esi
+	pushl	%ecx
+	movl	_curpcb,%edi
+	addl	$112,%edi
+	leal	-176(%ebp),%esi
+	cld
+	movl	$44,%ecx
+	rep
+	movsl
+	popl	%ecx
+	popl	%esi
+	popl	%edi
+
+/* start_emulating(); */
+	smsw	%ax
+	orb	$8,%al
+	lmsw	%ax
+/* npxproc = NULL; */
+	movl	$0,_npxproc
+/* } */
+L10:
+	movl	%ebp,%esp
+	popl	%ebp
+	
+	ALIGN_TEXT
+fastmove_tail:
+	movb	%cl,%al
+	shrl	$2,%ecx				/* copy longword-wise */
+	cld
+	rep
+	movsl
+	movb	%al,%cl
+	andb	$3,%cl				/* copy remaining bytes */
+	rep
+	movsb
+
+	ret
+#endif /* I586_FAST_BCOPY */
 
 /*
  * fu{byte,sword,word} : fetch a byte (sword, word) from user memory
===

 * 						     When you get everything
 * working, add some tests and flags to restore the panic if an T_DNA
 * trap occurs unexpectedly (set a flag FP_KERNEL_USING_FP in pcb->pcbflags
 * while the kernel is using FP in fastmove() or elsewhere, and `break;'
 * for the kernel T_DNA case if this flag isn't set).

I'm not sure if I understand this, but let me try to digest it a
little more over the weekend. ;)

 * Also, the `intr_nesting_level > 0' case needs to preserve the TS bit.
 * This case probably hasn't been executed yet (it will be executed when
 * bcopy() uses fastmove() and an interrupt handler calls bcopy()).

Oh, really?  I thought it was saved via %edx.

Anyway, I think I've got it working here.  I have run several make
worlds on a few machines and they don't seem to crack.  I also have it
on my home machine and am running all sorts of things from ghostscript
to top and nothing suspicious has happened so far.  (I saw a few
processes die with SIGFPE before I fixed the fnsave bug.)

If some brave soul out there can test this, it will be great.  Just
apply the above patch to a -current /sys/i386/i386/ and rebuild your
kernel with 'options "I586_FAST_BCOPY"'.

No guarantees, though.  Just because it works here doesn't mean it
won't eat your root partition for lunch. :]

Satoshi



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199605160939.CAA00809>