From owner-freebsd-current Thu May 16 02:40:24 1996 Return-Path: owner-current Received: (from root@localhost) by freefall.freebsd.org (8.7.3/8.7.3) id CAA11827 for current-outgoing; Thu, 16 May 1996 02:40:24 -0700 (PDT) Received: from silvia.HIP.Berkeley.EDU (silvia.HIP.Berkeley.EDU [136.152.64.181]) by freefall.freebsd.org (8.7.3/8.7.3) with ESMTP id CAA11818 for ; Thu, 16 May 1996 02:40:18 -0700 (PDT) Received: (from asami@localhost) by silvia.HIP.Berkeley.EDU (8.7.5/8.6.9) id CAA00809; Thu, 16 May 1996 02:39:57 -0700 (PDT) Date: Thu, 16 May 1996 02:39:57 -0700 (PDT) Message-Id: <199605160939.CAA00809@silvia.HIP.Berkeley.EDU> To: bde@zeta.org.au CC: bde@zeta.org.au, ccd@stampede.cs.berkeley.edu, current@freebsd.org In-reply-to: <199605151358.XAA22996@godzilla.zeta.org.au> (message from Bruce Evans on Wed, 15 May 1996 23:58:45 +1000) Subject: Re: some more on fast bcopy From: asami@cs.berkeley.edu (Satoshi Asami) Sender: owner-current@freebsd.org X-Loop: FreeBSD.org Precedence: bulk * It should work better with a kernel mode trap handler for "FPU not * available" :-). Add a T_DNA case for kernel mode in trap() by copying * the first 5 lines from the user mode T_DNA case (`break;' if npxdna() * doesn't handle the panic (this "can't happen")). OK, I implemented it. Will this work? === Index: trap.c =================================================================== RCS file: /usr/cvs/src/sys/i386/i386/trap.c,v retrieving revision 1.75 diff -u -r1.75 trap.c --- trap.c 1996/03/28 05:40:57 1.75 +++ trap.c 1996/05/15 21:30:27 @@ -319,6 +319,14 @@ (void) trap_pfault(&frame, FALSE); return; + case T_DNA: +#if NNPX > 0 + /* if a transparent fault (due to context switch "late") */ + if (npxdna()) + return; +#endif /* NNPX > 0 */ + break; + case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ /* === Also, I found a typo in my last patch. The "fnsave 122(%eax)" should have been "fnsave 112(%eax)". 
(That's what you get by writing assembly language when your hamster is scratching her cage.) Here's the (correct) patch to support.s: === Index: support.s =================================================================== RCS file: /usr/cvs/src/sys/i386/i386/support.s,v retrieving revision 1.35 diff -u -r1.35 support.s --- support.s 1996/05/03 21:01:00 1.35 +++ support.s 1996/05/16 08:18:59 @@ -453,6 +453,16 @@ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx +#ifdef I586_FAST_BCOPY + cmpl $128,%ecx + jbe slow_copyout + + call fastmove + jmp done_copyout + + ALIGN_TEXT +slow_copyout: +#endif /* I586_FAST_BCOPY */ shrl $2,%ecx cld rep @@ -500,6 +510,16 @@ cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault +#ifdef I586_FAST_BCOPY + cmpl $128,%ecx + jbe slow_copyin + + call fastmove + jmp done_copyin + + ALIGN_TEXT +slow_copyin: +#endif /* I586_FAST_BCOPY */ movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld @@ -510,6 +530,10 @@ rep movsb +#ifdef I586_FAST_BCOPY + ALIGN_TEXT +done_copyin: +#endif /* I586_FAST_BCOPY */ popl %edi popl %esi xorl %eax,%eax @@ -525,6 +549,252 @@ movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret + +#ifdef I586_FAST_BCOPY +/* fastmove(src, dst, len) + src in %esi + dst in %edi + len in %ecx + uses %eax and %edx for tmp. 
storage + */ +/* +LC0: + .ascii "npxproc == curproc\0" +LC1: + .ascii "support.s" + */ + ALIGN_TEXT +fastmove: + cmpl $255,%ecx + jbe fastmove_tail + + testl $7,%esi /* check if src addr is multiple of 8 */ + jnz fastmove_tail + + testl $7,%edi /* check if dst addr is multiple of 8 */ + jnz fastmove_tail + + pushl %ebp + movl %esp,%ebp + subl $176,%esp + +/* if (intr_nesting_level > 0) */ + cmpb $0,_intr_nesting_level + je L6 +/* save reentrantly */ + movl %cr0,%edx + clts + fnsave -176(%ebp) + jmp L7 + +/* else { */ + ALIGN_TEXT +L6: +/* if (npxproc != NULL) { */ + cmpl $0,_npxproc + je L8 +/* assert(npxproc == curproc); */ +/* movl _npxproc,%eax + cmpl %eax,_curproc + je L6b + pushl LC0 + pushl $599 + pushl LC1 + call ___assert + addl $12,%esp +L6b: */ +/* fnsave(&curpcb->pcb_savefpu); */ + movl _curpcb,%eax + fnsave 112(%eax) +/* npxproc = NULL; */ + movl $0,_npxproc +/* } */ +L8: +/* now we own the FPU. */ + +/* + * The process' FP state is saved in the pcb, but if we get + * switched, the cpu_switch() will store our FP state in the + * pcb. It should be possible to avoid all the copying for + * this, e.g., by setting a flag to tell cpu_switch() to + * save the state somewhere else. 
+ */ +/* tmp = curpcb->pcb_savefpu; */ + pushl %edi + pushl %esi + pushl %ecx + leal -176(%ebp),%edi + movl _curpcb,%esi + addl $112,%esi + cld + movl $44,%ecx + rep + movsl + popl %ecx + popl %esi + popl %edi +/* stop_emulating(); */ + clts +/* npxproc = curproc; */ + movl _curproc,%eax + movl %eax,_npxproc +/* } */ +L7: + ALIGN_TEXT +fastmove_loop: + movl 32(%esi),%eax + movl 64(%esi),%eax + movl 96(%esi),%eax + movl 128(%esi),%eax + movl 160(%esi),%eax + movl 192(%esi),%eax + movl 224(%esi),%eax + + cmpl $259,%ecx + jbe fastmove_tmp + movl 256(%esi),%eax + + ALIGN_TEXT +fastmove_tmp: + fildq 0(%esi) + fildq 8(%esi) + fildq 16(%esi) + fildq 24(%esi) + fildq 32(%esi) + fildq 40(%esi) + fildq 48(%esi) + fildq 56(%esi) + fxch %st(7) + fistpq 0(%edi) + fxch %st(5) + fistpq 8(%edi) + fxch %st(3) + fistpq 16(%edi) + fxch %st(1) + fistpq 24(%edi) + fistpq 32(%edi) + fistpq 40(%edi) + fistpq 48(%edi) + fistpq 56(%edi) + fildq 64(%esi) + fildq 72(%esi) + fildq 80(%esi) + fildq 88(%esi) + fildq 96(%esi) + fildq 104(%esi) + fildq 112(%esi) + fildq 120(%esi) + fxch %st(7) + fistpq 64(%edi) + fxch %st(5) + fistpq 72(%edi) + fxch %st(3) + fistpq 80(%edi) + fxch %st(1) + fistpq 88(%edi) + fistpq 96(%edi) + fistpq 104(%edi) + fistpq 112(%edi) + fistpq 120(%edi) + fildq 128(%esi) + fildq 136(%esi) + fildq 144(%esi) + fildq 152(%esi) + fildq 160(%esi) + fildq 168(%esi) + fildq 176(%esi) + fildq 184(%esi) + fxch %st(7) + fistpq 128(%edi) + fxch %st(5) + fistpq 136(%edi) + fxch %st(3) + fistpq 144(%edi) + fxch %st(1) + fistpq 152(%edi) + fistpq 160(%edi) + fistpq 168(%edi) + fistpq 176(%edi) + fistpq 184(%edi) + fildq 192(%esi) + fildq 200(%esi) + fildq 208(%esi) + fildq 216(%esi) + fildq 224(%esi) + fildq 232(%esi) + fildq 240(%esi) + fildq 248(%esi) + fxch %st(7) + fistpq 192(%edi) + fxch %st(5) + fistpq 200(%edi) + fxch %st(3) + fistpq 208(%edi) + fxch %st(1) + fistpq 216(%edi) + fistpq 224(%edi) + fistpq 232(%edi) + fistpq 240(%edi) + fistpq 248(%edi) + addl $-256,%ecx + addl 
$256,%esi + addl $256,%edi + cmpl $255,%ecx + ja fastmove_loop + +/* if (intr_nesting_level > 0) */ + + cmpb $0,_intr_nesting_level + je L9 + +/* Restore reentrantly. */ + frstor -176(%ebp) + movl %edx,%cr0 + jmp L10 + +/* else { */ + ALIGN_TEXT +L9: +/* curpcb->pcb_savefpu = tmp; */ + pushl %edi + pushl %esi + pushl %ecx + movl _curpcb,%edi + addl $112,%edi + leal -176(%ebp),%esi + cld + movl $44,%ecx + rep + movsl + popl %ecx + popl %esi + popl %edi + +/* start_emulating(); */ + smsw %ax + orb $8,%al + lmsw %ax +/* npxproc = NULL; */ + movl $0,_npxproc +/* } */ +L10: + movl %ebp,%esp + popl %ebp + + ALIGN_TEXT +fastmove_tail: + movb %cl,%al + shrl $2,%ecx /* copy longword-wise */ + cld + rep + movsl + movb %al,%cl + andb $3,%cl /* copy remaining bytes */ + rep + movsb + + ret +#endif /* I586_FAST_BCOPY */ /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory === * When you get everything * working, add some tests and flags to restore the panic if a T_DNA * trap occurs unexpectedly (set a flag FP_KERNEL_USING_FP in pcb->pcbflags * while the kernel is using FP in fastmove() or elsewhere, and `break;' * for the kernel T_DNA case if this flag isn't set). I'm not sure if I understand this, but let me try to digest it a little more over the weekend. ;) * Also, the `intr_nesting_level > 0' case needs to preserve the TS bit. * This case probably hasn't been executed yet (it will be executed when * bcopy() uses fastmove() and an interrupt handler calls bcopy()). Oh, really? I thought it was saved via %edx. Anyway, I think I've got it working here. I have run several make worlds on a few machines and they don't seem to crack. I also have it on my home machine and am running all sorts of things from ghostscript to top and nothing suspicious has happened so far. (I saw a few processes die with SIGFPE before I fixed the fnsave bug.) If some brave soul out there can test this, it will be great. 
Just apply the above patch to a -current /sys/i386/i386/ and rebuild your kernel with 'options "I586_FAST_BCOPY"'. No guarantees, though. Just because it works here doesn't mean it won't eat your root partition for lunch. :] Satoshi