Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 26 Aug 2005 19:05:12 +0000 (GMT)
From:      jkoshy@FreeBSD.ORG (Joseph Koshy)
To:        Stephan Uphoff <ups@FreeBSD.org>
Cc:        cvs-src@FreeBSD.org, src-committers@FreeBSD.org, cvs-all@FreeBSD.org, jkoshy@FreeBSD.ORG
Subject:   Re: cvs commit: src/sys/amd64/amd64 exception.S 
Message-ID:  <20050826190512.1717116A420@hub.freebsd.org>
In-Reply-To: Message from Stephan Uphoff <ups@FreeBSD.org> of "Thu, 25 Aug 2005 20:33:43 GMT." <200508252033.j7PKXhdT078969@repoman.freebsd.org> 

next in thread | previous in thread | raw e-mail | index | archive | help


> ups         2005-08-25 20:33:43 UTC
> 
>   FreeBSD src repository
> 
>   Modified files:
>     sys/amd64/amd64      exception.S 
>   Log:
>   NMI handler should not enable interrupts.
>   
>   Tested by: kris@
>   MFC after:      3 weeks

Actually, NMI handling doesn't quite work on the AMD64, even with
this change.

The problem is that the kernel protects code where it uses SWAPGS
from reentrancy by blocking interrupts (using 'cli').  However,
NMIs are non-maskable, and if an NMI is taken when in such a critical
section we end up with the GS.base values getting out of sync and
a processor reset or a double fault on a subsequent use of the
out-of-sync %gs value.

The attached patch addresses this bug for NMIs.  It is needed before
you can use hwpmc(4) to sample on an AMD64 box.

The general solution is more complicated than this patch.  In theory
all unmaskable traps into the kernel appear to be susceptible to
having the processor's GS.base values getting out of sync.  In
practice though, this appears to be a problem only for NMIs as most
of the other exceptions do not happen during the vulnerable critical
section.  Debug exceptions taken from userland cannot trigger the
bug.  A machine check exception would be vulnerable, but we don't
implement this today.

Regards,
Koshy
<jkoshy@freebsd.org>

Index: db_trace.c
===================================================================
RCS file: /cvs/FreeBSD/src/sys/amd64/amd64/db_trace.c,v
retrieving revision 1.68
diff -u -u -r1.68 db_trace.c
--- db_trace.c	3 Aug 2005 04:33:48 -0000	1.68
+++ db_trace.c	20 Aug 2005 15:17:12 -0000
@@ -317,7 +317,8 @@
 	db_symbol_values(sym, &name, NULL);
 	if (name != NULL) {
 		if (strcmp(name, "calltrap") == 0 ||
-		    strcmp(name, "fork_trampoline") == 0)
+		    strcmp(name, "fork_trampoline") == 0 ||
+		    strcmp(name, "nmi_calltrap") == 0)
 			frame_type = TRAP;
 		else if (strncmp(name, "Xatpic_intr", 11) == 0 ||
 		    strncmp(name, "Xatpic_fastintr", 15) == 0 ||
Index: exception.S
===================================================================
RCS file: /cvs/FreeBSD/src/sys/amd64/amd64/exception.S,v
retrieving revision 1.126
diff -u -u -r1.126 exception.S
--- exception.S	25 Aug 2005 20:33:43 -0000	1.126
+++ exception.S	26 Aug 2005 18:31:39 -0000
@@ -93,8 +93,6 @@
 	jmp alltraps
 IDTVEC(div)
 	TRAP(T_DIVIDE)
-IDTVEC(nmi)
-	TRAP_NOEN(T_NMI)
 IDTVEC(ofl)
 	TRAP(T_OFLOW)
 IDTVEC(bnd)
@@ -313,6 +311,82 @@
 IDTVEC(fast_syscall32)
 	sysret
 
+/*
+ * NMI handling is special.
+ *
+ * First, NMIs do not respect the state of the processor's RFLAGS.IF
+ * bit and the NMI handler may be invoked at any time, including when
+ * the processor is in a critical section.  In particular, this means
+ * that the processor's GS.base values could be inconsistent on entry
+ * to the handler.
+ *
+ * Second, the processor treats NMIs specially, blocking further NMIs
+ * till an 'iretq' instruction is executed.  We therefore need to
+ * execute the NMI handler with interrupts disabled to prevent a
+ * nested interrupt from executing an 'iretq' instruction and
+ * inadvertently taking the processor out of NMI mode.
+ *
+ * We use '%ebx', a C-preserved register, to remember whether to swap
+ * GS back on the exit path.
+ */
+
+IDTVEC(nmi)
+	subq	$TF_RIP,%rsp
+	movq	$(T_NMI),TF_TRAPNO(%rsp)
+	movq	$0,TF_ADDR(%rsp)
+	movq	$0,TF_ERR(%rsp)
+	movq	%rdi,TF_RDI(%rsp)
+	movq	%rsi,TF_RSI(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	movq	%r8,TF_R8(%rsp)
+	movq	%r9,TF_R9(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rbx,TF_RBX(%rsp)
+	movq	%rbp,TF_RBP(%rsp)
+	movq	%r10,TF_R10(%rsp)
+	movq	%r11,TF_R11(%rsp)
+	movq	%r12,TF_R12(%rsp)
+	movq	%r13,TF_R13(%rsp)
+	movq	%r14,TF_R14(%rsp)
+	movq	%r15,TF_R15(%rsp)
+	xorl	%ebx,%ebx
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
+	jnz	nmi_needswapgs		/* we came from userland */
+	movl	$MSR_GSBASE,%ecx
+	rdmsr
+	cmpl	$VM_MAXUSER_ADDRESS >> 32,%edx
+	jae	nmi_calltrap		/* GS.base holds a kernel VA */
+nmi_needswapgs:
+	incl	%ebx
+	swapgs
+/* Note: this label is also used by ddb and gdb: */
+nmi_calltrap:
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	call	trap
+	MEXITCOUNT
+	testl	%ebx,%ebx
+	jz	nmi_restoreregs
+	swapgs
+nmi_restoreregs:
+	movq	TF_RDI(%rsp),%rdi
+	movq	TF_RSI(%rsp),%rsi
+	movq	TF_RDX(%rsp),%rdx
+	movq	TF_RCX(%rsp),%rcx
+	movq	TF_R8(%rsp),%r8
+	movq	TF_R9(%rsp),%r9
+	movq	TF_RAX(%rsp),%rax
+	movq	TF_RBX(%rsp),%rbx
+	movq	TF_RBP(%rsp),%rbp
+	movq	TF_R10(%rsp),%r10
+	movq	TF_R11(%rsp),%r11
+	movq	TF_R12(%rsp),%r12
+	movq	TF_R13(%rsp),%r13
+	movq	TF_R14(%rsp),%r14
+	movq	TF_R15(%rsp),%r15
+	addq	$TF_RIP,%rsp
+	iretq
+
 ENTRY(fork_trampoline)
 	movq	%r12, %rdi		/* function */
 	movq	%rbx, %rsi		/* arg1 */
Index: genassym.c
===================================================================
RCS file: /cvs/FreeBSD/src/sys/amd64/amd64/genassym.c,v
retrieving revision 1.155
diff -u -u -r1.155 genassym.c
--- genassym.c	20 Nov 2004 02:30:59 -0000	1.155
+++ genassym.c	20 Aug 2005 15:11:01 -0000
@@ -209,3 +209,5 @@
 
 ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock));
 ASSYM(MTX_RECURSECNT, offsetof(struct mtx, mtx_recurse));
+
+ASSYM(MSR_GSBASE, MSR_GSBASE);
Index: trap.c
===================================================================
RCS file: /cvs/FreeBSD/src/sys/amd64/amd64/trap.c,v
retrieving revision 1.289
diff -u -u -r1.289 trap.c
--- trap.c	29 Jun 2005 23:23:16 -0000	1.289
+++ trap.c	20 Aug 2005 15:18:25 -0000
@@ -219,9 +219,9 @@
 			    type);
 			/*
 			 * We shouldn't enable interrupts while in a critical
-			 * section.
+			 * section, or if servicing an NMI.
 			 */
-			if (td->td_critnest == 0)
+			if (type != T_NMI && td->td_critnest == 0)
 				enable_intr();
 		}
 	}





Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20050826190512.1717116A420>