Skip site navigation (1) | Skip section navigation (2)
Date:      Thu, 10 Jun 2004 19:59:46 +1000 (EST)
From:      Bruce Evans <bde@zeta.org.au>
To:        Don Bowman <don@sandvine.com>
Cc:        "'current@freebsd.org'" <current@FreeBSD.org>
Subject:   Re: kernel trap 19 with interrupts disabled
Message-ID:  <20040610194112.S8078@gamplex.bde.org>
In-Reply-To: <FE045D4D9F7AED4CBFF1B3B813C85337051D8F48@mail.sandvine.com>
References:  <FE045D4D9F7AED4CBFF1B3B813C85337051D8F48@mail.sandvine.com>

next in thread | previous in thread | raw e-mail | index | archive | help
On Wed, 9 Jun 2004, Don Bowman wrote:

> I have a machine which is completely locking
> up solid every day or so. It's been doing this
> for a couple of months on current. It is running
> cvs current from ~2 weeks ago.
>
> This time, I tried shorting the NMI out, and I
> got this message to the serial console:
>
> kernel trap 19 with interrupts disabled
> NMI ... going to debugger
>
> ... but I still can't get into the debugger
> with the key sequence, and no additional
> output came out.
>
> Can I assume from the 'with interrupts disabled'
> that all interrupts are locked off,
> or that 'sti' is set? It's an MP system, a dual
> Xeon (P4).

It means that the NMI was serviced by a CPU that had interrupts disabled
at the CPU level, i.e., maskable interrupts were off on that one CPU (the
state after 'cli', not 'sti').  It says nothing about the other CPUs.  The
message is a little spurious for an NMI, because an NMI by definition is
supposed to be able to occur when interrupts are masked, but it can be
useful to know when an unexpected or even an expected trap occurs with
interrupts disabled.  Here it tells us that the interrupt may really have
needed to be non-maskable to break into some code that is looping with
interrupts disabled.

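If you didn't get a debugger prompt after the message, then the debugger
is probably looping too.  It sends a stop IPI to all the other CPUs and
waits for every one of them to stop, so it will hang if any of them is
looping with interrupts disabled too and never services the IPI.  Try the
following hack to get further; it replaces the unbounded wait with a
bounded spin that prints a message and carries on after a timeout.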

%%%
Index: db_interface.c
===================================================================
RCS file: /home/ncvs/src/sys/i386/i386/db_interface.c,v
retrieving revision 1.81
diff -u -2 -r1.81 db_interface.c
--- db_interface.c	3 Apr 2004 22:23:36 -0000	1.81
+++ db_interface.c	4 Apr 2004 05:37:38 -0000
@@ -35,4 +35,5 @@
 #include <sys/reboot.h>
 #include <sys/cons.h>
+#include <sys/ktr.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
@@ -41,4 +42,5 @@
 #include <machine/cpu.h>
 #ifdef SMP
+#include <machine/smp.h>
 #include <machine/smptests.h>	/** CPUSTOP_ON_DDBBREAK */
 #endif
@@ -61,4 +63,33 @@
 static jmp_buf	db_global_jmpbuf;

+#ifdef SMP
+/* XXX cloned from stop_cpus() since that function can hang; the wait here is bounded. */
+static int
+attempt_to_stop_cpus(u_int map)
+{
+	int i;
+
+	if (!smp_started)
+		return 0;	/* no IPI is sent in this case */
+
+	CTR1(KTR_SMP, "attempt_to_stop_cpus(%x)", map);
+
+	/* send IPI_STOP to every CPU whose bit is set in map */
+	ipi_selected(map, IPI_STOP);
+
+	i = 0;
+	while ((atomic_load_acq_int(&stopped_cpus) & map) != map) {
+		/* spin until every bit of map shows up in stopped_cpus */
+		i++;
+		if (i == 100000000) {	/* iteration cap, not a time value */
+			printf("timeout stopping cpus\n");
+			break;
+		}
+	}
+
+	return 1;	/* also returned after a timeout; callers must check stopped_cpus */
+}
+#endif /* SMP */
+
 /*
  *  kdb_trap - field a TRACE or BPT trap
@@ -69,4 +100,8 @@
 	u_int ef;
 	volatile int ddb_mode = !(boothowto & RB_GDB);
+#ifdef SMP
+	static u_int kdb_trap_lock = NOCPU;
+	static u_int output_lock;
+#endif

 	/*
@@ -91,16 +126,48 @@

 #ifdef SMP
+	if (atomic_cmpset_int(&kdb_trap_lock, NOCPU, PCPU_GET(cpuid)) == 0 &&
+	    kdb_trap_lock != PCPU_GET(cpuid)) {
+		while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+			;
+		db_printf(
+		    "concurrent ddb entry: type %d trap, code=%x cpu=%d\n",
+		    type, code, PCPU_GET(cpuid));
+		atomic_store_rel_int(&output_lock, 0);
+		if (type == T_BPTFLT)
+			regs->tf_eip--;
+		else {
+			while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+				;
+			db_printf(
+"concurrent ddb entry on non-breakpoint: too hard to handle properly\n");
+			atomic_store_rel_int(&output_lock, 0);
+		}
+		while (atomic_load_acq_int(&kdb_trap_lock) != NOCPU)
+			;
+		write_eflags(ef);
+		return (1);
+	}
+#endif
+
+#ifdef SMP
 #ifdef CPUSTOP_ON_DDBBREAK
+#define VERBOSE_CPUSTOP_ON_DDBBREAK_NOT

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf("\nCPU%d stopping CPUs: 0x%08x...", PCPU_GET(cpuid),
 	    PCPU_GET(other_cpus));
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

 	/* We stop all CPUs except ourselves (obviously) */
-	stop_cpus(PCPU_GET(other_cpus));
+	attempt_to_stop_cpus(PCPU_GET(other_cpus));

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf(" stopped.\n");
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

@@ -192,18 +259,29 @@

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf("\nCPU%d restarting CPUs: 0x%08x...", PCPU_GET(cpuid),
 	    stopped_cpus);
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

 	/* Restart all the CPUs we previously stopped */
 	if (stopped_cpus != PCPU_GET(other_cpus) && smp_started != 0) {
+		while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+			;
 		db_printf("whoa, other_cpus: 0x%08x, stopped_cpus: 0x%08x\n",
 			  PCPU_GET(other_cpus), stopped_cpus);
+		atomic_store_rel_int(&output_lock, 0);
+#if 0
 		panic("stop_cpus() failed");
+#endif
 	}
 	restart_cpus(stopped_cpus);

 #if defined(VERBOSE_CPUSTOP_ON_DDBBREAK)
+	while (atomic_cmpset_int(&output_lock, 0, 1) == 0)
+		;
 	db_printf(" restarted.\n");
+	atomic_store_rel_int(&output_lock, 0);
 #endif /* VERBOSE_CPUSTOP_ON_DDBBREAK */

@@ -211,4 +289,8 @@
 #endif /* SMP */

+#ifdef SMP
+	atomic_store_rel_int(&kdb_trap_lock, NOCPU);
+#endif
+
 	write_eflags(ef);

%%%

Bruce



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20040610194112.S8078>