Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 24 Jan 2017 16:05:42 +0000 (UTC)
From:      Sean Bruno <sbruno@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r312698 - in head/sys: dev/e1000 kern net sys
Message-ID:  <201701241605.v0OG5gqR092323@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: sbruno
Date: Tue Jan 24 16:05:42 2017
New Revision: 312698
URL: https://svnweb.freebsd.org/changeset/base/312698

Log:
  iflib:
       Add internal tracking of smp startup status to reliably figure out
       what methods are to be used to get gtaskqueue up and running.
  
  e1000:
       Calculating this pointer gives undefined behaviour when (last == -1)
       (it is before the buffer).  The pointer is always followed.  Panics
       occurred when it points to an unmapped page.  Otherwise, the pointed-to
       garbage tends to not have the E1000_TXD_STAT_DD bit set in it, so in the
       broken case the loop was usually null and the function just returned, and
       this was acidentally correct.
  
  Submitted by:	bde
  Reported by:	Matt Macy <mmacy@nextbsd.org>

Modified:
  head/sys/dev/e1000/em_txrx.c
  head/sys/kern/subr_gtaskqueue.c
  head/sys/net/iflib.c
  head/sys/sys/gtaskqueue.h

Modified: head/sys/dev/e1000/em_txrx.c
==============================================================================
--- head/sys/dev/e1000/em_txrx.c	Tue Jan 24 15:55:52 2017	(r312697)
+++ head/sys/dev/e1000/em_txrx.c	Tue Jan 24 16:05:42 2017	(r312698)
@@ -408,10 +408,13 @@ em_isc_txd_credits_update(void *arg, uin
 	cidx = cidx_init;
 	buf = &txr->tx_buffers[cidx];
 	tx_desc = &txr->tx_base[cidx];
-        last = buf->eop;
+	last = buf->eop;
+	if (last == -1)
+		return (processed);
 	eop_desc = &txr->tx_base[last];
 
-	DPRINTF(iflib_get_dev(adapter->ctx), "credits_update: cidx_init=%d clear=%d last=%d\n",
+	DPRINTF(iflib_get_dev(adapter->ctx),
+		      "credits_update: cidx_init=%d clear=%d last=%d\n",
 		      cidx_init, clear, last);
 	/*
 	 * What this does is get the index of the
@@ -420,7 +423,7 @@ em_isc_txd_credits_update(void *arg, uin
 	 * simple comparison on the inner while loop.
 	 */
 	if (++last == scctx->isc_ntxd[0])
-	     last = 0;
+		last = 0;
 	done = last;
 
 
@@ -436,7 +439,7 @@ em_isc_txd_credits_update(void *arg, uin
 			tx_desc++;
 			buf++;
 			processed++;
-		  
+
 			/* wrap the ring ? */
 			if (++cidx == scctx->isc_ntxd[0]) {
 				cidx = 0;

Modified: head/sys/kern/subr_gtaskqueue.c
==============================================================================
--- head/sys/kern/subr_gtaskqueue.c	Tue Jan 24 15:55:52 2017	(r312697)
+++ head/sys/kern/subr_gtaskqueue.c	Tue Jan 24 16:05:42 2017	(r312698)
@@ -630,6 +630,29 @@ taskqgroup_find(struct taskqgroup *qgrou
 
 	return (idx);
 }
+/*
+ * smp_started is unusable since it is not set for UP kernels or even for
+ * SMP kernels when there is 1 CPU.  This is usually handled by adding a
+ * (mp_ncpus == 1) test, but that would be broken here since we need to
+ * to synchronize with the SI_SUB_SMP ordering.  Even in the pure SMP case
+ * smp_started only gives a fuzzy ordering relative to SI_SUB_SMP.
+ *
+ * So maintain our own flag.  It must be set after all CPUs are started
+ * and before SI_SUB_SMP:SI_ORDER_ANY so that the SYSINIT for delayed
+ * adjustment is properly delayed.  SI_ORDER_FOURTH is clearly before
+ * SI_ORDER_ANY and unclearly after the CPUs are started.  It would be
+ * simpler for adjustment to pass a flag indicating if it is delayed.
+ */ 
+static int tqg_smp_started;
+
+static void
+tqg_record_smp_started(void *arg)
+{
+	tqg_smp_started = 1;
+}
+
+SYSINIT(tqg_record_smp_started, SI_SUB_SMP, SI_ORDER_FOURTH,
+	tqg_record_smp_started, NULL);
 
 void
 taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask,
@@ -647,7 +670,7 @@ taskqgroup_attach(struct taskqgroup *qgr
 	qgroup->tqg_queue[qid].tgc_cnt++;
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
-	if (irq != -1 && (smp_started || mp_ncpus == 1)) {
+	if (irq != -1 && tqg_smp_started ) {
 		gtask->gt_cpu = qgroup->tqg_queue[qid].tgc_cpu;
 		CPU_ZERO(&mask);
 		CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask);
@@ -697,7 +720,7 @@ taskqgroup_attach_cpu(struct taskqgroup 
 	gtask->gt_irq = irq;
 	gtask->gt_cpu = cpu;
 	mtx_lock(&qgroup->tqg_lock);
-	if (smp_started || mp_ncpus == 1) {
+	if (tqg_smp_started) {
 		for (i = 0; i < qgroup->tqg_cnt; i++)
 			if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
 				qid = i;
@@ -731,7 +754,7 @@ taskqgroup_attach_cpu_deferred(struct ta
 	qid = -1;
 	irq = gtask->gt_irq;
 	cpu = gtask->gt_cpu;
-	MPASS(smp_started || mp_ncpus == 1);
+	MPASS(tqg_smp_started);
 	mtx_lock(&qgroup->tqg_lock);
 	for (i = 0; i < qgroup->tqg_cnt; i++)
 		if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
@@ -824,9 +847,10 @@ _taskqgroup_adjust(struct taskqgroup *qg
 
 	mtx_assert(&qgroup->tqg_lock, MA_OWNED);
 
-	if (cnt < 1 || cnt * stride > mp_ncpus || (!smp_started && (mp_ncpus != 1))) {
-		printf("taskqgroup_adjust failed cnt: %d stride: %d mp_ncpus: %d smp_started: %d\n",
-			   cnt, stride, mp_ncpus, smp_started);
+	if (cnt < 1 || cnt * stride > mp_ncpus || !tqg_smp_started) {
+		printf("%s: failed cnt: %d stride: %d "
+		       "mp_ncpus: %d smp_started: %d\n",
+			__func__, cnt, stride, mp_ncpus, smp_started);
 		return (EINVAL);
 	}
 	if (qgroup->tqg_adjusting) {

Modified: head/sys/net/iflib.c
==============================================================================
--- head/sys/net/iflib.c	Tue Jan 24 15:55:52 2017	(r312697)
+++ head/sys/net/iflib.c	Tue Jan 24 16:05:42 2017	(r312698)
@@ -1193,13 +1193,36 @@ iflib_dma_free_multi(iflib_dma_info_t *d
 		iflib_dma_free(*dmaiter);
 }
 
+#ifdef EARLY_AP_STARTUP
+static const int iflib_started = 1;
+#else
+/*
+ * We used to abuse the smp_started flag to decide if the queues have been
+ * fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()).
+ * That gave bad races, since the SYSINIT() runs strictly after smp_started
+ * is set.  Run a SYSINIT() strictly after that to just set a usable
+ * completion flag.
+ */
+
+static int iflib_started;
+
+static void
+iflib_record_started(void *arg)
+{
+	iflib_started = 1;
+}
+
+SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST,
+	iflib_record_started, NULL);
+#endif
+
 static int
 iflib_fast_intr(void *arg)
 {
 	iflib_filter_info_t info = arg;
 	struct grouptask *gtask = info->ifi_task;
 
-	if (!smp_started && mp_ncpus > 1)
+	if (!iflib_started)
 		return (FILTER_HANDLED);
 
 	DBG_COUNTER_INC(fast_intrs);
@@ -3728,7 +3751,16 @@ iflib_device_register(device_t dev, void
 		device_printf(dev, "qset structure setup failed %d\n", err);
 		goto fail_queues;
 	}
-
+	/*
+	 * Group taskqueues aren't properly set up until SMP is started,
+	 * so we disable interrupts until we can handle them post
+	 * SI_SUB_SMP.
+	 *
+	 * XXX: disabling interrupts doesn't actually work, at least for
+	 * the non-MSI case.  When they occur before SI_SUB_SMP completes,
+	 * we do null handling and depend on this not causing too large an
+	 * interrupt storm.
+	 */
 	IFDI_INTR_DISABLE(ctx);
 	if (msix > 1 && (err = IFDI_MSIX_INTR_ASSIGN(ctx, msix)) != 0) {
 		device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err);
@@ -4556,13 +4588,6 @@ iflib_legacy_setup(if_ctx_t ctx, driver_
 	void *q;
 	int err;
 
-	/*
-	 * group taskqueues aren't properly set up until SMP is started
-	 * so we disable interrupts until we can handle them post
-	 * SI_SUB_SMP
-	 */
-	IFDI_INTR_DISABLE(ctx);
-
 	q = &ctx->ifc_rxqs[0];
 	info = &rxq[0].ifr_filter_info;
 	gtask = &rxq[0].ifr_task;

Modified: head/sys/sys/gtaskqueue.h
==============================================================================
--- head/sys/sys/gtaskqueue.h	Tue Jan 24 15:55:52 2017	(r312697)
+++ head/sys/sys/gtaskqueue.h	Tue Jan 24 16:05:42 2017	(r312698)
@@ -81,7 +81,7 @@ int	taskqgroup_adjust(struct taskqgroup 
 extern struct taskqgroup *qgroup_##name
 
 
-#if (!defined(SMP) || defined(EARLY_AP_STARTUP))
+#ifdef EARLY_AP_STARTUP
 #define TASKQGROUP_DEFINE(name, cnt, stride)				\
 									\
 struct taskqgroup *qgroup_##name;					\
@@ -95,7 +95,8 @@ taskqgroup_define_##name(void *arg)					
 									\
 SYSINIT(taskqgroup_##name, SI_SUB_INIT_IF, SI_ORDER_FIRST,		\
 	taskqgroup_define_##name, NULL)
-#else /* SMP && !EARLY_AP_STARTUP */
+
+#else /* !EARLY_AP_STARTUP */
 #define TASKQGROUP_DEFINE(name, cnt, stride)				\
 									\
 struct taskqgroup *qgroup_##name;					\
@@ -104,15 +105,6 @@ static void								\
 taskqgroup_define_##name(void *arg)					\
 {									\
 	qgroup_##name = taskqgroup_create(#name);			\
-	/* Adjustment will be null unless smp_cpus == 1. */		\
-	/*								\
-	 * XXX this was intended to fix the smp_cpus == 1 case, but	\
-	 * doesn't actually work for that.  It gives thes same strange	\
-	 * panic as adjustment at SI_SUB_INIT_IF:SI_ORDER_ANY for a	\
-	 * device that works with a pure UP kernel.			\
-	 */								\
-	/* XXX this code is common now, so should not be ifdefed. */	\
-	taskqgroup_adjust(qgroup_##name, (cnt), (stride));		\
 }									\
 									\
 SYSINIT(taskqgroup_##name, SI_SUB_INIT_IF, SI_ORDER_FIRST,		\
@@ -121,17 +113,13 @@ SYSINIT(taskqgroup_##name, SI_SUB_INIT_I
 static void								\
 taskqgroup_adjust_##name(void *arg)					\
 {									\
-	/* 								\
-	 * Adjustment when smp_cpus > 1 only works accidentally		\
-	 * (when there is no device interrupt before adjustment).	\
-	 */								\
 	taskqgroup_adjust(qgroup_##name, (cnt), (stride));		\
 }									\
 									\
 SYSINIT(taskqgroup_adj_##name, SI_SUB_SMP, SI_ORDER_ANY,		\
 	taskqgroup_adjust_##name, NULL);				\
 
-#endif /* !SMP || EARLY_AP_STARTUP */
+#endif /* EARLY_AP_STARTUP */
 
 TASKQGROUP_DECLARE(net);
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201701241605.v0OG5gqR092323>