Date:      Fri, 11 Feb 2011 11:22:14 +0000 (UTC)
From:      Jeff Roberson <jeff@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r218552 - projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib
Message-ID:  <201102111122.p1BBMEIh057808@svn.freebsd.org>

Author: jeff
Date: Fri Feb 11 11:22:14 2011
New Revision: 218552
URL: http://svn.freebsd.org/changeset/base/218552

Log:
   - Eliminate zero-length mbufs when loading the TX descriptor; they cause
     the driver to hang.  The network stack creates them when building IP
     fragments, for reasons that are not yet understood.  (A standalone
     sketch of the pruning loop follows the log.)
   - Make it safe to poll TX completions without the device lock held.  This
     significantly improves TCP performance.
   - I had erroneously set the MTU based on the receive buffer size, which
     includes the GRH.  Correct this.
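
For illustration, a minimal standalone sketch of the zero-length mbuf
pruning now done in ipoib_dma_map_tx() before the fragment count is
taken.  The helper name prune_empty_mbufs() and its framing as a
separate function are assumptions for the example, not part of the
driver:

/*
 * Sketch only: drop zero-length mbufs from a TX chain, mirroring the
 * loop added to ipoib_dma_map_tx() in this commit.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

static int
prune_empty_mbufs(struct mbuf *mb)
{
	struct mbuf *m, *p;
	int nfrags;

	for (m = mb, p = NULL, nfrags = 0; m != NULL;
	    p = m, m = m->m_next, nfrags++) {
		if (m->m_len != 0)
			continue;
		/* The head mbuf must carry data; only later mbufs are freed. */
		if (p == NULL)
			panic("prune_empty_mbufs: first mbuf empty");
		/* m_free() frees m and returns its successor, relinking the chain. */
		p->m_next = m_free(m);
		m = p;
		nfrags--;
	}
	return (nfrags);
}

In the driver itself the loop runs inline over tx_req->mb, and the
resulting fragment count is then checked against MAX_MB_FRAGS (with
m_defrag() as the fallback) before the chain is DMA mapped.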

Modified:
  projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
  projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
  projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
  projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
  projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c

Modified: projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
==============================================================================
--- projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h	Fri Feb 11 10:50:33 2011	(r218551)
+++ projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h	Fri Feb 11 11:22:14 2011	(r218552)
@@ -91,7 +91,7 @@
 /* constants */
 
 #define	INFINIBAND_ALEN		20	/* Octets in IPoIB HW addr */
-#define	MAX_MB_FRAGS		(8192 / MCLBYTES)
+#define	MAX_MB_FRAGS		((8192 / MCLBYTES) + 2)
 
 #ifdef IPOIB_CM
 #define	CONFIG_INFINIBAND_IPOIB_CM
@@ -99,6 +99,7 @@
 
 #ifdef IPOIB_DEBUG
 #define	CONFIG_INFINIBAND_IPOIB_DEBUG
+#define CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
 #endif
 
 enum ipoib_flush_level {
@@ -110,7 +111,6 @@ enum ipoib_flush_level {
 enum {
 	IPOIB_ENCAP_LEN		  = 4,
 	IPOIB_HEADER_LEN	  = IPOIB_ENCAP_LEN + INFINIBAND_ALEN,
-	IPOIB_UD_HEAD_SIZE	  = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
 	IPOIB_UD_RX_SG		  = 1, /* max buffer needed for 4K mtu */
 
 	IPOIB_CM_MAX_MTU	  = MJUM16BYTES,
@@ -286,7 +286,6 @@ struct ipoib_cm_dev_priv {
 	struct ifqueue     	mb_queue;
 	struct list_head	start_list;
 	struct list_head	reap_list;
-	struct ib_wc		ibwc[IPOIB_NUM_WC];
 	struct ib_sge		rx_sge[IPOIB_CM_RX_SG];
 	struct ib_recv_wr       rx_wr;
 	int			nonsrq_conn_qp;
@@ -414,7 +413,7 @@ struct ipoib_path {
 };
 
 /* UD Only transmits encap len but we want the two sizes to be symmetrical. */
-#define IPOIB_UD_MTU(ib_mtu)		(ib_mtu - IB_GRH_BYTES)
+#define IPOIB_UD_MTU(ib_mtu)		(ib_mtu - IPOIB_ENCAP_LEN)
 #define	IPOIB_CM_MTU(ib_mtu)		(ib_mtu - IPOIB_ENCAP_LEN)
 
 #define	IPOIB_IS_MULTICAST(addr)	((addr)[4] == 0xff)
@@ -519,6 +518,8 @@ void ipoib_drain_cq(struct ipoib_dev_pri
 
 int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
 void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
+int ipoib_poll_tx(struct ipoib_dev_priv *priv);
+
 
 void ipoib_set_ethtool_ops(struct ifnet *dev);
 int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca);

Modified: projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
==============================================================================
--- projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c	Fri Feb 11 10:50:33 2011	(r218551)
+++ projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c	Fri Feb 11 11:22:14 2011	(r218552)
@@ -94,6 +94,7 @@ static int ipoib_cm_post_receive_srq(str
 	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
 	priv->cm.rx_sge[0].addr = priv->cm.srq_ring[id].mapping[0];
+	priv->cm.rx_sge[0].length = priv->cm.max_cm_mtu;
 
 	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
 	if (unlikely(ret)) {
@@ -117,6 +118,7 @@ static int ipoib_cm_post_receive_nonsrq(
 	wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
 	sge[0].addr = rx->rx_ring[id].mapping[0];
+	priv->cm.rx_sge[0].length = priv->cm.max_cm_mtu;
 
 	ret = ib_post_recv(rx->qp, wr, &bad_wr);
 	if (unlikely(ret)) {
@@ -505,11 +507,13 @@ void ipoib_cm_handle_rx_wc(struct ipoib_
 
 	if (unlikely(wr_id >= ipoib_recvq_size)) {
 		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
+			spin_lock(&priv->lock);
 			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
 			ipoib_cm_start_rx_drain(priv);
 			if (priv->cm.id != NULL)
 				queue_work(ipoib_workqueue,
 				    &priv->cm.rx_reap_task);
+			spin_unlock(&priv->lock);
 		} else
 			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
 				   wr_id, ipoib_recvq_size);
@@ -532,8 +536,10 @@ void ipoib_cm_handle_rx_wc(struct ipoib_
 			goto repost;
 		else {
 			if (!--p->recv_count) {
+				spin_lock(&priv->lock);
 				list_move(&p->list, &priv->cm.rx_reap_list);
 				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+				spin_unlock(&priv->lock);
 			}
 			return;
 		}
@@ -574,13 +580,9 @@ void ipoib_cm_handle_rx_wc(struct ipoib_
 	mb->m_pkthdr.rcvif = dev;
 	proto = *mtod(mb, uint16_t *);
 	m_adj(mb, IPOIB_ENCAP_LEN);
-	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
-		mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;
 
 	IPOIB_MTAP_PROTO(dev, mb, proto);
-	spin_unlock(&priv->lock);
 	ipoib_demux(dev, mb, ntohs(proto));
-	spin_lock(&priv->lock);
 
 repost:
 	if (has_srq) {
@@ -626,8 +628,11 @@ void ipoib_cm_send(struct ipoib_dev_priv
 	struct ipoib_tx_buf *tx_req;
 	struct ifnet *dev = priv->dev;
 
+	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+		while (ipoib_poll_tx(priv)); /* nothing */
+
 	m_adj(mb, sizeof(struct ipoib_pseudoheader));
-	if (unlikely(mb->m_pkthdr.len > IPOIB_CM_MTU(tx->mtu))) {
+	if (unlikely(mb->m_pkthdr.len > tx->mtu)) {
 		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
 			   mb->m_pkthdr.len, tx->mtu);
 		++dev->if_oerrors;
@@ -655,11 +660,6 @@ void ipoib_cm_send(struct ipoib_dev_priv
 		return;
 	}
 
-	if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP))
-		priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
-	else
-		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
-
 	if (unlikely(post_send(priv, tx, tx_req, tx->tx_head & (ipoib_sendq_size - 1)))) {
 		ipoib_warn(priv, "post_send failed\n");
 		++dev->if_oerrors;
@@ -676,6 +676,7 @@ void ipoib_cm_send(struct ipoib_dev_priv
 			dev->if_drv_flags |= IFF_DRV_OACTIVE;
 		}
 	}
+
 }
 
 void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
@@ -936,7 +937,7 @@ static struct ib_qp *ipoib_cm_create_tx_
     struct ipoib_cm_tx *tx)
 {
 	struct ib_qp_init_attr attr = {
-		.send_cq		= priv->recv_cq,
+		.send_cq		= priv->send_cq,
 		.recv_cq		= priv->recv_cq,
 		.srq			= priv->cm.srq,
 		.cap.max_send_wr	= ipoib_sendq_size,

Modified: projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
==============================================================================
--- projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c	Fri Feb 11 10:50:33 2011	(r218551)
+++ projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c	Fri Feb 11 11:22:14 2011	(r218552)
@@ -90,8 +90,8 @@ void ipoib_free_ah(struct kref *kref)
 static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
 				  u64 mapping[IPOIB_UD_RX_SG])
 {
-	ib_dma_unmap_single(priv->ca, mapping[0], priv->max_ib_mtu,
-			    DMA_FROM_DEVICE);
+	ib_dma_unmap_single(priv->ca, mapping[0],
+	    priv->max_ib_mtu + IB_GRH_BYTES, DMA_FROM_DEVICE);
 }
 
 static void ipoib_ud_mb_put_frags(struct ipoib_dev_priv *priv,
@@ -110,6 +110,8 @@ static int ipoib_ib_post_receive(struct 
 
 	priv->rx_wr.wr_id   = id | IPOIB_OP_RECV;
 	priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
+	priv->rx_sge[0].length = priv->max_ib_mtu + IB_GRH_BYTES;
+
 
 	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
 	if (unlikely(ret)) {
@@ -131,7 +133,7 @@ static struct mbuf *ipoib_alloc_rx_mb(st
 	/*
 	 * XXX Should be calculated once and cached.
 	 */
-	buf_size = priv->max_ib_mtu;
+	buf_size = priv->max_ib_mtu + IB_GRH_BYTES;
 	if (buf_size <= MCLBYTES)
 		buf_size = MCLBYTES;
 	else if (buf_size <= MJUMPAGESIZE)
@@ -198,13 +200,18 @@ ipoib_ib_handle_rx_wc(struct ipoib_dev_p
 	mb  = priv->rx_ring[wr_id].mb;
 
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
-		if (wc->status != IB_WC_WR_FLUSH_ERR)
+		if (wc->status != IB_WC_WR_FLUSH_ERR) {
 			ipoib_warn(priv, "failed recv event "
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
-		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
-		m_freem(mb);
-		priv->rx_ring[wr_id].mb = NULL;
+			goto repost;
+		}
+		if (mb) {
+			ipoib_ud_dma_unmap_rx(priv,
+			     priv->rx_ring[wr_id].mapping);
+			m_freem(mb);
+			priv->rx_ring[wr_id].mb = NULL;
+		}
 		return;
 	}
 
@@ -243,9 +250,7 @@ ipoib_ib_handle_rx_wc(struct ipoib_dev_p
 	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok))
 		mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;
 
-	spin_unlock(&priv->lock);
 	dev->if_input(dev, mb);
-	spin_lock(&priv->lock);
 
 repost:
 	if (unlikely(ipoib_ib_post_receive(priv, wr_id)))
@@ -257,11 +262,19 @@ int ipoib_dma_map_tx(struct ib_device *c
 {
 	struct mbuf *mb = tx_req->mb;
 	u64 *mapping = tx_req->mapping;
-	struct mbuf *m;
+	struct mbuf *m, *p;
 	int error;
 	int i;
 
-	for (m = mb, i = 0; m != NULL; m = m->m_next, i++);
+	for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) {
+		if (m->m_len != 0)
+			continue;
+		if (p == NULL)
+			panic("ipoib_dma_map_tx: First mbuf empty\n");
+		p->m_next = m_free(m);
+		m = p;
+		i--;
+	}
 	i--;
 	if (i >= MAX_MB_FRAGS) {
 		tx_req->mb = mb = m_defrag(mb, M_DONTWAIT);
@@ -339,13 +352,19 @@ static void ipoib_ib_handle_tx_wc(struct
 			   wc->status, wr_id, wc->vendor_err);
 }
 
-static int poll_tx(struct ipoib_dev_priv *priv)
+int
+ipoib_poll_tx(struct ipoib_dev_priv *priv)
 {
 	int n, i;
 
 	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
-	for (i = 0; i < n; ++i)
-		ipoib_ib_handle_tx_wc(priv, priv->send_wc + i);
+	for (i = 0; i < n; ++i) {
+		struct ib_wc *wc = priv->send_wc + i;
+		if (wc->wr_id & IPOIB_OP_CM)
+			ipoib_cm_handle_tx_wc(priv, wc);
+		else
+			ipoib_ib_handle_tx_wc(priv, wc);
+	}
 
 	return n == MAX_SEND_CQE;
 }
@@ -362,13 +381,13 @@ poll_more:
 		for (i = 0; i < n; i++) {
 			struct ib_wc *wc = priv->ibwc + i;
 
-			if (wc->wr_id & IPOIB_OP_RECV) {
-				if (wc->wr_id & IPOIB_OP_CM)
-					ipoib_cm_handle_rx_wc(priv, wc);
-				else
-					ipoib_ib_handle_rx_wc(priv, wc);
-			} else
-				ipoib_cm_handle_tx_wc(priv, wc);
+			if ((wc->wr_id & IPOIB_OP_RECV) == 0)
+				panic("ipoib_poll: Bad wr_id 0x%jX\n",
+				    (intmax_t)wc->wr_id);
+			if (wc->wr_id & IPOIB_OP_CM)
+				ipoib_cm_handle_rx_wc(priv, wc);
+			else
+				ipoib_ib_handle_rx_wc(priv, wc);
 		}
 
 		if (n != IPOIB_NUM_WC)
@@ -384,9 +403,7 @@ void ipoib_ib_completion(struct ib_cq *c
 {
 	struct ipoib_dev_priv *priv = dev_ptr;
 
-	spin_lock(&priv->lock);
 	ipoib_poll(priv);
-	spin_unlock(&priv->lock);
 }
 
 static void drain_tx_cq(struct ipoib_dev_priv *priv)
@@ -394,7 +411,7 @@ static void drain_tx_cq(struct ipoib_dev
 	struct ifnet *dev = priv->dev;
 
 	spin_lock(&priv->lock);
-	while (poll_tx(priv))
+	while (ipoib_poll_tx(priv))
 		; /* nothing */
 
 	if (dev->if_drv_flags & IFF_DRV_OACTIVE)
@@ -430,6 +447,7 @@ post_send(struct ipoib_dev_priv *priv, u
 	priv->tx_wr.wr.ud.remote_qpn = qpn;
 	priv->tx_wr.wr.ud.ah 	     = address;
 
+
 	if (head) {
 		priv->tx_wr.wr.ud.mss	 = 0; /* XXX mb_shinfo(mb)->gso_size; */
 		priv->tx_wr.wr.ud.header = head;
@@ -450,6 +468,10 @@ ipoib_send(struct ipoib_dev_priv *priv, 
 	int hlen;
 	void *phead;
 
+	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+		while (ipoib_poll_tx(priv))
+			; /* nothing */
+
 	m_adj(mb, sizeof (struct ipoib_pseudoheader));
 	if (0 /* XXX segment offload mb_is_gso(mb) */) {
 		/* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */
@@ -462,7 +484,7 @@ ipoib_send(struct ipoib_dev_priv *priv, 
 		}
 		m_adj(mb, hlen);
 	} else {
-		if (unlikely(mb->m_pkthdr.len > priv->mcast_mtu)) {
+		if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) {
 			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
 				   mb->m_pkthdr.len, priv->mcast_mtu);
 			++dev->if_oerrors;
@@ -518,10 +540,6 @@ ipoib_send(struct ipoib_dev_priv *priv, 
 		address->last_send = priv->tx_head;
 		++priv->tx_head;
 	}
-
-	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
-		while (poll_tx(priv))
-			; /* nothing */
 }
 
 static void __ipoib_reap_ah(struct ipoib_dev_priv *priv)
@@ -681,7 +699,6 @@ void ipoib_drain_cq(struct ipoib_dev_pri
 {
 	int i, n;
 
-	spin_lock(&priv->lock);
 	do {
 		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
 		for (i = 0; i < n; ++i) {
@@ -693,17 +710,18 @@ void ipoib_drain_cq(struct ipoib_dev_pri
 			if (priv->ibwc[i].status == IB_WC_SUCCESS)
 				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
 
-			if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
-				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
-					ipoib_cm_handle_rx_wc(priv, priv->ibwc + i);
-				else
-					ipoib_ib_handle_rx_wc(priv, priv->ibwc + i);
-			} else
-				ipoib_cm_handle_tx_wc(priv, priv->ibwc + i);
+			if ((priv->ibwc[i].wr_id & IPOIB_OP_RECV) == 0)
+				panic("ipoib_drain_cq:  Bad wrid 0x%jX\n",
+				    (intmax_t)priv->ibwc[i].wr_id);
+			if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
+				ipoib_cm_handle_rx_wc(priv, priv->ibwc + i);
+			else
+				ipoib_ib_handle_rx_wc(priv, priv->ibwc + i);
 		}
 	} while (n == IPOIB_NUM_WC);
 
-	while (poll_tx(priv))
+	spin_lock(&priv->lock);
+	while (ipoib_poll_tx(priv))
 		; /* nothing */
 
 	spin_unlock(&priv->lock);

Modified: projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
==============================================================================
--- projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c	Fri Feb 11 10:50:33 2011	(r218551)
+++ projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c	Fri Feb 11 11:22:14 2011	(r218552)
@@ -596,7 +596,7 @@ path_rec_start(struct ipoib_dev_priv *pr
 	p_rec = path->pathrec;
 	p_rec.mtu_selector = IB_SA_GT;
 
-	switch (roundup_pow_of_two(dev->if_mtu + IB_GRH_BYTES)) {
+	switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) {
 	case 512:
 		p_rec.mtu = IB_MTU_256;
 		break;
@@ -923,9 +923,11 @@ ipoib_set_dev_features(struct ipoib_dev_
 		priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
 	}
 
+#if 0
 	if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO)
 		priv->dev->if_capabilities |= IFCAP_TSO4 | CSUM_TSO;
 #endif
+#endif
 	priv->dev->if_capabilities |=
 	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
 	priv->dev->if_capenable = priv->dev->if_capabilities;

Modified: projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
==============================================================================
--- projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	Fri Feb 11 10:50:33 2011	(r218551)
+++ projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	Fri Feb 11 11:22:14 2011	(r218552)
@@ -222,7 +222,6 @@ int ipoib_transport_dev_init(struct ipoi
 	priv->tx_wr.send_flags	= IB_SEND_SIGNALED;
 
 	priv->rx_sge[0].lkey = priv->mr->lkey;
-	priv->rx_sge[0].length = priv->max_ib_mtu;
 	priv->rx_wr.num_sge = 1;
 	priv->rx_wr.next = NULL;
 	priv->rx_wr.sg_list = priv->rx_sge;


