Skip site navigation (1) Skip section navigation (2)
Date:      Sat, 1 Jun 2019 03:03:48 +0000 (UTC)
From:      Navdeep Parhar <np@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r348491 - in head/sys/dev/cxgbe: cxgbei tom
Message-ID:  <201906010303.x5133mV3062988@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: np
Date: Sat Jun  1 03:03:48 2019
New Revision: 348491
URL: https://svnweb.freebsd.org/changeset/base/348491

Log:
  cxgbe/t4_tom: adjust the hardware receive window to match changes to the
  receive sockbuf's high water mark.
  
  Calculate rx credits on the spot instead of tracking sbused/sb_cc and
  rx_credits in the toepcb.  The previous method worked when the high
  water mark changed due to SB_AUTOSIZE but not when it was adjusted
  directly (for example, by the soreserve in nfsrvd_addsock).
  
  This fixes a connection hang while running iozone over an NFS mounted
  share where nfsd's TCP sockets are being handled by t4_tom.
  
  MFC after:	3 days
  Sponsored by:	Chelsio Communications

Modified:
  head/sys/dev/cxgbe/cxgbei/cxgbei.c
  head/sys/dev/cxgbe/tom/t4_connect.c
  head/sys/dev/cxgbe/tom/t4_cpl_io.c
  head/sys/dev/cxgbe/tom/t4_ddp.c
  head/sys/dev/cxgbe/tom/t4_listen.c
  head/sys/dev/cxgbe/tom/t4_tls.c
  head/sys/dev/cxgbe/tom/t4_tom.h

Modified: head/sys/dev/cxgbe/cxgbei/cxgbei.c
==============================================================================
--- head/sys/dev/cxgbe/cxgbei/cxgbei.c	Sat Jun  1 01:40:14 2019	(r348490)
+++ head/sys/dev/cxgbe/cxgbei/cxgbei.c	Sat Jun  1 03:03:48 2019	(r348491)
@@ -398,7 +398,6 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_he
 	tp->t_rcvtime = ticks;
 
 	/* update rx credits */
-	toep->rx_credits += pdu_len;
 	t4_rcvd(&toep->td->tod, tp);	/* XXX: sc->tom_softc.tod */
 
 	so = inp->inp_socket;

Modified: head/sys/dev/cxgbe/tom/t4_connect.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_connect.c	Sat Jun  1 01:40:14 2019	(r348490)
+++ head/sys/dev/cxgbe/tom/t4_connect.c	Sat Jun  1 03:03:48 2019	(r348491)
@@ -385,8 +385,7 @@ t4_connect(struct toedev *tod, struct socket *so, stru
 	toep->vnet = so->so_vnet;
 	set_ulp_mode(toep, select_ulp_mode(so, sc, &settings));
 	SOCKBUF_LOCK(&so->so_rcv);
-	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
-	toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
+	toep->opt0_rcv_bufsize = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/*
@@ -440,7 +439,7 @@ t4_connect(struct toedev *tod, struct socket *so, stru
 		cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
 		cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
 		cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
-		    toep->rx_credits, toep->ulp_mode, &settings);
+		    toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
 		cpl->opt2 = calc_opt2a(so, toep, &settings);
 	} else {
 		struct cpl_act_open_req *cpl = wrtod(wr);
@@ -469,7 +468,7 @@ t4_connect(struct toedev *tod, struct socket *so, stru
 		inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
 		    &cpl->peer_ip, &cpl->peer_port);
 		cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
-		    toep->rx_credits, toep->ulp_mode, &settings);
+		    toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
 		cpl->opt2 = calc_opt2a(so, toep, &settings);
 	}
 

Modified: head/sys/dev/cxgbe/tom/t4_cpl_io.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_cpl_io.c	Sat Jun  1 01:40:14 2019	(r348490)
+++ head/sys/dev/cxgbe/tom/t4_cpl_io.c	Sat Jun  1 03:03:48 2019	(r348491)
@@ -399,20 +399,10 @@ make_established(struct toepcb *toep, uint32_t iss, ui
 
 	tp->irs = irs;
 	tcp_rcvseqinit(tp);
-	tp->rcv_wnd = toep->rx_credits << 10;
+	tp->rcv_wnd = toep->opt0_rcv_bufsize << 10;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
-	/*
-	 * If we were unable to send all rx credits via opt0, save the remainder
-	 * in rx_credits so that they can be handed over with the next credit
-	 * update.
-	 */
-	SOCKBUF_LOCK(&so->so_rcv);
-	bufsize = select_rcv_wnd(so);
-	SOCKBUF_UNLOCK(&so->so_rcv);
-	toep->rx_credits = bufsize - tp->rcv_wnd;
-
 	tp->iss = iss;
 	tcp_sendseqinit(tp);
 	tp->snd_una = iss + 1;
@@ -483,37 +473,29 @@ t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_rcv;
 	struct toepcb *toep = tp->t_toe;
-	int credits;
+	int rx_credits;
 
 	INP_WLOCK_ASSERT(inp);
-
 	SOCKBUF_LOCK_ASSERT(sb);
-	KASSERT(toep->sb_cc >= sbused(sb),
-	    ("%s: sb %p has more data (%d) than last time (%d).",
-	    __func__, sb, sbused(sb), toep->sb_cc));
 
-	credits = toep->sb_cc - sbused(sb);
-	toep->sb_cc = sbused(sb);
+	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 	if (toep->ulp_mode == ULP_MODE_TLS) {
-		if (toep->tls.rcv_over >= credits) {
-			toep->tls.rcv_over -= credits;
-			credits = 0;
+		if (toep->tls.rcv_over >= rx_credits) {
+			toep->tls.rcv_over -= rx_credits;
+			rx_credits = 0;
 		} else {
-			credits -= toep->tls.rcv_over;
+			rx_credits -= toep->tls.rcv_over;
 			toep->tls.rcv_over = 0;
 		}
 	}
-	toep->rx_credits += credits;
 
-	if (toep->rx_credits > 0 &&
-	    (tp->rcv_wnd <= 32 * 1024 || toep->rx_credits >= 64 * 1024 ||
-	    (toep->rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
-	    toep->sb_cc + tp->rcv_wnd < sb->sb_lowat)) {
-
-		credits = send_rx_credits(sc, toep, toep->rx_credits);
-		toep->rx_credits -= credits;
-		tp->rcv_wnd += credits;
-		tp->rcv_adv += credits;
+	if (rx_credits > 0 &&
+	    (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
+	    (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
+	    sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
+		rx_credits = send_rx_credits(sc, toep, rx_credits);
+		tp->rcv_wnd += rx_credits;
+		tp->rcv_adv += rx_credits;
 	} else if (toep->flags & TPF_FORCE_CREDITS)
 		send_rx_modulate(sc, toep);
 }
@@ -1551,7 +1533,7 @@ do_rx_data(struct sge_iq *iq, const struct rss_header 
 	struct socket *so;
 	struct sockbuf *sb;
 	struct epoch_tracker et;
-	int len;
+	int len, rx_credits;
 	uint32_t ddp_placed = 0;
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
@@ -1636,8 +1618,6 @@ do_rx_data(struct sge_iq *iq, const struct rss_header 
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
-		else
-			toep->rx_credits += newsize - hiwat;
 	}
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
@@ -1675,19 +1655,12 @@ do_rx_data(struct sge_iq *iq, const struct rss_header 
 		}
 	}
 
-	KASSERT(toep->sb_cc >= sbused(sb),
-	    ("%s: sb %p has more data (%d) than last time (%d).",
-	    __func__, sb, sbused(sb), toep->sb_cc));
-	toep->rx_credits += toep->sb_cc - sbused(sb);
 	sbappendstream_locked(sb, m, 0);
-	toep->sb_cc = sbused(sb);
-	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
-		int credits;
-
-		credits = send_rx_credits(sc, toep, toep->rx_credits);
-		toep->rx_credits -= credits;
-		tp->rcv_wnd += credits;
-		tp->rcv_adv += credits;
+	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
+	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
+		rx_credits = send_rx_credits(sc, toep, rx_credits);
+		tp->rcv_wnd += rx_credits;
+		tp->rcv_adv += rx_credits;
 	}
 
 	if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&

Modified: head/sys/dev/cxgbe/tom/t4_ddp.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_ddp.c	Sat Jun  1 01:40:14 2019	(r348490)
+++ head/sys/dev/cxgbe/tom/t4_ddp.c	Sat Jun  1 03:03:48 2019	(r348491)
@@ -304,9 +304,6 @@ insert_ddp_data(struct toepcb *toep, uint32_t n)
 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= n;
 #endif
-#ifndef USE_DDP_RX_FLOW_CONTROL
-	toep->rx_credits += n;
-#endif
 	CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
 	    __func__, n);
 	while (toep->ddp.active_count > 0) {
@@ -556,16 +553,10 @@ handle_ddp_data(struct toepcb *toep, __be32 ddp_report
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
-		else
-			toep->rx_credits += newsize - hiwat;
 	}
 	SOCKBUF_UNLOCK(sb);
 	CURVNET_RESTORE();
 
-#ifndef USE_DDP_RX_FLOW_CONTROL
-	toep->rx_credits += len;
-#endif
-
 	job->msgrcv = 1;
 	if (db->cancel_pending) {
 		/*
@@ -714,12 +705,9 @@ handle_ddp_close(struct toepcb *toep, struct tcpcb *tp
 
 	INP_WLOCK_ASSERT(toep->inp);
 	DDP_ASSERT_LOCKED(toep);
-	len = be32toh(rcv_nxt) - tp->rcv_nxt;
 
+	len = be32toh(rcv_nxt) - tp->rcv_nxt;
 	tp->rcv_nxt += len;
-#ifndef USE_DDP_RX_FLOW_CONTROL
-	toep->rx_credits += len;
-#endif
 
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);

Modified: head/sys/dev/cxgbe/tom/t4_listen.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_listen.c	Sat Jun  1 01:40:14 2019	(r348490)
+++ head/sys/dev/cxgbe/tom/t4_listen.c	Sat Jun  1 03:03:48 2019	(r348491)
@@ -1400,7 +1400,6 @@ found:
 
 		mtu_idx = find_best_mtu_idx(sc, &inc, &settings);
 		rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ?  select_rcv_wscale() : 0;
-		/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
 		wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
 		wnd = min(wnd, MAX_RCV_WND);
 		rx_credits = min(wnd >> 10, M_RCV_BUFSIZ);
@@ -1552,8 +1551,7 @@ reset:
 	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
 	toep->vnet = lctx->vnet;
 	set_ulp_mode(toep, synqe->ulp_mode);
-	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
-	toep->rx_credits = synqe->rcv_bufsize;
+	toep->opt0_rcv_bufsize = synqe->rcv_bufsize;
 
 	MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
 	MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);

Modified: head/sys/dev/cxgbe/tom/t4_tls.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_tls.c	Sat Jun  1 01:40:14 2019	(r348490)
+++ head/sys/dev/cxgbe/tom/t4_tls.c	Sat Jun  1 03:03:48 2019	(r348491)
@@ -1458,7 +1458,7 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_head
 	struct socket *so;
 	struct sockbuf *sb;
 	struct mbuf *tls_data;
-	int len, pdu_length, pdu_overhead, sb_length;
+	int len, pdu_length, rx_credits;
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
@@ -1562,24 +1562,10 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_head
 	}
 
 	/*
-	 * Not all of the bytes on the wire are included in the socket
-	 * buffer (e.g. the MAC of the TLS record).  However, those
-	 * bytes are included in the TCP sequence space.  To handle
-	 * this, compute the delta for this TLS record in
-	 * 'pdu_overhead' and treat those bytes as having already been
-	 * "read" by the application for the purposes of expanding the
-	 * window.  The meat of the TLS record passed to the
-	 * application ('sb_length') will still not be counted as
-	 * "read" until userland actually reads the bytes.
-	 *
-	 * XXX: Some of the calculations below are probably still not
-	 * really correct.
+	 * Not all of the bytes on the wire are included in the socket buffer
+	 * (e.g. the MAC of the TLS record).  However, those bytes are included
+	 * in the TCP sequence space.
 	 */
-	sb_length = m->m_pkthdr.len;
-	pdu_overhead = pdu_length - sb_length;
-	toep->rx_credits += pdu_overhead;
-	tp->rcv_wnd += pdu_overhead;
-	tp->rcv_adv += pdu_overhead;
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
@@ -1587,34 +1573,25 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_head
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
-	    sb_length > (sbspace(sb) / 8 * 7)) {
+	    m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(sb, newsize, so, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
-		else
-			toep->rx_credits += newsize - hiwat;
 	}
 
-	KASSERT(toep->sb_cc >= sbused(sb),
-	    ("%s: sb %p has more data (%d) than last time (%d).",
-	    __func__, sb, sbused(sb), toep->sb_cc));
-	toep->rx_credits += toep->sb_cc - sbused(sb);
 	sbappendstream_locked(sb, m, 0);
-	toep->sb_cc = sbused(sb);
+	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %u PDU overhead %d rx_credits %u rcv_wnd %u",
-	    __func__, tid, pdu_overhead, toep->rx_credits, tp->rcv_wnd);
+	    __func__, tid, pdu_overhead, rx_credits, tp->rcv_wnd);
 #endif
-	if (toep->rx_credits > 0 && toep->sb_cc + tp->rcv_wnd < sb->sb_lowat) {
-		int credits;
-
-		credits = send_rx_credits(sc, toep, toep->rx_credits);
-		toep->rx_credits -= credits;
-		tp->rcv_wnd += credits;
-		tp->rcv_adv += credits;
+	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
+		rx_credits = send_rx_credits(sc, toep, rx_credits);
+		tp->rcv_wnd += rx_credits;
+		tp->rcv_adv += rx_credits;
 	}
 
 	sorwakeup_locked(so);

Modified: head/sys/dev/cxgbe/tom/t4_tom.h
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_tom.h	Sat Jun  1 01:40:14 2019	(r348490)
+++ head/sys/dev/cxgbe/tom/t4_tom.h	Sat Jun  1 03:03:48 2019	(r348491)
@@ -181,9 +181,7 @@ struct toepcb {
 	u_int tx_nocompl;	/* tx WR credits since last compl request */
 	u_int plen_nocompl;	/* payload since last compl request */
 
-	/* rx credit handling */
-	u_int sb_cc;		/* last noted value of so_rcv->sb_cc */
-	int rx_credits;		/* rx credits (in bytes) to be returned to hw */
+	int opt0_rcv_bufsize;	/* XXX: save full opt0/opt2 for later? */
 
 	u_int ulp_mode;	/* ULP mode */
 	void *ulpcb;



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201906010303.x5133mV3062988>