Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 26 Jan 2008 08:00:16 GMT
From:      Kip Macy <kmacy@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 134128 for review
Message-ID:  <200801260800.m0Q80G5u048397@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=134128

Change 134128 by kmacy@kmacy:storage:toehead on 2008/01/26 07:59:26

	first cut at implementing zero copy soreceive

Affected files ...

.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#7 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_ddp.c#2 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#7 edit

Differences ...

==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#7 (text+ko) ====

@@ -101,6 +101,7 @@
 #ifndef PG_FRAME
 #define PG_FRAME	~PAGE_MASK
 #endif
+#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
 
 void
 t3_init_socket_ops(void)
@@ -251,7 +252,6 @@
  *   can be posted without closing the window in the middle of DDP (checked
  *   when the connection is offloaded)
  */
-#ifdef notyet
 static int
 so_should_ddp(const struct toepcb *toep, int last_recv_len)
 {
@@ -260,7 +260,67 @@
 	       toep->tp_tp->rcv_wnd > 
 	           (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN);
 }
-#endif
+
+static inline int
+is_ddp(const struct mbuf *m)
+{
+	return (m->m_flags & M_DDP);
+}
+
+static inline int
+is_ddp_psh(const struct mbuf *m)
+{
+        return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH);
+}
+
+static int
+m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
+{
+	int curlen, err = 0;
+	caddr_t buf;
+	
+	while (m && len) {
+		buf = mtod(m, caddr_t);
+		curlen = m->m_len;
+		if (offset < curlen) {
+			curlen -= offset;
+			buf += offset;
+			offset = 0;
+		} else {
+			offset -= curlen;
+			m = m->m_next;
+			continue;
+		}
+		
+		err = uiomove_frombuf(buf, min(len, curlen), uio);
+		if (err)
+			return (err);
+		len -= min(len, m->m_len);
+		m = m->m_next;
+	}
+	return (err);
+}
+
+/*
+ * Copy data from an mbuf chain to a uio.  Deals with RX_DATA, which carries
+ * the data in the mbuf body, and with RX_DATA_DDP, which places the data in
+ * a DDP buffer.
+ */
+static inline int
+copy_data(const struct mbuf *m, int offset, int len, struct uio *uio)
+{
+	struct iovec *to = uio->uio_iov;
+	
+	if (__predict_true(!is_ddp(m)))                             /* RX_DATA */
+		return m_uiomove(m, offset, len, uio);
+	if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */
+		to->iov_len -= len;
+		to->iov_base = ((caddr_t)to->iov_base) + len;
+		uio->uio_iov = to;
+		return (0);
+	}
+	return t3_ddp_copy(m, offset, uio, len);             /* kernel DDP */
+}
 
 static void
 cxgb_wait_dma_completion(struct toepcb *toep)
@@ -449,34 +509,258 @@
 
 
 static int
-t3_soreceive(struct socket *so, struct uio *uio)
+t3_soreceive(struct socket *so, int *flagsp, struct uio *uio)
 {
-#ifdef notyet
-	int i, rv, count, hold_resid, sent, iovcnt;
-	struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
 	struct tcpcb *tp = sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct mbuf *m;
-	struct uio uiotmp;
+	uint32_t offset;
+	int err, flags, avail, len, buffers_freed = 0, copied = 0;
+	int target;		/* Read at least this many bytes */
+	long timeo;
+	int user_ddp_ok, user_ddp_pending = 0;
+	struct ddp_state *p;
+	struct inpcb *inp = sotoinpcb(so);
+	
+	flags = flagsp ? (*flagsp &~ MSG_EOR) : 0;
+
+	err = sblock(&so->so_rcv, SBLOCKWAIT(flags));
+	if (err)
+		return (err);
+restart:
+	SOCKBUF_LOCK(&so->so_rcv);
+	len = uio->uio_resid;
+	m = so->so_rcv.sb_mb;
+	target = (flags & MSG_WAITALL) ? min(len, so->so_rcv.sb_hiwat) : so->so_rcv.sb_lowat;
+	timeo = so->so_rcv.sb_timeo;
+	p = &toep->tp_ddp_state;
+	user_ddp_ok = p->ubuf_ddp_ready;
+	p->cancel_ubuf = 0;
+
+	/*
+	 * XXX check timeo/signal/urgent
+	 */
+	if (m) 
+		goto got_mbuf;
+
+	/* empty receive queue */
+	if (copied >= target && /* !sk->sk_backlog.tail && */
+	    !user_ddp_pending)
+		goto done;
 
+	if (copied) {
+		if (so->so_error || tp->t_state == TCPS_CLOSED || 
+		    (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)))
+			goto done;
+	} else {
+		if (so->so_state & SS_NOFDREF)
+			goto done;
+		if (so->so_error) {
+			err = so->so_error;
+			so->so_error = 0;
+			goto done;
+		}
+		if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
+			goto done;
+		if (tp->t_state == TCPS_CLOSED) {
+			err = ENOTCONN; 
+			goto done;
+		}
+	}
+	if (so->so_rcv.sb_mb && !user_ddp_pending) {
+		SOCKBUF_UNLOCK(&so->so_rcv);
+		INP_LOCK(inp);
+		t3_cleanup_rbuf(tp);
+		INP_UNLOCK(inp);
+		goto restart;
+	}
+	if (p->ubuf && user_ddp_ok && !user_ddp_pending && 
+	    uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
+	    p->ubuf_ddp_ready) {
+		user_ddp_pending = 
+		    !t3_overlay_ubuf(so, uio, (so->so_state & SS_NBIO), flags, 1, 1);
+		if (user_ddp_pending) {
+			p->kbuf_posted++;
+			user_ddp_ok = 0;
+		}
+	}
+	if (user_ddp_pending) {
+		/* One shot at DDP if we already have enough data */
+		if (copied >= target)
+			user_ddp_ok = 0;
+		if ((err = sbwait(&so->so_rcv)) != 0)
+			goto done;
+//for timers to work			await_ddp_completion(sk, flags, &timeo);
+	} else if (copied >= target)
+		goto done;
+	else {
+		SOCKBUF_UNLOCK(&so->so_rcv);
+		INP_LOCK(inp);
+		t3_cleanup_rbuf(tp);
+		INP_UNLOCK(inp);
+		if ((err = sbwait(&so->so_rcv)) != 0)
+			goto done;
+	}
+	goto restart;
+got_mbuf:
+	if (m->m_pkthdr.len == 0) {
+		if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0)
+			panic("empty mbuf and NOCOPY not set\n");
+		user_ddp_pending = 0;
+		sbfree(&so->so_rcv, m);
+		m = so->so_rcv.sb_mb = m_free(m);
+		goto done;
+	}
+	offset = toep->tp_copied_seq - m->m_seq;
+	if (offset > m->m_pkthdr.len)
+		panic("t3_soreceive: BUG: OFFSET > LEN seq 0x%x "
+		    "skb->len %d flags 0x%x", m->m_seq,
+		    m->m_pkthdr.len, m->m_ddp_flags);
+	avail = m->m_pkthdr.len - offset;
+	if (len < avail) {
+		if (is_ddp(m) &&  (m->m_ddp_flags & DDP_BF_NOCOPY)) 
+			panic("bad state in t3_soreceive\n");
+		avail = len;
+	}	
+#ifdef notyet		
 	/*
-	 * Events requiring iteration:
-	 *  - number of pages exceeds max hold pages for process or system
-	 *  - number of pages exceeds maximum sg entries for a single WR
-	 *
-	 * We're limited to holding 128 pages at once - and we're limited to
-	 * 34 SG entries per work request, but each SG entry can be any number 
-	 * of contiguous pages
-	 *
+	 * Check if the data we are preparing to copy contains urgent
+	 * data.  Either stop short of urgent data or skip it if it's
+	 * first and we are not delivering urgent data inline.
+	 */
+	if (unlikely(tp->urg_data)) {
+		u32 urg_offset = tp->urg_seq - tp->copied_seq;
+		
+		if (urg_offset < avail) {
+			if (urg_offset) {
+				/* stop short of the urgent data */
+				avail = urg_offset;
+			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
+				/* First byte is urgent, skip */
+				tp->copied_seq++;
+				offset++;
+				avail--;
+				if (!avail)
+					goto skip_copy;
+				}	
+		}	
+	}	
+#endif
+	if (is_ddp_psh(m) || offset) {
+		user_ddp_ok = 0;
+#ifdef T3_TRACE	
+		T3_TRACE0(TIDTB(so), "t3_sosend: PSH");
+#endif	
+	}
+	
+	if (user_ddp_ok && !user_ddp_pending &&
+	    /*
+	     * XXX
+	     */
+#ifdef notyet	    
+	    uio->uio_iovlen > p->kbuf[0]->length &&
+#endif	    
+	    p->ubuf_ddp_ready) {
+		user_ddp_pending = 
+		    !t3_overlay_ubuf(so, uio, (so->so_state & SS_NBIO), flags, 1, 1);
+		if (user_ddp_pending) {
+			p->kbuf_posted++;
+			user_ddp_ok = 0;
+		}
+	}
+		
+	/*
+	 * If MSG_TRUNC is specified the data is discarded.
+	 * XXX need to check pr_atomic
+	 */
+	if (__predict_true(!(flags & MSG_TRUNC)))
+		if ((err = copy_data(m, offset, avail, uio))) {
+			if (err)
+				err = EFAULT;
+			goto done;
+		}
+	
+	toep->tp_copied_seq += avail;
+	copied += avail;
+	len -= avail;
+#ifdef notyet
+skip_copy:
+	if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
+		tp->urg_data = 0;
+#endif
+	/*
+	 * If the buffer is fully consumed free it.  If it's a DDP
+	 * buffer also handle any events it indicates.
+	 */
+	if (avail + offset >= m->m_pkthdr.len) {
+		unsigned int fl = m->m_ddp_flags;
+		int got_psh = 0;
+		
+		if (p->ubuf != NULL && is_ddp(m) && (fl & 1)) {
+			if (is_ddp_psh(m) && user_ddp_pending)
+				got_psh = 1;
+			
+			if (fl & DDP_BF_NOCOPY)
+				user_ddp_pending = 0;
+			else {
+				p->kbuf_posted--;
+				p->ubuf_ddp_ready = 1;
+			}
+		}
+		sbfree(&so->so_rcv, m);
+		m = so->so_rcv.sb_mb = m_free(m);		
+		buffers_freed++;
+		
+		if  ((so->so_rcv.sb_mb == NULL) && got_psh)
+			goto done;
+	}
+	if (len > 0)
+		goto restart;
+	
+done:
+	/*
+	 * If we can still receive decide what to do in preparation for the
+	 * next receive.  Note that RCV_SHUTDOWN is set if the connection
+	 * transitioned to CLOSE but not if it was in that state to begin with.
 	 */
+	if (__predict_true((so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) {
+		if (user_ddp_pending) {
+			user_ddp_ok = 0;
+			t3_cancel_ubuf(toep);
+			if (so->so_rcv.sb_mb) {
+				if (copied < 0)
+					copied = 0;
+				goto restart;
+			}
+			user_ddp_pending = 0;
+		}
+		if (p->kbuf_posted == 0) {
+#ifdef T3_TRACE
+			T3_TRACE0(TIDTB(so),
+			  "chelsio_recvmsg: about to exit, repost kbuf");
+#endif
+
+			t3_post_kbuf(so, 1);
+			p->kbuf_posted++;
+		} else if (so_should_ddp(toep, copied)) {
+			t3_enter_ddp(so, TOM_TUNABLE(TOE_DEV(so),
+						     ddp_copy_limit), 0);
+			p->kbuf_posted = 1;
+		}
+	}
+	if (buffers_freed)
+		t3_cleanup_rbuf(tp);
+#ifdef T3_TRACE
+	T3_TRACE5(TIDTB(so),
+		  "chelsio_recvmsg <-: copied %d len %d buffers_freed %d "
+		  "kbuf_posted %d user_ddp_pending %u",
+		  copied, len, buffers_freed, p ? p->kbuf_posted : -1, 
+		  user_ddp_pending);
+#endif
+	SOCKBUF_UNLOCK(&so->so_rcv);
+	sbunlock(&so->so_rcv);
 
-	uiotmp = *uio;
-	iovcnt = uio->uio_iovcnt;
-	iov = uio->uio_iov;
-	sent = 0;
-	re;
-#endif  
-	return (0);
+	return (err);
 }
 
 static int
@@ -484,9 +768,11 @@
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	struct toedev *tdev;
-	int rv, zcopy_thres, zcopy_enabled;
+	int rv, zcopy_thres, zcopy_enabled, flags;
 	struct tcpcb *tp = sototcpcb(so);
 
+	flags = flagsp ? *flagsp &~ MSG_EOR : 0;
+	
 	/*
 	 * In order to use DMA direct from userspace the following
 	 * conditions must be met:
@@ -500,14 +786,16 @@
 	 *  - iovcnt is 1
 	 *
 	 */
-	if (tp->t_flags & TF_TOE) {
+	if ((tp->t_flags & TF_TOE) && ((flags & (MSG_WAITALL|MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0)
+	    && ((so->so_state & SS_NBIO) == 0) && (uio->uio_iovcnt == 1) &&
+	    ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) && (mp0 == NULL)) {
 		tdev =  TOE_DEV(so);
 		zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
 		zcopy_enabled = TOM_TUNABLE(tdev, ddp);
 		if ((uio->uio_resid > zcopy_thres) &&
 		    (uio->uio_iovcnt == 1) &&  ((so->so_state & SS_NBIO) == 0)
 		    && zcopy_enabled) {
-			rv = t3_soreceive(so, uio);
+			rv = t3_soreceive(so, flagsp, uio);
 			if (rv != EAGAIN)
 				return (rv);
 		}

==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_ddp.c#2 (text+ko) ====

@@ -326,9 +326,9 @@
 }
 
 /**
- * setup_iovec_ppods - setup HW page pods for a user iovec
+ * setup_uio_ppods - setup HW page pods for a user iovec
  * @sk: the associated socket
- * @iov: the iovec
+ * @uio: the uio
  * @oft: additional bytes to map before the start of the buffer
  *
  * Pins a user iovec and sets up HW page pods for DDP into it.  We allocate
@@ -339,13 +339,14 @@
  * The current implementation handles iovecs with only one entry.
  */
 static int
-setup_iovec_ppods(struct socket *so, const struct iovec *iov, int oft, int *length)
+setup_uio_ppods(struct socket *so, const struct uio *uio, int oft, int *length)
 {
 	int err;
 	unsigned int len;
 	struct ddp_gather_list *gl = NULL;
 	struct toepcb *toep = sototcpcb(so)->t_toe;
 	struct ddp_state *p = &toep->tp_ddp_state;
+	struct iovec *iov = uio->uio_iov;
 	unsigned long addr = (unsigned long)iov->iov_base - oft;
 
 	if (__predict_false(!p->ubuf_nppods)) {
@@ -424,7 +425,7 @@
  * Post a user buffer as an overlay on top of the current kernel buffer.
  */
 int
-t3_overlay_ubuf(struct socket *so, const struct iovec *iov,
+t3_overlay_ubuf(struct socket *so, const struct uio *uio,
 		    int nonblock, int rcv_flags, int modulate, int post_kbuf)
 {
 	int err, len, ubuf_idx;
@@ -435,7 +436,7 @@
 	if (p->ubuf == NULL)
 		return (EINVAL);
 
-	err = setup_iovec_ppods(so, iov, 0, &len);
+	err = setup_uio_ppods(so, uio, 0, &len);
 	if (err)
 		return (err);
 
@@ -481,67 +482,6 @@
 	return (0);
 }
 
-static inline int
-is_ddp(const struct mbuf *m)
-{
-	return (m->m_flags & M_DDP);
-}
-
-static inline int
-is_ddp_psh(const struct mbuf *m)
-{
-        return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH);
-}
-
-static int
-m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
-{
-	int curlen, err = 0;
-	caddr_t buf;
-	
-	while (m && len) {
-		buf = mtod(m, caddr_t);
-		curlen = m->m_len;
-		if (offset < curlen) {
-			curlen -= offset;
-			buf += offset;
-			offset = 0;
-		} else {
-			offset -= curlen;
-			m = m->m_next;
-			continue;
-		}
-		
-		err = uiomove_frombuf(buf, min(len, curlen), uio);
-		if (err)
-			return (err);
-		len -= min(len, m->m_len);
-		m = m->m_next;
-	}
-	return (err);
-}
-
-/*
- * Copy data from an sk_buff to an iovec.  Deals with RX_DATA, which carry the
- * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a
- * DDP buffer.
- */
-static inline int
-copy_data(const struct mbuf *m, int offset, struct uio *uio, int len)
-{
-	struct iovec *to = uio->uio_iov;
-	
-	if (__predict_true(!is_ddp(m)))                             /* RX_DATA */
-		return m_uiomove(m, offset, len, uio);
-	if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */
-		to->iov_len -= len;
-		to->iov_base = ((caddr_t)to->iov_base) + len;
-		uio->uio_iov = to;
-		return (0);
-	}
-	return t3_ddp_copy(m, offset, uio, len);             /* kernel DDP */
-}
-
 /*
  * Clean up DDP state that needs to survive until socket close time, such as the
  * DDP buffers.  The buffers are already unmapped at this point as unmapping

==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#7 (text+ko) ====

@@ -153,14 +153,13 @@
 int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag);
 void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n);
 void t3_free_ddp_gl(struct ddp_gather_list *gl);
-int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio,
-		int len);
+int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len);
 //void t3_repost_kbuf(struct socket *so, int modulate, int activate);
 void t3_post_kbuf(struct socket *so, int modulate);
-int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+int t3_post_ubuf(struct socket *so, const struct uio *uio, int nonblock,
 		 int rcv_flags, int modulate, int post_kbuf);
 void t3_cancel_ubuf(struct toepcb *toep);
-int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+int t3_overlay_ubuf(struct socket *so, const struct uio *uio, int nonblock,
 		    int rcv_flags, int modulate, int post_kbuf);
 int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall);
 void t3_cleanup_ddp(struct toepcb *toep);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200801260800.m0Q80G5u048397>