Skip site navigation (1) | Skip section navigation (2)
Date:      Wed, 26 Aug 1998 21:55:19 -0400 (EDT)
From:      Garrett Wollman <wollman@khavrinen.lcs.mit.edu>
To:        net@FreeBSD.ORG
Subject:   Next big network patch: specialized sosend for TCP
Message-ID:  <199808270155.VAA07365@khavrinen.lcs.mit.edu>

next in thread | raw e-mail | index | archive | help
Here's the next patch that's going into the TCP stack.  I am running
this right now, so I'm certain it's not completely bogus, but have not
stress-tested it as yet.  It does appear to be somewhat faster (having
eliminated about a dozen branches), but -current is a very hostile
environment for microbenchmarks of the sort I would usually use.

Once again, any comments would be appreciated.

-GAWollman


Index: netinet/tcp_usrreq.c
===================================================================
RCS file: /home/cvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.38
diff -u -r1.38 tcp_usrreq.c
--- tcp_usrreq.c	1998/08/23 03:07:15	1.38
+++ tcp_usrreq.c	1998/08/27 01:44:26
@@ -73,6 +73,9 @@
 				 struct proc *));
 static struct tcpcb *
 		tcp_disconnect __P((struct tcpcb *));
+static int	tcp_sosend __P((struct socket *, struct sockaddr *,
+				struct uio *, struct mbuf *, struct mbuf *,
+				int, struct proc *));
 static struct tcpcb *
 		tcp_usrclosed __P((struct tcpcb *));
 
@@ -325,6 +328,10 @@
 /*
  * Do a send by putting data in output queue and updating urgent
  * marker if URG set.  Possibly send more data.
+ *
+ * XXX - this routine is really only here for the benefit of NFS.
+ * Somebody who knows the NFS code should figure out why NFS
+ * is going through here and where it should go.
  */
 static int
 tcp_usr_send(struct socket *so, int flags, struct mbuf *m, 
@@ -336,14 +343,12 @@
 	struct tcpcb *tp;
 
 	COMMON_START();
-	if (control && control->m_len) {
-		m_freem(control); /* XXX shouldn't caller do this??? */
-		if (m)
-			m_freem(m);
-		error = EINVAL;
-		goto out;
-	}
-
+	/*
+	 * We used to check for control information here, but
+	 * tcp_sosend() doesn't call here, and any direct callers (i.e., NFS)
+	 * should know enough to refrain from sending any since TCP
+	 * has never supported control information.
+	 */
 	if(!(flags & PRUS_OOB)) {
 		sbappend(&so->so_snd, m);
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
@@ -459,7 +464,7 @@
 	tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach,
 	tcp_usr_disconnect, tcp_usr_listen, in_setpeeraddr, tcp_usr_rcvd,
 	tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown,
-	in_setsockaddr, sosend, soreceive, sopoll
+	in_setsockaddr, tcp_sosend, soreceive, sopoll
 };
 
 /*
@@ -786,5 +791,230 @@
 			tp->t_timer[TCPT_2MSL] = tcp_maxidle;
 	}
 	return (tp);
+}
+
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+#include <sys/uio.h>
+
+#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
+/*
+ * Specialized from kern/uipc_socket.c:sosend() for TCP, and used in
+ * place of sosend() in the user-request table.  Data comes from
+ * `uio', or prepackaged in the mbuf chain `top' when uio is null;
+ * it is appended to so->so_snd and pushed with tcp_output(), after
+ * an implied connect when `addr' is given and none has been started.
+ * This isn't even close to fully optimized: the (uio != 0) and
+ * (top != 0) paths should probably be separated by a single major
+ * branch, since interleaving them is probably bad for branch prediction.
+ *
+ * Returns nonzero on error, timeout or signal; callers must check for
+ * short counts on EINTR/ERESTART.  Data and control are always consumed.
+ */
+static int
+tcp_sosend(so, addr, uio, top, control, flags, p)
+	register struct socket *so;
+	struct sockaddr *addr;
+	struct uio *uio;
+	struct mbuf *top;
+	struct mbuf *control;
+	int flags;
+	struct proc *p;
+{
+	struct mbuf **mp;
+	register struct mbuf *m;
+	register long space, len, resid;
+	struct inpcb *inp;
+	struct tcpcb *tp;
+	int error, s, mlen;
+	TCPDEBUG0;
+
+	/*
+	 * I believe that a socket can never become ``bare'' during
+	 * the execution of this routine (nor at any time other than
+	 * brief moments during initialization and rundown), but I
+	 * haven't proven either claim.  If it does happen, leave via
+	 * `out' rather than `release': so_snd has not been sblock()ed
+	 * yet, so it must not be sbunlock()ed.
+	 */
+	if ((inp = sotoinpcb(so)) == 0) {
+		error = EINVAL;
+		goto out;
+	}
+	tp = intotcpcb(inp);
+
+	if (uio)
+		resid = uio->uio_resid;
+	else
+		resid = top->m_pkthdr.len;
+	/*
+	 * In theory resid should be unsigned.
+	 * However, space must be signed, as it might be less than 0
+	 * if we over-committed, and we must use a signed comparison
+	 * of space and resid.  On the other hand, a negative resid
+	 * causes us to loop sending 0-length segments to the protocol.
+	 * Also bail early if we get control information -- TCP doesn't
+	 * support that.
+	 */
+	if (resid < 0 || (control && control->m_len)) {
+		error = EINVAL;
+		goto out;
+	}
+
+	if (p)
+		p->p_stats->p_ru.ru_msgsnd++;
+
+restart:
+	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
+	if (error)
+		goto out;
+	do {
+		s = splnet();
+#define	snderr(errno)	do { error = errno; splx(s); goto release; } while (0)
+		if (so->so_state & SS_CANTSENDMORE)
+			snderr(EPIPE);
+		if (so->so_error) {
+			error = so->so_error;
+			so->so_error = 0;
+			splx(s);
+			goto release;
+		}
+		if ((so->so_state & SS_ISCONNECTED) == 0
+		    && addr == 0)
+			    snderr(ENOTCONN);
+
+		space = sbspace(&so->so_snd);
+		if (flags & MSG_OOB)
+			space += 1024; /* XXX totally arbitrary */
+		if (uio == 0 && resid > so->so_snd.sb_hiwat)
+			snderr(EMSGSIZE);
+		if (space < resid && uio && space < so->so_snd.sb_lowat) {
+			if (so->so_state & SS_NBIO)
+				snderr(EWOULDBLOCK);
+			sbunlock(&so->so_snd);
+			error = sbwait(&so->so_snd);
+			splx(s);
+			if (error)
+				goto out;	/* lock already dropped above */
+			goto restart;
+		}
+		splx(s);
+		mp = &top;
+		do {
+		    if (uio == NULL) {
+			/*
+			 * Data is prepackaged in "top".
+			 */
+			resid = 0;
+		    } else do {
+			if (top == 0) {
+				MGETHDR(m, M_WAIT, MT_DATA);
+				mlen = MHLEN;
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = (struct ifnet *)0;
+			} else {
+				MGET(m, M_WAIT, MT_DATA);
+				mlen = MLEN;
+			}
+			if (resid >= MINCLSIZE) {
+				MCLGET(m, M_WAIT);
+				if ((m->m_flags & M_EXT) == 0)
+					goto nopages;
+				mlen = MCLBYTES;
+				len = min(min(mlen, resid), space);
+			} else {
+nopages:
+				len = min(min(mlen, resid), space);
+			}
+			space -= len;
+			error = uiomove(mtod(m, caddr_t), (int)len, uio);
+			resid = uio->uio_resid;
+			m->m_len = len;
+			*mp = m;
+			top->m_pkthdr.len += len;
+			if (error)
+				goto release;	/* partial chain in top is freed at out */
+			mp = &m->m_next;
+			if (resid <= 0) {
+				break;
+			}
+		    } while (space > 0);
+		    s = splnet();				/* XXX */
+		    TCPDEBUG1();
+		    /*
+		     * XXX -- should be possible to perform this check
+		     * out of the loop.
+		     */
+		    if ((flags & MSG_OOB) && sbspace(&so->so_snd) < -512) {
+			    /* snderr() restores the spl itself. */
+			    snderr(ENOBUFS);
+		    }
+		    sbappend(&so->so_snd, top);
+		    top = 0;	/* now owned by so_snd; error exits must not free it */
+		    /*
+		     * Do implied connect if not yet connected,
+		     * initialize window to default value, and
+		     * initialize maxseg/maxopd using peer's cached
+		     * MSS.
+		     */
+		    if (addr && tp->t_state < TCPS_SYN_SENT) {
+			    error = tcp_connect(tp, addr, p);
+			    if (error)
+				    snderr(error);	/* drop spl and so_snd lock */
+			    tp->snd_wnd = TTCP_CLIENT_SND_WND;
+			    tcp_mss(tp, -1);
+		    }
+
+		    if (flags & MSG_OOB) {
+			    /*
+			     * According to RFC961 (Assigned Protocols),
+			     * the urgent pointer points to the last octet
+			     * of urgent data.  We continue, however,
+			     * to consider it to indicate the first octet
+			     * of data past the urgent section.
+			     * Otherwise, snd_up should be one lower.
+			     */
+			    tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
+			    tp->t_force = 1;
+			    error = tcp_output(tp);
+			    tp->t_force = 0;
+			    TCPDEBUG2(PRU_SEND_OOB);
+		    } else if ((flags & MSG_EOF) && resid <= 0) {
+			    socantsendmore(so);
+			    tp = tcp_usrclosed(tp);
+#ifdef DIAGNOSTIC
+			    /*
+			     * The only way tcp_usrclosed() can cause the
+			     * tcpcb to go away entirely is if it was either
+			     * CLOSED or LISTENing.  In either state, we
+			     * should never have gotten this far.
+			     */
+			    if (tp == 0)
+				    panic("tcp_sosend: socket already closed");
+#endif
+			    error = tcp_output(tp);
+			    TCPDEBUG2(PRU_SEND_EOF);
+		    } else {
+			    error = tcp_output(tp);
+			    TCPDEBUG2(PRU_SEND);
+		    }
+		    splx(s);
+		    /* top was cleared right after sbappend(); restart the chain. */
+		    mp = &top;
+		    if (error)
+			goto release;
+		} while (resid && space > 0);
+	} while (resid);
+
+release:
+	sbunlock(&so->so_snd);
+out:
+	if (top)
+		m_freem(top);
+	if (control)
+		m_freem(control);
+	return (error);
 }
 

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-net" in the body of the message



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199808270155.VAA07365>