Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 9 Feb 2001 18:13:23 -0600 (CST)
From:      Jonathan Lemon <jlemon@flugsvamp.com>
To:        rizzo@aciri.org, net@freebsd.org
Subject:   Re: [patch] fast sbappend*, please try...
Message-ID:  <200102100013.f1A0DNp76359@prism.flugsvamp.com>
In-Reply-To: <local.mail.freebsd-net/200102080258.f182w7c08367@iguana.aciri.org>

next in thread | previous in thread | raw e-mail | index | archive | help
In article <local.mail.freebsd-net/200102080258.f182w7c08367@iguana.aciri.org> you write:
>
>I would be grateful if someone could test the attached patch which
>deals with the following problem:
>
>	on all *BSD version, socket buffers contain a list of
>	incoming and/or outgoing mbufs. Unfortunately the list only
>	has a pointer to the head, meaning that all append operations
>	require to scan the full list. The overhead can be very
>	bad in some cases (e.g. small UDP packets), and becomes
>	worse and worse as the socket buffer size increases (which
>	is what one would commonly do when expecting a lot of
>	traffic!).
>
>The attached patch implements a tail pointer to the list, so that
>you can append in constant time. By default, the code works exactly
>as before -- the tail of the list is reached with the usual linear
>scan, and the pointer to the tail is only used for comparison
>purposes to make sure that it yields the same value.

Aside from the obvious style bugs, and debugging cruft, I have a couple
of issues with this patch:

    - I dislike having sb_mb_tail only being valid if sb_mb is NULL;
      to me, this is non-obvious, and I feel it would be cleaner to
      always make it valid

    - sbgettail() is misnamed, it should reflect the function that 
      it is being used for in most cases (appending the new mbuf to 
      an existing chain)

    - I don't think that the fastscan sysctl is appropriate; either 
      the code works, or it should be debugged further before committing.
      You could move the debugging checks under a KASSERT though.

    - Also, I believe that there may be a problem with the original
      patch, in that the sb_mb_tail is not updated in sbdrop(), in 
      the case where we drop partial mbufs from the first packet 
      train.

In that spirit, I offer an amended patch below.
--
Jonathan

Index: kern/uipc_socket.c
===================================================================
RCS file: /ncvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.68.2.11
diff -u -r1.68.2.11 uipc_socket.c
--- kern/uipc_socket.c	2000/12/22 10:25:21	1.68.2.11
+++ kern/uipc_socket.c	2001/02/09 23:57:14
@@ -778,6 +778,11 @@
 		goto restart;
 	}
 dontblock:
+	/* 
+	 * On entry here, m points to the first record on the socket buffer.
+	 * While we process the initial mbufs containing address and control
+	 * info we save a copy of m->m_nextpkt into nextrecord.
+	 */
 	if (uio->uio_procp)
 		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
 	nextrecord = m->m_nextpkt;
@@ -821,6 +826,18 @@
 			controlp = &(*controlp)->m_next;
 		}
 	}
+	/*
+	 * if nextrecord == NULL (this is a single chain) then m_tail
+	 * may not be valid here if m was changed earlier.
+	 */
+	if (nextrecord == NULL && (flags & MSG_PEEK) == 0)
+		so->so_rcv.sb_mb_tail = m;
+
+	/*
+	 * If m is non-null, we have some data to read. From now on, make
+	 * sure to keep sb_mb_tail consistent when working on the last
+	 * packet on the chain (nextrecord==NULL) and we change m->m_nextpkt.
+	 */
 	if (m) {
 		if ((flags & MSG_PEEK) == 0)
 			m->m_nextpkt = nextrecord;
@@ -881,6 +898,8 @@
 				}
 				if (m)
 					m->m_nextpkt = nextrecord;
+				if (nextrecord == NULL)
+					so->so_rcv.sb_mb_tail = m;
 			}
 		} else {
 			if (flags & MSG_PEEK)
@@ -937,8 +956,11 @@
 			(void) sbdroprecord(&so->so_rcv);
 	}
 	if ((flags & MSG_PEEK) == 0) {
-		if (m == 0)
+		if (m == 0) {
 			so->so_rcv.sb_mb = nextrecord;
+			if (nextrecord == NULL || nextrecord->m_nextpkt == NULL)
+				so->so_rcv.sb_mb_tail = nextrecord;
+		}
 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
 	}
Index: kern/uipc_socket2.c
===================================================================
RCS file: /ncvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.55.2.8
diff -u -r1.55.2.8 uipc_socket2.c
--- kern/uipc_socket2.c	2001/02/04 14:49:45	1.55.2.8
+++ kern/uipc_socket2.c	2001/02/09 23:59:16
@@ -63,6 +65,9 @@
 
 static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
 
+static int 	sbtailchk(struct sockbuf *sb);
+static __inline	void 	sbappendmbuf(struct sockbuf *sb, struct mbuf *m0);
+
 /*
  * Procedures to manipulate state flags of socket
  * and do appropriate wakeups.  Normal sequence from the
@@ -477,6 +482,29 @@
  * or sbdroprecord() when the data is acknowledged by the peer.
  */
 
+static int
+sbtailchk(struct sockbuf *sb)
+{
+	struct mbuf *m = sb->sb_mb;
+
+	while (m && m->m_nextpkt)
+		m = m->m_nextpkt;
+	return (m == sb->sb_mb_tail);
+}
+
+static __inline void
+sbappendmbuf(struct sockbuf *sb, struct mbuf *m0)
+{
+	struct mbuf *m = sb->sb_mb_tail;
+
+	KASSERT(sbtailchk(sb), ("sbappendmbuf: bad sb_mb_tail"));
+	if (m)
+		m->m_nextpkt = m0;
+	else
+		sb->sb_mb = m0;
+	sb->sb_mb_tail = m0;
+}
+
 /*
  * Append mbuf chain m to the last record in the
  * socket buffer sb.  The additional space associated
@@ -492,10 +520,9 @@
 
 	if (m == 0)
 		return;
-	n = sb->sb_mb;
+	KASSERT(sbtailchk(sb), ("sbappend: bad sb_mb_tail"));
+	n = sb->sb_mb_tail;
 	if (n) {
-		while (n->m_nextpkt)
-			n = n->m_nextpkt;
 		do {
 			if (n->m_flags & M_EOR) {
 				sbappendrecord(sb, m); /* XXXXXX!!!! */
@@ -545,19 +572,12 @@
 
 	if (m0 == 0)
 		return;
-	m = sb->sb_mb;
-	if (m)
-		while (m->m_nextpkt)
-			m = m->m_nextpkt;
 	/*
 	 * Put the first mbuf on the queue.
 	 * Note this permits zero length records.
 	 */
 	sballoc(sb, m0);
-	if (m)
-		m->m_nextpkt = m0;
-	else
-		sb->sb_mb = m0;
+	sbappendmbuf(sb, m0);
 	m = m0->m_next;
 	m0->m_next = 0;
 	if (m && (m0->m_flags & M_EOR)) {
@@ -603,6 +623,8 @@
 	 */
 	sballoc(sb, m0);
 	m0->m_nextpkt = *mp;
+	if (*mp == NULL) 		/* m0 is actually the new tail */
+		sb->sb_mb_tail = m0;
 	*mp = m0;
 	m = m0->m_next;
 	m0->m_next = 0;
@@ -653,13 +675,7 @@
 	m->m_next = control;
 	for (n = m; n; n = n->m_next)
 		sballoc(sb, n);
-	n = sb->sb_mb;
-	if (n) {
-		while (n->m_nextpkt)
-			n = n->m_nextpkt;
-		n->m_nextpkt = m;
-	} else
-		sb->sb_mb = m;
+	sbappendmbuf(sb, m);
 	return (1);
 }
 
@@ -686,13 +702,7 @@
 	n->m_next = m0;			/* concatenate data to control */
 	for (m = control; m; m = m->m_next)
 		sballoc(sb, m);
-	n = sb->sb_mb;
-	if (n) {
-		while (n->m_nextpkt)
-			n = n->m_nextpkt;
-		n->m_nextpkt = control;
-	} else
-		sb->sb_mb = control;
+	sbappendmbuf(sb, control);
 	return (1);
 }
 
@@ -733,7 +743,7 @@
 		if (n)
 			n->m_next = m;
 		else
-			sb->sb_mb = m;
+			sb->sb_mb = sb->sb_mb_tail = m;
 		sballoc(sb, m);
 		n = m;
 		m->m_flags &= ~M_EOR;
@@ -813,6 +823,9 @@
 		m->m_nextpkt = next;
 	} else
 		sb->sb_mb = next;
+	m = sb->sb_mb;
+	if (m == NULL || m->m_nextpkt == NULL)
+		sb->sb_mb_tail = m;
 }
 
 /*
@@ -833,6 +846,9 @@
 			MFREE(m, mn);
 			m = mn;
 		} while (m);
+		m = sb->sb_mb;
+		if (m == NULL || m->m_nextpkt == NULL)
+			sb->sb_mb_tail = m;
 	}
 }
 
Index: sys/socketvar.h
===================================================================
RCS file: /ncvs/src/sys/sys/socketvar.h,v
retrieving revision 1.46.2.4
diff -u -r1.46.2.4 socketvar.h
--- sys/socketvar.h	2000/11/26 02:30:08	1.46.2.4
+++ sys/socketvar.h	2001/02/08 17:55:46
@@ -93,6 +93,7 @@
 		u_long	sb_mbmax;	/* max chars of mbufs to use */
 		long	sb_lowat;	/* low water mark */
 		struct	mbuf *sb_mb;	/* the mbuf chain */
+		struct	mbuf *sb_mb_tail; /* last pkt in chain */
 		struct	selinfo sb_sel;	/* process selecting read/write */
 		short	sb_flags;	/* flags, see below */
 		short	sb_timeo;	/* timeout for read/write */


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-net" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200102100013.f1A0DNp76359>