Skip site navigation (1) | Skip section navigation (2)
Date:      Thu, 2 Nov 2006 19:38:18 GMT
From:      Paolo Pisati <piso@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 109067 for review
Message-ID:  <200611021938.kA2JcIVx038729@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=109067

Change 109067 by piso@piso_newluxor on 2006/11/02 19:37:58

	IFC@109062

Affected files ...

.. //depot/projects/soc2005/libalias/sbin/ipfw/ipfw2.c#11 edit
.. //depot/projects/soc2005/libalias/sys/boot/common/load_elf.c#3 integrate
.. //depot/projects/soc2005/libalias/sys/boot/i386/boot2/boot2.c#3 integrate
.. //depot/projects/soc2005/libalias/sys/boot/i386/libi386/elf32_freebsd.c#3 integrate
.. //depot/projects/soc2005/libalias/sys/boot/pc98/boot2/boot.c#3 integrate
.. //depot/projects/soc2005/libalias/sys/ddb/db_command.c#4 integrate
.. //depot/projects/soc2005/libalias/sys/geom/eli/g_eli.c#6 integrate
.. //depot/projects/soc2005/libalias/sys/geom/journal/g_journal.c#3 integrate
.. //depot/projects/soc2005/libalias/sys/kern/uipc_mbuf.c#4 integrate
.. //depot/projects/soc2005/libalias/sys/kern/uipc_socket.c#8 integrate
.. //depot/projects/soc2005/libalias/sys/kern/uipc_syscalls.c#6 integrate
.. //depot/projects/soc2005/libalias/sys/net/bridgestp.c#7 integrate
.. //depot/projects/soc2005/libalias/sys/net/if_tap.c#4 integrate
.. //depot/projects/soc2005/libalias/sys/net/if_tun.c#5 integrate
.. //depot/projects/soc2005/libalias/sys/net/ppp_tty.c#2 integrate
.. //depot/projects/soc2005/libalias/sys/netgraph/ng_device.c#2 integrate
.. //depot/projects/soc2005/libalias/sys/sys/libkern.h#4 integrate
.. //depot/projects/soc2005/libalias/sys/sys/mbuf.h#6 integrate
.. //depot/projects/soc2005/libalias/sys/sys/socket.h#3 integrate

Differences ...

==== //depot/projects/soc2005/libalias/sbin/ipfw/ipfw2.c#11 (text+ko) ====

@@ -526,8 +526,9 @@
 
 	if (optname == IP_FW_GET || optname == IP_DUMMYNET_GET ||
 	    optname == IP_FW_ADD || optname == IP_FW_TABLE_LIST ||
-	    optname == IP_FW_TABLE_GETSIZE || optname == IP_FW_NAT_GET_CONFIG ||
- 	    optname == IP_FW_NAT_GET_LOG)
+	    optname == IP_FW_TABLE_GETSIZE || 
+	    optname == IP_FW_NAT_GET_CONFIG || 
+	    optname == IP_FW_NAT_GET_LOG)
 		i = getsockopt(s, IPPROTO_IP, optname, optval,
 			(socklen_t *)optlen);
 	else

==== //depot/projects/soc2005/libalias/sys/boot/common/load_elf.c#3 (text+ko) ====

@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/boot/common/load_elf.c,v 1.35 2006/10/29 14:50:57 ru Exp $");
+__FBSDID("$FreeBSD: src/sys/boot/common/load_elf.c,v 1.36 2006/11/02 17:28:37 ru Exp $");
 
 #include <sys/param.h>
 #include <sys/exec.h>
@@ -263,7 +263,7 @@
 #if __ELF_WORD_SIZE == 64
 	off = - (off & 0xffffffffff000000ull);/* x86_64 relocates after locore */
 #else
-	off = - (off & 0xc0000000u);	/* i386 relocates after locore */
+	off = - (off & 0xff000000u);	/* i386 relocates after locore */
 #endif
 #else
 	off = 0;		/* other archs use direct mapped kernels */

==== //depot/projects/soc2005/libalias/sys/boot/i386/boot2/boot2.c#3 (text+ko) ====

@@ -14,7 +14,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/boot/i386/boot2/boot2.c,v 1.82 2006/10/29 14:50:57 ru Exp $");
+__FBSDID("$FreeBSD: src/sys/boot/i386/boot2/boot2.c,v 1.83 2006/11/02 17:28:38 ru Exp $");
 
 #include <sys/param.h>
 #include <sys/disklabel.h>
@@ -334,7 +334,7 @@
 	return;
     }
     if (fmt == 0) {
-	addr = hdr.ex.a_entry & 0x3fffffff;
+	addr = hdr.ex.a_entry & 0xffffff;
 	p = PTOV(addr);
 	fs_off = PAGE_SIZE;
 	if (xfsread(ino, p, hdr.ex.a_text))
@@ -368,7 +368,7 @@
 		j++;
 	}
 	for (i = 0; i < 2; i++) {
-	    p = PTOV(ep[i].p_paddr & 0x3fffffff);
+	    p = PTOV(ep[i].p_paddr & 0xffffff);
 	    fs_off = ep[i].p_offset;
 	    if (xfsread(ino, p, ep[i].p_filesz))
 		return;
@@ -389,7 +389,7 @@
 		p += es[i].sh_size;
 	    }
 	}
-	addr = hdr.eh.e_entry & 0x3fffffff;
+	addr = hdr.eh.e_entry & 0xffffff;
     }
     bootinfo.bi_esymtab = VTOP(p);
     bootinfo.bi_kernelname = VTOP(kname);

==== //depot/projects/soc2005/libalias/sys/boot/i386/libi386/elf32_freebsd.c#3 (text+ko) ====

@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/boot/i386/libi386/elf32_freebsd.c,v 1.16 2006/10/29 14:50:58 ru Exp $");
+__FBSDID("$FreeBSD: src/sys/boot/i386/libi386/elf32_freebsd.c,v 1.17 2006/11/02 17:28:38 ru Exp $");
 
 #include <sys/param.h>
 #include <sys/exec.h>
@@ -65,7 +65,7 @@
     err = bi_load32(fp->f_args, &boothowto, &bootdev, &bootinfop, &modulep, &kernend);
     if (err != 0)
 	return(err);
-    entry = ehdr->e_entry & 0x3fffffff;
+    entry = ehdr->e_entry & 0xffffff;
 
 #ifdef DEBUG
     printf("Start @ 0x%lx ...\n", entry);

==== //depot/projects/soc2005/libalias/sys/boot/pc98/boot2/boot.c#3 (text+ko) ====

@@ -49,7 +49,7 @@
 */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/boot/pc98/boot2/boot.c,v 1.15 2006/10/29 14:50:58 ru Exp $");
+__FBSDID("$FreeBSD: src/sys/boot/pc98/boot2/boot.c,v 1.16 2006/11/02 17:28:38 ru Exp $");
 
 #include "boot.h"
 #include <a.out.h>
@@ -199,9 +199,9 @@
 	/*
 	 * We assume that the entry address is the same as the lowest text
 	 * address and that the kernel startup code handles relocation by
-	 * this address rounded down to a multiple of 1G.
+	 * this address rounded down to a multiple of 16M.
 	 */
-	startaddr = head.a_entry & 0x3FFFFFFF;
+	startaddr = head.a_entry & 0x00FFFFFF;
 	addr =  startaddr;
 	printf("Booting %d:%s(%d,%c)%s @ 0x%x\n"
 			, dosdev & 0x0f

==== //depot/projects/soc2005/libalias/sys/ddb/db_command.c#4 (text+ko) ====

@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/ddb/db_command.c,v 1.71 2006/10/10 07:26:53 bde Exp $");
+__FBSDID("$FreeBSD: src/sys/ddb/db_command.c,v 1.72 2006/11/02 11:47:38 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
@@ -690,14 +690,22 @@
 {
 	struct proc *p;
 	struct thread *td;
+	jmp_buf jb;
+	void *prev_jb;
 
 	LIST_FOREACH(p, &allproc, p_list) {
-		FOREACH_THREAD_IN_PROC(p, td) {
-			db_printf("\nTracing command %s pid %d tid %ld td %p\n",
-			    p->p_comm, p->p_pid, (long)td->td_tid, td);
-			db_trace_thread(td, -1);
-			if (db_pager_quit)
-				return;
+		prev_jb = kdb_jmpbuf(jb);
+		if (setjmp(jb) == 0) {
+			FOREACH_THREAD_IN_PROC(p, td) {
+				db_printf("\nTracing command %s pid %d tid %ld td %p\n",
+					  p->p_comm, p->p_pid, (long)td->td_tid, td);
+				db_trace_thread(td, -1);
+				if (db_pager_quit) {
+					kdb_jmpbuf(prev_jb);
+					return;
+				}
+			}
 		}
+		kdb_jmpbuf(prev_jb);
 	}
 }

==== //depot/projects/soc2005/libalias/sys/geom/eli/g_eli.c#6 (text+ko) ====

@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/geom/eli/g_eli.c,v 1.32 2006/11/01 16:05:06 pjd Exp $");
+__FBSDID("$FreeBSD: src/sys/geom/eli/g_eli.c,v 1.33 2006/11/02 09:01:34 pjd Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -642,7 +642,7 @@
 	for (i = 0; i < threads; i++) {
 		if (g_eli_cpu_is_disabled(i)) {
 			G_ELI_DEBUG(1, "%s: CPU %u disabled, skipping.",
-			    bpp->name, threads);
+			    bpp->name, i);
 			continue;
 		}
 		wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO);

==== //depot/projects/soc2005/libalias/sys/geom/journal/g_journal.c#3 (text+ko) ====

@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/geom/journal/g_journal.c,v 1.6 2006/11/02 00:37:39 pjd Exp $");
+__FBSDID("$FreeBSD: src/sys/geom/journal/g_journal.c,v 1.8 2006/11/02 16:24:18 pjd Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -600,11 +600,9 @@
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
-	if (error != 0) {
+	if (buf == NULL) {
 		GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, error);
-		if (buf != NULL)
-			g_free(buf);
 		return (error);
 	}
 
@@ -1622,7 +1620,7 @@
 	}
 	if (bp != NULL) {
 		if (bp->bio_data == NULL) {
-			nbp = g_clone_bio(pbp);
+			nbp = g_duplicate_bio(pbp);
 			nbp->bio_cflags = GJ_BIO_READ;
 			nbp->bio_data =
 			    pbp->bio_data + cstart - pbp->bio_offset;
@@ -1646,7 +1644,7 @@
 		 * Its time for asking data provider.
 		 */
 		GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
-		nbp = g_clone_bio(pbp);
+		nbp = g_duplicate_bio(pbp);
 		nbp->bio_cflags = GJ_BIO_READ;
 		nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
 		nbp->bio_offset = ostart;
@@ -2239,6 +2237,8 @@
 	struct g_consumer *cp;
 	int error;
 
+	sc = NULL;	/* gcc */
+
 	g_topology_assert();
 	/*
 	 * There are two possibilities:

==== //depot/projects/soc2005/libalias/sys/kern/uipc_mbuf.c#4 (text+ko) ====

@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.168 2006/10/22 11:52:13 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.169 2006/11/02 17:37:21 andre Exp $");
 
 #include "opt_mac.h"
 #include "opt_param.h"
@@ -94,61 +94,61 @@
  * chain.
  */
 struct mbuf *
-m_getm(struct mbuf *m, int len, int how, short type)
+m_getm2(struct mbuf *m, int len, int how, short type, int flags)
 {
-	struct mbuf *mb, *top, *cur, *mtail;
-	int num, rem;
-	int i;
+	struct mbuf *mb, *nm = NULL, *mtail = NULL;
+
+	KASSERT(len >= 0, ("%s: len is < 0", __func__));
+
+	/* Validate flags. */
+	flags &= (M_PKTHDR | M_EOR);
 
-	KASSERT(len >= 0, ("m_getm(): len is < 0"));
+	/* Packet header mbuf must be first in chain. */
+	if ((flags & M_PKTHDR) && m != NULL)
+		flags &= ~M_PKTHDR;
 
-	/* If m != NULL, we will append to the end of that chain. */
-	if (m != NULL)
-		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
-	else
-		mtail = NULL;
+	/* Loop and append maximum sized mbufs to the chain tail. */
+	while (len > 0) {
+		if (len > MCLBYTES)
+			mb = m_getjcl(how, type, (flags & M_PKTHDR),
+			    MJUMPAGESIZE);
+		else if (len >= MINCLSIZE)
+			mb = m_getcl(how, type, (flags & M_PKTHDR));
+		else if (flags & M_PKTHDR)
+			mb = m_gethdr(how, type);
+		else
+			mb = m_get(how, type);
 
-	/*
-	 * Calculate how many mbufs+clusters ("packets") we need and how much
-	 * leftover there is after that and allocate the first mbuf+cluster
-	 * if required.
-	 */
-	num = len / MCLBYTES;
-	rem = len % MCLBYTES;
-	top = cur = NULL;
-	if (num > 0) {
-		if ((top = cur = m_getcl(how, type, 0)) == NULL)
-			goto failed;
-		top->m_len = 0;
-	}
-	num--;
+		/* Fail the whole operation if one mbuf can't be allocated. */
+		if (mb == NULL) {
+			if (nm != NULL)
+				m_freem(nm);
+			return (NULL);
+		}
 
-	for (i = 0; i < num; i++) {
-		mb = m_getcl(how, type, 0);
-		if (mb == NULL)
-			goto failed;
-		mb->m_len = 0;
-		cur = (cur->m_next = mb);
-	}
-	if (rem > 0) {
-		mb = (rem >= MINCLSIZE) ?
-		    m_getcl(how, type, 0) : m_get(how, type);
-		if (mb == NULL)
-			goto failed;
-		mb->m_len = 0;
-		if (cur == NULL)
-			top = mb;
+		/* Book keeping. */
+		len -= (mb->m_flags & M_EXT) ? mb->m_ext.ext_size :
+			((mb->m_flags & M_PKTHDR) ? MHLEN : MLEN);
+		if (mtail != NULL)
+			mtail->m_next = mb;
 		else
-			cur->m_next = mb;
+			nm = mb;
+		mtail = mb;
+		flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
 	}
+	if (flags & M_EOR)
+		mtail->m_flags |= M_EOR;  /* Only valid on the last mbuf. */
+
+	/* If mbuf was supplied, append new chain to the end of it. */
+	if (m != NULL) {
+		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
+			;
+		mtail->m_next = nm;
+		mtail->m_flags &= ~M_EOR;
+	} else
+		m = nm;
 
-	if (mtail != NULL)
-		mtail->m_next = top;
-	return top;
-failed:
-	if (top != NULL)
-		m_freem(top);
-	return NULL;
+	return (m);
 }
 
 /*
@@ -1610,55 +1610,58 @@
 
 #endif
 
+/*
+ * Copy the contents of uio into a properly sized mbuf chain.
+ */
 struct mbuf *
-m_uiotombuf(struct uio *uio, int how, int len, int align)
+m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
 {
-	struct mbuf *m_new = NULL, *m_final = NULL;
-	int progress = 0, error = 0, length, total;
+	struct mbuf *m, *mb;
+	int error, length, total;
+	int progress = 0;
 
+	/*
+	 * len can be zero or an arbitrary large value bound by
+	 * the total data supplied by the uio.
+	 */
 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
+
+	/*
+	 * The smallest unit returned by m_getm2() is a single mbuf
+	 * with pkthdr.  We can't align past it.  Align align itself.
+	 */
+	if (align)
+		align &= ~(sizeof(long) - 1);
 	if (align >= MHLEN)
-		goto nospace;
-	if (total + align > MHLEN)
-		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
-	else
-		m_final = m_gethdr(how, MT_DATA);
-	if (m_final == NULL)
-		goto nospace;
-	m_final->m_data += align;
-	m_new = m_final;
-	while (progress < total) {
-		length = total - progress;
-		if (length > MCLBYTES)
-			length = MCLBYTES;
-		if (m_new == NULL) {
-			if (length > MLEN)
-				m_new = m_getcl(how, MT_DATA, 0);
-			else
-				m_new = m_get(how, MT_DATA);
-			if (m_new == NULL)
-				goto nospace;
+		return (NULL);
+
+	/* Give us all or nothing. */
+	m = m_getm2(NULL, total + align, how, MT_DATA, flags);
+	if (m == NULL)
+		return (NULL);
+	m->m_data += align;
+
+	/* Fill all mbufs with uio data and update header information. */
+	for (mb = m; mb != NULL; mb = mb->m_next) {
+		length = min(M_TRAILINGSPACE(mb), total - progress);
+
+		error = uiomove(mtod(mb, void *), length, uio);
+		if (error) {
+			m_freem(m);
+			return (NULL);
 		}
-		error = uiomove(mtod(m_new, void *), length, uio);
-		if (error)
-			goto nospace;
+
+		mb->m_len = length;
 		progress += length;
-		m_new->m_len = length;
-		if (m_new != m_final)
-			m_cat(m_final, m_new);
-		m_new = NULL;
+		if (flags & M_PKTHDR)
+			m->m_pkthdr.len += length;
 	}
-	m_fixhdr(m_final);
-	return (m_final);
-nospace:
-	if (m_new)
-		m_free(m_new);
-	if (m_final)
-		m_freem(m_final);
-	return (NULL);
+	KASSERT(progress == total, ("%s: progress != total", __func__));
+
+	return (m);
 }
 
 /*

==== //depot/projects/soc2005/libalias/sys/kern/uipc_socket.c#8 (text+ko) ====

@@ -94,7 +94,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.284 2006/10/22 11:52:14 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.285 2006/11/02 17:45:28 andre Exp $");
 
 #include "opt_inet.h"
 #include "opt_mac.h"
@@ -813,9 +813,11 @@
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
-#endif /*ZERO_COPY_SOCKETS*/
 
 /*
+ * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
+ * sosend_dgram() and sosend_generic() use m_uiotombuf().
+ * 
  * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
  * all of the data referenced by the uio.  If desired, it uses zero-copy.
  * *space will be updated to reflect data copied in.
@@ -939,6 +941,7 @@
 	*retmp = top;
 	return (error);
 }
+#endif /*ZERO_COPY_SOCKETS*/
 
 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
 
@@ -954,7 +957,9 @@
 {
 	long space, resid;
 	int clen = 0, error, dontroute;
+#ifdef ZERO_COPY_SOCKETS
 	int atomic = sosendallatonce(so) || top;
+#endif
 
 	KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
@@ -1040,9 +1045,19 @@
 		if (flags & MSG_EOR)
 			top->m_flags |= M_EOR;
 	} else {
+#ifdef ZERO_COPY_SOCKETS
 		error = sosend_copyin(uio, &top, atomic, &space, flags);
 		if (error)
 			goto out;
+#else
+		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
+		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
+		if (top == NULL) {
+			error = EFAULT;	/* only possible error */
+			goto out;
+		}
+		space -= resid - uio->uio_resid;
+#endif
 		resid = uio->uio_resid;
 	}
 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
@@ -1202,12 +1217,25 @@
 				if (flags & MSG_EOR)
 					top->m_flags |= M_EOR;
 			} else {
+#ifdef ZERO_COPY_SOCKETS
 				error = sosend_copyin(uio, &top, atomic,
 				    &space, flags);
 				if (error != 0) {
 					SOCKBUF_LOCK(&so->so_snd);
 					goto release;
 				}
+#else
+				top = m_uiotombuf(uio, M_WAITOK, space,
+				    (atomic ? max_hdr : 0),
+				    (atomic ? M_PKTHDR : 0) |
+				    ((flags & MSG_EOR) ? M_EOR : 0));
+				if (top == NULL) {
+					SOCKBUF_LOCK(&so->so_snd);
+					error = EFAULT; /* only possible error */
+					goto release;
+				}
+				space -= resid - uio->uio_resid;
+#endif
 				resid = uio->uio_resid;
 			}
 			if (dontroute) {

==== //depot/projects/soc2005/libalias/sys/kern/uipc_syscalls.c#6 (text+ko) ====

@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.239 2006/10/22 11:52:14 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.241 2006/11/02 17:37:21 andre Exp $");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
@@ -1882,19 +1882,20 @@
 	struct vnode *vp;
 	struct vm_object *obj = NULL;
 	struct socket *so = NULL;
-	struct mbuf *m, *m_header = NULL;
+	struct mbuf *m = NULL;
 	struct sf_buf *sf;
 	struct vm_page *pg;
-	off_t off, xfsize, hdtr_size, sbytes = 0;
-	int error, headersize = 0, headersent = 0;
+	off_t off, xfsize, hdtr_size = 0, sbytes = 0, rem = 0;
+	int error, headersize = 0, headersent = 0, mnw = 0;
 	int vfslocked;
 
 	NET_LOCK_GIANT();
 
-	hdtr_size = 0;
-
 	/*
-	 * The descriptor must be a regular file and have a backing VM object.
+	 * The file descriptor must be a regular file and have a
+	 * backing VM object.
+	 * File offset must be positive.  If it goes beyond EOF
+	 * we send only the header/trailer and no payload data.
 	 */
 	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
 		goto done;
@@ -1922,7 +1923,17 @@
 		error = EINVAL;
 		goto done;
 	}
-	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp, NULL)) != 0)
+	if (uap->offset < 0) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * The socket must be a stream socket and connected.
+	 * Remember if it a blocking or non-blocking socket.
+	 */
+	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp,
+	    NULL)) != 0)
 		goto done;
 	so = sock_fp->f_data;
 	if (so->so_type != SOCK_STREAM) {
@@ -1933,10 +1944,13 @@
 		error = ENOTCONN;
 		goto done;
 	}
-	if (uap->offset < 0) {
-		error = EINVAL;
-		goto done;
-	}
+	/*
+	 * Do not wait on memory allocations but return ENOMEM for
+	 * caller to retry later.
+	 * XXX: Experimental.
+	 */
+	if (uap->flags & SF_MNOWAIT)
+		mnw = 1;
 
 #ifdef MAC
 	SOCK_LOCK(so);
@@ -1946,290 +1960,307 @@
 		goto done;
 #endif
 
-	/*
-	 * If specified, get the pointer to the sf_hdtr struct for
-	 * any headers/trailers.
-	 */
+	/* If headers are specified copy them into mbufs. */
 	if (hdr_uio != NULL) {
 		hdr_uio->uio_td = td;
 		hdr_uio->uio_rw = UIO_WRITE;
 		if (hdr_uio->uio_resid > 0) {
-			m_header = m_uiotombuf(hdr_uio, M_DONTWAIT, 0, 0);
-			if (m_header == NULL)
+			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
+			    0, 0, 0);
+			if (m == NULL) {
+				error = mnw ? EAGAIN : ENOBUFS;
 				goto done;
-			headersize = m_header->m_pkthdr.len;
+			}
+			headersize = hdr_uio->uio_resid;
 			if (compat)
 				sbytes += headersize;
 		}
 	}
 
-	/*
-	 * Protect against multiple writers to the socket.
-	 */
+	/* Protect against multiple writers to the socket. */
 	SOCKBUF_LOCK(&so->so_snd);
 	(void) sblock(&so->so_snd, M_WAITOK);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	/*
-	 * Loop through the pages in the file, starting with the requested
+	 * Loop through the pages of the file, starting with the requested
 	 * offset. Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
+	 * This is done in two loops.  The inner loop turns as many pages
+	 * as it can, up to available socket buffer space, without blocking
+	 * into mbufs to have it bulk delivered into the socket send buffer.
+	 * The outer loop checks the state and available space of the socket
+	 * and takes care of the overall progress.
 	 */
-	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
-		vm_pindex_t pindex;
-		vm_offset_t pgoff;
+	for (off = uap->offset; ; ) {
+		int loopbytes = 0;
+		int space = 0;
+		int done = 0;
 
-		pindex = OFF_TO_IDX(off);
-		VM_OBJECT_LOCK(obj);
-retry_lookup:
 		/*
-		 * Calculate the amount to transfer. Not to exceed a page,
-		 * the EOF, or the passed in nbytes.
-		 */
-		xfsize = obj->un_pager.vnp.vnp_size - off;
-		VM_OBJECT_UNLOCK(obj);
-		if (xfsize > PAGE_SIZE)
-			xfsize = PAGE_SIZE;
-		pgoff = (vm_offset_t)(off & PAGE_MASK);
-		if (PAGE_SIZE - pgoff < xfsize)
-			xfsize = PAGE_SIZE - pgoff;
-		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
-			xfsize = uap->nbytes - sbytes;
-		if (xfsize <= 0) {
-			if (m_header != NULL) {
-				m = m_header;
-				m_header = NULL;
-				SOCKBUF_LOCK(&so->so_snd);
-				goto retry_space;
-			} else
-				break;
-		}
-		/*
-		 * Optimize the non-blocking case by looking at the socket space
-		 * before going to the extra work of constituting the sf_buf.
+		 * Check the socket state for ongoing connection,
+		 * no errors and space in socket buffer.
+		 * If space is low allow for the remainder of the
+		 * file to be processed if it fits the socket buffer.
+		 * Otherwise block in waiting for sufficient space
+		 * to proceed, or if the socket is nonblocking, return
+		 * to userland with EAGAIN while reporting how far
+		 * we've come.
+		 * We wait until the socket buffer has significant free
+		 * space to do bulk sends.  This makes good use of file
+		 * system read ahead and allows packet segmentation
+		 * offloading hardware to take over lots of work.  If
+		 * we were not careful here we would send off only one
+		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
-		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
-			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
-				error = EPIPE;
-			else
-				error = EAGAIN;
-			sbunlock(&so->so_snd);
+		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+retry_space:
+		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+			error = EPIPE;
+			SOCKBUF_UNLOCK(&so->so_snd);
+			goto done;
+		} else if (so->so_error) {
+			error = so->so_error;
+			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
-		SOCKBUF_UNLOCK(&so->so_snd);
-		VM_OBJECT_LOCK(obj);
-		/*
-		 * Attempt to look up the page.
-		 *
-		 *	Allocate if not found
-		 *
-		 *	Wait and loop if busy.
-		 */
-		pg = vm_page_lookup(obj, pindex);
-
-		if (pg == NULL) {
-			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NOBUSY |
-			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
-			if (pg == NULL) {
-				VM_OBJECT_UNLOCK(obj);
-				VM_WAIT;
-				VM_OBJECT_LOCK(obj);
-				goto retry_lookup;
+		space = sbspace(&so->so_snd);
+		if (space < rem &&
+		    (space <= 0 ||
+		     space < so->so_snd.sb_lowat)) {
+			if (so->so_state & SS_NBIO) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				error = EAGAIN;
+				goto done;
 			}
-		} else if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
-			goto retry_lookup;
-		else {
+			/*
+			 * sbwait drops the lock while sleeping.
+			 * When we loop back to retry_space the
+			 * state may have changed and we retest
+			 * for it.
+			 */
+			error = sbwait(&so->so_snd);
 			/*
-			 * Wire the page so it does not get ripped out from
-			 * under us.
+			 * An error from sbwait usually indicates that we've
+			 * been interrupted by a signal. If we've sent anything
+			 * then return bytes sent, otherwise return the error.
 			 */
-			vm_page_lock_queues();
-			vm_page_wire(pg);
-			vm_page_unlock_queues();
+			if (error) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				goto done;
+			}
+			goto retry_space;
 		}
+		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
-		 * If page is not valid for what we need, initiate I/O
+		 * Loop and construct maximum sized mbuf chain to be bulk
+		 * dumped into socket buffer.
 		 */
+		while(space > loopbytes) {
+			vm_pindex_t pindex;
+			vm_offset_t pgoff;
+			struct mbuf *m0;
 
-		if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) {
-			VM_OBJECT_UNLOCK(obj);
-		} else if (uap->flags & SF_NODISKIO) {
-			error = EBUSY;
-		} else {
-			int bsize, resid;
-
+			VM_OBJECT_LOCK(obj);
+			/*
+			 * Calculate the amount to transfer.
+			 * Not to exceed a page, the EOF,
+			 * or the passed in nbytes.
+			 */
+			pgoff = (vm_offset_t)(off & PAGE_MASK);
+			xfsize = omin(PAGE_SIZE - pgoff,
+			    obj->un_pager.vnp.vnp_size - off -
+			    sbytes - loopbytes);
+			if (uap->nbytes)
+				rem = (uap->nbytes - sbytes - loopbytes);
+			else
+				rem = obj->un_pager.vnp.vnp_size - off -
+				    sbytes - loopbytes;
+			xfsize = omin(rem, xfsize);
+			if (xfsize <= 0) {
+				VM_OBJECT_UNLOCK(obj);
+				done = 1;		/* all data sent */
+				break;
+			}
 			/*
-			 * Ensure that our page is still around when the I/O
-			 * completes.
+			 * Don't overflow the send buffer.
+			 * Stop here and send out what we've
+			 * already got.
 			 */
-			vm_page_io_start(pg);
-			VM_OBJECT_UNLOCK(obj);
-
+			if (space < loopbytes + xfsize) {
+				VM_OBJECT_UNLOCK(obj);
+				break;
+			}
+retry_lookup:
 			/*
-			 * Get the page from backing store.
+			 * Attempt to look up the page.
+			 * Allocate if not found or
+			 * wait and loop if busy.
 			 */
-			bsize = vp->v_mount->mnt_stat.f_iosize;
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-			vn_lock(vp, LK_SHARED | LK_RETRY, td);
+			pindex = OFF_TO_IDX(off);
+			pg = vm_page_lookup(obj, pindex);
+			if (pg == NULL) {
+				pg = vm_page_alloc(obj, pindex,
+				    VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL |
+				    VM_ALLOC_WIRED);
+				if (pg == NULL) {
+					VM_OBJECT_UNLOCK(obj);
+					VM_WAIT;
+					VM_OBJECT_LOCK(obj);
+					goto retry_lookup;
+				}
+			} else if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
+				goto retry_lookup;
+			else {
+				/*
+				 * Wire the page so it does not get
+				 * ripped out from under us.
+				 */
+				vm_page_lock_queues();
+				vm_page_wire(pg);
+				vm_page_unlock_queues();
+			}
+
 			/*
-			 * XXXMAC: Because we don't have fp->f_cred here,
-			 * we pass in NOCRED.  This is probably wrong, but
-			 * is consistent with our original implementation.
+			 * Check if page is valid for what we need,
+			 * otherwise initiate I/O.
+			 * If we already turned some pages into mbufs,
+			 * send them off before we come here again and
+			 * block.
 			 */
-			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
-			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
-			    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
-			    td->td_ucred, NOCRED, &resid, td);
-			VOP_UNLOCK(vp, 0, td);
-			VFS_UNLOCK_GIANT(vfslocked);
-			VM_OBJECT_LOCK(obj);
-			vm_page_io_finish(pg);
-			if (!error)
+			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
+				VM_OBJECT_UNLOCK(obj);
+			else if (m != NULL)
+				error = EAGAIN;	/* send what we already got */
+			else if (uap->flags & SF_NODISKIO)
+				error = EBUSY;
+			else {
+				int bsize, resid;
+
+				/*
+				 * Ensure that our page is still around
+				 * when the I/O completes.
+				 */
+				vm_page_io_start(pg);
+				VM_OBJECT_UNLOCK(obj);
+
+				/*
+				 * Get the page from backing store.
+				 */
+				bsize = vp->v_mount->mnt_stat.f_iosize;
+				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+				vn_lock(vp, LK_SHARED | LK_RETRY, td);
+
+				/*
+				 * XXXMAC: Because we don't have fp->f_cred
+				 * here, we pass in NOCRED.  This is probably
+				 * wrong, but is consistent with our original
+				 * implementation.
+				 */
+				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
+				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
+				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
+				    td->td_ucred, NOCRED, &resid, td);
+				VOP_UNLOCK(vp, 0, td);
+				VFS_UNLOCK_GIANT(vfslocked);
+				VM_OBJECT_LOCK(obj);
+				vm_page_io_finish(pg);
+				if (!error)
+					VM_OBJECT_UNLOCK(obj);
+				mbstat.sf_iocnt++;
+			}
+			if (error) {
+				vm_page_lock_queues();
+				vm_page_unwire(pg, 0);
+				/*
+				 * See if anyone else might know about
+				 * this page.  If not and it is not valid,
+				 * then free it.
+				 */
+				if (pg->wire_count == 0 && pg->valid == 0 &&
+				    pg->busy == 0 && !(pg->oflags & VPO_BUSY) &&
+				    pg->hold_count == 0) {
+					vm_page_free(pg);
+				}
+				vm_page_unlock_queues();
 				VM_OBJECT_UNLOCK(obj);
-			mbstat.sf_iocnt++;
-		}
-	
-		if (error) {
-			vm_page_lock_queues();
-			vm_page_unwire(pg, 0);
+				if (error == EAGAIN)
+					error = 0;	/* not a real error */
+				break;
+			}
+
 			/*
-			 * See if anyone else might know about this page.
-			 * If not and it is not valid, then free it.
+			 * Get a sendfile buf.  We usually wait as long
+			 * as necessary, but this wait can be interrupted.
 			 */
-			if (pg->wire_count == 0 && pg->valid == 0 &&
-			    pg->busy == 0 && !(pg->oflags & VPO_BUSY) &&
-			    pg->hold_count == 0) {

>>> TRUNCATED FOR MAIL (1000 lines) <<<



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200611021938.kA2JcIVx038729>