Date:      Wed, 12 Feb 2014 20:06:27 +0000 (UTC)
From:      Gleb Smirnoff <glebius@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r261809 - projects/sendfile/sys/kern
Message-ID:  <201402122006.s1CK6RY9064831@svn.freebsd.org>

Author: glebius
Date: Wed Feb 12 20:06:26 2014
New Revision: 261809
URL: http://svnweb.freebsd.org/changeset/base/261809

Log:
  Make the sendfile(2) call non-blocking on disk I/O but, unlike SF_NODISKIO,
  still perform the I/O. The new call is a drop-in replacement for the older
  sendfile(2), so applications like web servers do not need to be recompiled
  or reconfigured to achieve the benefit.
  
  The mechanics of the change are the following (a sketch of the
  underlying completion pattern follows the list):
  
  o We grab enough pages to fill the socket buffer.
  o We iterate through the pages and request I/O on those that are not
    valid. The I/O is requested via VOP_GETPAGES_ASYNC(), so it doesn't
    block.
  o If we initiated any I/Os, we put the data into the socket buffer as
    not-ready (M_NOTREADY) data, since the I/Os are still in progress,
    and return.
  o Once the last I/O completes, we mark our data in the socket as ready,
    and if we were blocking the socket, we initiate the send.
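  
  The heart of the change is a reference-counted per-call I/O context:
  the count starts at one (the syscall's own reference) and every
  asynchronous pager request takes another; whoever drops the count to
  zero marks the data ready. Below is a minimal userland sketch of that
  completion pattern, with hypothetical names and C11 atomics standing
  in for the kernel's refcount(9) API; it illustrates the idea and is
  not the committed code:
  
  	#include <stdatomic.h>
  	#include <stdlib.h>
  
  	struct io_ctx {
  		atomic_uint	nios;	/* 1 (caller) + one per in-flight I/O */
  		void		(*ready)(void *); /* fired by the last dropper */
  		void		*arg;
  	};
  
  	/* Called once per completed I/O, and once by the initiating caller. */
  	static void
  	io_ctx_done(struct io_ctx *ctx)
  	{
  		/* atomic_fetch_sub() returns the old value; 1 means last ref. */
  		if (atomic_fetch_sub(&ctx->nios, 1) != 1)
  			return;
  		ctx->ready(ctx->arg);	/* e.g. mark data ready, restart send */
  		free(ctx);
  	}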
  
  The code still has quite a few rough spots, but it has already been
  tested at Netflix with positive results.
  
  Sponsored by:	Netflix
  Sponsored by:	Nginx, Inc.

Modified:
  projects/sendfile/sys/kern/uipc_syscalls.c

Modified: projects/sendfile/sys/kern/uipc_syscalls.c
==============================================================================
--- projects/sendfile/sys/kern/uipc_syscalls.c	Wed Feb 12 19:59:30 2014	(r261808)
+++ projects/sendfile/sys/kern/uipc_syscalls.c	Wed Feb 12 20:06:26 2014	(r261809)
@@ -132,9 +132,6 @@ static int	filt_sfsync(struct knote *kn,
  */
 static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
     "sendfile(2) tunables");
-static int sfreadahead = 1;
-SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
-    &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
 
 #ifdef	SFSYNC_DEBUG
 static int sf_sync_debug = 0;
@@ -2651,11 +2648,53 @@ vmoff(int i, off_t off)
 	return (trunc_page(off + i * PAGE_SIZE));
 }
 
+struct sf_io {
+	u_int		nios;
+	int		npages;
+	struct file	*sock_fp;
+	struct mbuf	*m;
+	vm_page_t	pa[];
+};
+
 static void
-sendfile_swapin(vm_object_t obj, vm_page_t *pa, int npages, off_t off,
-    off_t len)
+sf_io_done(void *arg)
 {
-	int rv;
+	struct sf_io *sfio = arg;
+	struct socket *so;
+
+	if (!refcount_release(&sfio->nios))
+		return;
+
+	so  = sfio->sock_fp->f_data;
+
+	if (sbready(&so->so_snd, sfio->m, sfio->npages) == 0) {
+		struct mbuf *m;
+
+		m = m_get(M_NOWAIT, MT_DATA);
+		if (m == NULL) {
+			panic("XXXGL");
+		}
+		m->m_len = 0;
+		CURVNET_SET(so->so_vnet);
+		/* XXXGL: curthread */
+		(void )(so->so_proto->pr_usrreqs->pru_send)
+		    (so, 0, m, NULL, NULL, curthread);
+		CURVNET_RESTORE();
+	}
+
+	/* XXXGL: curthread */
+	fdrop(sfio->sock_fp, curthread);
+	free(sfio, M_TEMP);
+}
+
+static int
+sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len)
+{
+	vm_page_t *pa = sfio->pa;
+	int npages = sfio->npages;
+	int nios, rv;
+
+	nios = 0;
 
 	VM_OBJECT_WLOCK(obj);
 	for (int i = 0; i < npages; i++)
@@ -2687,13 +2726,16 @@ sendfile_swapin(vm_object_t obj, vm_page
 		if (i == j)
 			continue;
 
-		rv = vm_pager_get_pages(obj, pa + i, min(a + 1, npages - i), 0);
+		refcount_acquire(&sfio->nios);
+		rv = vm_pager_get_pages_async(obj, pa + i,
+		    min(a + 1, npages - i), 0, &sf_io_done, sfio);
 
 		KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p",
 		    __func__, obj, pa[i]));
 
-		vm_page_xunbusy(pa[i]);
 		SFSTAT_INC(sf_iocnt);
+		nios++;
+
 		i += a;
 		for (j = i - a; a > 0 && j < npages; a--, j++)
 			KASSERT(pa[j] == vm_page_lookup(obj,
@@ -2702,14 +2744,9 @@ sendfile_swapin(vm_object_t obj, vm_page
 			    vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off)))));
 	}
 
-	for (int i = 0; i < npages; i++)
-		KASSERT((pa[i]->wire_count > 0 && vm_page_is_valid(pa[i],
-		    vmoff(i, off) & PAGE_MASK, xfsize(i, npages, off, len))),
-		    ("wrong page %p state off 0x%jx len 0x%jx",
-		    pa[i], (uintmax_t)vmoff(i, off),
-		    (uintmax_t)xfsize(i, npages, off, len)));
-
 	VM_OBJECT_WUNLOCK(obj);
+
+	return (nios);
 }
 
 static int
@@ -2905,9 +2942,10 @@ vn_sendfile(struct file *fp, int sockfd,
 	 * and takes care of the overall progress.
 	 */
 	for (off = offset; rem > 0; ) {
+		struct sf_io *sfio;
 		vm_page_t *pa;
 		struct mbuf *mtail;
-		int space, npages;
+		int nios, space, npages;
 
 		mtail = NULL;
 		/*
@@ -3002,17 +3040,22 @@ retry_space:
 			    (PAGE_SIZE - (off & PAGE_MASK)), PAGE_SIZE);
 		else
 			npages = howmany(space, PAGE_SIZE);
-		pa = malloc(npages * sizeof(vm_page_t), M_TEMP, mwait);
-		if (pa == NULL) {
+		sfio = malloc(sizeof(struct sf_io) +
+		    npages * sizeof(vm_page_t), M_TEMP, mwait);
+		if (sfio == NULL) {
 			error = merror;
 			goto done;
 		}
-		sendfile_swapin(obj, pa, npages, off, space);
+		refcount_init(&sfio->nios, 1);
+		sfio->npages = npages;
+
+		nios = sendfile_swapin(obj, sfio, off, space);
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
 		 * dumped into socket buffer.
 		 */
+		pa = sfio->pa;
 		for (int i = 0; i < npages; i++) {
 			struct mbuf *m0;
 
@@ -3065,6 +3108,10 @@ retry_space:
 			m0->m_data = (char *)sf_buf_kva(sf) +
 			    (vmoff(i, off) & PAGE_MASK);
 			m0->m_len = xfsize(i, npages, off, space);
+			m0->m_flags |= M_NOTREADY;
+
+			if (i == 0)
+				sfio->m = m0;
 
 			/* Append to mbuf chain. */
 			if (mtail != NULL)
@@ -3081,14 +3128,12 @@ retry_space:
 				sf_sync_ref(sfs);
 		}
 
-		/* Keep track of bytes processed. */
-		off += space;
-		rem -= space;
-
 		if (vp != NULL)
 			VOP_UNLOCK(vp, 0);
 
-		free(pa, M_TEMP);
+		/* Keep track of bytes processed. */
+		off += space;
+		rem -= space;
 
 		/* Prepend header, if any. */
 		if (hdrlen) {
@@ -3096,26 +3141,30 @@ retry_space:
 			m = mh;
 		}
 
-		if (error)
-			break;
+		if (error) {
+			free(sfio, M_TEMP);
+			goto done;
+		}
 
 		/* Add the buffer chain to the socket buffer. */
 		KASSERT(m_length(m, NULL) == space + hdrlen,
 		    ("%s: mlen %u space %d hdrlen %d",
 		    __func__, m_length(m, NULL), space, hdrlen));
 
-		SOCKBUF_LOCK(&so->so_snd);
-		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
-			error = EPIPE;
-			SOCKBUF_UNLOCK(&so->so_snd);
-			goto done;
-		}
-		SOCKBUF_UNLOCK(&so->so_snd);
 		CURVNET_SET(so->so_vnet);
-		/* Avoid error aliasing. */
-		serror = (*so->so_proto->pr_usrreqs->pru_send)
+		if (nios == 0) {
+			free(sfio, M_TEMP);
+			serror = (*so->so_proto->pr_usrreqs->pru_send)
 			    (so, 0, m, NULL, NULL, td);
+		} else {
+			sfio->sock_fp = sock_fp;
+			fhold(sock_fp);
+			serror = (*so->so_proto->pr_usrreqs->pru_send)
+			    (so, PRUS_NOTREADY, m, NULL, NULL, td);
+			sf_io_done(sfio);
+		}
 		CURVNET_RESTORE();
+
 		if (serror == 0) {
 			sbytes += space + hdrlen;
 			if (hdrlen)
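
A note on the last hunk above: sfio->nios starts at 1 (the syscall's own
reference, taken by refcount_init() in an earlier hunk), and the syscall
drops that reference via sf_io_done() only after pru_send(PRUS_NOTREADY)
has queued the mbufs, so the data cannot be marked ready before it is in
the socket buffer. A hedged sketch of that caller-side ordering, reusing
struct io_ctx and io_ctx_done() from the sketch in the log message above
(assumed to be in the same file; send_now() and send_not_ready() are
hypothetical stand-ins for the two pru_send() calls):

	/* Hypothetical transport calls, declared only for the sketch. */
	extern int send_now(void *conn);	/* immediate send */
	extern int send_not_ready(void *conn);	/* queue as not-ready */

	static int
	send_pages(struct io_ctx *ctx, int nios)
	{
		int error;

		if (nios == 0) {
			/* Every page was already valid: plain, immediate send. */
			error = send_now(ctx->arg);
			free(ctx);		/* no I/O holds a reference */
			return (error);
		}
		/* Queue the data as not-ready first... */
		error = send_not_ready(ctx->arg);
		/*
		 * ...then drop the caller's reference.  If all I/Os already
		 * completed, this call is the one that marks the data ready
		 * and triggers the actual transmission.
		 */
		io_ctx_done(ctx);
		return (error);
	}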


