Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 19 Jul 2014 10:27:25 +0400
From:      Gleb Smirnoff <glebius@FreeBSD.org>
To:        current@FreeBSD.org
Cc:        kib@FreeBSD.org
Subject:   [CFT/CFR] machine independent sf_bufs
Message-ID:  <20140719062725.GB85917@FreeBSD.org>

next in thread | raw e-mail | index | archive | help

--1yeeQ81UyVL57Vl7
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

  Hi!

  we've got a lot of common code in sys/*/*/vm_machdep.c wrt the
sf_buf allocation. I have gathered it into kern/subr_sfbuf.c.

o No MD code left in sys/*/*/vm_machdep.c.
o The arches that have a direct physical map have their implementation in
  machine/sf_buf.h
o The arches that need sf_bufs use subr_sfbuf.c, optionally having
  some stuff in machine/sf_buf.h

I can test only i386. I'd be grateful for testing:

arm
mips
mips64
sparc64
powerpc
i386 XEN

The test is a simple use of any application or test that uses sendfile(2).
The box shouldn't crash :) of course, and after end of a test there
should be no evidence of sf_buf leak (observed via netstat -m).

-- 
Totus tuus, Glebius.

--1yeeQ81UyVL57Vl7
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="sfbuf-mi.diff"

Index: sys/amd64/include/sf_buf.h
===================================================================
--- sys/amd64/include/sf_buf.h	(revision 268750)
+++ sys/amd64/include/sf_buf.h	(working copy)
@@ -29,10 +29,6 @@
 #ifndef _MACHINE_SF_BUF_H_
 #define _MACHINE_SF_BUF_H_
 
-#include <vm/vm.h>
-#include <vm/vm_param.h>
-#include <vm/vm_page.h>
-
 /*
  * On this machine, the only purpose for which sf_buf is used is to implement
  * an opaque pointer required by the machine-independent parts of the kernel.
@@ -39,21 +35,7 @@
  * That pointer references the vm_page that is "mapped" by the sf_buf.  The
  * actual mapping is provided by the direct virtual-to-physical mapping.  
  */
-struct sf_buf;
-
-static inline struct sf_buf *
-sf_buf_alloc(struct vm_page *m, int pri)
-{
-
-	return ((struct sf_buf *)m);
-}
-
-static inline void
-sf_buf_free(struct sf_buf *sf)
-{
-}
-
-static __inline vm_offset_t
+static inline vm_offset_t
 sf_buf_kva(struct sf_buf *sf)
 {
 
@@ -60,11 +42,10 @@ sf_buf_kva(struct sf_buf *sf)
 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS((vm_page_t)sf)));
 }
 
-static __inline vm_page_t
+static inline vm_page_t
 sf_buf_page(struct sf_buf *sf)
 {
 
 	return ((vm_page_t)sf);
 }
-
 #endif /* !_MACHINE_SF_BUF_H_ */
Index: sys/arm/arm/vm_machdep.c
===================================================================
--- sys/arm/arm/vm_machdep.c	(revision 268750)
+++ sys/arm/arm/vm_machdep.c	(working copy)
@@ -50,7 +50,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/socketvar.h>
-#include <sys/sf_buf.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
@@ -83,43 +82,7 @@ __FBSDID("$FreeBSD$");
 CTASSERT(sizeof(struct switchframe) == 24);
 CTASSERT(sizeof(struct trapframe) == 80);
 
-#ifndef NSFBUFS
-#define NSFBUFS		(512 + maxusers * 16)
-#endif
-
-static int nsfbufs;
-static int nsfbufspeak;
-static int nsfbufsused;
-
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
-    "Maximum number of sendfile(2) sf_bufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
-    "Number of sendfile(2) sf_bufs at peak usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
-    "Number of sendfile(2) sf_bufs in use");
-
-static void     sf_buf_init(void *arg);
-SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL);
-
-LIST_HEAD(sf_head, sf_buf);
-
 /*
- * A hash table of active sendfile(2) buffers
- */
-static struct sf_head *sf_buf_active;
-static u_long sf_buf_hashmask;
-
-#define SF_BUF_HASH(m)  (((m) - vm_page_array) & sf_buf_hashmask)
-
-static TAILQ_HEAD(, sf_buf) sf_buf_freelist;
-static u_int    sf_buf_alloc_want;
-
-/*
- * A lock used to synchronize access to the hash table and free list
- */
-static struct mtx sf_buf_lock;
-
-/*
  * Finish a fork operation, with process p2 nearly set up.
  * Copy and update the pcb, set up the stack so that the child
  * ready to run and return to user mode.
@@ -184,107 +147,7 @@ cpu_thread_swapout(struct thread *td)
 {
 }
 
-/*
- * Detatch mapped page and release resources back to the system.
- */
 void
-sf_buf_free(struct sf_buf *sf)
-{
-
-	 mtx_lock(&sf_buf_lock);
-	 sf->ref_count--;
-	 if (sf->ref_count == 0) {
-		 TAILQ_INSERT_TAIL(&sf_buf_freelist, sf, free_entry);
-		 nsfbufsused--;
-		 pmap_kremove(sf->kva);
-		 sf->m = NULL;
-		 LIST_REMOVE(sf, list_entry);
-		 if (sf_buf_alloc_want > 0)
-			 wakeup(&sf_buf_freelist);
-	 }
-	 mtx_unlock(&sf_buf_lock);
-}
-
-/*
- * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
- */
-static void
-sf_buf_init(void *arg)
-{
-	struct sf_buf *sf_bufs;
-	vm_offset_t sf_base;
-	int i;
-
-	nsfbufs = NSFBUFS;
-	TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
-		
-	sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask);
-	TAILQ_INIT(&sf_buf_freelist);
-	sf_base = kva_alloc(nsfbufs * PAGE_SIZE);
-	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
-	    M_NOWAIT | M_ZERO);
-	for (i = 0; i < nsfbufs; i++) {
-		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
-		TAILQ_INSERT_TAIL(&sf_buf_freelist, &sf_bufs[i], free_entry);
-	}
-	sf_buf_alloc_want = 0;
-	mtx_init(&sf_buf_lock, "sf_buf", NULL, MTX_DEF);
-}
-
-/*
- * Get an sf_buf from the freelist. Will block if none are available.
- */
-struct sf_buf *
-sf_buf_alloc(struct vm_page *m, int flags)
-{
-	struct sf_head *hash_list;
-	struct sf_buf *sf;
-	int error;
-
-	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
-	mtx_lock(&sf_buf_lock);
-	LIST_FOREACH(sf, hash_list, list_entry) {
-		if (sf->m == m) {
-			sf->ref_count++;
-			if (sf->ref_count == 1) {
-				TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
-				nsfbufsused++;
-				nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
-			}
-			goto done;
-		}
-	}
-	while ((sf = TAILQ_FIRST(&sf_buf_freelist)) == NULL) {
-		if (flags & SFB_NOWAIT)
-			goto done;
-		sf_buf_alloc_want++;
-		SFSTAT_INC(sf_allocwait);
-		error = msleep(&sf_buf_freelist, &sf_buf_lock,
-		    (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", 0);
-		sf_buf_alloc_want--;
-	
-
-		/*
-		 * If we got a signal, don't risk going back to sleep.
-		 */
-		if (error)
-			goto done;
-	}
-	TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
-	if (sf->m != NULL)
-		LIST_REMOVE(sf, list_entry);
-	LIST_INSERT_HEAD(hash_list, sf, list_entry);
-	sf->ref_count = 1;
-	sf->m = m;
-	nsfbufsused++;
-	nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
-	pmap_kenter(sf->kva, VM_PAGE_TO_PHYS(sf->m));
-done:
-	mtx_unlock(&sf_buf_lock);
-	return (sf);
-}
-
-void
 cpu_set_syscall_retval(struct thread *td, int error)
 {
 	struct trapframe *frame;
Index: sys/arm/include/sf_buf.h
===================================================================
--- sys/arm/include/sf_buf.h	(revision 268750)
+++ sys/arm/include/sf_buf.h	(working copy)
@@ -29,33 +29,18 @@
 #ifndef _MACHINE_SF_BUF_H_
 #define _MACHINE_SF_BUF_H_
 
-#include <sys/queue.h>
-
-struct vm_page;
-
-struct sf_buf {
-	LIST_ENTRY(sf_buf) list_entry;	/* list of buffers */
-	TAILQ_ENTRY(sf_buf) free_entry;	/* list of buffers */
-	struct		vm_page *m;	/* currently mapped page */
-	vm_offset_t	kva;		/* va of mapping */
-	int		ref_count;	/* usage of this mapping */
-};
-
-static __inline vm_offset_t
-sf_buf_kva(struct sf_buf *sf)
+static inline void
+sf_buf_map(struct sf_buf *sf, int flags)
 {
 
-	return (sf->kva);
+	pmap_kenter(sf->kva, VM_PAGE_TO_PHYS(sf->m));
 }
 
-static __inline struct vm_page *
-sf_buf_page(struct sf_buf *sf)
+static inline int
+sf_buf_unmap(struct sf_buf *sf)
 {
 
-	return (sf->m);
+	pmap_kremove(sf->kva);
+	return (1);
 }
-
-struct sf_buf *	sf_buf_alloc(struct vm_page *m, int flags);
-void sf_buf_free(struct sf_buf *sf);
-
 #endif /* !_MACHINE_SF_BUF_H_ */
Index: sys/arm/include/vmparam.h
===================================================================
--- sys/arm/include/vmparam.h	(revision 268750)
+++ sys/arm/include/vmparam.h	(working copy)
@@ -170,4 +170,7 @@ extern vm_offset_t vm_max_kernel_address;
 #define	VM_MAX_AUTOTUNE_MAXUSERS	384
 #endif
 
+#define	SFBUF
+#define	SFBUF_MAP
+
 #endif	/* _MACHINE_VMPARAM_H_ */
Index: sys/conf/files.arm
===================================================================
--- sys/conf/files.arm	(revision 268750)
+++ sys/conf/files.arm	(working copy)
@@ -77,6 +77,7 @@ font.h				optional	sc			\
 	clean	"font.h ${SC_DFLT_FONT}-8x14 ${SC_DFLT_FONT}-8x16 ${SC_DFLT_FONT}-8x8"
 kern/subr_busdma_bufalloc.c	standard
 kern/subr_dummy_vdso_tc.c	standard
+kern/subr_sfbuf.c		standard
 libkern/arm/aeabi_unwind.c	standard
 libkern/arm/divsi3.S		standard
 libkern/arm/ffs.S		standard
Index: sys/conf/files.i386
===================================================================
--- sys/conf/files.i386	(revision 268750)
+++ sys/conf/files.i386	(working copy)
@@ -520,6 +520,7 @@ isa/vga_isa.c			optional vga
 kern/kern_clocksource.c		standard
 kern/imgact_aout.c		optional compat_aout
 kern/imgact_gzip.c		optional gzip
+kern/subr_sfbuf.c		standard
 libkern/divdi3.c		standard
 libkern/flsll.c			standard
 libkern/memmove.c		standard
Index: sys/conf/files.mips
===================================================================
--- sys/conf/files.mips	(revision 268750)
+++ sys/conf/files.mips	(working copy)
@@ -51,6 +51,7 @@ mips/mips/vm_machdep.c			standard
 kern/kern_clocksource.c			standard
 kern/link_elf_obj.c			standard
 kern/subr_dummy_vdso_tc.c		standard
+kern/subr_sfbuf.c			optional	mips | mipsel | mipsn32
 
 # gcc/clang runtime
 libkern/ffsl.c				standard
Index: sys/conf/files.pc98
===================================================================
--- sys/conf/files.pc98	(revision 268750)
+++ sys/conf/files.pc98	(working copy)
@@ -205,6 +205,7 @@ i386/svr4/svr4_machdep.c	optional compat_svr4
 kern/kern_clocksource.c		standard
 kern/imgact_aout.c		optional compat_aout
 kern/imgact_gzip.c		optional gzip
+kern/subr_sfbuf.c		standard
 libkern/divdi3.c		standard
 libkern/flsll.c			standard
 libkern/memmove.c		standard
Index: sys/conf/files.powerpc
===================================================================
--- sys/conf/files.powerpc	(revision 268750)
+++ sys/conf/files.powerpc	(working copy)
@@ -71,6 +71,7 @@ dev/vt/hw/ofwfb/ofwfb.c		optional	vt aim
 kern/kern_clocksource.c		standard
 kern/subr_dummy_vdso_tc.c	standard
 kern/syscalls.c			optional	ktr
+kern/subr_sfbuf.c		standard
 libkern/ashldi3.c		optional	powerpc
 libkern/ashrdi3.c		optional	powerpc
 libkern/bcmp.c			standard
Index: sys/conf/files.sparc64
===================================================================
--- sys/conf/files.sparc64	(revision 268750)
+++ sys/conf/files.sparc64	(working copy)
@@ -63,6 +63,7 @@ dev/uart/uart_kbd_sun.c		optional	uart sc | vt
 kern/kern_clocksource.c		standard
 kern/subr_dummy_vdso_tc.c	standard
 kern/syscalls.c			optional	ktr
+kern/subr_sfbuf.c		standard
 libkern/ffs.c			standard
 libkern/ffsl.c			standard
 libkern/fls.c			standard
Index: sys/i386/i386/vm_machdep.c
===================================================================
--- sys/i386/i386/vm_machdep.c	(revision 268750)
+++ sys/i386/i386/vm_machdep.c	(working copy)
@@ -118,38 +118,6 @@ static u_int	cpu_reset_proxyid;
 static volatile u_int	cpu_reset_proxy_active;
 #endif
 
-static int nsfbufs;
-static int nsfbufspeak;
-static int nsfbufsused;
-
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
-    "Maximum number of sendfile(2) sf_bufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
-    "Number of sendfile(2) sf_bufs at peak usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
-    "Number of sendfile(2) sf_bufs in use");
-
-static void	sf_buf_init(void *arg);
-SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL);
-
-LIST_HEAD(sf_head, sf_buf);
-
-/*
- * A hash table of active sendfile(2) buffers
- */
-static struct sf_head *sf_buf_active;
-static u_long sf_buf_hashmask;
-
-#define	SF_BUF_HASH(m)	(((m) - vm_page_array) & sf_buf_hashmask)
-
-static TAILQ_HEAD(, sf_buf) sf_buf_freelist;
-static u_int	sf_buf_alloc_want;
-
-/*
- * A lock used to synchronize access to the hash table and free list
- */
-static struct mtx sf_buf_lock;
-
 extern int	_ucodesel, _udatasel;
 
 /*
@@ -750,122 +718,13 @@ cpu_reset_real()
 }
 
 /*
- * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
- */
-static void
-sf_buf_init(void *arg)
-{
-	struct sf_buf *sf_bufs;
-	vm_offset_t sf_base;
-	int i;
-
-	nsfbufs = NSFBUFS;
-	TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
-
-	sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask);
-	TAILQ_INIT(&sf_buf_freelist);
-	sf_base = kva_alloc(nsfbufs * PAGE_SIZE);
-	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
-	    M_NOWAIT | M_ZERO);
-	for (i = 0; i < nsfbufs; i++) {
-		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
-		TAILQ_INSERT_TAIL(&sf_buf_freelist, &sf_bufs[i], free_entry);
-	}
-	sf_buf_alloc_want = 0;
-	mtx_init(&sf_buf_lock, "sf_buf", NULL, MTX_DEF);
-}
-
-/*
- * Invalidate the cache lines that may belong to the page, if
- * (possibly old) mapping of the page by sf buffer exists.  Returns
- * TRUE when mapping was found and cache invalidated.
- */
-boolean_t
-sf_buf_invalidate_cache(vm_page_t m)
-{
-	struct sf_head *hash_list;
-	struct sf_buf *sf;
-	boolean_t ret;
-
-	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
-	ret = FALSE;
-	mtx_lock(&sf_buf_lock);
-	LIST_FOREACH(sf, hash_list, list_entry) {
-		if (sf->m == m) {
-			/*
-			 * Use pmap_qenter to update the pte for
-			 * existing mapping, in particular, the PAT
-			 * settings are recalculated.
-			 */
-			pmap_qenter(sf->kva, &m, 1);
-			pmap_invalidate_cache_range(sf->kva, sf->kva +
-			    PAGE_SIZE);
-			ret = TRUE;
-			break;
-		}
-	}
-	mtx_unlock(&sf_buf_lock);
-	return (ret);
-}
-
-/*
  * Get an sf_buf from the freelist.  May block if none are available.
  */
-struct sf_buf *
-sf_buf_alloc(struct vm_page *m, int flags)
+void
+sf_buf_map(struct sf_buf *sf, int flags)
 {
 	pt_entry_t opte, *ptep;
-	struct sf_head *hash_list;
-	struct sf_buf *sf;
-#ifdef SMP
-	cpuset_t other_cpus;
-	u_int cpuid;
-#endif
-	int error;
 
-	KASSERT(curthread->td_pinned > 0 || (flags & SFB_CPUPRIVATE) == 0,
-	    ("sf_buf_alloc(SFB_CPUPRIVATE): curthread not pinned"));
-	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
-	mtx_lock(&sf_buf_lock);
-	LIST_FOREACH(sf, hash_list, list_entry) {
-		if (sf->m == m) {
-			sf->ref_count++;
-			if (sf->ref_count == 1) {
-				TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
-				nsfbufsused++;
-				nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
-			}
-#ifdef SMP
-			goto shootdown;	
-#else
-			goto done;
-#endif
-		}
-	}
-	while ((sf = TAILQ_FIRST(&sf_buf_freelist)) == NULL) {
-		if (flags & SFB_NOWAIT)
-			goto done;
-		sf_buf_alloc_want++;
-		SFSTAT_INC(sf_allocwait);
-		error = msleep(&sf_buf_freelist, &sf_buf_lock,
-		    (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", 0);
-		sf_buf_alloc_want--;
-
-		/*
-		 * If we got a signal, don't risk going back to sleep. 
-		 */
-		if (error)
-			goto done;
-	}
-	TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
-	if (sf->m != NULL)
-		LIST_REMOVE(sf, list_entry);
-	LIST_INSERT_HEAD(hash_list, sf, list_entry);
-	sf->ref_count = 1;
-	sf->m = m;
-	nsfbufsused++;
-	nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
-
 	/*
 	 * Update the sf_buf's virtual-to-physical mapping, flushing the
 	 * virtual address from the TLB.  Since the reference count for 
@@ -876,11 +735,11 @@ cpu_reset_real()
 	ptep = vtopte(sf->kva);
 	opte = *ptep;
 #ifdef XEN
-       PT_SET_MA(sf->kva, xpmap_ptom(VM_PAGE_TO_PHYS(m)) | pgeflag
-	   | PG_RW | PG_V | pmap_cache_bits(m->md.pat_mode, 0));
+       PT_SET_MA(sf->kva, xpmap_ptom(VM_PAGE_TO_PHYS(sf->m)) | pgeflag
+	   | PG_RW | PG_V | pmap_cache_bits(sf->m->md.pat_mode, 0));
 #else
-	*ptep = VM_PAGE_TO_PHYS(m) | pgeflag | PG_RW | PG_V |
-	    pmap_cache_bits(m->md.pat_mode, 0);
+	*ptep = VM_PAGE_TO_PHYS(sf->m) | pgeflag | PG_RW | PG_V |
+	    pmap_cache_bits(sf->m->md.pat_mode, 0);
 #endif
 
 	/*
@@ -892,7 +751,21 @@ cpu_reset_real()
 #ifdef SMP
 	if ((opte & (PG_V | PG_A)) ==  (PG_V | PG_A))
 		CPU_ZERO(&sf->cpumask);
-shootdown:
+
+	sf_buf_shootdown(sf, flags);
+#else
+	if ((opte & (PG_V | PG_A)) ==  (PG_V | PG_A))
+		pmap_invalidate_page(kernel_pmap, sf->kva);
+#endif
+}
+
+#ifdef SMP
+void
+sf_buf_shootdown(struct sf_buf *sf, int flags)
+{
+	cpuset_t other_cpus;
+	u_int cpuid;
+
 	sched_pin();
 	cpuid = PCPU_GET(cpuid);
 	if (!CPU_ISSET(cpuid, &sf->cpumask)) {
@@ -909,42 +782,50 @@ cpu_reset_real()
 		}
 	}
 	sched_unpin();
+}
+#endif
+
+/*
+ * MD part of sf_buf_free().
+ */
+int
+sf_buf_unmap(struct sf_buf *sf)
+{
+#ifdef XEN
+	/*
+	 * Xen doesn't like having dangling R/W mappings
+	 */
+	pmap_qremove(sf->kva, 1);
+	return (1);
 #else
-	if ((opte & (PG_V | PG_A)) ==  (PG_V | PG_A))
-		pmap_invalidate_page(kernel_pmap, sf->kva);
+	return (0);
 #endif
-done:
-	mtx_unlock(&sf_buf_lock);
-	return (sf);
 }
 
+static void
+sf_buf_invalidate(struct sf_buf *sf)
+{
+	vm_page_t m = sf->m;
+
+	/*
+	 * Use pmap_qenter to update the pte for
+	 * existing mapping, in particular, the PAT
+	 * settings are recalculated.
+	 */
+	pmap_qenter(sf->kva, &m, 1);
+	pmap_invalidate_cache_range(sf->kva, sf->kva + PAGE_SIZE);
+}
+
 /*
- * Remove a reference from the given sf_buf, adding it to the free
- * list when its reference count reaches zero.  A freed sf_buf still,
- * however, retains its virtual-to-physical mapping until it is
- * recycled or reactivated by sf_buf_alloc(9).
+ * Invalidate the cache lines that may belong to the page, if
+ * (possibly old) mapping of the page by sf buffer exists.  Returns
+ * TRUE when mapping was found and cache invalidated.
  */
-void
-sf_buf_free(struct sf_buf *sf)
+boolean_t
+sf_buf_invalidate_cache(vm_page_t m)
 {
 
-	mtx_lock(&sf_buf_lock);
-	sf->ref_count--;
-	if (sf->ref_count == 0) {
-		TAILQ_INSERT_TAIL(&sf_buf_freelist, sf, free_entry);
-		nsfbufsused--;
-#ifdef XEN
-/*
- * Xen doesn't like having dangling R/W mappings
- */
-		pmap_qremove(sf->kva, 1);
-		sf->m = NULL;
-		LIST_REMOVE(sf, list_entry);
-#endif
-		if (sf_buf_alloc_want > 0)
-			wakeup(&sf_buf_freelist);
-	}
-	mtx_unlock(&sf_buf_lock);
+	return (sf_buf_process_page(m, sf_buf_invalidate));
 }
 
 /*
Index: sys/i386/include/sf_buf.h
===================================================================
--- sys/i386/include/sf_buf.h	(revision 268750)
+++ sys/i386/include/sf_buf.h	(working copy)
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2003, 2005 Alan L. Cox <alc@cs.rice.edu>
+ * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,39 +29,8 @@
 #ifndef _MACHINE_SF_BUF_H_
 #define _MACHINE_SF_BUF_H_
 
-#include <sys/_cpuset.h>
-#include <sys/queue.h>
+void sf_buf_map(struct sf_buf *, int);
+int sf_buf_unmap(struct sf_buf *);
+boolean_t sf_buf_invalidate_cache(vm_page_t);
 
-struct vm_page;
-
-struct sf_buf {
-	LIST_ENTRY(sf_buf) list_entry;	/* list of buffers */
-	TAILQ_ENTRY(sf_buf) free_entry;	/* list of buffers */
-	struct		vm_page *m;	/* currently mapped page */
-	vm_offset_t	kva;		/* va of mapping */
-	int		ref_count;	/* usage of this mapping */
-#ifdef SMP
-	cpuset_t	cpumask;	/* cpus on which mapping is valid */
-#endif
-};
-
-struct sf_buf * sf_buf_alloc(struct vm_page *m, int flags);
-void sf_buf_free(struct sf_buf *sf);
-
-static __inline vm_offset_t
-sf_buf_kva(struct sf_buf *sf)
-{
-
-	return (sf->kva);
-}
-
-static __inline struct vm_page *
-sf_buf_page(struct sf_buf *sf)
-{
-
-	return (sf->m);
-}
-
-boolean_t sf_buf_invalidate_cache(vm_page_t m);
-
 #endif /* !_MACHINE_SF_BUF_H_ */
Index: sys/i386/include/vmparam.h
===================================================================
--- sys/i386/include/vmparam.h	(revision 268750)
+++ sys/i386/include/vmparam.h	(working copy)
@@ -198,4 +198,9 @@
 #define VM_MAX_AUTOTUNE_MAXUSERS 384
 #endif
 
+#define	SFBUF
+#define	SFBUF_MAP
+#define	SFBUF_CPUSET
+#define	SFBUF_PROCESS_PAGE
+
 #endif /* _MACHINE_VMPARAM_H_ */
Index: sys/kern/subr_sfbuf.c
===================================================================
--- sys/kern/subr_sfbuf.c	(revision 0)
+++ sys/kern/subr_sfbuf.c	(working copy)
@@ -0,0 +1,226 @@
+/*-
+ * Copyright (c) 2014 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2003, 2005 Alan L. Cox <alc@cs.rice.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sf_buf.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_page.h>
+
+#ifndef NSFBUFS
+#define	NSFBUFS		(512 + maxusers * 16)
+#endif
+
+static int nsfbufs;
+static int nsfbufspeak;
+static int nsfbufsused;
+
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
+    "Maximum number of sendfile(2) sf_bufs available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
+    "Number of sendfile(2) sf_bufs at peak usage");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
+    "Number of sendfile(2) sf_bufs in use");
+
+static void	sf_buf_init(void *arg);
+SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL);
+
+LIST_HEAD(sf_head, sf_buf);
+
+/*
+ * A hash table of active sendfile(2) buffers
+ */
+static struct sf_head *sf_buf_active;
+static u_long sf_buf_hashmask;
+
+#define	SF_BUF_HASH(m)	(((m) - vm_page_array) & sf_buf_hashmask)
+
+static TAILQ_HEAD(, sf_buf) sf_buf_freelist;
+static u_int	sf_buf_alloc_want;
+
+/*
+ * A lock used to synchronize access to the hash table and free list
+ */
+static struct mtx sf_buf_lock;
+
+/*
+ * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
+ */
+static void
+sf_buf_init(void *arg)
+{
+	struct sf_buf *sf_bufs;
+	vm_offset_t sf_base;
+	int i;
+
+#ifdef SFBUF_OPTIONAL_DIRECT_MAP
+	if (SFBUF_OPTIONAL_DIRECT_MAP)
+		return;
+#endif
+
+	nsfbufs = NSFBUFS;
+	TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
+
+	sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask);
+	TAILQ_INIT(&sf_buf_freelist);
+	sf_base = kva_alloc(nsfbufs * PAGE_SIZE);
+	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
+	    M_NOWAIT | M_ZERO);
+	KASSERT(sf_bufs, ("%s: malloc failure", __func__));
+	for (i = 0; i < nsfbufs; i++) {
+		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
+		TAILQ_INSERT_TAIL(&sf_buf_freelist, &sf_bufs[i], free_entry);
+	}
+	sf_buf_alloc_want = 0;
+	mtx_init(&sf_buf_lock, "sf_buf", NULL, MTX_DEF);
+}
+
+/*
+ * Get an sf_buf from the freelist.  May block if none are available.
+ */
+struct sf_buf *
+sf_buf_alloc(struct vm_page *m, int flags)
+{
+	struct sf_head *hash_list;
+	struct sf_buf *sf;
+	int error;
+
+#ifdef SFBUF_OPTIONAL_DIRECT_MAP
+	if (SFBUF_OPTIONAL_DIRECT_MAP)
+		return ((struct sf_buf *)m);
+#endif
+
+	KASSERT(curthread->td_pinned > 0 || (flags & SFB_CPUPRIVATE) == 0,
+	    ("sf_buf_alloc(SFB_CPUPRIVATE): curthread not pinned"));
+	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
+	mtx_lock(&sf_buf_lock);
+	LIST_FOREACH(sf, hash_list, list_entry) {
+		if (sf->m == m) {
+			sf->ref_count++;
+			if (sf->ref_count == 1) {
+				TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
+				nsfbufsused++;
+				nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
+			}
+#if defined(SMP) && defined(SFBUF_CPUSET)
+			sf_buf_shootdown(sf, flags);
+#endif
+			goto done;
+		}
+	}
+	while ((sf = TAILQ_FIRST(&sf_buf_freelist)) == NULL) {
+		if (flags & SFB_NOWAIT)
+			goto done;
+		sf_buf_alloc_want++;
+		SFSTAT_INC(sf_allocwait);
+		error = msleep(&sf_buf_freelist, &sf_buf_lock,
+		    (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", 0);
+		sf_buf_alloc_want--;
+
+		/*
+		 * If we got a signal, don't risk going back to sleep. 
+		 */
+		if (error)
+			goto done;
+	}
+	TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
+	if (sf->m != NULL)
+		LIST_REMOVE(sf, list_entry);
+	LIST_INSERT_HEAD(hash_list, sf, list_entry);
+	sf->ref_count = 1;
+	sf->m = m;
+	nsfbufsused++;
+	nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
+	sf_buf_map(sf, flags);
+done:
+	mtx_unlock(&sf_buf_lock);
+	return (sf);
+}
+
+/*
+ * Remove a reference from the given sf_buf, adding it to the free
+ * list when its reference count reaches zero.  A freed sf_buf still,
+ * however, retains its virtual-to-physical mapping until it is
+ * recycled or reactivated by sf_buf_alloc(9).
+ */
+void
+sf_buf_free(struct sf_buf *sf)
+{
+
+#ifdef SFBUF_OPTIONAL_DIRECT_MAP
+	if (SFBUF_OPTIONAL_DIRECT_MAP)
+		return;
+#endif
+
+	mtx_lock(&sf_buf_lock);
+	sf->ref_count--;
+	if (sf->ref_count == 0) {
+		TAILQ_INSERT_TAIL(&sf_buf_freelist, sf, free_entry);
+		nsfbufsused--;
+		if (sf_buf_unmap(sf)) {
+			sf->m = NULL;
+			LIST_REMOVE(sf, list_entry);
+		}
+		if (sf_buf_alloc_want > 0)
+			wakeup(&sf_buf_freelist);
+	}
+	mtx_unlock(&sf_buf_lock);
+}
+
+#ifdef SFBUF_PROCESS_PAGE
+/*
+ * Run callback function on sf_buf that holds a certain page.
+ */
+boolean_t
+sf_buf_process_page(vm_page_t m, void (*cb)(struct sf_buf *))
+{
+	struct sf_head *hash_list;
+	struct sf_buf *sf;
+
+	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
+	mtx_lock(&sf_buf_lock);
+	LIST_FOREACH(sf, hash_list, list_entry) {
+		if (sf->m == m) {
+			cb(sf);
+			mtx_unlock(&sf_buf_lock);
+			return (TRUE);
+		}
+	}
+	mtx_unlock(&sf_buf_lock);
+	return (FALSE);
+}
+#endif	/* SFBUF_PROCESS_PAGE */

Property changes on: sys/kern/subr_sfbuf.c
___________________________________________________________________
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: sys/mips/include/sf_buf.h
===================================================================
--- sys/mips/include/sf_buf.h	(revision 268750)
+++ sys/mips/include/sf_buf.h	(working copy)
@@ -29,31 +29,9 @@
 #ifndef _MACHINE_SF_BUF_H_
 #define _MACHINE_SF_BUF_H_
 
-#ifdef __mips_n64
-#include <vm/vm.h>
-#include <vm/vm_param.h>
-#include <vm/vm_page.h>
-#else
-#include <sys/queue.h>
-#endif
+#ifdef __mips_n64	/* In 64 bit the whole memory is directly mapped */
 
-#ifdef __mips_n64
-/* In 64 bit the whole memory is directly mapped */
-struct	sf_buf;
-
-static inline struct sf_buf *
-sf_buf_alloc(struct vm_page *m, int pri)
-{
-
-	return ((struct sf_buf *)m);
-}
-
-static inline void
-sf_buf_free(struct sf_buf *sf)
-{
-}
-
-static __inline vm_offset_t
+static inline vm_offset_t
 sf_buf_kva(struct sf_buf *sf)
 {
 	vm_page_t	m;
@@ -62,7 +40,7 @@ sf_buf_kva(struct sf_buf *sf)
 	return (MIPS_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)));
 }
 
-static __inline struct vm_page *
+static inline struct vm_page *
 sf_buf_page(struct sf_buf *sf)
 {
 
@@ -69,31 +47,5 @@ sf_buf_page(struct sf_buf *sf)
 	return ((vm_page_t)sf);
 }
 
-#else /* ! __mips_n64 */
-struct vm_page;
-
-struct sf_buf {
-	SLIST_ENTRY(sf_buf) free_list;	/* list of free buffer slots */
-	struct		vm_page *m;	/* currently mapped page */
-	vm_offset_t	kva;		/* va of mapping */
-};
-
-struct sf_buf * sf_buf_alloc(struct vm_page *m, int flags);
-void sf_buf_free(struct sf_buf *sf);
-
-static __inline vm_offset_t
-sf_buf_kva(struct sf_buf *sf)
-{
-
-	return (sf->kva);
-}
-
-static __inline struct vm_page *
-sf_buf_page(struct sf_buf *sf)
-{
-
-	return (sf->m);
-}
 #endif /* __mips_n64 */
-
 #endif /* !_MACHINE_SF_BUF_H_ */
Index: sys/mips/include/vmparam.h
===================================================================
--- sys/mips/include/vmparam.h	(revision 268750)
+++ sys/mips/include/vmparam.h	(working copy)
@@ -187,4 +187,8 @@
 
 #define	ZERO_REGION_SIZE	(64 * 1024)	/* 64KB */
 
+#ifndef __mips_n64
+#define	SFBUF
+#endif
+
 #endif /* !_MACHINE_VMPARAM_H_ */
Index: sys/mips/mips/vm_machdep.c
===================================================================
--- sys/mips/mips/vm_machdep.c	(revision 268750)
+++ sys/mips/mips/vm_machdep.c	(working copy)
@@ -76,9 +76,6 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/user.h>
 #include <sys/mbuf.h>
-#ifndef __mips_n64
-#include <sys/sf_buf.h>
-#endif
 
 /* Duplicated from asm.h */
 #if defined(__mips_o32)
@@ -92,39 +89,7 @@ __FBSDID("$FreeBSD$");
 #define	CALLFRAME_SIZ	(SZREG * 4)
 #endif
 
-#ifndef __mips_n64
-
-#ifndef NSFBUFS
-#define	NSFBUFS		(512 + maxusers * 16)
-#endif
-
-static int nsfbufs;
-static int nsfbufspeak;
-static int nsfbufsused;
-
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
-    "Maximum number of sendfile(2) sf_bufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
-    "Number of sendfile(2) sf_bufs at peak usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
-    "Number of sendfile(2) sf_bufs in use");
-
-static void	sf_buf_init(void *arg);
-SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL);
-
 /*
- * Expanded sf_freelist head.  Really an SLIST_HEAD() in disguise, with the
- * sf_freelist head with the sf_lock mutex.
- */
-static struct {
-	SLIST_HEAD(, sf_buf) sf_head;
-	struct mtx sf_lock;
-} sf_freelist;
-
-static u_int	sf_buf_alloc_want;
-#endif /* !__mips_n64 */
-
-/*
  * Finish a fork operation, with process p2 nearly set up.
  * Copy and update the pcb, set up the stack so that the child
  * ready to run and return to user mode.
@@ -513,84 +478,6 @@ cpu_set_upcall_kse(struct thread *td, void (*entry
 #define	ZIDLE_HI(v)	((v) * 4 / 5)
 
 /*
- * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
- */
-#ifndef __mips_n64
-static void
-sf_buf_init(void *arg)
-{
-	struct sf_buf *sf_bufs;
-	vm_offset_t sf_base;
-	int i;
-
-	nsfbufs = NSFBUFS;
-	TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
-
-	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
-	SLIST_INIT(&sf_freelist.sf_head);
-	sf_base = kva_alloc(nsfbufs * PAGE_SIZE);
-	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
-	    M_NOWAIT | M_ZERO);
-	for (i = 0; i < nsfbufs; i++) {
-		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
-		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
-	}
-	sf_buf_alloc_want = 0;
-}
-
-/*
- * Get an sf_buf from the freelist.  Will block if none are available.
- */
-struct sf_buf *
-sf_buf_alloc(struct vm_page *m, int flags)
-{
-	struct sf_buf *sf;
-	int error;
-
-	mtx_lock(&sf_freelist.sf_lock);
-	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
-		if (flags & SFB_NOWAIT)
-			break;
-		sf_buf_alloc_want++;
-		SFSTAT_INC(sf_allocwait);
-		error = msleep(&sf_freelist, &sf_freelist.sf_lock,
-		    (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", 0);
-		sf_buf_alloc_want--;
-
-		/*
-		 * If we got a signal, don't risk going back to sleep.
-		 */
-		if (error)
-			break;
-	}
-	if (sf != NULL) {
-		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
-		sf->m = m;
-		nsfbufsused++;
-		nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
-		pmap_qenter(sf->kva, &sf->m, 1);
-	}
-	mtx_unlock(&sf_freelist.sf_lock);
-	return (sf);
-}
-
-/*
- * Release resources back to the system.
- */
-void
-sf_buf_free(struct sf_buf *sf)
-{
-	pmap_qremove(sf->kva, 1);
-	mtx_lock(&sf_freelist.sf_lock);
-	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
-	nsfbufsused--;
-	if (sf_buf_alloc_want > 0)
-		wakeup(&sf_freelist);
-	mtx_unlock(&sf_freelist.sf_lock);
-}
-#endif	/* !__mips_n64 */
-
-/*
  * Software interrupt handler for queued VM system processing.
  */
 void
Index: sys/powerpc/include/sf_buf.h
===================================================================
--- sys/powerpc/include/sf_buf.h	(revision 268750)
+++ sys/powerpc/include/sf_buf.h	(working copy)
@@ -1,80 +0,0 @@
-/*-
- * Copyright (c) 2003 Alan L. Cox <alc@cs.rice.edu>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _MACHINE_SF_BUF_H_
-#define _MACHINE_SF_BUF_H_
-
-#include <vm/vm.h>
-#include <vm/vm_param.h>
-#include <vm/vm_page.h>
-#include <machine/md_var.h>
-#include <sys/queue.h>
-
-struct vm_page;
-
-struct sf_buf {
-	LIST_ENTRY(sf_buf) list_entry;	/* list of buffers */
-	TAILQ_ENTRY(sf_buf) free_entry;	/* list of buffers */
-	struct		vm_page *m;	/* currently mapped page */
-	vm_offset_t	kva;		/* va of mapping */
-	int		ref_count;	/* usage of this mapping */
-};
-
-struct sf_buf * sf_buf_alloc(struct vm_page *m, int flags);
-void sf_buf_free(struct sf_buf *sf);
-
-/*
- * On 32-bit OEA, the only purpose for which sf_buf is used is to implement
- * an opaque pointer required by the machine-independent parts of the kernel.
- * That pointer references the vm_page that is "mapped" by the sf_buf.  The
- * actual mapping is provided by the direct virtual-to-physical mapping.  
- *
- * On OEA64 and Book-E, we need to do something a little more complicated. Use
- * the runtime-detected hw_direct_map to pick between the two cases. Our
- * friends in vm_machdep.c will do the same to ensure nothing gets confused.
- */
-
-static __inline vm_offset_t
-sf_buf_kva(struct sf_buf *sf)
-{
-	if (hw_direct_map)
-		return (VM_PAGE_TO_PHYS((vm_page_t)sf));
-
-	return (sf->kva);
-}
-
-static __inline struct vm_page *
-sf_buf_page(struct sf_buf *sf)
-{
-	if (hw_direct_map)
-		return ((vm_page_t)sf);
-
-	return (sf->m);
-}
-
-#endif /* !_MACHINE_SF_BUF_H_ */
Index: sys/powerpc/include/vmparam.h
===================================================================
--- sys/powerpc/include/vmparam.h	(revision 268750)
+++ sys/powerpc/include/vmparam.h	(working copy)
@@ -197,4 +197,18 @@ struct pmap_physseg {
 
 #define	ZERO_REGION_SIZE	(64 * 1024)	/* 64KB */
 
+/*
+ * On 32-bit OEA, the only purpose for which sf_buf is used is to implement
+ * an opaque pointer required by the machine-independent parts of the kernel.
+ * That pointer references the vm_page that is "mapped" by the sf_buf.  The
+ * actual mapping is provided by the direct virtual-to-physical mapping.
+ *
+ * On OEA64 and Book-E, we need to do something a little more complicated. Use
+ * the runtime-detected hw_direct_map to pick between the two cases. Our
+ * friends in vm_machdep.c will do the same to ensure nothing gets confused.
+ */
+#define	SFBUF
+#define	SFBUF_NOMD
+#define	SFBUF_OPTIONAL_DIRECT_MAP	hw_direct_map
+ 
 #endif /* _MACHINE_VMPARAM_H_ */
Index: sys/powerpc/powerpc/vm_machdep.c
===================================================================
--- sys/powerpc/powerpc/vm_machdep.c	(revision 268750)
+++ sys/powerpc/powerpc/vm_machdep.c	(working copy)
@@ -80,7 +80,6 @@
 #include <sys/vmmeter.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
-#include <sys/sf_buf.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/unistd.h>
@@ -100,47 +99,6 @@
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 
-/*
- * On systems without a direct mapped region (e.g. PPC64),
- * we use the same code as the Book E implementation. Since
- * we need to have runtime detection of this, define some machinery
- * for sf_bufs in this case, and ignore it on systems with direct maps.
- */
-
-#ifndef NSFBUFS
-#define NSFBUFS		(512 + maxusers * 16)
-#endif
-
-static int nsfbufs;
-static int nsfbufspeak;
-static int nsfbufsused;
-
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
-    "Maximum number of sendfile(2) sf_bufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
-    "Number of sendfile(2) sf_bufs at peak usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
-    "Number of sendfile(2) sf_bufs in use");
-
-static void sf_buf_init(void *arg);
-SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL);
- 
-LIST_HEAD(sf_head, sf_buf);
- 
-/* A hash table of active sendfile(2) buffers */
-static struct sf_head *sf_buf_active;
-static u_long sf_buf_hashmask;
-
-#define SF_BUF_HASH(m)  (((m) - vm_page_array) & sf_buf_hashmask)
-
-static TAILQ_HEAD(, sf_buf) sf_buf_freelist;
-static u_int sf_buf_alloc_want;
-
-/*
- * A lock used to synchronize access to the hash table and free list
- */
-static struct mtx sf_buf_lock;
-
 #ifdef __powerpc64__
 extern uintptr_t tocbase;
 #endif
@@ -245,124 +203,6 @@ cpu_exit(struct thread *td)
 }
 
 /*
- * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
- */
-static void
-sf_buf_init(void *arg)
-{
-	struct sf_buf *sf_bufs;
-	vm_offset_t sf_base;
-	int i;
-
-	/* Don't bother on systems with a direct map */
-	if (hw_direct_map)
-		return;
-
-	nsfbufs = NSFBUFS;
-	TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
-
-	sf_buf_active = hashinit(nsfbufs, M_TEMP, &sf_buf_hashmask);
-	TAILQ_INIT(&sf_buf_freelist);
-	sf_base = kva_alloc(nsfbufs * PAGE_SIZE);
-	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
-	    M_NOWAIT | M_ZERO);
-
-	for (i = 0; i < nsfbufs; i++) {
-		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
-		TAILQ_INSERT_TAIL(&sf_buf_freelist, &sf_bufs[i], free_entry);
-	}
-	sf_buf_alloc_want = 0;
-	mtx_init(&sf_buf_lock, "sf_buf", NULL, MTX_DEF);
-}
-
-/*
- * Get an sf_buf from the freelist. Will block if none are available.
- */
-struct sf_buf *
-sf_buf_alloc(struct vm_page *m, int flags)
-{
-	struct sf_head *hash_list;
-	struct sf_buf *sf;
-	int error;
-
-	if (hw_direct_map) {
-		/* Shortcut the direct mapped case */
-		return ((struct sf_buf *)m);
-	}
-
-	hash_list = &sf_buf_active[SF_BUF_HASH(m)];
-	mtx_lock(&sf_buf_lock);
-	LIST_FOREACH(sf, hash_list, list_entry) {
-		if (sf->m == m) {
-			sf->ref_count++;
-			if (sf->ref_count == 1) {
-				TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
-				nsfbufsused++;
-				nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
-			}
-			goto done;
-		}
-	}
-
-	while ((sf = TAILQ_FIRST(&sf_buf_freelist)) == NULL) {
-		if (flags & SFB_NOWAIT)
-			goto done;
-
-		sf_buf_alloc_want++;
-		SFSTAT_INC(sf_allocwait);
-		error = msleep(&sf_buf_freelist, &sf_buf_lock,
-		    (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", 0);
-		sf_buf_alloc_want--;
-
-		/*
-		 * If we got a signal, don't risk going back to sleep.
-		 */
-		if (error)
-			goto done;
-	}
-
-	TAILQ_REMOVE(&sf_buf_freelist, sf, free_entry);
-	if (sf->m != NULL)
-		LIST_REMOVE(sf, list_entry);
-
-	LIST_INSERT_HEAD(hash_list, sf, list_entry);
-	sf->ref_count = 1;
-	sf->m = m;
-	nsfbufsused++;
-	nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
-	pmap_qenter(sf->kva, &sf->m, 1);
-done:
-	mtx_unlock(&sf_buf_lock);
-	return (sf);
-}
-
-/*
- * Detach mapped page and release resources back to the system.
- *
- * Remove a reference from the given sf_buf, adding it to the free
- * list when its reference count reaches zero. A freed sf_buf still,
- * however, retains its virtual-to-physical mapping until it is
- * recycled or reactivated by sf_buf_alloc(9).
- */
-void
-sf_buf_free(struct sf_buf *sf)
-{
-	if (hw_direct_map)
-		return;
-
-	mtx_lock(&sf_buf_lock);
-	sf->ref_count--;
-	if (sf->ref_count == 0) {
-		TAILQ_INSERT_TAIL(&sf_buf_freelist, sf, free_entry);
-		nsfbufsused--;
-
-		if (sf_buf_alloc_want > 0)
-			wakeup(&sf_buf_freelist);
-	}
-	mtx_unlock(&sf_buf_lock);
-}
-
-/*
  * Software interrupt handler for queued VM system processing.
  */
 void
Index: sys/sparc64/include/sf_buf.h
===================================================================
--- sys/sparc64/include/sf_buf.h	(revision 268750)
+++ sys/sparc64/include/sf_buf.h	(working copy)
@@ -1,59 +0,0 @@
-/*-
- * Copyright (c) 2003 Alan L. Cox <alc@cs.rice.edu>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _MACHINE_SF_BUF_H_
-#define _MACHINE_SF_BUF_H_
-
-#include <sys/queue.h>
-
-struct vm_page;
-
-struct sf_buf {
-	SLIST_ENTRY(sf_buf) free_list;	/* list of free buffer slots */
-	struct		vm_page *m;	/* currently mapped page */
-	vm_offset_t	kva;		/* va of mapping */
-};
-
-struct sf_buf * sf_buf_alloc(struct vm_page *m, int flags);
-void sf_buf_free(struct sf_buf *sf);
-
-static __inline vm_offset_t
-sf_buf_kva(struct sf_buf *sf)
-{
-
-	return (sf->kva);
-}
-
-static __inline struct vm_page *
-sf_buf_page(struct sf_buf *sf)
-{
-
-	return (sf->m);
-}
-
-#endif /* !_MACHINE_SF_BUF_H_ */
Index: sys/sparc64/include/vmparam.h
===================================================================
--- sys/sparc64/include/vmparam.h	(revision 268750)
+++ sys/sparc64/include/vmparam.h	(working copy)
@@ -239,4 +239,7 @@ extern vm_offset_t vm_max_kernel_address;
  */
 #define	ZERO_REGION_SIZE	PAGE_SIZE
 
+#define	SFBUF
+#define	SFBUF_NOMD
+
 #endif /* !_MACHINE_VMPARAM_H_ */
Index: sys/sparc64/sparc64/vm_machdep.c
===================================================================
--- sys/sparc64/sparc64/vm_machdep.c	(revision 268750)
+++ sys/sparc64/sparc64/vm_machdep.c	(working copy)
@@ -53,7 +53,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sysent.h>
-#include <sys/sf_buf.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/unistd.h>
@@ -84,35 +83,6 @@ __FBSDID("$FreeBSD$");
 #include <machine/tlb.h>
 #include <machine/tstate.h>
 
-#ifndef NSFBUFS
-#define	NSFBUFS		(512 + maxusers * 16)
-#endif
-
-static int nsfbufs;
-static int nsfbufspeak;
-static int nsfbufsused;
-
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
-    "Maximum number of sendfile(2) sf_bufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
-    "Number of sendfile(2) sf_bufs at peak usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
-    "Number of sendfile(2) sf_bufs in use");
-
-static void	sf_buf_init(void *arg);
-SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL);
-
-/*
- * Expanded sf_freelist head.  Really an SLIST_HEAD() in disguise, with the
- * sf_freelist head with the sf_lock mutex.
- */
-static struct {
-	SLIST_HEAD(, sf_buf) sf_head;
-	struct mtx sf_lock;
-} sf_freelist;
-
-static u_int	sf_buf_alloc_want;
-
 PMAP_STATS_VAR(uma_nsmall_alloc);
 PMAP_STATS_VAR(uma_nsmall_alloc_oc);
 PMAP_STATS_VAR(uma_nsmall_free);
@@ -417,84 +387,7 @@ is_physical_memory(vm_paddr_t addr)
 	return (0);
 }
 
-/*
- * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
- */
-static void
-sf_buf_init(void *arg)
-{
-	struct sf_buf *sf_bufs;
-	vm_offset_t sf_base;
-	int i;
-
-	nsfbufs = NSFBUFS;
-	TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
-
-	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
-	SLIST_INIT(&sf_freelist.sf_head);
-	sf_base = kva_alloc(nsfbufs * PAGE_SIZE);
-	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
-	    M_NOWAIT | M_ZERO);
-	for (i = 0; i < nsfbufs; i++) {
-		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
-		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
-	}
-	sf_buf_alloc_want = 0;
-}
-
-/*
- * Get an sf_buf from the freelist.  Will block if none are available.
- */
-struct sf_buf *
-sf_buf_alloc(struct vm_page *m, int flags)
-{
-	struct sf_buf *sf;
-	int error;
-
-	mtx_lock(&sf_freelist.sf_lock);
-	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
-		if (flags & SFB_NOWAIT)
-			break;
-		sf_buf_alloc_want++;
-		SFSTAT_INC(sf_allocwait);
-		error = msleep(&sf_freelist, &sf_freelist.sf_lock,
-		    (flags & SFB_CATCH) ? PCATCH | PVM : PVM, "sfbufa", 0);
-		sf_buf_alloc_want--;
-
-		/*
-		 * If we got a signal, don't risk going back to sleep.
-		 */
-		if (error)
-			break;
-	}
-	if (sf != NULL) {
-		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
-		sf->m = m;
-		nsfbufsused++;
-		nsfbufspeak = imax(nsfbufspeak, nsfbufsused);
-		pmap_qenter(sf->kva, &sf->m, 1);
-	}
-	mtx_unlock(&sf_freelist.sf_lock);
-	return (sf);
-}
-
-/*
- * Release resources back to the system.
- */
 void
-sf_buf_free(struct sf_buf *sf)
-{
-
-	pmap_qremove(sf->kva, 1);
-	mtx_lock(&sf_freelist.sf_lock);
-	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
-	nsfbufsused--;
-	if (sf_buf_alloc_want > 0)
-		wakeup(&sf_freelist);
-	mtx_unlock(&sf_freelist.sf_lock);
-}
-
-void
 swi_vm(void *v)
 {
 
Index: sys/sys/sf_buf.h
===================================================================
--- sys/sys/sf_buf.h	(revision 268750)
+++ sys/sys/sf_buf.h	(working copy)
@@ -29,6 +29,114 @@
 #ifndef _SYS_SF_BUF_H_
 #define _SYS_SF_BUF_H_
 
+struct sfstat {				/* sendfile statistics */
+	uint64_t	sf_iocnt;	/* times sendfile had to do disk I/O */
+	uint64_t	sf_allocfail;	/* times sfbuf allocation failed */
+	uint64_t	sf_allocwait;	/* times sfbuf allocation had to wait */
+};
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_page.h>
+
+#ifdef SFBUF
+#if defined(SMP) && defined(SFBUF_CPUSET)
+#include <sys/_cpuset.h>
+#endif
+#include <sys/queue.h>
+
+struct sf_buf {
+	LIST_ENTRY(sf_buf)	list_entry;	/* list of buffers */
+	TAILQ_ENTRY(sf_buf)	free_entry;	/* list of buffers */
+	vm_page_t		m;		/* currently mapped page */
+	vm_offset_t		kva;		/* va of mapping */
+	int			ref_count;	/* usage of this mapping */
+#if defined(SMP) && defined(SFBUF_CPUSET)
+	cpuset_t		cpumask;	/* where mapping is valid */
+#endif
+};
+#else /* ! SFBUF */
+struct sf_buf;
+#endif /* SFBUF */
+
+#ifndef SFBUF_NOMD
+#include <machine/sf_buf.h>
+#endif
+#ifdef SFBUF_OPTIONAL_DIRECT_MAP
+#include <machine/md_var.h>
+#endif
+
+#ifdef SFBUF
+struct sf_buf *sf_buf_alloc(struct vm_page *, int);
+void sf_buf_free(struct sf_buf *);
+
+static inline vm_offset_t
+sf_buf_kva(struct sf_buf *sf)
+{
+#ifdef SFBUF_OPTIONAL_DIRECT_MAP
+	if (SFBUF_OPTIONAL_DIRECT_MAP)
+		return (VM_PAGE_TO_PHYS((vm_page_t)sf));
+#endif
+
+        return (sf->kva);
+}
+
+static inline vm_page_t
+sf_buf_page(struct sf_buf *sf)
+{
+#ifdef SFBUF_OPTIONAL_DIRECT_MAP
+	if (SFBUF_OPTIONAL_DIRECT_MAP)
+		return ((vm_page_t)sf);
+#endif
+
+        return (sf->m);
+}
+
+#ifndef SFBUF_MAP
+#include <vm/pmap.h>
+
+static inline void
+sf_buf_map(struct sf_buf *sf, int flags)
+{
+
+	pmap_qenter(sf->kva, &sf->m, 1);
+}
+
+static inline int
+sf_buf_unmap(struct sf_buf *sf)
+{
+
+	return (0);
+}
+#endif /* SFBUF_MAP */
+
+#if defined(SMP) && defined(SFBUF_CPUSET)
+void sf_buf_shootdown(struct sf_buf *, int);
+#endif
+
+#ifdef SFBUF_PROCESS_PAGE
+boolean_t sf_buf_process_page(vm_page_t, void (*)(struct sf_buf *));
+#endif
+
+#else /* ! SFBUF */
+
+static inline struct sf_buf *
+sf_buf_alloc(struct vm_page *m, int pri)
+{
+
+	return ((struct sf_buf *)m);
+}
+
+static inline void
+sf_buf_free(struct sf_buf *sf)
+{
+}
+#endif /* SFBUF */
+
 /*
  * Options to sf_buf_alloc() are specified through its flags argument.  This
  * argument's value should be the result of a bitwise or'ing of one or more
@@ -40,19 +148,6 @@
 #define	SFB_DEFAULT	0
 #define	SFB_NOWAIT	4		/* Return NULL if all bufs are used. */
 
-struct vm_page;
-
-struct sfstat {				/* sendfile statistics */
-	uint64_t	sf_iocnt;	/* times sendfile had to do disk I/O */
-	uint64_t	sf_allocfail;	/* times sfbuf allocation failed */
-	uint64_t	sf_allocwait;	/* times sfbuf allocation had to wait */
-};
-
-#ifdef _KERNEL
-#include <machine/sf_buf.h>
-#include <sys/systm.h>
-#include <sys/counter.h>
-
 extern counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 #define	SFSTAT_ADD(name, val)	\
     counter_u64_add(sfstat[offsetof(struct sfstat, name) / sizeof(uint64_t)],\

--1yeeQ81UyVL57Vl7--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20140719062725.GB85917>