Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 27 Aug 2008 11:03:20 GMT
From:      Edward Tomasz Napierala <trasz@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 148598 for review
Message-ID:  <200808271103.m7RB3Kww063624@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=148598

Change 148598 by trasz@trasz_traszkan on 2008/08/27 11:02:34

	IFC.  This is the point 20080827-nfs4acls.diff was made.

Affected files ...

.. //depot/projects/soc2008/trasz_nfs4acl/lib/libc/include/libc_private.h#3 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/lib/libc/stdlib/Symbol.map#4 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/lib/libc/stdlib/malloc.3#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/lib/libc/stdlib/malloc.c#5 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/lib/libc/sys/execve.2#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/lib/libc/sys/wait.2#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/lib/libthr/thread/thr_exit.c#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sbin/ping6/ping6.8#3 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sbin/ping6/ping6.c#3 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/amd64/amd64/bpf_jit_machdep.c#4 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/boot/forth/loader.conf#8 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/conf/files.mips#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/conf/options#6 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/dev/ex/if_ex_pccard.c#3 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/dev/pccard/pccard_cis.c#3 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/dev/pccard/pccarddevs#5 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/dev/wi/if_wi_pccard.c#3 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/i386/cpufreq/est.c#4 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/i386/i386/bpf_jit_machdep.c#4 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/kern/imgact_shell.c#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/kern/kern_exit.c#4 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/net/bpf.h#4 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/net/bpf_jitter.c#3 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/powerpc/booke/locore.S#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/powerpc/booke/machdep.c#3 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/powerpc/booke/pmap.c#4 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/sys/sys/wait.h#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/tools/regression/acltools/fuzzer.sh#3 edit
.. //depot/projects/soc2008/trasz_nfs4acl/tools/regression/bpf/bpf_filter/Makefile#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/tools/regression/bpf/bpf_filter/bpf_test.c#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/tools/regression/bpf/bpf_filter/tests/test0001.h#2 integrate
.. //depot/projects/soc2008/trasz_nfs4acl/tools/regression/bpf/bpf_filter/tests/test0075.h#1 branch
.. //depot/projects/soc2008/trasz_nfs4acl/tools/regression/bpf/bpf_filter/tests/test0076.h#1 branch
.. //depot/projects/soc2008/trasz_nfs4acl/tools/regression/bpf/bpf_filter/tests/test0077.h#1 branch
.. //depot/projects/soc2008/trasz_nfs4acl/tools/regression/bpf/bpf_filter/tests/test0078.h#1 branch
.. //depot/projects/soc2008/trasz_nfs4acl/tools/regression/bpf/bpf_filter/tests/test0079.h#1 branch
.. //depot/projects/soc2008/trasz_nfs4acl/usr.bin/netstat/inet.c#3 integrate

Differences ...

==== //depot/projects/soc2008/trasz_nfs4acl/lib/libc/include/libc_private.h#3 (text+ko) ====

@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/lib/libc/include/libc_private.h,v 1.19 2008/06/23 05:22:06 ed Exp $
+ * $FreeBSD: src/lib/libc/include/libc_private.h,v 1.20 2008/08/27 02:00:53 jasone Exp $
  *
  * Private definitions for libc, libc_r and libpthread.
  *
@@ -158,6 +158,12 @@
 extern const char *__progname;
 
 /*
+ * This function is used by the threading libraries to notify malloc that a
+ * thread is exiting.
+ */
+void _malloc_thread_cleanup(void);
+
+/*
  * These functions are used by the threading libraries in order to protect
  * malloc across fork().
  */

==== //depot/projects/soc2008/trasz_nfs4acl/lib/libc/stdlib/Symbol.map#4 (text) ====

@@ -1,5 +1,5 @@
 /*
- * $FreeBSD: src/lib/libc/stdlib/Symbol.map,v 1.8 2008/08/20 08:31:58 ed Exp $
+ * $FreeBSD: src/lib/libc/stdlib/Symbol.map,v 1.9 2008/08/27 02:00:53 jasone Exp $
  */
 
 FBSD_1.0 {
@@ -93,6 +93,7 @@
 };
 
 FBSDprivate_1.0 {
+	_malloc_thread_cleanup;
 	_malloc_prefork;
 	_malloc_postfork;
 	__system;

==== //depot/projects/soc2008/trasz_nfs4acl/lib/libc/stdlib/malloc.3#2 (text+ko) ====

@@ -30,9 +30,9 @@
 .\" SUCH DAMAGE.
 .\"
 .\"     @(#)malloc.3	8.1 (Berkeley) 6/4/93
-.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.78 2008/02/17 17:09:24 jasone Exp $
+.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.79 2008/08/27 02:00:53 jasone Exp $
 .\"
-.Dd February 17, 2008
+.Dd August 26, 2008
 .Dt MALLOC 3
 .Os
 .Sh NAME
@@ -154,7 +154,7 @@
 implementation-dependent.
 .Sh TUNING
 Once, when the first call is made to one of these memory allocation
-routines, various flags will be set or reset, which affect the
+routines, various flags will be set or reset, which affects the
 workings of this allocator implementation.
 .Pp
 The
@@ -196,6 +196,11 @@
 Therefore, some applications may benefit from increasing or decreasing this
 threshold parameter.
 This option is not available for some configurations (non-PIC).
+.It C
+Double/halve the size of the maximum size class that is a multiple of the
+cacheline size (64).
+Above this size, subpage spacing (256 bytes) is used for size classes.
+The default value is 512 bytes.
 .It D
 Use
 .Xr sbrk 2
@@ -214,6 +219,16 @@
 The default is 512 pages per arena;
 .Ev MALLOC_OPTIONS=10f
 will prevent any dirty unused pages from accumulating.
+.It G
+When there are multiple threads, use thread-specific caching for objects that
+are smaller than one page.
+This option is enabled by default.
+Thread-specific caching allows many allocations to be satisfied without
+performing any thread synchronization, at the cost of increased memory use.
+See the
+.Dq R
+option for related tuning information.
+This option is not available for some configurations (non-PIC).
 .It J
 Each byte of new memory allocated by
 .Fn malloc ,
@@ -248,7 +263,7 @@
 acquiring memory.
 .It N
 Double/halve the number of arenas.
-The default number of arenas is four times the number of CPUs, or one if there
+The default number of arenas is two times the number of CPUs, or one if there
 is a single CPU.
 .It P
 Various statistics are printed at program exit via an
@@ -259,14 +274,18 @@
 Therefore, this option should only be used with care; it is primarily intended
 as a performance tuning aid during application development.
 .It Q
-Double/halve the size of the allocation quantum.
-The default quantum is the minimum allowed by the architecture (typically 8 or
-16 bytes).
-.It S
 Double/halve the size of the maximum size class that is a multiple of the
-quantum.
-Above this size, power-of-two spacing is used for size classes.
-The default value is 512 bytes.
+quantum (8 or 16 bytes, depending on architecture).
+Above this size, cacheline spacing is used for size classes.
+The default value is 128 bytes.
+.It R
+Double/halve magazine size, which approximately doubles/halves the number of
+rounds in each magazine.
+Magazines are used by the thread-specific caching machinery to acquire and
+release objects in bulk.
+Increasing the magazine size decreases locking overhead, at the expense of
+increased memory usage.
+This option is not available for some configurations (non-PIC).
 .It U
 Generate
 .Dq utrace
@@ -358,6 +377,13 @@
 However, it may make sense to reduce the number of arenas if an application
 does not make much use of the allocation functions.
 .Pp
+In addition to multiple arenas, this allocator supports thread-specific
+caching for small objects (smaller than one page), in order to make it
+possible to completely avoid synchronization for most small allocation requests.
+Such caching allows very fast allocation in the common case, but it increases
+memory usage and fragmentation, since a bounded number of objects can remain
+allocated in each thread cache.
+.Pp
 Memory is conceptually broken into equal-sized chunks, where the chunk size is
 a power of two that is greater than the page size.
 Chunks are always aligned to multiples of the chunk size.
@@ -366,7 +392,7 @@
 .Pp
 User objects are broken into three categories according to size: small, large,
 and huge.
-Small objects are no larger than one half of a page.
+Small objects are smaller than one page.
 Large objects are smaller than the chunk size.
 Huge objects are a multiple of the chunk size.
 Small and large objects are managed by arenas; huge objects are managed
@@ -378,23 +404,24 @@
 contiguous pages (unused, backing a set of small objects, or backing one large
 object).
 The combination of chunk alignment and chunk page maps makes it possible to
-determine all metadata regarding small and large allocations in
-constant and logarithmic time, respectively.
+determine all metadata regarding small and large allocations in constant time.
 .Pp
 Small objects are managed in groups by page runs.
 Each run maintains a bitmap that tracks which regions are in use.
-Allocation requests that are no more than half the quantum (see the
+Allocation requests that are no more than half the quantum (8 or 16, depending
+on architecture) are rounded up to the nearest power of two.
+Allocation requests that are more than half the quantum, but no more than the
+minimum cacheline-multiple size class (see the
 .Dq Q
-option) are rounded up to the nearest power of two (typically 2, 4, or 8).
-Allocation requests that are more than half the quantum, but no more than the
-maximum quantum-multiple size class (see the
-.Dq S
 option) are rounded up to the nearest multiple of the quantum.
-Allocation requests that are larger than the maximum quantum-multiple size
-class, but no larger than one half of a page, are rounded up to the nearest
-power of two.
-Allocation requests that are larger than half of a page, but small enough to
-fit in an arena-managed chunk (see the
+Allocation requests that are more than the minimum cacheline-multiple size
+class, but no more than the minimum subpage-multiple size class (see the
+.Dq C
+option) are rounded up to the nearest multiple of the cacheline size (64).
+Allocation requests that are more than the minimum subpage-multiple size class
+are rounded up to the nearest multiple of the subpage size (256).
+Allocation requests that are more than one page, but small enough to fit in
+an arena-managed chunk (see the
 .Dq K
 option), are rounded up to the nearest run size.
 Allocation requests that are too large to fit in an arena-managed chunk are
@@ -402,8 +429,8 @@
 .Pp
 Allocations are packed tightly together, which can be an issue for
 multi-threaded applications.
-If you need to assure that allocations do not suffer from cache line sharing,
-round your allocation requests up to the nearest multiple of the cache line
+If you need to assure that allocations do not suffer from cacheline sharing,
+round your allocation requests up to the nearest multiple of the cacheline
 size.
 .Sh DEBUGGING MALLOC PROBLEMS
 The first thing to do is to set the

==== //depot/projects/soc2008/trasz_nfs4acl/lib/libc/stdlib/malloc.c#5 (text+ko) ====

@@ -35,6 +35,9 @@
  *   + Multiple arenas are used if there are multiple CPUs, which reduces lock
  *     contention and cache sloshing.
  *
+ *   + Thread-specific caching is used if there are multiple threads, which
+ *     reduces the amount of locking.
+ *
  *   + Cache line sharing between arenas is avoided for internal data
  *     structures.
  *
@@ -48,37 +51,49 @@
  * and a 16 byte quantum on a 32-bit system, the size classes in each category
  * are as follows:
  *
- *   |=====================================|
- *   | Category | Subcategory    |    Size |
- *   |=====================================|
- *   | Small    | Tiny           |       2 |
- *   |          |                |       4 |
- *   |          |                |       8 |
- *   |          |----------------+---------|
- *   |          | Quantum-spaced |      16 |
- *   |          |                |      32 |
- *   |          |                |      48 |
- *   |          |                |     ... |
- *   |          |                |     480 |
- *   |          |                |     496 |
- *   |          |                |     512 |
- *   |          |----------------+---------|
- *   |          | Sub-page       |    1 kB |
- *   |          |                |    2 kB |
- *   |=====================================|
- *   | Large                     |    4 kB |
- *   |                           |    8 kB |
- *   |                           |   12 kB |
- *   |                           |     ... |
- *   |                           | 1012 kB |
- *   |                           | 1016 kB |
- *   |                           | 1020 kB |
- *   |=====================================|
- *   | Huge                      |    1 MB |
- *   |                           |    2 MB |
- *   |                           |    3 MB |
- *   |                           |     ... |
- *   |=====================================|
+ *   |=======================================|
+ *   | Category | Subcategory      |    Size |
+ *   |=======================================|
+ *   | Small    | Tiny             |       2 |
+ *   |          |                  |       4 |
+ *   |          |                  |       8 |
+ *   |          |------------------+---------|
+ *   |          | Quantum-spaced   |      16 |
+ *   |          |                  |      32 |
+ *   |          |                  |      48 |
+ *   |          |                  |     ... |
+ *   |          |                  |      96 |
+ *   |          |                  |     112 |
+ *   |          |                  |     128 |
+ *   |          |------------------+---------|
+ *   |          | Cacheline-spaced |     192 |
+ *   |          |                  |     256 |
+ *   |          |                  |     320 |
+ *   |          |                  |     384 |
+ *   |          |                  |     448 |
+ *   |          |                  |     512 |
+ *   |          |------------------+---------|
+ *   |          | Sub-page         |     760 |
+ *   |          |                  |    1024 |
+ *   |          |                  |    1280 |
+ *   |          |                  |     ... |
+ *   |          |                  |    3328 |
+ *   |          |                  |    3584 |
+ *   |          |                  |    3840 |
+ *   |=======================================|
+ *   | Large                       |    4 kB |
+ *   |                             |    8 kB |
+ *   |                             |   12 kB |
+ *   |                             |     ... |
+ *   |                             | 1012 kB |
+ *   |                             | 1016 kB |
+ *   |                             | 1020 kB |
+ *   |=======================================|
+ *   | Huge                        |    1 MB |
+ *   |                             |    2 MB |
+ *   |                             |    3 MB |
+ *   |                             |     ... |
+ *   |=======================================|
  *
  * A different mechanism is used for each category:
  *
@@ -113,6 +128,19 @@
 #endif
 
 /*
+ * MALLOC_TINY enables support for tiny objects, which are smaller than one
+ * quantum.
+ */
+#define	MALLOC_TINY
+
+/*
+ * MALLOC_MAG enables a magazine-based thread-specific caching layer for small
+ * objects.  This makes it possible to allocate/deallocate objects without any
+ * locking when the cache is in the steady state.
+ */
+#define	MALLOC_MAG
+
+/*
  * MALLOC_BALANCE enables monitoring of arena lock contention and dynamically
  * re-balances arena load if exponentially averaged contention exceeds a
  * certain threshold.
@@ -128,7 +156,7 @@
 #define	MALLOC_DSS
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.176 2008/08/14 17:31:42 jasone Exp $");
+__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.177 2008/08/27 02:00:53 jasone Exp $");
 
 #include "libc_private.h"
 #ifdef MALLOC_DEBUG
@@ -184,46 +212,63 @@
 /* Size of stack-allocated buffer passed to strerror_r(). */
 #define	STRERROR_BUF		64
 
-/* Minimum alignment of allocations is 2^QUANTUM_2POW_MIN bytes. */
+/*
+ * The const_size2bin table is sized according to PAGESIZE_2POW, but for
+ * correctness reasons, we never assume that
+ * (pagesize == (1U << * PAGESIZE_2POW)).
+ *
+ * Minimum alignment of allocations is 2^QUANTUM_2POW bytes.
+ */
 #ifdef __i386__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	2
 #  define CPU_SPINWAIT		__asm__ volatile("pause")
 #endif
 #ifdef __ia64__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	3
 #endif
 #ifdef __alpha__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		13
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	3
 #  define NO_TLS
 #endif
 #ifdef __sparc64__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		13
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	3
 #  define NO_TLS
 #endif
 #ifdef __amd64__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	3
 #  define CPU_SPINWAIT		__asm__ volatile("pause")
 #endif
 #ifdef __arm__
-#  define QUANTUM_2POW_MIN	3
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		3
 #  define SIZEOF_PTR_2POW	2
 #  define NO_TLS
 #endif
 #ifdef __mips__
-#  define QUANTUM_2POW_MIN	3
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		3
 #  define SIZEOF_PTR_2POW	2
 #  define NO_TLS
 #endif
 #ifdef __powerpc__
-#  define QUANTUM_2POW_MIN	4
+#  define PAGESIZE_2POW		12
+#  define QUANTUM_2POW		4
 #  define SIZEOF_PTR_2POW	2
 #endif
 
+#define	QUANTUM			((size_t)(1U << QUANTUM_2POW))
+#define	QUANTUM_MASK		(QUANTUM - 1)
+
 #define	SIZEOF_PTR		(1U << SIZEOF_PTR_2POW)
 
 /* sizeof(int) == (1U << SIZEOF_INT_2POW). */
@@ -237,6 +282,10 @@
 #endif
 
 #ifdef NO_TLS
+   /* MALLOC_MAG requires TLS. */
+#  ifdef MALLOC_MAG
+#    undef MALLOC_MAG
+#  endif
    /* MALLOC_BALANCE requires TLS. */
 #  ifdef MALLOC_BALANCE
 #    undef MALLOC_BALANCE
@@ -253,23 +302,42 @@
 #define	DIRTY_MAX_DEFAULT	(1U << 9)
 
 /*
- * Maximum size of L1 cache line.  This is used to avoid cache line aliasing,
- * so over-estimates are okay (up to a point), but under-estimates will
- * negatively affect performance.
+ * Maximum size of L1 cache line.  This is used to avoid cache line aliasing.
+ * In addition, this controls the spacing of cacheline-spaced size classes.
  */
 #define	CACHELINE_2POW		6
 #define	CACHELINE		((size_t)(1U << CACHELINE_2POW))
+#define	CACHELINE_MASK		(CACHELINE - 1)
 
-/* Smallest size class to support. */
-#define	TINY_MIN_2POW		1
+/*
+ * Subpages are an artificially designated partitioning of pages.  Their only
+ * purpose is to support subpage-spaced size classes.
+ *
+ * There must be at least 4 subpages per page, due to the way size classes are
+ * handled.
+ */
+#define	SUBPAGE_2POW		8
+#define	SUBPAGE			((size_t)(1U << SUBPAGE_2POW))
+#define	SUBPAGE_MASK		(SUBPAGE - 1)
+
+#ifdef MALLOC_TINY
+   /* Smallest size class to support. */
+#  define TINY_MIN_2POW		1
+#endif
 
 /*
  * Maximum size class that is a multiple of the quantum, but not (necessarily)
  * a power of 2.  Above this size, allocations are rounded up to the nearest
  * power of 2.
  */
-#define	SMALL_MAX_2POW_DEFAULT	9
-#define	SMALL_MAX_DEFAULT	(1U << SMALL_MAX_2POW_DEFAULT)
+#define	QSPACE_MAX_2POW_DEFAULT	7
+
+/*
+ * Maximum size class that is a multiple of the cacheline, but not (necessarily)
+ * a power of 2.  Above this size, allocations are rounded up to the nearest
+ * power of 2.
+ */
+#define	CSPACE_MAX_2POW_DEFAULT	9
 
 /*
  * RUN_MAX_OVRHD indicates maximum desired run header overhead.  Runs are sized
@@ -293,8 +361,7 @@
 #define	RUN_MAX_OVRHD_RELAX	0x00001800U
 
 /* Put a cap on small object run size.  This overrides RUN_MAX_OVRHD. */
-#define	RUN_MAX_SMALL_2POW	15
-#define	RUN_MAX_SMALL		(1U << RUN_MAX_SMALL_2POW)
+#define	RUN_MAX_SMALL	(12 * pagesize)
 
 /*
  * Hyper-threaded CPUs may need a special instruction inside spin loops in
@@ -319,6 +386,15 @@
  */
 #define	BLOCK_COST_2POW		4
 
+#ifdef MALLOC_MAG
+   /*
+    * Default magazine size, in bytes.  max_rounds is calculated to make
+    * optimal use of the space, leaving just enough room for the magazine
+    * header.
+    */
+#  define MAG_SIZE_2POW_DEFAULT	9
+#endif
+
 #ifdef MALLOC_BALANCE
    /*
     * We use an exponential moving average to track recent lock contention,
@@ -369,6 +445,11 @@
 	 */
 	uint64_t	nrequests;
 
+#ifdef MALLOC_MAG
+	/* Number of magazine reloads from this bin. */
+	uint64_t	nmags;
+#endif
+
 	/* Total number of runs created for this bin's size class. */
 	uint64_t	nruns;
 
@@ -678,6 +759,35 @@
 
 /******************************************************************************/
 /*
+ * Magazine data structures.
+ */
+
+#ifdef MALLOC_MAG
+typedef struct mag_s mag_t;
+struct mag_s {
+	size_t		binind; /* Index of associated bin. */
+	size_t		nrounds;
+	void		*rounds[1]; /* Dynamically sized. */
+};
+
+/*
+ * Magazines are lazily allocated, but once created, they remain until the
+ * associated mag_rack is destroyed.
+ */
+typedef struct bin_mags_s bin_mags_t;
+struct bin_mags_s {
+	mag_t	*curmag;
+	mag_t	*sparemag;
+};
+
+typedef struct mag_rack_s mag_rack_t;
+struct mag_rack_s {
+	bin_mags_t	bin_mags[1]; /* Dynamically sized. */
+};
+#endif
+
+/******************************************************************************/
+/*
  * Data.
  */
 
@@ -690,16 +800,147 @@
 static size_t		pagesize_2pow;
 
 /* Various bin-related settings. */
-static size_t		bin_maxclass; /* Max size class for bins. */
-static unsigned		ntbins; /* Number of (2^n)-spaced tiny bins. */
+#ifdef MALLOC_TINY		/* Number of (2^n)-spaced tiny bins. */
+#  define		ntbins	((unsigned)(QUANTUM_2POW - TINY_MIN_2POW))
+#else
+#  define		ntbins	0
+#endif
 static unsigned		nqbins; /* Number of quantum-spaced bins. */
-static unsigned		nsbins; /* Number of (2^n)-spaced sub-page bins. */
-static size_t		small_min;
-static size_t		small_max;
+static unsigned		ncbins; /* Number of cacheline-spaced bins. */
+static unsigned		nsbins; /* Number of subpage-spaced bins. */
+static unsigned		nbins;
+#ifdef MALLOC_TINY
+#  define		tspace_max	((size_t)(QUANTUM >> 1))
+#endif
+#define			qspace_min	QUANTUM
+static size_t		qspace_max;
+static size_t		cspace_min;
+static size_t		cspace_max;
+static size_t		sspace_min;
+static size_t		sspace_max;
+#define			bin_maxclass	sspace_max
+
+static uint8_t const	*size2bin;
+/*
+ * const_size2bin is a static constant lookup table that in the common case can
+ * be used as-is for size2bin.  For dynamically linked programs, this avoids
+ * a page of memory overhead per process.
+ */
+#define	S2B_1(i)	i,
+#define	S2B_2(i)	S2B_1(i) S2B_1(i)
+#define	S2B_4(i)	S2B_2(i) S2B_2(i)
+#define	S2B_8(i)	S2B_4(i) S2B_4(i)
+#define	S2B_16(i)	S2B_8(i) S2B_8(i)
+#define	S2B_32(i)	S2B_16(i) S2B_16(i)
+#define	S2B_64(i)	S2B_32(i) S2B_32(i)
+#define	S2B_128(i)	S2B_64(i) S2B_64(i)
+#define	S2B_256(i)	S2B_128(i) S2B_128(i)
+static const uint8_t	const_size2bin[(1U << PAGESIZE_2POW) - 255] = {
+	S2B_1(0xffU)		/*    0 */
+#if (QUANTUM_2POW == 4)
+/* 64-bit system ************************/
+#  ifdef MALLOC_TINY
+	S2B_2(0)		/*    2 */
+	S2B_2(1)		/*    4 */
+	S2B_4(2)		/*    8 */
+	S2B_8(3)		/*   16 */
+#    define S2B_QMIN 3
+#  else
+	S2B_16(0)		/*   16 */
+#    define S2B_QMIN 0
+#  endif
+	S2B_16(S2B_QMIN + 1)	/*   32 */
+	S2B_16(S2B_QMIN + 2)	/*   48 */
+	S2B_16(S2B_QMIN + 3)	/*   64 */
+	S2B_16(S2B_QMIN + 4)	/*   80 */
+	S2B_16(S2B_QMIN + 5)	/*   96 */
+	S2B_16(S2B_QMIN + 6)	/*  112 */
+	S2B_16(S2B_QMIN + 7)	/*  128 */
+#  define S2B_CMIN (S2B_QMIN + 8)
+#else
+/* 32-bit system ************************/
+#  ifdef MALLOC_TINY
+	S2B_2(0)		/*    2 */
+	S2B_2(1)		/*    4 */
+	S2B_4(2)		/*    8 */
+#    define S2B_QMIN 2
+#  else
+	S2B_8(0)		/*    8 */
+#    define S2B_QMIN 0
+#  endif
+	S2B_8(S2B_QMIN + 1)	/*   16 */
+	S2B_8(S2B_QMIN + 2)	/*   24 */
+	S2B_8(S2B_QMIN + 3)	/*   32 */
+	S2B_8(S2B_QMIN + 4)	/*   40 */
+	S2B_8(S2B_QMIN + 5)	/*   48 */
+	S2B_8(S2B_QMIN + 6)	/*   56 */
+	S2B_8(S2B_QMIN + 7)	/*   64 */
+	S2B_8(S2B_QMIN + 8)	/*   72 */
+	S2B_8(S2B_QMIN + 9)	/*   80 */
+	S2B_8(S2B_QMIN + 10)	/*   88 */
+	S2B_8(S2B_QMIN + 11)	/*   96 */
+	S2B_8(S2B_QMIN + 12)	/*  104 */
+	S2B_8(S2B_QMIN + 13)	/*  112 */
+	S2B_8(S2B_QMIN + 14)	/*  120 */
+	S2B_8(S2B_QMIN + 15)	/*  128 */
+#  define S2B_CMIN (S2B_QMIN + 16)
+#endif
+/****************************************/
+	S2B_64(S2B_CMIN + 0)	/*  192 */
+	S2B_64(S2B_CMIN + 1)	/*  256 */
+	S2B_64(S2B_CMIN + 2)	/*  320 */
+	S2B_64(S2B_CMIN + 3)	/*  384 */
+	S2B_64(S2B_CMIN + 4)	/*  448 */
+	S2B_64(S2B_CMIN + 5)	/*  512 */
+#  define S2B_SMIN (S2B_CMIN + 6)
+	S2B_256(S2B_SMIN + 0)	/*  768 */
+	S2B_256(S2B_SMIN + 1)	/* 1024 */
+	S2B_256(S2B_SMIN + 2)	/* 1280 */
+	S2B_256(S2B_SMIN + 3)	/* 1536 */
+	S2B_256(S2B_SMIN + 4)	/* 1792 */
+	S2B_256(S2B_SMIN + 5)	/* 2048 */
+	S2B_256(S2B_SMIN + 6)	/* 2304 */
+	S2B_256(S2B_SMIN + 7)	/* 2560 */
+	S2B_256(S2B_SMIN + 8)	/* 2816 */
+	S2B_256(S2B_SMIN + 9)	/* 3072 */
+	S2B_256(S2B_SMIN + 10)	/* 3328 */
+	S2B_256(S2B_SMIN + 11)	/* 3584 */
+	S2B_256(S2B_SMIN + 12)	/* 3840 */
+#if (PAGESIZE_2POW == 13)
+	S2B_256(S2B_SMIN + 13)	/* 4096 */
+	S2B_256(S2B_SMIN + 14)	/* 4352 */
+	S2B_256(S2B_SMIN + 15)	/* 4608 */
+	S2B_256(S2B_SMIN + 16)	/* 4864 */
+	S2B_256(S2B_SMIN + 17)	/* 5120 */
+	S2B_256(S2B_SMIN + 18)	/* 5376 */
+	S2B_256(S2B_SMIN + 19)	/* 5632 */
+	S2B_256(S2B_SMIN + 20)	/* 5888 */
+	S2B_256(S2B_SMIN + 21)	/* 6144 */
+	S2B_256(S2B_SMIN + 22)	/* 6400 */
+	S2B_256(S2B_SMIN + 23)	/* 6656 */
+	S2B_256(S2B_SMIN + 24)	/* 6912 */
+	S2B_256(S2B_SMIN + 25)	/* 7168 */
+	S2B_256(S2B_SMIN + 26)	/* 7424 */
+	S2B_256(S2B_SMIN + 27)	/* 7680 */
+	S2B_256(S2B_SMIN + 28)	/* 7936 */
+#endif
+};
+#undef S2B_1
+#undef S2B_2
+#undef S2B_4
+#undef S2B_8
+#undef S2B_16
+#undef S2B_32
+#undef S2B_64
+#undef S2B_128
+#undef S2B_256
+#undef S2B_QMIN
+#undef S2B_CMIN
+#undef S2B_SMIN
 
-/* Various quantum-related settings. */
-static size_t		quantum;
-static size_t		quantum_mask; /* (quantum - 1). */
+#ifdef MALLOC_MAG
+static size_t		max_rounds;
+#endif
 
 /* Various chunk-related settings. */
 static size_t		chunksize;
@@ -796,6 +1037,14 @@
 static __thread arena_t	*arenas_map;
 #endif
 
+#ifdef MALLOC_MAG
+/*
+ * Map of thread-specific magazine racks, used for thread-specific object
+ * caching.
+ */
+static __thread mag_rack_t	*mag_rack;
+#endif
+
 #ifdef MALLOC_STATS
 /* Chunk statistics. */
 static chunk_stats_t	stats_chunks;
@@ -818,13 +1067,17 @@
 static bool	opt_dss = true;
 static bool	opt_mmap = true;
 #endif
+#ifdef MALLOC_MAG
+static bool	opt_mag = true;
+static size_t	opt_mag_size_2pow = MAG_SIZE_2POW_DEFAULT;
+#endif
 static size_t	opt_dirty_max = DIRTY_MAX_DEFAULT;
 #ifdef MALLOC_BALANCE
 static uint64_t	opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT;
 #endif
 static bool	opt_print_stats = false;
-static size_t	opt_quantum_2pow = QUANTUM_2POW_MIN;
-static size_t	opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT;
+static size_t	opt_qspace_max_2pow = QSPACE_MAX_2POW_DEFAULT;
+static size_t	opt_cspace_max_2pow = CSPACE_MAX_2POW_DEFAULT;
 static size_t	opt_chunk_2pow = CHUNK_2POW_DEFAULT;
 static bool	opt_utrace = false;
 static bool	opt_sysv = false;
@@ -902,15 +1155,21 @@
 static void	arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk,
     arena_run_t *run, size_t oldsize, size_t newsize, bool dirty);
 static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin);
-static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
-static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
+static void	*arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
+static size_t	arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size);
 #ifdef MALLOC_BALANCE
 static void	arena_lock_balance_hard(arena_t *arena);
 #endif
+#ifdef MALLOC_MAG
+static void	mag_load(mag_t *mag);
+#endif
 static void	*arena_malloc_large(arena_t *arena, size_t size, bool zero);
 static void	*arena_palloc(arena_t *arena, size_t alignment, size_t size,
     size_t alloc_size);
 static size_t	arena_salloc(const void *ptr);
+#ifdef MALLOC_MAG
+static void	mag_unload(mag_t *mag);
+#endif
 static void	arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk,
     void *ptr);
 static void	arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk,
@@ -921,11 +1180,22 @@
 static void	*arena_ralloc(void *ptr, size_t size, size_t oldsize);
 static bool	arena_new(arena_t *arena);
 static arena_t	*arenas_extend(unsigned ind);
+#ifdef MALLOC_MAG
+static mag_t	*mag_create(arena_t *arena, size_t binind);
+static void	mag_destroy(mag_t *mag);
+static mag_rack_t *mag_rack_create(arena_t *arena);
+static void	mag_rack_destroy(mag_rack_t *rack);
+#endif
 static void	*huge_malloc(size_t size, bool zero);
 static void	*huge_palloc(size_t alignment, size_t size);
 static void	*huge_ralloc(void *ptr, size_t size, size_t oldsize);
 static void	huge_dalloc(void *ptr);
 static void	malloc_print_stats(void);
+#ifdef MALLOC_DEBUG
+static void	size2bin_validate(void);
+#endif
+static bool	size2bin_init(void);
+static bool	size2bin_init_hard(void);
 static bool	malloc_init_hard(void);
 
 /*
@@ -1063,18 +1333,23 @@
 #define	CHUNK_CEILING(s)						\
 	(((s) + chunksize_mask) & ~chunksize_mask)
 
+/* Return the smallest quantum multiple that is >= a. */
+#define	QUANTUM_CEILING(a)						\
+	(((a) + QUANTUM_MASK) & ~QUANTUM_MASK)
+
 /* Return the smallest cacheline multiple that is >= s. */
 #define	CACHELINE_CEILING(s)						\
-	(((s) + (CACHELINE - 1)) & ~(CACHELINE - 1))
+	(((s) + CACHELINE_MASK) & ~CACHELINE_MASK)
 
-/* Return the smallest quantum multiple that is >= a. */
-#define	QUANTUM_CEILING(a)						\
-	(((a) + quantum_mask) & ~quantum_mask)
+/* Return the smallest subpage multiple that is >= s. */
+#define	SUBPAGE_CEILING(s)						\
+	(((s) + SUBPAGE_MASK) & ~SUBPAGE_MASK)
 
 /* Return the smallest pagesize multiple that is >= s. */
 #define	PAGE_CEILING(s)							\
 	(((s) + pagesize_mask) & ~pagesize_mask)
 
+#ifdef MALLOC_TINY
 /* Compute the smallest power of 2 that is >= x. */
 static inline size_t
 pow2_ceil(size_t x)
@@ -1092,6 +1367,7 @@
 	x++;
 	return (x);
 }
+#endif
 
 #ifdef MALLOC_BALANCE
 /*
@@ -1382,10 +1658,19 @@
 	    arena->stats.ndalloc_small + arena->stats.ndalloc_large);
 	malloc_printf("mapped:  %12zu\n", arena->stats.mapped);
 
-	malloc_printf("bins:     bin   size regs pgs  requests   newruns"
-	    "    reruns maxruns curruns\n");
-	for (i = 0, gap_start = UINT_MAX; i < ntbins + nqbins + nsbins; i++) {
-		if (arena->bins[i].stats.nrequests == 0) {
+#ifdef MALLOC_MAG
+	if (__isthreaded && opt_mag) {
+		malloc_printf("bins:     bin   size regs pgs      mags   "
+		    "newruns    reruns maxruns curruns\n");
+	} else {
+#endif
+		malloc_printf("bins:     bin   size regs pgs  requests   "
+		    "newruns    reruns maxruns curruns\n");
+#ifdef MALLOC_MAG
+	}
+#endif
+	for (i = 0, gap_start = UINT_MAX; i < nbins; i++) {
+		if (arena->bins[i].stats.nruns == 0) {
 			if (gap_start == UINT_MAX)
 				gap_start = i;
 		} else {
@@ -1404,10 +1689,15 @@
 			    "%13u %1s %4u %4u %3u %9llu %9llu"
 			    " %9llu %7lu %7lu\n",
 			    i,
-			    i < ntbins ? "T" : i < ntbins + nqbins ? "Q" : "S",
+			    i < ntbins ? "T" : i < ntbins + nqbins ? "Q" :
+			    i < ntbins + nqbins + ncbins ? "C" : "S",
 			    arena->bins[i].reg_size,
 			    arena->bins[i].nregs,
 			    arena->bins[i].run_size >> pagesize_2pow,
+#ifdef MALLOC_MAG
+			    (__isthreaded && opt_mag) ?
+			    arena->bins[i].stats.nmags :
+#endif
 			    arena->bins[i].stats.nrequests,
 			    arena->bins[i].stats.nruns,
 			    arena->bins[i].stats.reruns,
@@ -2137,44 +2427,9 @@
 static inline void
 arena_run_reg_dalloc(arena_run_t *run, arena_bin_t *bin, void *ptr, size_t size)
 {
-	/*
-	 * To divide by a number D that is not a power of two we multiply
-	 * by (2^21 / D) and then right shift by 21 positions.
-	 *
-	 *   X / D
-	 *
-	 * becomes
-	 *
-	 *   (X * size_invs[(D >> QUANTUM_2POW_MIN) - 3]) >> SIZE_INV_SHIFT
-	 */
-#define	SIZE_INV_SHIFT 21
-#define	SIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s << QUANTUM_2POW_MIN)) + 1)
-	static const unsigned size_invs[] = {
-	    SIZE_INV(3),
-	    SIZE_INV(4), SIZE_INV(5), SIZE_INV(6), SIZE_INV(7),
-	    SIZE_INV(8), SIZE_INV(9), SIZE_INV(10), SIZE_INV(11),
-	    SIZE_INV(12),SIZE_INV(13), SIZE_INV(14), SIZE_INV(15),
-	    SIZE_INV(16),SIZE_INV(17), SIZE_INV(18), SIZE_INV(19),
-	    SIZE_INV(20),SIZE_INV(21), SIZE_INV(22), SIZE_INV(23),
-	    SIZE_INV(24),SIZE_INV(25), SIZE_INV(26), SIZE_INV(27),
-	    SIZE_INV(28),SIZE_INV(29), SIZE_INV(30), SIZE_INV(31)
-#if (QUANTUM_2POW_MIN < 4)
-	    ,
-	    SIZE_INV(32), SIZE_INV(33), SIZE_INV(34), SIZE_INV(35),
-	    SIZE_INV(36), SIZE_INV(37), SIZE_INV(38), SIZE_INV(39),
-	    SIZE_INV(40), SIZE_INV(41), SIZE_INV(42), SIZE_INV(43),
-	    SIZE_INV(44), SIZE_INV(45), SIZE_INV(46), SIZE_INV(47),
-	    SIZE_INV(48), SIZE_INV(49), SIZE_INV(50), SIZE_INV(51),
-	    SIZE_INV(52), SIZE_INV(53), SIZE_INV(54), SIZE_INV(55),
-	    SIZE_INV(56), SIZE_INV(57), SIZE_INV(58), SIZE_INV(59),
-	    SIZE_INV(60), SIZE_INV(61), SIZE_INV(62), SIZE_INV(63)
-#endif
-	};
 	unsigned diff, regind, elm, bit;
 
 	assert(run->magic == ARENA_RUN_MAGIC);
-	assert(((sizeof(size_invs)) / sizeof(unsigned)) + 3
-	    >= (SMALL_MAX_DEFAULT >> QUANTUM_2POW_MIN));
 
 	/*
 	 * Avoid doing division with a variable divisor if possible.  Using
@@ -2203,26 +2458,89 @@
 			regind = (diff >> log2_table[size - 1]);
 		else if (size <= 32768)
 			regind = diff >> (8 + log2_table[(size >> 8) - 1]);
-		else {
-			/*
-			 * The run size is too large for us to use the lookup
-			 * table.  Use real division.
-			 */
+		else
 			regind = diff / size;
-		}
-	} else if (size <= (((sizeof(size_invs) / sizeof(unsigned)) + 2)
-	    << QUANTUM_2POW_MIN)) {
-		regind = size_invs[(size >> QUANTUM_2POW_MIN) - 3] * diff;
-		regind >>= SIZE_INV_SHIFT;
-	} else {
+	} else if (size < qspace_max) {
 		/*
-		 * size_invs isn't large enough to handle this size class, so
-		 * calculate regind using actual division.  This only happens
-		 * if the user increases small_max via the 'S' runtime
-		 * configuration option.
+		 * To divide by a number D that is not a power of two we
+		 * multiply by (2^21 / D) and then right shift by 21 positions.
+		 *
+		 *   X / D
+		 *
+		 * becomes
+		 *
+		 *   (X * qsize_invs[(D >> QUANTUM_2POW) - 3])
+		 *       >> SIZE_INV_SHIFT
+		 *
+		 * We can omit the first three elements, because we never
+		 * divide by 0, and QUANTUM and 2*QUANTUM are both powers of
+		 * two, which are handled above.
 		 */
-		regind = diff / size;
-	};
+#define	SIZE_INV_SHIFT 21
+#define	QSIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s << QUANTUM_2POW)) + 1)
+		static const unsigned qsize_invs[] = {
+		    QSIZE_INV(3),
+		    QSIZE_INV(4), QSIZE_INV(5), QSIZE_INV(6), QSIZE_INV(7)
+#if (QUANTUM_2POW < 4)
+		    ,
+		    QSIZE_INV(8), QSIZE_INV(9), QSIZE_INV(10), QSIZE_INV(11),
+		    QSIZE_INV(12),QSIZE_INV(13), QSIZE_INV(14), QSIZE_INV(15)
+#endif
+		};
+		assert(QUANTUM * (((sizeof(qsize_invs)) / sizeof(unsigned)) + 3)
+		    >= (1U << QSPACE_MAX_2POW_DEFAULT));
+
+		if (size <= (((sizeof(qsize_invs) / sizeof(unsigned)) + 2) <<
+		    QUANTUM_2POW)) {
+			regind = qsize_invs[(size >> QUANTUM_2POW) - 3] * diff;
+			regind >>= SIZE_INV_SHIFT;
+		} else
+			regind = diff / size;
+#undef QSIZE_INV
+	} else if (size < cspace_max) {
+#define	CSIZE_INV(s) (((1U << SIZE_INV_SHIFT) / (s << CACHELINE_2POW)) + 1)

>>> TRUNCATED FOR MAIL (1000 lines) <<<



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200808271103.m7RB3Kww063624>