Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 23 Aug 2014 15:13:56 +0100
From:      "Steven Hartland" <killing@multiplay.co.uk>
To:        <bugzilla-noreply@freebsd.org>, <freebsd-fs@FreeBSD.org>
Subject:   Re: [Bug 187594] [zfs] [patch] ZFS ARC behavior problem and fix
Message-ID:  <A15DD1EC190B44CA838F6FFEA8D9A9EA@multiplay.co.uk>
References:  <bug-187594-3630@https.bugs.freebsd.org/bugzilla/> <bug-187594-3630-CrQVkoxIUf@https.bugs.freebsd.org/bugzilla/>

next in thread | previous in thread | raw e-mail | index | archive | help
This is a multi-part message in MIME format.

------=_NextPart_000_05CD_01CFBEE4.DBEE9260
Content-Type: text/plain;
	format=flowed;
	charset="iso-8859-1";
	reply-type=original
Content-Transfer-Encoding: 7bit

----- Original Message ----- 
From: <bugzilla-noreply@freebsd.org>


> https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=187594
>
> --- Comment #25 from fullermd@over-yonder.net ---
> Having run it for a few months on a number of boxes now, my general 
> impression
> is that it seems like it goes a little _too_ far (with default options 
> anyway;
> I haven't tried any tuning) toward making the ARC give up its lunch 
> money to
> anybody who looks threateningly at it.  It feels like it should be a 
> bit more
> aggressive, and historically was and did fine.
>
> However, it's still _much_ nicer than the unpatched case, where the 
> rest of the
> system starves and hides out in the swap space.  So from here, while 
> perhaps
> imperfect and in need of some tuning work, it's still a significant 
> improvement
> on the prior state, so landing it sounds just fine to me.

The attached updated patch has been cleaned up and hammered hard at
the event here. I'll look to commit it to head soon if there are no
objections.

    Regards
    Steve 

------=_NextPart_000_05CD_01CFBEE4.DBEE9260
Content-Type: application/octet-stream;
	name="arc-reclaim.patch"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
	filename="arc-reclaim.patch"

Index: sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c=0A=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=0A=
--- sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	(revision 270315)=0A=
+++ sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	(working copy)=0A=
@@ -126,20 +126,42 @@=0A=
 }=0A=
 SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, =
NULL);=0A=
 =0A=
+/*=0A=
+ * The returns from kmem_free_*size are only valid once the pagedaemon=0A=
+ * has been initialised, before then they return 0.=0A=
+ * =0A=
+ * To ensure the returns are valid the caller can use a SYSINIT with=0A=
+ * subsystem set to SI_SUB_KTHREAD_PAGE and order of at least=0A=
+ * SI_ORDER_SECOND.=0A=
+ */=0A=
 uint64_t=0A=
-kmem_size(void)=0A=
+kmem_free_target_size(void)=0A=
 {=0A=
 =0A=
-	return (kmem_size_val);=0A=
+	return ((uint64_t)cnt.v_free_target * PAGE_SIZE);=0A=
 }=0A=
 =0A=
 uint64_t=0A=
-kmem_used(void)=0A=
+kmem_free_min_size(void)=0A=
 {=0A=
 =0A=
-	return (vmem_size(kmem_arena, VMEM_ALLOC));=0A=
+	return ((uint64_t)cnt.v_free_min * PAGE_SIZE);=0A=
 }=0A=
 =0A=
+uint64_t=0A=
+kmem_free_size(void)=0A=
+{=0A=
+=0A=
+	return ((uint64_t)cnt.v_free_count * PAGE_SIZE);=0A=
+}=0A=
+=0A=
+uint64_t=0A=
+kmem_size(void)=0A=
+{=0A=
+=0A=
+	return (kmem_size_val);=0A=
+}=0A=
+=0A=
 static int=0A=
 kmem_std_constructor(void *mem, int size __unused, void *private, int =
flags)=0A=
 {=0A=
Index: sys/cddl/compat/opensolaris/sys/kmem.h=0A=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=0A=
--- sys/cddl/compat/opensolaris/sys/kmem.h	(revision 270315)=0A=
+++ sys/cddl/compat/opensolaris/sys/kmem.h	(working copy)=0A=
@@ -66,7 +66,12 @@=0A=
 void *zfs_kmem_alloc(size_t size, int kmflags);=0A=
 void zfs_kmem_free(void *buf, size_t size);=0A=
 uint64_t kmem_size(void);=0A=
-uint64_t kmem_used(void);=0A=
+=0A=
+/* Return vals from kmem_free_*size are only valid after the pagedaemon =
init. */=0A=
+uint64_t kmem_free_size(void);=0A=
+uint64_t kmem_free_target_size(void);=0A=
+uint64_t kmem_free_min_size(void);=0A=
+=0A=
 kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t =
align,=0A=
     int (*constructor)(void *, void *, int), void (*destructor)(void *, =
void *),=0A=
     void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int =
cflags);=0A=
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c=0A=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=0A=
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(revision =
270315)=0A=
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(working copy)=0A=
@@ -193,9 +193,6 @@=0A=
  */=0A=
 static boolean_t arc_warm;=0A=
 =0A=
-/*=0A=
- * These tunables are for performance analysis.=0A=
- */=0A=
 uint64_t zfs_arc_max;=0A=
 uint64_t zfs_arc_min;=0A=
 uint64_t zfs_arc_meta_limit =3D 0;=0A=
@@ -204,7 +201,20 @@=0A=
 int zfs_arc_p_min_shift =3D 0;=0A=
 int zfs_disable_dup_eviction =3D 0;=0A=
 uint64_t zfs_arc_average_blocksize =3D 8 * 1024; /* 8KB */=0A=
+uint64_t zfs_arc_free_target =3D (1 << 30); /* 1GB */=0A=
 =0A=
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);=0A=
+=0A=
+static void=0A=
+arc_free_target_init(void *unused __unused)=0A=
+{=0A=
+=0A=
+	zfs_arc_free_target =3D kmem_free_target_size() * 3;=0A=
+}=0A=
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,=0A=
+    arc_free_target_init, NULL);=0A=
+=0A=
+=0A=
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);=0A=
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);=0A=
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);=0A=
@@ -217,7 +227,34 @@=0A=
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,=0A=
     &zfs_arc_average_blocksize, 0,=0A=
     "ARC average blocksize");=0A=
+/*=0A=
+ * We don't have a tunable for arc_free_target due to the dependency on=0A=
+ * pagedaemon initialisation.=0A=
+ */=0A=
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,=0A=
+    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),=0A=
+    sysctl_vfs_zfs_arc_free_target, "QU",=0A=
+    "Desired amount of free memory below which ARC triggers reclaim");=0A=
 =0A=
+static int=0A=
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)=0A=
+{=0A=
+	uint64_t val;=0A=
+	int err;=0A=
+=0A=
+	val =3D zfs_arc_free_target;=0A=
+	err =3D sysctl_handle_64(oidp, &val, 0, req);=0A=
+	if (err !=3D 0 || req->newptr =3D=3D NULL)=0A=
+		return (err);=0A=
+=0A=
+	if (val < kmem_free_min_size())=0A=
+		return (EINVAL);=0A=
+=0A=
+	zfs_arc_free_target =3D val;=0A=
+=0A=
+	return (0);=0A=
+}=0A=
+=0A=
 /*=0A=
  * Note that buffers can be in one of 6 states:=0A=
  *	ARC_anon	- anonymous (discussed below)=0A=
@@ -2421,9 +2458,12 @@=0A=
 void=0A=
 arc_shrink(void)=0A=
 {=0A=
+=0A=
 	if (arc_c > arc_c_min) {=0A=
 		uint64_t to_free;=0A=
 =0A=
+		DTRACE_PROBE2(arc__shrink, uint64_t, arc_c, uint64_t,=0A=
+			arc_c_min);=0A=
 #ifdef _KERNEL=0A=
 		to_free =3D arc_c >> arc_shrink_shift;=0A=
 #else=0A=
@@ -2443,8 +2483,11 @@=0A=
 		ASSERT((int64_t)arc_p >=3D 0);=0A=
 	}=0A=
 =0A=
-	if (arc_size > arc_c)=0A=
+	if (arc_size > arc_c) {=0A=
+		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,=0A=
+			uint64_t, arc_c);=0A=
 		arc_adjust();=0A=
+	}=0A=
 }=0A=
 =0A=
 static int needfree =3D 0;=0A=
@@ -2455,15 +2498,24 @@=0A=
 =0A=
 #ifdef _KERNEL=0A=
 =0A=
-	if (needfree)=0A=
+	if (needfree) {=0A=
+		DTRACE_PROBE(arc__reclaim_needfree);=0A=
 		return (1);=0A=
+	}=0A=
 =0A=
+	if (kmem_free_size() < zfs_arc_free_target) {=0A=
+		DTRACE_PROBE(arc__reclaim_freetarget);=0A=
+		return (1);=0A=
+	}=0A=
+=0A=
 	/*=0A=
 	 * Cooperate with pagedaemon when it's time for it to scan=0A=
 	 * and reclaim some pages.=0A=
 	 */=0A=
-	if (vm_paging_needed())=0A=
+	if (vm_paging_needed()) {=0A=
+		DTRACE_PROBE(arc__reclaim_paging);=0A=
 		return (1);=0A=
+	}=0A=
 =0A=
 #ifdef sun=0A=
 	/*=0A=
@@ -2507,9 +2559,6 @@=0A=
 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))=0A=
 		return (1);=0A=
 #endif=0A=
-#else	/* !sun */=0A=
-	if (kmem_used() > (kmem_size() * 3) / 4)=0A=
-		return (1);=0A=
 #endif	/* sun */=0A=
 =0A=
 #else=0A=
@@ -2516,6 +2565,8 @@=0A=
 	if (spa_get_random(100) =3D=3D 0)=0A=
 		return (1);=0A=
 #endif=0A=
+	DTRACE_PROBE(arc__reclaim_no);=0A=
+=0A=
 	return (0);=0A=
 }=0A=
 =0A=
Index: sys/vm/vm_pageout.c=0A=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=0A=
--- sys/vm/vm_pageout.c	(revision 270315)=0A=
+++ sys/vm/vm_pageout.c	(working copy)=0A=
@@ -115,10 +115,14 @@=0A=
 =0A=
 /* the kernel process "vm_pageout"*/=0A=
 static void vm_pageout(void);=0A=
+static void vm_pageout_init(void);=0A=
 static int vm_pageout_clean(vm_page_t);=0A=
 static void vm_pageout_scan(struct vm_domain *vmd, int pass);=0A=
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);=0A=
 =0A=
+SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, =
vm_pageout_init,=0A=
+    NULL);=0A=
+=0A=
 struct proc *pageproc;=0A=
 =0A=
 static struct kproc_desc page_kp =3D {=0A=
@@ -126,7 +130,7 @@=0A=
 	vm_pageout,=0A=
 	&pageproc=0A=
 };=0A=
-SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,=0A=
+SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,=0A=
     &page_kp);=0A=
 =0A=
 #if !defined(NO_SWAPPING)=0A=
@@ -1647,15 +1651,11 @@=0A=
 }=0A=
 =0A=
 /*=0A=
- *	vm_pageout is the high level pageout daemon.=0A=
+ *	vm_pageout_init initialises basic pageout daemon settings.=0A=
  */=0A=
 static void=0A=
-vm_pageout(void)=0A=
+vm_pageout_init(void)=0A=
 {=0A=
-#if MAXMEMDOM > 1=0A=
-	int error, i;=0A=
-#endif=0A=
-=0A=
 	/*=0A=
 	 * Initialize some paging parameters.=0A=
 	 */=0A=
@@ -1701,7 +1701,18 @@=0A=
 	/* XXX does not really belong here */=0A=
 	if (vm_page_max_wired =3D=3D 0)=0A=
 		vm_page_max_wired =3D cnt.v_free_count / 3;=0A=
+}=0A=
 =0A=
+/*=0A=
+ *	vm_pageout is the high level pageout daemon.=0A=
+ */=0A=
+static void=0A=
+vm_pageout(void)=0A=
+{=0A=
+#if MAXMEMDOM > 1=0A=
+	int error, i;=0A=
+#endif=0A=
+=0A=
 	swap_pager_swap_init();=0A=
 #if MAXMEMDOM > 1=0A=
 	for (i =3D 1; i < vm_ndomains; i++) {=0A=

------=_NextPart_000_05CD_01CFBEE4.DBEE9260--




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?A15DD1EC190B44CA838F6FFEA8D9A9EA>