Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 12 Jul 2010 19:39:33 +1000
From:      Peter Jeremy <peterjeremy@acm.org>
To:        Richard Lee <ricky@csua.berkeley.edu>
Cc:        freebsd-stable@freebsd.org
Subject:   Re: Serious zfs slowdown when mixed with another file system (ufs/msdosfs/etc.).
Message-ID:  <20100712093933.GA27950@server.vk2pj.dyndns.org>
In-Reply-To: <20100712093818.GA27693@server.vk2pj.dyndns.org>
References:  <20100711182511.GA21063@soda.CSUA.Berkeley.EDU> <20100712093818.GA27693@server.vk2pj.dyndns.org>

next in thread | previous in thread | raw e-mail | index | archive | help

--mojUlQ0s9EVzWg2t
Content-Type: multipart/mixed; boundary="RnlQjJ0d97Da+TV1"
Content-Disposition: inline


--RnlQjJ0d97Da+TV1
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On 2010-Jul-12 19:38:18 +1000, Peter Jeremy <peter@server.vk2pj.dyndns.org>=
 wrote:
>I have been using the attached arc.patch1 based on a patch written by
>Artem Belevich <fbsdlist@src.cx> (see http://pastebin.com/ZCkzkWcs )
>for about a month.  I have had reasonable success with it (and junked
>my cronjob) but have managed to wedge my system a couple of times
>whilst doing zfs send|recv.  Whilst looking at that diff, I just
>noticed a nasty signed/unsigned bug that could bite in low memory
>conditions and have revised it to arc.patch2 (untested as yet).

Let me try actually attaching those patches...  Sorry.

--=20
Peter Jeremy

--RnlQjJ0d97Da+TV1
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="arc.patch1"
Content-Transfer-Encoding: quoted-printable

Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /usr/ncvs/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.=
c,v
retrieving revision 1.22.2.6
diff -u -r1.22.2.6 arc.c
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	24 May 2010 20:09:=
40 -0000	1.22.2.6
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	12 Jun 2010 21:04:=
13 -0000
@@ -183,10 +183,15 @@
 int zfs_arc_shrink_shift =3D 0;
 int zfs_arc_p_min_shift =3D 0;
=20
+uint64_t zfs_arc_bp_active;
+uint64_t zfs_arc_bp_inactive;
+
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
 TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
+TUNABLE_QUAD("vfs.zfs.arc_bp_active", &zfs_arc_bp_active);
+TUNABLE_QUAD("vfs.zfs.arc_bp_inactive", &zfs_arc_bp_inactive);
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
     "Maximum ARC size");
@@ -195,6 +200,11 @@
 SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
     &zfs_mdcomp_disable, 0, "Disable metadata compression");
=20
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_bp_active, CTLFLAG_RW|CTLFLAG_TUN, &zf=
s_arc_bp_active, 0,
+    "Start ARC backpressure if active memory is below this limit");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_bp_inactive, CTLFLAG_RW|CTLFLAG_TUN, &=
zfs_arc_bp_inactive, 0,
+    "Start ARC backpressure if inactive memory is below this limit");
+
 /*
  * Note that buffers can be in one of 6 states:
  *	ARC_anon	- anonymous (discussed below)
@@ -2103,7 +2113,6 @@
 }
=20
 static int needfree =3D 0;
-
 static int
 arc_reclaim_needed(void)
 {
@@ -2112,20 +2121,58 @@
 #endif
=20
 #ifdef _KERNEL
-	if (needfree)
-		return (1);
+	/* We've grown too much, */
 	if (arc_size > arc_c_max)
 		return (1);
+
+	/* Pagedaemon is stuck, let's free something right away */
+	if (vm_pageout_pages_needed)
+		return 1;
+
+	/* Check if inactive list have grown too much */
+	if ( zfs_arc_bp_inactive
+	     && (ptoa((uintmax_t)cnt.v_inactive_count) > zfs_arc_bp_inactive)) {
+		/* tell pager to reap 1/2th of inactive queue*/
+		atomic_add_int(&vm_pageout_deficit, cnt.v_inactive_count/2);
+		pagedaemon_wakeup();
+		return needfree;
+	}
+
+	/* Same for active list... */
+	if ( zfs_arc_bp_active
+	     && (ptoa((uintmax_t)cnt.v_active_count) > zfs_arc_bp_active)) {
+		atomic_add_int(&vm_pageout_deficit, cnt.v_active_count/2);
+		pagedaemon_wakeup();
+		return needfree;
+	}
+
+=09
+	/* Old style behavior -- ARC gives up memory whenever page daemon asks.. =
*/
+	if (needfree)
+		return 1;
+
+	/*
+	  We got here either because active/inactive lists are
+	  getting short or because we've been called during voluntary
+	  ARC size checks. Kind of gray area...
+	*/
+
+	/* If we didn't reach our minimum yet, don't rush to give memory up..*/
 	if (arc_size <=3D arc_c_min)
 		return (0);
=20
+	/* If we're really short on memory now, give it up. */
+	if (vm_page_count_min()) {
+		return (1);
+	}
+=09
 	/*
-	 * If pages are needed or we're within 2048 pages
-	 * of needing to page need to reclaim
+	 * If we're within 2048 pages of pagedaemon start, reclaim...
 	 */
-	if (vm_pages_needed || (vm_paging_target() > -2048))
+	if (vm_pages_needed && (vm_paging_target() > -2048))
 		return (1);
=20
+
 #if 0
 	/*
 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
@@ -2169,8 +2216,6 @@
 		return (1);
 #endif
 #else
-	if (kmem_used() > (kmem_size() * 3) / 4)
-		return (1);
 #endif
=20
 #else
@@ -2279,7 +2324,7 @@
 		if (arc_eviction_list !=3D NULL)
 			arc_do_user_evicts();
=20
-		if (arc_reclaim_needed()) {
+		if (needfree) {
 			needfree =3D 0;
 #ifdef _KERNEL
 			wakeup(&needfree);
@@ -3611,10 +3656,15 @@
 {
 #ifdef _KERNEL
 	uint64_t inflight_data =3D arc_anon->arcs_size;
-	uint64_t available_memory =3D ptoa((uintmax_t)cnt.v_free_count);
+	uint64_t available_memory;
 	static uint64_t page_load =3D 0;
 	static uint64_t last_txg =3D 0;
=20
+        /* How much memory is potentially available */
+        available_memory =3D ptoa((uintmax_t)cnt.v_free_count);
+        available_memory +=3D ptoa((uintmax_t)cnt.v_cache_count);
+        available_memory -=3D ptoa((uintmax_t)cnt.v_free_min);
+       =20
 #if 0
 #if defined(__i386)
 	available_memory =3D

--RnlQjJ0d97Da+TV1
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="arc.patch2"
Content-Transfer-Encoding: quoted-printable

Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /usr/ncvs/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.=
c,v
retrieving revision 1.22.2.6
diff -u -r1.22.2.6 arc.c
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	24 May 2010 20:09:=
40 -0000	1.22.2.6
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	12 Jul 2010 09:21:=
31 -0000
@@ -183,10 +183,15 @@
 int zfs_arc_shrink_shift =3D 0;
 int zfs_arc_p_min_shift =3D 0;
=20
+uint64_t zfs_arc_bp_active;
+uint64_t zfs_arc_bp_inactive;
+
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
 TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
+TUNABLE_QUAD("vfs.zfs.arc_bp_active", &zfs_arc_bp_active);
+TUNABLE_QUAD("vfs.zfs.arc_bp_inactive", &zfs_arc_bp_inactive);
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
     "Maximum ARC size");
@@ -195,6 +200,11 @@
 SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
     &zfs_mdcomp_disable, 0, "Disable metadata compression");
=20
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_bp_active, CTLFLAG_RW|CTLFLAG_TUN, &zf=
s_arc_bp_active, 0,
+    "Start ARC backpressure if active memory is below this limit");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_bp_inactive, CTLFLAG_RW|CTLFLAG_TUN, &=
zfs_arc_bp_inactive, 0,
+    "Start ARC backpressure if inactive memory is below this limit");
+
 /*
  * Note that buffers can be in one of 6 states:
  *	ARC_anon	- anonymous (discussed below)
@@ -2103,7 +2113,6 @@
 }
=20
 static int needfree =3D 0;
-
 static int
 arc_reclaim_needed(void)
 {
@@ -2112,20 +2121,58 @@
 #endif
=20
 #ifdef _KERNEL
-	if (needfree)
-		return (1);
+	/* We've grown too much, */
 	if (arc_size > arc_c_max)
 		return (1);
+
+	/* Pagedaemon is stuck, let's free something right away */
+	if (vm_pageout_pages_needed)
+		return 1;
+
+	/* Check if inactive list have grown too much */
+	if ( zfs_arc_bp_inactive
+	     && (ptoa((uintmax_t)cnt.v_inactive_count) > zfs_arc_bp_inactive)) {
+		/* tell pager to reap 1/2th of inactive queue*/
+		atomic_add_int(&vm_pageout_deficit, cnt.v_inactive_count/2);
+		pagedaemon_wakeup();
+		return needfree;
+	}
+
+	/* Same for active list... */
+	if ( zfs_arc_bp_active
+	     && (ptoa((uintmax_t)cnt.v_active_count) > zfs_arc_bp_active)) {
+		atomic_add_int(&vm_pageout_deficit, cnt.v_active_count/2);
+		pagedaemon_wakeup();
+		return needfree;
+	}
+
+=09
+	/* Old style behavior -- ARC gives up memory whenever page daemon asks.. =
*/
+	if (needfree)
+		return 1;
+
+	/*
+	  We got here either because active/inactive lists are
+	  getting short or because we've been called during voluntary
+	  ARC size checks. Kind of gray area...
+	*/
+
+	/* If we didn't reach our minimum yet, don't rush to give memory up..*/
 	if (arc_size <=3D arc_c_min)
 		return (0);
=20
+	/* If we're really short on memory now, give it up. */
+	if (vm_page_count_min()) {
+		return (1);
+	}
+=09
 	/*
-	 * If pages are needed or we're within 2048 pages
-	 * of needing to page need to reclaim
+	 * If we're within 2048 pages of pagedaemon start, reclaim...
 	 */
-	if (vm_pages_needed || (vm_paging_target() > -2048))
+	if (vm_pages_needed && (vm_paging_target() > -2048))
 		return (1);
=20
+
 #if 0
 	/*
 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
@@ -2169,8 +2216,6 @@
 		return (1);
 #endif
 #else
-	if (kmem_used() > (kmem_size() * 3) / 4)
-		return (1);
 #endif
=20
 #else
@@ -2279,7 +2324,7 @@
 		if (arc_eviction_list !=3D NULL)
 			arc_do_user_evicts();
=20
-		if (arc_reclaim_needed()) {
+		if (needfree) {
 			needfree =3D 0;
 #ifdef _KERNEL
 			wakeup(&needfree);
@@ -3611,10 +3656,17 @@
 {
 #ifdef _KERNEL
 	uint64_t inflight_data =3D arc_anon->arcs_size;
-	uint64_t available_memory =3D ptoa((uintmax_t)cnt.v_free_count);
+	uint64_t available_memory;
 	static uint64_t page_load =3D 0;
 	static uint64_t last_txg =3D 0;
=20
+        /* How much memory is potentially available */
+	available_memory =3D (uint64_t)cnt.v_free_count + cnt.v_cache_count;
+	if (available_memory > cnt.v_free_min)
+		available_memory =3D ptoa(available_memory - cnt.v_free_min);
+	else
+		available_memory =3D 0;
+
 #if 0
 #if defined(__i386)
 	available_memory =3D

--RnlQjJ0d97Da+TV1--

--mojUlQ0s9EVzWg2t
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.15 (FreeBSD)

iEYEARECAAYFAkw64tUACgkQ/opHv/APuIdaZACfbuBNLVzG4Ktjw5uy5rmh1xrz
PHIAmQFCVsVKV8DQg2BrlcwJulbXST89
=coqu
-----END PGP SIGNATURE-----

--mojUlQ0s9EVzWg2t--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20100712093933.GA27950>