From owner-freebsd-hackers Wed Oct 25 22:53:29 2000
Date: Wed, 25 Oct 2000 22:52:49 -0700 (PDT)
From: Matt Dillon
Message-Id: <200010260552.e9Q5qnf32627@earth.backplane.com>
To: Alfred Perlstein, ps@FreeBSD.ORG, hackers@FreeBSD.ORG
Subject: VM pager patch (was Re: vm_pageout_scan badness)
References: <20001024112708.E28123@fw.wintelcom.net> <200010242010.e9OKAJK19739@earth.backplane.com> <20001025123154.A56298@heechee.tobez.org>

    Here's a test patch, inclusive of some debugging sysctls:

	vm.always_launder		Set to 1 to give up on trying to
					avoid pageouts (launder on the
					first pass).

	vm.vm_pageout_stats_rescans	Number of times the main inactive
					scan in the pageout loop had to
					restart.

	vm.vm_pageout_stats_xtralaunder	Number of times a second pass had
					to be taken (in normal mode, with
					always_launder set to 0).

    This patch:

	* implements a placemarker to try to avoid restarts (a minimal
	  sketch of the idea appears between the vm_page.h and
	  vm_pageout.c hunks below).

	* does not penalize the pageout daemon for being able to cluster
	  writes: maxlaunder now counts I/O operations (essentially
	  seeks) rather than pages written.

	* adds an additional vnode check that should be there: after the
	  potentially-blocking vget(), the page and its object are
	  re-verified before we try to launder the page.

    One last note: I wrote a quick and dirty program to mmap() a bunch
    of big files MAP_NOSYNC and then dirty them in a loop (a rough
    reconstruction of such a program is appended at the end of this
    mail).  I noticed that the filesystem update daemon 'froze up' the
    system for about a second every 30 seconds due to the huge number of
    dirty MAP_NOSYNC pages (about 1GB worth) sitting around (it has to
    scan the vm_page_t's even if it doesn't do anything with them).
    This is a separate issue.

    If Alfred and others running heavily loaded systems are able to test
    this patch sufficiently, we can include it (minus the debugging
    sysctls) in the release.  If not, I will wait until after the
    release is rolled before I commit it (or whatever the final patch
    winds up looking like).

						-Matt

Index: vm_page.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.c,v
retrieving revision 1.147.2.3
diff -u -r1.147.2.3 vm_page.c
--- vm_page.c	2000/08/04 22:31:11	1.147.2.3
+++ vm_page.c	2000/10/26 04:43:22
@@ -1783,6 +1783,12 @@
 		    ("contigmalloc1: page %p is not PQ_INACTIVE", m));
 
 		next = TAILQ_NEXT(m, pageq);
+		/*
+		 * ignore markers
+		 */
+		if (m->flags & PG_MARKER)
+			continue;
+
 		if (vm_page_sleep_busy(m, TRUE, "vpctw0"))
 			goto again1;
 		vm_page_test_dirty(m);
Index: vm_page.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.h,v
retrieving revision 1.75.2.3
diff -u -r1.75.2.3 vm_page.h
--- vm_page.h	2000/09/16 01:08:03	1.75.2.3
+++ vm_page.h	2000/10/26 04:17:28
@@ -251,6 +251,7 @@
 #define PG_SWAPINPROG	0x0200		/* swap I/O in progress on page */
 #define PG_NOSYNC	0x0400		/* do not collect for syncer */
 #define PG_UNMANAGED	0x0800		/* No PV management for page */
+#define PG_MARKER	0x1000		/* special queue marker page */
 
 /*
  * Misc constants.
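
    Aside (not part of the patch): for anyone who hasn't run into the
    placemarker trick before, below is a minimal, self-contained
    userland sketch of the idea the vm_pageout.c changes that follow
    depend on.  None of the names here are the kernel's: struct page,
    PAGE_MARKER, clean_page() and scan_and_clean() are hypothetical
    stand-ins, and the only real API used is the TAILQ macros from
    <sys/queue.h>.

/*
 * Placemarker sketch: a call made during a queue scan may unlock or
 * reshuffle the queue, invalidating a saved 'next' pointer.  So we
 * insert a dummy marker entry after the current element, make the
 * call, then resume the scan from the marker and pull it back out.
 * Scans simply skip any markers they run across.
 */
#include <stdio.h>
#include <sys/queue.h>

#define PAGE_MARKER	0x0001		/* dummy entry, skip during scans */

struct page {
	TAILQ_ENTRY(page) pageq;
	int flags;
	int dirty;
};
TAILQ_HEAD(pagelist, page);

/*
 * Stand-in for vm_pageout_clean(): pulls the page (and, in the real
 * kernel, any nearby pages clustered with it) off the queue.
 */
static void
clean_page(struct pagelist *q, struct page *m)
{
	TAILQ_REMOVE(q, m, pageq);
	m->dirty = 0;
}

static void
scan_and_clean(struct pagelist *q)
{
	struct page marker = { .flags = PAGE_MARKER };
	struct page *m, *next;

	for (m = TAILQ_FIRST(q); m != NULL; m = next) {
		next = TAILQ_NEXT(m, pageq);
		if (m->flags & PAGE_MARKER)	/* skip other markers */
			continue;
		if (m->dirty) {
			/* Hold our place across the destabilizing call. */
			TAILQ_INSERT_AFTER(q, m, &marker, pageq);
			clean_page(q, m);
			/* Recompute 'next' from the marker, then remove it. */
			next = TAILQ_NEXT(&marker, pageq);
			TAILQ_REMOVE(q, &marker, pageq);
		}
	}
}

int
main(void)
{
	struct pagelist q;
	struct page pages[8], *m;
	int i;

	TAILQ_INIT(&q);
	for (i = 0; i < 8; ++i) {
		pages[i].flags = 0;
		pages[i].dirty = i & 1;		/* every other page dirty */
		TAILQ_INSERT_TAIL(&q, &pages[i], pageq);
	}
	scan_and_clean(&q);
	TAILQ_FOREACH(m, &q, pageq)		/* only clean pages remain */
		printf("page %d still queued, dirty=%d\n",
		    (int)(m - pages), m->dirty);
	return (0);
}

    The real patch additionally marks the marker PG_BUSY | PG_FICTITIOUS
    and gives it a wire_count of 1 so the rest of the VM system will
    leave it alone; the userland sketch only needs the skip flag.
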
Index: vm_pageout.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_pageout.c,v
retrieving revision 1.151.2.4
diff -u -r1.151.2.4 vm_pageout.c
--- vm_pageout.c	2000/08/04 22:31:11	1.151.2.4
+++ vm_pageout.c	2000/10/26 05:07:45
@@ -143,6 +143,9 @@
 static int disable_swap_pageouts=0;
 
 static int max_page_launder=100;
+static int always_launder=0;
+static int vm_pageout_stats_rescans=0;
+static int vm_pageout_stats_xtralaunder=0;
 #if defined(NO_SWAPPING)
 static int vm_swap_enabled=0;
 static int vm_swap_idle_enabled=0;
@@ -186,6 +189,12 @@
 SYSCTL_INT(_vm, OID_AUTO, max_page_launder,
 	CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass");
+SYSCTL_INT(_vm, OID_AUTO, always_launder,
+	CTLFLAG_RW, &always_launder, 0, "Always launder on the first pass");
+SYSCTL_INT(_vm, OID_AUTO, vm_pageout_stats_rescans,
+	CTLFLAG_RD, &vm_pageout_stats_rescans, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, vm_pageout_stats_xtralaunder,
+	CTLFLAG_RD, &vm_pageout_stats_xtralaunder, 0, "");
 
 
 #define VM_PAGEOUT_PAGE_COUNT 16
 
@@ -613,11 +622,16 @@
 
 /*
  * vm_pageout_scan does the dirty work for the pageout daemon.
+ *
+ * This code is responsible for calculating the page shortage
+ * and then attempting to clean or free enough pages to hit that
+ * mark.
  */
 static int
 vm_pageout_scan()
 {
 	vm_page_t m, next;
+	struct vm_page marker;
 	int page_shortage, maxscan, pcount;
 	int addl_page_shortage, addl_page_shortage_init;
 	int maxlaunder;
@@ -651,27 +665,41 @@
 
 	/*
 	 * Figure out what to do with dirty pages when they are encountered.
 	 * Assume that 1/3 of the pages on the inactive list are clean.  If
-	 * we think we can reach our target, disable laundering (do not
-	 * clean any dirty pages).  If we miss the target we will loop back
-	 * up and do a laundering run.
+	 * we think we can reach our target, reduce the amount of launder we
+	 * try to do in the first pass significantly.  If we miss the target
+	 * we will loop back up and do a full laundering run.
+	 *
+	 * If always_launder is set, we do a full laundering run on the
+	 * first pass.
	 */
-	if (cnt.v_inactive_count / 3 > page_shortage) {
+	if (always_launder == 0 && cnt.v_inactive_count / 3 > page_shortage) {
+#if 0 /* THIS MAY BE BETTER */
+		maxlaunder = cnt.v_inactive_target / 10 + 1;
+#endif
 		maxlaunder = 0;
 		launder_loop = 0;
 	} else {
-		maxlaunder = 
-			(cnt.v_inactive_target > max_page_launder) ?
-			max_page_launder : cnt.v_inactive_target;
+		maxlaunder = cnt.v_inactive_target;
 		launder_loop = 1;
 	}
+	if (maxlaunder > max_page_launder)
+		maxlaunder = max_page_launder;
 
 	/*
+	 * Initialize our marker
+	 */
+	bzero(&marker, sizeof(marker));
+	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+	marker.valid = 0;
+	marker.queue = PQ_INACTIVE;
+	marker.wire_count = 1;
+
+	/*
 	 * Start scanning the inactive queue for pages we can move to the
 	 * cache or free.  The scan will stop when the target is reached or
 	 * we have scanned the entire inactive queue.
 	 */
-
 rescan0:
 	addl_page_shortage = addl_page_shortage_init;
 	maxscan = cnt.v_inactive_count;
@@ -682,11 +710,18 @@
 		cnt.v_pdpages++;
 
 		if (m->queue != PQ_INACTIVE) {
+			++vm_pageout_stats_rescans;
 			goto rescan0;
 		}
 
 		next = TAILQ_NEXT(m, pageq);
 
+		/*
+		 * Skip marker pages
+		 */
+		if (m->flags & PG_MARKER)
+			continue;
+
 		if (m->hold_count) {
 			s = splvm();
 			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
@@ -763,7 +798,8 @@
 			--page_shortage;
 
 		/*
-		 * Clean pages can be placed onto the cache queue.
+		 * Clean pages can be placed onto the cache queue, which
+		 * is almost the same as freeing them.
 		 */
 		} else if (m->dirty == 0) {
 			vm_page_cache(m);
@@ -774,7 +810,6 @@
 		 * only a limited number of pages per pagedaemon pass.
 		 */
 		} else if (maxlaunder > 0) {
-			int written;
 			int swap_pageouts_ok;
 			struct vnode *vp = NULL;
 
@@ -871,10 +906,16 @@
 			}
 
 			/*
-			 * The page might have been moved to another queue
-			 * during potential blocking in vget() above.
+			 * The page might have been moved to another
+			 * queue during potential blocking in vget()
+			 * above.  The page might have been freed and
+			 * reused for another vnode.  The object might
+			 * have been reused for another vnode.
 			 */
-			if (m->queue != PQ_INACTIVE) {
+			if (m->queue != PQ_INACTIVE ||
+			    m->object != object ||
+			    object->handle != vp
+			) {
 				if (object->flags & OBJ_MIGHTBEDIRTY)
 					vnodes_skipped++;
 				vput(vp);
@@ -882,9 +923,10 @@
 			}
 
 			/*
-			 * The page may have been busied during the blocking in
-			 * vput(); We don't move the page back onto the end of
-			 * the queue so that statistics are more correct if we don't.
+			 * The page may have been busied during the
+			 * blocking in vput(); We don't move the
+			 * page back onto the end of the queue so that
+			 * statistics are more correct if we don't.
 			 */
 			if (m->busy || (m->flags & PG_BUSY)) {
 				vput(vp);
@@ -910,13 +952,27 @@
 			 * If a page is dirty, then it is either being washed
 			 * (but not yet cleaned) or it is still in the
 			 * laundry.  If it is still in the laundry, then we
-			 * start the cleaning operation.
+			 * start the cleaning operation.  maxlaunder nominally
+			 * counts I/O cost, essentially seeks, so we drop it
+			 * by one no matter how large a cluster
+			 * vm_pageout_clean() is able to put together.
+			 *
+			 * This operation may cluster-out, causing the 'next'
+			 * page to move to another queue.  To avoid losing our
+			 * place we insert a placemarker, then recalculate
+			 * next after vm_pageout_clean() returns.
 			 */
-			written = vm_pageout_clean(m);
-			if (vp)
+			s = splvm();
+			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
+			splx(s);
+			if (vm_pageout_clean(m) != 0)
+				--maxlaunder;
+			s = splvm();
+			next = TAILQ_NEXT(&marker, pageq);
+			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
+			splx(s);
+			if (vp != NULL)
 				vput(vp);
-
-			maxlaunder -= written;
 		}
 	}
 
@@ -930,6 +986,7 @@
 
 		maxlaunder = 
 			(cnt.v_inactive_target > max_page_launder) ?
 			max_page_launder : cnt.v_inactive_target;
+		++vm_pageout_stats_xtralaunder;
 		goto rescan0;
 	}
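
    Appendix: a rough sketch of the kind of quick and dirty MAP_NOSYNC
    test program described in the note above.  This is not the original
    program; the file names, file count, and sizes are illustrative
    guesses.  Only the mmap(MAP_SHARED|MAP_NOSYNC) usage and the
    dirty-in-a-loop behavior come from the description (MAP_NOSYNC is a
    FreeBSD-specific flag in <sys/mman.h>).

/*
 * Create several large files, map them MAP_SHARED|MAP_NOSYNC, and
 * dirty every page in a loop forever.  The dirty pages are not
 * collected by the syncer (that is what MAP_NOSYNC asks for), so
 * roughly 1GB of dirty vm_page_t's accumulates for the pageout and
 * update daemons to contend with.
 */
#include <sys/types.h>
#include <sys/mman.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NFILES	4
#define FSIZE	(256 * 1024 * 1024)	/* 256MB each, ~1GB total */

int
main(void)
{
	char path[64];
	char *base[NFILES];
	long pgsize = sysconf(_SC_PAGESIZE);
	size_t off;
	int fd, i;

	for (i = 0; i < NFILES; ++i) {
		snprintf(path, sizeof(path), "bigfile.%d", i);
		if ((fd = open(path, O_RDWR | O_CREAT, 0644)) < 0)
			err(1, "open %s", path);
		if (ftruncate(fd, FSIZE) < 0)
			err(1, "ftruncate %s", path);
		base[i] = mmap(NULL, FSIZE, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_NOSYNC, fd, 0);
		if (base[i] == MAP_FAILED)
			err(1, "mmap %s", path);
		close(fd);		/* the mapping remains valid */
	}

	/*
	 * Touch one byte in every page of every file, forever, so the
	 * pages stay permanently dirty.
	 */
	for (;;) {
		for (i = 0; i < NFILES; ++i)
			for (off = 0; off < FSIZE; off += pgsize)
				base[i][off] = 1;
	}
	/* NOTREACHED */
}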