Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 29 Jul 1999 11:56:07 -0700 (PDT)
From:      Matthew Dillon <dillon@apollo.backplane.com>
To:        Alan Cox <alc@cs.rice.edu>, David Greenman <dg@root.com>
Cc:        hackers@FreeBSD.ORG
Subject:   patch for behavior changes and madvise MADV_DONTNEED
Message-ID:  <199907291856.LAA77471@apollo.backplane.com>
References:  <199907162234.PAA21850@apollo.backplane.com> <19990720014804.A21777@cs.rice.edu>

next in thread | previous in thread | raw e-mail | index | archive | help
    I have tested this on both small and large files and it appears to work
    extremely well.  So well, in fact, that I can have a program which
    mmap()'s a large file and continuously scans it - generating 4MB/sec of
    network traffic, with virtually no effect to the rest of the system.

    I am not finished testing.  I will be running buildworlds and other related
    tests overnight to ensure that no previously fixed bugs have been 
    reintroduced.  This patch will probably become the commit candidate
    tomorrow.

    There are several things in this patch:

	* minor readability fix in pmap.c

	* vm_page_undirty()

	* madvise() has, in general, been extended to operate on files
	  (except for MADV_FREE)

	* madvise(... MADV_DONTNEED) has been implemented to avoid starving
	  the VM page queues while at the same time enforcing a slow balancing
	  to deal with both the small-file and the large-file case.

	  If we wanted to we could further optimize this code by modifying
	  vm_page_dontneed().  For example, we could have it ignore pages
	  whose act_count is too large.

	* #if 0'ing out of apparently unnecessary ufs code (if it winds up
	  being necessary I recommend removing it anyway and making the
	  required changes to vm_fault.c instead).

					-Matt
					Matthew Dillon 
					<dillon@backplane.com>


Index: i386/i386/pmap.c
===================================================================
RCS file: /home/ncvs/src/sys/i386/i386/pmap.c,v
retrieving revision 1.242
diff -u -r1.242 pmap.c
--- pmap.c	1999/07/21 18:01:40	1.242
+++ pmap.c	1999/07/21 20:43:00
@@ -3188,7 +3188,7 @@
 
 			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
 
-			if (pte && *pte & PG_A) {
+			if (pte && (*pte & PG_A)) {
 				*pte &= ~PG_A;
 
 				pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va);
Index: miscfs/devfs/devfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/devfs/devfs_vnops.c,v
retrieving revision 1.75
diff -u -r1.75 devfs_vnops.c
--- devfs_vnops.c	1999/06/26 02:46:17	1.75
+++ devfs_vnops.c	1999/07/08 22:20:29
@@ -2005,13 +2005,13 @@
 
 		if (nextoff <= nread) {
 			m->valid = VM_PAGE_BITS_ALL;
-			m->dirty = 0;
+			vm_page_undirty(m);
 		} else if (toff < nread) {
 			int nvalid = ((nread + DEV_BSIZE - 1) - toff) & ~(DEV_BSIZE - 1);
 			vm_page_set_validclean(m, 0, nvalid);
 		} else {
 			m->valid = 0;
-			m->dirty = 0;
+			vm_page_undirty(m);
 		}
 
 		if (i != ap->a_reqpage) {
Index: miscfs/specfs/spec_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/specfs/spec_vnops.c,v
retrieving revision 1.90
diff -u -r1.90 spec_vnops.c
--- spec_vnops.c	1999/07/20 09:47:45	1.90
+++ spec_vnops.c	1999/07/21 05:50:06
@@ -860,7 +860,7 @@
 
 		if (nextoff <= nread) {
 			m->valid = VM_PAGE_BITS_ALL;
-			m->dirty = 0;
+			vm_page_undirty(m);
 		} else if (toff < nread) {
 			/*
 			 * Since this is a VM request, we have to supply the
@@ -870,7 +870,7 @@
 			vm_page_set_validclean(m, 0, nread - toff);
 		} else {
 			m->valid = 0;
-			m->dirty = 0;
+			vm_page_undirty(m);
 		}
 
 		if (i != ap->a_reqpage) {
Index: nfs/nfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/nfs/nfs_bio.c,v
retrieving revision 1.74
diff -u -r1.74 nfs_bio.c
--- nfs_bio.c	1999/06/26 02:46:29	1.74
+++ nfs_bio.c	1999/07/08 22:21:48
@@ -185,7 +185,7 @@
 			 * Read operation filled an entire page
 			 */
 			m->valid = VM_PAGE_BITS_ALL;
-			m->dirty = 0;
+			vm_page_undirty(m);
 		} else if (size > toff) {
 			/*
 			 * Read operation filled a partial page.
@@ -313,7 +313,7 @@
 		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
 		for (i = 0; i < nwritten; i++) {
 			rtvals[i] = VM_PAGER_OK;
-			pages[i]->dirty = 0;
+			vm_page_undirty(pages[i]);
 		}
 		if (must_commit)
 			nfs_clearcommit(vp->v_mount);
Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.61
diff -u -r1.61 ufs_readwrite.c
--- ufs_readwrite.c	1999/07/25 02:07:16	1.61
+++ ufs_readwrite.c	1999/07/29 17:40:14
@@ -591,6 +591,7 @@
 	if (firstindex == 0)
 		vp->v_lastr = 0;
 
+#if 0
 	if (((obj->behavior != OBJ_RANDOM) &&
 		(firstindex != 0) && (firstindex <= vp->v_lastr) &&
 		 ((firstindex + pcount) > vp->v_lastr)) ||
@@ -652,6 +653,7 @@
 			vm_page_zero_invalid(mreq, TRUE);
 		return VM_PAGER_OK;
 	}
+#endif
 
 	/*
 	 * foff is the file offset of the required page
@@ -670,7 +672,7 @@
 		if (reqblkno == -1) {
 			if ((mreq->flags & PG_ZERO) == 0)
 				vm_page_zero_fill(mreq);
-			mreq->dirty = 0;
+			vm_page_undirty(mreq);
 			mreq->valid = VM_PAGE_BITS_ALL;
 			return VM_PAGER_OK;
 		} else {
Index: vm/swap_pager.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/swap_pager.c,v
retrieving revision 1.121
diff -u -r1.121 swap_pager.c
--- swap_pager.c	1999/07/16 05:11:35	1.121
+++ swap_pager.c	1999/07/29 18:24:33
@@ -1631,7 +1631,7 @@
 
 			pmap_clear_modify(VM_PAGE_TO_PHYS(m));
 			m->valid = VM_PAGE_BITS_ALL;
-			m->dirty = 0;
+			vm_page_undirty(m);
 			vm_page_flag_clear(m, PG_ZERO);
 
 			/*
@@ -1656,7 +1656,7 @@
 			 */
 			vm_page_protect(m, VM_PROT_READ);
 			pmap_clear_modify(VM_PAGE_TO_PHYS(m));
-			m->dirty = 0;
+			vm_page_undirty(m);
 			vm_page_io_finish(m);
 		}
 	}
Index: vm/vm_fault.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_fault.c,v
retrieving revision 1.103
diff -u -r1.103 vm_fault.c
--- vm_fault.c	1999/07/20 05:46:56	1.103
+++ vm_fault.c	1999/07/29 17:43:07
@@ -386,7 +386,7 @@
 			int reqpage;
 			int ahead, behind;
 
-			if (fs.first_object->behavior == OBJ_RANDOM) {
+			if (fs.entry->behavior == BEHAV_RANDOM) {
 				ahead = 0;
 				behind = 0;
 			} else {
@@ -400,7 +400,7 @@
 			}
 
 			if ((fs.first_object->type != OBJT_DEVICE) &&
-				(fs.first_object->behavior == OBJ_SEQUENTIAL)) {
+				(fs.entry->behavior == BEHAV_SEQUENTIAL)) {
 				vm_pindex_t firstpindex, tmppindex;
 				if (fs.first_pindex <
 					2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1))
Index: vm/vm_map.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_map.c,v
retrieving revision 1.173
diff -u -r1.173 vm_map.c
--- vm_map.c	1999/07/21 18:02:27	1.173
+++ vm_map.c	1999/07/29 17:44:43
@@ -1051,13 +1051,13 @@
 
 		switch (advise) {
 	case MADV_NORMAL:
-			current->object.vm_object->behavior = OBJ_NORMAL;
+			current->behavior = BEHAV_NORMAL;
 			break;
 	case MADV_SEQUENTIAL:
-			current->object.vm_object->behavior = OBJ_SEQUENTIAL;
+			current->behavior = BEHAV_SEQUENTIAL;
 			break;
 	case MADV_RANDOM:
-			current->object.vm_object->behavior = OBJ_RANDOM;
+			current->behavior = BEHAV_RANDOM;
 			break;
 	/*
 	 * Right now, we could handle DONTNEED and WILLNEED with common code.
Index: vm/vm_map.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_map.h,v
retrieving revision 1.43
diff -u -r1.43 vm_map.h
--- vm_map.h	1999/07/10 18:16:08	1.43
+++ vm_map.h	1999/07/29 17:41:46
@@ -89,6 +89,10 @@
 	struct vm_map *sub_map;		/* belongs to another map */
 };
 
+#define BEHAV_NORMAL      0x0		/* default behavior */
+#define BEHAV_SEQUENTIAL  0x1		/* expect sequential accesses */
+#define BEHAV_RANDOM      0x2		/* expect random accesses */
+
 /*
  *	Address map entries consist of start and end addresses,
  *	a VM object (or sharing map) and offset into that object,
@@ -102,6 +106,8 @@
 	vm_offset_t end;		/* end address */
 	vm_offset_t avail_ssize;	/* amt can grow if this is a stack */
 	union vm_map_object object;	/* object I point to */
+	u_short behavior;		/* fault behavior */
+	u_short unused3;		/* (filler) */
 	vm_ooffset_t offset;		/* offset into object */
 	u_char eflags;			/* map entry flags */
 	/* Only in task maps: */
Index: vm/vm_object.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_object.c,v
retrieving revision 1.160
diff -u -r1.160 vm_object.c
--- vm_object.c	1999/07/16 05:11:36	1.160
+++ vm_object.c	1999/07/29 17:19:05
@@ -154,7 +154,9 @@
 	object->flags = 0;
 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
 		vm_object_set_flag(object, OBJ_ONEMAPPING);
+#if 0
 	object->behavior = OBJ_NORMAL;
+#endif
 	object->paging_in_progress = 0;
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
@@ -735,12 +737,22 @@
  *	vm_object_madvise:
  *
  *	Implements the madvise function at the object/page level.
+ *
+ *	MADV_WILLNEED	(any map)
+ *
+ *	    Force activation of the page if it is found in-core
+ *
+ *	MADV_DONTNEED	(any map)
+ *
+ *	    Deactivate or cache the page as appropriate.
  *
- *	Currently, madvise() functions are limited to the default and
- *	swap object types only, and also limited to only the unshared portions 
- *	of a process's address space.  MADV_FREE, certainly, could never be
- *	run on anything else.  The others are more flexible and the code could
- *	be adjusted in the future to handle expanded cases for them.
+ *	MADV_FREE	(OBJT_DEFAULT or OBJT_SWAP maps, OBJ_ONEMAPPING only)
+ *
+ *	    essentially free the underlying storage.  We mark the storage
+ *	    clean but do not unmap it from the process, allowing the process
+ *	    to reuse the storage (by dirtying it again) as well as allowing
+ *	    the VM system to reuse it for other purpose, turning it back into
+ *	    zero-fill.
  */
 void
 vm_object_madvise(object, pindex, count, advise)
@@ -768,20 +780,26 @@
 		tpindex = pindex;
 shadowlookup:
 
-		if (tobject->type != OBJT_DEFAULT &&
-		    tobject->type != OBJT_SWAP
-		) {
-			continue;
-		}
+		/*
+		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
+		 * and those pages must be OBJ_ONEMAPPING.
+		 */
 
-		if ((tobject->flags & OBJ_ONEMAPPING) == 0)
-			continue;
+		if (advise == MADV_FREE) {
+			if ((tobject->type != OBJT_DEFAULT &&
+			    tobject->type != OBJT_SWAP) ||
+			    (tobject->flags & OBJ_ONEMAPPING) == 0
+			) {
+				continue;
+			}
+		}
 
 		m = vm_page_lookup(tobject, tpindex);
 
 		if (m == NULL) {
 			/*
-			 * There may be swap even if there is no backing page
+			 * There may be swap in an intermediate object even 
+			 * if there is no backing page, deal with it here.
 			 */
 			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
 				swap_pager_freespace(tobject, tpindex, 1);
@@ -813,9 +831,17 @@
   			goto relookup;
 
 		if (advise == MADV_WILLNEED) {
+			/*
+			 * Activate the page early to reduce the chance of
+			 * it being reused before the program accesses it.
+			 */
 			vm_page_activate(m);
 		} else if (advise == MADV_DONTNEED) {
-			vm_page_deactivate(m);
+			/*
+			 * Deactivate, cache, or do nothing to the page 
+			 * as appropriate. 
+			 */
+			vm_page_dontneed(m);
 		} else if (advise == MADV_FREE) {
 			/*
 			 * Mark the page clean.  This will allow the page
@@ -833,7 +859,7 @@
 			 * it.
 			 */
 			pmap_clear_modify(VM_PAGE_TO_PHYS(m));
-			m->dirty = 0;
+			vm_page_undirty(m);
 			m->act_count = 0;
 			vm_page_deactivate(m);
 			if (tobject->type == OBJT_SWAP)
Index: vm/vm_object.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_object.h,v
retrieving revision 1.58
diff -u -r1.58 vm_object.h
--- vm_object.h	1999/07/16 05:11:37	1.58
+++ vm_object.h	1999/07/29 17:13:33
@@ -98,7 +98,7 @@
 	u_short flags;			/* see below */
 	u_short pg_color;		/* color of first page in obj */
 	u_short paging_in_progress;	/* Paging (in or out) so don't collapse or destroy */
-	u_short	behavior;		/* see below */
+	u_short	unused13;
 	int resident_page_count;	/* number of resident pages */
 	struct vm_object *backing_object; /* object that I'm a shadow of */
 	vm_ooffset_t backing_object_offset;/* Offset in backing object */
@@ -148,10 +148,6 @@
 #define OBJ_CLEANING	0x0200
 #define OBJ_OPT		0x1000		/* I/O optimization */
 #define	OBJ_ONEMAPPING	0x2000		/* One USE (a single, non-forked) mapping flag */
-
-#define OBJ_NORMAL	0x0		/* default behavior */
-#define OBJ_SEQUENTIAL	0x1		/* expect sequential accesses */
-#define OBJ_RANDOM	0x2		/* expect random accesses */
 
 #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
 #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
Index: vm/vm_page.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.c,v
retrieving revision 1.134
diff -u -r1.134 vm_page.c
--- vm_page.c	1999/07/01 19:53:42	1.134
+++ vm_page.c	1999/07/29 18:29:35
@@ -862,6 +862,10 @@
 	m->busy = 0;
 	m->valid = 0;
 	m->dirty = 0;
+#if 0
+	/* FUTURE */
+	KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
+#endif
 	m->queue = PQ_NONE;
 
 	/*
@@ -997,6 +1001,8 @@
  *	vm_page_activate:
  *
  *	Put the specified page on the active list (if appropriate).
+ *	Ensure that act_count is at least ACT_INIT but do not otherwise
+ *	mess with it.
  *
  *	The page queues must be locked.
  *	This routine may not block.
@@ -1119,6 +1125,7 @@
 	}
 
 	m->valid = 0;
+	vm_page_undirty(m);
 
 	if (m->wire_count != 0) {
 #if !defined(MAX_PERF)
@@ -1347,6 +1354,67 @@
 }
 
 /*
+ * vm_page_dontneed
+ *
+ *	Cache, deactivate, or do nothing as appropriate.  This routine
+ *	is typically used by madvise() MADV_DONTNEED.
+ *
+ *	Generally speaking we want to move the page into the cache so
+ *	it gets reused quickly.  However, this can result in a silly syndrome
+ *	due to the page recycling too quickly.  Small objects will not be
+ *	fully cached.  On the other hand, if we move the page to the inactive
+ *	queue we wind up with a problem whereby very large objects 
+ *	unnecessarily blow away our inactive and cache queues.
+ *
+ *	The solution is to move the pages based on a fixed weighting.  We
+ *	either leave them alone, deactivate them, or move them to the cache,
+ *	where moving them to the cache has the highest weighting.
+ *	By forcing some pages into other queues we eventually force the
+ *	system to balance the queues, potentially recovering other unrelated
+ *	space from active.  The idea is to not force this to happen too
+ *	often.
+ */
+
+void
+vm_page_dontneed(m)
+	vm_page_t m;
+{
+	static int dnweight;
+	int dnw;
+
+	dnw = ++dnweight;
+
+	/*
+	 * Just adjust act_count and do not otherwise mess with the page
+	 * if it is already on the inactive or cache queues, and for one
+	 * page out of every 32.
+	 */
+
+	if ((dnw & 31) == 0 ||
+	    m->queue == PQ_INACTIVE || 
+	    m->queue - m->pc == PQ_CACHE
+	) {
+		if (m->act_count > 0)
+			--m->act_count;
+		return;
+	}
+
+	vm_page_test_dirty(m);
+
+	if ((dnw & 7) == 0 || m->dirty) {
+		/*
+		 * Deactivate the page 3 times out of 32.
+		 */
+		vm_page_deactivate(m);
+	} else {
+		/*
+		 * Cache the page 28 times out of every 32.
+		 */
+		vm_page_cache(m);
+	}
+}
+
+/*
  * Grab a page, waiting until we are waken up due to the page
  * changing state.  We keep on waiting, if the page continues
  * to be in the object.  If the page doesn't exist, allocate it.
@@ -1778,6 +1846,10 @@
 			m->valid = VM_PAGE_BITS_ALL;
 			m->flags = 0;
 			m->dirty = 0;
+#if 0
+			/* future */
+			KASSERT(m->dirty == 0, ("ctgmalloc1: page %p was dirty", m));
+#endif
 			m->wire_count = 0;
 			m->busy = 0;
 			m->queue = PQ_NONE;
Index: vm/vm_page.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.h,v
retrieving revision 1.63
diff -u -r1.63 vm_page.h
--- vm_page.h	1999/07/22 06:04:17	1.63
+++ vm_page.h	1999/07/29 06:38:23
@@ -376,6 +376,7 @@
 vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
 vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
 void vm_page_cache __P((register vm_page_t));
+void vm_page_dontneed __P((register vm_page_t));
 static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
 static __inline void vm_page_free __P((vm_page_t));
 static __inline void vm_page_free_zero __P((vm_page_t));
@@ -555,6 +556,18 @@
 {
 	KASSERT(m->queue - m->pc != PQ_CACHE, ("vm_page_dirty: page in cache!"));
 	m->dirty = VM_PAGE_BITS_ALL;
+}
+
+/*
+ *	vm_page_undirty:
+ *
+ *	Set page to not be dirty.  Note: does not clear pmap modify bits 
+ */
+
+static __inline void
+vm_page_undirty(vm_page_t m)
+{
+	m->dirty = 0;
 }
 
 static __inline vm_page_t
Index: vm/vm_pageout.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_pageout.c,v
retrieving revision 1.144
diff -u -r1.144 vm_pageout.c
--- vm_pageout.c	1999/07/04 00:25:37	1.144
+++ vm_pageout.c	1999/07/13 05:47:33
@@ -425,7 +425,7 @@
 			 * worked.
 			 */
 			pmap_clear_modify(VM_PAGE_TO_PHYS(mt));
-			mt->dirty = 0;
+			vm_page_undirty(mt);
 			break;
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
Index: vm/vnode_pager.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vnode_pager.c,v
retrieving revision 1.112
diff -u -r1.112 vnode_pager.c
--- vnode_pager.c	1999/07/01 19:53:43	1.112
+++ vnode_pager.c	1999/07/08 22:27:33
@@ -511,7 +511,7 @@
 		vm_pager_unmap_page(kva);
 	}
 	pmap_clear_modify(VM_PAGE_TO_PHYS(m));
-	m->dirty = 0;
+	vm_page_undirty(m);
 	vm_page_flag_clear(m, PG_ZERO);
 	if (!error)
 		m->valid = VM_PAGE_BITS_ALL;
@@ -773,7 +773,7 @@
 			 * Read filled up entire page.
 			 */
 			mt->valid = VM_PAGE_BITS_ALL;
-			mt->dirty = 0;
+			vm_page_undirty(mt);	/* should be an assert? XXX */
 			pmap_clear_modify(VM_PAGE_TO_PHYS(mt));
 		} else {
 			/*


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-hackers" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199907291856.LAA77471>