Date:      Tue, 14 Mar 2006 21:27:47 GMT
From:      Peter Wemm <peter@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 93306 for review
Message-ID:  <200603142127.k2ELRlP6089714@repoman.freebsd.org>

http://perforce.freebsd.org/chv.cgi?CH=93306

Change 93306 by peter@peter_daintree on 2006/03/14 21:27:31

	Check in my WIP for half-sized pv entries, very roughly ported
	forward from 6.x at Yahoo.  Much work is still to be done, especially
	the get_pv_entry() reclaim process, which needs to be synced with
	alc's changes.
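
	For anyone following along: the idea is to shrink struct pv_entry
	by dropping its pv_pmap pointer and pv_plist linkage, and instead
	to allocate pv entries in page-sized chunks hung off the owning
	pmap, each chunk carrying a back-pointer to the pmap and a bitmap
	of free slots.  The following is a minimal userland sketch of the
	allocation-side bookkeeping, not the kernel code: the *_model
	names are made up, calloc() stands in for vm_page_alloc() plus the
	direct map, a singly linked list stands in for the TAILQ, and
	__builtin_ctzll() stands in for the bit-scan instruction used in
	the patch.

#include <stdint.h>
#include <stdlib.h>

#define NPCM	3			/* bitmap words per chunk */
#define NPCPV	168			/* pv entries per chunk */

struct pv_entry_model {
	uintptr_t	va;		/* virtual address of the mapping */
};

struct pv_chunk_model {
	struct pv_chunk_model	*next;		/* stands in for the pm_pvchunk TAILQ */
	uint64_t		map[NPCM];	/* bitmap; 1 = slot free */
	struct pv_entry_model	entry[NPCPV];
};

/*
 * Hand out one pv entry from the head chunk, or start a new chunk when
 * no free slot is left.  The kernel keeps chunks with free slots at the
 * list head, so looking only at the head is enough.
 */
static struct pv_entry_model *
pv_alloc(struct pv_chunk_model **head)
{
	struct pv_chunk_model *pc = *head;
	int field, bit;

	if (pc != NULL) {
		for (field = 0; field < NPCM; field++) {
			if (pc->map[field] != 0) {
				bit = __builtin_ctzll(pc->map[field]);
				pc->map[field] &= ~(1ULL << bit);	/* mark in use */
				return (&pc->entry[field * 64 + bit]);
			}
		}
	}
	/* No free slot: allocate a fresh chunk with slot 0 already handed out. */
	pc = calloc(1, sizeof(*pc));
	if (pc == NULL)
		return (NULL);
	pc->map[0] = ~0ULL & ~1ULL;
	pc->map[1] = ~0ULL;
	pc->map[2] = (1ULL << (NPCPV - 2 * 64)) - 1;	/* only 168 slots exist */
	pc->next = *head;
	*head = pc;
	return (&pc->entry[0]);
}

	The kernel version additionally moves a chunk to the tail of
	pm_pvchunk when its last slot is taken and back to the head when a
	slot is freed, which is what the TAILQ shuffling in the diff below
	is doing.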

Affected files ...

.. //depot/projects/hammer/sys/amd64/amd64/pmap.c#134 edit
.. //depot/projects/hammer/sys/amd64/include/pmap.h#60 edit

Differences ...

==== //depot/projects/hammer/sys/amd64/amd64/pmap.c#134 (text+ko) ====

@@ -182,7 +182,6 @@
 /*
  * Data for the pv entry allocation mechanism
  */
-static uma_zone_t pvzone;
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 static int shpgperproc = PMAP_SHPGPERPROC;
 
@@ -198,7 +197,7 @@
  */
 static caddr_t crashdumpmap;
 
-static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
+static PMAP_INLINE void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t locked_pmap);
 static void	pmap_clear_ptes(vm_page_t m, long bit);
 
@@ -509,7 +508,7 @@
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
 	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
-	TAILQ_INIT(&kernel_pmap->pm_pvlist);
+	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	nkpt = NKPT;
 
 	/*
@@ -569,8 +568,6 @@
 	 * high water mark so that the system can recover from excessive
 	 * numbers of pv entries.
 	 */
-	pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, 
-	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM);
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
@@ -1063,7 +1060,7 @@
 	PMAP_LOCK_INIT(pmap);
 	pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
 	pmap->pm_active = 0;
-	TAILQ_INIT(&pmap->pm_pvlist);
+	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 }
 
@@ -1100,7 +1097,7 @@
 	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
 
 	pmap->pm_active = 0;
-	TAILQ_INIT(&pmap->pm_pvlist);
+	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 }
 
@@ -1437,14 +1434,51 @@
  * page management routines.
  ***************************************************/
 
+CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
+CTASSERT(_NPCM == 3);
+CTASSERT(_NPCPV == 168);
+
+static __inline struct pv_chunk *
+pv_to_chunk(pv_entry_t pv)
+{
+
+	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
+}
+
+#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
+
+#define	PC_FREE0	0xfffffffffffffffful
+#define	PC_FREE1	0xfffffffffffffffful
+#define	PC_FREE2	0x000000fffffffffful
+
 /*
  * free the pv_entry back to the free list
  */
 static PMAP_INLINE void
-free_pv_entry(pv_entry_t pv)
+free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
+	vm_page_t m;
+	struct pv_chunk *pc;
+	int idx, field, bit;
+
 	pv_entry_count--;
-	uma_zfree(pvzone, pv);
+	pc = pv_to_chunk(pv);
+	idx = pv - &pc->pc_pventry[0];
+	field = idx / 64;
+	bit = idx % 64;
+	pc->pc_map[field] |= 1ul << bit;
+	/* move to head of list */
+	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
+	    pc->pc_map[2] != PC_FREE2)
+		return;
+	/* entire chunk is free, return it */
+	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
+	vm_page_lock_queues();
+	vm_page_free(m);
+	vm_page_unlock_queues();
 }
 
 /*
@@ -1452,6 +1486,53 @@
  * when needed.
  */
 static pv_entry_t
+get_pv_entry(pmap_t pmap)
+{
+	static vm_pindex_t colour;
+	int bit, field;
+	pv_entry_t pv;
+	struct pv_chunk *pc;
+	vm_page_t m;
+
+	pv_entry_count++;
+	if ((pv_entry_count > pv_entry_high_water) &&
+		(pmap_pagedaemon_waken == 0)) {
+		pmap_pagedaemon_waken = 1;
+		wakeup (&vm_pages_needed);
+	}
+	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+	for (field = 0; field < _NPCM; field++) {
+		bit = bsrq(pc->pc_map[field]);
+		if (bit >= 0)
+			break;
+	}
+	if (bit >= 0) {
+		pv = &pc->pc_pventry[field * 64 + bit];
+		pc->pc_map[field] &= ~(1ul << bit);
+		/* If this was the last item, move it to tail */
+		if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
+		    pc->pc_map[2] == 0) {
+			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+		}
+		return (pv);
+	}
+	/* No free items, allocate another chunk */
+	m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ);
+	if (m == NULL)
+		return (NULL);
+	colour++;
+	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+	pc->pc_pmap = pmap;
+	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
+	pc->pc_map[1] = PC_FREE1;
+	pc->pc_map[2] = PC_FREE2;
+	pv = &pc->pc_pventry[0];
+	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+	return (pv);
+}
+#if 0
+static pv_entry_t
 get_pv_entry(pmap_t locked_pmap)
 {
 	static const struct timeval printinterval = { 60, 0 };
@@ -1535,6 +1616,7 @@
 	}
 	return (allocated_pv);
 }
+#endif
 
 static void
 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
@@ -1543,24 +1625,16 @@
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
-		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-			if (pmap == pv->pv_pmap && va == pv->pv_va) 
-				break;
-		}
-	} else {
-		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
-			if (va == pv->pv_va) 
-				break;
-		}
+	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+		if (pmap == PV_PMAP(pv) && va == pv->pv_va) 
+			break;
 	}
 	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
 	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 	m->md.pv_list_count--;
 	if (TAILQ_EMPTY(&m->md.pv_list))
 		vm_page_flag_clear(m, PG_WRITEABLE);
-	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
-	free_pv_entry(pv);
+	free_pv_entry(pmap, pv);
 }
 
 /*
@@ -1574,11 +1648,9 @@
 
 	pv = get_pv_entry(pmap);
 	pv->pv_va = va;
-	pv->pv_pmap = pmap;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 	m->md.pv_list_count++;
 }
@@ -1760,6 +1832,7 @@
 pmap_remove_all(vm_page_t m)
 {
 	register pv_entry_t pv;
+	pmap_t pmap;
 	pt_entry_t *pte, tpte;
 	pd_entry_t ptepde;
 
@@ -1774,12 +1847,13 @@
 #endif
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
-		PMAP_LOCK(pv->pv_pmap);
-		pv->pv_pmap->pm_stats.resident_count--;
-		pte = pmap_pte_pde(pv->pv_pmap, pv->pv_va, &ptepde);
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pmap->pm_stats.resident_count--;
+		pte = pmap_pte_pde(pmap, pv->pv_va, &ptepde);
 		tpte = pte_load_clear(pte);
 		if (tpte & PG_W)
-			pv->pv_pmap->pm_stats.wired_count--;
+			pmap->pm_stats.wired_count--;
 		if (tpte & PG_A)
 			vm_page_flag_set(m, PG_REFERENCED);
 
@@ -1793,13 +1867,12 @@
 			if (pmap_track_modified(pv->pv_va))
 				vm_page_dirty(m);
 		}
-		pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
-		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
+		pmap_invalidate_page(pmap, pv->pv_va);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 		m->md.pv_list_count--;
-		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, ptepde);
-		PMAP_UNLOCK(pv->pv_pmap);
-		free_pv_entry(pv);
+		pmap_unuse_pt(pmap, pv->pv_va, ptepde);
+		PMAP_UNLOCK(pmap);
+		free_pv_entry(pmap, pv);
 	}
 	vm_page_flag_clear(m, PG_WRITEABLE);
 }
@@ -2563,7 +2636,7 @@
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
-		if (pv->pv_pmap == pmap) {
+		if (PV_PMAP(pv) == pmap) {
 			return TRUE;
 		}
 		loops++;
@@ -2573,7 +2646,6 @@
 	return (FALSE);
 }
 
-#define PMAP_REMOVE_PAGES_CURPROC_ONLY
 /*
  * Remove all pages from specified address space
  * this aids process exit speeds.  Also, this code
@@ -2589,73 +2661,97 @@
 {
 	pt_entry_t *pte, tpte;
 	vm_page_t m;
-	pv_entry_t pv, npv;
+	pv_entry_t pv;
+	struct pv_chunk *pc, *npc;
+	int field, idx;
+	int64_t bit;
+	int allfree, didfree;
 
-#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
-#endif
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
-	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
+	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
+		allfree = 1;
+		didfree = 0;
+		/*
+		 * If only we could eliminate the sva/eva tests, and define
+		 * pmap_remove_pages() to simply remove *ALL* user pages, we
+		 * could make it faster here. eg: replace for() loop with
+		 * bsrq() and some other algorithm changes.
+		 */
+		for (idx = 0; idx < _NPCPV; idx++) {
+			field = idx / 64;
+			bit = idx % 64;
+			if ((pc->pc_map[field] & 1ul << bit) == 0) { /* inuse */
+				pv = &pc->pc_pventry[idx];
 
-		if (pv->pv_va >= eva || pv->pv_va < sva) {
-			npv = TAILQ_NEXT(pv, pv_plist);
-			continue;
-		}
+				if (pv->pv_va >= eva || pv->pv_va < sva) {
+					allfree = 0;
+					continue;
+				}
 
-#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
-		pte = vtopte(pv->pv_va);
-#else
-		pte = pmap_pte(pmap, pv->pv_va);
-#endif
-		tpte = *pte;
+				pte = vtopte(pv->pv_va);
+				tpte = *pte;
 
-		if (tpte == 0) {
-			printf("TPTE at %p  IS ZERO @ VA %08lx\n",
-							pte, pv->pv_va);
-			panic("bad pte");
-		}
+				if (tpte == 0) {
+					printf(
+					    "TPTE at %p  IS ZERO @ VA %08lx\n",
+					    pte, pv->pv_va);
+					panic("bad pte");
+				}
 
 /*
  * We cannot remove wired pages from a process' mapping at this time
  */
-		if (tpte & PG_W) {
-			npv = TAILQ_NEXT(pv, pv_plist);
-			continue;
-		}
+				if (tpte & PG_W)
+					continue;
+
+				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+				KASSERT(m->phys_addr == (tpte & PG_FRAME),
+				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
+				    m, (uintmax_t)m->phys_addr,
+				    (uintmax_t)tpte));
 
-		m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
-		KASSERT(m->phys_addr == (tpte & PG_FRAME),
-		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
-		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
+				KASSERT(m < &vm_page_array[vm_page_array_size],
+					("pmap_remove_pages: bad tpte %#jx",
+					(uintmax_t)tpte));
 
-		KASSERT(m < &vm_page_array[vm_page_array_size],
-			("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
+				pmap->pm_stats.resident_count--;
 
-		pmap->pm_stats.resident_count--;
+				pte_clear(pte);
 
-		pte_clear(pte);
+				/*
+				 * Update the vm_page_t clean/reference bits.
+				 */
+				if (tpte & PG_M)
+					vm_page_dirty(m);
 
-		/*
-		 * Update the vm_page_t clean and reference bits.
-		 */
-		if (tpte & PG_M) {
-			vm_page_dirty(m);
+				/* Mark free */
+				didfree = 1;
+				pc->pc_map[field] |= 1ul << bit;
+				m->md.pv_list_count--;
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+				if (TAILQ_EMPTY(&m->md.pv_list))
+					vm_page_flag_clear(m, PG_WRITEABLE);
+				pmap_unuse_pt(pmap, pv->pv_va,
+				    *vtopde(pv->pv_va));
+			}
+		}
+		if (allfree) {
+			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
+			vm_page_lock_queues();
+			vm_page_free(m);
+			vm_page_unlock_queues();
+		} else {
+			if (didfree) {
+				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+			}
 		}
-
-		npv = TAILQ_NEXT(pv, pv_plist);
-		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
-
-		m->md.pv_list_count--;
-		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-		if (TAILQ_EMPTY(&m->md.pv_list))
-			vm_page_flag_clear(m, PG_WRITEABLE);
-
-		pmap_unuse_pt(pmap, pv->pv_va, *vtopde(pv->pv_va));
-		free_pv_entry(pv);
 	}
 	pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
@@ -2673,6 +2769,7 @@
 {
 	pv_entry_t pv;
 	pt_entry_t *pte;
+	pmap_t pmap;
 	boolean_t rv;
 
 	rv = FALSE;
@@ -2688,10 +2785,11 @@
 		 */
 		if (!pmap_track_modified(pv->pv_va))
 			continue;
-		PMAP_LOCK(pv->pv_pmap);
-		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte = pmap_pte(pmap, pv->pv_va);
 		rv = (*pte & PG_M) != 0;
-		PMAP_UNLOCK(pv->pv_pmap);
+		PMAP_UNLOCK(pmap);
 		if (rv)
 			break;
 	}
@@ -2729,6 +2827,7 @@
 pmap_clear_ptes(vm_page_t m, long bit)
 {
 	register pv_entry_t pv;
+	pmap_t pmap;
 	pt_entry_t pbits, *pte;
 
 	if ((m->flags & PG_FICTITIOUS) ||
@@ -2749,8 +2848,9 @@
 				continue;
 		}
 
-		PMAP_LOCK(pv->pv_pmap);
-		pte = pmap_pte(pv->pv_pmap, pv->pv_va);
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		pte = pmap_pte(pmap, pv->pv_va);
 retry:
 		pbits = *pte;
 		if (pbits & bit) {
@@ -2764,9 +2864,9 @@
 			} else {
 				atomic_clear_long(pte, bit);
 			}
-			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
+			pmap_invalidate_page(pmap, pv->pv_va);
 		}
-		PMAP_UNLOCK(pv->pv_pmap);
+		PMAP_UNLOCK(pmap);
 	}
 	if (bit == PG_RW)
 		vm_page_flag_clear(m, PG_WRITEABLE);
@@ -2805,6 +2905,7 @@
 pmap_ts_referenced(vm_page_t m)
 {
 	register pv_entry_t pv, pvf, pvn;
+	pmap_t pmap;
 	pt_entry_t *pte;
 	pt_entry_t v;
 	int rtval = 0;
@@ -2827,20 +2928,21 @@
 			if (!pmap_track_modified(pv->pv_va))
 				continue;
 
-			PMAP_LOCK(pv->pv_pmap);
-			pte = pmap_pte(pv->pv_pmap, pv->pv_va);
+			pmap = PV_PMAP(pv);
+			PMAP_LOCK(pmap);
+			pte = pmap_pte(pmap, pv->pv_va);
 
 			if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
 				atomic_clear_long(pte, PG_A);
-				pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
+				pmap_invalidate_page(pmap, pv->pv_va);
 
 				rtval++;
 				if (rtval > 4) {
-					PMAP_UNLOCK(pv->pv_pmap);
+					PMAP_UNLOCK(pmap);
 					break;
 				}
 			}
-			PMAP_UNLOCK(pv->pv_pmap);
+			PMAP_UNLOCK(pmap);
 		} while ((pv = pvn) != NULL && pv != pvf);
 	}
 

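	The pmap.c changes above rely on each struct pv_chunk occupying
	exactly one page-aligned page in the direct map, so PV_PMAP() can
	recover the owning pmap from any pv entry by masking its address
	down to the page boundary instead of storing a pv_pmap pointer per
	entry.  Below is a self-contained sketch of that recovery and of
	the free-side index arithmetic; again a userland model with
	invented names, where aligned_alloc() would play the role of the
	page allocator.

#include <stdint.h>
#include <stdlib.h>

#define MODEL_PAGE_SIZE	4096UL
#define NPCM	3
#define NPCPV	168

struct entry {
	uintptr_t	va;
};

struct chunk {
	void		*pmap;		/* back-pointer; replaces per-entry pv_pmap */
	uint64_t	map[NPCM];	/* bitmap; 1 = slot free */
	struct entry	slot[NPCPV];
};

/*
 * A chunk is page-sized and page-aligned (the kernel gets this from the
 * direct map; this model would use aligned_alloc()), so masking any
 * entry pointer down to the page boundary recovers the chunk header
 * and, through it, the owning pmap.
 */
static struct chunk *
entry_to_chunk(struct entry *e)
{
	return ((struct chunk *)((uintptr_t)e & ~(MODEL_PAGE_SIZE - 1)));
}

/* Return an entry to its chunk; report whether the chunk is now all free. */
static int
entry_free(struct entry *e)
{
	struct chunk *pc = entry_to_chunk(e);
	int idx = (int)(e - pc->slot);

	pc->map[idx / 64] |= 1ULL << (idx % 64);
	return (pc->map[0] == ~0ULL && pc->map[1] == ~0ULL &&
	    pc->map[2] == (1ULL << (NPCPV - 2 * 64)) - 1);
}

	A chunk in this model would come from
	aligned_alloc(MODEL_PAGE_SIZE, MODEL_PAGE_SIZE); once entry_free()
	reports the chunk fully free, the caller unlinks it and releases
	the page, which is what free_pv_entry() does with vm_page_free().
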
==== //depot/projects/hammer/sys/amd64/include/pmap.h#60 (text+ko) ====

@@ -222,6 +222,7 @@
  * Pmap stuff
  */
 struct	pv_entry;
+struct	pv_chunk;
 
 struct md_page {
 	int pv_list_count;
@@ -231,7 +232,7 @@
 struct pmap {
 	struct mtx		pm_mtx;
 	pml4_entry_t		*pm_pml4;	/* KVA of level 4 page table */
-	TAILQ_HEAD(,pv_entry)	pm_pvlist;	/* list of mappings in pmap */
+	TAILQ_HEAD(,pv_chunk)	pm_pvchunk;	/* list of mappings in pmap */
 	u_int			pm_active;	/* active on cpus */
 	/* spare u_int here due to padding */
 	struct pmap_statistics	pm_stats;	/* pmap statistics */
@@ -260,12 +261,24 @@
  * mappings of that page.  An entry is a pv_entry_t, the list is pv_table.
  */
 typedef struct pv_entry {
-	pmap_t		pv_pmap;	/* pmap where mapping lies */
 	vm_offset_t	pv_va;		/* virtual address for mapping */
 	TAILQ_ENTRY(pv_entry)	pv_list;
-	TAILQ_ENTRY(pv_entry)	pv_plist;
 } *pv_entry_t;
 
+/*
+ * pv_entries are allocated in chunks per-process.  This avoids the
+ * need to track per-pmap assignments.
+ */
+#define	_NPCM	3
+#define	_NPCPV	168
+struct pv_chunk {
+	pmap_t			pc_pmap;
+	TAILQ_ENTRY(pv_chunk)	pc_list;
+	uint64_t		pc_map[_NPCM];	/* bitmap; 1 = free */
+	uint64_t		pc_spare[2];
+	struct pv_entry		pc_pventry[_NPCPV];
+};
+
 #ifdef	_KERNEL
 
 #define NPPROVMTRR		8

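	For reference, the constants in pmap.h follow directly from the
	usual amd64 sizes (8-byte pointers, 16-byte TAILQ_ENTRY, 4 KB
	pages); the arithmetic below is mine, but it agrees with the
	CTASSERTs added in pmap.c:

    sizeof(struct pv_entry) = 8 (pv_va) + 16 (pv_list)              =  24 bytes
    chunk header            = 8 (pc_pmap) + 16 (pc_list)
                              + 3*8 (pc_map) + 2*8 (pc_spare)       =  64 bytes
    entries per chunk       = (4096 - 64) / 24                      = 168  (_NPCPV)
    bitmap words            = howmany(168, 64)                      =   3  (_NPCM)
    last-word free mask     = 168 - 2*64 = 40 low bits set          = 0x000000ffffffffff (PC_FREE2)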

