Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 4 Feb 2013 15:05:15 -0800
From:      Neel Natu <neelnatu@gmail.com>
To:        hackers@freebsd.org
Cc:        alc@freebsd.org, davide@freebsd.org, rank1seeker@gmail.com, avg@freebsd.org
Subject:   dynamically calculating NKPT [was: Re: huge ktr buffer]
Message-ID:  <CAFgRE9F4JMutV9jJ_m7_9va67xiX4YXMT%2BRm6rUoDPMPymsg4w@mail.gmail.com>

next in thread | raw e-mail | index | archive | help
Hi,

I have a patch to dynamically calculate NKPT for amd64 kernels. This
should fix the various issues that people pointed out in the email
thread.

Please review and let me know if there are any objections to committing this.

Also, thanks to Alan (alc@) for reviewing and providing feedback on
the initial version of the patch.

Patch (also available at http://people.freebsd.org/~neel/patches/nkpt_diff.txt):

Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h	(revision 246277)
+++ sys/amd64/include/pmap.h	(working copy)
@@ -113,13 +113,7 @@
 	((unsigned long)(l2) << PDRSHIFT) | \
 	((unsigned long)(l1) << PAGE_SHIFT))

-/* Initial number of kernel page tables. */
-#ifndef NKPT
-#define	NKPT		32
-#endif
-
 #define NKPML4E		1		/* number of kernel PML4 slots */
-#define NKPDPE		howmany(NKPT, NPDEPG)/* number of kernel PDP slots */

 #define	NUPML4E		(NPML4EPG/2)	/* number of userland PML4 pages */
 #define	NUPDPE		(NUPML4E*NPDPEPG)/* number of userland PDP pages */
@@ -181,6 +175,7 @@
 #define	PML4map		((pd_entry_t *)(addr_PML4map))
 #define	PML4pml4e	((pd_entry_t *)(addr_PML4pml4e))

+extern int nkpt;		/* Initial number of kernel page tables */
 extern u_int64_t KPDPphys;	/* physical address of kernel level 3 */
 extern u_int64_t KPML4phys;	/* physical address of kernel level 4 */

Index: sys/amd64/amd64/minidump_machdep.c
===================================================================
--- sys/amd64/amd64/minidump_machdep.c	(revision 246277)
+++ sys/amd64/amd64/minidump_machdep.c	(working copy)
@@ -232,7 +232,7 @@
 	/* Walk page table pages, set bits in vm_page_dump */
 	pmapsize = 0;
 	pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
-	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
+	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
 	    kernel_vm_end); ) {
 		/*
 		 * We always write a page, even if it is zero. Each
@@ -364,7 +364,7 @@
 	/* Dump kernel page directory pages */
 	bzero(fakepd, sizeof(fakepd));
 	pdp = (uint64_t *)PHYS_TO_DMAP(KPDPphys);
-	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + NKPT * NBPDR,
+	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
 	    kernel_vm_end); va += NBPDP) {
 		i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);

Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c	(revision 246277)
+++ sys/amd64/amd64/pmap.c	(working copy)
@@ -202,6 +202,10 @@
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

+int nkpt;
+SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
+    "Number of kernel page table pages allocated on bootup");
+
 static int ndmpdp;
 static vm_paddr_t dmaplimit;
 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
@@ -495,17 +499,42 @@

 CTASSERT(powerof2(NDMPML4E));

+/* number of kernel PDP slots */
+#define	NKPDPE(ptpgs)		howmany((ptpgs), NPDEPG)
+
 static void
+nkpt_init(vm_paddr_t addr)
+{
+	int pt_pages;
+
+#ifdef NKPT
+	pt_pages = NKPT;
+#else
+	pt_pages = howmany(addr, 1 << PDRSHIFT);
+	pt_pages += NKPDPE(pt_pages);
+
+	/*
+	 * Add some slop beyond the bare minimum required for bootstrapping
+	 * the kernel.
+	 *
+	 * This is quite important when allocating KVA for kernel modules.
+	 * The modules are required to be linked in the negative 2GB of
+	 * the address space.  If we run out of KVA in this region then
+	 * pmap_growkernel() will need to allocate page table pages to map
+	 * the entire 512GB of KVA space which is an unnecessary tax on
+	 * physical memory.
+	 */
+	pt_pages += 4;		/* 8MB additional slop for kernel modules */
+#endif
+	nkpt = pt_pages;
+}
+
+static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
-	int i, j, ndm1g;
+	int i, j, ndm1g, nkpdpe;

-	/* Allocate pages */
-	KPTphys = allocpages(firstaddr, NKPT);
-	KPML4phys = allocpages(firstaddr, 1);
-	KPDPphys = allocpages(firstaddr, NKPML4E);
-	KPDphys = allocpages(firstaddr, NKPDPE);
-
+	/* Allocate page table pages for the direct map */
 	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
 		ndmpdp = 4;
@@ -517,6 +546,22 @@
 		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

+	/* Allocate pages */
+	KPML4phys = allocpages(firstaddr, 1);
+	KPDPphys = allocpages(firstaddr, NKPML4E);
+
+	/*
+	 * Allocate the initial number of kernel page table pages required to
+	 * bootstrap.  We defer this until after all memory-size dependent
+	 * allocations are done (e.g. direct map), so that we don't have to
+	 * build in too much slop in our estimate.
+	 */
+	nkpt_init(*firstaddr);
+	nkpdpe = NKPDPE(nkpt);
+
+	KPTphys = allocpages(firstaddr, nkpt);
+	KPDphys = allocpages(firstaddr, nkpdpe);
+
 	/* Fill in the underlying page table pages */
 	/* Read-only from zero to physfree */
 	/* XXX not fully used, underneath 2M pages */
@@ -526,7 +571,7 @@
 	}

 	/* Now map the page tables at their location within PTmap */
-	for (i = 0; i < NKPT; i++) {
+	for (i = 0; i < nkpt; i++) {
 		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
 		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
 	}
@@ -539,7 +584,7 @@
 	}

 	/* And connect up the PD to the PDP */
-	for (i = 0; i < NKPDPE; i++) {
+	for (i = 0; i < nkpdpe; i++) {
 		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
 		    (i << PAGE_SHIFT);
 		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
@@ -768,7 +813,7 @@
 	 * Initialize the vm page array entries for the kernel pmap's
 	 * page table pages.
 	 */
-	for (i = 0; i < NKPT; i++) {
+	for (i = 0; i < nkpt; i++) {
 		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
 		KASSERT(mpte >= vm_page_array &&
 		    mpte < &vm_page_array[vm_page_array_size],
@@ -1995,7 +2040,7 @@
 	 * any new kernel page table pages between "kernel_vm_end" and
 	 * "KERNBASE".
 	 */
-	if (KERNBASE < addr && addr <= KERNBASE + NKPT * NBPDR)
+	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
 		return;
 
 	addr = roundup2(addr, NBPDR);


best
Neel

On Sun, Dec 9, 2012 at 5:41 AM,  <rank1seeker@gmail.com> wrote:
>> As also Alan suggested, a way to workaround the problem is to increase
>> NKPT value (e.g. from 32 to 64). Obviously, this is not a proper fix.
>> For a proper fix the kernel needs to be able to dynamically set the
>> size of NKPT.  In this particular case, this wouldn't be too hard, but
>> there is a different case, where people preload a large memory disk
>> image at boot time that isn't so easy to fix.
>>
>> Thanks,
>>
>> Davide
>
>
> Had a same issue.
> I use very big preloaded images, with full world + many compiled ports in it.
>
> Fix:
> 'sh' code snip ...
> ----
> # Get default NKTP value
> nkpt=`cat "/sys/$arch/include/pmap.h" | sed -En 's/.+NKPT[[:blank:]]+([0-9]{2})$/\1/p'`
>
> # How many additional NKPT (4 Mb each), for our image, added to amount of NKPT?
> # Calculated in Kb
> : $((nkpt += "$img_size" / 4096))
> ----
>
> But it loads sooooo slow into the RAM.
> That should be enhanced, too.
>
>
> Domagoj Smolčić
> _______________________________________________
> freebsd-hackers@freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
> To unsubscribe, send any mail to "freebsd-hackers-unsubscribe@freebsd.org"



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?CAFgRE9F4JMutV9jJ_m7_9va67xiX4YXMT%2BRm6rUoDPMPymsg4w>