Date:      Wed, 20 Mar 2002 18:07:11 -0800 (PST)
From:      Thomas Moestl <tmm@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 8110 for review
Message-ID:  <200203210207.g2L27BE62790@freefall.freebsd.org>

http://people.freebsd.org/~peter/p4db/chv.cgi?CH=8110

Change 8110 by tmm@tmm_sparc64 on 2002/03/20 18:06:36

	Integ VM optimizations and caching fixes from sparc64-tmm.

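The common thread in the pager and filesystem hunks below: raw
pmap_qenter()/pmap_qremove() pairs are replaced with mappbuf()/unmappbuf(),
which may shift a pbuf's mapping to a kva whose virtual cache color matches
the pages being mapped. Consolidated from the nwfs hunks (the implementations
of mappbuf()/unmappbuf() themselves are not part of this excerpt), the
resulting pager pattern looks roughly like this:

	bp = getpbuf(&nwfs_pbuf_freecnt);
	npages = btoc(count);
	/* Map the pages at a kva with a matching virtual color. */
	mappbuf(bp, vp->v_object, IDX_TO_OFF(pages[0]->pindex), pages,
	    npages, BIO_READ);
	kva = (vm_offset_t)bp->b_data;	/* possibly color-shifted */

	/* ... perform the transfer through kva ... */

	/* Unmap; the pmap may flush the cache for the alias here. */
	unmappbuf(bp, pages, 0, npages, BIO_READ);
	relpbuf(bp, &nwfs_pbuf_freecnt);
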
Affected files ...

... //depot/projects/sparc64/sys/fs/nwfs/nwfs_io.c#5 integrate
... //depot/projects/sparc64/sys/fs/smbfs/smbfs_io.c#4 integrate
... //depot/projects/sparc64/sys/fs/specfs/spec_vnops.c#10 integrate
... //depot/projects/sparc64/sys/kern/imgact_elf.c#17 integrate
... //depot/projects/sparc64/sys/kern/init_main.c#17 integrate
... //depot/projects/sparc64/sys/kern/kern_exec.c#18 integrate
... //depot/projects/sparc64/sys/kern/kern_resource.c#14 integrate
... //depot/projects/sparc64/sys/kern/sys_pipe.c#13 integrate
... //depot/projects/sparc64/sys/kern/sys_process.c#15 integrate
... //depot/projects/sparc64/sys/kern/vfs_bio.c#20 integrate
... //depot/projects/sparc64/sys/kern/vfs_cluster.c#9 integrate
... //depot/projects/sparc64/sys/nfsclient/nfs_bio.c#8 integrate
... //depot/projects/sparc64/sys/nfsclient/nfs_vnops.c#10 integrate
... //depot/projects/sparc64/sys/sparc64/include/param.h#22 integrate
... //depot/projects/sparc64/sys/sparc64/include/pmap.h#28 integrate
... //depot/projects/sparc64/sys/sparc64/include/vmparam.h#16 integrate
... //depot/projects/sparc64/sys/sparc64/sparc64/pmap.c#78 integrate
... //depot/projects/sparc64/sys/sparc64/sparc64/vm_machdep.c#46 integrate
... //depot/projects/sparc64/sys/sys/buf.h#12 integrate
... //depot/projects/sparc64/sys/sys/pipe.h#3 integrate
... //depot/projects/sparc64/sys/vm/pmap.h#7 integrate
... //depot/projects/sparc64/sys/vm/swap_pager.c#9 integrate
... //depot/projects/sparc64/sys/vm/vm.h#5 integrate
... //depot/projects/sparc64/sys/vm/vm_contig.c#6 integrate
... //depot/projects/sparc64/sys/vm/vm_extern.h#7 integrate
... //depot/projects/sparc64/sys/vm/vm_fault.c#9 integrate
... //depot/projects/sparc64/sys/vm/vm_init.c#5 integrate
... //depot/projects/sparc64/sys/vm/vm_kern.c#6 integrate
... //depot/projects/sparc64/sys/vm/vm_kern.h#2 integrate
... //depot/projects/sparc64/sys/vm/vm_map.c#12 integrate
... //depot/projects/sparc64/sys/vm/vm_mmap.c#13 integrate
... //depot/projects/sparc64/sys/vm/vm_object.c#8 integrate
... //depot/projects/sparc64/sys/vm/vm_page.c#12 integrate
... //depot/projects/sparc64/sys/vm/vm_page.h#7 integrate
... //depot/projects/sparc64/sys/vm/vm_pager.c#10 integrate
... //depot/projects/sparc64/sys/vm/vm_pager.h#4 integrate
... //depot/projects/sparc64/sys/vm/vm_unix.c#5 integrate
... //depot/projects/sparc64/sys/vm/vnode_pager.c#11 integrate

Differences ...

==== //depot/projects/sparc64/sys/fs/nwfs/nwfs_io.c#5 (text+ko) ====

@@ -421,8 +421,9 @@
 
 	bp = getpbuf(&nwfs_pbuf_freecnt);
 	npages = btoc(count);
+	mappbuf(bp, vp->v_object, IDX_TO_OFF(pages[0]->pindex), pages,
+	    npages, BIO_READ);
 	kva = (vm_offset_t) bp->b_data;
-	pmap_qenter(kva, pages, npages);
 
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
@@ -435,7 +436,7 @@
 	uio.uio_td = td;
 
 	error = ncp_read(NWFSTOCONN(nmp), &np->n_fh, &uio,cred);
-	pmap_qremove(kva, npages);
+	unmappbuf(bp, pages, 0, npages, BIO_READ);
 
 	relpbuf(bp, &nwfs_pbuf_freecnt);
 
@@ -458,6 +459,7 @@
 		m->flags &= ~PG_ZERO;
 
 		if (nextoff <= size) {
+			pmap_page_validated(m);
 			m->valid = VM_PAGE_BITS_ALL;
 			m->dirty = 0;
 		} else {
@@ -548,8 +550,9 @@
 	}
 
 	bp = getpbuf(&nwfs_pbuf_freecnt);
+	mappbuf(bp, vp->v_object, IDX_TO_OFF(pages[0]->pindex), pages,
+	    npages, BIO_WRITE);
 	kva = (vm_offset_t) bp->b_data;
-	pmap_qenter(kva, pages, npages);
 
 	iov.iov_base = (caddr_t) kva;
 	iov.iov_len = count;
@@ -566,7 +569,7 @@
 /*	VOP_CLOSE(vp, FWRITE, cred, td);*/
 	NCPVNDEBUG("paged write done: %d\n", error);
 
-	pmap_qremove(kva, npages);
+	unmappbuf(bp, pages, 0, npages, BIO_WRITE);
 	relpbuf(bp, &nwfs_pbuf_freecnt);
 
 	if (!error) {

==== //depot/projects/sparc64/sys/fs/smbfs/smbfs_io.c#4 (text+ko) ====

@@ -447,8 +447,9 @@
 	bp = getpbuf();
 #endif
 	npages = btoc(count);
+	mappbuf(bp, vp->v_object, IDX_TO_OFF(pages[0]->pindex), pages,
+	    npages, BIO_READ);
 	kva = (vm_offset_t) bp->b_data;
-	pmap_qenter(kva, pages, npages);
 	cnt.v_vnodein++;
 	cnt.v_vnodepgsin += count;
 
@@ -463,7 +464,7 @@
 	uio.uio_td = td;
 
 	error = smb_read(smp->sm_share, np->n_fid, &uio, &scred);
-	pmap_qremove(kva, npages);
+	unmappbuf(bp, pages, 0, npages, BIO_READ);
 
 #if __FreeBSD_version >= 400000
 	relpbuf(bp, &smbfs_pbuf_freecnt);
@@ -490,6 +491,7 @@
 		m->flags &= ~PG_ZERO;
 
 		if (nextoff <= size) {
+			pmap_page_validated(m);
 			m->valid = VM_PAGE_BITS_ALL;
 			vm_page_undirty(m);
 		} else {
@@ -585,8 +587,9 @@
 #else
 	bp = getpbuf();
 #endif
+	mappbuf(bp, vp->v_object, IDX_TO_OFF(pages[0]->pindex), pages,
+	    npages, BIO_WRITE);
 	kva = (vm_offset_t) bp->b_data;
-	pmap_qenter(kva, pages, npages);
 	cnt.v_vnodeout++;
 	cnt.v_vnodepgsout += count;
 
@@ -606,7 +609,7 @@
 /*	VOP_CLOSE(vp, FWRITE, cred, td);*/
 	SMBVDEBUG("paged write done: %d\n", error);
 
-	pmap_qremove(kva, npages);
+	unmappbuf(bp, pages, 0, npages, BIO_WRITE);
 #if __FreeBSD_version >= 400000
 	relpbuf(bp, &smbfs_pbuf_freecnt);
 #else

==== //depot/projects/sparc64/sys/fs/specfs/spec_vnops.c#10 (text+ko) ====

@@ -668,6 +668,7 @@
 	daddr_t blkno;
 	struct buf *bp;
 	vm_page_t m;
+	vm_object_t obj;
 	vm_ooffset_t offset;
 	int toff, nextoff, nread;
 	struct vnode *vp = ap->a_vp;
@@ -718,12 +719,14 @@
 	size = (ap->a_count + blksiz - 1) & ~(blksiz - 1);
 
 	bp = getpbuf(NULL);
-	kva = (vm_offset_t)bp->b_data;
 
 	/*
-	 * Map the pages to be read into the kva.
+	 * Map the pages to be read into the kva. The object may be NULL.
 	 */
-	pmap_qenter(kva, ap->a_m, pcount);
+	VOP_GETVOBJECT(vp, &obj);
+	mappbuf(bp, obj, IDX_TO_OFF(ap->a_m[0]->pindex), ap->a_m,
+	    pcount, BIO_READ);
+	kva = (vm_offset_t)bp->b_data;
 
 	/* Build a minimal buffer header. */
 	bp->b_iocmd = BIO_READ;
@@ -770,7 +773,7 @@
 		bzero((caddr_t)kva + nread,
 			ap->a_count - nread);
 	}
-	pmap_qremove(kva, pcount);
+	unmappbuf(bp, ap->a_m, 0, pcount, BIO_READ);
 
 
 	gotreqpage = 0;
@@ -781,6 +784,7 @@
 		m->flags &= ~PG_ZERO;
 
 		if (nextoff <= nread) {
+			pmap_page_validated(m);
 			m->valid = VM_PAGE_BITS_ALL;
 			vm_page_undirty(m);
 		} else if (toff < nread) {

==== //depot/projects/sparc64/sys/kern/imgact_elf.c#17 (text+ko) ====

@@ -194,6 +194,8 @@
 	size_t copy_len;
 	vm_offset_t file_addr;
 	vm_offset_t data_buf = 0;
+	vm_offset_t color;
+	vm_size_t modulus;
 
 	GIANT_REQUIRED;
 	error = 0;
@@ -213,6 +215,20 @@
 		return (ENOEXEC);
 	}
 
+#ifdef VM_EOE
+	/*
+	 * XXX: this is a bit h0h0: if the segment we are mapping is writable
+	 * and executable, use VM_PROT_LAZY_EXECUTE. Since it is writable, it
+	 * is likely to contain non-executable data too, so that is
+	 * advantageous. In practice, this is the case for data/bss segments of
+	 * dynamic executables, which are usually executable because they
+	 * contain PLTs.
+	 */
+	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
+	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
+		prot = (prot & ~VM_PROT_EXECUTE) | VM_PROT_LAZY_EXECUTE;
+#endif
+
 	map_addr = trunc_page((vm_offset_t)vmaddr);
 	file_addr = trunc_page(offset);
 
@@ -228,6 +244,12 @@
 		map_len = round_page(offset+filsz) - file_addr;
 
 	if (map_len != 0) {
+		/*
+		 * Notify that we are going to map this section. Since the
+		 * address is fixed, the color and modulus that are returned
+		 * must be ignored.
+		 */
+		pmap_addr_color(object, map_addr, file_addr, &modulus);
 		vm_object_reference(object);
 		vm_map_lock(&vmspace->vm_map);
 		rv = vm_map_insert(&vmspace->vm_map,
@@ -266,7 +288,8 @@
 		vm_map_lock(&vmspace->vm_map);
 		rv = vm_map_insert(&vmspace->vm_map, NULL, 0,
 					map_addr, map_addr + map_len,
-					VM_PROT_ALL, VM_PROT_ALL, 0);
+					VM_PROT_READ | VM_PROT_WRITE,
+					VM_PROT_ALL, 0);
 		vm_map_unlock(&vmspace->vm_map);
 		if (rv != KERN_SUCCESS) {
 			return EINVAL; 
@@ -274,16 +297,24 @@
 	}
 
 	if (copy_len != 0) {
+		/*
+		 * The virtual address of this is a bit bogus, but it does not
+		 * really matter in this case, since the correct virtual address
+		 * got passed in above, if not before.
+		 */
+		color = pmap_addr_color(object, trunc_page(offset + filsz),
+		    trunc_page(offset + filsz), &modulus);
 		vm_object_reference(object);
-		rv = vm_map_find(exec_map,
-				 object, 
-				 trunc_page(offset + filsz),
-				 &data_buf,
-				 PAGE_SIZE,
-				 TRUE,
-				 VM_PROT_READ,
-				 VM_PROT_ALL,
-				 MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
+		rv = vm_map_find_color(exec_map,
+				       object, 
+				       trunc_page(offset + filsz),
+				       &data_buf,
+				       PAGE_SIZE,
+				       TRUE,
+				       VM_PROT_READ,
+				       VM_PROT_ALL,
+				       MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL,
+				       color, modulus);
 		if (rv != KERN_SUCCESS) {
 			vm_object_deallocate(object);
 			return EINVAL;
@@ -355,14 +386,7 @@
 	imgp->uap = NULL;
 	imgp->attr = attr;
 	imgp->firstpage = NULL;
-	imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE);
 
-	if (imgp->image_header == NULL) {
-		nd->ni_vp = NULL;
-		error = ENOMEM;
-		goto fail;
-	}
-
 	/* XXXKSE */
         NDINIT(nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread);   
 			 
@@ -462,9 +486,6 @@
 fail:
 	if (imgp->firstpage)
 		exec_unmap_first_page(imgp);
-	if (imgp->image_header)
-		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header,
-			PAGE_SIZE);
 	if (nd->ni_vp)
 		vrele(nd->ni_vp);
 

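The imgact_elf.c hunks introduce VM_PROT_LAZY_EXECUTE for segments that are
both writable and executable, deferring execute permission until it is
actually needed. The pmap side is not part of this excerpt; presumably an
execute fault on such a mapping synchronizes the instruction cache for the
page before granting execute. A purely hypothetical sketch of that fault-path
step (pmap_sync_icache_page() is an assumed helper, not from this change):

	/* Hypothetical: upgrade a lazy-execute mapping on first ifetch. */
	if ((fault_type & VM_PROT_EXECUTE) != 0 &&
	    (entry->protection & VM_PROT_LAZY_EXECUTE) != 0) {
		pmap_sync_icache_page(m);	/* assumed MD helper */
		prot |= VM_PROT_EXECUTE;	/* now safe to execute */
	}
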
==== //depot/projects/sparc64/sys/kern/init_main.c#17 (text+ko) ====

@@ -521,7 +521,7 @@
 	 */
 	addr = trunc_page(USRSTACK - PAGE_SIZE);
 	if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
-			FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
+			FALSE, VM_PROT_STACK, VM_PROT_ALL, 0) != 0)
 		panic("init: couldn't allocate argument space");
 	p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
 	p->p_vmspace->vm_ssize = 1;

==== //depot/projects/sparc64/sys/kern/kern_exec.c#18 (text+ko) ====

@@ -171,14 +171,13 @@
 	 * Allocate temporary demand zeroed space for argument and
 	 *	environment strings
 	 */
-	imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE);
+	imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX);
 	if (imgp->stringbase == NULL) {
 		error = ENOMEM;
 		goto exec_fail;
 	}
 	imgp->stringp = imgp->stringbase;
 	imgp->stringspace = ARG_MAX;
-	imgp->image_header = imgp->stringbase + ARG_MAX;
 
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
@@ -193,7 +192,7 @@
 	error = namei(ndp);
 	if (error) {
 		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
-			ARG_MAX + PAGE_SIZE);
+			ARG_MAX);
 		goto exec_fail;
 	}
 
@@ -471,7 +470,7 @@
 
 	if (imgp->stringbase != NULL)
 		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase,
-			ARG_MAX + PAGE_SIZE);
+			ARG_MAX);
 
 	if (imgp->vp) {
 		NDFREE(ndp, NDF_ONLY_PNBUF);
@@ -502,7 +501,7 @@
 exec_map_first_page(imgp)
 	struct image_params *imgp;
 {
-	int rv, i;
+	int rv, i, col;
 	int initial_pagein;
 	vm_page_t ma[VM_INITIAL_PAGEIN];
 	vm_object_t object;
@@ -548,10 +547,20 @@
 		}
 	}
 
+	col = vm_mdpg_pref_vcol(object, 0, ma, 1);
+	imgp->image_header = (char *)VM_MDPG_KMEM_ALLOC(exec_map, PAGE_SIZE,
+	    col);
+	if (imgp->image_header == NULL) {
+		vm_page_wakeup(ma[0]);
+		return ENOMEM;
+	}
+	pmap_qenter((vm_offset_t)imgp->image_header, ma, 1);
+	/* Reading from the page has the same semantics as a write operation. */
+	vm_mdpg_start_io((vm_offset_t)imgp->image_header, ma, 1, VM_MDPG_READ);
+
 	vm_page_wire(ma[0]);
 	vm_page_wakeup(ma[0]);
 
-	pmap_qenter((vm_offset_t)imgp->image_header, ma, 1);
 	imgp->firstpage = ma[0];
 
 	return 0;
@@ -564,9 +573,13 @@
 	GIANT_REQUIRED;
 
 	if (imgp->firstpage) {
+		vm_mdpg_done_io((vm_offset_t)imgp->image_header,
+		    &imgp->firstpage, 0, 1, VM_MDPG_READ);
 		pmap_qremove((vm_offset_t)imgp->image_header, 1);
 		vm_page_unwire(imgp->firstpage, 1);
 		imgp->firstpage = NULL;
+		kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header,
+		    PAGE_SIZE);
 	}
 }
 
@@ -613,7 +626,7 @@
 
 	/* Allocate a new stack */
 	error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz,
-	    VM_PROT_ALL, VM_PROT_ALL, 0);
+	    VM_PROT_STACK, VM_PROT_ALL, 0);
 	if (error)
 		return (error);
 

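exec_map_first_page() now allocates its temporary mapping at a preferred
virtual color and brackets the access with vm_mdpg_start_io()/
vm_mdpg_done_io(). Consolidated from the two hunks above (VM_MDPG_KMEM_ALLOC
is presumably a color-aware kmem_alloc_wait() wrapper; its definition is not
in this excerpt), the lifecycle is:

	col = vm_mdpg_pref_vcol(object, 0, ma, 1);
	imgp->image_header = (char *)VM_MDPG_KMEM_ALLOC(exec_map, PAGE_SIZE,
	    col);
	pmap_qenter((vm_offset_t)imgp->image_header, ma, 1);
	vm_mdpg_start_io((vm_offset_t)imgp->image_header, ma, 1, VM_MDPG_READ);

	/* ... the ELF header is read through imgp->image_header ... */

	vm_mdpg_done_io((vm_offset_t)imgp->image_header, &imgp->firstpage,
	    0, 1, VM_MDPG_READ);
	pmap_qremove((vm_offset_t)imgp->image_header, 1);
	kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header, PAGE_SIZE);
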
==== //depot/projects/sparc64/sys/kern/kern_resource.c#14 (text+ko) ====

@@ -564,7 +564,7 @@
 			vm_prot_t prot;
 
 			if (limp->rlim_cur > alimp->rlim_cur) {
-				prot = VM_PROT_ALL;
+				prot = VM_PROT_STACK;
 				size = limp->rlim_cur - alimp->rlim_cur;
 				addr = USRSTACK - limp->rlim_cur;
 			} else {

==== //depot/projects/sparc64/sys/kern/sys_pipe.c#13 (text+ko) ====

@@ -307,7 +307,7 @@
 	 */
 	error = vm_map_find(kernel_map, object, 0,
 		(vm_offset_t *) &buffer, size, 1,
-		VM_PROT_ALL, VM_PROT_ALL, 0);
+		VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, 0);
 
 	if (error != KERN_SUCCESS) {
 		vm_object_deallocate(object);
@@ -345,7 +345,7 @@
 	/* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
 	cpipe->pipe_buffer.object = NULL;
 #ifndef PIPE_NODIRECT
-	cpipe->pipe_map.kva = NULL;
+	cpipe->pipe_map.bkva = cpipe->pipe_map.kva = NULL;
 #endif
 	/*
 	 * protect so pipeclose() doesn't follow a junk pointer
@@ -361,7 +361,7 @@
 	 * pipe data structure initializations to support direct pipe I/O
 	 */
 	cpipe->pipe_map.cnt = 0;
-	cpipe->pipe_map.kva = 0;
+	cpipe->pipe_map.bkva = cpipe->pipe_map.kva = 0;
 	cpipe->pipe_map.pos = 0;
 	cpipe->pipe_map.npages = 0;
 	/* cpipe->pipe_map.ms[] = invalid */
@@ -607,8 +607,8 @@
 	vm_page_t m;
 	boolean_t wired;
 	u_int size;
-	int i;
-	vm_offset_t addr, endaddr;
+	int i, col;
+	vm_offset_t addr, endaddr, kva;
 
 	GIANT_REQUIRED;
 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
@@ -648,18 +648,32 @@
 
 /*
  * and map the buffer
+ * The range may span multiple objects, so don't bother to find out which object
+ * to pass to vm_mdpg_pref_vcol(); let it decide based on page colors.
  */
-	if (wpipe->pipe_map.kva == 0) {
+	if (wpipe->pipe_map.bkva == 0) {
 		/*
 		 * We need to allocate space for an extra page because the
 		 * address range might (will) span pages at times.
 		 */
-		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
-			wpipe->pipe_buffer.size + PAGE_SIZE);
+		wpipe->pipe_map.bkva = kmem_alloc_pageable(kernel_map,
+			wpipe->pipe_buffer.size + PAGE_SIZE + VCOLPAD);
 		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
 	}
-	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
-		wpipe->pipe_map.npages);
+	col = vm_mdpg_pref_vcol(NULL, 0, wpipe->pipe_map.ms,
+	    wpipe->pipe_map.npages);
+	if (col != -1) {
+		kva = vm_roundcolor2(wpipe->pipe_map.bkva, col << PAGE_SHIFT,
+		    VCOLBOUND);
+	} else
+		kva = wpipe->pipe_map.bkva;
+
+	/* XXX */
+	for (i = 0; i < wpipe->pipe_map.npages; i++) {
+		pmap_enter(kernel_pmap, kva + (vm_offset_t)i * PAGE_SIZE,
+		    wpipe->pipe_map.ms[i], VM_PROT_READ, 1);
+	}
+	wpipe->pipe_map.kva = kva;
 
 /*
  * and update the uio data
@@ -687,11 +701,13 @@
 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
 
 	if (wpipe->pipe_map.kva) {
-		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
+		pmap_remove(kernel_pmap, wpipe->pipe_map.kva,
+		    wpipe->pipe_map.kva + (vm_offset_t)wpipe->pipe_map.npages *
+		    PAGE_SIZE);
 
 		if (amountpipekva > MAXPIPEKVA) {
-			vm_offset_t kva = wpipe->pipe_map.kva;
-			wpipe->pipe_map.kva = 0;
+			vm_offset_t kva = wpipe->pipe_map.bkva;
+			wpipe->pipe_map.bkva = wpipe->pipe_map.kva = 0;
 			kmem_free(kernel_map, kva,
-				wpipe->pipe_buffer.size + PAGE_SIZE);
+				wpipe->pipe_buffer.size + PAGE_SIZE + VCOLPAD);
 			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
@@ -912,7 +928,7 @@
 		 */
 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
 		    (fp->f_flag & FNONBLOCK) == 0 &&
-			(wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) &&
+			(wpipe->pipe_map.bkva || (amountpipekva < LIMITPIPEKVA)) &&
 			(uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
 			error = pipe_direct_write( wpipe, uio);
 			if (error)
@@ -1278,13 +1294,13 @@
 		cpipe->pipe_buffer.buffer = NULL;
 	}
 #ifndef PIPE_NODIRECT
-	if (cpipe->pipe_map.kva != NULL) {
+	if (cpipe->pipe_map.bkva != NULL) {
 		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
 		kmem_free(kernel_map,
-			cpipe->pipe_map.kva,
+			cpipe->pipe_map.bkva,
-			cpipe->pipe_buffer.size + PAGE_SIZE);
+			cpipe->pipe_buffer.size + PAGE_SIZE + VCOLPAD);
 		cpipe->pipe_map.cnt = 0;
-		cpipe->pipe_map.kva = 0;
+		cpipe->pipe_map.bkva = cpipe->pipe_map.kva = 0;
 		cpipe->pipe_map.pos = 0;
 		cpipe->pipe_map.npages = 0;
 	}

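Both this file and sys_process.c below allocate VCOLPAD bytes of slack and
then round the base kva up to the preferred color with vm_roundcolor2(). That
routine is not in this excerpt; assuming the virtual color of an address is
(va >> PAGE_SHIFT) % NVCOLORS and that VCOLBOUND is the color modulus in
bytes, a plausible reconstruction is:

	/*
	 * Round va up to the next address congruent to "color" (a byte
	 * offset, col << PAGE_SHIFT) modulo "bound". Assumes bound is a
	 * power of two; adds at most bound - PAGE_SIZE bytes, which is the
	 * slack that VCOLPAD provides. Hypothetical sketch, not taken from
	 * this diff.
	 */
	static __inline vm_offset_t
	vm_roundcolor2(vm_offset_t va, vm_offset_t color, vm_offset_t bound)
	{

		return (va + ((color - va) & (bound - 1)));
	}
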
==== //depot/projects/sparc64/sys/kern/sys_process.c#15 (text+ko) ====

@@ -151,8 +151,9 @@
 	vm_object_t object = NULL;
 	vm_offset_t pageno = 0;		/* page number */
 	vm_prot_t reqprot;
+	vm_offset_t bkva;
 	vm_offset_t kva;
-	int error, writing;
+	int error, writing, col;
 
 	GIANT_REQUIRED;
 
@@ -176,7 +177,7 @@
 	reqprot = writing ? (VM_PROT_WRITE | VM_PROT_OVERRIDE_WRITE) :
 	    VM_PROT_READ;
 
-	kva = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
+	bkva = kmem_alloc_pageable(kernel_map, PAGE_SIZE + VCOLPAD);
 
 	/*
 	 * Only map in one page at a time.  We don't have to, but it
@@ -278,14 +279,19 @@
 		vm_object_reference(object);
 		vm_map_lookup_done(tmap, out_entry);
 
-		pmap_qenter(kva, &m, 1);
+		col = vm_mdpg_pref_vcol(object, IDX_TO_OFF(m->pindex), &m, 1);
+		if (col != -1)
+			kva = vm_roundcolor2(bkva, col << PAGE_SHIFT, VCOLBOUND);
+		else
+			kva = bkva;
+		pmap_enter(kernel_pmap, kva, m, VM_PROT_READ | VM_PROT_WRITE, 1);
 
 		/*
 		 * Now do the i/o move.
 		 */
 		error = uiomove((caddr_t)(kva + page_offset), len, uio);
 
-		pmap_qremove(kva, 1);
+		pmap_remove(kernel_pmap, kva, kva + PAGE_SIZE);
 
 		/*
 		 * release the page and the object
@@ -300,7 +306,7 @@
 	if (object)
 		vm_object_deallocate(object);
 
-	kmem_free(kernel_map, kva, PAGE_SIZE);
+	kmem_free(kernel_map, bkva, PAGE_SIZE + VCOLPAD);
 	vmspace_free(vm);
 	return (error);
 }

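The proc_rwmem() hunks show the same idea for a single page: allocate a
padded pageable range once, round to the pages' preferred color on each
access, and map with pmap_enter() (presumably so the pmap sees and can track
the alias, which pmap_qenter() would bypass). In outline:

	bkva = kmem_alloc_pageable(kernel_map, PAGE_SIZE + VCOLPAD);
	...
	col = vm_mdpg_pref_vcol(object, IDX_TO_OFF(m->pindex), &m, 1);
	kva = (col != -1) ?
	    vm_roundcolor2(bkva, (vm_offset_t)col << PAGE_SHIFT, VCOLBOUND) :
	    bkva;
	pmap_enter(kernel_pmap, kva, m, VM_PROT_READ | VM_PROT_WRITE, 1);
	error = uiomove((caddr_t)(kva + page_offset), len, uio);
	pmap_remove(kernel_pmap, kva, kva + PAGE_SIZE);
	...
	kmem_free(kernel_map, bkva, PAGE_SIZE + VCOLPAD);
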
==== //depot/projects/sparc64/sys/kern/vfs_bio.c#20 (text+ko) ====

@@ -232,6 +232,9 @@
 #define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
 #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
 
+/* Last used buffer map index. */
+static int buf_map_idx;
+
 /*
  * Buffer hash table code.  Note that the logical block scans linearly, which
  * gives us some L1 cache locality.
@@ -588,12 +591,15 @@
 static void
 bfreekva(struct buf * bp)
 {
+	int col;
+
 	GIANT_REQUIRED;
 
 	if (bp->b_kvasize) {
 		++buffreekvacnt;
 		bufspace -= bp->b_kvasize;
-		vm_map_delete(buffer_map,
+		col = ((vm_offset_t) bp->b_kvabase >> PAGE_SHIFT) % NVCOLORS;
+		vm_map_delete(buffer_map[col],
 		    (vm_offset_t) bp->b_kvabase,
 		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
 		);
@@ -676,6 +682,8 @@
 
 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
 
+	if (bp->b_flags & B_VMIO)
+		vmiobufstart(bp, bp->b_pages, bp->b_npages, BIO_READ);
 	/* if not found in cache, do some I/O */
 	if ((bp->b_flags & B_CACHE) == 0) {
 		if (curthread != PCPU_GET(idlethread))
@@ -686,8 +694,14 @@
 		if (bp->b_rcred == NOCRED && cred != NOCRED)
 			bp->b_rcred = crhold(cred);
 		vfs_busy_pages(bp, 0);
+		if (bp->b_flags & B_VMIO)
+			vmiobufstart(bp, bp->b_pages, bp->b_npages, BIO_READ);
 		VOP_STRATEGY(vp, bp);
 		++readwait;
+	} else if (bp->b_flags & B_VMIO) {
+		/* No i/o needed, but cache flushing needs to be done anyway. */
+		vmiobufstart(bp, bp->b_pages, bp->b_npages, BIO_READ);
+		vmiobufdone(bp, bp->b_pages, 0, bp->b_npages, BIO_READ);
 	}
 
 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
@@ -705,6 +719,10 @@
 			if (rabp->b_rcred == NOCRED && cred != NOCRED)
 				rabp->b_rcred = crhold(cred);
 			vfs_busy_pages(rabp, 0);
+			if (rabp->b_flags & B_VMIO) {
+				vmiobufstart(rabp, rabp->b_pages, rabp->b_npages,
+				    BIO_READ);
+			}
 			BUF_KERNPROC(rabp);
 			VOP_STRATEGY(vp, rabp);
 		} else {
@@ -821,6 +839,8 @@
 
 	bp->b_vp->v_numoutput++;
 	vfs_busy_pages(bp, 1);
+	if (bp->b_flags & B_VMIO)
+		vmiobufstart(bp, bp->b_pages, bp->b_npages, BIO_WRITE);
 
 	/*
 	 * Normal bwrites pipeline writes
@@ -1629,12 +1649,13 @@
  */
 
 static struct buf *
-getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
+getnewbuf(int slpflag, int slptimeo, int size, int maxsize, int prefcol)
 {
 	struct buf *bp;
 	struct buf *nbp;
 	int defrag = 0;
 	int nqindex;
+	int col, scol, rv;
 	static int flushingbufs;
 
 	GIANT_REQUIRED;
@@ -1881,8 +1902,27 @@
 
 			bfreekva(bp);
 
-			if (vm_map_findspace(buffer_map,
-				vm_map_min(buffer_map), maxsize, &addr)) {
+			/*
+			 * Try to use the preferred color, but if the map
+			 * is full, cycle through all maps to find space. If
+			 * no color is specified, use the maps in a round-robin
+			 * fashion.
+			 */
+			if (prefcol == -1) {
+				scol = buf_map_idx;
+				buf_map_idx = (buf_map_idx + 1) % NVCOLORS;
+			} else
+				scol = prefcol;
+			col = scol;
+			do {
+				rv = vm_map_findspace_color(buffer_map[col],
+				    vm_map_min(buffer_map[col]), maxsize, &addr,
+				    (vm_offset_t)col << PAGE_SHIFT, VCOLBOUND);
+				if (rv == KERN_SUCCESS)
+					break;
+				col = (col + 1) % NVCOLORS;
+			} while (col != scol);
+			if (rv != KERN_SUCCESS) {
 				/*
 				 * Uh oh.  Buffer map is too fragmented.  We
 				 * must defragment the map.
@@ -1894,7 +1934,7 @@
 				goto restart;
 			}
 			if (addr) {
-				vm_map_insert(buffer_map, NULL, 0,
+				vm_map_insert(buffer_map[col], NULL, 0,
 					addr, addr + maxsize,
 					VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 
@@ -2030,6 +2070,17 @@
 				bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
 				continue;
 			}
+			/*
+			 * Turn off background writing. This can lead to a
+			 * deadlock when bwrite() is trying to allocate a buffer
+			 * to execute a background write in a buffer shortage,
+			 * which is fatal if we haven't reached hidirtybuffers
+			 * yet, but there are no usable buffers on the queues
+			 * anyway (due to buffers not being on any queue, and
+			 * not reaching lodirtybuffers when those buffers get
+			 * requeued).
+			 */
+			bp->b_xflags &= ~BX_BKGRDWRITE;
 			vfs_bio_awrite(bp);
 			++r;
 			break;
@@ -2359,7 +2410,8 @@
 		 * returned by getnewbuf() is locked.  Note that the returned
 		 * buffer is also considered valid (not marked B_INVAL).
 		 */
-		int bsize, maxsize, vmio;
+		int bsize, maxsize, vmio, col;
+		vm_object_t obj;
 		off_t offset;
 
 		if (vn_isdisk(vp, NULL))
@@ -2372,11 +2424,16 @@
 			bsize = size;
 
 		offset = (off_t)blkno * bsize;
-		vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF);
+		vmio = (VOP_GETVOBJECT(vp, &obj) == 0) && (vp->v_flag & VOBJBUF);
 		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
 		maxsize = imax(maxsize, bsize);
 
-		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
+		if (vmio) {
+			col = vm_mdpg_pref_vcol(obj, offset, NULL,
+			    (maxsize + PAGE_MASK) >> PAGE_SHIFT);
+		} else
+			col = -1;
+		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize, col)) == NULL) {
 			if (slpflag || slptimeo) {
 				splx(s);
 				return NULL;
@@ -2451,7 +2508,7 @@
 	maxsize = (size + BKVAMASK) & ~BKVAMASK;
 
 	s = splbio();
-	while ((bp = getnewbuf(0, 0, size, maxsize)) == 0);
+	while ((bp = getnewbuf(0, 0, size, maxsize, -1)) == 0);
 	splx(s);
 	allocbuf(bp, size);
 	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
@@ -2752,7 +2809,7 @@
 			    bp->b_pages, 
 			    bp->b_npages
 			);
-			
+
 			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 
 			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 		}
@@ -2827,7 +2884,7 @@
 void
 bufdone(struct buf *bp)
 {
-	int s, error;
+	int s, error, resid;
 	void    (*biodone)(struct buf *);
 
 	GIANT_REQUIRED;
@@ -2899,22 +2956,8 @@
 		}
 #endif
 
-		/*
-		 * Set B_CACHE if the op was a normal read and no error
-		 * occured.  B_CACHE is set for writes in the b*write()
-		 * routines.
-		 */
 		iosize = bp->b_bcount - bp->b_resid;
-		if (bp->b_iocmd == BIO_READ &&
-		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
-		    !(bp->b_ioflags & BIO_ERROR)) {
-			bp->b_flags |= B_CACHE;
-		}
-
 		for (i = 0; i < bp->b_npages; i++) {
-			int bogusflag = 0;
-			int resid;
-
 			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 			if (resid > iosize)
 				resid = iosize;
@@ -2924,13 +2967,43 @@
 			 */
 			m = bp->b_pages[i];
 			if (m == bogus_page) {
-				bogusflag = 1;
 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 				if (m == NULL)
 					panic("biodone: page disappeared!");
 				bp->b_pages[i] = m;
 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+			} else {
+				/*
+				 * are already changed correctly (see
+				 * bdwrite()), so we only need to do this here
+				 * bdwrite() ), so we only need to do this here
+				 * in the read case.
+				 */
+				if ((bp->b_iocmd == BIO_READ) && resid > 0)
+					vfs_page_set_valid(bp, foff, i, m);
 			}
+			vm_page_flag_clear(m, PG_ZERO);
+			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+		}
+		vmiobufdone(bp, bp->b_pages, 0, bp->b_npages, bp->b_iocmd);
+		/*
+		 * Set B_CACHE if the op was a normal read and no error
+		 * occured.  B_CACHE is set for writes in the b*write()
+		 * routines.
+		 */
+		if (bp->b_iocmd == BIO_READ &&
+		 * occurred.  B_CACHE is set for writes in the b*write()
+		    !(bp->b_ioflags & BIO_ERROR)) {
+			bp->b_flags |= B_CACHE;
+		}
+
+		foff = bp->b_offset;
+		for (i = 0; i < bp->b_npages; i++) {
+			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
+			if (resid > iosize)
+				resid = iosize;
+			m = bp->b_pages[i];
+
 #if defined(VFS_BIO_DEBUG)
 			if (OFF_TO_IDX(foff) != m->pindex) {
 				printf(
@@ -2940,16 +3013,6 @@
 #endif
 
 			/*
-			 * In the write case, the valid and clean bits are
-			 * already changed correctly ( see bdwrite() ), so we 
-			 * only need to do this here in the read case.
-			 */
-			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
-				vfs_page_set_valid(bp, foff, i, m);
-			}
-			vm_page_flag_clear(m, PG_ZERO);
-
-			/*
 			 * when debugging new filesystems or buffer I/O methods, this
 			 * is the most common error that pops up.  if you see this, you
 			 * have not set the page busy flag correctly!!!
@@ -3364,6 +3427,30 @@
 	bp->b_npages = newnpages;
 }
 
+/*
+ * Map a buffer into kva. This may change the buffer virtual address by at
+ * most VCOLPAD bytes to achieve the preferred color.
+ * This can be used with pbufs, for example.
+ */
+void
+vmiomapbuf(struct buf *bp, struct vm_object *obj, vm_ooffset_t offs,
+    vm_page_t *m, int count, int remap)
+{
+	vm_offset_t kva;
+	int color;
+
+	kva = trunc_page((vm_offset_t)bp->b_data);
+	color = vm_mdpg_pref_vcol(obj, offs, m, count);
+	if (color != -1) {
+		kva = vm_roundcolor2(kva, (vm_offset_t)color << PAGE_SHIFT,
+		    VCOLBOUND);
+	}
+	if (remap)
+		pmap_qremove((vm_offset_t)bp->b_data, count);
+	pmap_qenter(kva, m, count);
+	bp->b_data = (caddr_t)kva;
+}
+
 
 #include "opt_ddb.h"
 #ifdef DDB

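Taken together, the vfs_bio.c hunks establish a bracketing convention for
B_VMIO buffers: vmiobufstart() runs before data moves into or out of the
pages (or before cached data is handed back), and vmiobufdone() runs from
bufdone(), giving the MD code a hook to flush or invalidate cache lines
around every access through the buffer mapping. In outline, as used above:

	/* Cache miss: start before the I/O, done from bufdone(). */
	vfs_busy_pages(bp, 0);
	if (bp->b_flags & B_VMIO)
		vmiobufstart(bp, bp->b_pages, bp->b_npages, BIO_READ);
	VOP_STRATEGY(vp, bp);
	/* ... later, bufdone() calls: */
	vmiobufdone(bp, bp->b_pages, 0, bp->b_npages, bp->b_iocmd);

	/* Cache hit: no I/O, but the flush pair still runs. */
	if (bp->b_flags & B_VMIO) {
		vmiobufstart(bp, bp->b_pages, bp->b_npages, BIO_READ);
		vmiobufdone(bp, bp->b_pages, 0, bp->b_npages, BIO_READ);
	}
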
==== //depot/projects/sparc64/sys/kern/vfs_cluster.c#9 (text+ko) ====

@@ -135,6 +135,10 @@
 	 * back-off on prospective read-aheads.
 	 */
 	if (bp->b_flags & B_CACHE) {
+		if (bp->b_flags & B_VMIO) {
+			vmiobufstart(bp, bp->b_pages, bp->b_npages, BIO_READ);
+			vmiobufdone(bp, bp->b_pages, 0, bp->b_npages, BIO_READ);
+		}
 		if (!seqcount) {
 			return 0;
 		} else if ((bp->b_flags & B_RAM) == 0) {
@@ -258,6 +262,8 @@
 		if ((bp->b_flags & B_CLUSTER) == 0) {
 			vfs_busy_pages(bp, 0);
 		}
+		if (bp->b_flags & B_VMIO)
+			vmiobufstart(bp, bp->b_pages, bp->b_npages, BIO_READ);
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
 		if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
@@ -295,6 +301,10 @@
 			if ((rbp->b_flags & B_CLUSTER) == 0) {
 				vfs_busy_pages(rbp, 0);
 			}
+			if (rbp->b_flags & B_VMIO) {
+				vmiobufstart(rbp, rbp->b_pages, rbp->b_npages,
+				    BIO_READ);
+			}
 			rbp->b_flags &= ~B_INVAL;
 			rbp->b_ioflags &= ~BIO_ERROR;
 			if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
@@ -325,6 +335,7 @@
 	struct buf *fbp;
 {
 	struct buf *bp, *tbp;
+	vm_object_t obj;
 	daddr_t bn;
 	int i, inc, j;
 
@@ -497,6 +508,8 @@
 			printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
 		if (tbp->b_bufsize != size)
 			printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
+		/* XXX: vmiobufstart() can be avoided in the read-ahead case. */
+		vmiobufstart(tbp, tbp->b_pages, tbp->b_npages, BIO_READ);
 		bp->b_bcount += size;
 		bp->b_bufsize += size;
 	}
@@ -516,8 +529,9 @@
 		    bp->b_bufsize, bp->b_kvasize);
 	bp->b_kvasize = bp->b_bufsize;
 
-	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
-		(vm_page_t *)bp->b_pages, bp->b_npages);
+	VOP_GETVOBJECT(vp, &obj);
+	vmiomapbuf(bp, obj, bp->b_offset, bp->b_pages,
+	    bp->b_npages, BIO_READ);
 	return (bp);
 }
 
@@ -542,7 +556,8 @@
 	if (bp->b_ioflags & BIO_ERROR)
 		error = bp->b_error;
 
-	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+	KASSERT((bp->b_flags & B_VMIO) != 0, ("cluster_callback: non-VMIO"));
+	unmappbuf(bp, bp->b_pages, 0, bp->b_npages, bp->b_iocmd);
 	/*
 	 * Move memory from the large cluster buffer into the component
 	 * buffers and mark IO as done on these.
@@ -949,6 +964,14 @@
 			tbp->b_ioflags &= ~BIO_ERROR;
 			tbp->b_flags |= B_ASYNC;
 			tbp->b_iocmd = BIO_WRITE;
+			/*
+			 * XXX: vmiobufstart() can be omitted, since the write
+			 * is performed from the cluster buffer; however, this
+			 * requires another flag to tell bufdone() not to call
+			 * vmiobufdone().

>>> TRUNCATED FOR MAIL (1000 lines) <<<
