Date: Thu, 22 Aug 2013 03:43:12 +0000 (UTC) From: Neel Natu <neel@FreeBSD.org> To: src-committers@freebsd.org, svn-src-projects@freebsd.org Subject: svn commit: r254635 - in projects/bhyve_npt_pmap: cddl/contrib/opensolaris/cmd/ztest lib/libc/iconv sys/amd64/amd64 sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/kern sys/net80211 sys/ofed/inc... Message-ID: <201308220343.r7M3hCKU005530@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: neel Date: Thu Aug 22 03:43:12 2013 New Revision: 254635 URL: http://svnweb.freebsd.org/changeset/base/254635 Log: IFC @254081 Modified: projects/bhyve_npt_pmap/cddl/contrib/opensolaris/cmd/ztest/ztest.c projects/bhyve_npt_pmap/lib/libc/iconv/citrus_iconv.c projects/bhyve_npt_pmap/sys/amd64/amd64/minidump_machdep.c projects/bhyve_npt_pmap/sys/amd64/amd64/pmap.c projects/bhyve_npt_pmap/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c projects/bhyve_npt_pmap/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c projects/bhyve_npt_pmap/sys/kern/kern_event.c projects/bhyve_npt_pmap/sys/net80211/ieee80211.c projects/bhyve_npt_pmap/sys/net80211/ieee80211_output.c projects/bhyve_npt_pmap/sys/net80211/ieee80211_proto.h projects/bhyve_npt_pmap/sys/net80211/ieee80211_var.h projects/bhyve_npt_pmap/sys/ofed/include/linux/page.h projects/bhyve_npt_pmap/sys/sparc64/sparc64/genassym.c projects/bhyve_npt_pmap/sys/sys/event.h projects/bhyve_npt_pmap/sys/vm/vm_page.c projects/bhyve_npt_pmap/sys/vm/vm_page.h projects/bhyve_npt_pmap/sys/vm/vm_pageout.c projects/bhyve_npt_pmap/sys/vm/vm_phys.c projects/bhyve_npt_pmap/sys/vm/vm_phys.h projects/bhyve_npt_pmap/sys/vm/vm_zeroidle.c projects/bhyve_npt_pmap/sys/x86/acpica/srat.c projects/bhyve_npt_pmap/tools/tools/sysdoc/sysdoc.sh Directory Properties: projects/bhyve_npt_pmap/ (props changed) projects/bhyve_npt_pmap/cddl/ (props changed) projects/bhyve_npt_pmap/cddl/contrib/opensolaris/ (props changed) projects/bhyve_npt_pmap/lib/libc/ (props changed) projects/bhyve_npt_pmap/sys/ (props changed) projects/bhyve_npt_pmap/sys/cddl/contrib/opensolaris/ (props changed) Modified: projects/bhyve_npt_pmap/cddl/contrib/opensolaris/cmd/ztest/ztest.c ============================================================================== --- projects/bhyve_npt_pmap/cddl/contrib/opensolaris/cmd/ztest/ztest.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/cddl/contrib/opensolaris/cmd/ztest/ztest.c Thu Aug 22 03:43:12 2013 (r254635) @@ -186,6 +186,7 @@ static const ztest_shared_opts_t ztest_o extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; +extern uint64_t zfs_deadman_synctime; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -365,7 +366,7 @@ ztest_info_t ztest_info[] = { { ztest_fault_inject, 1, &zopt_sometimes }, { ztest_ddt_repair, 1, &zopt_sometimes }, { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, - { ztest_reguid, 1, &zopt_sometimes }, + { ztest_reguid, 1, &zopt_rarely }, { ztest_spa_rename, 1, &zopt_rarely }, { ztest_scrub, 1, &zopt_rarely }, { ztest_spa_upgrade, 1, &zopt_rarely }, @@ -3606,6 +3607,9 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin else dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); + /* This accounts for setting the checksum/compression. */ + dmu_tx_hold_bonus(tx, bigobj); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); @@ -4756,6 +4760,14 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 ASSERT(leaves >= 1); /* + * Grab the name lock as reader. There are some operations + * which don't like to have their vdevs changed while + * they are in progress (i.e. spa_change_guid). Those + * operations will have grabbed the name lock as writer. + */ + (void) rw_rdlock(&ztest_name_lock); + + /* * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); @@ -4784,7 +4796,14 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 if (vd0 != NULL && vd0->vdev_top->vdev_islog) islog = B_TRUE; - if (vd0 != NULL && maxfaults != 1) { + /* + * If the top-level vdev needs to be resilvered + * then we only allow faults on the device that is + * resilvering. + */ + if (vd0 != NULL && maxfaults != 1 && + (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || + vd0->vdev_resilvering)) { /* * Make vd0 explicitly claim to be unreadable, * or unwriteable, or reach behind its back @@ -4815,6 +4834,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 if (sav->sav_count == 0) { spa_config_exit(spa, SCL_STATE, FTAG); + (void) rw_unlock(&ztest_name_lock); return; } vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; @@ -4828,6 +4848,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 } spa_config_exit(spa, SCL_STATE, FTAG); + (void) rw_unlock(&ztest_name_lock); /* * If we can tolerate two or more faults, or we're dealing @@ -5293,16 +5314,33 @@ static void * ztest_deadman_thread(void *arg) { ztest_shared_t *zs = arg; - int grace = 300; - hrtime_t delta; - - delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace; + spa_t *spa = ztest_spa; + hrtime_t delta, total = 0; - (void) poll(NULL, 0, (int)(1000 * delta)); + for (;;) { + delta = (zs->zs_thread_stop - zs->zs_thread_start) / + NANOSEC + zfs_deadman_synctime; - fatal(0, "failed to complete within %d seconds of deadline", grace); + (void) poll(NULL, 0, (int)(1000 * delta)); - return (NULL); + /* + * If the pool is suspended then fail immediately. Otherwise, + * check to see if the pool is making any progress. If + * vdev_deadman() discovers that there hasn't been any recent + * I/Os then it will end up aborting the tests. + */ + if (spa_suspended(spa)) { + fatal(0, "aborting test after %llu seconds because " + "pool has transitioned to a suspended state.", + zfs_deadman_synctime); + return (NULL); + } + vdev_deadman(spa->spa_root_vdev); + + total += zfs_deadman_synctime; + (void) printf("ztest has been running for %lld seconds\n", + total); + } } static void @@ -6031,6 +6069,7 @@ main(int argc, char **argv) (void) setvbuf(stdout, NULL, _IOLBF, 0); dprintf_setup(&argc, argv); + zfs_deadman_synctime = 300; ztest_fd_rand = open("/dev/urandom", O_RDONLY); ASSERT3S(ztest_fd_rand, >=, 0); Modified: projects/bhyve_npt_pmap/lib/libc/iconv/citrus_iconv.c ============================================================================== --- projects/bhyve_npt_pmap/lib/libc/iconv/citrus_iconv.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/lib/libc/iconv/citrus_iconv.c Thu Aug 22 03:43:12 2013 (r254635) @@ -116,7 +116,20 @@ open_shared(struct _citrus_iconv_shared size_t len_convname; int ret; +#ifdef INCOMPATIBLE_WITH_GNU_ICONV + /* + * Sadly, the gnu tools expect iconv to actually parse the + * byte stream and don't allow for a pass-through when + * the (src,dest) encodings are the same. + * See gettext-0.18.3+ NEWS: + * msgfmt now checks PO file headers more strictly with less + * false-positives. + * NetBSD don't do this either. + */ module = (strcmp(src, dst) != 0) ? "iconv_std" : "iconv_none"; +#else + module = "iconv_std"; +#endif /* initialize iconv handle */ len_convname = strlen(convname); Modified: projects/bhyve_npt_pmap/sys/amd64/amd64/minidump_machdep.c ============================================================================== --- projects/bhyve_npt_pmap/sys/amd64/amd64/minidump_machdep.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/amd64/amd64/minidump_machdep.c Thu Aug 22 03:43:12 2013 (r254635) @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/msgbuf.h> #include <sys/watchdog.h> #include <vm/vm.h> +#include <vm/vm_param.h> #include <vm/vm_page.h> #include <vm/vm_phys.h> #include <vm/pmap.h> Modified: projects/bhyve_npt_pmap/sys/amd64/amd64/pmap.c ============================================================================== --- projects/bhyve_npt_pmap/sys/amd64/amd64/pmap.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/amd64/amd64/pmap.c Thu Aug 22 03:43:12 2013 (r254635) @@ -632,6 +632,10 @@ static void create_pagetables(vm_paddr_t *firstaddr) { int i, j, ndm1g, nkpdpe; + pt_entry_t *pt_p; + pd_entry_t *pd_p; + pdp_entry_t *pdp_p; + pml4_entry_t *p4_p; pt_entry_t PG_G, PG_A, PG_M; PG_G = pmap_global_bit(kernel_pmap); @@ -667,32 +671,26 @@ create_pagetables(vm_paddr_t *firstaddr) KPDphys = allocpages(firstaddr, nkpdpe); /* Fill in the underlying page table pages */ - /* Read-only from zero to physfree */ + /* Nominally read-only (but really R/W) from zero to physfree */ /* XXX not fully used, underneath 2M pages */ - for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { - ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT; - ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G; - } + pt_p = (pt_entry_t *)KPTphys; + for (i = 0; ptoa(i) < *firstaddr; i++) + pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G; /* Now map the page tables at their location within PTmap */ - for (i = 0; i < nkpt; i++) { - ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); - ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; - } + pd_p = (pd_entry_t *)KPDphys; + for (i = 0; i < nkpt; i++) + pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ - for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { - ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT; - ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; - } + for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) + pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G; /* And connect up the PD to the PDP */ - for (i = 0; i < nkpdpe; i++) { - ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + - (i << PAGE_SHIFT); - ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U; - } + pdp_p = (pdp_entry_t *)KPDPphys; + for (i = 0; i < nkpdpe; i++) + pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If @@ -702,37 +700,39 @@ create_pagetables(vm_paddr_t *firstaddr) * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. */ + pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { - ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT; + pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ - ((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G | + pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G | PG_M | PG_A; } + pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { - ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT; + pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ - ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G | + pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G | PG_M | PG_A; } for (j = 0; i < ndmpdp; i++, j++) { - ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT); - ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; + pdp_p[i] = DMPDphys + ptoa(j); + pdp_p[i] |= PG_RW | PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ - ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; - ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; + p4_p = (pml4_entry_t *)KPML4phys; + p4_p[PML4PML4I] = KPML4phys; + p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < NDMPML4E; i++) { - ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys + - (i << PAGE_SHIFT); - ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U; + p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); + p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U; } /* Connect the KVA slot up to the PML4 */ - ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; - ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; + p4_p[KPML4I] = KPDPphys; + p4_p[KPML4I] |= PG_RW | PG_V | PG_U; } /* Modified: projects/bhyve_npt_pmap/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c ============================================================================== --- projects/bhyve_npt_pmap/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c Thu Aug 22 03:43:12 2013 (r254635) @@ -448,12 +448,12 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui blkid = off >> dn->dn_datablkshift; nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; - if (blkid >= dn->dn_maxblkid) { + if (blkid > dn->dn_maxblkid) { rw_exit(&dn->dn_struct_rwlock); return; } if (blkid + nblks > dn->dn_maxblkid) - nblks = dn->dn_maxblkid - blkid; + nblks = dn->dn_maxblkid - blkid + 1; } l0span = nblks; /* save for later use to calc level > 1 overhead */ Modified: projects/bhyve_npt_pmap/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c ============================================================================== --- projects/bhyve_npt_pmap/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c Thu Aug 22 03:43:12 2013 (r254635) @@ -759,6 +759,7 @@ spa_change_guid(spa_t *spa) int error; uint64_t guid; + mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); @@ -771,6 +772,7 @@ spa_change_guid(spa_t *spa) } mutex_exit(&spa_namespace_lock); + mutex_exit(&spa->spa_vdev_top_lock); return (error); } @@ -4861,7 +4863,6 @@ spa_vdev_detach(spa_t *spa, uint64_t gui if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); - cvd->vdev_resilvering = B_FALSE; } @@ -5496,6 +5497,13 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) return (oldvd); } + if (vd->vdev_resilvering && vdev_dtl_empty(vd, DTL_MISSING) && + vdev_dtl_empty(vd, DTL_OUTAGE)) { + ASSERT(vd->vdev_ops->vdev_op_leaf); + vd->vdev_resilvering = B_FALSE; + vdev_config_dirty(vd->vdev_top); + } + /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be Modified: projects/bhyve_npt_pmap/sys/kern/kern_event.c ============================================================================== --- projects/bhyve_npt_pmap/sys/kern/kern_event.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/kern/kern_event.c Thu Aug 22 03:43:12 2013 (r254635) @@ -431,8 +431,11 @@ filt_proc(struct knote *kn, long hint) if (!(kn->kn_status & KN_DETACHED)) knlist_remove_inevent(&p->p_klist, kn); kn->kn_flags |= (EV_EOF | EV_ONESHOT); - kn->kn_data = p->p_xstat; kn->kn_ptr.p_proc = NULL; + if (kn->kn_fflags & NOTE_EXIT) + kn->kn_data = p->p_xstat; + if (kn->kn_fflags == 0) + kn->kn_flags |= EV_DROP; return (1); } @@ -1410,7 +1413,21 @@ retry: KASSERT((kn->kn_status & KN_INFLUX) == 0, ("KN_INFLUX set when not suppose to be")); - if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { + if ((kn->kn_flags & EV_DROP) == EV_DROP) { + kn->kn_status &= ~KN_QUEUED; + kn->kn_status |= KN_INFLUX; + kq->kq_count--; + KQ_UNLOCK(kq); + /* + * We don't need to lock the list since we've marked + * it _INFLUX. + */ + if (!(kn->kn_status & KN_DETACHED)) + kn->kn_fop->f_detach(kn); + knote_drop(kn, td); + KQ_LOCK(kq); + continue; + } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn->kn_status |= KN_INFLUX; kq->kq_count--; Modified: projects/bhyve_npt_pmap/sys/net80211/ieee80211.c ============================================================================== --- projects/bhyve_npt_pmap/sys/net80211/ieee80211.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/net80211/ieee80211.c Thu Aug 22 03:43:12 2013 (r254635) @@ -241,9 +241,15 @@ null_transmit(struct ifnet *ifp, struct return EACCES; /* XXX EIO/EPERM? */ } +#if __FreeBSD_version >= 1000031 static int null_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) +#else +static int +null_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro) +#endif { if_printf(ifp, "discard raw packet\n"); return null_transmit(ifp, m); Modified: projects/bhyve_npt_pmap/sys/net80211/ieee80211_output.c ============================================================================== --- projects/bhyve_npt_pmap/sys/net80211/ieee80211_output.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/net80211/ieee80211_output.c Thu Aug 22 03:43:12 2013 (r254635) @@ -482,9 +482,15 @@ ieee80211_raw_output(struct ieee80211vap * connect bpf write calls to the 802.11 layer for injecting * raw 802.11 frames. */ +#if __FreeBSD_version >= 1000031 int ieee80211_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) +#else +int +ieee80211_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro) +#endif { #define senderr(e) do { error = (e); goto bad;} while (0) struct ieee80211_node *ni = NULL; Modified: projects/bhyve_npt_pmap/sys/net80211/ieee80211_proto.h ============================================================================== --- projects/bhyve_npt_pmap/sys/net80211/ieee80211_proto.h Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/net80211/ieee80211_proto.h Thu Aug 22 03:43:12 2013 (r254635) @@ -96,8 +96,13 @@ int ieee80211_mgmt_output(struct ieee802 struct ieee80211_bpf_params *); int ieee80211_raw_xmit(struct ieee80211_node *, struct mbuf *, const struct ieee80211_bpf_params *); +#if __FreeBSD_version >= 1000031 int ieee80211_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *ro); +#else +int ieee80211_output(struct ifnet *, struct mbuf *, + struct sockaddr *, struct route *ro); +#endif int ieee80211_vap_pkt_send_dest(struct ieee80211vap *, struct mbuf *, struct ieee80211_node *); int ieee80211_raw_output(struct ieee80211vap *, struct ieee80211_node *, Modified: projects/bhyve_npt_pmap/sys/net80211/ieee80211_var.h ============================================================================== --- projects/bhyve_npt_pmap/sys/net80211/ieee80211_var.h Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/net80211/ieee80211_var.h Thu Aug 22 03:43:12 2013 (r254635) @@ -496,8 +496,13 @@ struct ieee80211vap { int (*iv_newstate)(struct ieee80211vap *, enum ieee80211_state, int); /* 802.3 output method for raw frame xmit */ +#if __FreeBSD_version >= 1000031 int (*iv_output)(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); +#else + int (*iv_output)(struct ifnet *, struct mbuf *, + struct sockaddr *, struct route *); +#endif uint64_t iv_spare[6]; }; MALLOC_DECLARE(M_80211_VAP); Modified: projects/bhyve_npt_pmap/sys/ofed/include/linux/page.h ============================================================================== --- projects/bhyve_npt_pmap/sys/ofed/include/linux/page.h Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/ofed/include/linux/page.h Thu Aug 22 03:43:12 2013 (r254635) @@ -32,6 +32,7 @@ #include <sys/param.h> +#include <machine/atomic.h> #include <vm/vm.h> #include <vm/vm_page.h> Modified: projects/bhyve_npt_pmap/sys/sparc64/sparc64/genassym.c ============================================================================== --- projects/bhyve_npt_pmap/sys/sparc64/sparc64/genassym.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/sparc64/sparc64/genassym.c Thu Aug 22 03:43:12 2013 (r254635) @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/vmmeter.h> #include <sys/_cpuset.h> +#include <machine/atomic.h> #include <vm/vm.h> #include <vm/vm_page.h> #include <vm/vm_map.h> Modified: projects/bhyve_npt_pmap/sys/sys/event.h ============================================================================== --- projects/bhyve_npt_pmap/sys/sys/event.h Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/sys/event.h Thu Aug 22 03:43:12 2013 (r254635) @@ -76,6 +76,7 @@ struct kevent { #define EV_DISPATCH 0x0080 /* disable event after reporting */ #define EV_SYSFLAGS 0xF000 /* reserved by system */ +#define EV_DROP 0x1000 /* note should be dropped */ #define EV_FLAG1 0x2000 /* filter-specific flag */ /* returned values */ Modified: projects/bhyve_npt_pmap/sys/vm/vm_page.c ============================================================================== --- projects/bhyve_npt_pmap/sys/vm/vm_page.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/vm/vm_page.c Thu Aug 22 03:43:12 2013 (r254635) @@ -64,8 +64,7 @@ * GENERAL RULES ON VM_PAGE MANIPULATION * * - A page queue lock is required when adding or removing a page from a - * page queue (vm_pagequeues[]), regardless of other locks or the - * busy state of a page. + * page queue regardless of other locks or the busy state of a page. * * * In general, no thread besides the page daemon can acquire or * hold more than one page queue lock at a time. @@ -124,20 +123,7 @@ __FBSDID("$FreeBSD$"); * page structure. */ -struct vm_pagequeue vm_pagequeues[PQ_COUNT] = { - [PQ_INACTIVE] = { - .pq_pl = TAILQ_HEAD_INITIALIZER( - vm_pagequeues[PQ_INACTIVE].pq_pl), - .pq_cnt = &cnt.v_inactive_count, - .pq_name = "vm inactive pagequeue" - }, - [PQ_ACTIVE] = { - .pq_pl = TAILQ_HEAD_INITIALIZER( - vm_pagequeues[PQ_ACTIVE].pq_pl), - .pq_cnt = &cnt.v_active_count, - .pq_name = "vm active pagequeue" - } -}; +struct vm_domain vm_dom[MAXMEMDOM]; struct mtx_padalign vm_page_queue_free_mtx; struct mtx_padalign pa_lock[PA_LOCK_COUNT]; @@ -256,6 +242,34 @@ vm_page_blacklist_lookup(char *list, vm_ return (0); } +static void +vm_page_domain_init(struct vm_domain *vmd) +{ + struct vm_pagequeue *pq; + int i; + + *__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) = + "vm inactive pagequeue"; + *__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) = + &cnt.v_inactive_count; + *__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) = + "vm active pagequeue"; + *__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) = + &cnt.v_active_count; + vmd->vmd_fullintervalcount = 0; + vmd->vmd_page_count = 0; + vmd->vmd_free_count = 0; + vmd->vmd_segs = 0; + vmd->vmd_oom = FALSE; + vmd->vmd_pass = 0; + for (i = 0; i < PQ_COUNT; i++) { + pq = &vmd->vmd_pagequeues[i]; + TAILQ_INIT(&pq->pq_pl); + mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue", + MTX_DEF | MTX_DUPOK); + } +} + /* * vm_page_startup: * @@ -319,8 +333,8 @@ vm_page_startup(vm_offset_t vaddr) mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF); for (i = 0; i < PA_LOCK_COUNT; i++) mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF); - for (i = 0; i < PQ_COUNT; i++) - vm_pagequeue_init_lock(&vm_pagequeues[i]); + for (i = 0; i < vm_ndomains; i++) + vm_page_domain_init(&vm_dom[i]); /* * Allocate memory for use when boot strapping the kernel memory @@ -1055,7 +1069,7 @@ vm_page_cache_free(vm_object_t object, v KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE, ("vm_page_cache_free: page %p has inconsistent flags", m)); cnt.v_cache_count--; - cnt.v_free_count++; + vm_phys_freecnt_adj(m, 1); } empty = vm_radix_is_empty(&object->cache); mtx_unlock(&vm_page_queue_free_mtx); @@ -1311,7 +1325,7 @@ vm_page_alloc(vm_object_t object, vm_pin ("vm_page_alloc: page %p is not free", m)); KASSERT(m->valid == 0, ("vm_page_alloc: free page %p is valid", m)); - cnt.v_free_count--; + vm_phys_freecnt_adj(m, -1); } /* @@ -1569,7 +1583,7 @@ vm_page_alloc_init(vm_page_t m) ("vm_page_alloc_init: page %p is not free", m)); KASSERT(m->valid == 0, ("vm_page_alloc_init: free page %p is valid", m)); - cnt.v_free_count--; + vm_phys_freecnt_adj(m, -1); if ((m->flags & PG_ZERO) != 0) vm_page_zero_count--; } @@ -1711,6 +1725,13 @@ vm_waitpfault(void) "pfault", 0); } +struct vm_pagequeue * +vm_page_pagequeue(vm_page_t m) +{ + + return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]); +} + /* * vm_page_dequeue: * @@ -1726,11 +1747,11 @@ vm_page_dequeue(vm_page_t m) vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_dequeue: page %p is not queued", m)); - pq = &vm_pagequeues[m->queue]; + pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, pageq); - (*pq->pq_cnt)--; + vm_pagequeue_cnt_dec(pq); vm_pagequeue_unlock(pq); } @@ -1747,11 +1768,11 @@ vm_page_dequeue_locked(vm_page_t m) struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); - pq = &vm_pagequeues[m->queue]; + pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); m->queue = PQ_NONE; TAILQ_REMOVE(&pq->pq_pl, m, pageq); - (*pq->pq_cnt)--; + vm_pagequeue_cnt_dec(pq); } /* @@ -1767,11 +1788,11 @@ vm_page_enqueue(int queue, vm_page_t m) struct vm_pagequeue *pq; vm_page_lock_assert(m, MA_OWNED); - pq = &vm_pagequeues[queue]; + pq = &vm_phys_domain(m)->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); m->queue = queue; TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq); - ++*pq->pq_cnt; + vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } @@ -1790,7 +1811,7 @@ vm_page_requeue(vm_page_t m) vm_page_lock_assert(m, MA_OWNED); KASSERT(m->queue != PQ_NONE, ("vm_page_requeue: page %p is not queued", m)); - pq = &vm_pagequeues[m->queue]; + pq = vm_page_pagequeue(m); vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, m, pageq); TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq); @@ -1811,7 +1832,7 @@ vm_page_requeue_locked(vm_page_t m) KASSERT(m->queue != PQ_NONE, ("vm_page_requeue_locked: page %p is not queued", m)); - pq = &vm_pagequeues[m->queue]; + pq = vm_page_pagequeue(m); vm_pagequeue_assert_locked(pq); TAILQ_REMOVE(&pq->pq_pl, m, pageq); TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq); @@ -1948,7 +1969,7 @@ vm_page_free_toq(vm_page_t m) */ mtx_lock(&vm_page_queue_free_mtx); m->flags |= PG_FREE; - cnt.v_free_count++; + vm_phys_freecnt_adj(m, 1); #if VM_NRESERVLEVEL > 0 if (!vm_reserv_free_page(m)) #else @@ -2081,14 +2102,14 @@ _vm_page_deactivate(vm_page_t m, int ath if (queue != PQ_NONE) vm_page_dequeue(m); m->flags &= ~PG_WINATCFLS; - pq = &vm_pagequeues[PQ_INACTIVE]; + pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE]; vm_pagequeue_lock(pq); m->queue = PQ_INACTIVE; if (athead) TAILQ_INSERT_HEAD(&pq->pq_pl, m, pageq); else TAILQ_INSERT_TAIL(&pq->pq_pl, m, pageq); - cnt.v_inactive_count++; + vm_pagequeue_cnt_inc(pq); vm_pagequeue_unlock(pq); } } @@ -2888,18 +2909,20 @@ DB_SHOW_COMMAND(page, vm_page_print_page DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { - - db_printf("PQ_FREE:"); - db_printf(" %d", cnt.v_free_count); - db_printf("\n"); - - db_printf("PQ_CACHE:"); - db_printf(" %d", cnt.v_cache_count); - db_printf("\n"); - - db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n", - *vm_pagequeues[PQ_ACTIVE].pq_cnt, - *vm_pagequeues[PQ_INACTIVE].pq_cnt); + int dom; + + db_printf("pq_free %d pq_cache %d\n", + cnt.v_free_count, cnt.v_cache_count); + for (dom = 0; dom < vm_ndomains; dom++) { + db_printf( + "dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n", + dom, + vm_dom[dom].vmd_page_count, + vm_dom[dom].vmd_free_count, + vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt, + vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt, + vm_dom[dom].vmd_pass); + } } DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo) Modified: projects/bhyve_npt_pmap/sys/vm/vm_page.h ============================================================================== --- projects/bhyve_npt_pmap/sys/vm/vm_page.h Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/vm/vm_page.h Thu Aug 22 03:43:12 2013 (r254635) @@ -181,18 +181,44 @@ TAILQ_HEAD(pglist, vm_page); struct vm_pagequeue { struct mtx pq_mutex; struct pglist pq_pl; - int *const pq_cnt; - const char *const pq_name; + int pq_cnt; + int * const pq_vcnt; + const char * const pq_name; } __aligned(CACHE_LINE_SIZE); -extern struct vm_pagequeue vm_pagequeues[PQ_COUNT]; + +struct vm_domain { + struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; + int vmd_fullintervalcount; + u_int vmd_page_count; + u_int vmd_free_count; + long vmd_segs; /* bitmask of the segments */ + boolean_t vmd_oom; + int vmd_pass; /* local pagedaemon pass */ + struct vm_page vmd_marker; /* marker for pagedaemon private use */ +}; + +extern struct vm_domain vm_dom[MAXMEMDOM]; #define vm_pagequeue_assert_locked(pq) mtx_assert(&(pq)->pq_mutex, MA_OWNED) -#define vm_pagequeue_init_lock(pq) mtx_init(&(pq)->pq_mutex, \ - (pq)->pq_name, "vm pagequeue", MTX_DEF | MTX_DUPOK); #define vm_pagequeue_lock(pq) mtx_lock(&(pq)->pq_mutex) #define vm_pagequeue_unlock(pq) mtx_unlock(&(pq)->pq_mutex) +#ifdef _KERNEL +static __inline void +vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend) +{ + +#ifdef notyet + vm_pagequeue_assert_locked(pq); +#endif + pq->pq_cnt += addend; + atomic_add_int(pq->pq_vcnt, addend); +} +#define vm_pagequeue_cnt_inc(pq) vm_pagequeue_cnt_add((pq), 1) +#define vm_pagequeue_cnt_dec(pq) vm_pagequeue_cnt_add((pq), -1) +#endif /* _KERNEL */ + extern struct mtx_padalign vm_page_queue_free_mtx; extern struct mtx_padalign pa_lock[]; @@ -393,6 +419,7 @@ boolean_t vm_page_is_cached(vm_object_t vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t); vm_page_t vm_page_next(vm_page_t m); int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *); +struct vm_pagequeue *vm_page_pagequeue(vm_page_t m); vm_page_t vm_page_prev(vm_page_t m); void vm_page_putfake(vm_page_t m); void vm_page_readahead_finish(vm_page_t m); Modified: projects/bhyve_npt_pmap/sys/vm/vm_pageout.c ============================================================================== --- projects/bhyve_npt_pmap/sys/vm/vm_pageout.c Thu Aug 22 02:54:20 2013 (r254634) +++ projects/bhyve_npt_pmap/sys/vm/vm_pageout.c Thu Aug 22 03:43:12 2013 (r254635) @@ -90,6 +90,7 @@ __FBSDID("$FreeBSD$"); #include <sys/resourcevar.h> #include <sys/sched.h> #include <sys/signalvar.h> +#include <sys/smp.h> #include <sys/vnode.h> #include <sys/vmmeter.h> #include <sys/rwlock.h> @@ -103,6 +104,7 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_map.h> #include <vm/vm_pageout.h> #include <vm/vm_pager.h> +#include <vm/vm_phys.h> #include <vm/swap_pager.h> #include <vm/vm_extern.h> #include <vm/uma.h> @@ -114,7 +116,8 @@ __FBSDID("$FreeBSD$"); /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static int vm_pageout_clean(vm_page_t); -static void vm_pageout_scan(int pass); +static void vm_pageout_scan(struct vm_domain *vmd, int pass); +static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass); struct proc *pageproc; @@ -216,14 +219,15 @@ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); -static boolean_t vm_pageout_launder(int, int, vm_paddr_t, vm_paddr_t); +static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t, + vm_paddr_t); #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(int req); #endif static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); -static void vm_pageout_page_stats(void); +static void vm_pageout_page_stats(struct vm_domain *vmd); /* * Initialize a dummy page for marking the caller's place in the specified @@ -267,7 +271,7 @@ vm_pageout_fallback_object_lock(vm_page_ queue = m->queue; vm_pageout_init_marker(&marker, queue); - pq = &vm_pagequeues[queue]; + pq = vm_page_pagequeue(m); object = m->object; TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq); @@ -309,7 +313,7 @@ vm_pageout_page_lock(vm_page_t m, vm_pag queue = m->queue; vm_pageout_init_marker(&marker, queue); - pq = &vm_pagequeues[queue]; + pq = vm_page_pagequeue(m); TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq); vm_pagequeue_unlock(pq); @@ -567,21 +571,17 @@ vm_pageout_flush(vm_page_t *mc, int coun } static boolean_t -vm_pageout_launder(int queue, int tries, vm_paddr_t low, vm_paddr_t high) +vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low, + vm_paddr_t high) { struct mount *mp; - struct vm_pagequeue *pq; struct vnode *vp; vm_object_t object; vm_paddr_t pa; vm_page_t m, m_tmp, next; - pq = &vm_pagequeues[queue]; vm_pagequeue_lock(pq); TAILQ_FOREACH_SAFE(m, &pq->pq_pl, pageq, next) { - KASSERT(m->queue == queue, - ("vm_pageout_launder: page %p's queue is not %d", m, - queue)); if ((m->flags & PG_MARKER) != 0) continue; pa = VM_PAGE_TO_PHYS(m); @@ -661,7 +661,8 @@ vm_pageout_launder(int queue, int tries, void vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high) { - int actl, actmax, inactl, inactmax; + int actl, actmax, inactl, inactmax, dom, initial_dom; + static int start_dom = 0; if (tries > 0) { /* @@ -677,19 +678,55 @@ vm_pageout_grow_cache(int tries, vm_padd */ uma_reclaim(); } + + /* + * Make the next scan start on the next domain. + */ + initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains; + inactl = 0; inactmax = cnt.v_inactive_count; actl = 0; actmax = tries < 2 ? 0 : cnt.v_active_count; + dom = initial_dom; + + /* + * Scan domains in round-robin order, first inactive queues, + * then active. Since domain usually owns large physically + * contiguous chunk of memory, it makes sense to completely + * exhaust one domain before switching to next, while growing + * the pool of contiguous physical pages. + * + * Do not even start launder a domain which cannot contain + * the specified address range, as indicated by segments + * constituting the domain. + */ again: - if (inactl < inactmax && vm_pageout_launder(PQ_INACTIVE, tries, low, - high)) { - inactl++; - goto again; - } - if (actl < actmax && vm_pageout_launder(PQ_ACTIVE, tries, low, high)) { - actl++; - goto again; + if (inactl < inactmax) { + if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, + low, high) && + vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE], + tries, low, high)) { + inactl++; + goto again; + } + if (++dom == vm_ndomains) + dom = 0; + if (dom != initial_dom) + goto again; + } + if (actl < actmax) { + if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, + low, high) && + vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE], + tries, low, high)) { + actl++; + goto again; + } + if (++dom == vm_ndomains) + dom = 0; + if (dom != initial_dom) + goto again; } } @@ -861,10 +898,9 @@ vm_pageout_map_deactivate_pages(map, des * vm_pageout_scan does the dirty work for the pageout daemon. */ static void -vm_pageout_scan(int pass) +vm_pageout_scan(struct vm_domain *vmd, int pass) { vm_page_t m, next; - struct vm_page marker; struct vm_pagequeue *pq; int page_shortage, maxscan, pcount; int addl_page_shortage; @@ -874,8 +910,6 @@ vm_pageout_scan(int pass) int maxlaunder; boolean_t queues_locked; - vm_pageout_init_marker(&marker, PQ_INACTIVE); - /* * Decrease registered cache sizes. */ @@ -888,7 +922,7 @@ vm_pageout_scan(int pass) /* * The addl_page_shortage is the number of temporarily * stuck pages in the inactive queue. In other words, the - * number of pages from cnt.v_inactive_count that should be + * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = atomic_readandclear_int(&vm_pageout_deficit); @@ -914,8 +948,6 @@ vm_pageout_scan(int pass) if (pass) maxlaunder = 10000; - maxscan = cnt.v_inactive_count; - /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or @@ -923,7 +955,8 @@ vm_pageout_scan(int pass) * is not used to form decisions for the inactive queue, only for the * active queue. */ - pq = &vm_pagequeues[PQ_INACTIVE]; + pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; + maxscan = pq->pq_cnt; vm_pagequeue_lock(pq); queues_locked = TRUE; for (m = TAILQ_FIRST(&pq->pq_pl); @@ -984,7 +1017,7 @@ vm_pageout_scan(int pass) * 'next' pointer. Use our marker to remember our * place. */ - TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, pageq); + TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, pageq); vm_pagequeue_unlock(pq); queues_locked = FALSE; @@ -1034,7 +1067,7 @@ vm_pageout_scan(int pass) /* * Held pages are essentially stuck in the * queue. So, they ought to be discounted - * from cnt.v_inactive_count. See the + * from the inactive count. See the * calculation of the page_shortage for the * loop over the active queue below. */ @@ -1178,7 +1211,7 @@ vm_pageout_scan(int pass) */ if (m->queue != PQ_INACTIVE || m->object != object || - TAILQ_NEXT(m, pageq) != &marker) { + TAILQ_NEXT(m, pageq) != &vmd->vmd_marker) { vm_page_unlock(m); if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201308220343.r7M3hCKU005530>