Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 12 Nov 2016 15:19:13 -0800
From:      Adrian Chadd <adrian.chadd@gmail.com>
To:        Konstantin Belousov <kib@freebsd.org>, "freebsd-mips@freebsd.org" <freebsd-mips@freebsd.org>
Cc:        "src-committers@freebsd.org" <src-committers@freebsd.org>,  "svn-src-all@freebsd.org" <svn-src-all@freebsd.org>,  "svn-src-head@freebsd.org" <svn-src-head@freebsd.org>
Subject:   Re: svn commit: r307626 - head/sys/ufs/ffs
Message-ID:  <CAJ-Vmom5rYe89m7bch4qoHHq3X2e67pPk_7G2aRGrjSPNp5mzg@mail.gmail.com>
In-Reply-To: <201610191109.u9JB9TTC002727@repo.freebsd.org>
References:  <201610191109.u9JB9TTC002727@repo.freebsd.org>

next in thread | previous in thread | raw e-mail | index | archive | help
hi!

This broke freebsd on mips24k.

BAD_PAGE_FAULT: pid 1 tid 100001 (init), uid 0: pc 0x4002a4 got a read
fault (type 0x2) at 0
Trapframe Register Dump:
    zero: 0    at: 0    v0: 0    v1: 0
    a0: 0x7fffeecc    a1: 0    a2: 0    a3: 0
    t0: 0    t1: 0    t2: 0    t3: 0
    t4: 0    t5: 0    t6: 0    t7: 0
    t8: 0    t9: 0x400260    s0: 0x10    s1: 0x2
    s2: 0x7fffeed0    s3: 0    s4: 0    s5: 0
    s6: 0    s7: 0    k0: 0    k1: 0
    gp: 0x4d55d0    sp: 0x7ffeee90    s8: 0    ra: 0
    sr: 0xfc13    mullo: 0    mulhi: 0    badvaddr: 0
    cause: 0x8    pc: 0x4002a4
Page table info for pc address 0x4002a4: pde = 0x809be000, pte = 0xa001acda
Dumping 4 words starting at pc address 0x4002a4:
8c420000 14400003 00908021 8f828024
Page table info for bad address 0: pde = 0, pte = 0

.. and yes, I've spent three days bisecting everything to get to this
particular commit.




-adrian


On 19 October 2016 at 04:09, Konstantin Belousov <kib@freebsd.org> wrote:
> Author: kib
> Date: Wed Oct 19 11:09:29 2016
> New Revision: 307626
> URL: https://svnweb.freebsd.org/changeset/base/307626
>
> Log:
>   Add FFS pager, which uses buffer cache read operation to validate pages.
>   See the comments for more detailed description of the algorithm.
>
>   The pager is used unconditionally when the block size of the
>   underlying device is larger than the machine page size, since local
>   vnode pager cannot handle the configuration [1].  Otherwise, the
>   vfs.ffs.use_buf_pager sysctl allows to switch to the local pager.
>
>   Measurements demonstrated no regression in the ever-important
>   buildworld benchmark, and small (~5%) throughput improvements in the
>   special microbenchmark configuration for dbench over swap-backed
>   md(4).
>
>   Code can be generalized and reused for other filesystems which use
>   buffer cache.
>
>   Reported by:  Anton Yuzhaninov <citrin@citrin.ru> [1]
>   Tested by:    pho
>   Benchmarked by:       mjg, pho
>   Reviewed by:  alc, markj, mckusick (previous version)
>   Sponsored by: The FreeBSD Foundation
>   MFC after:    2 weeks
>   Differential revision:        https://reviews.freebsd.org/D8198
>
> Modified:
>   head/sys/ufs/ffs/ffs_vnops.c
>
> Modified: head/sys/ufs/ffs/ffs_vnops.c
> ==============================================================================
> --- head/sys/ufs/ffs/ffs_vnops.c        Wed Oct 19 10:01:04 2016        (r307625)
> +++ head/sys/ufs/ffs/ffs_vnops.c        Wed Oct 19 11:09:29 2016        (r307626)
> @@ -77,6 +77,7 @@ __FBSDID("$FreeBSD$");
>  #include <sys/priv.h>
>  #include <sys/rwlock.h>
>  #include <sys/stat.h>
> +#include <sys/sysctl.h>
>  #include <sys/vmmeter.h>
>  #include <sys/vnode.h>
>
> @@ -86,6 +87,7 @@ __FBSDID("$FreeBSD$");
>  #include <vm/vm_object.h>
>  #include <vm/vm_page.h>
>  #include <vm/vm_pager.h>
> +#include <vm/vm_pageout.h>
>  #include <vm/vnode_pager.h>
>
>  #include <ufs/ufs/extattr.h>
> @@ -102,8 +104,9 @@ __FBSDID("$FreeBSD$");
>  #ifdef DIRECTIO
>  extern int     ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
>  #endif
> -static vop_fsync_t     ffs_fsync;
>  static vop_fdatasync_t ffs_fdatasync;
> +static vop_fsync_t     ffs_fsync;
> +static vop_getpages_t  ffs_getpages;
>  static vop_lock1_t     ffs_lock;
>  static vop_read_t      ffs_read;
>  static vop_write_t     ffs_write;
> @@ -119,13 +122,12 @@ static vop_openextattr_t  ffs_openextattr
>  static vop_setextattr_t        ffs_setextattr;
>  static vop_vptofh_t    ffs_vptofh;
>
> -
>  /* Global vfs data structures for ufs. */
>  struct vop_vector ffs_vnodeops1 = {
>         .vop_default =          &ufs_vnodeops,
>         .vop_fsync =            ffs_fsync,
>         .vop_fdatasync =        ffs_fdatasync,
> -       .vop_getpages =         vnode_pager_local_getpages,
> +       .vop_getpages =         ffs_getpages,
>         .vop_getpages_async =   vnode_pager_local_getpages_async,
>         .vop_lock1 =            ffs_lock,
>         .vop_read =             ffs_read,
> @@ -147,7 +149,7 @@ struct vop_vector ffs_vnodeops2 = {
>         .vop_default =          &ufs_vnodeops,
>         .vop_fsync =            ffs_fsync,
>         .vop_fdatasync =        ffs_fdatasync,
> -       .vop_getpages =         vnode_pager_local_getpages,
> +       .vop_getpages =         ffs_getpages,
>         .vop_getpages_async =   vnode_pager_local_getpages_async,
>         .vop_lock1 =            ffs_lock,
>         .vop_read =             ffs_read,
> @@ -1784,3 +1786,165 @@ vop_vptofh {
>         ufhp->ufid_gen = ip->i_gen;
>         return (0);
>  }
> +
> +SYSCTL_DECL(_vfs_ffs);
> +static int use_buf_pager = 1;
> +SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
> +    "Always use buffer pager instead of bmap");
> +static int buf_pager_relbuf;
> +SYSCTL_INT(_vfs_ffs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
> +    &buf_pager_relbuf, 0,
> +    "Make buffer pager release buffers after reading");
> +
> +/*
> + * The FFS pager.  It uses buffer reads to validate pages.
> + *
> + * In contrast to the generic local pager from vm/vnode_pager.c, this
> + * pager correctly and easily handles volumes where the underlying
> + * device block size is greater than the machine page size.  The
> + * buffer cache transparently extends the requested page run to be
> + * aligned at the block boundary, and does the necessary bogus page
> + * replacements in the addends to avoid obliterating already valid
> + * pages.
> + *
> + * The only non-trivial issue is that the exclusive busy state for
> + * pages, which is assumed by the vm_pager_getpages() interface, is
> + * incompatible with the VMIO buffer cache's desire to share-busy the
> + * pages.  This function performs a trivial downgrade of the pages'
> + * state before reading buffers, and a less trivial upgrade from the
> + * shared-busy to excl-busy state after the read.
> + */
> +static int
> +ffs_getpages(struct vop_getpages_args *ap)
> +{
> +       struct vnode *vp;
> +       vm_page_t *ma, m;
> +       vm_object_t object;
> +       struct buf *bp;
> +       struct ufsmount *um;
> +       ufs_lbn_t lbn, lbnp;
> +       vm_ooffset_t la, lb;
> +       long bsize;
> +       int bo_bs, count, error, i;
> +       bool redo, lpart;
> +
> +       vp = ap->a_vp;
> +       ma = ap->a_m;
> +       count = ap->a_count;
> +
> +       um = VFSTOUFS(ap->a_vp->v_mount);
> +       bo_bs = um->um_devvp->v_bufobj.bo_bsize;
> +       if (!use_buf_pager && bo_bs <= PAGE_SIZE)
> +               return (vnode_pager_generic_getpages(vp, ma, count,
> +                   ap->a_rbehind, ap->a_rahead, NULL, NULL));
> +
> +       object = vp->v_object;
> +       la = IDX_TO_OFF(ma[count - 1]->pindex);
> +       if (la >= object->un_pager.vnp.vnp_size)
> +               return (VM_PAGER_BAD);
> +       lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size;
> +       if (ap->a_rbehind != NULL) {
> +               lb = IDX_TO_OFF(ma[0]->pindex);
> +               *ap->a_rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
> +       }
> +       if (ap->a_rahead != NULL) {
> +               *ap->a_rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);
> +               if (la + IDX_TO_OFF(*ap->a_rahead) >=
> +                   object->un_pager.vnp.vnp_size) {
> +                       *ap->a_rahead = OFF_TO_IDX(roundup2(object->un_pager.
> +                           vnp.vnp_size, PAGE_SIZE) - la);
> +               }
> +       }
> +       VM_OBJECT_WLOCK(object);
> +again:
> +       for (i = 0; i < count; i++)
> +               vm_page_busy_downgrade(ma[i]);
> +       VM_OBJECT_WUNLOCK(object);
> +
> +       lbnp = -1;
> +       for (i = 0; i < count; i++) {
> +               m = ma[i];
> +
> +               /*
> +                * Pages are shared busy and the object lock is not
> +                * owned, which together allow for the pages'
> +                * invalidation.  The racy test for validity avoids
> +                * useless creation of the buffer for the most typical
> +                * case when invalidation is not used in redo or for
> +                * parallel read.  The shared->excl upgrade loop at
> +                * the end of the function catches the race in a
> +                * reliable way (protected by the object lock).
> +                */
> +               if (m->valid == VM_PAGE_BITS_ALL)
> +                       continue;
> +
> +               lbn = lblkno(um->um_fs, IDX_TO_OFF(m->pindex));
> +               if (lbn != lbnp) {
> +                       bsize = blksize(um->um_fs, VTOI(vp), lbn);
> +                       error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED,
> +                           &bp);
> +                       if (error != 0)
> +                               break;
> +                       KASSERT(1 /* racy, enable for debugging */ ||
> +                           m->valid == VM_PAGE_BITS_ALL || i == count - 1,
> +                           ("buf %d %p invalid", i, m));
> +                       if (i == count - 1 && lpart) {
> +                               VM_OBJECT_WLOCK(object);
> +                               if (m->valid != 0 &&
> +                                   m->valid != VM_PAGE_BITS_ALL)
> +                                       vm_page_zero_invalid(m, TRUE);
> +                               VM_OBJECT_WUNLOCK(object);
> +                       }
> +                       if (LIST_EMPTY(&bp->b_dep)) {
> +                               /*
> +                                * Invalidation clears m->valid, but
> +                                * may leave B_CACHE flag if the
> +                                * buffer existed at the invalidation
> +                                * time.  In this case, recycle the
> +                                * buffer to do real read on next
> +                                * bread() after redo.
> +                                *
> +                                * Otherwise B_RELBUF is not strictly
> +                                * necessary, enable to reduce buf
> +                                * cache pressure.
> +                                */
> +                               if (buf_pager_relbuf ||
> +                                   m->valid != VM_PAGE_BITS_ALL)
> +                                       bp->b_flags |= B_RELBUF;
> +
> +                               bp->b_flags &= ~B_NOCACHE;
> +                               brelse(bp);
> +                       } else {
> +                               bqrelse(bp);
> +                       }
> +                       lbnp = lbn;
> +               }
> +       }
> +
> +       VM_OBJECT_WLOCK(object);
> +       redo = false;
> +       for (i = 0; i < count; i++) {
> +               vm_page_sunbusy(ma[i]);
> +               ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
> +
> +               /*
> +                * Since the pages were only sbusy while neither the
> +                * buffer nor the object lock was held by us, or
> +                * reallocated while vm_page_grab() slept for busy
> +                * relinguish, they could have been invalidated.
> +                * Recheck the valid bits and re-read as needed.
> +                *
> +                * Note that the last page is made fully valid in the
> +                * read loop, and partial validity for the page at
> +                * index count - 1 could mean that the page was
> +                * invalidated or removed, so we must restart for
> +                * safety as well.
> +                */
> +               if (ma[i]->valid != VM_PAGE_BITS_ALL)
> +                       redo = true;
> +       }
> +       if (redo && error == 0)
> +               goto again;
> +       VM_OBJECT_WUNLOCK(object);
> +       return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
> +}
>



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?CAJ-Vmom5rYe89m7bch4qoHHq3X2e67pPk_7G2aRGrjSPNp5mzg>