From owner-svn-src-all@FreeBSD.ORG Sun Jun 15 04:51:54 2014
Message-Id: <201406150451.s5F4prkW004284@svn.freebsd.org>
From: Konstantin Belousov
Date: Sun, 15 Jun 2014 04:51:53 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r267491 - head/sys/kern
X-SVN-Group: head
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Author: kib
Date: Sun Jun 15 04:51:53 2014
New Revision: 267491
URL: http://svnweb.freebsd.org/changeset/base/267491

Log:
  Use vn_io_fault for the writes from core dumping code.  Recursing
  into VM due to copyin(9) faulting while VFS locks are held is
  deadlock-prone there in the same way as for the write(2) syscall.

  Reported and tested by:	pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks

Modified:
  head/sys/kern/vfs_vnops.c

Modified: head/sys/kern/vfs_vnops.c
==============================================================================
--- head/sys/kern/vfs_vnops.c	Sun Jun 15 03:54:23 2014	(r267490)
+++ head/sys/kern/vfs_vnops.c	Sun Jun 15 04:51:53 2014	(r267491)
@@ -8,7 +8,7 @@
  * the permission of UNIX System Laboratories, Inc.
  *
  * Copyright (c) 2012 Konstantin Belousov
- * Copyright (c) 2013 The FreeBSD Foundation
+ * Copyright (c) 2013, 2014 The FreeBSD Foundation
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
@@ -106,6 +106,53 @@ struct fileops vnops = {
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
+static const int io_hold_cnt = 16;
+static int vn_io_fault_enable = 1;
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
+    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
+static u_long vn_io_faults_cnt;
+SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
+    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
+
+/*
+ * Returns true if vn_io_fault mode of handling the i/o request should
+ * be used.
+ */
+static bool
+do_vn_io_fault(struct vnode *vp, struct uio *uio)
+{
+	struct mount *mp;
+
+	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
+	    (mp = vp->v_mount) != NULL &&
+	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
+}
+
+/*
+ * Structure used to pass arguments to vn_io_fault1(), to do either
+ * file- or vnode-based I/O calls.
+ */
+struct vn_io_fault_args {
+	enum {
+		VN_IO_FAULT_FOP,
+		VN_IO_FAULT_VOP
+	} kind;
+	struct ucred *cred;
+	int flags;
+	union {
+		struct fop_args_tag {
+			struct file *fp;
+			fo_rdwr_t *doio;
+		} fop_args;
+		struct vop_args_tag {
+			struct vnode *vp;
+		} vop_args;
+	} args;
+};
+
+static int vn_io_fault1(struct vnode *vp, struct uio *uio,
+    struct vn_io_fault_args *args, struct thread *td);
+
 int
 vn_open(ndp, flagp, cmode, fp)
 	struct nameidata *ndp;
@@ -439,6 +486,7 @@ vn_rdwr(enum uio_rw rw, struct vnode *vp
 	struct mount *mp;
 	struct ucred *cred;
 	void *rl_cookie;
+	struct vn_io_fault_args args;
 	int error, lock_flags;
 
 	auio.uio_iov = &aiov;
@@ -493,10 +541,17 @@ vn_rdwr(enum uio_rw rw, struct vnode *vp
 			cred = file_cred;
 		else
 			cred = active_cred;
-		if (rw == UIO_READ)
+		if (do_vn_io_fault(vp, &auio)) {
+			args.kind = VN_IO_FAULT_VOP;
+			args.cred = cred;
+			args.flags = ioflg;
+			args.args.vop_args.vp = vp;
+			error = vn_io_fault1(vp, &auio, &args, td);
+		} else if (rw == UIO_READ) {
 			error = VOP_READ(vp, &auio, ioflg, cred);
-		else
+		} else /* if (rw == UIO_WRITE) */ {
 			error = VOP_WRITE(vp, &auio, ioflg, cred);
+		}
 	}
 	if (aresid)
 		*aresid = auio.uio_resid;
@@ -883,14 +938,6 @@ unlock:
 	return (error);
 }
 
-static const int io_hold_cnt = 16;
-static int vn_io_fault_enable = 1;
-SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
-    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
-static u_long vn_io_faults_cnt;
-SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
-    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
-
 /*
  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
  * prevent the following deadlock:
@@ -924,38 +971,55 @@ SYSCTL_ULONG(_debug, OID_AUTO, vn_io_fau
  * make the current i/o request atomic with respect to other i/os and
  * truncations.
  */
+
+/*
+ * Decode vn_io_fault_args and perform the corresponding i/o.
+ */
 static int
-vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
-    int flags, struct thread *td)
+vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
+    struct thread *td)
+{
+
+	switch (args->kind) {
+	case VN_IO_FAULT_FOP:
+		return ((args->args.fop_args.doio)(args->args.fop_args.fp,
+		    uio, args->cred, args->flags, td));
+	case VN_IO_FAULT_VOP:
+		if (uio->uio_rw == UIO_READ) {
+			return (VOP_READ(args->args.vop_args.vp, uio,
+			    args->flags, args->cred));
+		} else if (uio->uio_rw == UIO_WRITE) {
+			return (VOP_WRITE(args->args.vop_args.vp, uio,
+			    args->flags, args->cred));
+		}
+		break;
+	}
+	panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind,
+	    uio->uio_rw);
+}
+
+/*
+ * Common code for vn_io_fault(), agnostic to the kind of i/o request.
+ * Uses vn_io_fault_doio() to make the call to an actual i/o function.
+ * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
+ * into args and call vn_io_fault1() to handle faults during the user
+ * mode buffer accesses.
+ */
+static int
+vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
+    struct thread *td)
 {
 	vm_page_t ma[io_hold_cnt + 2];
 	struct uio *uio_clone, short_uio;
 	struct iovec short_iovec[1];
-	fo_rdwr_t *doio;
-	struct vnode *vp;
-	void *rl_cookie;
-	struct mount *mp;
 	vm_page_t *prev_td_ma;
-	int error, cnt, save, saveheld, prev_td_ma_cnt;
-	vm_offset_t addr, end;
 	vm_prot_t prot;
+	vm_offset_t addr, end;
 	size_t len, resid;
 	ssize_t adv;
+	int error, cnt, save, saveheld, prev_td_ma_cnt;
 
-	if (uio->uio_rw == UIO_READ)
-		doio = vn_read;
-	else
-		doio = vn_write;
-	vp = fp->f_vnode;
-	foffset_lock_uio(fp, uio, flags);
-
-	if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
-	    ((mp = vp->v_mount) != NULL &&
-	    (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
-	    !vn_io_fault_enable) {
-		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
-		goto out_last;
-	}
+	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
 
 	/*
 	 * The UFS follows IO_UNIT directive and replays back both
@@ -973,22 +1037,8 @@ vn_io_fault(struct file *fp, struct uio
 	short_uio.uio_rw = uio->uio_rw;
 	short_uio.uio_td = uio->uio_td;
 
-	if (uio->uio_rw == UIO_READ) {
-		prot = VM_PROT_WRITE;
-		rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
-		    uio->uio_offset + uio->uio_resid);
-	} else {
-		prot = VM_PROT_READ;
-		if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0)
-			/* For appenders, punt and lock the whole range. */
-			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
-		else
-			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
-			    uio->uio_offset + uio->uio_resid);
-	}
-
 	save = vm_fault_disable_pagefaults();
-	error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+	error = vn_io_fault_doio(args, uio, td);
 	if (error != EFAULT)
 		goto out;
 
@@ -1038,8 +1088,7 @@ vn_io_fault(struct file *fp, struct uio
 		td->td_ma = ma;
 		td->td_ma_cnt = cnt;
 
-		error = doio(fp, &short_uio, active_cred, flags | FOF_OFFSET,
-		    td);
+		error = vn_io_fault_doio(args, &short_uio, td);
 		vm_page_unhold_pages(ma, cnt);
 		adv = len - short_uio.uio_resid;
@@ -1060,9 +1109,45 @@ vn_io_fault(struct file *fp, struct uio
 	curthread_pflags_restore(saveheld);
 out:
 	vm_fault_enable_pagefaults(save);
-	vn_rangelock_unlock(vp, rl_cookie);
 	free(uio_clone, M_IOV);
-out_last:
+	return (error);
+}
+
+static int
+vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, struct thread *td)
+{
+	fo_rdwr_t *doio;
+	struct vnode *vp;
+	void *rl_cookie;
+	struct vn_io_fault_args args;
+	int error;
+
+	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
+	vp = fp->f_vnode;
+	foffset_lock_uio(fp, uio, flags);
+	if (do_vn_io_fault(vp, uio)) {
+		args.kind = VN_IO_FAULT_FOP;
+		args.args.fop_args.fp = fp;
+		args.args.fop_args.doio = doio;
+		args.cred = active_cred;
+		args.flags = flags | FOF_OFFSET;
+		if (uio->uio_rw == UIO_READ) {
+			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
+			    uio->uio_offset + uio->uio_resid);
+		} else if ((fp->f_flag & O_APPEND) != 0 ||
+		    (flags & FOF_OFFSET) == 0) {
+			/* For appenders, punt and lock the whole range. */
+			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+		} else {
+			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
+			    uio->uio_offset + uio->uio_resid);
+		}
+		error = vn_io_fault1(vp, uio, &args, td);
+		vn_rangelock_unlock(vp, rl_cookie);
+	} else {
+		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+	}
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
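
For context, a minimal hypothetical sketch (not part of this commit; the helper name dump_user_range() is invented for illustration) of the kind of caller the change targets: a core-dump-style write of a user-space buffer through vn_rdwr().  With this revision such a write is dispatched to vn_io_fault1() whenever do_vn_io_fault() approves it, that is, for UIO_USERSPACE i/o on a VREG vnode whose filesystem sets MNTK_NO_IOPF, so a copyin(9) fault can no longer recurse into the VM system while vnode locks are held.

/*
 * Hypothetical illustration only, not part of r267491: write a chunk
 * of user memory to the core file vnode.  vn_rdwr() packs the request
 * into struct vn_io_fault_args (VN_IO_FAULT_VOP) and runs it through
 * vn_io_fault1() with page faults disabled, prefaulting and holding
 * the user pages instead of faulting under the vnode lock.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/ucred.h>
#include <sys/uio.h>
#include <sys/vnode.h>

static int
dump_user_range(struct vnode *vp, void *base, int len, off_t offset,
    struct ucred *cred, struct thread *td)
{

	/* IO_UNIT keeps the chunk atomic; NOCRED passes no file credential. */
	return (vn_rdwr(UIO_WRITE, vp, base, len, offset, UIO_USERSPACE,
	    IO_UNIT | IO_DIRECT, cred, NOCRED, NULL, td));
}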