From owner-freebsd-current Sun Nov 7 7:52:29 1999 Delivered-To: freebsd-current@freebsd.org Received: from critter.freebsd.dk (critter.freebsd.dk [212.242.40.131]) by hub.freebsd.org (Postfix) with ESMTP id 3747B14D24 for ; Sun, 7 Nov 1999 07:52:17 -0800 (PST) (envelope-from phk@critter.freebsd.dk) Received: from critter.freebsd.dk (localhost [127.0.0.1]) by critter.freebsd.dk (8.9.3/8.9.2) with ESMTP id QAA14631 for ; Sun, 7 Nov 1999 16:52:12 +0100 (CET) (envelope-from phk@critter.freebsd.dk) To: current@freebsd.org Subject: bdev patch looking for review & testers From: Poul-Henning Kamp Date: Sun, 07 Nov 1999 16:52:12 +0100 Message-ID: <14629.941989932@critter.freebsd.dk> Sender: owner-freebsd-current@FreeBSD.ORG Precedence: bulk X-Loop: FreeBSD.ORG The main thing in this patch is to make bdevs use the same (ie: normal) vnode locking as character devices. Secondary effects are general cleanup in spec_vnops.c You should see no operational changes by applying this patch, if you do, please report them to me. Poul-Henning Index: gnu/ext2fs/ext2_vfsops.c =================================================================== RCS file: /home/ncvs/src/sys/gnu/ext2fs/ext2_vfsops.c,v retrieving revision 1.56 diff -u -r1.56 ext2_vfsops.c --- ext2_vfsops.c 1999/11/01 23:57:22 1.56 +++ ext2_vfsops.c 1999/11/02 06:20:23 @@ -607,7 +607,9 @@ #endif ronly = (mp->mnt_flag & MNT_RDONLY) != 0; - if ((error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p)) != 0) + error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); + VOP_UNLOCK(devvp, 0, p); + if (error != 0) return (error); if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0) size = DEV_BSIZE; Index: isofs/cd9660/cd9660_vfsops.c =================================================================== RCS file: /home/ncvs/src/sys/isofs/cd9660/cd9660_vfsops.c,v retrieving revision 1.67 diff -u -r1.67 cd9660_vfsops.c --- cd9660_vfsops.c 1999/11/01 23:57:25 1.67 +++ cd9660_vfsops.c 1999/11/02 06:20:26 @@ -296,8 +296,11 @@ if ((error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0))) return (error); - if ((error = VOP_OPEN(devvp, FREAD, FSCRED, p))) + error = VOP_OPEN(devvp, FREAD, FSCRED, p); + VOP_UNLOCK(devvp, 0, p); + if (error != 0) return error; + needclose = 1; /* This is the "logical sector size". The standard says this Index: kern/vfs_subr.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_subr.c,v retrieving revision 1.232 diff -u -r1.232 vfs_subr.c --- vfs_subr.c 1999/10/29 18:08:55 1.232 +++ vfs_subr.c 1999/10/31 15:57:32 @@ -2870,7 +2870,7 @@ vn_isdisk(vp) struct vnode *vp; { - if (vp->v_type != VBLK) + if (vp->v_type != VBLK && vp->v_type != VCHR) return (0); if (!devsw(vp->v_rdev)) return (0); Index: miscfs/specfs/spec_vnops.c =================================================================== RCS file: /home/ncvs/src/sys/miscfs/specfs/spec_vnops.c,v retrieving revision 1.125 diff -u -r1.125 spec_vnops.c --- spec_vnops.c 1999/11/07 15:09:59 1.125 +++ spec_vnops.c 1999/11/07 15:29:29 @@ -68,10 +68,8 @@ static int spec_poll __P((struct vop_poll_args *)); static int spec_print __P((struct vop_print_args *)); static int spec_read __P((struct vop_read_args *)); -static int spec_bufread __P((struct vop_read_args *)); static int spec_strategy __P((struct vop_strategy_args *)); static int spec_write __P((struct vop_write_args *)); -static int spec_bufwrite __P((struct vop_write_args *)); vop_t **spec_vnodeop_p; static struct vnodeopv_entry_desc spec_vnodeop_entries[] = { @@ -142,8 +140,8 @@ } */ *ap; { struct proc *p = ap->a_p; - struct vnode *bvp, *vp = ap->a_vp; - dev_t bdev, dev = vp->v_rdev; + struct vnode *vp = ap->a_vp; + dev_t dev = vp->v_rdev; int error; struct cdevsw *dsw; const char *cp; @@ -162,61 +160,43 @@ if (!dev->si_iosize_max) dev->si_iosize_max = DFLTPHYS; - switch (vp->v_type) { - case VCHR: - if (ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { - /* - * When running in very secure mode, do not allow - * opens for writing of any disk character devices. - */ - if (securelevel >= 2 - && dsw->d_bmaj != -1 - && (dsw->d_flags & D_TYPEMASK) == D_DISK) - return (EPERM); - /* - * When running in secure mode, do not allow opens - * for writing of character - * devices whose corresponding block devices are - * currently mounted. - */ - if (securelevel >= 1) { - if ((bdev = chrtoblk(dev)) != NODEV && - vfinddev(bdev, VBLK, &bvp) && - bvp->v_usecount > 0 && - (error = vfs_mountedon(bvp))) - return (error); - } - } - if ((dsw->d_flags & D_TYPEMASK) == D_TTY) - vp->v_flag |= VISTTY; - VOP_UNLOCK(vp, 0, p); - error = (*dsw->d_open)(dev, ap->a_mode, S_IFCHR, p); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - break; - case VBLK: + /* + * XXX: Disks get special billing here, but it is mostly wrong. + * XXX: diskpartitions can overlap and the real checks should + * XXX: take this into account, and consequently they need to + * XXX: live in the diskslicing code. Some checks do. + */ + if (vn_isdisk(vp) && ap->a_cred != FSCRED && (ap->a_mode & FWRITE)) { /* - * When running in very secure mode, do not allow - * opens for writing of any disk block devices. + * Never allow opens for write if the device is mounted R/W */ - if (securelevel >= 2 && ap->a_cred != FSCRED && - (ap->a_mode & FWRITE) && - (dsw->d_flags & D_TYPEMASK) == D_DISK) + if (vp->v_specmountpoint != NULL && + !(vp->v_specmountpoint->mnt_flag & MNT_RDONLY)) + return (EBUSY); + + /* + * When running in secure mode, do not allow opens + * for writing if the device is mounted + */ + if (securelevel >= 1 && vp->v_specmountpoint != NULL) return (EPERM); /* - * Do not allow opens of block devices that are - * currently mounted. + * When running in very secure mode, do not allow + * opens for writing of any devices. */ - error = vfs_mountedon(vp); - if (error) - return (error); - error = (*dsw->d_open)(dev, ap->a_mode, S_IFBLK, p); - break; - default: - error = ENXIO; - break; + if (securelevel >= 2) + return (EPERM); } + /* XXX: Special casing of ttys for deadfs. Probably redundant */ + if (dsw->d_flags & D_TTY) + vp->v_flag |= VISTTY; + + VOP_UNLOCK(vp, 0, p); + error = (*dsw->d_open)(dev, ap->a_mode, S_IFCHR, p); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (error) return (error); @@ -254,123 +234,29 @@ spec_read(ap) struct vop_read_args /* { struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; - int error = 0; - -#ifdef DIAGNOSTIC - if (uio->uio_rw != UIO_READ) - panic("spec_read mode"); - if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) - panic("spec_read proc"); -#endif - if (uio->uio_resid == 0) - return (0); - - if (vp->v_type == VCHR || (bdev_buffered == 0)) { - VOP_UNLOCK(vp, 0, p); - error = (*devsw(vp->v_rdev)->d_read) - (vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - return (error); - } else { - return (spec_bufread(ap)); - } -} - - -/* Vnode op for buffered read */ -/* ARGSUSED */ -static int -spec_bufread(ap) - struct vop_read_args /* { - struct vnode *a_vp; struct uio *a_uio; - int a_ioflag; + int a_ioflag; struct ucred *a_cred; } */ *ap; { - struct vnode *vp = ap->a_vp; - struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; - struct buf *bp; - daddr_t bn, nextbn; - long bsize, bscale; - struct partinfo dpart; - int n, on; - d_ioctl_t *ioctl; - int error = 0; - int seqcount = ap->a_ioflag >> 16; + struct vnode *vp; + struct proc *p; + struct uio *uio; dev_t dev; + int error; - if (uio->uio_offset < 0) - return (EINVAL); + vp = ap->a_vp; dev = vp->v_rdev; + uio = ap->a_uio; + p = uio->uio_procp; - /* - * Calculate block size for block device. The block size must - * be larger then the physical minimum. - */ - - bsize = vp->v_rdev->si_bsize_best; - if (bsize < vp->v_rdev->si_bsize_phys) - bsize = vp->v_rdev->si_bsize_phys; - if (bsize < BLKDEV_IOSIZE) - bsize = BLKDEV_IOSIZE; - - if ((ioctl = devsw(dev)->d_ioctl) != NULL && - (*ioctl)(dev, DIOCGPART, (caddr_t)&dpart, FREAD, p) == 0 && - dpart.part->p_fstype == FS_BSDFFS && - dpart.part->p_frag != 0 && dpart.part->p_fsize != 0) - bsize = dpart.part->p_frag * dpart.part->p_fsize; - bscale = btodb(bsize); - do { - bn = btodb(uio->uio_offset) & ~(bscale - 1); - on = uio->uio_offset % bsize; - if (seqcount > 1) { - nextbn = bn + bscale; - error = breadn(vp, bn, (int)bsize, &nextbn, - (int *)&bsize, 1, NOCRED, &bp); - } else { - error = bread(vp, bn, (int)bsize, NOCRED, &bp); - } + if (uio->uio_resid == 0) + return (0); - /* - * Figure out how much of the buffer is valid relative - * to our offset into the buffer, which may be negative - * if we are beyond the EOF. - * - * The valid size of the buffer is based on - * bp->b_bcount (which may have been truncated by - * dscheck or the device) minus bp->b_resid, which - * may be indicative of an I/O error if non-zero. - */ - if (error == 0) { - n = bp->b_bcount - on; - if (n < 0) { - error = EINVAL; - } else { - n = min(n, bp->b_bcount - bp->b_resid - on); - if (n < 0) - error = EIO; - } - } - if (error) { - brelse(bp); - return (error); - } - n = min(n, uio->uio_resid); - error = uiomove((char *)bp->b_data + on, n, uio); - brelse(bp); - } while (error == 0 && uio->uio_resid > 0 && n != 0); + VOP_UNLOCK(vp, 0, p); + error = (*devsw(dev)->d_read) (dev, uio, ap->a_ioflag); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); - /* NOTREACHED */ } /* @@ -381,130 +267,25 @@ spec_write(ap) struct vop_write_args /* { struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - struct ucred *a_cred; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; - int error = 0; - -#ifdef DIAGNOSTIC - if (uio->uio_rw != UIO_WRITE) - panic("spec_write mode"); - if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) - panic("spec_write proc"); -#endif - - if (vp->v_type == VCHR || (bdev_buffered == 0)) { - VOP_UNLOCK(vp, 0, p); - error = (*devsw(vp->v_rdev)->d_write) - (vp->v_rdev, uio, ap->a_ioflag); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - return (error); - } else { - return (spec_bufwrite(ap)); - } -} - - -/* Vnode op for buffered write */ -/* ARGSUSED */ -static int -spec_bufwrite(ap) - struct vop_write_args /* { - struct vnode *a_vp; struct uio *a_uio; - int a_ioflag; + int a_ioflag; struct ucred *a_cred; } */ *ap; { - struct vnode *vp = ap->a_vp; - struct uio *uio = ap->a_uio; - struct proc *p = uio->uio_procp; - struct buf *bp; - daddr_t bn; - int bsize, blkmask; - struct partinfo dpart; - register int n, on; - int error = 0; - - if (uio->uio_resid == 0) - return (0); - if (uio->uio_offset < 0) - return (EINVAL); - - /* - * Calculate block size for block device. The block size must - * be larger then the physical minimum. - */ - bsize = vp->v_rdev->si_bsize_best; - if (bsize < vp->v_rdev->si_bsize_phys) - bsize = vp->v_rdev->si_bsize_phys; - if (bsize < BLKDEV_IOSIZE) - bsize = BLKDEV_IOSIZE; - - if ((*devsw(vp->v_rdev)->d_ioctl)(vp->v_rdev, DIOCGPART, - (caddr_t)&dpart, FREAD, p) == 0) { - if (dpart.part->p_fstype == FS_BSDFFS && - dpart.part->p_frag != 0 && dpart.part->p_fsize != 0) - bsize = dpart.part->p_frag * - dpart.part->p_fsize; - } - blkmask = btodb(bsize) - 1; - do { - bn = btodb(uio->uio_offset) & ~blkmask; - on = uio->uio_offset % bsize; + struct vnode *vp; + struct proc *p; + struct uio *uio; + dev_t dev; + int error; - /* - * Calculate potential request size, determine - * if we can avoid a read-before-write. - */ - n = min((unsigned)(bsize - on), uio->uio_resid); - if (n == bsize) - bp = getblk(vp, bn, bsize, 0, 0); - else - error = bread(vp, bn, bsize, NOCRED, &bp); + vp = ap->a_vp; + dev = vp->v_rdev; + uio = ap->a_uio; + p = uio->uio_procp; - /* - * n is the amount of effective space in the buffer - * that we wish to write relative to our offset into - * the buffer. We have to truncate it to the valid - * size of the buffer relative to our offset into - * the buffer (which may end up being negative if - * we are beyond the EOF). - * - * The valid size of the buffer is based on - * bp->b_bcount (which may have been truncated by - * dscheck or the device) minus bp->b_resid, which - * may be indicative of an I/O error if non-zero. - * - * XXX In a newly created buffer, b_bcount == bsize - * and, being asynchronous, we have no idea of the - * EOF. - */ - if (error == 0) { - n = min(n, bp->b_bcount - on); - if (n < 0) { - error = EINVAL; - } else { - n = min(n, bp->b_bcount - bp->b_resid - on); - if (n < 0) - error = EIO; - } - } - if (error) { - brelse(bp); - return (error); - } - error = uiomove((char *)bp->b_data + on, n, uio); - if (n + on == bsize) - bawrite(bp); - else - bdwrite(bp); - } while (error == 0 && uio->uio_resid > 0 && n != 0); + VOP_UNLOCK(vp, 0, p); + error = (*devsw(dev)->d_write) (dev, uio, ap->a_ioflag); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); return (error); } @@ -523,20 +304,11 @@ struct proc *a_p; } */ *ap; { - dev_t dev = ap->a_vp->v_rdev; - - switch (ap->a_vp->v_type) { + dev_t dev; - case VCHR: - return ((*devsw(dev)->d_ioctl)(dev, ap->a_command, - ap->a_data, ap->a_fflag, ap->a_p)); - case VBLK: - return ((*devsw(dev)->d_ioctl)(dev, ap->a_command, - ap->a_data, ap->a_fflag, ap->a_p)); - default: - panic("spec_ioctl"); - /* NOTREACHED */ - } + dev = ap->a_vp->v_rdev; + return ((*devsw(dev)->d_ioctl)(dev, ap->a_command, + ap->a_data, ap->a_fflag, ap->a_p)); } /* ARGSUSED */ @@ -550,17 +322,11 @@ } */ *ap; { register dev_t dev; - - switch (ap->a_vp->v_type) { - case VCHR: - dev = ap->a_vp->v_rdev; - return (*devsw(dev)->d_poll)(dev, ap->a_events, ap->a_p); - default: - return (vop_defaultop((struct vop_generic_args *)ap)); - - } + dev = ap->a_vp->v_rdev; + return (*devsw(dev)->d_poll)(dev, ap->a_events, ap->a_p); } + /* * Synch buffers associated with a block device */ @@ -574,16 +340,16 @@ struct proc *a_p; } */ *ap; { - register struct vnode *vp = ap->a_vp; - register struct buf *bp; + struct vnode *vp = ap->a_vp; + struct buf *bp; struct buf *nbp; int s; - if (vp->v_type == VCHR) - return (0); /* - * Flush all dirty buffers associated with a block device. + * Flush all dirty buffers associated with a disk device. */ + if (!vn_isdisk(vp)) + return (0); loop: s = splbio(); for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { @@ -669,6 +435,10 @@ struct cdevsw *bsw; struct buf *bp; + /* + * XXX: This assumes that strategy does the deed right away. + * XXX: this may not be TRTTD. + */ bsw = devsw(ap->a_vp->v_rdev); if ((bsw->d_flags & D_CANFREE) == 0) return (0); @@ -706,7 +476,7 @@ *ap->a_vpp = vp; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; - if (vp->v_type == VBLK && vp->v_mount != NULL) + if (vp->v_mount != NULL) runp = runb = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize; if (ap->a_runp != NULL) *ap->a_runp = runp; @@ -731,46 +501,20 @@ struct vnode *vp = ap->a_vp; struct proc *p = ap->a_p; dev_t dev = vp->v_rdev; - int mode, error; - switch (vp->v_type) { - - case VCHR: - /* - * Hack: a tty device that is a controlling terminal - * has a reference from the session structure. - * We cannot easily tell that a character device is - * a controlling terminal, unless it is the closing - * process' controlling terminal. In that case, - * if the reference count is 2 (this last descriptor - * plus the session), release the reference from the session. - */ - if (vcount(vp) == 2 && p && (vp->v_flag & VXLOCK) == 0 && - vp == p->p_session->s_ttyvp) { - vrele(vp); - p->p_session->s_ttyvp = NULL; - } - mode = S_IFCHR; - break; - - case VBLK: - if (bdev_buffered) { - /* - * On last close of a block device (that isn't mounted) - * we must invalidate any in core blocks, so that - * we can, for instance, change floppy disks. - */ - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - error = vinvalbuf(vp, V_SAVE, ap->a_cred, p, 0, 0); - VOP_UNLOCK(vp, 0, p); - if (error) - return (error); - } - mode = S_IFBLK; - break; - - default: - panic("spec_close: not special"); + /* + * Hack: a tty device that is a controlling terminal + * has a reference from the session structure. + * We cannot easily tell that a character device is + * a controlling terminal, unless it is the closing + * process' controlling terminal. In that case, + * if the reference count is 2 (this last descriptor + * plus the session), release the reference from the session. + */ + if (vcount(vp) == 2 && p && (vp->v_flag & VXLOCK) == 0 && + vp == p->p_session->s_ttyvp) { + vrele(vp); + p->p_session->s_ttyvp = NULL; } /* * We do not want to really close the device if it @@ -788,7 +532,7 @@ } else if (vcount(vp) > 1) { return (0); } - return (devsw(dev)->d_close(dev, ap->a_fflag, mode, p)); + return (devsw(dev)->d_close(dev, ap->a_fflag, S_IFCHR, p)); } /* @@ -882,7 +626,7 @@ * block device is mounted. However, we can use v_rdev. */ - if (vp->v_type == VBLK) + if (vn_isdisk(vp)) blksiz = vp->v_rdev->si_bsize_phys; else blksiz = DEV_BSIZE; Index: msdosfs/msdosfs_vfsops.c =================================================================== RCS file: /home/ncvs/src/sys/msdosfs/msdosfs_vfsops.c,v retrieving revision 1.53 diff -u -r1.53 msdosfs_vfsops.c --- msdosfs_vfsops.c 1999/09/20 23:27:57 1.53 +++ msdosfs_vfsops.c 1999/10/31 15:14:10 @@ -383,6 +383,7 @@ ronly = (mp->mnt_flag & MNT_RDONLY) != 0; error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); + VOP_UNLOCK(devvp, 0, p); if (error) return (error); Index: ntfs/ntfs_vfsops.c =================================================================== RCS file: /home/ncvs/src/sys/ntfs/ntfs_vfsops.c,v retrieving revision 1.15 diff -u -r1.15 ntfs_vfsops.c --- ntfs_vfsops.c 1999/10/29 18:09:12 1.15 +++ ntfs_vfsops.c 1999/10/31 15:13:47 @@ -423,6 +423,7 @@ ronly = (mp->mnt_flag & MNT_RDONLY) != 0; error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); + VOP_UNLOCK(devvp, 0, p); if (error) return (error); Index: ufs/ffs/ffs_vfsops.c =================================================================== RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_vfsops.c,v retrieving revision 1.110 diff -u -r1.110 ffs_vfsops.c --- ffs_vfsops.c 1999/11/03 12:05:37 1.110 +++ ffs_vfsops.c 1999/11/03 13:10:45 @@ -602,6 +602,7 @@ ronly = (mp->mnt_flag & MNT_RDONLY) != 0; error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); + VOP_UNLOCK(devvp, 0, p); if (error) return (error); if (devvp->v_rdev->si_iosize_max > mp->mnt_iosize_max) -- Poul-Henning Kamp FreeBSD coreteam member phk@FreeBSD.ORG "Real hackers run -current on their laptop." FreeBSD -- It will take a long time before progress goes too far! To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-current" in the body of the message