Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 16 Dec 2003 18:20:20 -0800 (PST)
From:      Bruce Evans <bde@zeta.org.au>
To:        freebsd-bugs@FreeBSD.org
Subject:   Re: kern/60313: data destruction: lseek() misalignment silently ignored by some block devices
Message-ID:  <200312170220.hBH2KKIa025034@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help
The following reply was made to PR kern/60313; it has been noted by GNATS.

From: Bruce Evans <bde@zeta.org.au>
To: Matthias Andree <matthias.andree@gmx.de>
Cc: FreeBSD-gnats-submit@freebsd.org, freebsd-bugs@freebsd.org
Subject: Re: kern/60313: data destruction: lseek() misalignment silently
 ignored by some block devices
Date: Wed, 17 Dec 2003 13:13:55 +1100 (EST)

 On Wed, 17 Dec 2003, Matthias Andree wrote:
 
 > >Description:
 > da(4) will happily return success on lseek() to some random position that is
 > not aligned at a block boundary and start writing from the beginning, byte 0,
 > of the block, ignoring the lseek().
 >
 > In other words, if I have a drive with 512 bytes per block, seek to offset 256
 > and write 512 bytes, da(4) will scribble over the first 256 bytes as well.
 > Either the seek or the write should be rejected.
 
 The boundary is checked in dscheck() in RELENG_4, but only after
 forgetting it mod DEV_BSIZE, so only offsets that are multiples of
 DEV_BSIZE but not a multiple of the sector size are detected.
 
 The boundary is forgotten here in physio() (RELENG_4 version):
 
 % 			blockno = bp->b_offset >> DEV_BSHIFT;
 % 			if (chk_blockno && (daddr_t)blockno != blockno) {
 % 				error = EINVAL; /* blockno overflow */
 % 				goto doerror;
 % 			}
 % 			bp->b_blkno = blockno;
 
 except bp->b_offset is still available so lower layers can check it.  None
 do.  (The acd driver doesn't use dscheck() and uses b_offset to determine
 its own idea of the block number; it has a subset of dscheck()'s other
 boundary checks and has the same nonexistent boundary checking as physio()
 for the offset.)
 
 > vn(4) will accept the seek but a subsequent write(2) will return the bogus
 > EINVAL in errno.
 
 I don't see how vn(4) can be missing the bug.  In the "labels" case it
 uses dscheck() which doesn't check b_offset, and in the non-labels case
 it uses its own check which also doesn't check b_offset.
 
 > ad(4) status is unknown.
 
 I don't see how it can be missing the bug.
 
 > FreeBSD 5 status is unknown.
 
 -current is quite different.  It deprecates b_blkno and uses b_offset
 more (mostly in its bio_offset form), and uses GEOM (except in my
 version).  physio() still sets b_blkno without checking the boundary,
 but b_blkno is mostly not used.
 
 The problem seems to be fixed in GEOM (function g_io_check()).
 
 I fixed it my version of dscheck() but didn't get around
 to committing the fix before dscheck() was axed.  Here are my current
 diffs relative to RELENG_4.  They don't apply directly.  RELENG_4 needs
 mainly the "if (offset % (uoff_t)DEV_BSIZE) {" clause.  The patch also
 fixes some slightly incorrect choices of types that become fatally
 incorrect with larger daddr_t's in -current, and fixes the longstanding
 brokenness of EOF handling.
 
 %%%
 Index: subr_diskslice.c
 ===================================================================
 RCS file: /home/ncvs/src/sys/kern/Attic/subr_diskslice.c,v
 retrieving revision 1.82.2.6
 diff -u -2 -r1.82.2.6 subr_diskslice.c
 --- subr_diskslice.c	24 Jul 2001 09:49:41 -0000	1.82.2.6
 +++ subr_diskslice.c	7 Dec 2003 05:05:22 -0000
 @@ -134,47 +284,55 @@
  int
  dscheck(bp, ssp)
 -	struct buf *bp;
 +	struct bio *bp;
  	struct diskslices *ssp;
  {
 -	daddr_t	blkno;
 -	u_long	endsecno;
 -	daddr_t	labelsect;
 +	daddr_t blkno;
 +	u_long endsecno;
 +	daddr_t labelsect;
  	struct disklabel *lp;
  	char *msg;
 -	long	nsec;
 +	long nsec;
 +	off_t offset;
  	struct partition *pp;
 -	daddr_t	secno;
 -	daddr_t	slicerel_secno;
 +	daddr_t secno;
 +	daddr_t slicerel_secno;
  	struct diskslice *sp;
 -	int s;
 
 -	blkno = bp->b_blkno;
 -	if (blkno < 0) {
 -		printf("dscheck(%s): negative b_blkno %ld\n",
 -		    devtoname(bp->b_dev), (long)blkno);
 -		bp->b_error = EINVAL;
 +	offset = bp->bio_offset;
 +	if (offset < 0) {
 +		printf("dscheck(%s): negative bio_offset %jd\n",
 +		    devtoname(bp->bio_dev), (intmax_t)offset);
 +		bp->bio_error = EINVAL;
 +		goto bad;
 +	}
 +	if (offset % (uoff_t)DEV_BSIZE) {
 +		printf(
 +		"dscheck(%s): bio_offset %jd is not on a DEV_BSIZE boundary\n",
 +		    devtoname(bp->bio_dev), (intmax_t)offset);
 +		bp->bio_error = EINVAL;
  		goto bad;
  	}
 -	sp = &ssp->dss_slices[dkslice(bp->b_dev)];
 +	blkno = btodb(bp->bio_offset);
 +	sp = &ssp->dss_slices[dkslice(bp->bio_dev)];
  	lp = sp->ds_label;
  	if (ssp->dss_secmult == 1) {
 -		if (bp->b_bcount % (u_long)DEV_BSIZE)
 +		if (bp->bio_bcount % (u_long)DEV_BSIZE)
  			goto bad_bcount;
  		secno = blkno;
 -		nsec = bp->b_bcount >> DEV_BSHIFT;
 +		nsec = bp->bio_bcount >> DEV_BSHIFT;
  	} else if (ssp->dss_secshift != -1) {
 -		if (bp->b_bcount & (ssp->dss_secsize - 1))
 +		if (bp->bio_bcount & (ssp->dss_secsize - 1))
  			goto bad_bcount;
  		if (blkno & (ssp->dss_secmult - 1))
 -			goto bad_blkno;
 +			goto bad_offset;
  		secno = blkno >> ssp->dss_secshift;
 -		nsec = bp->b_bcount >> (DEV_BSHIFT + ssp->dss_secshift);
 +		nsec = bp->bio_bcount >> (DEV_BSHIFT + ssp->dss_secshift);
  	} else {
 -		if (bp->b_bcount % ssp->dss_secsize)
 +		if (bp->bio_bcount % ssp->dss_secsize)
  			goto bad_bcount;
  		if (blkno % ssp->dss_secmult)
 -			goto bad_blkno;
 +			goto bad_offset;
  		secno = blkno / ssp->dss_secmult;
 -		nsec = bp->b_bcount / ssp->dss_secsize;
 +		nsec = bp->bio_bcount / ssp->dss_secsize;
  	}
  	if (lp == NULL) {
 @@ -184,6 +342,6 @@
  	} else {
  		labelsect = lp->d_partitions[LABEL_PART].p_offset;
 -if (labelsect != 0) Debugger("labelsect != 0 in dscheck()");
 -		pp = &lp->d_partitions[dkpart(bp->b_dev)];
 +		KASSERT(labelsect == 0, ("labelsect != 0 in dscheck()"));
 +		pp = &lp->d_partitions[dkpart(bp->bio_dev)];
  		endsecno = pp->p_size;
  		slicerel_secno = pp->p_offset + secno;
 @@ -196,6 +354,6 @@
  	    slicerel_secno + nsec > LABELSECTOR + labelsect &&
  #endif
 -	    (bp->b_flags & B_READ) == 0 && sp->ds_wlabel == 0) {
 -		bp->b_error = EROFS;
 +	    bp->bio_cmd == BIO_WRITE && sp->ds_wlabel == 0) {
 +		bp->bio_error = EROFS;
  		goto bad;
  	}
 @@ -203,7 +361,7 @@
  #if defined(DOSBBSECTOR) && defined(notyet)
  	/* overwriting master boot record? */
 -	if (slicerel_secno <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 &&
 +	if (slicerel_secno <= DOSBBSECTOR && bp->bio_cmd == BIO_WRITE &&
  	    sp->ds_wlabel == 0) {
 -		bp->b_error = EROFS;
 +		bp->bio_error = EROFS;
  		goto bad;
  	}
 @@ -211,20 +369,19 @@
 
  	/* beyond partition? */
 -	if (secno + nsec > endsecno) {
 -		/* if exactly at end of disk, return an EOF */
 -		if (secno == endsecno) {
 -			bp->b_resid = bp->b_bcount;
 -			return (0);
 -		}
 -		/* or truncate if part of it fits */
 -		nsec = endsecno - secno;
 -		if (nsec <= 0) {
 -			bp->b_error = EINVAL;
 +	if ((uintmax_t)secno + nsec > endsecno) {
 +		/* If at or beyond end of partition, return EOF. */
 +		if (secno >= endsecno) {
 +			if (bp->bio_cmd == BIO_READ) {
 +				bp->bio_resid = bp->bio_bcount;
 +				return (0);
 +			}
 +			bp->bio_error = ENOSPC;
  			goto bad;
  		}
 -		bp->b_bcount = nsec * ssp->dss_secsize;
 +		/* Truncate to the part that fits. */
 +		bp->bio_bcount = (endsecno - secno) * ssp->dss_secsize;
  	}
 
 -	bp->b_pblkno = sp->ds_offset + slicerel_secno;
 +	bp->bio_pblkno = sp->ds_offset + slicerel_secno;
 
  	/*
 @@ -237,18 +394,18 @@
  	    && slicerel_secno + nsec > LABELSECTOR + labelsect
  #endif
 +	    /* XXX following seems to be broken for dedicated case: */
  	    && sp->ds_offset != 0) {
  		struct iodone_chain *ic;
 
  		ic = malloc(sizeof *ic , M_DEVBUF, M_WAITOK);
 -		ic->ic_prev_flags = bp->b_flags;
 -		ic->ic_prev_iodone = bp->b_iodone;
 -		ic->ic_prev_iodone_chain = bp->b_iodone_chain;
 +		ic->ic_prev_flags = bp->bio_flags;
 +		ic->ic_prev_iodone = bp->bio_done;
 +		ic->ic_prev_iodone_chain = bp->bio_done_chain;
  		ic->ic_args[0].ia_long = (LABELSECTOR + labelsect -
  		    slicerel_secno) * ssp->dss_secsize;
  		ic->ic_args[1].ia_ptr = sp;
 -		bp->b_flags |= B_CALL;
 -		bp->b_iodone = dsiodone;
 -		bp->b_iodone_chain = ic;
 -		if (!(bp->b_flags & B_READ)) {
 +		bp->bio_done = dsiodone;
 +		bp->bio_done_chain = ic;
 +		if (bp->bio_cmd != BIO_READ) {
  			/*
  			 * XXX even disklabel(8) writes directly so we need
 @@ -260,18 +417,13 @@
  			 * temporarily corrupting the in-core copy.
  			 */
 -			if (bp->b_vp != NULL) {
 -				s = splbio();
 -				bp->b_vp->v_numoutput++;
 -				splx(s);
 -			}
  			/* XXX need name here. */
  			msg = fixlabel((char *)NULL, sp,
  				       (struct disklabel *)
 -				       (bp->b_data + ic->ic_args[0].ia_long),
 +				       (bp->bio_data + ic->ic_args[0].ia_long),
  				       TRUE);
  			if (msg != NULL) {
  				printf("dscheck(%s): %s\n",
 -				    devtoname(bp->b_dev), msg);
 -				bp->b_error = EROFS;
 +				    devtoname(bp->bio_dev), msg);
 +				bp->bio_error = EROFS;
  				goto bad;
  			}
 @@ -282,19 +434,19 @@
  bad_bcount:
  	printf(
 -	"dscheck(%s): b_bcount %ld is not on a sector boundary (ssize %d)\n",
 -	    devtoname(bp->b_dev), bp->b_bcount, ssp->dss_secsize);
 -	bp->b_error = EINVAL;
 +	"dscheck(%s): bio_bcount %ld is not on a sector boundary (ssize %d)\n",
 +	    devtoname(bp->bio_dev), bp->bio_bcount, ssp->dss_secsize);
 +	bp->bio_error = EINVAL;
  	goto bad;
 
 -bad_blkno:
 +bad_offset:
  	printf(
 -	"dscheck(%s): b_blkno %ld is not on a sector boundary (ssize %d)\n",
 -	    devtoname(bp->b_dev), (long)blkno, ssp->dss_secsize);
 -	bp->b_error = EINVAL;
 +	"dscheck(%s): bio_offset %jd is not on a sector boundary (ssize %d)\n",
 +	    devtoname(bp->bio_dev), (intmax_t)offset, ssp->dss_secsize);
 +	bp->bio_error = EINVAL;
  	goto bad;
 
  bad:
 -	bp->b_resid = bp->b_bcount;
 -	bp->b_flags |= B_ERROR;
 +	bp->bio_resid = bp->bio_bcount;
 +	bp->bio_flags |= BIO_ERROR;
  	return (-1);
  }
 %%%
 
 Bruce



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200312170220.hBH2KKIa025034>