From: Warner Losh <imp@FreeBSD.org>
Date: Tue, 11 Jan 2011 20:52:34 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject: svn commit: r217284 - projects/graid/head/sys/geom/raid
Message-Id: <201101112052.p0BKqYig035221@svn.freebsd.org>

Author: imp
Date: Tue Jan 11 20:52:34 2011
New Revision: 217284
URL: http://svn.freebsd.org/changeset/base/217284

Log:
  Implement range locking. When a range of a volume is locked, any writes
  to that range are deferred. Range locking either succeeds right away or
  is deferred until all in-flight writes to the range complete. Once a
  range is locked, write requests to it are queued and resumed when the
  range is unlocked.

  This is intended to be used while rebuilding a drive, or when doing
  bad-sector recovery by rewriting disk sectors that fail to read
  properly. Writes to areas of the volume that aren't locked are
  unaffected.

  Users of this facility should lock small ranges for short periods to
  keep too many requests from queueing up (possibly forcing a resource
  shortage that could prevent the range from ever being unlocked). I'm
  not sure how to trigger that problem, nor what remediation would
  prevent or reduce it.

  All the hooks are in place. Nothing that I'm checking in uses this
  facility yet, and in testing so far its presence, unused, hasn't
  affected anything.

  Also, create a callback for the RAID1 transform so I can start using
  it for bad-sector recovery and then for rebuilds.
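Nothing in this change exercises the new facility yet, so the sketch below is only a hypothetical illustration of the intended call pattern for a transform doing bad-sector recovery; it would sit next to the existing code in sys/geom/raid/tr_raid1.c. The helper names raid1_recover_start() and raid1_recover_locked() are invented for the example, and tr->tro_volume is assumed to be the transform object's back-pointer to its volume (it is not part of this diff). Only g_raid_lock_range(), g_raid_unlock_range(), and the locked() method come from this commit.

/*
 * Hypothetical sketch, not part of the commit: lock the failing range,
 * repair it once the lock is granted, then unlock.
 */
static int
raid1_recover_locked(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp = argp;				/* the read that failed */
	struct g_raid_volume *vol = tr->tro_volume;	/* assumed field */

	/*
	 * The range is now locked: overlapping in-flight writes have
	 * completed and new writes to it are parked on vol->v_locked,
	 * so the bad sectors can be rewritten from a good mirror copy
	 * here (repair I/O elided in this sketch).
	 */

	/* Unlocking re-sorts the queue so parked writes are re-dispatched. */
	g_raid_unlock_range(vol, bp->bio_offset, bp->bio_length);
	return (0);
}

static void
raid1_recover_start(struct g_raid_tr_object *tr, struct bio *bp)
{
	int error;

	/*
	 * Lock only the failing request's range, for as short a time as
	 * possible, so writes don't pile up on the locked queue.
	 */
	error = g_raid_lock_range(tr->tro_volume, bp->bio_offset,
	    bp->bio_length, bp);
	if (error == 0)
		raid1_recover_locked(tr, bp);	/* nothing in flight; go now */
	/*
	 * On EBUSY the lock is pending: g_raid_iodone() will fire the
	 * transform's locked() method with the argument passed above once
	 * the last overlapping write completes.
	 */
}

Because raid1_recover_locked() matches the locked() method signature (struct g_raid_tr_object *, void *), a transform could wire such a handler into its kobj method table in place of the stub g_raid_tr_locked_raid1() added by this commit.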
Modified:
  projects/graid/head/sys/geom/raid/g_raid.c
  projects/graid/head/sys/geom/raid/g_raid.h
  projects/graid/head/sys/geom/raid/g_raid_tr_if.m
  projects/graid/head/sys/geom/raid/tr_raid1.c

Modified: projects/graid/head/sys/geom/raid/g_raid.c
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.c	Tue Jan 11 20:08:34 2011	(r217283)
+++ projects/graid/head/sys/geom/raid/g_raid.c	Tue Jan 11 20:52:34 2011	(r217284)
@@ -103,7 +103,6 @@ struct g_class g_raid_class = {
 	.fini = g_raid_fini
 };
 
-
 static void g_raid_destroy_provider(struct g_raid_volume *vol);
 static int g_raid_update_disk(struct g_raid_disk *disk, u_int state);
 static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int state);
@@ -737,6 +736,46 @@ g_raid_start(struct bio *bp)
 	wakeup(sc);
 }
 
+static int
+g_raid_bio_overlaps(const struct bio *bp, off_t off, off_t len)
+{
+	/*
+	 * 5 cases:
+	 * (1) bp entirely below			NO
+	 * (2) bp entirely above			NO
+	 * (3) bp start below, but end in range		YES
+	 * (4) bp entirely within			YES
+	 * (5) bp starts within, ends above		YES
+	 *
+	 * lock range 10-19 (offset 10 length 10)
+	 * (1) 1-5: first if kicks it out
+	 * (2) 30-35: second if kicks it out
+	 * (3) 5-15: passes both ifs
+	 * (4) 12-14: passes both ifs
+	 * (5) 19-20: passes both
+	 */
+
+	if (bp->bio_offset + bp->bio_length - 1 < off)
+		return (0);
+	if (bp->bio_offset < off + len - 1)
+		return (0);
+	return (1);
+}
+
+static int
+g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
+{
+	struct g_raid_lock *lp;
+
+	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
+
+	LIST_FOREACH(lp, &vol->v_locks, l_next) {
+		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
+			return (1);
+	}
+	return (0);
+}
+
 static void
 g_raid_start_request(struct bio *bp)
 {
@@ -744,6 +783,7 @@ g_raid_start_request(struct bio *bp)
 	struct g_raid_volume *vol;
 
 	sc = bp->bio_to->geom->softc;
+	sx_assert(&sc->sc_lock, SX_LOCKED);
 	vol = bp->bio_to->private;
 	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
 		if (vol->v_idle)
@@ -752,6 +792,16 @@ g_raid_start_request(struct bio *bp)
 		vol->v_last_write = time_uptime;
 	}
 	/*
+	 * Check to see if this item is in a locked range. If so,
+	 * queue it to our locked queue and return. We'll requeue
+	 * it when the range is unlocked.
+	 */
+	if (g_raid_is_in_locked_range(vol, bp)) {
+		bioq_insert_tail(&vol->v_locked, bp);
+		return;
+	}
+
+	/*
 	 * Put request onto inflight queue, so we can check if new
 	 * synchronization requests don't collide with it.
 	 */
@@ -764,14 +814,100 @@ g_raid_iodone(struct bio *bp, int error)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_volume *vol;
+	struct g_raid_lock *lp;
 
 	sc = bp->bio_to->geom->softc;
+	sx_assert(&sc->sc_lock, SX_LOCKED);
+
 	vol = bp->bio_to->private;
 	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
 	bioq_remove(&vol->v_inflight, bp);
+	if (bp->bio_cmd == BIO_WRITE && vol->v_pending_lock &&
+	    g_raid_is_in_locked_range(vol, bp)) {
+		/*
+		 * XXX this structure forces serialization of all
+		 * XXX pending requests before any are allowed through.
+		 */
+		G_RAID_LOGREQ(3, bp,
+		    "Write to locking zone complete: %d writes outstanding",
+		    vol->v_pending_lock);
+		if (--vol->v_pending_lock == 0) {
+			G_RAID_LOGREQ(3, bp,
+			    "Last write done, calling pending callbacks.");
+			LIST_FOREACH(lp, &vol->v_locks,l_next) {
+				if (lp->l_flags & G_RAID_LOCK_PENDING) {
+					G_RAID_TR_LOCKED(vol->v_tr,
+					    lp->l_callback_arg);
+					lp->l_flags &= ~G_RAID_LOCK_PENDING;
+				}
+			}
+		}
+	}
 	g_io_deliver(bp, error);
 }
 
+int
+g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, void *argp)
+{
+	struct g_raid_softc *sc;
+	struct g_raid_lock *lp;
+	struct bio *bp;
+	int pending;
+
+	sc = vol->v_softc;
+	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
+	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
+	lp->l_flags |= G_RAID_LOCK_PENDING;
+	lp->l_offset = off;
+	lp->l_length = len;
+	lp->l_callback_arg = argp;
+
+	/* XXX lock in-flight queue? -- not done elsewhere, but should it be? */
+	pending = 0;
+	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
+		if (g_raid_bio_overlaps(bp, off, len))
+			pending++;
+	}
+	/*
+	 * If there are any writes that are pending, we return EBUSY. All
+	 * callers will have to wait until all pending writes clear.
+	 */
+	if (pending > 0) {
+		vol->v_pending_lock += pending;
+		return (EBUSY);
+	}
+	lp->l_flags &= ~G_RAID_LOCK_PENDING;
+	return (0);
+}
+
+int
+g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
+{
+	struct g_raid_lock *lp, *tmp;
+	struct g_raid_softc *sc;
+	struct bio *bp;
+
+	sc = vol->v_softc;
+	LIST_FOREACH_SAFE(lp, &vol->v_locks, l_next, tmp) {
+		if (lp->l_offset == off && lp->l_length == len) {
+			LIST_REMOVE(lp, l_next);
+			/* XXX
+			 * Right now we just put them all back on the queue
+			 * and hope for the best. We hope this because any
+			 * locked ranges will go right back on this list
+			 * when the worker thread runs.
+			 */
+			mtx_lock(&sc->sc_queue_mtx);
+			while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
+				bioq_disksort(&sc->sc_queue, bp);
+			mtx_unlock(&sc->sc_queue_mtx);
+			free(lp, M_RAID);
+			return (0);
+		}
+	}
+	return (EINVAL);
+}
+
 void
 g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
 {

Modified: projects/graid/head/sys/geom/raid/g_raid.h
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.h	Tue Jan 11 20:08:34 2011	(r217283)
+++ projects/graid/head/sys/geom/raid/g_raid.h	Tue Jan 11 20:52:34 2011	(r217284)
@@ -86,6 +86,15 @@ extern u_int g_raid_start_timeout;
 #define	G_RAID_BIO_FLAG_REGULAR	0x01
 #define	G_RAID_BIO_FLAG_SYNC	0x02
 
+#define	G_RAID_LOCK_PENDING	0x1
+struct g_raid_lock {
+	off_t			 l_offset;
+	off_t			 l_length;
+	void			*l_callback_arg;
+	int			 l_flags;
+	LIST_ENTRY(g_raid_lock)	 l_next;
+};
+
 #define	G_RAID_EVENT_WAIT	0x01
 #define	G_RAID_EVENT_VOLUME	0x02
 #define	G_RAID_EVENT_SUBDISK	0x04
@@ -196,7 +205,8 @@ struct g_raid_volume {
 	off_t			 v_mediasize;	/* Volume media size. */
 	struct bio_queue_head	 v_inflight;	/* In-flight write requests. */
 	struct bio_queue_head	 v_locked;	/* Blocked I/O requests. */
-	LIST_HEAD(, g_raid_lock) v_locks;	/* List of locked regions. */
+	LIST_HEAD(, g_raid_lock) v_locks;	 /* List of locked regions. */
+	int			 v_pending_lock; /* writes to locked region */
 	int			 v_idle;	/* DIRTY flags removed. */
 	time_t			 v_last_write;	/* Time of the last write. */
 	u_int			 v_writes;	/* Number of active writes. */
@@ -311,6 +321,8 @@ u_int g_raid_nsubdisks(struct g_raid_vol
 #define	G_RAID_DESTROY_HARD	2
 int g_raid_destroy(struct g_raid_softc *sc, int how);
 int g_raid_event_send(void *arg, int event, int flags);
+int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, void *argp);
+int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len);
 g_ctl_req_t g_raid_ctl;
 
 #endif	/* _KERNEL */

Modified: projects/graid/head/sys/geom/raid/g_raid_tr_if.m
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid_tr_if.m	Tue Jan 11 20:08:34 2011	(r217283)
+++ projects/graid/head/sys/geom/raid/g_raid_tr_if.m	Tue Jan 11 20:52:34 2011	(r217284)
@@ -43,7 +43,7 @@ INTERFACE g_raid_tr;
 # Default implementations of methods.
 CODE {
 	static int
-	g_raid_tr_locked_default(struct g_raid_tr_object *tr)
+	g_raid_tr_locked_default(struct g_raid_tr_object *tr, void *argp)
 	{
 
 		return (0);
@@ -94,6 +94,7 @@ METHOD void iodone {
 # locked() - callback method for lock().
 METHOD int locked {
 	struct g_raid_tr_object *tr;
+	void *argp;
 } DEFAULT g_raid_tr_locked_default;
 
 # free() - destructor.

Modified: projects/graid/head/sys/geom/raid/tr_raid1.c
==============================================================================
--- projects/graid/head/sys/geom/raid/tr_raid1.c	Tue Jan 11 20:08:34 2011	(r217283)
+++ projects/graid/head/sys/geom/raid/tr_raid1.c	Tue Jan 11 20:52:34 2011	(r217284)
@@ -54,6 +54,7 @@ static g_raid_tr_start_t g_raid_tr_start
 static g_raid_tr_stop_t g_raid_tr_stop_raid1;
 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1;
 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1;
+static g_raid_tr_locked_t g_raid_tr_locked_raid1;
 static g_raid_tr_free_t g_raid_tr_free_raid1;
 
 static kobj_method_t g_raid_tr_raid1_methods[] = {
@@ -63,6 +64,7 @@ static kobj_method_t g_raid_tr_raid1_met
 	KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1),
 	KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1),
 	KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1),
+	KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1),
 	KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1),
 	{ 0, 0 }
 };
@@ -347,6 +349,12 @@ g_raid_tr_iodone_raid1(struct g_raid_tr_
 }
 
 static int
+g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp)
+{
+	return (0);
+}
+
+static int
 g_raid_tr_free_raid1(struct g_raid_tr_object *tr)
 {
 	struct g_raid_tr_raid1_object *trs;