Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 27 Nov 2005 15:05:00 -0500
From:      Michael Butler <imb@protected-networks.net>
To:        stable@freebsd.org
Cc:        =?ISO-8859-1?Q?S=F8ren_Schmidt?= <sos@freebsd.org>
Subject:   ata (raid) patches
Message-ID:  <438A116C.3010803@protected-networks.net>

next in thread | raw e-mail | index | archive | help
This is a multi-part message in MIME format.
--------------030501090602010103040600
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

For those game enough to try the results of my handiwork ;-), enclosed
is a patch against the files in /usr/src/sys/dev/ata for RELENG_6 (and
possibly others) with the following objectives:

1) the ata-raid driver currently leaks ata_composite and ata_request
structures into "neverland" in a mirrored configuration. This can be
observed using "sysctl -a | grep ^ata_" and noting the increasing
"in-use" count as time goes on. Eventually, this causes the kernel to
run out of memory. This is fixed by tracking the request counts on each
composite request.

2) another part of this patch is to ata-queue where a channel lock is
asserted in a (hopefully rare) rebuild process even if the dependencies
flag is set (we're waiting for a read). Moving the test for a dependency
outside of the lock saves waiting on it when nothing can be done. A
small nit with near negligible impact but, when you're waiting for a
rebuild ...

3) another part of this patch is to ata-raid where the choice of drive
from which to read favours one side of a mirror even when both drives
are near the block(s) we want. Because the mirror is on another channel
on the Highpoint controller, it performs (marginally :-() better when we
toggle between them.

As usual, this patch comes with no warranty ... it works for me. "If it
breaks your system, you own all the pieces".

I recommend you back up your system before testing,

	Michael

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.0 (MingW32)

iD8DBQFDihFsiJykeV6HPMURAlHQAJ9UO/vD8rp/V3+zh89qBBOPGQ+cugCg5qGp
9N8FaffInZMuOIMSGICV70c=
=NZTT
-----END PGP SIGNATURE-----

--------------030501090602010103040600
Content-Type: text/plain;
 name="ata-raid-patch.txt"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="ata-raid-patch.txt"

*** /usr/src/sys/dev/ata/ata-all.h.orig	Sun Nov 27 14:17:57 2005
--- /usr/src/sys/dev/ata/ata-all.h	Sun Nov 27 14:22:05 2005
***************
*** 331,336 ****
--- 331,337 ----
      u_int32_t           wr_depend;              /* write depends on subdisks */
      u_int32_t           wr_done;                /* done write subdisks */
      struct ata_request  *request[32];           /* size must match maps above */
+     long		count;			/* count required of this composite */
      caddr_t             data_1;     
      caddr_t             data_2;     
  };
*** /usr/src/sys/dev/ata/ata-queue.c.orig	Sun Nov 27 14:17:57 2005
--- /usr/src/sys/dev/ata/ata-queue.c	Sun Nov 27 14:40:43 2005
***************
*** 182,209 ****
  	    }
  
  	    /* check we are in the right state and has no dependencies */
! 	    mtx_lock(&ch->state_mtx);
! 	    if (ch->state == ATA_IDLE && !dependencies) {
! 		ATA_DEBUG_RQ(request, "starting");
! 		TAILQ_REMOVE(&ch->ata_queue, request, chain);
! 		ch->running = request;
! 		ch->state = ATA_ACTIVE;
! 
! 		/* if we are the freezing point release it */
! 		if (ch->freezepoint == request)
! 		    ch->freezepoint = NULL;
! 
! 		if (ch->hw.begin_transaction(request) == ATA_OP_FINISHED) {
! 		    ch->running = NULL;
! 		    ch->state = ATA_IDLE;
! 		    mtx_unlock(&ch->state_mtx);
! 		    mtx_unlock(&ch->queue_mtx);
! 		    ATA_LOCKING(dev, ATA_LF_UNLOCK);
! 		    ata_finish(request);
! 		    return;
  		}
  	    }
- 	    mtx_unlock(&ch->state_mtx);
  	}
      }
      mtx_unlock(&ch->queue_mtx);
--- 182,211 ----
  	    }
  
  	    /* check we are in the right state and has no dependencies */
! 	    if (!dependencies) {
! 		mtx_lock(&ch->state_mtx);
! 		if (ch->state == ATA_IDLE) {
! 		   ATA_DEBUG_RQ(request, "starting");
! 		   TAILQ_REMOVE(&ch->ata_queue, request, chain);
! 		   ch->running = request;
! 		   ch->state = ATA_ACTIVE;
! 
! 		   /* if we are the freezing point release it */
! 		   if (ch->freezepoint == request)
! 			ch->freezepoint = NULL;
! 
! 		   if (ch->hw.begin_transaction(request) == ATA_OP_FINISHED) {
! 			ch->running = NULL;
! 			ch->state = ATA_IDLE;
! 			mtx_unlock(&ch->state_mtx);
! 			mtx_unlock(&ch->queue_mtx);
! 			ATA_LOCKING(dev, ATA_LF_UNLOCK);
! 			ata_finish(request);
! 			return;
! 		   }
  		}
+ 		mtx_unlock(&ch->state_mtx);
  	    }
  	}
      }
      mtx_unlock(&ch->queue_mtx);
***************
*** 426,432 ****
  	    composite->wr_done |= (1 << request->this);
  
  	if (composite->wr_depend &&
! 	    (composite->rd_done & composite->wr_depend)==composite->wr_depend &&
  	    (composite->wr_needed & (~composite->wr_done))) {
  	    index = ((composite->wr_needed & (~composite->wr_done))) - 1;
  	}
--- 428,434 ----
  	    composite->wr_done |= (1 << request->this);
  
  	if (composite->wr_depend &&
! 	    ((composite->rd_done & composite->wr_depend) == composite->wr_depend) &&
  	    (composite->wr_needed & (~composite->wr_done))) {
  	    index = ((composite->wr_needed & (~composite->wr_done))) - 1;
  	}
*** /usr/src/sys/dev/ata/ata-raid.c.orig	Sun Nov 27 14:17:57 2005
--- /usr/src/sys/dev/ata/ata-raid.c	Sun Nov 27 14:31:13 2005
***************
*** 44,49 ****
--- 44,50 ----
  #include <sys/cons.h>
  #include <sys/sema.h>
  #include <sys/taskqueue.h>
+ #include <sys/libkern.h>
  #include <vm/uma.h>
  #include <machine/bus.h>
  #include <sys/rman.h>
***************
*** 346,380 ****
  	    if (rdp->status & AR_S_REBUILDING)
  		blk = ((lba / rdp->interleave) * rdp->width) * rdp->interleave +
  		      (rdp->interleave * (drv % rdp->width)) +
! 		      lba % rdp->interleave;;
  
  	    if (bp->bio_cmd == BIO_READ) {
! 		int src_online =
! 		    (rdp->disks[drv].flags & AR_DF_ONLINE);
! 		int mir_online =
! 		    (rdp->disks[drv+rdp->width].flags & AR_DF_ONLINE);
! 
! 		/* if mirror gone or close to last access on source */
! 		if (!mir_online || 
! 		    ((src_online) &&
! 		     bp->bio_pblkno >=
! 			(rdp->disks[drv].last_lba - AR_PROXIMITY) &&
! 		     bp->bio_pblkno <=
! 			(rdp->disks[drv].last_lba + AR_PROXIMITY))) {
  		    rdp->toggle = 0;
  		} 
! 		/* if source gone or close to last access on mirror */
! 		else if (!src_online ||
! 			 ((mir_online) &&
! 			  bp->bio_pblkno >=
! 			  (rdp->disks[drv+rdp->width].last_lba-AR_PROXIMITY) &&
! 			  bp->bio_pblkno <=
! 			  (rdp->disks[drv+rdp->width].last_lba+AR_PROXIMITY))) {
  		    drv += rdp->width;
  		    rdp->toggle = 1;
  		}
! 		/* not close to any previous access, toggle */
! 		else {
  		    if (rdp->toggle)
  			rdp->toggle = 0;
  		    else {
--- 347,386 ----
  	    if (rdp->status & AR_S_REBUILDING)
  		blk = ((lba / rdp->interleave) * rdp->width) * rdp->interleave +
  		      (rdp->interleave * (drv % rdp->width)) +
! 		      lba % rdp->interleave;
  
  	    if (bp->bio_cmd == BIO_READ) {
!                 int decided = 0;
!                 
! 		/* if mirror gone, use drv */
! 		if ((rdp->disks[drv+rdp->width].flags & AR_DF_ONLINE) == 0) {
  		    rdp->toggle = 0;
+ 		    decided = 1;
+                 } else
+                 /* if src is gone, use mirror */
+                 if ((rdp->disks[drv].flags & AR_DF_ONLINE) == 0) {
+ 		    drv += rdp->width;
+ 		    rdp->toggle = 1;
+                     decided = 1;
  		} 
!                 
!                 /* both appear to be online, find out if we're already close */
!                 if (!decided) {
!                     /* if drv is near and mirror is not, use drv */
!                     if (abs(bp->bio_pblkno - rdp->disks[drv].last_lba) <= AR_PROXIMITY &&
!                         abs(bp->bio_pblkno - rdp->disks[drv + rdp->width].last_lba) > AR_PROXIMITY) {
!                         rdp->toggle = 0;
!                         decided = 1;
!                     /* if mirror is near and drv is not, use mirror */
! 		    } else if (abs(bp->bio_pblkno - rdp->disks[drv + rdp->width].last_lba) <= AR_PROXIMITY &&
!                         abs(bp->bio_pblkno - rdp->disks[drv].last_lba) > AR_PROXIMITY) {
  		    drv += rdp->width;
  		    rdp->toggle = 1;
+                         decided = 1;
  		}
! 		} 
!                 /* neither is near or both are, just toggle */
!                 if(!decided) {
  		    if (rdp->toggle)
  			rdp->toggle = 0;
  		    else {
***************
*** 410,415 ****
--- 416,422 ----
  				mtx_init(&composite->lock,
  					 "ATA PseudoRAID rebuild lock",
  					 NULL, MTX_DEF);
+ 				composite->count = request->bytecount;
  				composite->rd_needed |= (1 << drv);
  				composite->wr_depend |= (1 << drv);
  				composite->wr_needed |= (1 << this);
***************
*** 468,473 ****
--- 475,481 ----
  				mtx_init(&composite->lock,
  					 "ATA PseudoRAID mirror lock",
  					 NULL, MTX_DEF);
+ 				composite->count = request->bytecount;
  				composite->wr_needed |= (1 << drv);
  				composite->wr_needed |= (1 << this);
  				composite->request[drv] = request;
***************
*** 607,613 ****
  			/* good data, update how far we've gotten */
  			else {
  			    bp->bio_resid -= request->donecount;
! 			    if (bp->bio_resid == 0) {
  				if (composite->wr_done & (1 << mirror))
  				    finished = 1;
  			    }
--- 615,622 ----
  			/* good data, update how far we've gotten */
  			else {
  			    bp->bio_resid -= request->donecount;
! 			    composite->count -= request->donecount;
! 			    if (composite->count == 0) {
  				if (composite->wr_done & (1 << mirror))
  				    finished = 1;
  			    }
***************
*** 621,627 ****
  				printf("DOH! rebuild failed\n"); /* XXX SOS */
  				rdp->rebuild_lba = blk;
  			    }
! 			    if (bp->bio_resid == 0)
  				finished = 1;
  			}
  		    }
--- 630,636 ----
  				printf("DOH! rebuild failed\n"); /* XXX SOS */
  				rdp->rebuild_lba = blk;
  			    }
! 			    if (composite->count == 0)
  				finished = 1;
  			}
  		    }
***************
*** 658,667 ****
  			    }
  			    bp->bio_resid -=
  				composite->request[mirror]->donecount;
  			}
! 			else
  			    bp->bio_resid -= request->donecount;
! 			if (bp->bio_resid == 0)
  			    finished = 1;
  		    }
  		    mtx_unlock(&composite->lock);
--- 667,680 ----
  			    }
  			    bp->bio_resid -=
  				composite->request[mirror]->donecount;
+ 			    composite->count -=
+ 				composite->request[mirror]->donecount;
  			}
! 			else {
  			    bp->bio_resid -= request->donecount;
! 			    composite->count -= request->donecount;
! 			}
! 			if (composite->count == 0)
  			    finished = 1;
  		    }
  		    mtx_unlock(&composite->lock);
***************
*** 723,729 ****
  	    rdp->status &= ~AR_S_REBUILDING;
  	    ata_raid_config_changed(rdp, 1);
  	}
! 	biodone(bp);
      }
  		 
      if (composite) {
--- 736,744 ----
  	    rdp->status &= ~AR_S_REBUILDING;
  	    ata_raid_config_changed(rdp, 1);
  	}
! 	/* cover the case of a series of composites which are only partly complete */
! 	if (bp->bio_resid == 0)
! 	    biodone(bp);
      }
  		 
      if (composite) {

--------------030501090602010103040600--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?438A116C.3010803>