From owner-p4-projects@FreeBSD.ORG Sun Aug 16 22:06:17 2009
Date: Sun, 16 Aug 2009 22:06:16 GMT
Message-Id: <200908162206.n7GM6GZ6035923@repoman.freebsd.org>
From: Fabio Checconi
To: Perforce Change Reviews
Subject: PERFORCE change 167421 for review

http://perforce.freebsd.org/chv.cgi?CH=167421

Change 167421 by fabio@fabio_granpasso on 2009/08/16 22:05:59

	Resync with the private repo: (hopefully) safer proxy
	insertion/deletion, a new proportional share scheduler,
	revised heuristics.

Affected files ...

.. //depot/projects/soc2009/fabio_gsched/geom_sched/geom_sched.c#2 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/g_sched.c#3 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_bfq.c#1 add
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_rr.c#3 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_scheduler.h#3 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/modules/geom/geom_sched/Makefile#2 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/modules/geom/geom_sched/gsched_bfq/Makefile#1 add

Differences ...

==== //depot/projects/soc2009/fabio_gsched/geom_sched/geom_sched.c#2 (text+ko) ====

@@ -69,7 +69,7 @@
 	else
 		reqalgo = algo;
 
-	snprintf(name, sizeof(name), "gsched_%s", algo);
+	snprintf(name, sizeof(name), "gsched_%s", reqalgo);
 	/*
 	 * Do not complain about errors here, gctl_issue()
 	 * will fail anyway.

==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/g_sched.c#3 (text+ko) ====

@@ -410,7 +410,8 @@
 }
 
 static void
-g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask)
+g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
+    struct g_gsched *gsp, void *data)
 {
 	struct g_sched_class *cp, *cp2;
 	int i;
@@ -418,6 +419,9 @@
 	if (!hp)
 		return;
 
+	if (data && gsp->gs_hash_unref)
+		gsp->gs_hash_unref(data);
+
 	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
 		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
 			g_sched_put_class(gp, cp->gsc_priv);
@@ -468,58 +472,70 @@
  *
  * Must be called with the gp mutex held and topology locked.
 */
-static void
+static int
 g_sched_wait_pending(struct g_geom *gp)
 {
 	struct g_sched_softc *sc = gp->softc;
+	int endticks = ticks + hz;
 
 	g_topology_assert();
 
-	while (sc->sc_pending)
+	while (sc->sc_pending && endticks - ticks >= 0)
 		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
+
+	return (sc->sc_pending ? ETIMEDOUT : 0);
 }
 
-static void
+static int
 g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
 {
 	struct g_sched_softc *sc = gp->softc;
+	int error;
 
 	/* Set the flushing flag: new bios will not enter the scheduler. */
 	sc->sc_flags |= G_SCHED_FLUSHING;
 
 	g_sched_forced_dispatch(gp);
-	g_sched_wait_pending(gp);
-
+	error = g_sched_wait_pending(gp);
+	if (error)
+		goto failed;
+
 	/* No more requests pending or in flight from the old gsp. */
-	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask);
+	g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
 	sc->sc_hash = NULL;
 
 	/*
-	 * XXX avoid deadlock here by releasing the gp mutex and
-	 * reacquiring it once done.
-	 * It should be safe, since no reconfiguration
-	 * or destruction can take place due to the geom topology
-	 * lock; no new request can use the current sc_data since
-	 * we flagged the geom as being flushed.
+	 * Avoid deadlock here by releasing the gp mutex and reacquiring
+	 * it once done.  It should be safe, since no reconfiguration or
+	 * destruction can take place due to the geom topology lock; no
+	 * new request can use the current sc_data since we flagged the
+	 * geom as being flushed.
	 */
 	g_sched_unlock(gp);
 	gsp->gs_fini(sc->sc_data);
 	g_sched_lock(gp);
 
 	sc->sc_gsched = NULL;
+	sc->sc_data = NULL;
+	g_gsched_unref(gsp);
+
+failed:
 	sc->sc_flags &= ~G_SCHED_FLUSHING;
-	g_gsched_unref(gsp);
+
+	return (error);
 }
 
-static void
+static int
 g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
 {
+	int error;
 
 	g_sched_lock(gp);
-	g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
+	error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
 	g_sched_unlock(gp);
+
+	return (error);
 }
 
 /*
@@ -607,6 +623,7 @@
 	struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
 	struct g_sched_softc *sc;
 	struct g_geom *gp, *gp_tmp;
+	int error;
 
 	parm->gup_error = 0;
 
@@ -622,8 +639,11 @@
 			continue;	/* Should not happen. */
 
 		sc = gp->softc;
-		if (sc->sc_gsched == gsp)
-			g_sched_remove(gp, gsp);
+		if (sc->sc_gsched == gsp) {
+			error = g_sched_remove(gp, gsp);
+			if (error)
+				goto failed;
+		}
 	}
 
 	LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
@@ -647,6 +667,7 @@
 			parm->gup_error = ENOENT;
 	}
 
+failed:
 	mtx_unlock(&me.gs_mtx);
 }
 
@@ -878,6 +899,65 @@
 	mtx_unlock(&me.gs_mtx);
 }
 
+static void
+g_sched_flush_pending(g_start_t *start)
+{
+	struct bio *bp;
+
+	while ((bp = gs_bioq_takefirst(&me.gs_pending)))
+		start(bp);
+}
+
+static int
+g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
+    struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
+{
+	struct g_sched_softc *sc = gp->softc;
+	g_start_t *saved_start, *flush = g_sched_start;
+	int error = 0, endticks = ticks + hz;
+
+	g_cancel_event(newpp); /* prevent taste() */
+	/* copy private fields */
+	newpp->private = pp->private;
+	newpp->index = pp->index;
+
+	/* Queue all the early requests coming for us. */
+	me.gs_npending = 0;
+	saved_start = pp->geom->start;
+	dstgp->start = g_sched_temporary_start;
+
+	while (pp->nstart - pp->nend != me.gs_npending &&
+	    endticks - ticks >= 0)
+		tsleep(pp, PRIBIO, "-", hz/10);
+
+	if (pp->nstart - pp->nend != me.gs_npending) {
+		flush = saved_start;
+		error = ETIMEDOUT;
+		goto fail;
+	}
+
+	/* link pp to this geom */
+	LIST_REMOVE(pp, provider);
+	pp->geom = gp;
+	LIST_INSERT_HEAD(&gp->provider, pp, provider);
+
+	/*
+	 * replicate the counts from the parent in the
+	 * new provider and consumer nodes
+	 */
+	cp->acr = newpp->acr = pp->acr;
+	cp->acw = newpp->acw = pp->acw;
+	cp->ace = newpp->ace = pp->ace;
+	sc->sc_flags |= G_SCHED_PROXYING;
+
+fail:
+	dstgp->start = saved_start;
+
+	g_sched_flush_pending(flush);
+
+	return (error);
+}
+
 /*
  * Create a geom node for the device passed as *pp.
  * If successful, add a reference to this gsp.
@@ -886,12 +966,10 @@
 g_sched_create(struct gctl_req *req, struct g_class *mp,
     struct g_provider *pp, struct g_gsched *gsp, int proxy)
 {
-	struct g_sched_softc *sc;
-	struct g_geom *gp, *dst_gp;
+	struct g_sched_softc *sc = NULL;
+	struct g_geom *gp, *dstgp;
 	struct g_provider *newpp = NULL;
 	struct g_consumer *cp = NULL;
-	void (*saved_start)(struct bio *bp);
-	struct bio *bp;
 	char name[64];
 	int error;
 
@@ -907,7 +985,7 @@
 	}
 
 	gp = g_new_geomf(mp, name);
-	dst_gp = proxy ? pp->geom : gp; /* where do we link the provider */
+	dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
 	if (gp == NULL) {
 		gctl_error(req, "Cannot create geom %s.", name);
 		error = ENOMEM;
@@ -937,7 +1015,7 @@
 	gp->access = g_sched_access;
 	gp->dumpconf = g_sched_dumpconf;
 
-	newpp = g_new_providerf(dst_gp, gp->name);
+	newpp = g_new_providerf(dstgp, gp->name);
 	if (newpp == NULL) {
 		gctl_error(req, "Cannot create provider %s.", name);
 		error = ENOMEM;
@@ -962,44 +1040,16 @@
 		goto fail;
 	}
 
-	g_gsched_ref(gsp);
-
 	g_error_provider(newpp, 0);
 	if (proxy) {
-		g_cancel_event(newpp); /* prevent taste() */
-		/* copy private fields */
-		newpp->private = pp->private;
-		newpp->index = pp->index;
-
-		/* Queue all the early requests coming for us. */
-		me.gs_npending = 0;
-		saved_start = pp->geom->start;
-		dst_gp->start = g_sched_temporary_start;
-
-		while (pp->nstart - pp->nend != me.gs_npending)
-			tsleep(pp, PRIBIO, "-", hz/10);
-
-		/* link pp to this geom */
-		LIST_REMOVE(pp, provider);
-		pp->geom = gp;
-		LIST_INSERT_HEAD(&gp->provider, pp, provider);
-
-		dst_gp->start = saved_start;
-
-		/*
-		 * replicate the counts from the parent in the
-		 * new provider and consumer nodes
-		 */
-		cp->acr = newpp->acr = pp->acr;
-		cp->acw = newpp->acw = pp->acw;
-		cp->ace = newpp->ace = pp->ace;
-		sc->sc_flags |= G_SCHED_PROXYING;
-
-		while ((bp = gs_bioq_takefirst(&me.gs_pending)))
-			g_sched_start(bp);
+		error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
+		if (error)
+			goto fail;
 	}
 
 	G_SCHED_DEBUG(0, "Device %s created.", gp->name);
 
+	g_gsched_ref(gsp);
+
 	return (0);
 
 fail:
@@ -1012,13 +1062,14 @@
 	if (newpp != NULL)
 		g_destroy_provider(newpp);
 
-	if (pp->geom == gp) {
-		/* Link pp back to the original geom */
-		LIST_REMOVE(pp, provider);
-		pp->geom = dst_gp;
-		LIST_INSERT_HEAD(&pp->geom->provider, pp, provider);
+	if (sc && sc->sc_hash) {
+		g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
+		    gsp, sc->sc_data);
 	}
 
+	if (sc && sc->sc_data)
+		gsp->gs_fini(sc->sc_data);
+
 	if (gp != NULL) {
 		if (gp->softc != NULL)
 			g_free(gp->softc);
@@ -1043,6 +1094,7 @@
 	struct g_hash *newh;
 	void *data;
 	u_long mask;
+	int error = 0;
 
 	gp = pp->geom;
 	sc = gp->softc;
@@ -1052,14 +1104,18 @@
 		return (ENOMEM);
 
 	newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
-	if (gsp->gs_priv_size && newh == NULL) {
-		gsp->gs_fini(data);
-		return (ENOMEM);
+	if (gsp->gs_priv_size && !newh) {
+		error = ENOMEM;
+		goto fail;
 	}
 
 	g_sched_lock(gp);
-	if (sc->sc_gsched)	/* can be NULL in some cases */
-		g_sched_remove_locked(gp, sc->sc_gsched);
+	if (sc->sc_gsched) {	/* can be NULL in some cases */
+		error = g_sched_remove_locked(gp, sc->sc_gsched);
+		if (error)
+			goto fail;
+	}
+
 	g_gsched_ref(gsp);
 	sc->sc_gsched = gsp;
 	sc->sc_data = data;
@@ -1069,8 +1125,23 @@
 	g_sched_unlock(gp);
 
 	return (0);
+
+fail:
+	if (newh)
+		g_sched_hash_fini(gp, newh, mask, gsp, data);
+
+	if (data)
+		gsp->gs_fini(data);
+
+	g_sched_unlock(gp);
+
+	return (error);
 }
 
+/*
+ * Stop the request flow directed to the proxy, redirecting the new
+ * requests to the me.gs_pending queue.
+ */
 static struct g_provider *
g_detach_proxy(struct g_geom *gp)
 {
@@ -1091,13 +1162,6 @@
 		me.gs_npending = 0;
 		pp->geom->start = g_sched_temporary_start;
 
-		/* Link provider to the original geom. */
-		LIST_REMOVE(pp, provider);
-		pp->private = newpp->private;
-		pp->index = newpp->index;
-		pp->geom = newpp->geom;
-		LIST_INSERT_HEAD(&pp->geom->provider, pp, provider);
-
 		return (pp);
 	} while (0);
 	printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
@@ -1106,11 +1170,51 @@
 }
 
 static void
-g_destroy_proxy(struct g_geom *gp, struct g_provider *old_pp)
+g_sched_blackhole(struct bio *bp)
+{
+
+	g_io_deliver(bp, ENXIO);
+}
+
+static inline void
+g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
+    struct g_provider *newpp)
+{
+
+	LIST_REMOVE(pp, provider);
+	if (newpp) {
+		pp->private = newpp->private;
+		pp->index = newpp->index;
+	}
+	pp->geom = gp;
+	LIST_INSERT_HEAD(&gp->provider, pp, provider);
+}
+
+static inline void
+g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
+{
+	struct g_geom *gp = oldpp->geom;
+
+	g_reparent_provider(oldpp, newpp->geom, newpp);
+
+	/*
+	 * Hackish: let the system destroy the old provider for us, just
+	 * in case someone attached a consumer to it, in which case a
+	 * direct call to g_destroy_provider() would not work.
+	 */
+	g_reparent_provider(newpp, gp, NULL);
+}
+
+/*
+ * Complete the proxy destruction, linking the old provider to its
+ * original geom, and destroying the proxy provider.  Also take care
+ * of issuing the pending requests collected in me.gs_pending (if any).
+ */
+static int
+g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
 {
 	struct g_consumer *cp;
 	struct g_provider *newpp;
-	struct bio *bp;
 
 	do {
 		cp = LIST_FIRST(&gp->consumer);
@@ -1119,52 +1223,58 @@
 		newpp = cp->provider;
 		if (newpp == NULL)
 			break;
-		/* detach consumer from provider, and destroy provider. */
+
+		/* Relink the provider to its original geom. */
+		g_unproxy_provider(oldpp, newpp);
+
+		/* Detach consumer from provider, and destroy provider. */
 		cp->acr = newpp->acr = 0;
 		cp->acw = newpp->acw = 0;
 		cp->ace = newpp->ace = 0;
 		g_detach(cp);
-		printf("destroying provider %s\n", newpp->name);
-		g_destroy_provider(newpp);
-		while ((bp = gs_bioq_takefirst(&me.gs_pending)))
-			old_pp->geom->start(bp);
+
+		/* Send the pending bios through the right start function. */
+		g_sched_flush_pending(oldpp->geom->start);
 
-		return;
+		return (0);
 	} while (0);
 	printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
+
+	/* We cannot send the pending bios anywhere... */
+	g_sched_flush_pending(g_sched_blackhole);
+
+	return (EINVAL);
 }
 
 static int
 g_sched_destroy(struct g_geom *gp, boolean_t force)
 {
-	struct g_provider *pp, *old_pp = NULL;
+	struct g_provider *pp, *oldpp = NULL;
 	struct g_sched_softc *sc;
 	struct g_gsched *gsp;
+	int error;
 
 	g_topology_assert();
 	sc = gp->softc;
 	if (sc == NULL)
 		return (ENXIO);
 
-	if ((sc->sc_flags & G_SCHED_PROXYING)) {
-		old_pp = g_detach_proxy(gp);
-		goto flush;
-	}
-
-	pp = LIST_FIRST(&gp->provider);
-	if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
-		const char *msg = force ?
-		    "but we force removal" : "cannot remove";
+	if (!(sc->sc_flags & G_SCHED_PROXYING)) {
+		pp = LIST_FIRST(&gp->provider);
+		if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
+			const char *msg = force ?
+ "but we force removal" : "cannot remove"; - G_SCHED_DEBUG(!force, - "Device %s is still open (r%dw%de%d), %s.", - pp->name, pp->acr, pp->acw, pp->ace, msg); - if (!force) - return (EBUSY); - } else { - G_SCHED_DEBUG(0, "Device %s removed.", gp->name); - } + G_SCHED_DEBUG(!force, + "Device %s is still open (r%dw%de%d), %s.", + pp->name, pp->acr, pp->acw, pp->ace, msg); + if (!force) + return (EBUSY); + } else { + G_SCHED_DEBUG(0, "Device %s removed.", gp->name); + } + } else + oldpp = g_detach_proxy(gp); -flush: gsp = sc->sc_gsched; if (gsp) { /* @@ -1174,26 +1284,69 @@ */ g_sched_lock(gp); /* - * We are dying here, no new request should enter + * We are dying here, no new requests should enter * the scheduler. This is granted by the topolgy, - * either in case we were proxying (the provider we - * stole now returned home) or not (see the access - * check above). + * either in case we were proxying (new bios are + * being redirected) or not (see the access check + * above). */ g_sched_forced_dispatch(gp); - g_sched_wait_pending(gp); + error = g_sched_wait_pending(gp); + + if (error) { + /* + * Not all the requests came home: this might happen + * under heavy load, or if we were waiting for any + * bio which is served in the event path (see + * geom_slice.c for an example of how this can + * happen). Try to restore a working configuration + * if we can fail. + */ + if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { + g_sched_flush_pending(force ? + g_sched_blackhole : g_sched_start); + } + + /* + * In the forced destroy case there is not so much + * we can do, we have pending bios that will call + * g_sched_done() somehow, and we don't want them + * to crash the system using freed memory. We tell + * the user that something went wrong, and leak some + * memory here. + * Note: the callers using force = 1 ignore the + * return value. + */ + if (force) { + G_SCHED_DEBUG(0, "Pending requests while " + " destroying geom, some memory leaked."); + } + + return (error); + } + g_sched_unlock(gp); - - g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask); + g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, + gsp, sc->sc_data); sc->sc_hash = NULL; - gsp->gs_fini(sc->sc_data); - g_gsched_unref(gsp); + sc->sc_gsched = NULL; } - if ((sc->sc_flags & G_SCHED_PROXYING) && old_pp) - g_destroy_proxy(gp, old_pp); + if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) { + error = g_destroy_proxy(gp, oldpp); + + if (error) { + if (force) { + G_SCHED_DEBUG(0, "Unrecoverable error while " + "destroying a proxy geom, leaking some " + " memory."); + } + + return (error); + } + } mtx_destroy(&sc->sc_mtx); @@ -1201,7 +1354,7 @@ gp->softc = NULL; g_wither_geom(gp, ENXIO); - return (0); + return (error); } static int @@ -1372,7 +1525,7 @@ g_ioreq_restore(); } -#else /* !HAVE_BIO_CLASSIFIER */ +#else /* HAVE_BIO_CLASSIFIER */ static int g_sched_tag(void *arg, struct bio *bp) { ==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_rr.c#3 (text+ko) ==== @@ -96,6 +96,8 @@ struct g_savg q_thinktime; /* Thinktime average. */ struct g_savg q_seekdist; /* Seek distance average. */ + int q_bionum; /* Number of requests. */ + off_t q_lastoff; /* Last submitted req. offset. */ int q_lastsub; /* Last submitted req. time. 
@@ -182,7 +184,7 @@
 	.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
 	.w_anticipate = 1,
 	.queue_depth =	{ 1, 1, 50 },
-	.wait_ms =	{ 1, 5, 30 },
+	.wait_ms =	{ 1, 10, 30 },
 	.quantum_ms =	{ 1, 100, 500 },
 	.quantum_kb =	{ 16, 8192, 65536 },
 };
@@ -375,14 +377,14 @@
 		return (0);
 
 	if (g_savg_valid(&qp->q_seekdist) &&
-	    g_savg_read(&qp->q_seekdist) > 2048)
+	    g_savg_read(&qp->q_seekdist) > 8192)
 		return (0);
 
 	return (1);
 }
 
 /*
- * called on a request arrival, timeout or completion.
+ * Called on a request arrival, timeout or completion.
  * Try to serve a request among those queued.
  */
 static struct bio *
@@ -468,7 +470,7 @@
 			sc->sc_active = NULL;
 		}
 	}
-	/* if sc_active != NULL, its q_status is always correct */
+	/* If sc_active != NULL, its q_status is always correct. */
 
 	sc->sc_in_flight++;
 
@@ -480,9 +482,13 @@
 {
 	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);
 
+	if (qp->q_sc->sc_active != qp)
+		return;
+
 	qp->q_lastsub = ticks;
 	delta = (delta > 2 * wait) ? 2 * wait : delta;
-	g_savg_add_sample(&qp->q_thinktime, delta);
+	if (qp->q_bionum > 7)
+		g_savg_add_sample(&qp->q_thinktime, delta);
 }
 
 static inline void
@@ -495,8 +501,13 @@
 	else
 		dist = bp->bio_offset - qp->q_lastoff;
 
+	if (dist > (8192 * 8))
+		dist = 8192 * 8;
+
 	qp->q_lastoff = bp->bio_offset + bp->bio_length;
-	g_savg_add_sample(&qp->q_seekdist, qp->q_seekdist.gs_smpl ? dist : 0);
+
+	if (qp->q_bionum > 7)
+		g_savg_add_sample(&qp->q_seekdist, dist);
 }
 
 /*
@@ -536,6 +547,8 @@
 		}
 	}
 
+	qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);
+
 	g_rr_update_thinktime(qp);
 	g_rr_update_seekdist(qp, bp);

==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_scheduler.h#3 (text+ko) ====

@@ -79,6 +79,10 @@
  *
  * gs_init_class() is called when a new client (as determined by
  * the classifier) starts being used.
+ *
+ * gs_hash_unref() is called right before the class hashtable is
+ * destroyed; after this call, the scheduler is supposed to hold no
+ * more references to the elements in the table.
  */
 
 /* Forward declarations for prototypes. */
@@ -92,6 +96,7 @@
 typedef struct bio *gs_next_t (void *data, int force);
 typedef int gs_init_class_t (void *data, void *priv);
 typedef void gs_fini_class_t (void *data, void *priv);
+typedef void gs_hash_unref_t (void *data);
 
 struct g_gsched {
 	const char	*gs_name;
@@ -107,6 +112,7 @@
 
 	gs_init_class_t		*gs_init_class;
 	gs_fini_class_t		*gs_fini_class;
+	gs_hash_unref_t		*gs_hash_unref;
 
 	LIST_ENTRY(g_gsched) glist;
 };

==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/modules/geom/geom_sched/Makefile#2 (text+ko) ====

@@ -1,6 +1,6 @@
 # $FreeBSD: $
 
-SUBDIR=	gs_sched gsched_as gsched_rr
+SUBDIR=	gs_sched gsched_as gsched_rr gsched_bfq
 
 .if defined(WITH_SSD)
 SUBDIR += gsched_ssd
 .endif
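
To illustrate the new gs_hash_unref() hook declared in gs_scheduler.h above: it runs right before g_sched_hash_fini() releases every class in the hashtable, so an algorithm that caches a pointer into the table can drop it there.  The sketch below is hypothetical; the gs_foo_* names and the gs_foo_queue_put() helper are made up for illustration, only the gs_hash_unref field itself comes from this change.

/* Hypothetical per-device data of a scheduler called "foo". */
struct gs_foo_softc {
	struct gs_foo_queue *sc_active;	/* class currently being served */
};

/*
 * Invoked just before the class hashtable is destroyed: forget the
 * cached active class so that no reference into the table survives
 * g_sched_hash_fini().
 */
static void
gs_foo_hash_unref(void *data)
{
	struct gs_foo_softc *sc = data;

	if (sc->sc_active != NULL) {
		gs_foo_queue_put(sc->sc_active);	/* made-up release helper */
		sc->sc_active = NULL;
	}
}

/* Wired into the algorithm descriptor next to the other methods. */
static struct g_gsched gs_foo = {
	.gs_name = "foo",
	.gs_hash_unref = gs_foo_hash_unref,
	/* gs_init, gs_fini, gs_start, ... omitted from this sketch. */
};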
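
g_sched_wait_pending() and g_insert_proxy() now bound their sleeps with endticks = ticks + hz and keep waiting while endticks - ticks >= 0, rather than comparing ticks against the deadline directly.  The signed subtraction keeps the test meaningful when the tick counter wraps around.  A small userland demonstration of the idea, using an 8-bit counter so the wrap is easy to see; the cast assumes the usual two's complement wrap, which is also what the kernel idiom relies on.

#include <stdio.h>

int
main(void)
{
	unsigned char now = 250;	/* tick counter about to wrap at 256 */
	unsigned char end = now + 20;	/* deadline wraps around to 14 */
	int i;

	for (i = 0; i < 26; i++, now++) {
		signed char left = (signed char)(end - now);

		/* The direct comparison goes wrong as soon as "end" wraps. */
		printf("now=%3d end=%3d direct=%d wrap-safe=%d\n",
		    now, end, now <= end, left >= 0);
	}
	return (0);
}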
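
On the revised gs_rr.c heuristics: the new per-queue counter is updated as q_bionum = 1 + q_bionum - (q_bionum >> 3) on every request, and the think-time and seek-distance averages only take samples once q_bionum > 7.  Starting from zero the counter reaches 8 after eight requests and then stays there (any value from 8 to 15 is a fixed point of the update), so a freshly created queue must submit a handful of requests before its statistics start influencing anticipation.  A tiny demonstration of the recurrence, just to make the saturation visible:

#include <stdio.h>

int
main(void)
{
	int q_bionum = 0;
	int i;

	/* Same update as in g_rr_start(): saturates at 8 after 8 requests. */
	for (i = 1; i <= 12; i++) {
		q_bionum = 1 + q_bionum - (q_bionum >> 3);
		printf("request %2d: q_bionum = %d, sampling %s\n",
		    i, q_bionum, q_bionum > 7 ? "on" : "off");
	}
	return (0);
}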