Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 12 Apr 2010 16:37:45 +0000 (UTC)
From:      Luigi Rizzo <luigi@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r206497 - in head: sbin/geom/class sbin/geom/class/sched sys/geom/sched sys/modules/geom sys/modules/geom/geom_sched sys/modules/geom/geom_sched/gs_sched sys/modules/geom/geom_sched/gsc...
Message-ID:  <201004121637.o3CGbjSK080066@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: luigi
Date: Mon Apr 12 16:37:45 2010
New Revision: 206497
URL: http://svn.freebsd.org/changeset/base/206497

Log:
  Bring in geom_sched, support for scheduling disk I/O requests
  in a device independent manner. Also include an example anticipatory
  scheduler, gsched_rr, which gives very nice performance improvements
  in presence of competing random access patterns.
  
  This is joint work with Fabio Checconi, developed last year
  and presented at BSDCan 2009. You can find details in the
  README file or at
  
  http://info.iet.unipi.it/~luigi/geom_sched/

Added:
  head/sbin/geom/class/sched/
  head/sbin/geom/class/sched/Makefile   (contents, props changed)
  head/sbin/geom/class/sched/geom_sched.c   (contents, props changed)
  head/sbin/geom/class/sched/gsched.8   (contents, props changed)
  head/sys/geom/sched/
  head/sys/geom/sched/README   (contents, props changed)
  head/sys/geom/sched/g_sched.c   (contents, props changed)
  head/sys/geom/sched/g_sched.h   (contents, props changed)
  head/sys/geom/sched/gs_rr.c   (contents, props changed)
  head/sys/geom/sched/gs_scheduler.h   (contents, props changed)
  head/sys/geom/sched/subr_disk.c   (contents, props changed)
  head/sys/modules/geom/geom_sched/
  head/sys/modules/geom/geom_sched/Makefile   (contents, props changed)
  head/sys/modules/geom/geom_sched/Makefile.inc   (contents, props changed)
  head/sys/modules/geom/geom_sched/gs_sched/
  head/sys/modules/geom/geom_sched/gs_sched/Makefile   (contents, props changed)
  head/sys/modules/geom/geom_sched/gsched_rr/
  head/sys/modules/geom/geom_sched/gsched_rr/Makefile   (contents, props changed)
Modified:
  head/sbin/geom/class/Makefile
  head/sys/modules/geom/Makefile

Modified: head/sbin/geom/class/Makefile
==============================================================================
--- head/sbin/geom/class/Makefile	Mon Apr 12 13:46:20 2010	(r206496)
+++ head/sbin/geom/class/Makefile	Mon Apr 12 16:37:45 2010	(r206497)
@@ -15,6 +15,7 @@ SUBDIR+=multipath
 SUBDIR+=nop
 SUBDIR+=part
 SUBDIR+=raid3
+SUBDIR+=sched
 SUBDIR+=shsec
 SUBDIR+=stripe
 SUBDIR+=virstor

Added: head/sbin/geom/class/sched/Makefile
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sbin/geom/class/sched/Makefile	Mon Apr 12 16:37:45 2010	(r206497)
@@ -0,0 +1,19 @@
+# GEOM_LIBRARY_PATH
+# $FreeBSD$
+
+.PATH: /usr/src/sbin/geom/misc
+
+CFLAGS += -I/usr/src/sbin/geom
+
+CLASS=sched
+
+WARNS?= 6
+CLASS_DIR?=/lib/geom
+
+SHLIBDIR?=${CLASS_DIR}
+SHLIB_NAME?=geom_${CLASS}.so
+LINKS=  ${BINDIR}/geom ${BINDIR}/g${CLASS}
+MAN=    g${CLASS}.8
+SRCS+=  geom_${CLASS}.c subr.c
+
+.include <bsd.lib.mk>

Added: head/sbin/geom/class/sched/geom_sched.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sbin/geom/class/sched/geom_sched.c	Mon Apr 12 16:37:45 2010	(r206497)
@@ -0,0 +1,123 @@
+/*-
+ * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id$
+ * $FreeBSD$
+ *
+ * This file implements the userspace library used by the 'geom'
+ * command to load and manipulate disk schedulers.
+ */
+  
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/linker.h>
+#include <sys/module.h>
+
+#include <stdio.h>
+#include <stdint.h>
+#include <libgeom.h>
+
+#include "core/geom.h"
+#include "misc/subr.h"
+
+#define	G_SCHED_VERSION	0
+
+uint32_t lib_version = G_LIB_VERSION;
+uint32_t version = G_SCHED_VERSION;
+
+/*
+ * storage for parameters used by this geom class.
+ * Right now only the scheduler name is used.
+ */
+static char algo[] = "rr";	/* default scheduler */
+
+/*
+ * Adapt to differences in geom library.
+ * In V1, struct g_command lacks gc_argname and G_TYPE_BOOL is undefined.
+ */
+#if G_LIB_VERSION == 1
+#define G_ARGNAME
+#define G_TYPE_BOOL	G_TYPE_NUMBER
+#else
+#define G_ARGNAME	NULL,
+#endif
+
+static void
+gcmd_createinsert(struct gctl_req *req, unsigned flags __unused)
+{
+	const char *reqalgo;
+	char name[64];
+
+	if (gctl_has_param(req, "algo"))
+		reqalgo = gctl_get_ascii(req, "algo");
+	else
+		reqalgo = algo;
+
+	snprintf(name, sizeof(name), "gsched_%s", reqalgo);
+	/*
+	 * Do not complain about errors here, gctl_issue()
+	 * will fail anyway.
+	 */
+	if (modfind(name) < 0)
+		kldload(name);
+	gctl_issue(req);
+}
+
+struct g_command class_commands[] = {
+	{ "create", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert,
+	    {
+		{ 'a', "algo", algo, G_TYPE_STRING },
+		G_OPT_SENTINEL
+	    },
+	    G_ARGNAME "[-v] [-a algorithm_name] dev ..."
+	},
+	{ "insert", G_FLAG_VERBOSE | G_FLAG_LOADKLD, gcmd_createinsert,
+	    {
+		{ 'a', "algo", algo, G_TYPE_STRING },
+		G_OPT_SENTINEL
+	    },
+	    G_ARGNAME "[-v] [-a algorithm_name] dev ..."
+	},
+	{ "configure", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'a', "algo", algo, G_TYPE_STRING },
+		G_OPT_SENTINEL
+	    },
+	    G_ARGNAME "[-v] [-a algorithm_name] prov ..."
+	},
+	{ "destroy", G_FLAG_VERBOSE, NULL,
+	    {
+		{ 'f', "force", NULL, G_TYPE_BOOL },
+		G_OPT_SENTINEL
+	    },
+	    G_ARGNAME "[-fv] prov ..."
+	},
+	{ "reset", G_FLAG_VERBOSE, NULL, G_NULL_OPTS,
+	    G_ARGNAME "[-v] prov ..."
+	},
+	G_CMD_SENTINEL
+};

Added: head/sbin/geom/class/sched/gsched.8
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sbin/geom/class/sched/gsched.8	Mon Apr 12 16:37:45 2010	(r206497)
@@ -0,0 +1,161 @@
+.\" Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo
+.\" All rights reserved.
+.\" $FreeBSD$
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd April 12, 2010
+.Dt GSCHED 8
+.Os
+.Sh NAME
+.Nm gsched
+.Nd "control utility for disk scheduler GEOM class"
+.Sh SYNOPSIS
+.Nm
+.Cm create
+.Op Fl v
+.Op Fl a Ar algorithm
+.Ar provider ...
+.Nm
+.Cm insert
+.Op Fl v
+.Op Fl a Ar algorithm
+.Ar provider ...
+.Nm
+.Cm configure
+.Op Fl v
+.Op Fl a Ar algorithm
+.Ar node ...
+.Nm
+.Cm destroy
+.Op Fl fv
+.Ar node ...
+.Nm
+.Cm reset
+.Op Fl v
+.Ar node ...
+.Nm
+.Cm { list | status | load | unload }
+.Sh DESCRIPTION
+The
+.Nm
+utility (also callable as
+.Nm geom sched ... )
+changes the scheduling policy of the requests going to a provider.
+.Pp
+The first argument to
+.Nm
+indicates an action to be performed:
+.Bl -tag -width ".Cm configure"
+.It Cm create
+Create a new provider and geom node using the specified scheduling algorithm.
+.Ar algorithm
+is the name of the scheduling algorithm used for the provider.
+Available algorithms include:
+.Ar rr ,
+which implements anticipatory scheduling with round robin service
+among clients;
+.Ar as ,
+which implements a simple form of anticipatory scheduling with
+no per-client queue.
+.Pp
+If the operation succeeds, the new provider should appear with name
+.Pa /dev/ Ns Ao Ar dev Ac Ns Pa .sched. .
+The kernel module
+.Pa geom_sched.ko
+will be loaded if it is not loaded already.
+.It Cm insert
+Operates as "create", but the insertion is "transparent",
+i.e. the existing provider is rerouted to the newly created geom,
+which in turn forwards requests to the existing geom.
+This operation allows one to start/stop a scheduling service
+on an already existing provider.
+.Pp
+A subsequent 'destroy' will remove the newly created geom and
+hook the provider back to the original geom.
+.Ar algorithm
+.It Cm configure
+Configure an existing scheduling provider.  It supports the same options
+as the
+.Cm create
+command.
+.It Cm destroy
+Destroy the geom specified in the parameter.
+.It Cm reset
+Do nothing.
+.It Cm list | status | load | unload
+See
+.Xr geom 8 .
+.El
+.Pp
+Additional options:
+.Bl -tag -width ".Fl f"
+.It Fl f
+Force the removal of the specified provider.
+.It Fl v
+Be more verbose.
+.El
+.Sh SYSCTL VARIABLES
+The following
+.Xr sysctl 8
+variables can be used to control the behavior of the
+.Nm SCHED
+GEOM class.
+The default value is shown next to each variable.
+.Bl -tag -width indent
+.It Va kern.geom.sched.debug : No 0
+Debug level of the
+.Nm SCHED
+GEOM class.
+This can be set to a number between 0 and 2 inclusive.
+If set to 0 minimal debug information is printed, and if set to 2 the
+maximum amount of debug information is printed.
+.El
+.Sh EXIT STATUS
+Exit status is 0 on success, and 1 if the command fails.
+.Sh EXAMPLES
+The following example shows how to create a scheduling provider for disk
+.Pa /dev/ad0 ,
+and how to destroy it.
+.Bd -literal -offset indent
+# Load the geom_sched module:
+kldload geom_sched
+# Load some scheduler classes used by geom_sched:
+kldload gsched_rr gsched_as
+# Configure device ad0 to use scheduler 'rr':
+geom sched insert -a rr ad0
+# Now provider ad0 uses the 'rr' algorithm;
+# the new geom is ad0.sched.
+# Remove the scheduler on the device:
+geom sched destroy -v ad0.sched.
+.Ed
+.Pp
+.Sh SEE ALSO
+.Xr geom 4 ,
+.Xr geom 8
+.Sh HISTORY
+The
+.Nm
+utility appeared in April 2010.
+.Sh AUTHORS
+.An Fabio Checconi Aq fabio@FreeBSD.org
+.An Luigi Rizzo Aq luigi@FreeBSD.org

Added: head/sys/geom/sched/README
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/geom/sched/README	Mon Apr 12 16:37:45 2010	(r206497)
@@ -0,0 +1,162 @@
+
+	--- GEOM BASED DISK SCHEDULERS FOR FREEBSD ---
+
+This code contains a framework for GEOM-based disk schedulers and a
+couple of sample scheduling algorithms that use the framework and
+implement two forms of "anticipatory scheduling" (see below for more
+details).
+
+As a quick example of what this code can give you, try to run "dd",
+"tar", or some other program with highly SEQUENTIAL access patterns,
+together with "cvs", "cvsup", "svn" or other highly RANDOM access patterns
+(this is not a made-up example: it is pretty common for developers
+to have one or more apps doing random accesses, and others that do
+sequential accesses e.g., loading large binaries from disk, checking
+the integrity of tarballs, watching media streams and so on).
+
+These are the results we get on a local machine (AMD BE2400 dual
+core CPU, SATA 250GB disk):
+
+    /mnt is a partition mounted on /dev/ad0s1f
+
+    cvs: 	cvs -d /mnt/home/ncvs-local update -Pd /mnt/ports
+    dd-read:	dd bs=128k of=/dev/null if=/dev/ad0 (or ad0.sched.)
+    dd-write:	dd bs=128k if=/dev/zero of=/mnt/largefile
+
+			NO SCHEDULER		RR SCHEDULER
+                	dd	cvs		dd	cvs
+
+    dd-read only        72 MB/s	----		72 MB/s	---
+    dd-write only	55 MB/s	---		55 MB/s	---
+    dd-read+cvs		 6 MB/s	ok    		30 MB/s	ok
+    dd-write+cvs	55 MB/s slooow		14 MB/s	ok
+
+As you can see, when a cvs is running concurrently with dd, the
+performance drops dramatically, and depending on read or write mode,
+one of the two is severely penalized.  The use of the RR scheduler
+in this example makes the dd-reader go much faster when competing
+with cvs, and lets cvs progress when competing with a writer.
+
+To try it out:
+
+1. USERS OF FREEBSD 7, PLEASE READ CAREFULLY THE FOLLOWING:
+
+    On loading, this module patches one kernel function (g_io_request())
+    so that I/O requests ("bio's") carry a classification tag, useful
+    for scheduling purposes.
+
+    ON FREEBSD 7, the tag is stored in an existing (though rarely used)
+    field of the "struct bio", a solution which makes this module
+    incompatible with other modules using it, such as ZFS and gjournal.
+    Additionally, g_io_request() is patched in-memory to add a call
+    to the function that initializes this field (i386/amd64 only;
+    for other architectures you need to manually patch sys/geom/geom_io.c).
+    See details in the file g_sched.c.
+
+    On FreeBSD 8.0 and above, the above trick is not necessary,
+    as the struct bio contains dedicated fields for the classifier,
+    and hooks for request classifiers.
+
+    If you don't like the above, don't run this code.
+
+2. PLEASE MAKE SURE THAT THE DISK THAT YOU WILL BE USING FOR TESTS
+   DOES NOT CONTAIN PRECIOUS DATA.
+    This is experimental code, so we make no guarantees, though
+    I am routinely using it on my desktop and laptop.
+
+3. EXTRACT AND BUILD THE PROGRAMS
+    A 'make install' in the directory should work (with root privs),
+    or you can even try the binary modules.
+    If you want to build the modules yourself, look at the Makefile.
+
+4. LOAD THE MODULE, CREATE A GEOM NODE, RUN TESTS
+
+    The scheduler's module must be loaded first:
+
+      # kldload gsched_rr
+
+    substitute with gsched_as to test AS.  Then, supposing that you are
+    using /dev/ad0 for testing, a scheduler can be attached to it with:
+
+      # geom sched insert ad0
+
+    The scheduler is inserted transparently in the geom chain, so
+    mounted partitions and filesystems will keep working, but
+    now requests will go through the scheduler.
+
+    To change scheduler on-the-fly, you can reconfigure the geom:
+
+      # geom sched configure -a as ad0.sched.
+
+    assuming that gsched_as was loaded previously.
+
+5. SCHEDULER REMOVAL
+
+    In principle it is possible to remove the scheduler module
+    even on an active chain by doing
+
+	# geom sched destroy ad0.sched.
+
+    However, there is some race in the geom subsystem which makes
+    the removal unsafe if there are active requests on a chain.
+    So, in order to reduce the risk of data loss, make sure
+    you don't remove a scheduler from a chain with ongoing transactions.
+
+--- NOTES ON THE SCHEDULERS ---
+
+The important contribution of this code is the framework to experiment
+with different scheduling algorithms.  'Anticipatory scheduling'
+is a very powerful technique based on the following reasoning:
+
+    The disk throughput is much better if it serves sequential requests.
+    If we have a mix of sequential and random requests, and we see a
+    non-sequential request, do not serve it immediately but instead wait
+    a little bit (2..5ms) to see if there is another one coming that
+    the disk can serve more efficiently.
+
+There are many details that should be added to make sure that the
+mechanism is effective with different workloads and systems, to
+gain a few extra percent in performance, to improve fairness,
+insulation among processes etc.  A discussion of the vast literature
+on the subject is beyond the purpose of this short note.
+
+--------------------------------------------------------------------------
+
+TRANSPARENT INSERT/DELETE
+
+geom_sched is an ordinary geom module, however it is convenient
+to plug it transparently into the geom graph, so that one can
+enable or disable scheduling on a mounted filesystem, and the
+names in /etc/fstab do not depend on the presence of the scheduler.
+
+To understand how this works in practice, remember that in GEOM
+we have "providers" and "geom" objects.
+Say that we want to hook a scheduler on provider "ad0",
+accessible through pointer 'pp'. Originally, pp is attached to
+geom "ad0" (same name, different object) accessible through pointer old_gp
+
+  BEFORE	---> [ pp    --> old_gp ...]
+
+A normal "geom sched create ad0" call would create a new geom node
+on top of provider ad0/pp, and export a newly created provider
+("ad0.sched." accessible through pointer newpp).
+
+  AFTER create  ---> [ newpp --> gp --> cp ] ---> [ pp    --> old_gp ... ]
+
+On top of newpp, a whole tree will be created automatically, and we
+can e.g. mount partitions on /dev/ad0.sched.s1d, and those requests
+will go through the scheduler, whereas any partition mounted on
+the pre-existing device entries will not go through the scheduler.
+
+With the transparent insert mechanism, the original provider "ad0"/pp
+is hooked to the newly created geom, as follows:
+
+  AFTER insert  ---> [ pp    --> gp --> cp ] ---> [ newpp --> old_gp ... ]
+
+so anything that was previously using provider pp will now have
+the requests routed through the scheduler node.
+
+A removal ("geom sched destroy ad0.sched.") will restore the original
+configuration.
+
+# $FreeBSD$

Added: head/sys/geom/sched/g_sched.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/geom/sched/g_sched.c	Mon Apr 12 16:37:45 2010	(r206497)
@@ -0,0 +1,1901 @@
+/*-
+ * Copyright (c) 2009-2010 Fabio Checconi, Luigi Rizzo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $Id$
+ * $FreeBSD$
+ *
+ * Main control module for geom-based disk schedulers ('sched').
+ *
+ * USER VIEW
+ * A 'sched' node is typically inserted transparently between
+ * an existing provider pp and its original geom gp
+ *
+ *	[pp --> gp  ..]
+ *
+ * using the command "geom sched insert <provider>" and
+ * resulting in the following topology
+ *
+ *	[pp --> sched_gp --> cp]   [new_pp --> gp ... ]
+ *
+ * Deletion "geom sched destroy <provider>.sched." restores the
+ * original chain. The normal "geom sched create <provider>"
+ * is also supported.
+ *
+ * INTERNALS
+ * Internally, the 'sched' uses the following data structures
+ *
+ *   geom{}         g_sched_softc{}      g_gsched{}
+ * +----------+    +---------------+   +-------------+
+ * |  softc *-|--->| sc_gsched   *-|-->|  gs_init    |
+ * |  ...     |    |               |   |  gs_fini    |
+ * |          |    | [ hash table] |   |  gs_start   |
+ * +----------+    |               |   |  ...        |
+ *                 |               |   +-------------+
+ *                 |               |
+ *                 |               |     g_*_softc{}
+ *                 |               |   +-------------+
+ *                 | sc_data     *-|-->|             |
+ *                 +---------------+   |  algorithm- |
+ *                                     |  specific   |
+ *                                     +-------------+
+ *
+ * A g_sched_softc{} is created with a "geom sched insert" call.
+ * In turn this instantiates a specific scheduling algorithm,
+ * which sets sc_gsched to point to the algorithm callbacks,
+ * and calls gs_init() to create the g_*_softc{} .
+ * The other callbacks (gs_start, gs_next, ...) are invoked
+ * as needed.
+ *
+ * g_sched_softc{} is defined in g_sched.h and mostly used here;
+ * g_gsched{}, and the gs_callbacks, are documented in gs_scheduler.h;
+ * g_*_softc{} is defined/implemented by each algorithm (gs_*.c)
+ *
+ * DATA MOVING
+ * When a bio is received on the provider, it goes to the
+ * g_sched_start() which calls gs_start() to initially queue it;
+ * then we call g_sched_dispatch() that loops around gs_next()
+ * to select zero or more bio's to be sent downstream.
+ *
+ * g_sched_dispatch() can also be called as a result of a timeout,
+ * e.g. when doing anticipation or pacing requests.
+ *
+ * When a bio comes back, it goes to g_sched_done() which in turn
+ * calls gs_done(). The latter does any necessary housekeeping in
+ * the scheduling algorithm, and may decide to call g_sched_dispatch()
+ * to send more bio's downstream.
+ *
+ * If an algorithm needs per-flow queues, these are created
+ * calling gs_init_class() and destroyed with gs_fini_class(),
+ * and they are also inserted in the hash table implemented in
+ * the g_sched_softc{}
+ *
+ * If an algorithm is replaced, or a transparently-inserted node is
+ * removed with "geom sched destroy", we need to remove all references
+ * to the g_*_softc{} and g_sched_softc from the bio's still in
+ * the scheduler. g_sched_forced_dispatch() helps doing this.
+ * XXX need to explain better.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/bio.h>
+#include <sys/limits.h>
+#include <sys/hash.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>		/* we access curthread */
+#include <geom/geom.h>
+#include "gs_scheduler.h"
+#include "g_sched.h"		/* geom hooks */
+
+/*
+ * Size of the per-geom hash table storing traffic classes.
+ * We may decide to change it at a later time, it has no ABI
+ * implications as it is only used for run-time allocations.
+ */
+#define G_SCHED_HASH_SIZE	32
+
+static int g_sched_destroy(struct g_geom *gp, boolean_t force);
+static int g_sched_destroy_geom(struct gctl_req *req,
+    struct g_class *mp, struct g_geom *gp);
+static void g_sched_config(struct gctl_req *req, struct g_class *mp,
+    const char *verb);
+static struct g_geom *g_sched_taste(struct g_class *mp,
+    struct g_provider *pp, int flags __unused);
+static void g_sched_dumpconf(struct sbuf *sb, const char *indent,
+    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
+static void g_sched_init(struct g_class *mp);
+static void g_sched_fini(struct g_class *mp);
+
+struct g_class g_sched_class = {
+	.name = G_SCHED_CLASS_NAME,
+	.version = G_VERSION,
+	.ctlreq = g_sched_config,
+	.taste = g_sched_taste,
+	.destroy_geom = g_sched_destroy_geom,
+	.init = g_sched_init,
+	.fini = g_sched_fini
+};
+
+MALLOC_DEFINE(M_GEOM_SCHED, "GEOM_SCHED", "Geom schedulers data structures");
+
+/*
+ * Global variables describing the state of the geom_sched module.
+ * There is only one static instance of this structure.
+ */
+LIST_HEAD(gs_list, g_gsched);	/* type, link field */
+struct geom_sched_vars {
+	struct mtx	gs_mtx;
+	struct gs_list	gs_scheds;	/* list of algorithms */
+	u_int		gs_debug;
+	u_int		gs_sched_count;	/* how many algorithms ? */
+	u_int 		gs_patched;	/* g_io_request was patched */
+
+	u_int		gs_initialized;
+	u_int		gs_expire_secs;	/* expiration of hash entries */
+
+	struct bio_queue_head gs_pending;
+	u_int		gs_npending;
+
+	/* The following are for stats, usually protected by gs_mtx. */
+	u_long		gs_requests;	/* total requests */
+	u_long		gs_done;	/* total done */
+	u_int 		gs_in_flight;	/* requests in flight */
+	u_int 		gs_writes_in_flight;
+	u_int 		gs_bytes_in_flight;
+	u_int 		gs_write_bytes_in_flight;
+
+	char		gs_names[256];	/* names of schedulers */
+};
+
+static struct geom_sched_vars me = {
+	.gs_expire_secs = 10,
+};
+
+SYSCTL_DECL(_kern_geom);
+SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
+    "GEOM_SCHED stuff");
+
+SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_wb, CTLFLAG_RD,
+    &me.gs_write_bytes_in_flight, 0, "Write bytes in flight");
+
+SYSCTL_INT(_kern_geom_sched, OID_AUTO, in_flight_b, CTLFLAG_RD,
+    &me.gs_bytes_in_flight, 0, "Bytes in flight");
+
+SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight_w, CTLFLAG_RD,
+    &me.gs_writes_in_flight, 0, "Write Requests in flight");
+
+SYSCTL_UINT(_kern_geom_sched, OID_AUTO, in_flight, CTLFLAG_RD,
+    &me.gs_in_flight, 0, "Requests in flight");
+
+SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, done, CTLFLAG_RD,
+    &me.gs_done, 0, "Total done");
+
+SYSCTL_ULONG(_kern_geom_sched, OID_AUTO, requests, CTLFLAG_RD,
+    &me.gs_requests, 0, "Total requests");
+
+SYSCTL_STRING(_kern_geom_sched, OID_AUTO, algorithms, CTLFLAG_RD,
+    &me.gs_names, 0, "Algorithm names");
+
+SYSCTL_UINT(_kern_geom_sched, OID_AUTO, alg_count, CTLFLAG_RD,
+    &me.gs_sched_count, 0, "Number of algorithms");
+
+SYSCTL_UINT(_kern_geom_sched, OID_AUTO, debug, CTLFLAG_RW,
+    &me.gs_debug, 0, "Debug level");
+
+SYSCTL_UINT(_kern_geom_sched, OID_AUTO, expire_secs, CTLFLAG_RW,
+    &me.gs_expire_secs, 0, "Expire time in seconds");
+
+/*
+ * g_sched calls the scheduler algorithms with this lock held.
+ * The locking functions are exposed so the scheduler algorithms can also
+ * protect themselves e.g. when running a callout handler.
+ */
+void
+g_sched_lock(struct g_geom *gp)
+{
+	struct g_sched_softc *sc = gp->softc;
+
+	mtx_lock(&sc->sc_mtx);
+}
+
+void
+g_sched_unlock(struct g_geom *gp)
+{
+	struct g_sched_softc *sc = gp->softc;
+
+	mtx_unlock(&sc->sc_mtx);
+}
+
+/*
+ * Support functions to handle references to the module,
+ * which are coming from devices using this scheduler.
+ */
+static inline void
+g_gsched_ref(struct g_gsched *gsp)
+{
+
+	atomic_add_int(&gsp->gs_refs, 1);
+}
+
+static inline void
+g_gsched_unref(struct g_gsched *gsp)
+{
+
+	atomic_add_int(&gsp->gs_refs, -1);
+}
+
+/*
+ * Update the stats when this request is done.
+ */
+static void
+g_sched_update_stats(struct bio *bio)
+{
+
+	me.gs_done++;
+	me.gs_in_flight--;
+	me.gs_bytes_in_flight -= bio->bio_length;
+	if (bio->bio_cmd & BIO_WRITE) {
+		me.gs_writes_in_flight--;
+		me.gs_write_bytes_in_flight -= bio->bio_length;
+	}
+}
+
+/*
+ * Dispatch any pending request.
+ */
+static void
+g_sched_forced_dispatch(struct g_geom *gp)
+{
+	struct g_sched_softc *sc = gp->softc;
+	struct g_gsched *gsp = sc->sc_gsched;
+	struct bio *bp;
+
+	KASSERT(mtx_owned(&sc->sc_mtx),
+	    ("sc_mtx not owned during forced dispatch"));
+
+	while ((bp = gsp->gs_next(sc->sc_data, 1)) != NULL)
+		g_io_request(bp, LIST_FIRST(&gp->consumer));
+}
+
+/*
+ * The main dispatch loop, called either here after the start
+ * routine, or by scheduling algorithms when they receive a timeout
+ * or a 'done' notification.  Does not share code with the forced
+ * dispatch path, since the gs_done() callback can call us.
+ */
+void
+g_sched_dispatch(struct g_geom *gp)
+{
+	struct g_sched_softc *sc = gp->softc;
+	struct g_gsched *gsp = sc->sc_gsched;
+	struct bio *bp;
+
+	KASSERT(mtx_owned(&sc->sc_mtx), ("sc_mtx not owned during dispatch"));
+
+	if ((sc->sc_flags & G_SCHED_FLUSHING))
+		return;
+
+	while ((bp = gsp->gs_next(sc->sc_data, 0)) != NULL)
+		g_io_request(bp, LIST_FIRST(&gp->consumer));
+}
+
+/*
+ * Recent (8.0 and above) versions of FreeBSD have support to
+ * register classifiers of disk requests. The classifier is
+ * invoked by g_io_request(), and stores the information into
+ * bp->bio_classifier1.
+ *
+ * Support for older versions, which is left here only for
+ * documentation purposes, relies on two hacks:
+ * 1. classification info is written into the bio_caller1
+ *    field of the topmost node in the bio chain. This field
+ *    is rarely used, but this module is incompatible with
+ *    those that use bio_caller1 for other purposes,
+ *    such as ZFS and gjournal;
+ * 2. g_io_request() is patched in-memory when the module is
+ *    loaded, so that the function calls a classifier as its
+ *    first thing. g_io_request() is restored when the module
+ *    is unloaded. This functionality is only supported for
+ *    x86 and amd64, other architectures need source code changes.
+ */
+
+/*
+ * Lookup the identity of the issuer of the original request.
+ * In the current implementation we use the curthread of the
+ * issuer, but different mechanisms may be implemented later
+ * so we do not make assumptions on the return value which for
+ * us is just an opaque identifier.
+ */
+
+static inline u_long
+g_sched_classify(struct bio *bp)
+{
+
+#if __FreeBSD_version > 800098
+	/* we have classifier fields in the struct bio */
+#define HAVE_BIO_CLASSIFIER
+	return ((u_long)bp->bio_classifier1);
+#else
+#warning old version!!!
+	while (bp->bio_parent != NULL)
+		bp = bp->bio_parent;
+
+	return ((u_long)bp->bio_caller1);
+#endif
+}
+
+/* Return the hash chain for the given key. */
+static inline struct g_hash *
+g_sched_hash(struct g_sched_softc *sc, u_long key)
+{
+
+	return (&sc->sc_hash[key & sc->sc_mask]);
+}
+
+/*
+ * Helper function for the children classes, which takes
+ * a geom and a bio and returns the private descriptor
+ * associated to the request.  This involves fetching
+ * the classification field and [al]locating the
+ * corresponding entry in the hash table.
+ */
+void *
+g_sched_get_class(struct g_geom *gp, struct bio *bp)
+{
+	struct g_sched_softc *sc;
+	struct g_sched_class *gsc;
+	struct g_gsched *gsp;
+	struct g_hash *bucket;
+	u_long key;
+
+	sc = gp->softc;
+	key = g_sched_classify(bp);
+	bucket = g_sched_hash(sc, key);
+	LIST_FOREACH(gsc, bucket, gsc_clist) {
+		if (key == gsc->gsc_key) {
+			gsc->gsc_refs++;
+			return (gsc->gsc_priv);
+		}
+	}
+
+	gsp = sc->sc_gsched;
+	gsc = malloc(sizeof(*gsc) + gsp->gs_priv_size,
+	    M_GEOM_SCHED, M_NOWAIT | M_ZERO);
+	if (!gsc)
+		return (NULL);
+
+	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
+		free(gsc, M_GEOM_SCHED);
+		return (NULL);
+	}
+
+	gsc->gsc_refs = 2;	/* 1 for the hash table, 1 for the caller. */
+	gsc->gsc_key = key;
+	LIST_INSERT_HEAD(bucket, gsc, gsc_clist);
+
+	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
+
+	return (gsc->gsc_priv);
+}
+
+/*
+ * Release a reference to the per-client descriptor.
+ */
+void
+g_sched_put_class(struct g_geom *gp, void *priv)
+{
+	struct g_sched_class *gsc;
+	struct g_sched_softc *sc;
+
+	gsc = g_sched_priv2class(priv);
+	gsc->gsc_expire = ticks + me.gs_expire_secs * hz;
+
+	if (--gsc->gsc_refs > 0)
+		return;
+
+	sc = gp->softc;
+	sc->sc_gsched->gs_fini_class(sc->sc_data, priv);
+
+	LIST_REMOVE(gsc, gsc_clist);
+	free(gsc, M_GEOM_SCHED);
+}
+
+static void
+g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
+    struct g_gsched *gsp, void *data)
+{
+	struct g_sched_class *cp, *cp2;
+	int i;
+
+	if (!hp)
+		return;
+
+	if (data && gsp->gs_hash_unref)
+		gsp->gs_hash_unref(data);
+
+	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
+		LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
+			g_sched_put_class(gp, cp->gsc_priv);
+	}
+
+	hashdestroy(hp, M_GEOM_SCHED, mask);
+}
+
+static struct g_hash *
+g_sched_hash_init(struct g_gsched *gsp, u_long *mask, int flags)
+{
+	struct g_hash *hash;
+
+	if (gsp->gs_priv_size == 0)
+		return (NULL);
+
+	hash = hashinit_flags(G_SCHED_HASH_SIZE, M_GEOM_SCHED, mask, flags);
+
+	return (hash);
+}
+
+static void
+g_sched_flush_classes(struct g_geom *gp)
+{
+	struct g_sched_softc *sc;
+	struct g_sched_class *cp, *cp2;
+	int i;
+
+	sc = gp->softc;
+
+	if (!sc->sc_hash || ticks - sc->sc_flush_ticks <= 0)
+		return;
+
+	for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
+		LIST_FOREACH_SAFE(cp, &sc->sc_hash[i], gsc_clist, cp2) {
+			if (cp->gsc_refs == 1 && ticks - cp->gsc_expire > 0)
+				g_sched_put_class(gp, cp->gsc_priv);
+		}
+	}
+
+	sc->sc_flush_ticks = ticks + me.gs_expire_secs * hz;
+}
+
+/*
+ * Wait for the completion of any outstanding request.  To ensure
+ * that this does not take forever the caller has to make sure that
+ * no new requests enter the scheduler before calling us.
+ *
+ * Must be called with the gp mutex held and topology locked.
+ */
+static int
+g_sched_wait_pending(struct g_geom *gp)
+{
+	struct g_sched_softc *sc = gp->softc;
+	int endticks = ticks + hz;
+
+	g_topology_assert();
+
+	while (sc->sc_pending && endticks - ticks >= 0)
+		msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
+
+	return (sc->sc_pending ? ETIMEDOUT : 0);
+}
+
+static int

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201004121637.o3CGbjSK080066>