Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 6 Feb 2009 14:51:32 +0000 (UTC)
From:      John Baldwin <jhb@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r188232 - in head/sys: kern sys
Message-ID:  <200902061451.n16EpWoJ064143@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jhb
Date: Fri Feb  6 14:51:32 2009
New Revision: 188232
URL: http://svn.freebsd.org/changeset/base/188232

Log:
  Expand the scope of the sysctllock sx lock to protect the sysctl tree itself.
  Back in 1.1 of kern_sysctl.c the sysctl() routine wired the "old" userland
  buffer for most sysctls (everything except kern.vnode.*).  I think to prevent
  issues with wiring too much memory it used a 'memlock' to serialize all
  sysctl(2) invocations, meaning that only one user buffer could be wired at
  a time.  In 5.0 the 'memlock' was converted to an sx lock and renamed to
  'sysctl lock'.  However, it still only served the purpose of serializing
  sysctls to avoid wiring too much memory and didn't actually protect the
  sysctl tree as its name suggested.  These changes expand the lock to actually
  protect the tree.
  
  Later on in 5.0, sysctl was changed to not wire buffers for requests by
  default (sysctl_handle_opaque() will still wire buffers larger than a single
  page, however).  As a result, user buffers are no longer wired as often.
  However, many sysctl handlers still wire user buffers, so it is still
  desirable to serialize userland sysctl requests.  Kernel sysctl requests
  are allowed to run in parallel, however.
  
  - Expose sysctl_lock()/sysctl_unlock() routines to exclusively lock the
    sysctl tree for a few places outside of kern_sysctl.c that manipulate
    the sysctl tree directly including the kernel linker and vfs_register().
  - sysctl_register() and sysctl_unregister() require the caller to lock
    the sysctl lock using sysctl_lock() and sysctl_unlock().  The rest of
    the public sysctl API manage the locking internally.
  - Add a locked variant of sysctl_remove_oid() for internal use so that
    external uses of the API do not need to be aware of locking requirements.
  - The kernel linker no longer needs Giant when manipulating the sysctl
    tree.
  - Add a missing break to the loop in vfs_register() so that we stop looking
    at the sysctl MIB once we have changed it.
  
  MFC after:	1 month

Modified:
  head/sys/kern/kern_linker.c
  head/sys/kern/kern_sysctl.c
  head/sys/kern/vfs_init.c
  head/sys/sys/sysctl.h

Modified: head/sys/kern/kern_linker.c
==============================================================================
--- head/sys/kern/kern_linker.c	Fri Feb  6 12:39:42 2009	(r188231)
+++ head/sys/kern/kern_linker.c	Fri Feb  6 14:51:32 2009	(r188232)
@@ -293,10 +293,10 @@ linker_file_register_sysctls(linker_file
 	if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
 		return;
 
-	mtx_lock(&Giant);
+	sysctl_lock();
 	for (oidp = start; oidp < stop; oidp++)
 		sysctl_register_oid(*oidp);
-	mtx_unlock(&Giant);
+	sysctl_unlock();
 }
 
 static void
@@ -310,10 +310,10 @@ linker_file_unregister_sysctls(linker_fi
 	if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
 		return;
 
-	mtx_lock(&Giant);
+	sysctl_lock();
 	for (oidp = start; oidp < stop; oidp++)
 		sysctl_unregister_oid(*oidp);
-	mtx_unlock(&Giant);
+	sysctl_unlock();
 }
 
 static int

Modified: head/sys/kern/kern_sysctl.c
==============================================================================
--- head/sys/kern/kern_sysctl.c	Fri Feb  6 12:39:42 2009	(r188231)
+++ head/sys/kern/kern_sysctl.c	Fri Feb  6 14:51:32 2009	(r188232)
@@ -65,24 +65,41 @@ static MALLOC_DEFINE(M_SYSCTLOID, "sysct
 static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
 
 /*
- * Locking - this locks the sysctl tree in memory.
+ * The sysctllock protects the MIB tree.  It also protects sysctl
+ * contexts used with dynamic sysctls.  The sysctl_register_oid() and
+ * sysctl_unregister_oid() routines require the sysctllock to already
+ * be held, so the sysctl_lock() and sysctl_unlock() routines are
+ * provided for the few places in the kernel which need to use that
+ * API rather than using the dynamic API.  Use of the dynamic API is
+ * strongly encouraged for most code.
+ *
+ * This lock is also used to serialize userland sysctl requests.  Some
+ * sysctls wire user memory, and serializing the requests limits the
+ * amount of wired user memory in use.
  */
 static struct sx sysctllock;
 
-#define	SYSCTL_LOCK()		sx_xlock(&sysctllock)
-#define	SYSCTL_UNLOCK()		sx_xunlock(&sysctllock)
-#define	SYSCTL_LOCK_ASSERT()	sx_assert(&sysctllock, SX_XLOCKED)
+#define	SYSCTL_SLOCK()		sx_slock(&sysctllock)
+#define	SYSCTL_SUNLOCK()	sx_sunlock(&sysctllock)
+#define	SYSCTL_XLOCK()		sx_xlock(&sysctllock)
+#define	SYSCTL_XUNLOCK()	sx_xunlock(&sysctllock)
+#define	SYSCTL_ASSERT_XLOCKED()	sx_assert(&sysctllock, SA_XLOCKED)
+#define	SYSCTL_ASSERT_LOCKED()	sx_assert(&sysctllock, SA_LOCKED)
 #define	SYSCTL_INIT()		sx_init(&sysctllock, "sysctl lock")
 
 static int sysctl_root(SYSCTL_HANDLER_ARGS);
 
 struct sysctl_oid_list sysctl__children; /* root list */
 
+static int	sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
+		    int recurse);
+
 static struct sysctl_oid *
 sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
 {
 	struct sysctl_oid *oidp;
 
+	SYSCTL_ASSERT_LOCKED();
 	SLIST_FOREACH(oidp, list, oid_link) {
 		if (strcmp(oidp->oid_name, name) == 0) {
 			return (oidp);
@@ -96,6 +113,19 @@ sysctl_find_oidname(const char *name, st
  *
  * Order by number in each list.
  */
+void
+sysctl_lock(void)
+{
+
+	SYSCTL_XLOCK();
+}
+
+void
+sysctl_unlock(void)
+{
+
+	SYSCTL_XUNLOCK();
+}
 
 void
 sysctl_register_oid(struct sysctl_oid *oidp)
@@ -108,6 +138,7 @@ sysctl_register_oid(struct sysctl_oid *o
 	 * First check if another oid with the same name already
 	 * exists in the parent's list.
 	 */
+	SYSCTL_ASSERT_XLOCKED();
 	p = sysctl_find_oidname(oidp->oid_name, parent);
 	if (p != NULL) {
 		if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
@@ -160,6 +191,7 @@ sysctl_unregister_oid(struct sysctl_oid 
 	struct sysctl_oid *p;
 	int error;
 
+	SYSCTL_ASSERT_XLOCKED();
 	error = ENOENT;
 	if (oidp->oid_number == OID_AUTO) {
 		error = EINVAL;
@@ -191,6 +223,12 @@ sysctl_ctx_init(struct sysctl_ctx_list *
 	if (c == NULL) {
 		return (EINVAL);
 	}
+
+	/*
+	 * No locking here, the caller is responsible for not adding
+	 * new nodes to a context until after this function has
+	 * returned.
+	 */
 	TAILQ_INIT(c);
 	return (0);
 }
@@ -209,8 +247,9 @@ sysctl_ctx_free(struct sysctl_ctx_list *
 	 * XXX This algorithm is a hack. But I don't know any
 	 * XXX better solution for now...
 	 */
+	SYSCTL_XLOCK();
 	TAILQ_FOREACH(e, clist, link) {
-		error = sysctl_remove_oid(e->entry, 0, 0);
+		error = sysctl_remove_oid_locked(e->entry, 0, 0);
 		if (error)
 			break;
 	}
@@ -227,19 +266,22 @@ sysctl_ctx_free(struct sysctl_ctx_list *
 		sysctl_register_oid(e1->entry);
 		e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
 	}
-	if (error)
+	if (error) {
+		SYSCTL_XUNLOCK();
 		return(EBUSY);
+	}
 	/* Now really delete the entries */
 	e = TAILQ_FIRST(clist);
 	while (e != NULL) {
 		e1 = TAILQ_NEXT(e, link);
-		error = sysctl_remove_oid(e->entry, 1, 0);
+		error = sysctl_remove_oid_locked(e->entry, 1, 0);
 		if (error)
 			panic("sysctl_remove_oid: corrupt tree, entry: %s",
 			    e->entry->oid_name);
 		free(e, M_SYSCTLOID);
 		e = e1;
 	}
+	SYSCTL_XUNLOCK();
 	return (error);
 }
 
@@ -249,6 +291,7 @@ sysctl_ctx_entry_add(struct sysctl_ctx_l
 {
 	struct sysctl_ctx_entry *e;
 
+	SYSCTL_ASSERT_XLOCKED();
 	if (clist == NULL || oidp == NULL)
 		return(NULL);
 	e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK);
@@ -263,6 +306,7 @@ sysctl_ctx_entry_find(struct sysctl_ctx_
 {
 	struct sysctl_ctx_entry *e;
 
+	SYSCTL_ASSERT_LOCKED();
 	if (clist == NULL || oidp == NULL)
 		return(NULL);
 	TAILQ_FOREACH(e, clist, link) {
@@ -284,13 +328,17 @@ sysctl_ctx_entry_del(struct sysctl_ctx_l
 
 	if (clist == NULL || oidp == NULL)
 		return (EINVAL);
+	SYSCTL_XLOCK();
 	e = sysctl_ctx_entry_find(clist, oidp);
 	if (e != NULL) {
 		TAILQ_REMOVE(clist, e, link);
+		SYSCTL_XUNLOCK();
 		free(e, M_SYSCTLOID);
 		return (0);
-	} else
+	} else {
+		SYSCTL_XUNLOCK();
 		return (ENOENT);
+	}
 }
 
 /*
@@ -302,9 +350,21 @@ sysctl_ctx_entry_del(struct sysctl_ctx_l
 int
 sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
 {
+	int error;
+
+	SYSCTL_XLOCK();
+	error = sysctl_remove_oid_locked(oidp, del, recurse);
+	SYSCTL_XUNLOCK();
+	return (error);
+}
+
+static int
+sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse)
+{
 	struct sysctl_oid *p;
 	int error;
 
+	SYSCTL_ASSERT_XLOCKED();
 	if (oidp == NULL)
 		return(EINVAL);
 	if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
@@ -323,7 +383,8 @@ sysctl_remove_oid(struct sysctl_oid *oid
 			SLIST_FOREACH(p, SYSCTL_CHILDREN(oidp), oid_link) {
 				if (!recurse)
 					return (ENOTEMPTY);
-				error = sysctl_remove_oid(p, del, recurse);
+				error = sysctl_remove_oid_locked(p, del,
+				    recurse);
 				if (error)
 					return (error);
 			}
@@ -368,6 +429,7 @@ sysctl_add_oid(struct sysctl_ctx_list *c
 	if (parent == NULL)
 		return(NULL);
 	/* Check if the node already exists, otherwise create it */
+	SYSCTL_XLOCK();
 	oidp = sysctl_find_oidname(name, parent);
 	if (oidp != NULL) {
 		if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
@@ -375,8 +437,10 @@ sysctl_add_oid(struct sysctl_ctx_list *c
 			/* Update the context */
 			if (clist != NULL)
 				sysctl_ctx_entry_add(clist, oidp);
+			SYSCTL_XUNLOCK();
 			return (oidp);
 		} else {
+			SYSCTL_XUNLOCK();
 			printf("can't re-use a leaf (%s)!\n", name);
 			return (NULL);
 		}
@@ -414,6 +478,7 @@ sysctl_add_oid(struct sysctl_ctx_list *c
 		sysctl_ctx_entry_add(clist, oidp);
 	/* Register this oid */
 	sysctl_register_oid(oidp);
+	SYSCTL_XUNLOCK();
 	return (oidp);
 }
 
@@ -427,12 +492,14 @@ sysctl_rename_oid(struct sysctl_oid *oid
 	char *newname;
 	void *oldname;
 
-	oldname = (void *)(uintptr_t)(const void *)oidp->oid_name;
 	len = strlen(name);
 	newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK);
 	bcopy(name, newname, len + 1);
 	newname[len] = '\0';
+	SYSCTL_XLOCK();
+	oldname = (void *)(uintptr_t)(const void *)oidp->oid_name;
 	oidp->oid_name = newname;
+	SYSCTL_XUNLOCK();
 	free(oldname, M_SYSCTLOID);
 }
 
@@ -444,15 +511,21 @@ sysctl_move_oid(struct sysctl_oid *oid, 
 {
 	struct sysctl_oid *oidp;
 
-	if (oid->oid_parent == parent)
+	SYSCTL_XLOCK();
+	if (oid->oid_parent == parent) {
+		SYSCTL_XUNLOCK();
 		return (0);
+	}
 	oidp = sysctl_find_oidname(oid->oid_name, parent);
-	if (oidp != NULL)
+	if (oidp != NULL) {
+		SYSCTL_XUNLOCK();
 		return (EEXIST);
+	}
 	sysctl_unregister_oid(oid);
 	oid->oid_parent = parent;
 	oid->oid_number = OID_AUTO;
 	sysctl_register_oid(oid);
+	SYSCTL_XUNLOCK();
 	return (0);
 }
 
@@ -467,8 +540,10 @@ sysctl_register_all(void *arg)
 	struct sysctl_oid **oidp;
 
 	SYSCTL_INIT();
+	SYSCTL_XLOCK();
 	SET_FOREACH(oidp, sysctl_set)
 		sysctl_register_oid(*oidp);
+	SYSCTL_XUNLOCK();
 }
 SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0);
 
@@ -498,6 +573,7 @@ sysctl_sysctl_debug_dump_node(struct sys
 	int k;
 	struct sysctl_oid *oidp;
 
+	SYSCTL_ASSERT_LOCKED();
 	SLIST_FOREACH(oidp, l, oid_link) {
 
 		for (k=0; k<i; k++)
@@ -556,6 +632,7 @@ sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
 	struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
 	char buf[10];
 
+	SYSCTL_ASSERT_LOCKED();
 	while (namelen) {
 		if (!lsp) {
 			snprintf(buf,sizeof(buf),"%d",*name);
@@ -607,6 +684,7 @@ sysctl_sysctl_next_ls(struct sysctl_oid_
 {
 	struct sysctl_oid *oidp;
 
+	SYSCTL_ASSERT_LOCKED();
 	*len = level;
 	SLIST_FOREACH(oidp, lsp, oid_link) {
 		*next = oidp->oid_number;
@@ -687,7 +765,7 @@ name2oid (char *name, int *oid, int *len
 	struct sysctl_oid_list *lsp = &sysctl__children;
 	char *p;
 
-	SYSCTL_LOCK_ASSERT();
+	SYSCTL_ASSERT_LOCKED();
 
 	if (!*name)
 		return (ENOENT);
@@ -745,7 +823,7 @@ sysctl_sysctl_name2oid(SYSCTL_HANDLER_AR
 	int error, oid[CTL_MAXNAME], len;
 	struct sysctl_oid *op = 0;
 
-	SYSCTL_LOCK_ASSERT();
+	SYSCTL_ASSERT_LOCKED();
 
 	if (!req->newlen) 
 		return (ENOENT);
@@ -1091,9 +1169,9 @@ kernel_sysctl(struct thread *td, int *na
 	req.newfunc = sysctl_new_kernel;
 	req.lock = REQ_LOCKED;
 
-	SYSCTL_LOCK();
+	SYSCTL_SLOCK();
 	error = sysctl_root(0, name, namelen, &req);
-	SYSCTL_UNLOCK();
+	SYSCTL_SUNLOCK();
 
 	if (req.lock == REQ_WIRED && req.validlen > 0)
 		vsunlock(req.oldptr, req.validlen);
@@ -1125,6 +1203,9 @@ kernel_sysctlbyname(struct thread *td, c
 	/*
 	 * XXX: Prone to a possible race condition between lookup and
 	 * execution? Maybe put locking around it?
+	 *
+	 * Userland is just as racy, so I think the current implementation
+	 * is fine.
 	 */
 
 	error = kernel_sysctl(td, oid, 2, oid, &oidlen,
@@ -1236,6 +1317,7 @@ sysctl_find_oid(int *name, u_int namelen
 	struct sysctl_oid *oid;
 	int indx;
 
+	SYSCTL_ASSERT_LOCKED();
 	oid = SLIST_FIRST(&sysctl__children);
 	indx = 0;
 	while (oid && indx < CTL_MAXNAME) {
@@ -1279,7 +1361,7 @@ sysctl_root(SYSCTL_HANDLER_ARGS)
 	struct sysctl_oid *oid;
 	int error, indx, lvl;
 
-	SYSCTL_LOCK_ASSERT();
+	SYSCTL_ASSERT_LOCKED();
 
 	error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
 	if (error)
@@ -1357,7 +1439,7 @@ struct sysctl_args {
 int
 __sysctl(struct thread *td, struct sysctl_args *uap)
 {
-	int error, name[CTL_MAXNAME];
+	int error, i, name[CTL_MAXNAME];
 	size_t j;
 
 	if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
@@ -1373,7 +1455,7 @@ __sysctl(struct thread *td, struct sysct
 	if (error && error != ENOMEM)
 		return (error);
 	if (uap->oldlenp) {
-		int i = copyout(&j, uap->oldlenp, sizeof(j));
+		i = copyout(&j, uap->oldlenp, sizeof(j));
 		if (i)
 			return (i);
 	}
@@ -1425,7 +1507,7 @@ userland_sysctl(struct thread *td, int *
 	req.newfunc = sysctl_new_user;
 	req.lock = REQ_LOCKED;
 
-	SYSCTL_LOCK();
+	SYSCTL_XLOCK();
 	CURVNET_SET(TD_TO_VNET(curthread));
 
 	for (;;) {
@@ -1438,7 +1520,7 @@ userland_sysctl(struct thread *td, int *
 	}
 
 	CURVNET_RESTORE();
-	SYSCTL_UNLOCK();
+	SYSCTL_XUNLOCK();
 
 	if (req.lock == REQ_WIRED && req.validlen > 0)
 		vsunlock(req.oldptr, req.validlen);

Modified: head/sys/kern/vfs_init.c
==============================================================================
--- head/sys/kern/vfs_init.c	Fri Feb  6 12:39:42 2009	(r188231)
+++ head/sys/kern/vfs_init.c	Fri Feb  6 14:51:32 2009	(r188232)
@@ -165,12 +165,15 @@ vfs_register(struct vfsconf *vfc)
 	 * preserved by re-registering the oid after modifying its
 	 * number.
 	 */
+	sysctl_lock();
 	SLIST_FOREACH(oidp, &sysctl__vfs_children, oid_link)
 		if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) {
 			sysctl_unregister_oid(oidp);
 			oidp->oid_number = vfc->vfc_typenum;
 			sysctl_register_oid(oidp);
+			break;
 		}
+	sysctl_unlock();
 
 	/*
 	 * Initialise unused ``struct vfsops'' fields, to use

Modified: head/sys/sys/sysctl.h
==============================================================================
--- head/sys/sys/sysctl.h	Fri Feb  6 12:39:42 2009	(r188231)
+++ head/sys/sys/sysctl.h	Fri Feb  6 14:51:32 2009	(r188232)
@@ -799,6 +799,8 @@ int	userland_sysctl(struct thread *td, i
 			size_t *retval, int flags);
 int	sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
 			int *nindx, struct sysctl_req *req);
+void	sysctl_lock(void);
+void	sysctl_unlock(void);
 int	sysctl_wire_old_buffer(struct sysctl_req *req, size_t len);
 
 #else	/* !_KERNEL */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200902061451.n16EpWoJ064143>