Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 15 Oct 2014 16:54:18 +0000 (UTC)
From:      Mateusz Guzik <mjg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-releng@freebsd.org
Subject:   svn commit: r273137 - in releng/10.1/sys: kern sys
Message-ID:  <201410151654.s9FGsIMF041920@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mjg
Date: Wed Oct 15 16:54:18 2014
New Revision: 273137
URL: https://svnweb.freebsd.org/changeset/base/273137

Log:
  MFC r273109:
  
  fget_unlocked currently reads 'fde' which is a structure consisting of
  serveral fields. In effect the read is inatomic and may result in
  obtaining file pointer with stale or incorrect capabilities.
  
  Example race is with dup2.
  
  Side effect is that capability checks can be circumvented.
  
  Fix the problem with introduction of sequence counters.
  
  MFC r269023,r272503,r272505,r272523,r272567,r272569,r272574:
  
  Prepare fget_unlocked for reading fd table only once.
  
  Some capsicum functions accept fdp + fd and lookup fde based on that.
  Add variants which accept fde.
  
  ===============================
  
  Add sequence counters with memory barriers.
  
  Current implementation is somewhat simplistic and hackish,
  will be improved later after possible memory barrier overhaul.
  
  ===============================
  
  Plug capability races.
  
  fp and appropriate capability lookups were not atomic, which could result in
  improper capabilities being checked.
  
  This could result either in protection bypass or in a spurious ENOTCAPABLE.
  
  Make fp + capability check atomic with the help of sequence counters.
  
  ===============================
  
  Put and #ifdef _KERNEL around the #include for opt_capsicum.h to
  hopefully allow the build to finish after r272505.
  
  ===============================
  
  filedesc: fix up breakage introduced in 272505
  
  Include sequence counter supports incoditionally [1]. This fixes reprted build
  problems with e.g. nvidia driver due to missing opt_capsicum.h.
  
  Replace fishy looking sizeof with offsetof. Make fde_seq the last member in
  order to simplify calculations.
  
  ===============================
  
  Keep struct filedescent comments within 80-char limit.
  
  ===============================
  
  seq_t needs to be visible to userspace
  
  Approved by:	re (kib)

Added:
  releng/10.1/sys/sys/seq.h
     - copied unchanged from r273109, stable/10/sys/sys/seq.h
Modified:
  releng/10.1/sys/kern/kern_descrip.c
  releng/10.1/sys/kern/sys_capability.c
  releng/10.1/sys/sys/capability.h
  releng/10.1/sys/sys/filedesc.h
Directory Properties:
  releng/10.1/   (props changed)

Modified: releng/10.1/sys/kern/kern_descrip.c
==============================================================================
--- releng/10.1/sys/kern/kern_descrip.c	Wed Oct 15 14:07:24 2014	(r273136)
+++ releng/10.1/sys/kern/kern_descrip.c	Wed Oct 15 16:54:18 2014	(r273137)
@@ -304,11 +304,18 @@ _fdfree(struct filedesc *fdp, int fd, in
 	struct filedescent *fde;
 
 	fde = &fdp->fd_ofiles[fd];
+#ifdef CAPABILITIES
+	if (!last)
+		seq_write_begin(&fde->fde_seq);
+#endif
 	filecaps_free(&fde->fde_caps);
 	if (last)
 		return;
-	bzero(fde, sizeof(*fde));
+	bzero(fde, fde_change_size);
 	fdunused(fdp, fd);
+#ifdef CAPABILITIES
+	seq_write_end(&fde->fde_seq);
+#endif
 }
 
 static inline void
@@ -899,13 +906,19 @@ do_dup(struct thread *td, int flags, int
 	/*
 	 * Duplicate the source descriptor.
 	 */
+#ifdef CAPABILITIES
+	seq_write_begin(&newfde->fde_seq);
+#endif
 	filecaps_free(&newfde->fde_caps);
-	*newfde = *oldfde;
+	memcpy(newfde, oldfde, fde_change_size);
 	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
 	if ((flags & DUP_CLOEXEC) != 0)
 		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
 	else
 		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
+#ifdef CAPABILITIES
+	seq_write_end(&newfde->fde_seq);
+#endif
 	*retval = new;
 
 	if (delfp != NULL) {
@@ -1804,6 +1817,9 @@ finstall(struct thread *td, struct file 
 	}
 	fhold(fp);
 	fde = &fdp->fd_ofiles[*fd];
+#ifdef CAPABILITIES
+	seq_write_begin(&fde->fde_seq);
+#endif
 	fde->fde_file = fp;
 	if ((flags & O_CLOEXEC) != 0)
 		fde->fde_flags |= UF_EXCLOSE;
@@ -1811,6 +1827,9 @@ finstall(struct thread *td, struct file 
 		filecaps_move(fcaps, &fde->fde_caps);
 	else
 		filecaps_fill(&fde->fde_caps);
+#ifdef CAPABILITIES
+	seq_write_end(&fde->fde_seq);
+#endif
 	FILEDESC_XUNLOCK(fdp);
 	return (0);
 }
@@ -2336,9 +2355,13 @@ int
 fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
 {
+#ifdef CAPABILITIES
+	struct filedescent fde;
+#endif
 	struct file *fp;
 	u_int count;
 #ifdef CAPABILITIES
+	seq_t seq;
 	cap_rights_t haverights;
 	int error;
 #endif
@@ -2358,17 +2381,27 @@ fget_unlocked(struct filedesc *fdp, int 
 	 * due to preemption.
 	 */
 	for (;;) {
+#ifdef CAPABILITIES
+		seq = seq_read(fd_seq(fdp, fd));
+		fde = fdp->fd_ofiles[fd];
+		if (!seq_consistent(fd_seq(fdp, fd), seq)) {
+			cpu_spinwait();
+			continue;
+		}
+		fp = fde.fde_file;
+#else
 		fp = fdp->fd_ofiles[fd].fde_file;
+#endif
 		if (fp == NULL)
 			return (EBADF);
 #ifdef CAPABILITIES
-		haverights = *cap_rights(fdp, fd);
+		haverights = *cap_rights_fde(&fde);
 		if (needrightsp != NULL) {
 			error = cap_check(&haverights, needrightsp);
 			if (error != 0)
 				return (error);
 			if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
-				error = cap_fcntl_check(fdp, fd, needfcntl);
+				error = cap_fcntl_check_fde(&fde, needfcntl);
 				if (error != 0)
 					return (error);
 			}
@@ -2383,7 +2416,11 @@ fget_unlocked(struct filedesc *fdp, int 
 		 */
 		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
 			continue;
+#ifdef	CAPABILITIES
+		if (seq_consistent_nomb(fd_seq(fdp, fd), seq))
+#else
 		if (fp == fdp->fd_ofiles[fd].fde_file)
+#endif
 			break;
 		fdrop(fp, curthread);
 	}
@@ -2740,6 +2777,7 @@ int
 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
     int openerror, int *indxp)
 {
+	struct filedescent *newfde, *oldfde;
 	struct file *fp;
 	int error, indx;
 
@@ -2783,17 +2821,32 @@ dupfdopen(struct thread *td, struct file
 			return (EACCES);
 		}
 		fhold(fp);
-		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
-		filecaps_copy(&fdp->fd_ofiles[dfd].fde_caps,
-		    &fdp->fd_ofiles[indx].fde_caps);
+		newfde = &fdp->fd_ofiles[indx];
+		oldfde = &fdp->fd_ofiles[dfd];
+#ifdef CAPABILITIES
+		seq_write_begin(&newfde->fde_seq);
+#endif
+		memcpy(newfde, oldfde, fde_change_size);
+		filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
+#ifdef CAPABILITIES
+		seq_write_end(&newfde->fde_seq);
+#endif
 		break;
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
-		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
-		bzero(&fdp->fd_ofiles[dfd], sizeof(fdp->fd_ofiles[dfd]));
+		newfde = &fdp->fd_ofiles[indx];
+		oldfde = &fdp->fd_ofiles[dfd];
+#ifdef CAPABILITIES
+		seq_write_begin(&newfde->fde_seq);
+#endif
+		memcpy(newfde, oldfde, fde_change_size);
+		bzero(oldfde, fde_change_size);
 		fdunused(fdp, dfd);
+#ifdef CAPABILITIES
+		seq_write_end(&newfde->fde_seq);
+#endif
 		break;
 	}
 	FILEDESC_XUNLOCK(fdp);

Modified: releng/10.1/sys/kern/sys_capability.c
==============================================================================
--- releng/10.1/sys/kern/sys_capability.c	Wed Oct 15 14:07:24 2014	(r273136)
+++ releng/10.1/sys/kern/sys_capability.c	Wed Oct 15 16:54:18 2014	(r273137)
@@ -199,11 +199,19 @@ cap_rights_to_vmprot(cap_rights_t *havep
  * any other way, as we want to keep all capability permission evaluation in
  * this one file.
  */
+
+cap_rights_t *
+cap_rights_fde(struct filedescent *fde)
+{
+
+	return (&fde->fde_rights);
+}
+
 cap_rights_t *
 cap_rights(struct filedesc *fdp, int fd)
 {
 
-	return (&fdp->fd_ofiles[fd].fde_rights);
+	return (cap_rights_fde(&fdp->fd_ofiles[fd]));
 }
 
 /*
@@ -486,24 +494,31 @@ out:
  * Test whether a capability grants the given fcntl command.
  */
 int
-cap_fcntl_check(struct filedesc *fdp, int fd, int cmd)
+cap_fcntl_check_fde(struct filedescent *fde, int cmd)
 {
 	uint32_t fcntlcap;
 
-	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
-	    ("%s: invalid fd=%d", __func__, fd));
-
 	fcntlcap = (1 << cmd);
 	KASSERT((CAP_FCNTL_ALL & fcntlcap) != 0,
 	    ("Unsupported fcntl=%d.", cmd));
 
-	if ((fdp->fd_ofiles[fd].fde_fcntls & fcntlcap) != 0)
+	if ((fde->fde_fcntls & fcntlcap) != 0)
 		return (0);
 
 	return (ENOTCAPABLE);
 }
 
 int
+cap_fcntl_check(struct filedesc *fdp, int fd, int cmd)
+{
+
+	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+	    ("%s: invalid fd=%d", __func__, fd));
+
+	return (cap_fcntl_check_fde(&fdp->fd_ofiles[fd], cmd));
+}
+
+int
 sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap)
 {
 	struct filedesc *fdp;

Modified: releng/10.1/sys/sys/capability.h
==============================================================================
--- releng/10.1/sys/sys/capability.h	Wed Oct 15 14:07:24 2014	(r273136)
+++ releng/10.1/sys/sys/capability.h	Wed Oct 15 16:54:18 2014	(r273137)
@@ -343,6 +343,7 @@ bool cap_rights_contains(const cap_right
 #define IN_CAPABILITY_MODE(td) ((td->td_ucred->cr_flags & CRED_FLAG_CAPMODE) != 0)
 
 struct filedesc;
+struct filedescent;
 
 /*
  * Test whether a capability grants the requested rights.
@@ -357,9 +358,11 @@ u_char	cap_rights_to_vmprot(cap_rights_t
  * For the purposes of procstat(1) and similar tools, allow kern_descrip.c to
  * extract the rights from a capability.
  */
+cap_rights_t	*cap_rights_fde(struct filedescent *fde);
 cap_rights_t	*cap_rights(struct filedesc *fdp, int fd);
 
 int	cap_ioctl_check(struct filedesc *fdp, int fd, u_long cmd);
+int	cap_fcntl_check_fde(struct filedescent *fde, int cmd);
 int	cap_fcntl_check(struct filedesc *fdp, int fd, int cmd);
 
 #else /* !_KERNEL */

Modified: releng/10.1/sys/sys/filedesc.h
==============================================================================
--- releng/10.1/sys/sys/filedesc.h	Wed Oct 15 14:07:24 2014	(r273136)
+++ releng/10.1/sys/sys/filedesc.h	Wed Oct 15 16:54:18 2014	(r273137)
@@ -38,6 +38,7 @@
 #include <sys/event.h>
 #include <sys/lock.h>
 #include <sys/priority.h>
+#include <sys/seq.h>
 #include <sys/sx.h>
 
 #include <machine/_limits.h>
@@ -50,14 +51,16 @@ struct filecaps {
 };
 
 struct filedescent {
-	struct file	*fde_file;		/* file structure for open file */
-	struct filecaps	 fde_caps;		/* per-descriptor rights */
-	uint8_t		 fde_flags;		/* per-process open file flags */
+	struct file	*fde_file;	/* file structure for open file */
+	struct filecaps	 fde_caps;	/* per-descriptor rights */
+	uint8_t		 fde_flags;	/* per-process open file flags */
+	seq_t		 fde_seq;	/* keep file and caps in sync */
 };
 #define	fde_rights	fde_caps.fc_rights
 #define	fde_fcntls	fde_caps.fc_fcntls
 #define	fde_ioctls	fde_caps.fc_ioctls
 #define	fde_nioctls	fde_caps.fc_nioctls
+#define	fde_change_size	(offsetof(struct filedescent, fde_seq))
 
 /*
  * This structure is used for the management of descriptors.  It may be
@@ -82,6 +85,7 @@ struct filedesc {
 	int	fd_holdleaderscount;	/* block fdfree() for shared close() */
 	int	fd_holdleaderswakeup;	/* fdfree() needs wakeup */
 };
+#define	fd_seq(fdp, fd)	(&(fdp)->fd_ofiles[(fd)].fde_seq)
 
 /*
  * Structure to keep track of (process leader, struct fildedesc) tuples.

Copied: releng/10.1/sys/sys/seq.h (from r273109, stable/10/sys/sys/seq.h)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ releng/10.1/sys/sys/seq.h	Wed Oct 15 16:54:18 2014	(r273137, copy of r273109, stable/10/sys/sys/seq.h)
@@ -0,0 +1,146 @@
+/*-
+ * Copyright (c) 2014 Mateusz Guzik <mjg@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_SEQ_H_
+#define _SYS_SEQ_H_
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#endif
+#include <sys/types.h>
+
+/*
+ * seq_t may be included in structs visible to userspace
+ */
+typedef uint32_t seq_t;
+
+#ifdef _KERNEL
+
+/*
+ * Typical usage:
+ *
+ * writers:
+ * 	lock_exclusive(&obj->lock);
+ * 	seq_write_begin(&obj->seq);
+ * 	.....
+ * 	seq_write_end(&obj->seq);
+ * 	unlock_exclusive(&obj->unlock);
+ *
+ * readers:
+ * 	obj_t lobj;
+ * 	seq_t seq;
+ *
+ * 	for (;;) {
+ * 		seq = seq_read(&gobj->seq);
+ * 		lobj = gobj;
+ * 		if (seq_consistent(&gobj->seq, seq))
+ * 			break;
+ * 		cpu_spinwait();
+ * 	}
+ * 	foo(lobj);
+ */		
+
+/* A hack to get MPASS macro */
+#include <sys/lock.h>
+
+#include <machine/cpu.h>
+
+/*
+ * This is a temporary hack until memory barriers are cleaned up.
+ *
+ * atomic_load_acq_int at least on amd64 provides a full memory barrier,
+ * in a way which affects perforance.
+ *
+ * Hack below covers all architectures and avoids most of the penalty at least
+ * on amd64.
+ */
+static __inline int
+atomic_load_acq_rmb_int(volatile u_int *p)
+{
+	volatile u_int v;
+
+	v = *p;
+	atomic_load_acq_int(&v);
+	return (v);
+}
+
+static __inline bool
+seq_in_modify(seq_t seqp)
+{
+
+	return (seqp & 1);
+}
+
+static __inline void
+seq_write_begin(seq_t *seqp)
+{
+
+	MPASS(!seq_in_modify(*seqp));
+	atomic_add_acq_int(seqp, 1);
+}
+
+static __inline void
+seq_write_end(seq_t *seqp)
+{
+
+	atomic_add_rel_int(seqp, 1);
+	MPASS(!seq_in_modify(*seqp));
+}
+
+static __inline seq_t
+seq_read(seq_t *seqp)
+{
+	seq_t ret;
+
+	for (;;) {
+		ret = atomic_load_acq_rmb_int(seqp);
+		if (seq_in_modify(ret)) {
+			cpu_spinwait();
+			continue;
+		}
+		break;
+	}
+
+	return (ret);
+}
+
+static __inline seq_t
+seq_consistent(seq_t *seqp, seq_t oldseq)
+{
+
+	return (atomic_load_acq_rmb_int(seqp) == oldseq);
+}
+
+static __inline seq_t
+seq_consistent_nomb(seq_t *seqp, seq_t oldseq)
+{
+
+	return (*seqp == oldseq);
+}
+
+#endif	/* _KERNEL */
+#endif	/* _SYS_SEQ_H_ */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201410151654.s9FGsIMF041920>