Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 24 Jun 2008 17:46:39 -0400
From:      Christopher Thunes <c2thunes@brewtab.com>
To:        freebsd-jail@freebsd.org
Subject:   Memory limits on 7.0
Message-ID:  <48616B3F.4030705@brewtab.com>

next in thread | raw e-mail | index | archive | help
This is a multi-part message in MIME format.
--------------040604060800000202080402
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit

Hey everyone,
   I spent some time working on getting cdjones' memory limit patches 
updated for 7.0 and beyond and thought I'd post my progress. I've 
attached my current patch which implements memory limits on 7.0-RELEASE, 
but only for the older (and default in -RELEASE) bsd4 scheduler (won't 
work at all on ULE). I haven't yet started work for ULE or getting CPU 
sharing working. This patch also includes fixes for problems in the 
original cdjones patches. If you want to give it a whirl it should apply 
cleanly to a 7.0-RELEASE source tree and if you run into any issues let 
me know.

- Chris

--------------040604060800000202080402
Content-Type: text/x-diff;
 name="memory_limits_70.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="memory_limits_70.patch"

diff -burN src.old/lib/libc/sys/Symbol.map src.new/lib/libc/sys/Symbol.map
--- src.old/lib/libc/sys/Symbol.map	2007-08-21 21:56:35.000000000 -0400
+++ src.new/lib/libc/sys/Symbol.map	2008-05-28 19:55:04.000000000 -0400
@@ -131,6 +131,7 @@
 	issetugid;
 	jail;
 	jail_attach;
+        jail_set_resource_limits;
 	kenv;
 	kevent;
 	kill;
@@ -580,6 +581,8 @@
 	__sys_jail;
 	_jail_attach;
 	__sys_jail_attach;
+        _jail_set_resource_limits;
+        __sys_jail_set_resource_limits;
 	_kenv;
 	__sys_kenv;
 	_kevent;
diff -burN src.old/sys/kern/init_sysent.c src.new/sys/kern/init_sysent.c
--- src.old/sys/kern/init_sysent.c	2007-08-16 01:32:25.000000000 -0400
+++ src.new/sys/kern/init_sysent.c	2008-05-28 19:49:37.000000000 -0400
@@ -2,8 +2,8 @@
  * System call switch table.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/init_sysent.c,v 1.230 2007/08/16 05:32:25 davidxu Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.232 2007/07/04 22:47:37 peter Exp 
+ * $FreeBSD$
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.233 2007/08/16 05:26:41 davidxu Exp 
  */
 
 #include "opt_compat.h"
@@ -511,4 +511,5 @@
 	{ AS(truncate_args), (sy_call_t *)truncate, AUE_TRUNCATE, NULL, 0, 0 },	/* 479 = truncate */
 	{ AS(ftruncate_args), (sy_call_t *)ftruncate, AUE_FTRUNCATE, NULL, 0, 0 },	/* 480 = ftruncate */
 	{ AS(thr_kill2_args), (sy_call_t *)thr_kill2, AUE_KILL, NULL, 0, 0 },	/* 481 = thr_kill2 */
+	{ AS(jail_set_resource_limits_args), (sy_call_t *)jail_set_resource_limits, AUE_NULL, NULL, 0, 0 },	/* 482 = jail_set_resource_limits */
 };
diff -burN src.old/sys/kern/kern_jail.c src.new/sys/kern/kern_jail.c
--- src.old/sys/kern/kern_jail.c	2007-04-13 19:54:22.000000000 -0400
+++ src.new/sys/kern/kern_jail.c	2008-06-19 03:16:43.000000000 -0400
@@ -5,8 +5,38 @@
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
+ *
+ *  Portions copyright (c) 2006 Chris Jones,
+ *  All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Chris Jones
+ * thanks to the support of Google's Summer of Code program and
+ * mentoring by Kip Macy.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. 
+ *
  */
 
+
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD: src/sys/kern/kern_jail.c,v 1.70 2007/04/13 23:54:22 pjd Exp $");
 
@@ -15,6 +45,7 @@
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/kernel.h>
+#include <sys/kthread.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/sysproto.h>
@@ -33,6 +64,12 @@
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_map.h>
+#include <vm/vm_pageout.h>
 #include <net/if.h>
 #include <netinet/in.h>
 
@@ -78,12 +115,27 @@
     &jail_mount_allowed, 0,
     "Processes in jail can mount/unmount jail-friendly file systems");
 
+int     jail_limit_memory = 0;
+SYSCTL_INT(_security_jail, OID_AUTO, limit_jail_memory, CTLFLAG_RW,
+	   &jail_limit_memory, 0,
+	   "Limit jails' memory usage");
+
+int     jail_memory_pager_interval = 5;
+SYSCTL_INT(_security_jail, OID_AUTO, jail_pager_interval,
+	   CTLTYPE_INT | CTLFLAG_RW,
+	   &jail_memory_pager_interval, 0,
+	   "Interval between jail memory limit checks");
+
+
 /* allprison, lastprid, and prisoncount are protected by allprison_lock. */
 struct	prisonlist allprison;
 struct	sx allprison_lock;
 int	lastprid = 0;
 int	prisoncount = 0;
 
+/* Make the sched_lock visible */
+extern struct mtx sched_lock;
+
 /*
  * List of jail services. Protected by allprison_lock.
  */
@@ -114,6 +166,104 @@
 
 SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
 
+/*
+ * Per-jail pager kernel thread; arg is the struct prison to police.
+ * Every jail_memory_pager_interval seconds it records the jail's resident
+ * memory in pr_mem_usage and, when that exceeds pr_mem_limit, asks the VM
+ * to deactivate pages of the jail's processes.  Nothing is done unless
+ * jail_limit_memory and pr_mem_limit are both non-zero.  The thread exits
+ * once J_PAGER_TD_DIE has been stored through pr_pager_flags_ptr.
+ */
+static void
+jpager_td(void *arg)
+{
+	struct proc *p;
+	struct prison *pr = arg;
+	struct thread *td;
+	long limit, cursize, newsize, usage;
+	int breakout;
+	int flags = J_PAGER_TD_ACTIVE;
+	pr->pr_pager_flags_ptr = &flags;
+	
+	for (;;) {
+		if (flags & J_PAGER_TD_DIE)
+			break;
+		if (jail_limit_memory && pr->pr_mem_limit) {
+			/* 
+			 * TODO: consider whether it might be better to start
+			 * pushing back when we approach the limit, rather than
+			 * when we hit it.
+			 * 
+			 */
+			limit = prison_memory_limit(pr);
+			usage = prison_memory(pr);
+			/* Copy the current memory usage to the prison struct */
+			mtx_lock(&pr->pr_mtx);
+			pr->pr_mem_usage = usage;
+			mtx_unlock(&pr->pr_mtx);
+
+			/*
+			 * The logic from vm_daemon() really needs to go here.
+			 * Problem: we want to push things below their rlimits,
+			 * and vm_daemon doesn't do that.  It'd be better to 
+			 * refactor vm_daemon to fit, but this'll do for now.
+			 *
+			 */
+			if ((usage - limit) > 0) {
+				sx_slock(&allproc_lock);
+				LIST_FOREACH(p, &allproc, p_list) {
+					/* Lock the proc before reading p_ucred, as prison_memory() does. */
+					PROC_LOCK(p);
+					if (pr != p->p_ucred->cr_prison ||
+					    !p->p_vmspace ||
+					    (p->p_flag & (P_SYSTEM | P_WEXIT))) {
+						PROC_UNLOCK(p);
+						continue;
+					}
+					mtx_lock_spin(&sched_lock);
+					breakout = 0;
+					FOREACH_THREAD_IN_PROC(p, td) {
+						if (!TD_ON_RUNQ(td) &&
+						    !TD_IS_RUNNING(td) &&
+						    !TD_IS_SLEEPING(td)) {
+							breakout = 1;
+							break;
+						}
+					}
+					mtx_unlock_spin(&sched_lock);
+					if (breakout) {
+						PROC_UNLOCK(p);
+						continue;
+					}
+					/* NOTE: we differ here from vm_daemon b/c we don't 
+					 * care about the rlimit; things that are exceeding that will
+					 * get caught in due course.  We need, however, to decrease
+					 * the pressure on our permitted memory allocation.  Fortunately, 
+					 * we only care about eventually hitting the limit, so if we
+					 * don't get there right away, it's okay.
+					 */      
+					/* TODO: this arbitrarily reduces each process's space by
+					 * 6.25% (until it's completely swapped out) while
+					 * we're under memory pressure.  A better way would be 
+					 * to either hit large processes first, or to hit the
+					 * least-active processes first, or go proportionally,
+					 * or .... 
+					 */
+					newsize = cursize = vmspace_resident_count(p->p_vmspace);
+					newsize -= newsize / 16;
+					if (cursize < 0)
+						newsize = 0;
+					PROC_UNLOCK(p);
+					vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, newsize);
+				} /* end LIST_FOREACH procs */
+				sx_sunlock(&allproc_lock);
+			}
+		}
+		tsleep(pr, 0, "-", jail_memory_pager_interval * hz); 
+	}
+	kthread_exit(0);
+}
+
 /*
  * struct jail_args {
  *	struct jail *jail;
@@ -127,6 +277,7 @@
 	struct prison_service *psrv;
 	struct jail j;
 	struct jail_attach_args jaa;
+        struct proc *j_pager_proc = NULL;
 	int vfslocked, error, tryprid;
 
 	error = copyin(uap->jail, &j, sizeof(j));
@@ -135,6 +286,7 @@
 	if (j.version != 0)
 		return (EINVAL);
 
+
 	MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
 	mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
 	pr->pr_ref = 1;
@@ -156,7 +308,10 @@
 		goto e_dropvnref;
 	pr->pr_ip = j.ip_number;
 	pr->pr_linux = NULL;
+        pr->pr_sched_shares = j.sched_shares;
 	pr->pr_securelevel = securelevel;
+        pr->pr_mem_limit = j.mem_limit;
+
 	if (prison_service_slots == 0)
 		pr->pr_slots = NULL;
 	else {
@@ -169,6 +324,7 @@
 	tryprid = lastprid + 1;
 	if (tryprid == JAIL_MAX)
 		tryprid = 1;
+
 next:
 	LIST_FOREACH(tpr, &allprison, pr_list) {
 		if (tpr->pr_id == tryprid) {
@@ -190,6 +346,11 @@
 	}
 	sx_sunlock(&allprison_lock);
 
+	/* kthread_create() hands back the new proc through its third argument. */
+	if ((error = kthread_create(jpager_td, pr, &j_pager_proc, 0, 0, "jpager %d", pr->pr_id)) != 0)
+		goto e_dropprref;
+	KASSERT(j_pager_proc != NULL, ("NULL j_pager_proc"));
+	pr->pr_pager = j_pager_proc;
 	error = jail_attach(td, &jaa);
 	if (error)
 		goto e_dropprref;
@@ -199,6 +360,11 @@
 	td->td_retval[0] = jaa.jid;
 	return (0);
 e_dropprref:
+        if (j_pager_proc != NULL) {
+            *pr->pr_pager_flags_ptr = J_PAGER_TD_DIE;
+            wakeup(pr);
+        }
+
 	sx_xlock(&allprison_lock);
 	LIST_REMOVE(pr, pr_list);
 	prisoncount--;
@@ -267,11 +433,13 @@
 
 	newcred = crget();
 	PROC_LOCK(p);
+
 	oldcred = p->p_ucred;
 	setsugid(p);
 	crcopy(newcred, oldcred);
 	newcred->cr_prison = pr;
 	p->p_ucred = newcred;
+
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
@@ -314,6 +482,9 @@
 	pr->pr_ref--;
 	if (pr->pr_ref == 0) {
 		mtx_unlock(&pr->pr_mtx);
+                *pr->pr_pager_flags_ptr = J_PAGER_TD_DIE;
+                wakeup(pr);
+
 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 		return;
@@ -436,6 +607,92 @@
 	return (ok);
 }
 
+/* Given a prison, return the resident memory of its processes in bytes. */
+long
+prison_memory(struct prison *pr)
+{
+	struct proc *p;
+	long mem_used = 0;
+	
+	/* 
+	 * TODO: this walks every process for each jail; doing one pass
+	 * per period and updating all jails would be more efficient.
+	 * allproc_lock is held (shared) while the process list is walked.
+	 */
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		PROC_LOCK(p);
+		if (!jailed(p->p_ucred) ||
+		    (pr != p->p_ucred->cr_prison) ||
+		    !p->p_vmspace) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		mem_used += vmspace_resident_count(p->p_vmspace);
+		PROC_UNLOCK(p);
+	}
+	sx_sunlock(&allproc_lock);
+	mem_used *= PAGE_SIZE;	/* resident count is in pages */
+	return mem_used;
+}
+
+/* Given a prison, return its permitted memory usage in bytes (0 = no limit). */
+long
+prison_memory_limit(struct prison *pr)
+{
+	long memlimit;	/* a byte count, so not a vm_pindex_t page index */
+	mtx_lock(&pr->pr_mtx);
+	memlimit = (long)pr->pr_mem_limit;
+	mtx_unlock(&pr->pr_mtx);
+	return (memlimit);
+}
+
+/* 
+ * Change resource limit for a prison.
+ * 
+ * unsigned int jid: id of jail to mess with
+ *
+ * int cpushares:  0 -> remove prison from cpu limits
+ *                -1 -> don't change existing shares
+ *                >0 -> set cpu shares
+ *
+ * int memlimit:   0 -> remove prison from mem limits
+ *                -1 -> don't change existing limit
+ *                >0 -> set memory limit (bytes)
+ *
+ * Returns 0, the suser() error, or EINVAL for an unknown jid or a
+ * value below -1.  TODO: would a writable sysctl be better?
+ */
+int
+jail_set_resource_limits(struct thread *td, struct jail_set_resource_limits_args *uap)
+{
+	struct prison *pr;
+	int error;
+
+	error = suser(td);
+	if (error)
+		return (error);
+	if (uap->cpushares < -1 || uap->memlimit < -1)
+		return (EINVAL);	/* would wrap in the unsigned fields */
+	sx_xlock(&allprison_lock);
+	LIST_FOREACH(pr, &allprison, pr_list) {
+		if (pr->pr_id == uap->jid)
+			break;
+	}
+	if (NULL == pr) {
+		sx_xunlock(&allprison_lock);
+		return (EINVAL);
+	}
+	mtx_lock(&pr->pr_mtx);
+	if (-1 != uap->cpushares)
+		pr->pr_sched_shares = uap->cpushares;
+	if (-1 != uap->memlimit)
+		pr->pr_mem_limit = uap->memlimit;
+	mtx_unlock(&pr->pr_mtx);
+	sx_xunlock(&allprison_lock);
+	return (0);
+}
+
 /*
  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
  */
@@ -955,9 +1212,15 @@
 		xp->pr_id = pr->pr_id;
 		xp->pr_ip = pr->pr_ip;
 		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
+
 		mtx_lock(&pr->pr_mtx);
+		xp->pr_sched_shares = pr->pr_sched_shares;
+		xp->pr_estcpu = pr->pr_estcpu;
+		xp->pr_mem_limit = pr->pr_mem_limit;
+		xp->pr_mem_usage = pr->pr_mem_usage;
 		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
 		mtx_unlock(&pr->pr_mtx);
+
 		xp++;
 	}
 	sx_sunlock(&allprison_lock);
diff -burN src.old/sys/kern/syscalls.c src.new/sys/kern/syscalls.c
--- src.old/sys/kern/syscalls.c	2007-08-16 01:32:26.000000000 -0400
+++ src.new/sys/kern/syscalls.c	2008-05-28 19:49:37.000000000 -0400
@@ -2,8 +2,8 @@
  * System call names.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/syscalls.c,v 1.214 2007/08/16 05:32:26 davidxu Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.232 2007/07/04 22:47:37 peter Exp 
+ * $FreeBSD$
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.233 2007/08/16 05:26:41 davidxu Exp 
  */
 
 const char *syscallnames[] = {
@@ -489,4 +489,5 @@
 	"truncate",			/* 479 = truncate */
 	"ftruncate",			/* 480 = ftruncate */
 	"thr_kill2",			/* 481 = thr_kill2 */
+	"jail_set_resource_limits",			/* 482 = jail_set_resource_limits */
 };
diff -burN src.old/sys/kern/syscalls.master src.new/sys/kern/syscalls.master
--- src.old/sys/kern/syscalls.master	2007-08-16 01:26:41.000000000 -0400
+++ src.new/sys/kern/syscalls.master	2008-05-28 11:03:25.000000000 -0400
@@ -847,5 +847,7 @@
 479	AUE_TRUNCATE	STD	{ int truncate(char *path, off_t length); }
 480	AUE_FTRUNCATE	STD	{ int ftruncate(int fd, off_t length); }
 481	AUE_KILL	STD	{ int thr_kill2(pid_t pid, long id, int sig); }
+482	AUE_NULL	STD	{ int jail_set_resource_limits(unsigned int jid, \
+				    int cpushares, int memlimit); }
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master
diff -burN src.old/sys/kern/systrace_args.c src.new/sys/kern/systrace_args.c
--- src.old/sys/kern/systrace_args.c	2007-08-16 01:32:26.000000000 -0400
+++ src.new/sys/kern/systrace_args.c	2008-05-28 19:49:37.000000000 -0400
@@ -2,7 +2,7 @@
  * System call argument to DTrace register array converstion.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/systrace_args.c,v 1.14 2007/08/16 05:32:26 davidxu Exp $
+ * $FreeBSD$
  * This file is part of the DTrace syscall provider.
  */
 
@@ -2871,6 +2871,15 @@
 		*n_args = 3;
 		break;
 	}
+	/* jail_set_resource_limits */
+	case 482: {
+		struct jail_set_resource_limits_args *p = params;
+		uarg[0] = p->jid; /* unsigned int */
+		iarg[1] = p->cpushares; /* int */
+		iarg[2] = p->memlimit; /* int */
+		*n_args = 3;
+		break;
+	}
 	default:
 		*n_args = 0;
 		break;
diff -burN src.old/sys/sys/jail.h src.new/sys/sys/jail.h
--- src.old/sys/sys/jail.h	2007-04-05 19:19:13.000000000 -0400
+++ src.new/sys/sys/jail.h	2008-05-28 09:35:21.000000000 -0400
@@ -18,6 +18,8 @@
 	char		*path;
 	char		*hostname;
 	u_int32_t	ip_number;
+        unsigned int    sched_shares;
+        unsigned int    mem_limit;
 };
 
 struct xprison {
@@ -26,13 +28,24 @@
 	char		 pr_path[MAXPATHLEN];
 	char 		 pr_host[MAXHOSTNAMELEN];
 	u_int32_t	 pr_ip;
+        unsigned int     pr_sched_shares;
+        unsigned int     pr_estcpu;
+        unsigned int     pr_mem_limit;
+        unsigned int     pr_mem_usage;
 };
-#define	XPRISON_VERSION	1
+#define	XPRISON_VERSION	2
+
+#define JAIL_MINIMUM_SHARES 1
+
+#define J_PAGER_TD_ACTIVE 0x01
+#define J_PAGER_TD_DIE    0x02
+#define J_PAGER_TD_DEAD   0x04
 
 #ifndef _KERNEL
 
 int jail(struct jail *);
 int jail_attach(int);
+int jail_set_resource_limits(unsigned int, int, int);
 
 #else /* _KERNEL */
 
@@ -73,6 +86,12 @@
 	int		 pr_securelevel;		/* (p) securelevel */
 	struct task	 pr_task;			/* (d) destroy task */
 	struct mtx	 pr_mtx;
+	u_int32_t        pr_sched_shares;		/* (p) jail priority */
+	u_int		 pr_estcpu;			/* (p) est. cpu of jail */
+        struct proc     *pr_pager;                      /* (c) pager pid */
+        int             *pr_pager_flags_ptr;            /* (p) communication to pager */
+        size_t           pr_mem_limit;                  /* (p) memory allocation limit */
+        size_t           pr_mem_usage;                  /* (p) memory in use */
 	void		**pr_slots;			/* (p) additional data */
 };
 #endif /* _KERNEL || _WANT_PRISON */
@@ -113,6 +132,8 @@
 void prison_hold(struct prison *pr);
 int prison_if(struct ucred *cred, struct sockaddr *sa);
 int prison_ip(struct ucred *cred, int flag, u_int32_t *ip);
+long prison_memory(struct prison *pr);
+long prison_memory_limit(struct prison *pr);
 int prison_priv_check(struct ucred *cred, int priv);
 void prison_remote_ip(struct ucred *cred, int flags, u_int32_t *ip);
 
diff -burN src.old/sys/sys/syscall.h src.new/sys/sys/syscall.h
--- src.old/sys/sys/syscall.h	2007-08-16 01:32:26.000000000 -0400
+++ src.new/sys/sys/syscall.h	2008-05-28 19:49:37.000000000 -0400
@@ -2,8 +2,8 @@
  * System call numbers.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/sys/syscall.h,v 1.211 2007/08/16 05:32:26 davidxu Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.232 2007/07/04 22:47:37 peter Exp 
+ * $FreeBSD$
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.233 2007/08/16 05:26:41 davidxu Exp 
  */
 
 #define	SYS_syscall	0
@@ -401,4 +401,5 @@
 #define	SYS_truncate	479
 #define	SYS_ftruncate	480
 #define	SYS_thr_kill2	481
-#define	SYS_MAXSYSCALL	482
+#define	SYS_jail_set_resource_limits	482
+#define	SYS_MAXSYSCALL	483
diff -burN src.old/sys/sys/syscall.mk src.new/sys/sys/syscall.mk
--- src.old/sys/sys/syscall.mk	2007-08-16 01:32:26.000000000 -0400
+++ src.new/sys/sys/syscall.mk	2008-05-28 19:49:37.000000000 -0400
@@ -1,7 +1,7 @@
 # FreeBSD system call names.
 # DO NOT EDIT-- this file is automatically generated.
-# $FreeBSD: src/sys/sys/syscall.mk,v 1.166 2007/08/16 05:32:26 davidxu Exp $
-# created from FreeBSD: src/sys/kern/syscalls.master,v 1.232 2007/07/04 22:47:37 peter Exp 
+# $FreeBSD$
+# created from FreeBSD: src/sys/kern/syscalls.master,v 1.233 2007/08/16 05:26:41 davidxu Exp 
 MIASM =  \
 	syscall.o \
 	exit.o \
@@ -349,4 +349,5 @@
 	lseek.o \
 	truncate.o \
 	ftruncate.o \
-	thr_kill2.o
+	thr_kill2.o \
+	jail_set_resource_limits.o
diff -burN src.old/sys/sys/sysproto.h src.new/sys/sys/sysproto.h
--- src.old/sys/sys/sysproto.h	2007-08-16 01:32:26.000000000 -0400
+++ src.new/sys/sys/sysproto.h	2008-05-28 19:49:37.000000000 -0400
@@ -2,8 +2,8 @@
  * System call prototypes.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/sys/sysproto.h,v 1.215 2007/08/16 05:32:26 davidxu Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.232 2007/07/04 22:47:37 peter Exp 
+ * $FreeBSD$
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.233 2007/08/16 05:26:41 davidxu Exp 
  */
 
 #ifndef _SYS_SYSPROTO_H_
@@ -1520,6 +1520,11 @@
 	char id_l_[PADL_(long)]; long id; char id_r_[PADR_(long)];
 	char sig_l_[PADL_(int)]; int sig; char sig_r_[PADR_(int)];
 };
+struct jail_set_resource_limits_args {
+	char jid_l_[PADL_(unsigned int)]; unsigned int jid; char jid_r_[PADR_(unsigned int)];
+	char cpushares_l_[PADL_(int)]; int cpushares; char cpushares_r_[PADR_(int)];
+	char memlimit_l_[PADL_(int)]; int memlimit; char memlimit_r_[PADR_(int)];
+};
 int	nosys(struct thread *, struct nosys_args *);
 void	sys_exit(struct thread *, struct sys_exit_args *);
 int	fork(struct thread *, struct fork_args *);
@@ -1859,6 +1864,7 @@
 int	truncate(struct thread *, struct truncate_args *);
 int	ftruncate(struct thread *, struct ftruncate_args *);
 int	thr_kill2(struct thread *, struct thr_kill2_args *);
+int	jail_set_resource_limits(struct thread *, struct jail_set_resource_limits_args *);
 
 #ifdef COMPAT_43
 
@@ -2423,6 +2429,7 @@
 #define	SYS_AUE_truncate	AUE_TRUNCATE
 #define	SYS_AUE_ftruncate	AUE_FTRUNCATE
 #define	SYS_AUE_thr_kill2	AUE_KILL
+#define	SYS_AUE_jail_set_resource_limits	AUE_NULL
 
 #undef PAD_
 #undef PADL_
diff -burN src.old/sys/vm/vm_pageout.c src.new/sys/vm/vm_pageout.c
--- src.old/sys/vm/vm_pageout.c	2007-09-25 02:25:06.000000000 -0400
+++ src.new/sys/vm/vm_pageout.c	2008-05-28 13:05:44.000000000 -0400
@@ -208,7 +208,6 @@
 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
 
 #if !defined(NO_SWAPPING)
-static void vm_pageout_map_deactivate_pages(vm_map_t, long);
 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
 static void vm_req_vmdaemon(int req);
 #endif
@@ -594,7 +593,7 @@
  * deactivate some number of pages in a map, try to do it fairly, but
  * that is really hard to do.
  */
-static void
+void
 vm_pageout_map_deactivate_pages(map, desired)
 	vm_map_t map;
 	long desired;
diff -burN src.old/sys/vm/vm_pageout.h src.new/sys/vm/vm_pageout.h
--- src.old/sys/vm/vm_pageout.h	2005-01-06 21:29:27.000000000 -0500
+++ src.new/sys/vm/vm_pageout.h	2008-05-28 09:37:17.000000000 -0400
@@ -87,6 +87,8 @@
  *	Exported routines.
  */
 
+void vm_pageout_map_deactivate_pages(vm_map_t map, long desired);
+
 /*
  *	Signal pageout-daemon and wait for it.
  */
diff -burN src.old/usr.sbin/jail/jail.8 src.new/usr.sbin/jail/jail.8
--- src.old/usr.sbin/jail/jail.8	2007-04-05 17:17:52.000000000 -0400
+++ src.new/usr.sbin/jail/jail.8	2008-05-28 19:58:58.000000000 -0400
@@ -45,6 +45,8 @@
 .Op Fl J Ar jid_file
 .Op Fl s Ar securelevel
 .Op Fl l u Ar username | Fl U Ar username
+.Op Fl S Ar cpu_shares
+.Op Fl M Ar mem_limit
 .Ar path hostname ip-number command ...
 .Sh DESCRIPTION
 The
@@ -88,6 +90,10 @@
 The user name from jailed environment as whom the
 .Ar command
 should run.
+.It Fl S Ar cpu_shares
+CPU shares to assign to the prison.
+.It Fl M Ar mem_limit
+Amount of memory (in MB) to allow the prison to use.
 .It Ar path
 Directory which is to be the root of the prison.
 .It Ar hostname
@@ -550,6 +556,17 @@
 This MIB entry determines if a privileged user inside a jail will be
 able to mount and unmount file system types marked as jail-friendly.
 The
+.It Va security.jail.limit_jail_memory, Va security.jail.jail_pager_interval
+These MIB entries determine whether and how often (in seconds) a
+jail's memory-limit monitoring daemon will run, and consequently the 
+period during which a jail can be overcommitted for resident memory.
+.It Va kern.sched.limit_jail_cpu
+This MIB entry sets whether CPU usage limits will be enforced 
+against processes in jails with CPU limits.
+.It Va kern.sched.system_cpu_shares
+Number of CPU usage shares to allocate to unjailed processes for the 
+purposes of determining CPU usage permitted for jailed processes.  
+Unjailed processes are not subject to CPU usage limits.
 .Xr lsvfs 1
 command can be used to find file system types available for mount from within
 a jail.
diff -burN src.old/usr.sbin/jail/jail.c src.new/usr.sbin/jail/jail.c
--- src.old/usr.sbin/jail/jail.c	2006-05-12 11:14:43.000000000 -0400
+++ src.new/usr.sbin/jail/jail.c	2008-05-28 10:02:59.000000000 -0400
@@ -56,6 +56,8 @@
 	struct in_addr in;
 	gid_t groups[NGROUPS];
 	int ch, i, iflag, Jflag, lflag, ngroups, securelevel, uflag, Uflag;
+	unsigned int mem_limit = 0;
+	unsigned int sched_shares = 0;
 	char path[PATH_MAX], *ep, *username, *JidFile;
 	static char *cleanenv;
 	const char *shell, *p = NULL;
@@ -67,7 +69,7 @@
 	username = JidFile = cleanenv = NULL;
 	fp = NULL;
 
-	while ((ch = getopt(argc, argv, "ils:u:U:J:")) != -1) {
+	while ((ch = getopt(argc, argv, "ilS:M:s:u:U:J:")) != -1) {
 		switch (ch) {
 		case 'i':
 			iflag = 1;
@@ -76,6 +78,13 @@
 			JidFile = optarg;
 			Jflag = 1;
 			break;
+		case 'M':
+			mem_limit = atoi(optarg);
+			mem_limit *= 1024 * 1024;
+			break;
+		case 'S':
+			sched_shares = atoi(optarg);
+			break;
 		case 's':
 			ltmp = strtol(optarg, &ep, 0);
 			if (*ep || ep == optarg || ltmp > INT_MAX || !ltmp)
@@ -118,6 +127,8 @@
 	if (inet_aton(argv[2], &in) == 0)
 		errx(1, "Could not make sense of ip-number: %s", argv[2]);
 	j.ip_number = ntohl(in.s_addr);
+	j.mem_limit = mem_limit;
+	j.sched_shares = sched_shares;
 	if (Jflag) {
 		fp = fopen(JidFile, "w");
 		if (fp == NULL)
@@ -182,8 +193,10 @@
 usage(void)
 {
 
-	(void)fprintf(stderr, "%s%s%s\n",
-	     "usage: jail [-i] [-J jid_file] [-s securelevel] [-l -u ",
+	(void)fprintf(stderr, "%s%s%s%s%s\n",
+	     "usage: jail [-i] [-J jid_file] [-M mem_limit] ",
+             "[-S cpu_shares] [-s securelevel]",
+             " [-l -u ",
 	     "username | -U username]",
 	     " path hostname ip-number command ...");
 	exit(1);
diff -burN src.old/usr.sbin/jls/jls.8 src.new/usr.sbin/jls/jls.8
--- src.old/usr.sbin/jls/jls.8	2003-04-08 23:04:12.000000000 -0400
+++ src.new/usr.sbin/jls/jls.8	2008-05-28 10:18:45.000000000 -0400
@@ -42,7 +42,8 @@
 .Sh SEE ALSO
 .Xr jail 2 ,
 .Xr jail 8 ,
-.Xr jexec 8
+.Xr jexec 8 ,
+.Xr jtune 8
 .Sh HISTORY
 The
 .Nm
diff -burN src.old/usr.sbin/jtune/Makefile src.new/usr.sbin/jtune/Makefile
--- src.old/usr.sbin/jtune/Makefile	1969-12-31 19:00:00.000000000 -0500
+++ src.new/usr.sbin/jtune/Makefile	2008-05-28 03:41:05.000000000 -0400
@@ -0,0 +1,10 @@
+# $FreeBSD$
+
+PROG=	jtune
+MAN=	jtune.8
+DPADD=	${LIBUTIL}
+LDADD=	-lutil
+
+WARNS?=	6
+
+.include <bsd.prog.mk>
diff -burN src.old/usr.sbin/jtune/jtune.8 src.new/usr.sbin/jtune/jtune.8
--- src.old/usr.sbin/jtune/jtune.8	1969-12-31 19:00:00.000000000 -0500
+++ src.new/usr.sbin/jtune/jtune.8	2008-05-28 10:19:33.000000000 -0400
@@ -0,0 +1,75 @@
+.\" Copyright (c) 2006 Chris Jones
+.\" All rights reserved.
+.\"
+.\" This software was developed for the FreeBSD Project by Chris Jones
+.\" thanks to the support of Google's Summer of Code program and
+.\" mentoring by Kip Macy.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE  
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL  
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\" 
+.\" $FreeBSD$
+.\" 
+.Dd August 21, 2006
+.Dt JTUNE 8
+.Os
+.Sh NAME
+.Nm jtune
+.Nd "modify jail resource limits"
+.Sh SYNOPSIS
+.Nm 
+.Fl j Ar jail_id   
+.Op Fl i
+.Op Fl m Ar mem_limit
+.Op Fl s Ar cpu_shares
+.Sh DESCRIPTION
+The
+.Nm
+utility modifies a jail's memory and CPU usage limits.
+.Pp
+The options are as follows:
+.Bl -tag -width ".Fl u Ar cpu_shares"
+.It Fl j Ar jail_id
+Jail identifier (JID) of the jail whose limits are being tuned.
+.It Fl i
+Show jail's resource limits.
+.It Fl m Ar mem_limit
+Limit a jail's memory usage (resident set size) to 
+.Ar mem_limit
+megabytes.
+.It Fl s Ar cpu_shares
+Set a jail's CPU shares to
+.Ar cpu_shares .
+.El
+.Sh SEE ALSO
+.Xr jail 2 ,
+.Xr jail 8 ,
+.Xr jexec 8 ,
+.Xr jls 8
+.Sh HISTORY
+The 
+.Nm
+utility first appeared in 
+.Fx FIXME .
+.Pp
+.Nm
+was written by Chris Jones through the 2006 Google Summer of Code 
+program.
Files src.old/usr.sbin/jtune/jtune.8.gz and src.new/usr.sbin/jtune/jtune.8.gz differ
diff -burN src.old/usr.sbin/jtune/jtune.c src.new/usr.sbin/jtune/jtune.c
--- src.old/usr.sbin/jtune/jtune.c	1969-12-31 19:00:00.000000000 -0500
+++ src.new/usr.sbin/jtune/jtune.c	2008-05-28 03:39:15.000000000 -0400
@@ -0,0 +1,188 @@
+/*-
+ *  Copyright (c) 2006 Chris Jones
+ *  All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Chris Jones
+ * thanks to the support of Google's Summer of Code program and
+ * mentoring by Kip Macy.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD");
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/sysctl.h>
+
+#include <err.h>
+#include <errno.h>
+#include <grp.h>
+#include <login_cap.h>
+#include <paths.h>
+#include <pwd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static void usage(void);
+static struct xprison *getxprison(int);
+extern char **environ;
+
+/*
+ * jtune: show (-i) or change (-m, -s) the resource limits of the jail
+ * selected with -j.  The memory limit is given in megabytes and handed
+ * to jail_set_resource_limits() in bytes; a limit that is not given on
+ * the command line is passed as -1.  Failures are reported through
+ * err(3)/errx(3), which exit with status 1.
+ */
+int
+main(int argc, char **argv)
+{
+	struct xprison *xp;
+	char *ep, *memlimstr, *memusestr;
+	long ltmp;
+	int jid = 0;
+	int memlimit = -1;
+	int shares = -1;
+	int iflag = 0;
+	int ch;
+
+	/* Numbers are parsed with strtol(3): atoi(3) cannot report errors. */
+	while ((ch = getopt(argc, argv, "ij:m:s:")) != -1) {
+		switch (ch) {
+		case 'i':
+			iflag = 1;
+			break;
+		case 'j':
+			ltmp = strtol(optarg, &ep, 10);
+			if (ep == optarg || *ep != '\0' ||
+			    ltmp <= 0 || ltmp > INT_MAX)
+				errx(1, "invalid jail id '%s'", optarg);
+			jid = (int)ltmp;
+			break;
+		case 'm':
+			/* Megabytes; the resulting byte count must fit in an int. */
+			ltmp = strtol(optarg, &ep, 10);
+			if (ep == optarg || *ep != '\0' ||
+			    ltmp < 0 || ltmp > INT_MAX / (1024 * 1024))
+				errx(1, "invalid memory limit '%s'", optarg);
+			memlimit = (int)ltmp * 1024 * 1024;
+			break;
+		case 's':
+			ltmp = strtol(optarg, &ep, 10);
+			if (ep == optarg || *ep != '\0' ||
+			    ltmp < 0 || ltmp > INT_MAX)
+				errx(1, "invalid cpu share '%s'", optarg);
+			shares = (int)ltmp;
+			break;
+		default:
+			usage();
+		}
+	}
+
+	if (!jid)
+		usage();
+
+	xp = getxprison(jid);
+	if (NULL == xp)
+		errx(1, "no jail with id %d", jid);
+
+	if (iflag) {
+		/* The xprison memory and share fields are unsigned: print with %u. */
+		if (asprintf(&memusestr, "%u M",
+		    xp->pr_mem_usage / (1024 * 1024)) == -1)
+			err(1, "couldn't allocate memory");
+		if (xp->pr_mem_limit) {
+			if (asprintf(&memlimstr, "%u M",
+			    xp->pr_mem_limit / (1024 * 1024)) == -1)
+				err(1, "couldn't allocate memory");
+		} else if (asprintf(&memlimstr, "None") == -1)
+			err(1, "couldn't allocate memory");
+
+		printf("   JID  Hostname            Memory Used / Limit  CPU Shares\n");
+		printf("%6d  %-24.24s %6s / %-6.6s %-4u\n",
+		    xp->pr_id, xp->pr_host,
+		    memusestr, memlimstr,
+		    xp->pr_sched_shares);
+		exit(0);
+	}
+
+	/* Arguments are printed in call order; err(3) appends the errno text. */
+	if (jail_set_resource_limits(jid, shares, memlimit) != 0)
+		err(1, "jail_set_resource_limits(%d, %d, %d)",
+		    jid, shares, memlimit);
+	exit(0);
+}
+/* Print the command synopsis to stderr and exit with status 1. */
+static void
+usage(void)
+{
+	(void)fprintf(stderr, "%s\n",
+		"usage: jtune -j jail_id [-i] [-m mem_limit] [-s cpu_shares]");
+	exit(1);
+}
+
+/*
+ * Fetch security.jail.list and return a malloc(3)ed copy of the entry
+ * whose pr_id equals jid, or NULL when no entry matches; the caller
+ * owns the copy.  sysctl or allocation failures terminate the program.
+ */
+static struct xprison *
+getxprison(int jid)
+{
+	struct xprison *list, *cur, *match;
+	size_t bytes, count, idx;
+
+	/* The first call, with a NULL buffer, only reports the list size. */
+	if (sysctlbyname("security.jail.list", NULL, &bytes, NULL, 0) == -1)
+		err(1, "sysctlbyname(): security.jail.list");
+	if (bytes <= 0)
+		errx(1, "sysctl security.jail.list has no entries for jid %d", jid);
+	list = malloc(bytes);
+	if (NULL == list)
+		err(1, "malloc()");
+	if (sysctlbyname("security.jail.list", list, &bytes, NULL, 0) == -1) {
+		free(list);
+		err(1, "sysctlbyname(): security.jail.list");
+	}
+
+	if (bytes < sizeof(*list) || bytes % sizeof(*list) ||
+	    list->pr_version != XPRISON_VERSION)
+		errx(1, "Kernel and userland out of sync");
+
+	match = NULL;
+	count = bytes / sizeof(*list);
+	for (idx = 0, cur = list; idx < count && match == NULL; idx++, cur++) {
+		if (jid != cur->pr_id)
+			continue;
+		match = malloc(sizeof (struct xprison));
+		if (NULL == match)
+			err(1, "malloc()");
+		memcpy(match, cur, sizeof (struct xprison));
+	}
+	free(list);
+	return match;
+}

--------------040604060800000202080402--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?48616B3F.4030705>