Date: Mon, 23 Jul 2007 23:02:15 GMT
From: Ulf Lilleengen <lulf@FreeBSD.org>
To: Perforce Change Reviews <perforce@FreeBSD.org>
Subject: PERFORCE change 123983 for review
Message-ID: <200707232302.l6NN2Fvw031988@repoman.freebsd.org>
http://perforce.freebsd.org/chv.cgi?CH=123983

Change 123983 by lulf@lulf_carrot on 2007/07/23 23:01:14

	IFC

Affected files ...

.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/conf/NOTES#11 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/genassym.c#3 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/swtch.s#3 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_mutex.c#6 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_resource.c#7 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_4bsd.c#4 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_ule.c#6 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/sys/mutex.h#3 integrate

Differences ...

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/conf/NOTES#11 (text+ko) ====

@@ -1,4 +1,4 @@
-# $FreeBSD: src/sys/conf/NOTES,v 1.1444 2007/07/14 21:49:23 rwatson Exp $
+# $FreeBSD: src/sys/conf/NOTES,v 1.1445 2007/07/18 02:51:21 jeff Exp $
 #
 # NOTES -- Lines that can be cut/pasted into kernel and hints configs.
 #
@@ -176,10 +176,11 @@
 # queue and no CPU affinity which makes it suboptimal for SMP. It has very
 # good interactivity and priority selection.
 #
-# SCHED_ULE is a new scheduler that has been designed for SMP and has some
-# advantages for UP as well. It is intended to replace the 4BSD scheduler
-# over time. NOTE: SCHED_ULE is currently considered experimental and is
-# not recommended for production use at this time.
+# SCHED_ULE provides significant performance advantages over 4BSD on many
+# workloads on SMP machines. It supports cpu-affinity, per-cpu runqueues
+# and scheduler locks. It also has a stronger notion of interactivity
+# which leads to better responsiveness even on uniprocessor machines. This
+# will eventually become the default scheduler.
 #
 options 	SCHED_4BSD
 #options 	SCHED_ULE

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/genassym.c#3 (text+ko) ====

@@ -33,7 +33,7 @@
  */

 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/genassym.c,v 1.157 2007/06/06 07:35:07 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/genassym.c,v 1.158 2007/07/17 22:34:14 jeff Exp $");

 #include "opt_apic.h"
 #include "opt_compat.h"
@@ -81,6 +81,7 @@
 ASSYM(P_SFLAG, offsetof(struct proc, p_sflag));

 ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(TD_LOCK, offsetof(struct thread, td_lock));
 ASSYM(TD_PCB, offsetof(struct thread, td_pcb));
 ASSYM(TD_PROC, offsetof(struct thread, td_proc));
 ASSYM(TD_MD, offsetof(struct thread, td_md));

==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/swtch.s#3 (text+ko) ====

@@ -29,15 +29,32 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* - * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.154 2007/06/06 07:35:07 davidxu Exp $ + * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.155 2007/07/17 22:34:14 jeff Exp $ */ #include "opt_npx.h" +#include "opt_sched.h" #include <machine/asmacros.h> #include "assym.s" +#if defined(SMP) && defined(SCHED_ULE) +#define SETOP xchgl +#define BLOCK_SPIN(reg) \ + movl $blocked_lock,%eax ; \ + 100: ; \ + lock ; \ + cmpxchgl %eax,TD_LOCK(reg) ; \ + jne 101f ; \ + pause ; \ + jmp 100b ; \ + 101: +#else +#define SETOP movl +#define BLOCK_SPIN(reg) +#endif + /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ @@ -91,6 +108,7 @@ * 0(%esp) = ret * 4(%esp) = oldtd * 8(%esp) = newtd + * 12(%esp) = newlock */ ENTRY(cpu_switch) @@ -145,13 +163,14 @@ #endif /* Save is done. Now fire up new thread. Leave old vmspace. */ + movl 4(%esp),%edi movl 8(%esp),%ecx /* New thread */ + movl 12(%esp),%esi /* New lock */ #ifdef INVARIANTS testl %ecx,%ecx /* no thread? */ jz badsw3 /* no, panic */ #endif movl TD_PCB(%ecx),%edx - movl PCPU(CPUID), %esi /* switch address space */ movl PCB_CR3(%edx),%eax @@ -160,11 +179,14 @@ #else cmpl %eax,IdlePTD /* Kernel address space? */ #endif - je sw1 + je sw0 movl %cr3,%ebx /* The same address space? */ cmpl %ebx,%eax - je sw1 + je sw0 movl %eax,%cr3 /* new address space */ + movl %esi,%eax + movl PCPU(CPUID),%esi + SETOP %eax,TD_LOCK(%edi) /* Switchout td_lock */ /* Release bit from old pmap->pm_active */ movl PCPU(CURPMAP), %ebx @@ -182,8 +204,12 @@ lock #endif btsl %esi, PM_ACTIVE(%ebx) /* set new */ + jmp sw1 +sw0: + SETOP %esi,TD_LOCK(%edi) /* Switchout td_lock */ sw1: + BLOCK_SPIN(%ecx) /* * At this point, we've switched address spaces and are ready * to load up the rest of the next context. ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_mutex.c#6 (text+ko) ==== @@ -34,7 +34,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/sys/kern/kern_mutex.c,v 1.196 2007/06/09 18:09:37 mjacob Exp $"); +__FBSDID("$FreeBSD: src/sys/kern/kern_mutex.c,v 1.198 2007/07/18 20:46:05 jeff Exp $"); #include "opt_adaptive_mutexes.h" #include "opt_ddb.h" @@ -118,7 +118,6 @@ * System-wide mutexes */ struct mtx blocked_lock; -struct mtx sched_lock; struct mtx Giant; #ifdef LOCK_PROFILING @@ -473,9 +472,12 @@ { struct mtx *m; uintptr_t tid; - int i; + int i, contested; + uint64_t waittime; - i = 0; + + contested = i = 0; + waittime = 0; tid = (uintptr_t)curthread; for (;;) { retry: @@ -488,6 +490,7 @@ m->mtx_recurse++; break; } + lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); /* Give interrupts a chance while we spin. */ spinlock_exit(); while (m->mtx_lock != MTX_UNOWNED) { @@ -508,6 +511,8 @@ break; _rel_spin_lock(m); /* does spinlock_exit() */ } + lock_profile_obtain_lock_success(&m->lock_object, contested, + waittime, (file), (line)); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); } @@ -769,7 +774,6 @@ * Initialize mutexes. */ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); - mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN); blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. 
*/ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_resource.c#7 (text+ko) ==== @@ -35,7 +35,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.179 2007/07/12 18:01:31 jhb Exp $"); +__FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.180 2007/07/17 01:08:09 jeff Exp $"); #include "opt_compat.h" @@ -840,6 +840,14 @@ p->p_rux.rux_runtime += u - PCPU_GET(switchtime); PCPU_SET(switchtime, u); } + /* Make sure the per-thread stats are current. */ + FOREACH_THREAD_IN_PROC(p, td) { + if (td->td_runtime == 0) + continue; + thread_lock(td); + ruxagg(&p->p_rux, td); + thread_unlock(td); + } calcru1(p, &p->p_rux, up, sp); } ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_4bsd.c#4 (text+ko) ==== @@ -33,7 +33,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.102 2007/06/12 07:47:09 jeff Exp $"); +__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.103 2007/07/18 20:46:05 jeff Exp $"); #include "opt_hwpmc_hooks.h" @@ -101,6 +101,7 @@ ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq) static struct td_sched td_sched0; +struct mtx sched_lock; static int sched_tdcnt; /* Total runnable threads in the system. */ static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ @@ -578,6 +579,7 @@ thread0.td_sched = &td_sched0; thread0.td_lock = &sched_lock; td_sched0.ts_thread = &thread0; + mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); } int ==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_ule.c#6 (text+ko) ==== @@ -24,8 +24,19 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* + * This file implements the ULE scheduler. ULE supports independent CPU + * run queues and fine grain locking. It has superior interactive + * performance under load even on uni-processor systems. + * + * etymology: + * ULE is the last three letters in schedule. It owes it's name to a + * generic user created for a scheduling system by Paul Mikesell at + * Isilon Systems and a general lack of creativity on the part of the author. + */ + #include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.199 2007/06/15 19:33:58 jeff Exp $"); +__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.200 2007/07/17 22:53:23 jeff Exp $"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" @@ -64,26 +75,23 @@ #error "SCHED_ULE requires options PREEMPTION" #endif -/* - * TODO: - * Pick idle from affinity group or self group first. - * Implement pick_score. - */ - -#define KTR_ULE 0x0 /* Enable for pickpri debugging. */ +#define KTR_ULE 0 /* - * Thread scheduler specific section. + * Thread scheduler specific section. All fields are protected + * by the thread lock. */ struct td_sched { - TAILQ_ENTRY(td_sched) ts_procq; /* (j/z) Run queue. */ - int ts_flags; /* (j) TSF_* flags. */ - struct thread *ts_thread; /* (*) Active associated thread. */ - u_char ts_rqindex; /* (j) Run queue index. */ - int ts_slptime; - int ts_slice; - struct runq *ts_runq; + TAILQ_ENTRY(td_sched) ts_procq; /* Run queue. */ + struct thread *ts_thread; /* Active associated thread. */ + struct runq *ts_runq; /* Run-queue we're queued on. */ + short ts_flags; /* TSF_* flags. */ + u_char ts_rqindex; /* Run queue index. */ u_char ts_cpu; /* CPU that we have affinity for. */ + int ts_slptick; /* Tick when we went to sleep. */ + int ts_slice; /* Ticks of slice remaining. */ + u_int ts_slptime; /* Number of ticks we vol. 
slept */ + u_int ts_runtime; /* Number of ticks we were running */ /* The following variables are only used for pctcpu calculation */ int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ @@ -91,10 +99,6 @@ #ifdef SMP int ts_rltick; /* Real last tick, for affinity. */ #endif - - /* originally from kg_sched */ - u_int skg_slptime; /* Number of ticks we vol. slept */ - u_int skg_runtime; /* Number of ticks we were running */ }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ @@ -165,33 +169,40 @@ * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. * sched_slice: Runtime of each thread before rescheduling. + * preempt_thresh: Priority threshold for preemption and remote IPIs. */ static int sched_interact = SCHED_INTERACT_THRESH; static int realstathz; static int tickincr; static int sched_slice; +static int preempt_thresh = PRI_MIN_KERN; +#define SCHED_BAL_SECS 2 /* How often we run the rebalance algorithm. */ + /* - * tdq - per processor runqs and statistics. + * tdq - per processor runqs and statistics. All fields are protected by the + * tdq_lock. The load and lowpri may be accessed without to avoid excess + * locking in sched_pickcpu(); */ struct tdq { + struct mtx tdq_lock; /* Protects all fields below. */ + struct runq tdq_realtime; /* real-time run queue. */ + struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ - struct runq tdq_timeshare; /* timeshare run queue. */ - struct runq tdq_realtime; /* real-time run queue. */ + int tdq_load; /* Aggregate load. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ - short tdq_flags; /* Thread queue flags */ - int tdq_load; /* Aggregate load. */ #ifdef SMP - int tdq_transferable; + u_char tdq_lowpri; /* Lowest priority thread. */ + int tdq_transferable; /* Transferable thread count. */ LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */ struct tdq_group *tdq_group; /* Our processor group. */ #else int tdq_sysload; /* For loadavg, !ITHD load. */ #endif -}; + char tdq_name[16]; /* lock name. */ +} __aligned(64); -#define TDQF_BUSY 0x0001 /* Queue is marked as busy */ #ifdef SMP /* @@ -210,9 +221,9 @@ int tdg_load; /* Total load of this group. */ int tdg_transferable; /* Transferable load of this group. */ LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */ -}; +} __aligned(64); -#define SCHED_AFFINITY_DEFAULT (hz / 100) +#define SCHED_AFFINITY_DEFAULT (max(1, hz / 300)) #define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity) /* @@ -220,28 +231,23 @@ */ static int rebalance = 0; static int pick_pri = 0; +static int pick_zero = 0; static int affinity; static int tryself = 1; static int tryselfidle = 1; -static int ipi_ast = 0; -static int ipi_preempt = 1; -static int ipi_thresh = PRI_MIN_KERN; -static int steal_htt = 1; -static int steal_busy = 1; -static int busy_thresh = 4; +static int steal_htt = 0; +static int steal_idle = 0; static int topology = 0; /* * One thread queue per processor. 
*/ static volatile cpumask_t tdq_idle; -static volatile cpumask_t tdq_busy; static int tdg_maxid; static struct tdq tdq_cpu[MAXCPU]; static struct tdq_group tdq_groups[MAXCPU]; -static int bal_tick; -static int gbal_tick; -static int balance_groups; +static struct callout balco; +static struct callout gbalco; #define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) #define TDQ_CPU(x) (&tdq_cpu[(x)]) @@ -255,14 +261,18 @@ #define TDQ_CPU(x) (&tdq_cpu) #endif +#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) +#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) +#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) +#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) +#define TDQ_LOCKPTR(t) (&(t)->tdq_lock) + static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); static int sched_interact_score(struct thread *); static void sched_interact_update(struct thread *); static void sched_interact_fork(struct thread *); static void sched_pctcpu_update(struct td_sched *); -static inline void sched_pin_td(struct thread *td); -static inline void sched_unpin_td(struct thread *td); /* Operations on per processor queues */ static struct td_sched * tdq_choose(struct tdq *); @@ -273,19 +283,21 @@ static __inline void tdq_runq_rem(struct tdq *, struct td_sched *); void tdq_print(int cpu); static void runq_print(struct runq *rq); +static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP -static int tdq_pickidle(struct tdq *, struct td_sched *); -static int tdq_pickpri(struct tdq *, struct td_sched *, int); +static void tdq_move(struct tdq *, struct tdq *); +static int tdq_idled(struct tdq *); +static void tdq_notify(struct td_sched *); +static struct td_sched *tdq_steal(struct tdq *, int); static struct td_sched *runq_steal(struct runq *); -static void sched_balance(void); -static void sched_balance_groups(void); +static int sched_pickcpu(struct td_sched *, int); +static void sched_balance(void *); +static void sched_balance_groups(void *); static void sched_balance_group(struct tdq_group *); static void sched_balance_pair(struct tdq *, struct tdq *); -static void sched_smp_tick(struct thread *); -static void tdq_move(struct tdq *, int); -static int tdq_idled(struct tdq *); -static void tdq_notify(struct td_sched *); -static struct td_sched *tdq_steal(struct tdq *, int); +static inline struct tdq *sched_setcpu(struct td_sched *, int, int); +static inline struct mtx *thread_block_switch(struct thread *); +static inline void thread_unblock_switch(struct thread *, struct mtx *); #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #endif @@ -296,18 +308,9 @@ static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL) -static inline void -sched_pin_td(struct thread *td) -{ - td->td_pinned++; -} - -static inline void -sched_unpin_td(struct thread *td) -{ - td->td_pinned--; -} - +/* + * Print the threads waiting on a run-queue. + */ static void runq_print(struct runq *rq) { @@ -332,6 +335,9 @@ } } +/* + * Print the status of a per-cpu thread queue. Should be a ddb show cmd. 
+ */ void tdq_print(int cpu) { @@ -340,8 +346,10 @@ tdq = TDQ_CPU(cpu); printf("tdq:\n"); + printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq)); + printf("\tlock name %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); - printf("\ttimeshare idx: %d\n", tdq->tdq_idx); + printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); printf("\trealtime runq:\n"); runq_print(&tdq->tdq_realtime); @@ -351,22 +359,26 @@ runq_print(&tdq->tdq_idle); #ifdef SMP printf("\tload transferable: %d\n", tdq->tdq_transferable); + printf("\tlowest priority: %d\n", tdq->tdq_lowpri); #endif } +#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) +/* + * Add a thread to the actual run-queue. Keeps transferable counts up to + * date with what is actually on the run-queue. Selects the correct + * queue position for timeshare threads. + */ static __inline void tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags) { + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); #ifdef SMP if (THREAD_CAN_MIGRATE(ts->ts_thread)) { tdq->tdq_transferable++; tdq->tdq_group->tdg_transferable++; ts->ts_flags |= TSF_XFERABLE; - if (tdq->tdq_transferable >= busy_thresh && - (tdq->tdq_flags & TDQF_BUSY) == 0) { - tdq->tdq_flags |= TDQF_BUSY; - atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq)); - } } #endif if (ts->ts_runq == &tdq->tdq_timeshare) { @@ -379,7 +391,6 @@ * This queue contains only priorities between MIN and MAX * realtime. Use the whole queue to represent these values. */ -#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) if ((flags & SRQ_BORROWING) == 0) { pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ; pri = (pri + tdq->tdq_idx) % RQ_NQS; @@ -398,19 +409,22 @@ runq_add(ts->ts_runq, ts, flags); } +/* + * Remove a thread from a run-queue. This typically happens when a thread + * is selected to run. Running threads are not on the queue and the + * transferable count does not reflect them. + */ static __inline void tdq_runq_rem(struct tdq *tdq, struct td_sched *ts) { + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + KASSERT(ts->ts_runq != NULL, + ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread)); #ifdef SMP if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; tdq->tdq_group->tdg_transferable--; ts->ts_flags &= ~TSF_XFERABLE; - if (tdq->tdq_transferable < busy_thresh && - (tdq->tdq_flags & TDQF_BUSY)) { - atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq)); - tdq->tdq_flags &= ~TDQF_BUSY; - } } #endif if (ts->ts_runq == &tdq->tdq_timeshare) { @@ -429,11 +443,17 @@ runq_remove(ts->ts_runq, ts); } +/* + * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load + * for this thread to the referenced thread queue. + */ static void tdq_load_add(struct tdq *tdq, struct td_sched *ts) { int class; - mtx_assert(&sched_lock, MA_OWNED); + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); tdq->tdq_load++; CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load); @@ -446,11 +466,17 @@ #endif } +/* + * Remove the load from a thread that is transitioning to a sleep state or + * exiting. 
+ */ static void tdq_load_rem(struct tdq *tdq, struct td_sched *ts) { int class; - mtx_assert(&sched_lock, MA_OWNED); + + THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); class = PRI_BASE(ts->ts_thread->td_pri_class); if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0) @@ -459,27 +485,14 @@ #else tdq->tdq_sysload--; #endif + KASSERT(tdq->tdq_load != 0, + ("tdq_load_rem: Removing with 0 load on queue %d", (int)TDQ_ID(tdq))); tdq->tdq_load--; CTR1(KTR_SCHED, "load: %d", tdq->tdq_load); ts->ts_runq = NULL; } #ifdef SMP -static void -sched_smp_tick(struct thread *td) -{ - struct tdq *tdq; - - tdq = TDQ_SELF(); - if (rebalance) { - if (ticks >= bal_tick) - sched_balance(); - if (ticks >= gbal_tick && balance_groups) - sched_balance_groups(); - } - td->td_sched->ts_rltick = ticks; -} - /* * sched_balance is a simple CPU load balancing algorithm. It operates by * finding the least loaded and most loaded cpu and equalizing their load @@ -489,15 +502,11 @@ * installations will only have 2 cpus. Secondly, load balancing too much at * once can have an unpleasant effect on the system. The scheduler rarely has * enough information to make perfect decisions. So this algorithm chooses - * algorithm simplicity and more gradual effects on load in larger systems. + * simplicity and more gradual effects on load in larger systems. * - * It could be improved by considering the priorities and slices assigned to - * each task prior to balancing them. There are many pathological cases with - * any approach and so the semi random algorithm below may work as well as any. - * */ static void -sched_balance(void) +sched_balance(void *arg) { struct tdq_group *high; struct tdq_group *low; @@ -505,8 +514,9 @@ int cnt; int i; - bal_tick = ticks + (random() % (hz * 2)); - if (smp_started == 0) + callout_reset(&balco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)), + sched_balance, NULL); + if (smp_started == 0 || rebalance == 0) return; low = high = NULL; i = random() % (tdg_maxid + 1); @@ -529,18 +539,25 @@ LIST_FIRST(&low->tdg_members)); } +/* + * Balance load between CPUs in a group. Will only migrate within the group. + */ static void -sched_balance_groups(void) +sched_balance_groups(void *arg) { int i; - gbal_tick = ticks + (random() % (hz * 2)); - mtx_assert(&sched_lock, MA_OWNED); - if (smp_started) - for (i = 0; i <= tdg_maxid; i++) - sched_balance_group(TDQ_GROUP(i)); + callout_reset(&gbalco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)), + sched_balance_groups, NULL); + if (smp_started == 0 || rebalance == 0) + return; + for (i = 0; i <= tdg_maxid; i++) + sched_balance_group(TDQ_GROUP(i)); } +/* + * Finds the greatest imbalance between two tdqs in a group. + */ static void sched_balance_group(struct tdq_group *tdg) { @@ -564,6 +581,24 @@ sched_balance_pair(high, low); } +/* + * Lock two thread queues using their address to maintain lock order. + */ +static void +tdq_lock_pair(struct tdq *one, struct tdq *two) +{ + if (one < two) { + TDQ_LOCK(one); + TDQ_LOCK_FLAGS(two, MTX_DUPOK); + } else { + TDQ_LOCK(two); + TDQ_LOCK_FLAGS(one, MTX_DUPOK); + } +} + +/* + * Transfer load between two imbalanced thread queues. 
+ */ static void sched_balance_pair(struct tdq *high, struct tdq *low) { @@ -574,6 +609,7 @@ int diff; int i; + tdq_lock_pair(high, low); /* * If we're transfering within a group we have to use this specific * tdq's transferable count, otherwise we can steal from other members @@ -588,31 +624,37 @@ high_load = high->tdq_group->tdg_load; low_load = low->tdq_group->tdg_load; } - if (transferable == 0) - return; /* * Determine what the imbalance is and then adjust that to how many * threads we actually have to give up (transferable). */ - diff = high_load - low_load; - move = diff / 2; - if (diff & 0x1) - move++; - move = min(move, transferable); - for (i = 0; i < move; i++) - tdq_move(high, TDQ_ID(low)); + if (transferable != 0) { + diff = high_load - low_load; + move = diff / 2; + if (diff & 0x1) + move++; + move = min(move, transferable); + for (i = 0; i < move; i++) + tdq_move(high, low); + } + TDQ_UNLOCK(high); + TDQ_UNLOCK(low); return; } +/* + * Move a thread from one thread queue to another. + */ static void -tdq_move(struct tdq *from, int cpu) +tdq_move(struct tdq *from, struct tdq *to) { + struct td_sched *ts; + struct thread *td; struct tdq *tdq; - struct tdq *to; - struct td_sched *ts; + int cpu; tdq = from; - to = TDQ_CPU(cpu); + cpu = TDQ_ID(to); ts = tdq_steal(tdq, 1); if (ts == NULL) { struct tdq_group *tdg; @@ -625,26 +667,42 @@ break; } if (ts == NULL) - panic("tdq_move: No threads available with a " - "transferable count of %d\n", - tdg->tdg_transferable); + return; } if (tdq == to) return; - sched_rem(ts->ts_thread); + td = ts->ts_thread; + /* + * Although the run queue is locked the thread may be blocked. Lock + * it to clear this. + */ + thread_lock(td); + /* Drop recursive lock on from. */ + TDQ_UNLOCK(from); + sched_rem(td); ts->ts_cpu = cpu; - sched_pin_td(ts->ts_thread); - sched_add(ts->ts_thread, SRQ_YIELDING); - sched_unpin_td(ts->ts_thread); + td->td_lock = TDQ_LOCKPTR(to); + tdq_add(to, td, SRQ_YIELDING); } +/* + * This tdq has idled. Try to steal a thread from another cpu and switch + * to it. 
+ */ static int tdq_idled(struct tdq *tdq) { struct tdq_group *tdg; struct tdq *steal; struct td_sched *ts; + struct thread *td; + int highload; + int highcpu; + int load; + int cpu; + /* We don't want to be preempted while we're iterating over tdqs */ + spinlock_enter(); tdg = tdq->tdq_group; /* * If we're in a cpu group, try and steal threads from another cpu in @@ -654,51 +712,59 @@ LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) { if (steal == tdq || steal->tdq_transferable == 0) continue; + TDQ_LOCK(steal); ts = tdq_steal(steal, 0); if (ts) goto steal; + TDQ_UNLOCK(steal); } } - if (steal_busy) { - while (tdq_busy) { - int cpu; - - cpu = ffs(tdq_busy); - if (cpu == 0) - break; - cpu--; + for (;;) { + if (steal_idle == 0) + break; + highcpu = 0; + highload = 0; + for (cpu = 0; cpu <= mp_maxid; cpu++) { + if (CPU_ABSENT(cpu)) + continue; steal = TDQ_CPU(cpu); - if (steal->tdq_transferable == 0) + load = TDQ_CPU(cpu)->tdq_transferable; + if (load < highload) continue; - ts = tdq_steal(steal, 1); - if (ts == NULL) - continue; - CTR5(KTR_ULE, - "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X", - ts->ts_thread, ts->ts_thread->td_proc->p_comm, - ts->ts_thread->td_priority, cpu, tdq_busy); + highload = load; + highcpu = cpu; + } + if (highload < 2) + break; + steal = TDQ_CPU(highcpu); + TDQ_LOCK(steal); + if (steal->tdq_transferable > 1 && + (ts = tdq_steal(steal, 1)) != NULL) goto steal; - } + TDQ_UNLOCK(steal); + break; } - /* - * We only set the idled bit when all of the cpus in the group are - * idle. Otherwise we could get into a situation where a thread bounces - * back and forth between two idle cores on seperate physical CPUs. - */ - tdg->tdg_idlemask |= PCPU_GET(cpumask); - if (tdg->tdg_idlemask == tdg->tdg_cpumask) - atomic_set_int(&tdq_idle, tdg->tdg_mask); + spinlock_exit(); return (1); steal: - sched_rem(ts->ts_thread); - ts->ts_cpu = PCPU_GET(cpuid); - sched_pin_td(ts->ts_thread); - sched_add(ts->ts_thread, SRQ_YIELDING); - sched_unpin_td(ts->ts_thread); + td = ts->ts_thread; + thread_lock(td); + spinlock_exit(); + MPASS(td->td_lock == TDQ_LOCKPTR(steal)); + TDQ_UNLOCK(steal); + sched_rem(td); + sched_setcpu(ts, PCPU_GET(cpuid), SRQ_YIELDING); + tdq_add(tdq, td, SRQ_YIELDING); + MPASS(td->td_lock == curthread->td_lock); + mi_switch(SW_VOL, NULL); + thread_unlock(curthread); return (0); } +/* + * Notify a remote cpu of new work. Sends an IPI if criteria are met. + */ static void tdq_notify(struct td_sched *ts) { @@ -734,29 +800,74 @@ /* * Otherwise only IPI if we exceed the threshold. */ - if (pri > ipi_thresh) + if (pri > preempt_thresh) return; sendipi: ctd->td_flags |= TDF_NEEDRESCHED; - if (cpri < PRI_MIN_IDLE) { - if (ipi_ast) - ipi_selected(1 << cpu, IPI_AST); - else if (ipi_preempt) - ipi_selected(1 << cpu, IPI_PREEMPT); - } else - ipi_selected(1 << cpu, IPI_PREEMPT); + ipi_selected(1 << cpu, IPI_PREEMPT); +} + +/* + * Steals load from a timeshare queue. Honors the rotating queue head + * index. + */ +static struct td_sched * +runq_steal_from(struct runq *rq, u_char start) +{ + struct td_sched *ts; + struct rqbits *rqb; + struct rqhead *rqh; + int first; + int bit; + int pri; + int i; + + rqb = &rq->rq_status; + bit = start & (RQB_BPW -1); + pri = 0; + first = 0; +again: + for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { + if (rqb->rqb_bits[i] == 0) + continue; + if (bit != 0) { + for (pri = bit; pri < RQB_BPW; pri++) + if (rqb->rqb_bits[i] & (1ul << pri)) + break; >>> TRUNCATED FOR MAIL (1000 lines) <<<
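For readers skimming the diff above: the new SETOP and BLOCK_SPIN macros added to
swtch.s implement a hand-off of the per-thread lock pointer (td_lock) between CPUs
under SMP + SCHED_ULE. The outgoing CPU publishes the thread's new lock with a single
store (xchgl on SMP), while the incoming CPU spins for as long as td_lock still points
at blocked_lock. The following C sketch illustrates that idea only; it is not code from
the patch, and the helper names and simplified thread structure are made up for the
example.

/*
 * Illustrative sketch only -- not part of the change above.  The real
 * kernel types and function names differ.
 */
#include <stdatomic.h>

struct mtx;                             /* opaque lock type */
extern struct mtx blocked_lock;         /* marker: thread is mid-switch */

struct thread {
	_Atomic(struct mtx *) td_lock;  /* lock currently protecting the thread */
};

/* Old CPU (SETOP): publish the new lock, releasing the outgoing thread. */
static inline void
thread_lock_set(struct thread *oldtd, struct mtx *newlock)
{
	atomic_store_explicit(&oldtd->td_lock, newlock, memory_order_release);
}

/* New CPU (BLOCK_SPIN): spin until the incoming thread is no longer blocked. */
static inline void
thread_lock_wait(struct thread *newtd)
{
	while (atomic_load_explicit(&newtd->td_lock, memory_order_acquire) ==
	    &blocked_lock)
		;	/* busy-wait, like the pause loop in BLOCK_SPIN */
}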