diff -urNp a/arch/i386/config.in c/arch/i386/config.in --- a/arch/i386/config.in Mon Feb 25 20:37:52 2002 +++ c/arch/i386/config.in Tue Jul 16 11:38:41 2002 @@ -196,6 +196,10 @@ if [ "$CONFIG_SMP" != "y" ]; then fi else bool 'Multiquad NUMA system' CONFIG_MULTIQUAD + bool ' Enable NUMA scheduler' CONFIG_NUMA_SCHED + if [ "$CONFIG_NUMA_SCHED" = "y" ]; then + bool ' Enable node affine scheduler' CONFIG_NODE_AFFINE_SCHED + fi fi if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then diff -urNp a/arch/i386/kernel/smpboot.c c/arch/i386/kernel/smpboot.c --- a/arch/i386/kernel/smpboot.c Mon Jul 15 17:05:54 2002 +++ c/arch/i386/kernel/smpboot.c Tue Jul 16 11:38:41 2002 @@ -777,6 +777,8 @@ static int wakeup_secondary_via_INIT(int extern unsigned long cpu_initialized; +static int __initdata nr_lnodes = 0; + static void __init do_boot_cpu (int apicid) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -785,11 +787,17 @@ static void __init do_boot_cpu (int apic { struct task_struct *idle; unsigned long boot_error = 0; - int timeout, cpu; + int timeout, cpu, cell; unsigned long start_eip; unsigned short nmi_high, nmi_low; cpu = ++cpucount; +#ifdef CONFIG_NUMA_SCHED + cell = SAPICID_TO_PNODE(apicid); + if (pnode_to_lnode[cell] < 0) { + pnode_to_lnode[cell] = nr_lnodes++; + } +#endif /* * We can't use kernel_thread since we must avoid to * reschedule the child. 
@@ -1022,6 +1030,9 @@ void __init smp_boot_cpus(void) set_bit(0, &cpu_online_map); boot_cpu_logical_apicid = logical_smp_processor_id(); map_cpu_to_boot_apicid(0, boot_cpu_apicid); +#ifdef CONFIG_NUMA_SCHED + pnode_to_lnode[SAPICID_TO_PNODE(boot_cpu_apicid)] = nr_lnodes++; +#endif global_irq_holder = 0; current->cpu = 0; @@ -1221,4 +1232,9 @@ void __init smp_boot_cpus(void) smp_done: zap_low_mappings(); +#ifdef CONFIG_NUMA_SCHED + pooldata_lock(); + bld_pools(); + pooldata_unlock(); +#endif } diff -urNp a/arch/ia64/config.in c/arch/ia64/config.in --- a/arch/ia64/config.in Mon Jul 15 16:53:17 2002 +++ c/arch/ia64/config.in Tue Jul 16 11:38:41 2002 @@ -81,12 +81,20 @@ fi if [ "$CONFIG_IA64_GENERIC" = "y" ] || [ "$CONFIG_IA64_DIG" = "y" ] \ || [ "$CONFIG_IA64_HP_ZX1" = "y" ]; then bool ' Enable IA-64 Machine Check Abort' CONFIG_IA64_MCA + bool ' Enable NUMA scheduler' CONFIG_NUMA_SCHED y + if [ "$CONFIG_NUMA_SCHED" = "y" ]; then + bool ' Enable node affine scheduler' CONFIG_NODE_AFFINE_SCHED y + fi define_bool CONFIG_PM y fi if [ "$CONFIG_IA64_SGI_SN1" = "y" ] || [ "$CONFIG_IA64_SGI_SN2" = "y" ]; then define_bool CONFIG_IA64_SGI_SN y bool ' Enable extra debugging code' CONFIG_IA64_SGI_SN_DEBUG n + bool ' Enable NUMA scheduler' CONFIG_NUMA_SCHED y + if [ "$CONFIG_NUMA_SCHED" = "y" ]; then + bool ' Enable node affine scheduler' CONFIG_NODE_AFFINE_SCHED y + fi bool ' Enable SGI Medusa Simulator Support' CONFIG_IA64_SGI_SN_SIM bool ' Enable autotest (llsc). 
Option to run cache test instead of booting' \ CONFIG_IA64_SGI_AUTOTEST n diff -urNp a/arch/ia64/kernel/smpboot.c c/arch/ia64/kernel/smpboot.c --- a/arch/ia64/kernel/smpboot.c Mon Jul 15 17:05:58 2002 +++ c/arch/ia64/kernel/smpboot.c Fri Jul 19 10:54:10 2002 @@ -12,6 +12,7 @@ #include +#include #include #include #include @@ -135,6 +136,22 @@ nointroute (char *str) __setup("nointroute", nointroute); +void __init +bub_sort(int n, int *a) +{ + int end, j, t; + + for (end = n-1; end >= 0; end--) { + for (j = 0; j < end; j++) { + if (a[j] > a[j+1]) { + t = a[j+1]; + a[j+1] = a[j]; + a[j] = t; + } + } + } +} + void sync_master (void *arg) { @@ -402,13 +419,23 @@ fork_by_hand (void) return do_fork(CLONE_VM|CLONE_PID, 0, 0, 0); } +#ifdef CONFIG_NUMA_SCHED +static int __initdata nr_lnodes=0; +#endif + static void __init do_boot_cpu (int sapicid) { struct task_struct *idle; - int timeout, cpu; + int timeout, cpu, cell; cpu = ++cpucount; +#ifdef CONFIG_NUMA_SCHED + cell = SAPICID_TO_PNODE(sapicid); + if (pnode_to_lnode[cell] < 0) { + pnode_to_lnode[cell] = nr_lnodes++; + } +#endif /* * We can't use kernel_thread since we must avoid to * reschedule the child. 
@@ -484,6 +511,9 @@ smp_boot_cpus (void) local_cpu_data->loops_per_jiffy = loops_per_jiffy; ia64_cpu_to_sapicid[0] = boot_cpu_id; +#ifdef CONFIG_NUMA_SCHED + pnode_to_lnode[SAPICID_TO_PNODE(boot_cpu_id)] = nr_lnodes++; +#endif printk("Boot processor id 0x%x/0x%x\n", 0, boot_cpu_id); @@ -502,6 +532,13 @@ smp_boot_cpus (void) if (max_cpus != -1) printk (KERN_INFO "Limiting CPUs to %d\n", max_cpus); +#ifdef CONFIG_IA64_DIG + /* + * To be on the safe side: sort SAPIC IDs of CPUs + */ + bub_sort(smp_boot_data.cpu_count, &smp_boot_data.cpu_phys_id[0]); +#endif + if (smp_boot_data.cpu_count > 1) { printk(KERN_INFO "SMP: starting up secondaries.\n"); @@ -547,6 +584,11 @@ smp_boot_cpus (void) } smp_done: ; +#ifdef CONFIG_NUMA_SCHED + pooldata_lock(); + bld_pools(); + pooldata_unlock(); +#endif } /* @@ -578,6 +620,6 @@ init_smp_config(void) /* Number of ticks we consider an idle tasks still cache-hot. * For Itanium: with 1GB/s bandwidth we need 4ms to fill up 4MB L3 cache... - * So let's try 10 ticks. + * The minimum time_slice is 10 ticks, so let's try 8 ticks. */ -unsigned long cache_decay_ticks=10; +unsigned long cache_decay_ticks=8; diff -urNp a/fs/exec.c c/fs/exec.c --- a/fs/exec.c Fri Dec 21 18:41:55 2001 +++ c/fs/exec.c Tue Jul 16 11:38:41 2002 @@ -860,6 +860,11 @@ int do_execve(char * filename, char ** a int retval; int i; +#ifdef CONFIG_NODE_AFFINE_SCHED + if (current->node_policy == NODPOL_EXEC) + sched_balance_exec(); +#endif + file = open_exec(filename); retval = PTR_ERR(file); diff -urNp a/fs/pipe.c c/fs/pipe.c --- a/fs/pipe.c Wed Jul 24 16:42:16 2002 +++ c/fs/pipe.c Mon Jul 15 17:05:58 2002 @@ -115,7 +115,7 @@ do_more_read: * writers synchronously that there is more * room. 
*/ - wake_up_interruptible_sync(PIPE_WAIT(*inode)); + wake_up_interruptible(PIPE_WAIT(*inode)); if (!PIPE_EMPTY(*inode)) BUG(); goto do_more_read; diff -urNp a/include/asm-i386/atomic.h c/include/asm-i386/atomic.h --- a/include/asm-i386/atomic.h Thu Nov 22 20:46:18 2001 +++ c/include/asm-i386/atomic.h Tue Jul 16 11:38:41 2002 @@ -111,6 +111,18 @@ static __inline__ void atomic_inc(atomic } /** + * atomic_inc_return - increment atomic variable and return new value + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 and returns its new value. Note that + * the guaranteed useful range of an atomic_t is only 24 bits. + */ +static inline int atomic_inc_return(atomic_t *v){ + atomic_inc(v); + return v->counter; +} + +/** * atomic_dec - decrement atomic variable * @v: pointer of type atomic_t * diff -urNp a/include/asm-i386/smp.h c/include/asm-i386/smp.h --- a/include/asm-i386/smp.h Mon Jul 15 17:05:58 2002 +++ c/include/asm-i386/smp.h Tue Jul 16 11:38:41 2002 @@ -123,5 +123,11 @@ static __inline int logical_smp_processo #define NO_PROC_ID 0xFF /* No processor magic marker */ +#ifdef CONFIG_NUMA_SCHED +#define NR_NODES 8 +#define cpu_physical_id(cpuid) (cpu_to_physical_apicid(cpuid)) +#define SAPICID_TO_PNODE(hwid) (cpu_to_logical_apicid(physical_apicid_to_cpu(hwid)) >> 4) +#endif + #endif #endif diff -urNp a/include/asm-ia64/smp.h c/include/asm-ia64/smp.h --- a/include/asm-ia64/smp.h Mon Jul 15 17:05:58 2002 +++ c/include/asm-ia64/smp.h Tue Jul 16 11:38:41 2002 @@ -13,6 +13,7 @@ #ifdef CONFIG_SMP +#include #include #include #include @@ -110,6 +111,32 @@ hard_smp_processor_id (void) #define NO_PROC_ID 0xffffffff /* no processor magic marker */ +#ifdef CONFIG_NUMA_SCHED +#ifdef CONFIG_IA64_DIG +/* sooner or later this should be a configurable parameter [EF] */ +#define NR_NODES 8 +/* + * This is the node ID on the NEC AzusA, + * on LION and BigSur it correctly initializes to node 0 + */ +#define SAPICID_TO_PNODE(hwid) ((hwid >> 12) & 0xff) + +#elif 
defined(CONFIG_IA64_SGI_SN) + +/* + * SGI SN1 & SN2 specific macros + */ +#define NR_NODES 32 +#define SAPICID_TO_PNODE(hwid) cpuid_to_cnodeid(hwid) + +#endif + +#else /* !CONFIG_NUMA_SCHED */ +#define NR_NODES 1 +#define CPU_TO_NODE(cpu) 0 +#define SAPICID_TO_PNODE(hwid) 0 +#endif /* CONFIG_NUMA_SCHED */ + extern void __init init_smp_config (void); extern void smp_do_timer (struct pt_regs *regs); diff -urNp a/include/linux/prctl.h c/include/linux/prctl.h --- a/include/linux/prctl.h Mon Feb 25 20:38:13 2002 +++ c/include/linux/prctl.h Tue Jul 16 11:38:41 2002 @@ -26,4 +26,10 @@ # define PR_FPEMU_NOPRINT 1 /* silently emulate fp operations accesses */ # define PR_FPEMU_SIGFPE 2 /* don't emulate fp operations, send SIGFPE instead */ +/* Get/set node for node-affine scheduling */ +#define PR_GET_NODE 16 +#define PR_SET_NODE 17 +#define PR_GET_NODPOL 18 +#define PR_SET_NODPOL 19 + #endif /* _LINUX_PRCTL_H */ diff -urNp a/include/linux/sched.h c/include/linux/sched.h --- a/include/linux/sched.h Mon Jul 15 17:05:58 2002 +++ c/include/linux/sched.h Sun Jul 21 18:45:50 2002 @@ -153,6 +153,16 @@ extern void resched_cpu(int cpu); extern void scheduler_tick(int user_tick, int system); extern void migration_init(void); extern unsigned long cache_decay_ticks; +#ifdef CONFIG_NUMA_SCHED +extern void sched_balance_exec(void); +extern void sched_balance_fork(task_t *p); +extern void set_task_node(task_t *p, int node); +#else +#define sched_balance_exec() {} +#define sched_balance_fork(p) {} +#define set_task_node(p,n) {} +#endif +extern void sched_migrate_task(task_t *p, int cpu); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); @@ -315,7 +325,10 @@ struct task_struct { unsigned long policy; unsigned long cpus_allowed; unsigned int time_slice; - +#ifdef CONFIG_NUMA_SCHED + int node; + int node_policy; +#endif task_t *next_task, *prev_task; struct mm_struct *mm, *active_mm; @@ -454,6 +467,48 @@ extern void 
set_cpus_allowed(task_t *p, # define set_cpus_allowed(p, new_mask) do { } while (0) #endif +/* Avoid zeroes in integer divides for load calculations */ +#define BALANCE_FACTOR 100 +/* + * If the current node has average load it waits 100ms before trying to + * steal a task from a remote node. + */ + +#ifdef CONFIG_NUMA_SCHED + +#define POOL_DELAY(this_node,node) \ + (_pool_delay[this_node * numpools + node]) +#define POOL_WEIGHT(this_node,node) \ + (_pool_weight[this_node * numpools + node]) + +#define NODPOL_EXEC 0 /* choose node & cpu in do_exec */ +#define NODPOL_FORK 1 /* choose node & cpu in do_fork if !CLONE_VM */ +#define NODPOL_FORK_ALL 2 /* choose node & cpu in do_fork */ + +extern int node_levels[NR_NODES]; +extern int nr_node_levels; +extern void find_node_levels(int numpools); + +extern int numpools; +extern int pool_ptr[NR_NODES+1]; +extern int pool_cpus[NR_CPUS]; +extern int pool_nr_cpus[NR_NODES]; +extern unsigned long pool_mask[NR_NODES]; +extern int pnode_to_lnode[NR_NODES]; +extern atomic_t pool_lock; +extern void *runqueues_address; +extern char lnode_number[NR_CPUS] __cacheline_aligned; +#define CPU_TO_NODE(cpu) lnode_number[cpu] + +extern void pooldata_lock(void); +extern void pooldata_unlock(void); +# define HOMENODE_INC(rq,node) (rq)->nr_homenode[node]++ +# define HOMENODE_DEC(rq,node) (rq)->nr_homenode[node]-- +#else +# define HOMENODE_INC(rq,node) {} +# define HOMENODE_DEC(rq,node) {} +#endif + extern void set_user_nice(task_t *p, long nice); extern int task_prio(task_t *p); extern int task_nice(task_t *p); diff -urNp a/kernel/fork.c c/kernel/fork.c --- a/kernel/fork.c Mon Jul 15 17:06:00 2002 +++ c/kernel/fork.c Tue Jul 16 11:38:41 2002 @@ -649,11 +649,12 @@ int do_fork(unsigned long clone_flags, u #ifdef CONFIG_SMP { int i; +#ifdef CONFIG_NODE_AFFINE_SCHED + if (p->node_policy == NODPOL_FORK_ALL || + (p->node_policy == NODPOL_FORK && !(clone_flags & CLONE_VM))) + sched_balance_fork(p); +#endif - if (likely(p->cpus_allowed & (1UL<cpu = 
smp_processor_id(); - else - p->cpu = __ffs(p->cpus_allowed); /* ?? should we just memset this ?? */ for(i = 0; i < smp_num_cpus; i++) p->per_cpu_utime[cpu_logical_map(i)] = diff -urNp a/kernel/ksyms.c c/kernel/ksyms.c --- a/kernel/ksyms.c Mon Jul 15 17:06:00 2002 +++ c/kernel/ksyms.c Tue Jul 16 11:38:41 2002 @@ -564,3 +564,14 @@ EXPORT_SYMBOL(init_task_union); EXPORT_SYMBOL(tasklist_lock); EXPORT_SYMBOL(pidhash); + +#ifdef CONFIG_NUMA_SCHED +#include +EXPORT_SYMBOL(runqueues_address); +EXPORT_SYMBOL(numpools); +EXPORT_SYMBOL(pool_ptr); +EXPORT_SYMBOL(pool_cpus); +EXPORT_SYMBOL(pool_nr_cpus); +EXPORT_SYMBOL(pool_mask); +EXPORT_SYMBOL(sched_migrate_task); +#endif diff -urNp a/kernel/sched.c c/kernel/sched.c --- a/kernel/sched.c Mon Jul 15 17:24:23 2002 +++ c/kernel/sched.c Wed Aug 21 13:18:04 2002 @@ -21,6 +21,11 @@ #include #include #include +#include +#ifdef CONFIG_X86 +#include +#endif +#include /* * Priority of a process goes from 0 to 139. The 0-99 @@ -146,9 +151,13 @@ struct runqueue { int prev_nr_running[NR_CPUS]; task_t *migration_thread; list_t migration_queue; + unsigned long wait_time; + int wait_node; + short nr_homenode[NR_NODES]; + int load[2][NR_CPUS]; } ____cacheline_aligned; -static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; +struct runqueue runqueues[NR_CPUS] __cacheline_aligned; #define cpu_rq(cpu) (runqueues + (cpu)) #define this_rq() cpu_rq(smp_processor_id()) @@ -166,6 +175,39 @@ int task_has_cpu(task_t *p) } /* + * Variables for describing and accessing processor pools. Using a + * compressed row format like notation. Processor pools are treated + * like logical node numbers. + * + * numpools: number of CPU pools (nodes), + * pool_cpus[]: CPUs in pools sorted by their pool ID, + * pool_ptr[pool]: index of first element in pool_cpus[] belonging to pool. + * pnode_to_lnode[pnode]: pool number corresponding to a physical node ID. + * pool_mask[]: cpu mask of a pool. 
+ * _pool_delay[]: delay when stealing a task from remote nodes for multilevel + * topology. Needed by the macro POOL_DELAY(). + * + * Example: loop over all CPUs in a pool p: + * for (i = pool_ptr[p]; i < pool_ptr[p+1]; i++) { + * cpu = pool_cpus[i]; + * ... + * } + * + */ +int numpools = 1; +int pool_ptr[NR_NODES+1] = { 0, NR_CPUS, }; +int pool_cpus[NR_CPUS]; +int pool_nr_cpus[NR_NODES] = { NR_CPUS, }; +unsigned long pool_mask[NR_NODES] = { -1L, }; +int pnode_to_lnode[NR_NODES] = { [0 ... NR_NODES-1] = -1 }; +void *runqueues_address = (void *)runqueues; /* export this symbol to modules */ +char lnode_number[NR_CPUS] __cacheline_aligned; +static int _pool_delay[NR_NODES*NR_NODES] __cacheline_aligned; +static int _pool_weight[NR_NODES*NR_NODES] __cacheline_aligned; +static atomic_t pool_lock = ATOMIC_INIT(0); /* set to 1 while modifying pool data */ +#define MAX_CACHE_WEIGHT 100 + +/* * Default context-switch locking: */ #ifndef prepare_arch_switch @@ -263,10 +305,12 @@ static inline void activate_task(task_t } enqueue_task(p, array); rq->nr_running++; + HOMENODE_INC(rq,p->node); } static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { + HOMENODE_DEC(rq,p->node); rq->nr_running--; if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; @@ -294,6 +338,7 @@ void resched_cpu(int cpu) } #ifdef CONFIG_SMP + /* * Wait for a process to unschedule. This is used by the exit() and * ptrace() code. 
@@ -369,6 +414,7 @@ repeat_lock_task: } p->state = TASK_RUNNING; task_rq_unlock(rq, &flags); + return success; } @@ -380,9 +426,9 @@ int wake_up_process(task_t * p) void wake_up_forked_process(task_t * p) { runqueue_t *rq; + unsigned long flags; - rq = this_rq(); - spin_lock_irq(&rq->lock); + rq = task_rq_lock(p, &flags); p->state = TASK_RUNNING; if (!rt_task(p)) { /* @@ -394,9 +440,11 @@ void wake_up_forked_process(task_t * p) p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100; p->prio = effective_prio(p); } - p->cpu = smp_processor_id(); + //p->cpu = smp_processor_id(); activate_task(p, rq); - spin_unlock_irq(&rq->lock); + if (p->prio < rq->curr->prio) + resched_task(rq->curr); + task_rq_unlock(rq, &flags); } /* @@ -473,6 +521,11 @@ unsigned long nr_running(void) return sum; } +unsigned long qnr_running(int cpu) +{ + return cpu_rq(cpu_logical_map(cpu))->nr_running; +} + /* Note: the per-cpu information is useful only to get the cumulative result */ unsigned long nr_uninterruptible(void) { @@ -519,83 +572,151 @@ static inline unsigned int double_lock_b } /* - * Current runqueue is empty, or rebalance tick: if there is an - * inbalance (current runqueue is too short) then pull from - * busiest runqueue(s). + * Calculate load of a CPU pool, store results in data[][NR_CPUS]. + * Return the index of the most loaded runqueue. * - * We call this with the current runqueue locked, - * irqs disabled. */ -static void load_balance(runqueue_t *this_rq, int idle) +static int calc_pool_load(int data[][NR_CPUS], int this_cpu, int pool, int idle) { - int imbalance, nr_running, load, max_load, - idx, i, this_cpu = smp_processor_id(); - task_t *next = this_rq->idle, *tmp; - runqueue_t *busiest, *rq_src; - prio_array_t *array; - list_t *head, *curr; - - /* - * We search all runqueues to find the most busy one. - * We do this lockless to reduce cache-bouncing overhead, - * we re-check the 'best' source CPU later on again, with - * the lock held. 
- * - * We fend off statistical fluctuations in runqueue lengths by - * saving the runqueue length during the previous load-balancing - * operation and using the smaller one the current and saved lengths. - * If a runqueue is long enough for a longer amount of time then - * we recognize it and pull tasks from it. - * - * The 'current runqueue length' is a statistical maximum variable, - * for that one we take the longer one - to avoid fluctuations in - * the other direction. So for a load-balance to happen it needs - * stable long runqueue on the target CPU and stable short runqueue - * on the local runqueue. - * - * We make an exception if this CPU is about to become idle - in - * that case we are less picky about moving a task across CPUs and - * take what can be taken. - */ - if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) - nr_running = this_rq->nr_running; - else - nr_running = this_rq->prev_nr_running[this_cpu]; - - busiest = NULL; - max_load = 1; - for (i = 0; i < smp_num_cpus; i++) { - int logical = cpu_logical_map(i); - - rq_src = cpu_rq(logical); - if (idle || (rq_src->nr_running < this_rq->prev_nr_running[logical])) + runqueue_t *rq_src, *this_rq = cpu_rq(this_cpu); + int this_pool = CPU_TO_NODE(this_cpu); + int i, ii, idx=-1, refload, load; + + data[1][pool] = 0; + refload = -1; + + for (ii = pool_ptr[pool]; ii < pool_ptr[pool+1]; ii++) { + i = pool_cpus[ii]; + rq_src = cpu_rq(cpu_logical_map(i)); + if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i])) load = rq_src->nr_running; else - load = this_rq->prev_nr_running[logical]; - this_rq->prev_nr_running[logical] = rq_src->nr_running; + load = this_rq->prev_nr_running[i]; + this_rq->prev_nr_running[i] = rq_src->nr_running; +#ifdef CONFIG_NODE_AFFINE_SCHED + /* prefer cpus running tasks from this node */ + if (pool != this_pool) + load += rq_src->nr_homenode[this_pool]; +#endif - if ((load > max_load) && (rq_src != this_rq)) { - busiest = rq_src; - max_load = load; + 
data[0][i] = load; + data[1][pool] += load; + + if (load > refload) { + idx = i; + refload = load; } } + data[1][pool] = data[1][pool] * BALANCE_FACTOR / pool_nr_cpus[pool]; + return idx; +} - if (likely(!busiest)) - return; +/* + * Find a runqueue from which to steal a task. We try to do this as locally as + * possible because we don't want to let tasks get far from their home node. + * This is done in two steps: + * 1. First try to find a runqueue within the own CPU pool (AKA node) with + * imbalance larger than 25% (relative to the current runqueue). + * 2. If the local node is well balanced, locate the most loaded node and its + * most loaded CPU. Remote runqueues running tasks having their homenode on the + * current node are preferred (those tasks count twice in the load calculation). + * If the current load is far below the average try to steal a task from the + * most loaded node/cpu. Otherwise wait 100ms and give less loaded nodes the + * chance to approach the average load. + * + * This concept can be extended easily to more than two levels (multi-level + * scheduler?), e.g.: CPU -> multi-core package -> node -> supernode... + * + */ +static inline runqueue_t *scan_pools(runqueue_t *this_rq, int idle, int *nr_running) +{ + runqueue_t *busiest = NULL; + int imax, best_cpu, pool, max_pool_load, max_pool_idx; + int i, del_shift, this_cpu = this_rq->curr->cpu; + int avg_load=-1, this_pool = CPU_TO_NODE(this_cpu); - imbalance = (max_load - nr_running) / 2; + /* Need at least ~25% imbalance to trigger balancing. */ +#define BALANCED(m,t) (((m) <= 1) || (((m) - (t))/2 < (((m) + (t))/2 + 3)/4)) - /* It needs an at least ~25% imbalance to trigger balancing. 
*/ - if (!idle && (imbalance < (max_load + 3)/4)) - return; + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + *nr_running = this_rq->nr_running; + else + *nr_running = this_rq->prev_nr_running[this_cpu]; + + best_cpu = calc_pool_load(this_rq->load, this_cpu, this_pool, idle); + if (best_cpu != this_cpu) + goto check_out; + + scan_all: + best_cpu = -1; + max_pool_load = this_rq->load[1][this_pool]; + max_pool_idx = this_pool; + avg_load = max_pool_load * pool_nr_cpus[this_pool]; + for (i = 1; i < numpools; i++) { + pool = (i + this_pool) % numpools; + imax = calc_pool_load(this_rq->load, this_cpu, pool, idle); + avg_load += this_rq->load[1][pool]*pool_nr_cpus[pool]; + if (this_rq->load[1][pool] > max_pool_load) { + max_pool_load = this_rq->load[1][pool]; + max_pool_idx = pool; + best_cpu = imax; + } + } + /* Exit if not enough imbalance on any remote node. */ + if ((best_cpu < 0) || + BALANCED(max_pool_load,this_rq->load[1][this_pool])) { + this_rq->wait_node = -1; + goto out; + } + avg_load /= smp_num_cpus; + /* Wait longer before stealing if load is average. */ + if (BALANCED(avg_load,this_rq->load[1][this_pool])) + del_shift = 0; + else + del_shift = 6; + + if (this_rq->wait_node != max_pool_idx) { + this_rq->wait_node = max_pool_idx; + this_rq->wait_time = jiffies; + goto out; + } else + if (jiffies - this_rq->wait_time < + (POOL_DELAY(this_pool,this_rq->wait_node) >> del_shift)) + goto out; + check_out: + /* Enough imbalance in the remote cpu loads? */ + if (!BALANCED(this_rq->load[0][best_cpu],*nr_running)) { + busiest = cpu_rq(cpu_logical_map(best_cpu)); + this_rq->wait_node = -1; + } else if (avg_load == -1) + /* only scanned local pool, so let's look at all of them */ + goto scan_all; + out: + return busiest; +} + +/* + * Find a task to steal from the busiest RQ. The busiest->lock must be held + * while calling this routine. 
+ */ +static inline task_t *task_to_steal(runqueue_t *busiest, int this_cpu) +{ + int idx; + task_t *next = NULL, *tmp; + prio_array_t *array; + list_t *head, *curr; + int this_pool=CPU_TO_NODE(this_cpu), weight, maxweight=0; - nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running); /* - * Make sure nothing changed since we checked the - * runqueue length. + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed. */ - if (busiest->nr_running <= nr_running + 1) - goto out_unlock; + +#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ + ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \ + !task_running(rq, p) && \ + ((p)->cpus_allowed & (1UL<<(this_cpu)))) /* * We first consider expired tasks. Those will likely not be @@ -621,7 +742,7 @@ skip_bitmap: array = busiest->active; goto new_array; } - goto out_unlock; + goto out; } head = array->queue + idx; @@ -629,45 +750,139 @@ skip_bitmap: skip_queue: tmp = list_entry(curr, task_t, run_list); + if (CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { + weight = (jiffies - tmp->sleep_timestamp)/cache_decay_ticks; +#ifdef CONFIG_NODE_AFFINE_SCHED + /* limit weight influence of sleep_time and cache coolness */ + if (weight >= MAX_CACHE_WEIGHT) weight=MAX_CACHE_WEIGHT-1; + /* weight depending on homenode of task */ + weight += POOL_WEIGHT(this_pool,tmp->node); + /* task gets bonus if running on its homenode */ + if (tmp->node == CPU_TO_NODE(busiest->curr->cpu)) + weight -= MAX_CACHE_WEIGHT; +#endif + if (weight > maxweight) { + maxweight = weight; + next = tmp; + } + } + curr = curr->next; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + + out: + return next; +} + +static inline void +try_push_home(runqueue_t *this_rq, int this_cpu, int nr_running) +{ +#ifdef CONFIG_NODE_AFFINE_SCHED + task_t *p; + int tgt_pool, tgt_cpu, i, ii; + runqueue_t *rq; + static int sched_push_task(task_t *p, int cpu_dest); + + if (nr_running != 
1) + return; + p = this_rq->curr; + tgt_pool = p->node; + if (tgt_pool != CPU_TO_NODE(this_cpu)) { + /* compute how many own tasks run on the tgt node */ + int load = 0; + for (ii=pool_ptr[tgt_pool]; iinr_homenode[tgt_pool]; + } + load = BALANCE_FACTOR * load / pool_nr_cpus[tgt_pool]; + if (load < BALANCE_FACTOR/4) { + tgt_cpu = __ffs(p->cpus_allowed & pool_mask[tgt_pool] + & cpu_online_map); + if (tgt_cpu) + sched_push_task(p, tgt_cpu); + } + } +#endif +} + +/* + * Current runqueue is empty, or rebalance tick: if there is an + * inbalance (current runqueue is too short) then pull from + * busiest runqueue(s). + * + * We call this with the current runqueue locked, + * irqs disabled. + */ +static void load_balance(runqueue_t *this_rq, int idle) +{ + int nr_running, this_cpu = this_rq->curr->cpu; + task_t *next; + runqueue_t *busiest; + prio_array_t *array; + /* - * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. + * We search all runqueues to find the most busy one. + * We do this lockless to reduce cache-bouncing overhead, + * we re-check the 'best' source CPU later on again, with + * the lock held. + * + * We fend off statistical fluctuations in runqueue lengths by + * saving the runqueue length during the previous load-balancing + * operation and using the smaller one the current and saved lengths. + * If a runqueue is long enough for a longer amount of time then + * we recognize it and pull tasks from it. + * + * The 'current runqueue length' is a statistical maximum variable, + * for that one we take the longer one - to avoid fluctuations in + * the other direction. So for a load-balance to happen it needs + * stable long runqueue on the target CPU and stable short runqueue + * on the local runqueue. 
+ * + * We make an exception if this CPU is about to become idle - in + * that case we are less picky about moving a task across CPUs and + * take what can be taken. */ + /* avoid deadlock by timer interrupt on own cpu */ + if (atomic_read(&pool_lock)) return; + busiest = scan_pools(this_rq, idle, &nr_running); -#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ - ((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \ - !task_running(rq, p) && \ - ((p)->cpus_allowed & (1 << (this_cpu)))) - - if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { - curr = curr->next; - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; + if (!busiest) { + try_push_home(this_rq, this_cpu, nr_running); + return; } - next = tmp; + + nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running); + /* + * Make sure nothing changed since we checked the + * runqueue length. + */ + if (busiest->nr_running <= nr_running + 1) + goto out_unlock; + + new_steal: + next = task_to_steal(busiest, this_cpu); + if (!next) + goto out_unlock; + + array = next->array; + /* * take the task out of the other runqueue and * put it into this one: */ dequeue_task(next, array); busiest->nr_running--; + HOMENODE_DEC(busiest,next->node); next->cpu = this_cpu; this_rq->nr_running++; enqueue_task(next, this_rq->active); + HOMENODE_INC(this_rq,next->node); if (next->prio < current->prio) set_need_resched(); -#ifdef ORIG_IMBALANCE - if (!idle && --imbalance) { - if (array == busiest->expired) { - array = busiest->active; - goto new_array; - } - } -#endif + out_unlock: spin_unlock(&busiest->lock); } @@ -678,10 +893,10 @@ out_unlock: * frequency and balancing agressivity depends on whether the CPU is * idle or not. * - * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on + * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on * systems with HZ=100, every 10 msecs.) 
*/ -#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) +#define BUSY_REBALANCE_TICK (HZ/5 ?: 1) #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) static inline void idle_tick(runqueue_t *rq) @@ -689,10 +904,7 @@ static inline void idle_tick(runqueue_t if (jiffies % IDLE_REBALANCE_TICK) return; spin_lock(&rq->lock); - if (!rq->nr_running) - load_balance(rq, 1); - else - set_tsk_need_resched(rq->curr); + load_balance(rq, 1); spin_unlock(&rq->lock); } @@ -1484,6 +1696,286 @@ static inline void double_rq_unlock(runq spin_unlock(&rq2->lock); } +/* used as counter for round-robin node-scheduling */ +static atomic_t sched_node=ATOMIC_INIT(0); + +/* + * Find the least loaded CPU on the current node of the task. + */ +#ifdef CONFIG_NUMA_SCHED +static int sched_best_cpu(struct task_struct *p) +{ + int n, cpu, load, best_cpu = p->cpu; + + load = 1000000; + for (n = pool_ptr[p->node]; n < pool_ptr[p->node+1]; n++) { + cpu = cpu_logical_map(pool_cpus[n]); + if (!(p->cpus_allowed & (1UL << cpu) & cpu_online_map)) + continue; + if (cpu_rq(cpu)->nr_running < load) { + best_cpu = cpu; + load = cpu_rq(cpu)->nr_running; + } + } + return best_cpu; +} + +/* + * Find the node with fewest tasks assigned to it. Don't bother about the + * current load of the nodes, the load balancer should take care. 
+ * The argument flag gives some options for initial load balancing: + * flag = 0: don't count own task (when balancing at do_exec()) + * flag = 1: count own task (when balancing at do_fork()) + */ +static int sched_best_node(struct task_struct *p, int flag) +{ + int n, best_node=0, min_load, pool_load, min_pool=p->node; + int pool, load[NR_NODES]; + unsigned long mask = p->cpus_allowed & cpu_online_map; + + do { + best_node = atomic_inc_return(&sched_node) % numpools; + } while (!(pool_mask[best_node] & mask)); + + for (pool = 0; pool < numpools; pool++) + load[pool] = 0; + + for (n = 0; n < smp_num_cpus; n++) +#ifdef CONFIG_NODE_AFFINE_SCHED + for (pool = 0; pool < numpools; pool++) + load[pool] += cpu_rq(cpu_logical_map(n))->nr_homenode[pool]; +#else + load[CPU_TO_NODE(n)] += cpu_rq(cpu_logical_map(n))->nr_running; +#endif + + /* don't count own process, this one will be moved */ + if (!flag) + --load[p->node]; + + min_load = 100000000; + for (n = 0; n < numpools; n++) { + pool = (best_node + n) % numpools; + pool_load = (100*load[pool])/pool_nr_cpus[pool]; + if ((pool_load < min_load) && (pool_mask[pool] & mask)) { + min_load = pool_load; + min_pool = pool; + } + } + atomic_set(&sched_node, min_pool); + return min_pool; +} + +void sched_balance_exec(void) +{ + int new_cpu, new_node; + + while (atomic_read(&pool_lock)) + cpu_relax(); + if (numpools > 1) { + new_node = sched_best_node(current, 0); + if (new_node != current->node) { + set_task_node(current, new_node); + } + } + new_cpu = sched_best_cpu(current); + if (new_cpu != smp_processor_id()) + sched_migrate_task(current, new_cpu); +} + +void sched_balance_fork(task_t *p) +{ + while (atomic_read(&pool_lock)) + cpu_relax(); + if (numpools > 1) + p->node = sched_best_node(p, 1); + p->cpu = sched_best_cpu(p); +} + +void pools_info(void) +{ + int i, j; + + printk("CPU pools : %d\n",numpools); + for (i=0;i 1) { + atomic_dec(&pool_lock); + goto retry; + } + /* + * Wait a while, any loops using pool data should 
finish + * in between. This is VERY ugly and should be replaced + * by some real RCU stuff. [EF] + */ + for (i=0; i<100; i++) + udelay(1000); +} + +void pooldata_unlock(void) +{ + atomic_dec(&pool_lock); +} + +int node_levels[NR_NODES]; +int nr_node_levels; + +/* + * Default setting of node_distance() for up to 8 nodes. + * Each platform should initialize this to more appropriate values + * in the arch dependent part. + */ +#ifndef node_distance +int __node_distance[ 8 * 8] = { 10, 15, 15, 15, 20, 20, 20, 20, + 15, 10, 15, 15, 20, 20, 20, 20, + 15, 15, 10, 15, 20, 20, 20, 20, + 15, 15, 15, 10, 20, 20, 20, 20, + 20, 20, 20, 20, 10, 15, 15, 15, + 20, 20, 20, 20, 15, 10, 15, 15, + 20, 20, 20, 20, 15, 15, 10, 15, + 20, 20, 20, 20, 15, 15, 15, 10 }; +#define node_distance(i,j) __node_distance[i*8+j] +#endif + +/* + * Find all values of node distances in the SLIT table and sort them + * into the array node_levels[]. + */ +static void +find_node_levels(inr numpools) +{ + int lev, tgtlev, nlarger, i; + + nr_node_levels = 1; + node_levels[0] = node_distance(0, 0); + do { + nlarger = 0; + tgtlev = 100000; + for (i=1; i node_levels[nr_node_levels-1] && + lev < tgtlev) { + if (tgtlev < 100000) nlarger++; + tgtlev = lev; + } + if (lev > tgtlev) nlarger++; + } + if (tgtlev != 100000) + node_levels[nr_node_levels++] = tgtlev; + } while (nlarger); + + for (i=0; i= numpools) return; + rq = task_rq_lock(p, &flags); + if (p->array) { + HOMENODE_DEC(rq, p->node); + HOMENODE_INC(rq, node); + } + p->node = node; + task_rq_unlock(rq, &flags); +} +#endif /* CONFIG_NUMA_SCHED */ + void __init init_idle(task_t *idle, int cpu) { runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(idle->cpu); @@ -1499,6 +1991,9 @@ void __init init_idle(task_t *idle, int idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; idle->cpu = cpu; +#ifdef CONFIG_NUMA_SCHED + idle->node = 0; // was: SAPICID_TO_PNODE(cpu_physical_id(cpu)); +#endif double_rq_unlock(idle_rq, rq); set_tsk_need_resched(idle); 
__restore_flags(flags); @@ -1532,7 +2027,21 @@ void __init sched_init(void) // delimiter for bitsearch __set_bit(MAX_PRIO, array->bitmap); } +#ifdef CONFIG_NUMA_SCHED + for (j = 0; j < NR_NODES; j++) + rq->nr_homenode[j]=0; + pool_cpus[i] = i; +#endif } +#ifdef CONFIG_NUMA_SCHED + pool_ptr[0] = 0; + pool_ptr[1] = NR_CPUS; + + numpools = 1; + pool_mask[0] = -1L; + pool_nr_cpus[0] = NR_CPUS; +#endif + /* * We have to do a little magic to get the first * process right in SMP mode. @@ -1576,6 +2085,8 @@ typedef struct { list_t list; task_t *task; struct semaphore sem; + int cpu_dest; + int sync; } migration_req_t; /* @@ -1593,8 +2104,7 @@ void set_cpus_allowed(task_t *p, unsigne migration_req_t req; runqueue_t *rq; - new_mask &= cpu_online_map; - if (!new_mask) + if (!(new_mask & cpu_online_map)) BUG(); rq = task_rq_lock(p, &flags); @@ -1619,6 +2129,7 @@ void set_cpus_allowed(task_t *p, unsigne init_MUTEX_LOCKED(&req.sem); req.task = p; + req.sync = 1; list_add(&req.list, &rq->migration_queue); task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -1626,10 +2137,56 @@ void set_cpus_allowed(task_t *p, unsigne down(&req.sem); } -static int migration_thread(void * unused) +/* + * Static per cpu migration request structures for pushing the current + * process to another CPU from within load_balance(). + */ +static migration_req_t migr_req[NR_CPUS]; + +/* + * Push the current task to another cpu asynchronously. To be used from within + * load_balance() to push tasks running alone on a remote node back to their + * homenode. The RQ lock must be held when calling this function, it protects + * migr_req[cpu]. Function should not be preempted! 
+ */ +static int sched_push_task(task_t *p, int cpu_dest) { - int bind_cpu = (int) (long) unused; - int cpu = cpu_logical_map(bind_cpu); + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + unsigned long flags; + + if (migr_req[cpu].task) + return -1; + else { + migr_req[cpu].task = p; + migr_req[cpu].cpu_dest = cpu_dest; + migr_req[cpu].sync = 0; + list_add(&migr_req[cpu].list, &rq->migration_queue); + + if (!rq->migration_thread->array) { + activate_task(rq->migration_thread, rq); + if (rq->migration_thread->prio < rq->curr->prio) + resched_task(rq->curr); + } + rq->migration_thread->state = TASK_RUNNING; + return 0; + } +} + +void sched_migrate_task(task_t *p, int dest_cpu) +{ + unsigned long old_mask; + + old_mask = p->cpus_allowed; + if (!(old_mask & (1UL << cpu_logical_map(dest_cpu)))) + return; + set_cpus_allowed(p, 1UL << cpu_logical_map(dest_cpu)); + set_cpus_allowed(p, old_mask); +} + +static int migration_thread(void * bind_cpu) +{ + int cpu = cpu_logical_map((int) (long) bind_cpu); struct sched_param param = { sched_priority: 99 }; runqueue_t *rq; int ret; @@ -1647,6 +2204,7 @@ static int migration_thread(void * unuse yield(); set_cpus_allowed(current, 1UL << cpu); } + set_task_node(current, CPU_TO_NODE(cpu)); printk("migration_task %d on cpu=%d\n",cpu,smp_processor_id()); ret = setscheduler(0, SCHED_FIFO, ¶m); @@ -1658,7 +2216,7 @@ static int migration_thread(void * unuse for (;;) { runqueue_t *rq_src, *rq_dest; struct list_head *head; - int cpu_src, cpu_dest; + int cpu_src, cpu_dest, sync; migration_req_t *req; unsigned long flags; task_t *p; @@ -1673,10 +2231,17 @@ static int migration_thread(void * unuse } req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); - spin_unlock_irqrestore(&rq->lock, flags); p = req->task; - cpu_dest = __ffs(p->cpus_allowed); + sync = req->sync; + if (sync) + cpu_dest = __ffs(p->cpus_allowed & cpu_online_map); + else { + cpu_dest = req->cpu_dest; + req->task = NULL; + } + 
spin_unlock_irqrestore(&rq->lock, flags); + rq_dest = cpu_rq(cpu_dest); repeat: cpu_src = p->cpu; @@ -1701,7 +2266,8 @@ repeat: double_rq_unlock(rq_src, rq_dest); local_irq_restore(flags); - up(&req->sem); + if (sync) + up(&req->sem); } } diff -urNp a/kernel/softirq.c c/kernel/softirq.c --- a/kernel/softirq.c Mon Jul 15 17:06:00 2002 +++ c/kernel/softirq.c Tue Jul 16 11:38:41 2002 @@ -371,6 +371,7 @@ static int ksoftirqd(void * __bind_cpu) set_cpus_allowed(current, 1UL << cpu); if (cpu() != cpu) BUG(); + set_task_node(current, CPU_TO_NODE(cpu)); sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu); diff -urNp a/kernel/sys.c c/kernel/sys.c --- a/kernel/sys.c Mon Jul 15 17:06:00 2002 +++ c/kernel/sys.c Tue Jul 16 11:38:41 2002 @@ -1205,6 +1205,8 @@ asmlinkage long sys_prctl(int option, un { int error = 0; int sig; + int pid; + struct task_struct *child; switch (option) { case PR_SET_PDEATHSIG: @@ -1272,6 +1274,66 @@ asmlinkage long sys_prctl(int option, un } current->keep_capabilities = arg2; break; +#ifdef CONFIG_NODE_AFFINE_SCHED + case PR_GET_NODE: + pid = (int) arg3; + read_lock(&tasklist_lock); + child = find_task_by_pid(pid); + if (child) { + error = put_user(child->node,(int *)arg2); + } else { + printk("prctl: could not find process %d\n",pid); + error = -EINVAL; + } + read_unlock(&tasklist_lock); + break; + case PR_SET_NODE: + pid = (int) arg3; + read_lock(&tasklist_lock); + child = find_task_by_pid(pid); + if (child) { + if (child->uid == current->uid || \ + current->uid == 0) { + printk("setting node of pid %d to %d\n",pid,(int)arg2); + set_task_node(child,(int)arg2); + } + } else { + printk("prctl: could not find pid %d\n",pid); + error = -EINVAL; + } + read_unlock(&tasklist_lock); + break; + + case PR_GET_NODPOL: + pid = (int) arg3; + read_lock(&tasklist_lock); + child = find_task_by_pid(pid); + if (child) { + error = put_user(child->node_policy,(int *)arg2); + } else { + printk("prctl: could not find pid %d\n",pid); + error = -EINVAL; + } + 
read_unlock(&tasklist_lock); + break; + case PR_SET_NODPOL: + pid = (int) arg3; + read_lock(&tasklist_lock); + child = find_task_by_pid(pid); + if (child) { + if (child->uid == current->uid || \ + current->uid == 0) { + printk("setting node policy of process %d to %d\n",pid,(int)arg2); + child->node_policy = (int) arg2; + } + } else { + printk("prctl: could not find pid %d\n",pid); + error = -EINVAL; + } + read_unlock(&tasklist_lock); + break; +#endif + default: error = -EINVAL; break;