diff -ru linux-2.5.59/fs/proc/proc_misc.c sched-2.5.59/fs/proc/proc_misc.c --- linux-2.5.59/fs/proc/proc_misc.c Thu Jan 16 18:21:39 2003 +++ sched-2.5.59/fs/proc/proc_misc.c Wed Jan 22 15:33:01 2003 @@ -254,6 +254,9 @@ .release = seq_release, }; +extern int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data); + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -573,6 +576,7 @@ {"locks", locks_read_proc}, {"iomem", memory_read_proc}, {"execdomains", execdomains_read_proc}, + {"schedstat", schedstats_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) diff -ru linux-2.5.59/kernel/sched.c sched-2.5.59/kernel/sched.c --- linux-2.5.59/kernel/sched.c Thu Jan 16 18:22:29 2003 +++ sched-2.5.59/kernel/sched.c Mon Jan 27 00:41:58 2003 @@ -224,6 +224,83 @@ #endif /* CONFIG_NUMA */ + +struct schedstat { + /* sys_sched_yield stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule stats */ + unsigned long sched_noswitch; + unsigned long sched_switch; + unsigned long sched_cnt; + + /* load_balance stats */ + unsigned long lb_imbalance; + unsigned long lb_idle; + unsigned long lb_resched; + unsigned long lb_cnt; + unsigned long lb_nobusy; +} ____cacheline_aligned; + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 1 + +struct schedstat schedstats[NR_CPUS]; + +/* + * This could conceivably exceed a page's worth of output on machines with + * large number of cpus, where large == about 4096/100 or 40ish. Start + * worrying when we pass 32, probably. Then this has to stop being a + * "simple" entry in proc/proc_misc.c and needs to be an actual seq_file. + */ +int schedstats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct schedstat sums; + int i, len; + + memset(&sums, 0, sizeof(sums)); + len = sprintf(page, "version %d\n", SCHEDSTAT_VERSION); + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) continue; + sums.yld_exp_empty += schedstats[i].yld_exp_empty; + sums.yld_act_empty += schedstats[i].yld_act_empty; + sums.yld_both_empty += schedstats[i].yld_both_empty; + sums.yld_cnt += schedstats[i].yld_cnt; + sums.sched_noswitch += schedstats[i].sched_noswitch; + sums.sched_switch += schedstats[i].sched_switch; + sums.sched_switch += schedstats[i].sched_cnt; + sums.lb_idle += schedstats[i].lb_idle; + sums.lb_resched += schedstats[i].lb_resched; + sums.lb_cnt += schedstats[i].lb_cnt; + sums.lb_imbalance += schedstats[i].lb_imbalance; + sums.lb_nobusy += schedstats[i].lb_nobusy; + len += sprintf(page + len, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + i, schedstats[i].yld_both_empty, + schedstats[i].yld_act_empty, schedstats[i].yld_exp_empty, + schedstats[i].yld_cnt, schedstats[i].sched_noswitch, + schedstats[i].sched_switch, schedstats[i].sched_cnt, + schedstats[i].lb_idle, schedstats[i].lb_resched, + schedstats[i].lb_cnt, schedstats[i].lb_imbalance, + schedstats[i].lb_nobusy); + } + len += sprintf(page + len, + "totals %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + sums.yld_both_empty, sums.yld_act_empty, sums.yld_exp_empty, + sums.yld_cnt, sums.sched_noswitch, sums.sched_switch, + sums.sched_cnt, sums.lb_idle, sums.lb_resched, sums.lb_cnt, + sums.lb_imbalance, sums.lb_nobusy); + + return len; +} + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without @@ -583,7 +660,6 @@ return prev; } - /* * nr_running, nr_uninterruptible and nr_context_switches: * @@ -921,7 +997,13 @@ * * We call this with the current runqueue locked, * irqs disabled. + * + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on + * systems with HZ=100, every 10 msecs.) */ +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) + static void load_balance(runqueue_t *this_rq, int idle) { int imbalance, idx, this_cpu = smp_processor_id(); @@ -930,11 +1012,18 @@ struct list_head *head, *curr; task_t *tmp; + schedstats[this_cpu].lb_cnt++; + if (idle) + schedstats[this_cpu].lb_idle++; + busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpus_to_balance(this_cpu, this_rq)); - if (!busiest) + if (!busiest) { + schedstats[this_cpu].lb_nobusy++; goto out; + } + schedstats[this_cpu].lb_imbalance += imbalance; /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1005,13 +1094,7 @@ * get called every timer tick, on every CPU. Our balancing action * frequency and balancing agressivity depends on whether the CPU is * idle or not. - * - * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on - * systems with HZ=100, every 10 msecs.) */ -#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) -#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) - static inline void idle_tick(runqueue_t *rq) { if (jiffies % IDLE_REBALANCE_TICK) @@ -1139,13 +1222,14 @@ runqueue_t *rq; prio_array_t *array; struct list_head *queue; - int idx; + int idx, mycpu = smp_processor_id(); /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ + schedstats[mycpu].sched_cnt++; if (likely(current->state != TASK_ZOMBIE)) { if (unlikely(in_atomic())) { printk(KERN_ERR "bad: scheduling while atomic!\n"); @@ -1184,6 +1268,7 @@ pick_next_task: if (unlikely(!rq->nr_running)) { #if CONFIG_SMP + schedstats[this_cpu].lb_resched++; load_balance(rq, 1); if (rq->nr_running) goto pick_next_task; @@ -1198,11 +1283,13 @@ /* * Switch the active and expired arrays. */ + schedstats[mycpu].sched_switch++; rq->active = rq->expired; rq->expired = array; array = rq->active; rq->expired_timestamp = 0; } + schedstats[mycpu].sched_noswitch++; idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; @@ -1859,6 +1946,7 @@ { runqueue_t *rq = this_rq_lock(); prio_array_t *array = current->array; + int mycpu = smp_processor_id(); /* * We implement yielding by moving the task into the expired @@ -1867,7 +1955,15 @@ * (special rule: RT tasks will just roundrobin in the active * array.) */ + schedstats[mycpu].yld_cnt++; if (likely(!rt_task(current))) { + if (current->array->nr_active == 1) { + schedstats[mycpu].yld_act_empty++; + if (!rq->expired->nr_active) + schedstats[mycpu].yld_both_empty++; + } else if (!rq->expired->nr_active) { + schedstats[mycpu].yld_exp_empty++; + } dequeue_task(current, array); enqueue_task(current, rq->expired); } else {