Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c    | 120
-rw-r--r--  kernel/sched/cpuacct.c |  51
-rw-r--r--  kernel/sched/fair.c    | 105
-rw-r--r--  kernel/sched/sched.h   |  14
4 files changed, 165 insertions(+), 125 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 05c39f030314..57c186d9477e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5133,18 +5133,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * two cpus are in the same cache domain, see cpus_share_cache().
  */
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 
 static void update_top_cache_domain(int cpu)
 {
         struct sched_domain *sd;
         int id = cpu;
+        int size = 1;
 
         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-        if (sd)
+        if (sd) {
                 id = cpumask_first(sched_domain_span(sd));
+                size = cpumask_weight(sched_domain_span(sd));
+        }
 
         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+        per_cpu(sd_llc_size, cpu) = size;
         per_cpu(sd_llc_id, cpu) = id;
 }
 
@@ -6815,7 +6820,7 @@ void sched_move_task(struct task_struct *tsk)
         if (unlikely(running))
                 tsk->sched_class->put_prev_task(rq, tsk);
 
-        tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+        tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
                                 lockdep_is_held(&tsk->sighand->siglock)),
                           struct task_group, css);
         tg = autogroup_task_group(tsk, tg);
@@ -7137,23 +7142,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
 
 #ifdef CONFIG_CGROUP_SCHED
 
-/* return corresponding task_group object of a cgroup */
-static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
 {
-        return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
-                            struct task_group, css);
+        return css ? container_of(css, struct task_group, css) : NULL;
 }
 
-static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
-        struct task_group *tg, *parent;
+        struct task_group *parent = css_tg(parent_css);
+        struct task_group *tg;
 
-        if (!cgrp->parent) {
+        if (!parent) {
                 /* This is early initialization for the top cgroup */
                 return &root_task_group.css;
         }
 
-        parent = cgroup_tg(cgrp->parent);
         tg = sched_create_group(parent);
         if (IS_ERR(tg))
                 return ERR_PTR(-ENOMEM);
@@ -7161,41 +7165,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
         return &tg->css;
 }
 
-static int cpu_cgroup_css_online(struct cgroup *cgrp)
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 {
-        struct task_group *tg = cgroup_tg(cgrp);
-        struct task_group *parent;
-
-        if (!cgrp->parent)
-                return 0;
+        struct task_group *tg = css_tg(css);
+        struct task_group *parent = css_tg(css_parent(css));
 
-        parent = cgroup_tg(cgrp->parent);
-        sched_online_group(tg, parent);
+        if (parent)
+                sched_online_group(tg, parent);
         return 0;
 }
 
-static void cpu_cgroup_css_free(struct cgroup *cgrp)
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
-        struct task_group *tg = cgroup_tg(cgrp);
+        struct task_group *tg = css_tg(css);
 
         sched_destroy_group(tg);
 }
 
-static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
-        struct task_group *tg = cgroup_tg(cgrp);
+        struct task_group *tg = css_tg(css);
 
         sched_offline_group(tg);
 }
 
-static int cpu_cgroup_can_attach(struct cgroup *cgrp,
+static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
                                  struct cgroup_taskset *tset)
 {
         struct task_struct *task;
 
-        cgroup_taskset_for_each(task, cgrp, tset) {
+        cgroup_taskset_for_each(task, css, tset) {
 #ifdef CONFIG_RT_GROUP_SCHED
-                if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+                if (!sched_rt_can_attach(css_tg(css), task))
                         return -EINVAL;
 #else
                 /* We don't support RT-tasks being in separate groups */
@@ -7206,18 +7207,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
         return 0;
 }
 
-static void cpu_cgroup_attach(struct cgroup *cgrp,
+static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
                               struct cgroup_taskset *tset)
 {
         struct task_struct *task;
 
-        cgroup_taskset_for_each(task, cgrp, tset)
+        cgroup_taskset_for_each(task, css, tset)
                 sched_move_task(task);
 }
 
-static void
-cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
-                struct task_struct *task)
+static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
+                            struct cgroup_subsys_state *old_css,
+                            struct task_struct *task)
 {
         /*
          * cgroup_exit() is called in the copy_process() failure path.
@@ -7231,15 +7232,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
-                                u64 shareval)
+static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+                                struct cftype *cftype, u64 shareval)
 {
-        return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
+        return sched_group_set_shares(css_tg(css), scale_load(shareval));
 }
 
-static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+                               struct cftype *cft)
 {
-        struct task_group *tg = cgroup_tg(cgrp);
+        struct task_group *tg = css_tg(css);
 
         return (u64) scale_load_down(tg->shares);
 }
@@ -7361,26 +7363,28 @@ long tg_get_cfs_period(struct task_group *tg)
         return cfs_period_us;
 }
 
-static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
+                                  struct cftype *cft)
 {
-        return tg_get_cfs_quota(cgroup_tg(cgrp));
+        return tg_get_cfs_quota(css_tg(css));
 }
 
-static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
-                                   s64 cfs_quota_us)
+static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
+                                   struct cftype *cftype, s64 cfs_quota_us)
 {
-        return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+        return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
 }
 
-static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
+                                   struct cftype *cft)
 {
-        return tg_get_cfs_period(cgroup_tg(cgrp));
+        return tg_get_cfs_period(css_tg(css));
 }
 
-static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
-                                    u64 cfs_period_us)
+static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
+                                    struct cftype *cftype, u64 cfs_period_us)
 {
-        return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+        return tg_set_cfs_period(css_tg(css), cfs_period_us);
 }
 
 struct cfs_schedulable_data {
@@ -7461,10 +7465,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
         return ret;
 }
 
-static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
                 struct cgroup_map_cb *cb)
 {
-        struct task_group *tg = cgroup_tg(cgrp);
+        struct task_group *tg = css_tg(css);
         struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 
         cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7477,26 +7481,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
-static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
-                                s64 val)
+static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+                                struct cftype *cft, s64 val)
 {
-        return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+        return sched_group_set_rt_runtime(css_tg(css), val);
 }
 
-static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
+                               struct cftype *cft)
 {
-        return sched_group_rt_runtime(cgroup_tg(cgrp));
+        return sched_group_rt_runtime(css_tg(css));
 }
 
-static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
-                                    u64 rt_period_us)
+static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
+                                    struct cftype *cftype, u64 rt_period_us)
 {
-        return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+        return sched_group_set_rt_period(css_tg(css), rt_period_us);
 }
 
-static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+                                    struct cftype *cft)
 {
-        return sched_group_rt_period(cgroup_tg(cgrp));
+        return sched_group_rt_period(css_tg(css));
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
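The cpu-controller conversion above hinges on css_tg(): struct task_group embeds its cgroup_subsys_state, so container_of() recovers the group from the css, and a NULL css (the root's parent) simply maps to a NULL task_group, which is why the explicit !cgrp->parent checks disappear. A minimal userspace sketch of that pattern follows; the struct layouts and values are invented for illustration and only the shape of css_tg() mirrors the patch.

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct cgroup_subsys_state { int refcnt; };   /* stand-in for the kernel struct */
    struct task_group {
            long shares;
            struct cgroup_subsys_state css;       /* embedded, as in the kernel */
    };

    /* mirrors css_tg() above: NULL css (the root's parent) -> NULL group */
    static struct task_group *css_tg(struct cgroup_subsys_state *css)
    {
            return css ? container_of(css, struct task_group, css) : NULL;
    }

    int main(void)
    {
            struct task_group tg = { .shares = 1024 };

            printf("%ld\n", css_tg(&tg.css)->shares);   /* prints 1024 */
            printf("%p\n", (void *)css_tg(NULL));       /* root has no parent */
            return 0;
    }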
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2cd95eb..f64722ff0299 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
         struct kernel_cpustat __percpu *cpustat;
 };
 
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
 {
-        return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-                            struct cpuacct, css);
+        return css ? container_of(css, struct cpuacct, css) : NULL;
 }
 
 /* return cpu accounting group to which this task belongs */
 static inline struct cpuacct *task_ca(struct task_struct *tsk)
 {
-        return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-                            struct cpuacct, css);
-}
-
-static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
-{
-        return cgroup_ca(ca->css.cgroup->parent);
+        return css_ca(task_css(tsk, cpuacct_subsys_id));
 }
 
 static inline struct cpuacct *parent_ca(struct cpuacct *ca)
 {
-        if (!ca->css.cgroup->parent)
-                return NULL;
-        return cgroup_ca(ca->css.cgroup->parent);
+        return css_ca(css_parent(&ca->css));
 }
 
 static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
 };
 
 /* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
+static struct cgroup_subsys_state *
+cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
 {
         struct cpuacct *ca;
 
-        if (!cgrp->parent)
+        if (!parent_css)
                 return &root_cpuacct.css;
 
         ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
 }
 
 /* destroy an existing cpu accounting group */
-static void cpuacct_css_free(struct cgroup *cgrp)
+static void cpuacct_css_free(struct cgroup_subsys_state *css)
 {
-        struct cpuacct *ca = cgroup_ca(cgrp);
+        struct cpuacct *ca = css_ca(css);
 
         free_percpu(ca->cpustat);
         free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 }
 
 /* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
-        struct cpuacct *ca = cgroup_ca(cgrp);
+        struct cpuacct *ca = css_ca(css);
         u64 totalcpuusage = 0;
         int i;
 
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
         return totalcpuusage;
 }
 
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
-                          u64 reset)
+static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
+                          u64 reset)
 {
-        struct cpuacct *ca = cgroup_ca(cgrp);
+        struct cpuacct *ca = css_ca(css);
         int err = 0;
         int i;
 
@@ -172,10 +163,10 @@ out:
         return err;
 }
 
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
-                                   struct seq_file *m)
+static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
+                                   struct cftype *cft, struct seq_file *m)
 {
-        struct cpuacct *ca = cgroup_ca(cgroup);
+        struct cpuacct *ca = css_ca(css);
         u64 percpu;
         int i;
 
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
         [CPUACCT_STAT_SYSTEM] = "system",
 };
 
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
-                              struct cgroup_map_cb *cb)
+static int cpuacct_stats_show(struct cgroup_subsys_state *css,
+                              struct cftype *cft, struct cgroup_map_cb *cb)
 {
-        struct cpuacct *ca = cgroup_ca(cgrp);
+        struct cpuacct *ca = css_ca(css);
         int cpu;
         s64 val = 0;
 
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
         while (ca != &root_cpuacct) {
                 kcpustat = this_cpu_ptr(ca->cpustat);
                 kcpustat->cpustat[index] += val;
-                ca = __parent_ca(ca);
+                ca = parent_ca(ca);
         }
         rcu_read_unlock();
 }
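cpuacct charges an accounted value to the task's group and every ancestor, and with css_parent() returning NULL at the root css, css_ca()/parent_ca() compose cleanly enough that the separate __parent_ca() helper can go. Below is a simplified userspace model of the cpuacct_account_field() walk; the names echo the patch, but the data layout and numbers are invented for illustration.

    #include <stdio.h>

    struct cpuacct {
            const char *name;
            unsigned long long user_time;   /* stands in for cpustat[CPUTIME_USER] */
            struct cpuacct *parent;         /* NULL at the root, like css_parent() */
    };

    static struct cpuacct root_cpuacct = { "root", 0, NULL };

    static struct cpuacct *parent_ca(struct cpuacct *ca)
    {
            return ca->parent;
    }

    /* charge 'val' to every group from 'ca' up to, but not including, the root,
     * mirroring the loop in cpuacct_account_field() */
    static void account_field(struct cpuacct *ca, unsigned long long val)
    {
            while (ca != &root_cpuacct) {
                    ca->user_time += val;
                    ca = parent_ca(ca);
            }
    }

    int main(void)
    {
            struct cpuacct a  = { "a",   0, &root_cpuacct };
            struct cpuacct ab = { "a/b", 0, &a };

            account_field(&ab, 1000);
            printf("a/b=%llu a=%llu root=%llu\n",
                   ab.user_time, a.user_time, root_cpuacct.user_time);
            /* a/b=1000 a=1000 root=0 */
            return 0;
    }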
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 68f1609ca149..8977a249816f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3018,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
         return 0;
 }
 
+static void record_wakee(struct task_struct *p)
+{
+        /*
+         * Rough decay (wiping) for cost saving, don't worry
+         * about the boundary, really active task won't care
+         * about the loss.
+         */
+        if (jiffies > current->wakee_flip_decay_ts + HZ) {
+                current->wakee_flips = 0;
+                current->wakee_flip_decay_ts = jiffies;
+        }
+
+        if (current->last_wakee != p) {
+                current->last_wakee = p;
+                current->wakee_flips++;
+        }
+}
 
 static void task_waking_fair(struct task_struct *p)
 {
@@ -3038,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p)
 #endif
 
         se->vruntime -= min_vruntime;
+        record_wakee(p);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3156,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 #endif
 
+static int wake_wide(struct task_struct *p)
+{
+        int factor = this_cpu_read(sd_llc_size);
+
+        /*
+         * Yeah, it's the switching-frequency, could means many wakee or
+         * rapidly switch, use factor here will just help to automatically
+         * adjust the loose-degree, so bigger node will lead to more pull.
+         */
+        if (p->wakee_flips > factor) {
+                /*
+                 * wakee is somewhat hot, it needs certain amount of cpu
+                 * resource, so if waker is far more hot, prefer to leave
+                 * it alone.
+                 */
+                if (current->wakee_flips > (factor * p->wakee_flips))
+                        return 1;
+        }
+
+        return 0;
+}
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
         s64 this_load, load;
@@ -3165,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
         unsigned long weight;
         int balanced;
 
+        /*
+         * If we wake multiple tasks be careful to not bounce
+         * ourselves around too much.
+         */
+        if (wake_wide(p))
+                return 0;
+
         idx = sd->wake_idx;
         this_cpu = smp_processor_id();
         prev_cpu = task_cpu(p);
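record_wakee() counts how often the current task switches between different wakees (the count is wiped roughly once per second), and wake_wide() compares those flip counts against the size of the shared-cache domain (the new per-cpu sd_llc_size) so that a waker serving many distinct wakees is not pulled around by affine wakeups. Below is a small standalone sketch of that comparison; the task structure and the numbers are made up, and only the threshold logic mirrors wake_wide().

    #include <stdio.h>

    struct task {
            unsigned int wakee_flips;   /* recent switches between different wakees */
    };

    /* factor plays the role of this_cpu_read(sd_llc_size) */
    static int wake_wide(const struct task *waker, const struct task *wakee,
                         unsigned int factor)
    {
            if (wakee->wakee_flips > factor &&
                waker->wakee_flips > factor * wakee->wakee_flips)
                    return 1;   /* "wide" wakeup: skip the affine pull */
            return 0;
    }

    int main(void)
    {
            unsigned int llc_size = 4;                    /* 4 CPUs share the LLC */

            /* a 1:1 pair: the wakee rarely flips, stay with affine wakeups */
            struct task waker_a = { .wakee_flips = 3 };
            struct task wakee_a = { .wakee_flips = 2 };
            printf("pair wide: %d\n", wake_wide(&waker_a, &wakee_a, llc_size)); /* 0 */

            /* a hub waking many workers: 5 > 4 and 400 > 4 * 5, leave the wakee alone */
            struct task hub    = { .wakee_flips = 400 };
            struct task worker = { .wakee_flips = 5 };
            printf("hub wide:  %d\n", wake_wide(&hub, &worker, llc_size));      /* 1 */
            return 0;
    }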
@@ -4172,47 +4219,48 @@ static void update_blocked_averages(int cpu)
 }
 
 /*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static int tg_load_down(struct task_group *tg, void *data)
-{
-        unsigned long load;
-        long cpu = (long)data;
-
-        if (!tg->parent) {
-                load = cpu_rq(cpu)->avg.load_avg_contrib;
-        } else {
-                load = tg->parent->cfs_rq[cpu]->h_load;
-                load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
-                                tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
-        }
-
-        tg->cfs_rq[cpu]->h_load = load;
-
-        return 0;
-}
-
-static void update_h_load(long cpu)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 {
-        struct rq *rq = cpu_rq(cpu);
+        struct rq *rq = rq_of(cfs_rq);
+        struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
         unsigned long now = jiffies;
+        unsigned long load;
 
-        if (rq->h_load_throttle == now)
+        if (cfs_rq->last_h_load_update == now)
                 return;
 
-        rq->h_load_throttle = now;
+        cfs_rq->h_load_next = NULL;
+        for_each_sched_entity(se) {
+                cfs_rq = cfs_rq_of(se);
+                cfs_rq->h_load_next = se;
+                if (cfs_rq->last_h_load_update == now)
+                        break;
+        }
 
-        rcu_read_lock();
-        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-        rcu_read_unlock();
+        if (!se) {
+                cfs_rq->h_load = rq->avg.load_avg_contrib;
+                cfs_rq->last_h_load_update = now;
+        }
+
+        while ((se = cfs_rq->h_load_next) != NULL) {
+                load = cfs_rq->h_load;
+                load = div64_ul(load * se->avg.load_avg_contrib,
+                                cfs_rq->runnable_load_avg + 1);
+                cfs_rq = group_cfs_rq(se);
+                cfs_rq->h_load = load;
+                cfs_rq->last_h_load_update = now;
+        }
 }
 
 static unsigned long task_h_load(struct task_struct *p)
 {
         struct cfs_rq *cfs_rq = task_cfs_rq(p);
 
+        update_cfs_rq_h_load(cfs_rq);
         return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
                         cfs_rq->runnable_load_avg + 1);
 }
@@ -4221,10 +4269,6 @@ static inline void update_blocked_averages(int cpu)
 {
 }
 
-static inline void update_h_load(long cpu)
-{
-}
-
 static unsigned long task_h_load(struct task_struct *p)
 {
         return p->se.avg.load_avg_contrib;
@@ -5114,7 +5158,6 @@ redo:
                 env.src_rq      = busiest;
                 env.loop_max    = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
-                update_h_load(env.src_cpu);
 more_balance:
                 local_irq_save(flags);
                 double_rq_lock(env.dst_rq, busiest);
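update_cfs_rq_h_load() replaces the per-balance walk_tg_tree() pass: task_h_load() now refreshes only the ancestor chain of the cfs_rq it is asked about, walking up to mark the path (h_load_next) and then propagating back down, where each level scales the parent's h_load by this entity's share of the parent's runnable load, i.e. h_load = parent_h_load * load_avg_contrib / (runnable_load_avg + 1). A worked numeric example of that scaling follows; all values are invented.

    #include <stdio.h>

    /* one propagation step, mirroring the div64_ul() expression in the patch */
    static unsigned long scale(unsigned long parent_h_load,
                               unsigned long se_contrib,
                               unsigned long parent_runnable)
    {
            return parent_h_load * se_contrib / (parent_runnable + 1);
    }

    int main(void)
    {
            unsigned long rq_load = 2048;   /* rq->avg.load_avg_contrib at the top */

            /* group A's entity contributes 512 of the root's 2047(+1) runnable load */
            unsigned long a = scale(rq_load, 512, 2047);

            /* group A/B's entity contributes 256 of A's 1023(+1) runnable load */
            unsigned long ab = scale(a, 256, 1023);

            /* a task contributing 128 of A/B's 511(+1) runnable load */
            unsigned long task = scale(ab, 128, 511);

            printf("A=%lu A/B=%lu task=%lu\n", a, ab, task);  /* A=512 A/B=128 task=32 */
            return 0;
    }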
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..b3c5653e1dca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -285,7 +285,6 @@ struct cfs_rq {
         /* Required to track per-cpu representation of a task_group */
         u32 tg_runnable_contrib;
         unsigned long tg_load_contrib;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
 
         /*
          * h_load = weight * f(tg)
@@ -294,6 +293,9 @@ struct cfs_rq {
          * this group.
          */
         unsigned long h_load;
+        u64 last_h_load_update;
+        struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -429,9 +431,6 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
         /* list of leaf cfs_rq on this cpu: */
         struct list_head leaf_cfs_rq_list;
-#ifdef CONFIG_SMP
-        unsigned long h_load_throttle;
-#endif /* CONFIG_SMP */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -595,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 }
 
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 
 struct sched_group_power {
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg);
 /*
  * Return the group to which this tasks belongs.
  *
- * We cannot use task_subsys_state() and friends because the cgroup
- * subsystem changes that value before the cgroup_subsys::attach() method
- * is called, therefore we cannot pin it and might observe the wrong value.
+ * We cannot use task_css() and friends because the cgroup subsystem
+ * changes that value before the cgroup_subsys::attach() method is called,
+ * therefore we cannot pin it and might observe the wrong value.
  *
  * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
  * core changes this before calling sched_move_task().