aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c120
-rw-r--r--kernel/sched/cpuacct.c51
-rw-r--r--kernel/sched/fair.c105
-rw-r--r--kernel/sched/sched.h14
4 files changed, 165 insertions, 125 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 05c39f030314..57c186d9477e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5133,18 +5133,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
* two cpus are in the same cache domain, see cpus_share_cache().
*/
DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
int id = cpu;
+ int size = 1;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
id = cpumask_first(sched_domain_span(sd));
+ size = cpumask_weight(sched_domain_span(sd));
+ }
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
}
@@ -6815,7 +6820,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
- tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
lockdep_is_held(&tsk->sighand->siglock)),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
@@ -7137,23 +7142,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
#ifdef CONFIG_CGROUP_SCHED
-/* return corresponding task_group object of a cgroup */
-static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
- struct task_group, css);
+ return css ? container_of(css, struct task_group, css) : NULL;
}
-static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct task_group *tg, *parent;
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
- if (!cgrp->parent) {
+ if (!parent) {
/* This is early initialization for the top cgroup */
return &root_task_group.css;
}
- parent = cgroup_tg(cgrp->parent);
tg = sched_create_group(parent);
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
@@ -7161,41 +7165,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
return &tg->css;
}
-static int cpu_cgroup_css_online(struct cgroup *cgrp)
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
- struct task_group *parent;
-
- if (!cgrp->parent)
- return 0;
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css_parent(css));
- parent = cgroup_tg(cgrp->parent);
- sched_online_group(tg, parent);
+ if (parent)
+ sched_online_group(tg, parent);
return 0;
}
-static void cpu_cgroup_css_free(struct cgroup *cgrp)
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
sched_destroy_group(tg);
}
-static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
sched_offline_group(tg);
}
-static int cpu_cgroup_can_attach(struct cgroup *cgrp,
+static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset) {
+ cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
- if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+ if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
#else
/* We don't support RT-tasks being in separate groups */
@@ -7206,18 +7207,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
return 0;
}
-static void cpu_cgroup_attach(struct cgroup *cgrp,
+static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
- cgroup_taskset_for_each(task, cgrp, tset)
+ cgroup_taskset_for_each(task, css, tset)
sched_move_task(task);
}
-static void
-cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
- struct task_struct *task)
+static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
@@ -7231,15 +7232,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- u64 shareval)
+static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 shareval)
{
- return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
+ return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
-static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
return (u64) scale_load_down(tg->shares);
}
@@ -7361,26 +7363,28 @@ long tg_get_cfs_period(struct task_group *tg)
return cfs_period_us;
}
-static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return tg_get_cfs_quota(cgroup_tg(cgrp));
+ return tg_get_cfs_quota(css_tg(css));
}
-static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
- s64 cfs_quota_us)
+static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 cfs_quota_us)
{
- return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+ return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
}
-static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return tg_get_cfs_period(cgroup_tg(cgrp));
+ return tg_get_cfs_period(css_tg(css));
}
-static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
- u64 cfs_period_us)
+static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 cfs_period_us)
{
- return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+ return tg_set_cfs_period(css_tg(css), cfs_period_us);
}
struct cfs_schedulable_data {
@@ -7461,10 +7465,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
-static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
struct cgroup_map_cb *cb)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *tg = css_tg(css);
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7477,26 +7481,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
-static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
- s64 val)
+static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 val)
{
- return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+ return sched_group_set_rt_runtime(css_tg(css), val);
}
-static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return sched_group_rt_runtime(cgroup_tg(cgrp));
+ return sched_group_rt_runtime(css_tg(css));
}
-static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
- u64 rt_period_us)
+static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 rt_period_us)
{
- return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+ return sched_group_set_rt_period(css_tg(css), rt_period_us);
}
-static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- return sched_group_rt_period(cgroup_tg(cgrp));
+ return sched_group_rt_period(css_tg(css));
}
#endif /* CONFIG_RT_GROUP_SCHED */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2cd95eb..f64722ff0299 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
struct kernel_cpustat __percpu *cpustat;
};
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
{
- return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
- struct cpuacct, css);
+ return css ? container_of(css, struct cpuacct, css) : NULL;
}
/* return cpu accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
- return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
- struct cpuacct, css);
-}
-
-static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
-{
- return cgroup_ca(ca->css.cgroup->parent);
+ return css_ca(task_css(tsk, cpuacct_subsys_id));
}
static inline struct cpuacct *parent_ca(struct cpuacct *ca)
{
- if (!ca->css.cgroup->parent)
- return NULL;
- return cgroup_ca(ca->css.cgroup->parent);
+ return css_ca(css_parent(&ca->css));
}
static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
};
/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
+static struct cgroup_subsys_state *
+cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cpuacct *ca;
- if (!cgrp->parent)
+ if (!parent_css)
return &root_cpuacct.css;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
}
/* destroy an existing cpu accounting group */
-static void cpuacct_css_free(struct cgroup *cgrp)
+static void cpuacct_css_free(struct cgroup_subsys_state *css)
{
- struct cpuacct *ca = cgroup_ca(cgrp);
+ struct cpuacct *ca = css_ca(css);
free_percpu(ca->cpustat);
free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
}
/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- struct cpuacct *ca = cgroup_ca(cgrp);
+ struct cpuacct *ca = css_ca(css);
u64 totalcpuusage = 0;
int i;
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
return totalcpuusage;
}
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
- u64 reset)
+static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 reset)
{
- struct cpuacct *ca = cgroup_ca(cgrp);
+ struct cpuacct *ca = css_ca(css);
int err = 0;
int i;
@@ -172,10 +163,10 @@ out:
return err;
}
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
+static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct seq_file *m)
{
- struct cpuacct *ca = cgroup_ca(cgroup);
+ struct cpuacct *ca = css_ca(css);
u64 percpu;
int i;
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
+static int cpuacct_stats_show(struct cgroup_subsys_state *css,
+ struct cftype *cft, struct cgroup_map_cb *cb)
{
- struct cpuacct *ca = cgroup_ca(cgrp);
+ struct cpuacct *ca = css_ca(css);
int cpu;
s64 val = 0;
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
while (ca != &root_cpuacct) {
kcpustat = this_cpu_ptr(ca->cpustat);
kcpustat->cpustat[index] += val;
- ca = __parent_ca(ca);
+ ca = parent_ca(ca);
}
rcu_read_unlock();
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 68f1609ca149..8977a249816f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3018,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return 0;
}
+static void record_wakee(struct task_struct *p)
+{
+ /*
+ * Rough decay (wiping) for cost saving, don't worry
+ * about the boundary, really active task won't care
+ * about the loss.
+ */
+ if (jiffies > current->wakee_flip_decay_ts + HZ) {
+ current->wakee_flips = 0;
+ current->wakee_flip_decay_ts = jiffies;
+ }
+
+ if (current->last_wakee != p) {
+ current->last_wakee = p;
+ current->wakee_flips++;
+ }
+}
static void task_waking_fair(struct task_struct *p)
{
@@ -3038,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p)
#endif
se->vruntime -= min_vruntime;
+ record_wakee(p);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3156,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
#endif
+static int wake_wide(struct task_struct *p)
+{
+ int factor = this_cpu_read(sd_llc_size);
+
+ /*
+ * Yeah, it's the switching-frequency, could means many wakee or
+ * rapidly switch, use factor here will just help to automatically
+ * adjust the loose-degree, so bigger node will lead to more pull.
+ */
+ if (p->wakee_flips > factor) {
+ /*
+ * wakee is somewhat hot, it needs certain amount of cpu
+ * resource, so if waker is far more hot, prefer to leave
+ * it alone.
+ */
+ if (current->wakee_flips > (factor * p->wakee_flips))
+ return 1;
+ }
+
+ return 0;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
s64 this_load, load;
@@ -3165,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long weight;
int balanced;
+ /*
+ * If we wake multiple tasks be careful to not bounce
+ * ourselves around too much.
+ */
+ if (wake_wide(p))
+ return 0;
+
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -4172,47 +4219,48 @@ static void update_blocked_averages(int cpu)
}
/*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*/
-static int tg_load_down(struct task_group *tg, void *data)
-{
- unsigned long load;
- long cpu = (long)data;
-
- if (!tg->parent) {
- load = cpu_rq(cpu)->avg.load_avg_contrib;
- } else {
- load = tg->parent->cfs_rq[cpu]->h_load;
- load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
- tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
- }
-
- tg->cfs_rq[cpu]->h_load = load;
-
- return 0;
-}
-
-static void update_h_load(long cpu)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
- struct rq *rq = cpu_rq(cpu);
+ struct rq *rq = rq_of(cfs_rq);
+ struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
unsigned long now = jiffies;
+ unsigned long load;
- if (rq->h_load_throttle == now)
+ if (cfs_rq->last_h_load_update == now)
return;
- rq->h_load_throttle = now;
+ cfs_rq->h_load_next = NULL;
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_load_next = se;
+ if (cfs_rq->last_h_load_update == now)
+ break;
+ }
- rcu_read_lock();
- walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
- rcu_read_unlock();
+ if (!se) {
+ cfs_rq->h_load = rq->avg.load_avg_contrib;
+ cfs_rq->last_h_load_update = now;
+ }
+
+ while ((se = cfs_rq->h_load_next) != NULL) {
+ load = cfs_rq->h_load;
+ load = div64_ul(load * se->avg.load_avg_contrib,
+ cfs_rq->runnable_load_avg + 1);
+ cfs_rq = group_cfs_rq(se);
+ cfs_rq->h_load = load;
+ cfs_rq->last_h_load_update = now;
+ }
}
static unsigned long task_h_load(struct task_struct *p)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
+ update_cfs_rq_h_load(cfs_rq);
return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
cfs_rq->runnable_load_avg + 1);
}
@@ -4221,10 +4269,6 @@ static inline void update_blocked_averages(int cpu)
{
}
-static inline void update_h_load(long cpu)
-{
-}
-
static unsigned long task_h_load(struct task_struct *p)
{
return p->se.avg.load_avg_contrib;
@@ -5114,7 +5158,6 @@ redo:
env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
- update_h_load(env.src_cpu);
more_balance:
local_irq_save(flags);
double_rq_lock(env.dst_rq, busiest);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..b3c5653e1dca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -285,7 +285,6 @@ struct cfs_rq {
/* Required to track per-cpu representation of a task_group */
u32 tg_runnable_contrib;
unsigned long tg_load_contrib;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
* h_load = weight * f(tg)
@@ -294,6 +293,9 @@ struct cfs_rq {
* this group.
*/
unsigned long h_load;
+ u64 last_h_load_update;
+ struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -429,9 +431,6 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-#ifdef CONFIG_SMP
- unsigned long h_load_throttle;
-#endif /* CONFIG_SMP */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -595,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
}
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
struct sched_group_power {
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg);
/*
* Return the group to which this tasks belongs.
*
- * We cannot use task_subsys_state() and friends because the cgroup
- * subsystem changes that value before the cgroup_subsys::attach() method
- * is called, therefore we cannot pin it and might observe the wrong value.
+ * We cannot use task_css() and friends because the cgroup subsystem
+ * changes that value before the cgroup_subsys::attach() method is called,
+ * therefore we cannot pin it and might observe the wrong value.
*
* The same is true for autogroup's p->signal->autogroup->tg, the autogroup
* core changes this before calling sched_move_task().