diff options
Diffstat (limited to 'include/linux/sched.h')
-rw-r--r-- | include/linux/sched.h | 253 |
1 files changed, 223 insertions, 30 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f72548732f1..25f54c79f757 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3,6 +3,8 @@ #include <uapi/linux/sched.h> +#include <linux/sched/prio.h> + struct sched_param { int sched_priority; @@ -16,6 +18,7 @@ struct sched_param { #include <linux/types.h> #include <linux/timex.h> #include <linux/jiffies.h> +#include <linux/plist.h> #include <linux/rbtree.h> #include <linux/thread_info.h> #include <linux/cpumask.h> @@ -26,7 +29,7 @@ struct sched_param { #include <asm/page.h> #include <asm/ptrace.h> -#include <asm/cputime.h> +#include <linux/cputime.h> #include <linux/smp.h> #include <linux/sem.h> @@ -56,6 +59,70 @@ struct sched_param { #include <asm/processor.h> +#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + +/* + * Extended scheduling parameters data structure. + * + * This is needed because the original struct sched_param can not be + * altered without introducing ABI issues with legacy applications + * (e.g., in sched_getparam()). + * + * However, the possibility of specifying more than just a priority for + * the tasks may be useful for a wide variety of application fields, e.g., + * multimedia, streaming, automation and control, and many others. + * + * This variant (sched_attr) is meant at describing a so-called + * sporadic time-constrained task. In such model a task is specified by: + * - the activation period or minimum instance inter-arrival time; + * - the maximum (or average, depending on the actual scheduling + * discipline) computation time of all instances, a.k.a. runtime; + * - the deadline (relative to the actual activation time) of each + * instance. + * Very briefly, a periodic (sporadic) task asks for the execution of + * some specific computation --which is typically called an instance-- + * (at most) every period. Moreover, each instance typically lasts no more + * than the runtime and must be completed by time instant t equal to + * the instance activation time + the deadline. + * + * This is reflected by the actual fields of the sched_attr structure: + * + * @size size of the structure, for fwd/bwd compat. + * + * @sched_policy task's scheduling policy + * @sched_flags for customizing the scheduler behaviour + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) + * @sched_priority task's static priority (SCHED_FIFO/RR) + * @sched_deadline representative of the task's deadline + * @sched_runtime representative of the task's runtime + * @sched_period representative of the task's period + * + * Given this task model, there are a multiplicity of scheduling algorithms + * and policies, that can be used to ensure all the tasks will make their + * timing constraints. + * + * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the + * only user of this new interface. More information about the algorithm + * available in the scheduling class file or in Documentation/. + */ +struct sched_attr { + u32 size; + + u32 sched_policy; + u64 sched_flags; + + /* SCHED_NORMAL, SCHED_BATCH */ + s32 sched_nice; + + /* SCHED_FIFO, SCHED_RR */ + u32 sched_priority; + + /* SCHED_DEADLINE */ + u64 sched_runtime; + u64 sched_deadline; + u64 sched_period; +}; + struct exec_domain; struct futex_pi_state; struct robust_list_head; @@ -63,6 +130,11 @@ struct bio_list; struct fs_struct; struct perf_event_context; struct blk_plug; +struct filename; + +#define VMACACHE_BITS 2 +#define VMACACHE_SIZE (1U << VMACACHE_BITS) +#define VMACACHE_MASK (VMACACHE_SIZE - 1) /* * List of flags we want to share for kernel threads, @@ -138,8 +210,9 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); #define __TASK_STOPPED 4 #define __TASK_TRACED 8 /* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 +#define EXIT_DEAD 16 +#define EXIT_ZOMBIE 32 +#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* in tsk->state again */ #define TASK_DEAD 64 #define TASK_WAKEKILL 128 @@ -164,11 +237,10 @@ extern char ___assert_task_state[1 - 2*!!( /* get_task_state() */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED) + __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) -#define task_is_dead(task) ((task)->exit_state != 0) #define task_is_stopped_or_traced(task) \ ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ @@ -227,10 +299,14 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); -extern int get_nohz_timer_target(void); +extern int get_nohz_timer_target(int pinned); #else static inline void nohz_balance_enter_idle(int cpu) { } static inline void set_cpu_sd_state_idle(void) { } +static inline int get_nohz_timer_target(int pinned) +{ + return smp_processor_id(); +} #endif /* @@ -327,22 +403,33 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif - -extern void set_dumpable(struct mm_struct *mm, int value); -extern int get_dumpable(struct mm_struct *mm); - #define SUID_DUMP_DISABLE 0 /* No setuid dumping */ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ /* mm flags */ -/* dumpable bits */ -#define MMF_DUMPABLE 0 /* core dump is permitted */ -#define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ +/* for SUID_DUMP_* above */ #define MMF_DUMPABLE_BITS 2 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +extern void set_dumpable(struct mm_struct *mm, int value); +/* + * This returns the actual value of the suid_dumpable flag. For things + * that are using this for checking for privilege transitions, it must + * test against SUID_DUMP_USER rather than treating it as a boolean + * value. + */ +static inline int __get_dumpable(unsigned long mm_flags) +{ + return mm_flags & MMF_DUMPABLE_MASK; +} + +static inline int get_dumpable(struct mm_struct *mm) +{ + return __get_dumpable(mm->flags); +} + /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -485,6 +572,7 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; + struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ @@ -1000,6 +1088,7 @@ struct sched_entity { #endif #ifdef CONFIG_FAIR_GROUP_SCHED + int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; @@ -1029,6 +1118,51 @@ struct sched_rt_entity { #endif }; +struct sched_dl_entity { + struct rb_node rb_node; + + /* + * Original scheduling parameters. Copied here from sched_attr + * during sched_setscheduler2(), they will remain the same until + * the next sched_setscheduler2(). + */ + u64 dl_runtime; /* maximum runtime for each instance */ + u64 dl_deadline; /* relative deadline of each instance */ + u64 dl_period; /* separation of two instances (period) */ + u64 dl_bw; /* dl_runtime / dl_deadline */ + + /* + * Actual scheduling parameters. Initialized with the values above, + * they are continously updated during task execution. Note that + * the remaining runtime could be < 0 in case we are in overrun. + */ + s64 runtime; /* remaining runtime for this instance */ + u64 deadline; /* absolute deadline for this instance */ + unsigned int flags; /* specifying the scheduler behaviour */ + + /* + * Some bool flags: + * + * @dl_throttled tells if we exhausted the runtime. If so, the + * task has to wait for a replenishment to be performed at the + * next firing of dl_timer. + * + * @dl_new tells if a new instance arrived. If so we must + * start executing it with full runtime and reset its absolute + * deadline; + * + * @dl_boosted tells if we are boosted due to DI. If so we are + * outside bandwidth enforcement mechanism (but only until we + * exit the critical section). + */ + int dl_throttled, dl_new, dl_boosted; + + /* + * Bandwidth enforcement timer. Each -deadline task has its + * own bandwidth to be enforced, thus we need one timer per task. + */ + struct hrtimer dl_timer; +}; struct rcu_node; @@ -1065,6 +1199,7 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif + struct sched_dl_entity dl; #ifdef CONFIG_PREEMPT_NOTIFIERS /* list of struct preempt_notifier: */ @@ -1098,12 +1233,16 @@ struct task_struct { struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; + struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm, *active_mm; #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif + /* per-thread vma caching */ + u32 vmacache_seqnum; + struct vm_area_struct *vmacache[VMACACHE_SIZE]; #if defined(SPLIT_RSS_COUNTING) struct task_rss_stat rss_stat; #endif @@ -1116,7 +1255,6 @@ struct task_struct { /* Used for emulating ABI behavior of previous Linux versions */ unsigned int personality; - unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ unsigned in_iowait:1; @@ -1160,6 +1298,7 @@ struct task_struct { /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; + struct list_head thread_node; struct completion *vfork_done; /* for vfork() */ int __user *set_child_tid; /* CLONE_CHILD_SETTID */ @@ -1249,9 +1388,12 @@ struct task_struct { #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ - struct plist_head pi_waiters; + struct rb_root pi_waiters; + struct rb_node *pi_waiters_leftmost; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; + /* Top pi_waiters task */ + struct task_struct *pi_top_task; #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -1333,6 +1475,9 @@ struct task_struct { struct mutex perf_event_mutex; struct list_head perf_event_list; #endif +#ifdef CONFIG_DEBUG_PREEMPT + unsigned long preempt_disable_ip; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; @@ -1343,9 +1488,10 @@ struct task_struct { unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; - int numa_migrate_deferred; unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ + u64 last_task_numa_placement; + u64 last_sum_exec_runtime; struct callback_head numa_work; struct list_head numa_entry; @@ -1356,15 +1502,22 @@ struct task_struct { * Scheduling placement decisions are made based on the these counts. * The values remain static for the duration of a PTE scan */ - unsigned long *numa_faults; + unsigned long *numa_faults_memory; unsigned long total_numa_faults; /* * numa_faults_buffer records faults per node during the current - * scan window. When the scan completes, the counts in numa_faults - * decay and these values are copied. + * scan window. When the scan completes, the counts in + * numa_faults_memory decay and these values are copied. */ - unsigned long *numa_faults_buffer; + unsigned long *numa_faults_buffer_memory; + + /* + * Track the nodes the process was running on when a NUMA hinting + * fault was incurred. + */ + unsigned long *numa_faults_cpu; + unsigned long *numa_faults_buffer_cpu; /* * numa_faults_locality tracks if faults recorded during the last @@ -1469,8 +1622,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p); - -extern unsigned int sysctl_numa_balancing_migrate_deferred; +extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, + int src_nid, int dst_cpu); #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) @@ -1486,6 +1639,11 @@ static inline void set_numabalancing_state(bool enabled) static inline void task_numa_free(struct task_struct *p) { } +static inline bool should_numa_migrate_memory(struct task_struct *p, + struct page *page, int src_nid, int dst_cpu) +{ + return true; +} #endif static inline struct pid *task_pid(struct task_struct *task) @@ -1712,7 +1870,6 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* this thread called freeze_processes and should not be frozen */ @@ -1898,7 +2055,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns) * but then during bootup it turns out that sched_clock() * is reliable after all: */ -extern int sched_clock_stable; +extern int sched_clock_stable(void); +extern void set_sched_clock_stable(void); +extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); @@ -1969,7 +2128,16 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { } extern bool yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); -extern int task_nice(const struct task_struct *p); +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + * + * Return: The nice value [ -20 ... 0 ... 19 ]. + */ +static inline int task_nice(const struct task_struct *p) +{ + return PRIO_TO_NICE((p)->static_prio); +} extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); @@ -1977,6 +2145,8 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern int sched_setattr(struct task_struct *, + const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? @@ -2056,7 +2226,7 @@ extern void wake_up_new_task(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); @@ -2182,8 +2352,6 @@ extern struct mm_struct *get_task_mm(struct task_struct *task); extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); -/* Allocate a new mm structure and copy contents from tsk->mm */ -extern struct mm_struct *dup_mm(struct task_struct *tsk); extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); @@ -2201,14 +2369,14 @@ extern void do_group_exit(int); extern int allow_signal(int); extern int disallow_signal(int); -extern int do_execve(const char *, +extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -extern void set_task_comm(struct task_struct *tsk, char *from); +extern void set_task_comm(struct task_struct *tsk, const char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP @@ -2241,6 +2409,16 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define __for_each_thread(signal, t) \ + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + +#define for_each_thread(p, t) \ + __for_each_thread((p)->signal, t) + +/* Careful: this is a double loop, 'break' won't work as expected. */ +#define for_each_process_thread(p, t) \ + for_each_process(p) for_each_thread(p, t) + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; @@ -2645,6 +2823,21 @@ static inline bool __must_check current_clr_polling_and_test(void) } #endif +static inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb(); /* paired with resched_task() */ + + preempt_fold_need_resched(); +} + static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); |