diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup_freezer.c | 5 | ||||
-rw-r--r-- | kernel/fork.c | 3 | ||||
-rw-r--r-- | kernel/hw_breakpoint.c | 196 | ||||
-rw-r--r-- | kernel/kprobes.c | 132 | ||||
-rw-r--r-- | kernel/lockdep.c | 4 | ||||
-rw-r--r-- | kernel/perf_event.c | 379 | ||||
-rw-r--r-- | kernel/ptrace.c | 1 | ||||
-rw-r--r-- | kernel/sched.c | 61 | ||||
-rw-r--r-- | kernel/trace/Kconfig | 11 | ||||
-rw-r--r-- | kernel/trace/Makefile | 1 | ||||
-rw-r--r-- | kernel/trace/trace.h | 20 | ||||
-rw-r--r-- | kernel/trace/trace_entries.h | 12 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_hw_branches.c | 312 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 535 | ||||
-rw-r--r-- | kernel/trace/trace_ksym.c | 26 | ||||
-rw-r--r-- | kernel/trace/trace_selftest.c | 57 | ||||
-rw-r--r-- | kernel/workqueue.c | 2 |
18 files changed, 849 insertions, 910 deletions
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index da5e1397553..e5c0244962b 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -205,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) * No lock is needed, since the task isn't on tasklist yet, * so it can't be moved to another cgroup, which means the * freezer won't be removed and will be valid during this - * function call. + * function call. Nevertheless, apply RCU read-side critical + * section to suppress RCU lockdep false positives. */ + rcu_read_lock(); freezer = task_freezer(task); + rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the diff --git a/kernel/fork.c b/kernel/fork.c index 44b0791b0a2..5d3592deaf7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1111,9 +1111,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif - - p->bts = NULL; - p->stack_start = stack_start; /* Perform scheduler related setup. Assign this task to a CPU. */ diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 03808ed342a..7a56b22e060 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -40,23 +40,29 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/cpu.h> #include <linux/smp.h> #include <linux/hw_breakpoint.h> + /* * Constraints data */ /* Number of pinned cpu breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); +static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); /* Number of non-pinned cpu/task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); + +static int nr_slots[TYPE_MAX]; + +static int constraints_initialized; /* Gather the number of total pinned and un-pinned bp in a cpuset */ struct bp_busy_slots { @@ -67,16 +73,29 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +__weak int hw_breakpoint_weight(struct perf_event *bp) +{ + return 1; +} + +static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +{ + if (bp->attr.bp_type & HW_BREAKPOINT_RW) + return TYPE_DATA; + + return TYPE_INST; +} + /* * Report the maximum number of pinned breakpoints a task * have in this cpu */ -static unsigned int max_task_bp_pinned(int cpu) +static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { int i; - unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); - for (i = HBP_NUM -1; i >= 0; i--) { + for (i = nr_slots[type] - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) return i + 1; } @@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu) return 0; } -static int task_bp_pinned(struct task_struct *tsk) +static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) { struct perf_event_context *ctx = tsk->perf_event_ctxp; struct list_head *list; @@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk) */ list_for_each_entry(bp, list, event_entry) { if (bp->attr.type == PERF_TYPE_BREAKPOINT) - count++; + if (find_slot_idx(bp) == type) + count += hw_breakpoint_weight(bp); } raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk) * a given cpu (cpu > -1) or in all of them (cpu = -1). */ static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, + enum bp_type_idx type) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - slots->pinned += max_task_bp_pinned(cpu); + slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk); - slots->flexible = per_cpu(nr_bp_flexible, cpu); + slots->pinned += task_bp_pinned(tsk, type); + slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; } @@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) for_each_online_cpu(cpu) { unsigned int nr; - nr = per_cpu(nr_cpu_bp_pinned, cpu); + nr = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) - nr += max_task_bp_pinned(cpu); + nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk); + nr += task_bp_pinned(tsk, type); if (nr > slots->pinned) slots->pinned = nr; - nr = per_cpu(nr_bp_flexible, cpu); + nr = per_cpu(nr_bp_flexible[type], cpu); if (nr > slots->flexible) slots->flexible = nr; @@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) } /* + * For now, continue to consider flexible as pinned, until we can + * ensure no flexible event can ever be scheduled before a pinned event + * in a same cpu. + */ +static void +fetch_this_slot(struct bp_busy_slots *slots, int weight) +{ + slots->pinned += weight; +} + +/* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, + enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; - int count = 0; + int old_count = 0; + int old_idx = 0; + int idx = 0; - count = task_bp_pinned(tsk); + old_count = task_bp_pinned(tsk, type); + old_idx = old_count - 1; + idx = old_idx + weight; - tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { - tsk_pinned[count]++; - if (count > 0) - tsk_pinned[count-1]--; + tsk_pinned[idx]++; + if (old_count > 0) + tsk_pinned[old_idx]--; } else { - tsk_pinned[count]--; - if (count > 0) - tsk_pinned[count-1]++; + tsk_pinned[idx]--; + if (old_count > 0) + tsk_pinned[old_idx]++; } } /* * Add/remove the given breakpoint in our constraint table */ -static void toggle_bp_slot(struct perf_event *bp, bool enable) +static void +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, + int weight) { int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; @@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) /* Pinned counter task profiling */ if (tsk) { if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable); + toggle_bp_task_slot(tsk, cpu, enable, type, weight); return; } for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable); + toggle_bp_task_slot(tsk, cpu, enable, type, weight); return; } /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned, bp->cpu)++; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; else - per_cpu(nr_cpu_bp_pinned, bp->cpu)--; + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; } /* @@ -246,14 +285,29 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; + enum bp_type_idx type; + int weight; - fetch_bp_busy_slots(&slots, bp); + /* We couldn't initialize breakpoint constraints on boot */ + if (!constraints_initialized) + return -ENOMEM; + + /* Basic checks */ + if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || + bp->attr.bp_type == HW_BREAKPOINT_INVALID) + return -EINVAL; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + + fetch_bp_busy_slots(&slots, bp, type); + fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) + if (slots.pinned + (!!slots.flexible) > nr_slots[type]) return -ENOSPC; - toggle_bp_slot(bp, true); + toggle_bp_slot(bp, true, type, weight); return 0; } @@ -273,7 +327,12 @@ int reserve_bp_slot(struct perf_event *bp) static void __release_bp_slot(struct perf_event *bp) { - toggle_bp_slot(bp, false); + enum bp_type_idx type; + int weight; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + toggle_bp_slot(bp, false, type, weight); } void release_bp_slot(struct perf_event *bp) @@ -308,6 +367,28 @@ int dbg_release_bp_slot(struct perf_event *bp) return 0; } +static int validate_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = arch_validate_hwbkpt_settings(bp); + if (ret) + return ret; + + if (arch_check_bp_in_kernelspace(bp)) { + if (bp->attr.exclude_kernel) + return -EINVAL; + /* + * Don't let unprivileged users set a breakpoint in the trap + * path to avoid trap recursion attacks. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return 0; +} + int register_perf_hw_breakpoint(struct perf_event *bp) { int ret; @@ -316,17 +397,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) if (ret) return ret; - /* - * Ptrace breakpoints can be temporary perf events only - * meant to reserve a slot. In this case, it is created disabled and - * we don't want to check the params right now (as we put a null addr) - * But perf tools create events as disabled and we want to check - * the params for them. - * This is a quick hack that will be removed soon, once we remove - * the tmp breakpoints from ptrace - */ - if (!bp->attr.disabled || !bp->overflow_handler) - ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + ret = validate_hw_breakpoint(bp); /* if arch_validate_hwbkpt_settings() fails then release bp slot */ if (ret) @@ -373,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att if (attr->disabled) goto end; - err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + err = validate_hw_breakpoint(bp); if (!err) perf_event_enable(bp); @@ -480,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { static int __init init_hw_breakpoint(void) { + unsigned int **task_bp_pinned; + int cpu, err_cpu; + int i; + + for (i = 0; i < TYPE_MAX; i++) + nr_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); + *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], + GFP_KERNEL); + if (!*task_bp_pinned) + goto err_alloc; + } + } + + constraints_initialized = 1; + return register_die_notifier(&hw_breakpoint_exceptions_nb); + + err_alloc: + for_each_possible_cpu(err_cpu) { + if (err_cpu == cpu) + break; + for (i = 0; i < TYPE_MAX; i++) + kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + } + + return -ENOMEM; } core_initcall(init_hw_breakpoint); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 0ed46f3e51e..282035f3ae9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +/* Disable one kprobe */ +int __kprobes disable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + /* If the probe is already disabled (or gone), just return */ + if (kprobe_disabled(kp)) + goto out; + + kp->flags |= KPROBE_FLAG_DISABLED; + if (p != kp) + /* When kp != p, p is always enabled. */ + try_to_disable_aggr_kprobe(p); + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + disarm_kprobe(p); +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(disable_kprobe); + +/* Enable one kprobe */ +int __kprobes enable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + if (kprobe_gone(kp)) { + /* This kprobe has gone, we couldn't enable it. */ + ret = -EINVAL; + goto out; + } + + if (p != kp) + kp->flags &= ~KPROBE_FLAG_DISABLED; + + if (!kprobes_all_disarmed && kprobe_disabled(p)) { + p->flags &= ~KPROBE_FLAG_DISABLED; + arm_kprobe(p); + } +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(enable_kprobe); + void __kprobes dump_kprobe(struct kprobe *kp) { printk(KERN_WARNING "Dumping kprobe:\n"); @@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; -/* Disable one kprobe */ -int __kprobes disable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - /* If the probe is already disabled (or gone), just return */ - if (kprobe_disabled(kp)) - goto out; - - kp->flags |= KPROBE_FLAG_DISABLED; - if (p != kp) - /* When kp != p, p is always enabled. */ - try_to_disable_aggr_kprobe(p); - - if (!kprobes_all_disarmed && kprobe_disabled(p)) - disarm_kprobe(p); -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(disable_kprobe); - -/* Enable one kprobe */ -int __kprobes enable_kprobe(struct kprobe *kp) -{ - int ret = 0; - struct kprobe *p; - - mutex_lock(&kprobe_mutex); - - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } - - if (kprobe_gone(kp)) { - /* This kprobe has gone, we couldn't enable it. */ - ret = -EINVAL; - goto out; - } - - if (p != kp) - kp->flags &= ~KPROBE_FLAG_DISABLED; - - if (!kprobes_all_disarmed && kprobe_disabled(p)) { - p->flags &= ~KPROBE_FLAG_DISABLED; - arm_kprobe(p); - } -out: - mutex_unlock(&kprobe_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(enable_kprobe); - static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2594e1ce41c..e9c759f06c1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3227,7 +3227,7 @@ void lock_release(struct lockdep_map *lock, int nested, raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - trace_lock_release(lock, nested, ip); + trace_lock_release(lock, ip); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); @@ -3380,7 +3380,7 @@ found_it: hlock->holdtime_stamp = now; } - trace_lock_acquired(lock, ip, waittime); + trace_lock_acquired(lock, ip); stats = get_lock_stats(hlock_class(hlock)); if (waittime) { diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2f3fbf84215..a4fa381db3c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -16,6 +16,7 @@ #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> +#include <linux/hash.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> @@ -82,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -int __weak -hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - return 0; -} - void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); @@ -262,6 +255,18 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + static struct list_head * ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) { @@ -315,8 +320,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *sibling, *tmp; - if (list_empty(&event->group_entry)) return; ctx->nr_events--; @@ -329,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader != event) event->group_leader->nr_siblings--; - update_event_times(event); + update_group_times(event); /* * If event was in error state, then keep it @@ -340,6 +343,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; +} + +static void +perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; /* * If this was a group event with sibling events then @@ -505,18 +514,6 @@ retry: } /* - * Update total_time_enabled and total_time_running for all events in a group. - */ -static void update_group_times(struct perf_event *leader) -{ - struct perf_event *event; - - update_event_times(leader); - list_for_each_entry(event, &leader->sibling_list, group_entry) - update_event_times(event); -} - -/* * Cross CPU call to disable a performance event */ static void __perf_event_disable(void *info) @@ -640,15 +637,20 @@ group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - struct perf_event *event, *partial_group; + struct perf_event *event, *partial_group = NULL; + const struct pmu *pmu = group_event->pmu; + bool txn = false; int ret; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); - if (ret) - return ret < 0 ? ret : 0; + /* Check if group transaction availabe */ + if (pmu->start_txn) + txn = true; + + if (txn) + pmu->start_txn(pmu); if (event_sched_in(group_event, cpuctx, ctx)) return -EAGAIN; @@ -663,9 +665,19 @@ group_sched_in(struct perf_event *group_event, } } - return 0; + if (!txn) + return 0; + + ret = pmu->commit_txn(pmu); + if (!ret) { + pmu->cancel_txn(pmu); + return 0; + } group_error: + if (txn) + pmu->cancel_txn(pmu); + /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: @@ -1367,6 +1379,8 @@ void perf_event_task_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) return; + perf_disable(); + /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1379,6 +1393,8 @@ void perf_event_task_sched_in(struct task_struct *task) ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); cpuctx->task_ctx = ctx; + + perf_enable(); } #define MAX_INTERRUPTS (~0ULL) @@ -1856,9 +1872,30 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + /* + * Remove from the PMU, can't get re-enabled since we got + * here because the last ref went. + */ + perf_event_disable(event); + WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_event_remove_from_context(event); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&ctx->lock); + list_del_event(event, ctx); + perf_destroy_group(event, ctx); + raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); mutex_lock(&event->owner->perf_event_mutex); @@ -2642,6 +2679,7 @@ static int perf_fasync(int fd, struct file *filp, int on) } static const struct file_operations perf_fops = { + .llseek = no_llseek, .release = perf_release, .read = perf_read, .poll = perf_poll, @@ -2792,6 +2830,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski /* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. + */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + +/* * Output */ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, @@ -3743,7 +3802,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) .event_id = { .header = { .type = PERF_RECORD_MMAP, - .misc = 0, + .misc = PERF_RECORD_MISC_USER, /* .size */ }, /* .pid */ @@ -3961,36 +4020,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, nmi, data, regs); } -static int perf_swevent_is_counting(struct perf_event *event) -{ - /* - * The event is active, we're good! - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - return 1; - - /* - * The event is off/error, not counting. - */ - if (event->state != PERF_EVENT_STATE_INACTIVE) - return 0; - - /* - * The event is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (event->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data); @@ -4014,12 +4043,6 @@ static int perf_swevent_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - if (event->cpu != -1 && event->cpu != smp_processor_id()) - return 0; - - if (!perf_swevent_is_counting(event)) - return 0; - if (event->attr.type != type) return 0; @@ -4036,18 +4059,53 @@ static int perf_swevent_match(struct perf_event *event, return 1; } -static void perf_swevent_ctx_event(struct perf_event_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static struct hlist_head * +find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id) +{ + u64 hash; + struct swevent_hlist *hlist; + + hash = swevent_hash(type, event_id); + + hlist = rcu_dereference(ctx->swevent_hlist); + if (!hlist) + return NULL; + + return &hlist->heads[hash]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { + struct perf_cpu_context *cpuctx; struct perf_event *event; + struct hlist_node *node; + struct hlist_head *head; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + cpuctx = &__get_cpu_var(perf_cpu_context); + + rcu_read_lock(); + + head = find_swevent_head(cpuctx, type, event_id); + + if (!head) + goto end; + + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } +end: + rcu_read_unlock(); } int perf_swevent_get_recursion_context(void) @@ -4085,27 +4143,6 @@ void perf_swevent_put_recursion_context(int rctx) } EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); -static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, - nr, nmi, data, regs); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); - rcu_read_unlock(); -} void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4131,16 +4168,28 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_enable(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct perf_cpu_context *cpuctx; + struct hlist_head *head; + + cpuctx = &__get_cpu_var(perf_cpu_context); if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } + + head = find_swevent_head(cpuctx, event->attr.type, event->attr.config); + if (WARN_ON_ONCE(!head)) + return -EINVAL; + + hlist_add_head_rcu(&event->hlist_entry, head); + return 0; } static void perf_swevent_disable(struct perf_event *event) { + hlist_del_rcu(&event->hlist_entry); } static const struct pmu perf_ops_generic = { @@ -4168,15 +4217,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((event->attr.exclude_kernel || !regs) && - !event->attr.exclude_user) - regs = task_pt_regs(current); - if (regs) { + if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) if (perf_event_overflow(event, 0, &data, regs)) ret = HRTIMER_NORESTART; @@ -4324,6 +4366,105 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; +static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) +{ + struct swevent_hlist *hlist; + + hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); + kfree(hlist); +} + +static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +{ + struct swevent_hlist *hlist; + + if (!cpuctx->swevent_hlist) + return; + + hlist = cpuctx->swevent_hlist; + rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); +} + +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + + mutex_lock(&cpuctx->hlist_mutex); + + if (!--cpuctx->hlist_refcount) + swevent_hlist_release(cpuctx); + + mutex_unlock(&cpuctx->hlist_mutex); +} + +static void swevent_hlist_put(struct perf_event *event) +{ + int cpu; + + if (event->cpu != -1) { + swevent_hlist_put_cpu(event, event->cpu); + return; + } + + for_each_possible_cpu(cpu) + swevent_hlist_put_cpu(event, cpu); +} + +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + int err = 0; + + mutex_lock(&cpuctx->hlist_mutex); + + if (!cpuctx->swevent_hlist && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + cpuctx->hlist_refcount++; + exit: + mutex_unlock(&cpuctx->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; + fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + #ifdef CONFIG_EVENT_TRACING void perf_tp_event(int event_id, u64 addr, u64 count, void *record, @@ -4357,10 +4498,13 @@ static int perf_tp_event_match(struct perf_event *event, static void tp_perf_event_destroy(struct perf_event *event) { perf_trace_disable(event->attr.config); + swevent_hlist_put(event); } static const struct pmu *tp_perf_event_init(struct perf_event *event) { + int err; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. @@ -4374,6 +4518,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return NULL; event->destroy = tp_perf_event_destroy; + err = swevent_hlist_get(event); + if (err) { + perf_trace_disable(event->attr.config); + return ERR_PTR(err); + } return &perf_ops_generic; } @@ -4474,6 +4623,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); atomic_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); } static const struct pmu *sw_perf_event_init(struct perf_event *event) @@ -4512,6 +4662,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_ALIGNMENT_FAULTS: case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return ERR_PTR(err); + atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -4897,7 +5053,7 @@ err_fput_free_put_context: err_free_put_context: if (err < 0) - kfree(event); + free_event(event); err_put_context: if (err < 0) @@ -5176,7 +5332,7 @@ void perf_event_exit_task(struct task_struct *child) * * But since its the parent context it won't be the same instance. */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&child_ctx->mutex); again: list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, @@ -5384,6 +5540,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); + mutex_init(&cpuctx->hlist_mutex); __perf_event_init_context(&cpuctx->ctx, NULL); } } @@ -5397,6 +5554,16 @@ static void __cpuinit perf_event_init_cpu(int cpu) spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); + + mutex_lock(&cpuctx->hlist_mutex); + if (cpuctx->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + WARN_ON_ONCE(!hlist); + rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + } + mutex_unlock(&cpuctx->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU @@ -5416,6 +5583,10 @@ static void perf_event_exit_cpu(int cpu) struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_event_context *ctx = &cpuctx->ctx; + mutex_lock(&cpuctx->hlist_mutex); + swevent_hlist_release(cpuctx); + mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); mutex_unlock(&ctx->mutex); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 42ad8ae729a..9fb51237b18 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -76,7 +76,6 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); - arch_ptrace_untrace(child); if (task_is_traced(child)) ptrace_untrace(child); } diff --git a/kernel/sched.c b/kernel/sched.c index b531d793408..78554dd0d1a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -323,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { + /* + * Strictly speaking this rcu_read_lock() is not needed since the + * task_group is tied to the cgroup, which in turn can never go away + * as long as there are tasks attached to it. + * + * However since task_group() uses task_subsys_state() which is an + * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. + */ + rcu_read_lock(); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; @@ -332,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.parent = task_group(p)->rt_se[cpu]; #endif + rcu_read_unlock(); } #else @@ -2058,49 +2068,6 @@ static bool migrate_task(struct task_struct *p, int dest_cpu) } /* - * wait_task_context_switch - wait for a thread to complete at least one - * context switch. - * - * @p must not be current. - */ -void wait_task_context_switch(struct task_struct *p) -{ - unsigned long nvcsw, nivcsw, flags; - int running; - struct rq *rq; - - nvcsw = p->nvcsw; - nivcsw = p->nivcsw; - for (;;) { - /* - * The runqueue is assigned before the actual context - * switch. We need to take the runqueue lock. - * - * We could check initially without the lock but it is - * very likely that we need to take the lock in every - * iteration. - */ - rq = task_rq_lock(p, &flags); - running = task_running(rq, p); - task_rq_unlock(rq, &flags); - - if (likely(!running)) - break; - /* - * The switch count is incremented before the actual - * context switch. We thus wait for two switches to be - * sure at least one completed. - */ - if ((p->nvcsw - nvcsw) > 1) - break; - if ((p->nivcsw - nivcsw) > 1) - break; - - cpu_relax(); - } -} - -/* * wait_task_inactive - wait for a thread to unschedule. * * If @match_state is nonzero, it's the @p->state value just checked and @@ -3724,7 +3691,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the mutex owner just released it and exited. */ if (probe_kernel_address(&owner->cpu, cpu)) - goto out; + return 0; #else cpu = owner->cpu; #endif @@ -3734,14 +3701,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the cpu field may no longer be valid. */ if (cpu >= nr_cpumask_bits) - goto out; + return 0; /* * We need to validate that we can do a * get_cpu() and that we have the percpu area. */ if (!cpu_online(cpu)) - goto out; + return 0; rq = cpu_rq(cpu); @@ -3760,7 +3727,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) cpu_relax(); } -out: + return 1; } #endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 13e13d428cd..8b1797c4545 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD help See Documentation/trace/ftrace-design.txt -config HAVE_HW_BRANCH_TRACER - bool - config HAVE_SYSCALL_TRACEPOINTS bool help @@ -374,14 +371,6 @@ config STACK_TRACER Say N if unsure. -config HW_BRANCH_TRACER - depends on HAVE_HW_BRANCH_TRACER - bool "Trace hw branches" - select GENERIC_TRACER - help - This tracer records all branches on the system in a circular - buffer, giving access to the last N branches for each cpu. - config KMEMTRACE bool "Trace SLAB allocations" select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 78edc649003..ffb1a5b0550 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 40cd1718fb1..2cd96399463 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,7 +34,6 @@ enum trace_type { TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_BLK, @@ -103,29 +102,17 @@ struct syscall_trace_exit { long ret; }; -struct kprobe_trace_entry { +struct kprobe_trace_entry_head { struct trace_entry ent; unsigned long ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - -struct kretprobe_trace_entry { +struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; unsigned long ret_ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kretprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -229,7 +216,6 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ @@ -470,8 +456,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c16a08f399d..dc008c1240d 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(hw_branch, hw_branch_entry, - - TRACE_HW_BRANCHES, - - F_STRUCT( - __field( u64, from ) - __field( u64, to ) - ), - - F_printk("from: %llx to: %llx", __entry->from, __entry->to) -); - FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, TRACE_KMEM_ALLOC, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 239ea5d77d6..57bb1bb3299 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1400,7 +1400,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, } err = -EINVAL; - if (!call) + if (&call->list == &ftrace_events) goto out_unlock; err = -EEXIST; diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c deleted file mode 100644 index 7b97000745f..00000000000 --- a/kernel/trace/trace_hw_branches.c +++ /dev/null @@ -1,312 +0,0 @@ -/* - * h/w branch tracer for x86 based on BTS - * - * Copyright (C) 2008-2009 Intel Corporation. - * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009 - */ -#include <linux/kallsyms.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/fs.h> - -#include <asm/ds.h> - -#include "trace_output.h" -#include "trace.h" - - -#define BTS_BUFFER_SIZE (1 << 13) - -static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer); -static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer); - -#define this_tracer per_cpu(hwb_tracer, smp_processor_id()) - -static int trace_hw_branches_enabled __read_mostly; -static int trace_hw_branches_suspended __read_mostly; -static struct trace_array *hw_branch_trace __read_mostly; - - -static void bts_trace_init_cpu(int cpu) -{ - per_cpu(hwb_tracer, cpu) = - ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu), - BTS_BUFFER_SIZE, NULL, (size_t)-1, - BTS_KERNEL); - - if (IS_ERR(per_cpu(hwb_tracer, cpu))) - per_cpu(hwb_tracer, cpu) = NULL; -} - -static int bts_trace_init(struct trace_array *tr) -{ - int cpu; - - hw_branch_trace = tr; - trace_hw_branches_enabled = 0; - - get_online_cpus(); - for_each_online_cpu(cpu) { - bts_trace_init_cpu(cpu); - - if (likely(per_cpu(hwb_tracer, cpu))) - trace_hw_branches_enabled = 1; - } - trace_hw_branches_suspended = 0; - put_online_cpus(); - - /* If we could not enable tracing on a single cpu, we fail. */ - return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - trace_hw_branches_enabled = 0; - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_start(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 0; - put_online_cpus(); -} - -static void bts_trace_stop(struct trace_array *tr) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - trace_hw_branches_suspended = 1; - put_online_cpus(); -} - -static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - /* The notification is sent with interrupts enabled. */ - if (trace_hw_branches_enabled) { - bts_trace_init_cpu(cpu); - - if (trace_hw_branches_suspended && - likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - } - break; - - case CPU_DOWN_PREPARE: - /* The notification is sent with interrupts enabled. */ - if (likely(per_cpu(hwb_tracer, cpu))) { - ds_release_bts(per_cpu(hwb_tracer, cpu)); - per_cpu(hwb_tracer, cpu) = NULL; - } - } - - return NOTIFY_DONE; -} - -static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { - .notifier_call = bts_hotcpu_handler -}; - -static void bts_trace_print_header(struct seq_file *m) -{ - seq_puts(m, "# CPU# TO <- FROM\n"); -} - -static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) -{ - unsigned long symflags = TRACE_ITER_SYM_OFFSET; - struct trace_entry *entry = iter->ent; - struct trace_seq *seq = &iter->seq; - struct hw_branch_entry *it; - - trace_assign_type(it, entry); - - if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", iter->cpu) && - seq_print_ip_sym(seq, it->to, symflags) && - trace_seq_printf(seq, "\t <- ") && - seq_print_ip_sym(seq, it->from, symflags) && - trace_seq_printf(seq, "\n")) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_UNHANDLED; -} - -void trace_hw_branch(u64 from, u64 to) -{ - struct ftrace_event_call *call = &event_hw_branch; - struct trace_array *tr = hw_branch_trace; - struct ring_buffer_event *event; - struct ring_buffer *buf; - struct hw_branch_entry *entry; - unsigned long irq1; - int cpu; - - if (unlikely(!tr)) - return; - - if (unlikely(!trace_hw_branches_enabled)) - return; - - local_irq_save(irq1); - cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) - goto out; - - buf = tr->buffer; - event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_HW_BRANCHES; - entry->from = from; - entry->to = to; - if (!filter_check_discard(call, entry, buf, event)) - trace_buffer_unlock_commit(buf, event, 0, 0); - - out: - atomic_dec(&tr->data[cpu]->disabled); - local_irq_restore(irq1); -} - -static void trace_bts_at(const struct bts_trace *trace, void *at) -{ - struct bts_struct bts; - int err = 0; - - WARN_ON_ONCE(!trace->read); - if (!trace->read) - return; - - err = trace->read(this_tracer, at, &bts); - if (err < 0) - return; - - switch (bts.qualifier) { - case BTS_BRANCH: - trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); - break; - } -} - -/* - * Collect the trace on the current cpu and write it into the ftrace buffer. - * - * pre: tracing must be suspended on the current cpu - */ -static void trace_bts_cpu(void *arg) -{ - struct trace_array *tr = (struct trace_array *)arg; - const struct bts_trace *trace; - unsigned char *at; - - if (unlikely(!tr)) - return; - - if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) - return; - - if (unlikely(!this_tracer)) - return; - - trace = ds_read_bts(this_tracer); - if (!trace) - return; - - for (at = trace->ds.top; (void *)at < trace->ds.end; - at += trace->ds.size) - trace_bts_at(trace, at); - - for (at = trace->ds.begin; (void *)at < trace->ds.top; - at += trace->ds.size) - trace_bts_at(trace, at); -} - -static void trace_bts_prepare(struct trace_iterator *iter) -{ - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_suspend_bts(per_cpu(hwb_tracer, cpu)); - /* - * We need to collect the trace on the respective cpu since ftrace - * implicitly adds the record for the current cpu. - * Once that is more flexible, we could collect the data from any cpu. - */ - on_each_cpu(trace_bts_cpu, iter->tr, 1); - - for_each_online_cpu(cpu) - if (likely(per_cpu(hwb_tracer, cpu))) - ds_resume_bts(per_cpu(hwb_tracer, cpu)); - put_online_cpus(); -} - -static void trace_bts_close(struct trace_iterator *iter) -{ - tracing_reset_online_cpus(iter->tr); -} - -void trace_hw_branch_oops(void) -{ - if (this_tracer) { - ds_suspend_bts_noirq(this_tracer); - trace_bts_cpu(hw_branch_trace); - ds_resume_bts_noirq(this_tracer); - } -} - -struct tracer bts_tracer __read_mostly = -{ - .name = "hw-branch-tracer", - .init = bts_trace_init, - .reset = bts_trace_reset, - .print_header = bts_trace_print_header, - .print_line = bts_trace_print_line, - .start = bts_trace_start, - .stop = bts_trace_stop, - .open = trace_bts_prepare, - .close = trace_bts_close, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_hw_branches, -#endif /* CONFIG_FTRACE_SELFTEST */ -}; - -__init static int init_bts_trace(void) -{ - register_hotcpu_notifier(&bts_hotcpu_notifier); - return register_tracer(&bts_tracer); -} -device_initcall(init_bts_trace); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0e3ded64cdb..9a082bba953 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -29,6 +29,8 @@ #include <linux/ctype.h> #include <linux/ptrace.h> #include <linux/perf_event.h> +#include <linux/stringify.h> +#include <asm/bitsperlong.h> #include "trace.h" #include "trace_output.h" @@ -40,7 +42,6 @@ /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_NARGS "__probe_nargs" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" @@ -52,56 +53,102 @@ const char *reserved_field_names[] = { "common_tgid", "common_lock_depth", FIELD_STRING_IP, - FIELD_STRING_NARGS, FIELD_STRING_RETIP, FIELD_STRING_FUNC, }; -struct fetch_func { - unsigned long (*func)(struct pt_regs *, void *); +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ +static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, void *data)\ +{ \ + return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ +} \ +static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); + +struct fetch_param { + fetch_func_t fn; void *data; }; -static __kprobes unsigned long call_fetch(struct fetch_func *f, - struct pt_regs *regs) +static __kprobes void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) { - return f->func(regs, f->data); + return fprm->fn(regs, fprm->data, dest); } -/* fetch handlers */ -static __kprobes unsigned long fetch_register(struct pt_regs *regs, - void *offset) -{ - return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(kind) \ +DEFINE_FETCH_##kind(u8) \ +DEFINE_FETCH_##kind(u16) \ +DEFINE_FETCH_##kind(u32) \ +DEFINE_FETCH_##kind(u64) + +#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ + ((FETCH_FUNC_NAME(kind, u8) == fn) || \ + (FETCH_FUNC_NAME(kind, u16) == fn) || \ + (FETCH_FUNC_NAME(kind, u32) == fn) || \ + (FETCH_FUNC_NAME(kind, u64) == fn)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ } - -static __kprobes unsigned long fetch_stack(struct pt_regs *regs, - void *num) -{ - return regs_get_kernel_stack_nth(regs, - (unsigned int)((unsigned long)num)); +DEFINE_BASIC_FETCH_FUNCS(reg) + +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ } +DEFINE_BASIC_FETCH_FUNCS(stack) -static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) -{ - unsigned long retval; - - if (probe_kernel_address(addr, retval)) - return 0; - return retval; +#define DEFINE_FETCH_retval(type) \ +static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ } - -static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, - void *dummy) -{ - return regs_return_value(regs); -} - -static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, - void *dummy) -{ - return kernel_stack_pointer(regs); +DEFINE_BASIC_FETCH_FUNCS(retval) + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ } +DEFINE_BASIC_FETCH_FUNCS(memory) /* Memory fetching by symbol */ struct symbol_cache { @@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) return sc; } -static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) -{ - struct symbol_cache *sc = data; - - if (sc->addr) - return fetch_memory(regs, (void *)sc->addr); - else - return 0; +#define DEFINE_FETCH_symbol(type) \ +static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(symbol) -/* Special indirect memory access interface */ -struct indirect_fetch_data { - struct fetch_func orig; +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; long offset; }; -static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) -{ - struct indirect_fetch_data *ind = data; - unsigned long addr; - - addr = call_fetch(&ind->orig, regs); - if (addr) { - addr += ind->offset; - return fetch_memory(regs, (void *)addr); - } else - return 0; +#define DEFINE_FETCH_deref(type) \ +static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if (addr) { \ + addr += dprm->offset; \ + fetch_memory_##type(regs, (void *)addr, dest); \ + } else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(deref) -static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (data->orig.func == fetch_indirect) - free_indirect_fetch_data(data->orig.data); - else if (data->orig.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(kind, type) \ + .kind = FETCH_FUNC_NAME(kind, type) + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + {.name = #ptype, \ + .size = sizeof(ftype), \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } + +/* Fetch type information table */ +static const struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + /* Fetch functions */ + fetch_func_t reg; + fetch_func_t stack; + fetch_func_t retval; + fetch_func_t memory; + fetch_func_t symbol; + fetch_func_t deref; +} fetch_type_table[] = { + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), +}; + +static const struct fetch_type *find_fetch_type(const char *type) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) + if (strcmp(type, fetch_type_table[i].name) == 0) + return &fetch_type_table[i]; + return NULL; +} + +/* Special function : only accept unsigned long */ +static __kprobes void fetch_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} + /** * Kprobe event core functions */ struct probe_arg { - struct fetch_func fetch; - const char *name; + struct fetch_param fetch; + unsigned int offset; /* Offset from argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ }; /* Flags for trace_probe */ @@ -204,6 +326,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_class class; struct ftrace_event_call call; + ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; }; @@ -212,6 +335,7 @@ struct trace_probe { (offsetof(struct trace_probe, args) + \ (sizeof(struct probe_arg) * (n))) + static __kprobes int probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; @@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp) return tp->symbol ? tp->symbol : "unknown"; } -static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) -{ - int ret = -EINVAL; - - if (ff->func == fetch_register) { - const char *name; - name = regs_query_register_name((unsigned int)((long)ff->data)); - ret = snprintf(buf, n, "%%%s", name); - } else if (ff->func == fetch_stack) - ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); - else if (ff->func == fetch_memory) - ret = snprintf(buf, n, "@0x%p", ff->data); - else if (ff->func == fetch_symbol) { - struct symbol_cache *sc = ff->data; - if (sc->offset) - ret = snprintf(buf, n, "@%s%+ld", sc->symbol, - sc->offset); - else - ret = snprintf(buf, n, "@%s", sc->symbol); - } else if (ff->func == fetch_retvalue) - ret = snprintf(buf, n, "$retval"); - else if (ff->func == fetch_stack_address) - ret = snprintf(buf, n, "$stack"); - else if (ff->func == fetch_indirect) { - struct indirect_fetch_data *id = ff->data; - size_t l = 0; - ret = snprintf(buf, n, "%+ld(", id->offset); - if (ret >= n) - goto end; - l += ret; - ret = probe_arg_string(buf + l, n - l, &id->orig); - if (ret < 0) - goto end; - l += ret; - ret = snprintf(buf + l, n - l, ")"); - ret += l; - } -end: - if (ret >= n) - return -ENOSPC; - return ret; -} - static int register_probe_event(struct trace_probe *tp); static void unregister_probe_event(struct trace_probe *tp); @@ -348,11 +429,12 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (arg->fetch.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) + free_deref_fetch_param(arg->fetch.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); - else if (arg->fetch.func == fetch_indirect) - free_indirect_fetch_data(arg->fetch.data); kfree(arg->name); + kfree(arg->comm); } static void free_trace_probe(struct trace_probe *tp) @@ -458,28 +540,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_ARGS 16 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) -static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_vars(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) { - ff->func = fetch_retvalue; - ff->data = NULL; - } else + if (is_return) + f->fn = t->retval; + else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - ff->func = fetch_stack_address; - ff->data = NULL; + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) + f->fn = fetch_stack_address; + else + ret = -EINVAL; } else if (isdigit(arg[5])) { ret = strict_strtoul(arg + 5, 10, ¶m); if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - ff->func = fetch_stack; - ff->data = (void *)param; + f->fn = t->stack; + f->data = (void *)param; } } else ret = -EINVAL; @@ -489,7 +573,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) } /* Recursive argument parser */ -static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int __parse_probe_arg(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; @@ -498,13 +583,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, ff, is_return); + ret = parse_probe_vars(arg + 1, t, f, is_return); break; case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - ff->func = fetch_register; - ff->data = (void *)(unsigned long)ret; + f->fn = t->reg; + f->data = (void *)(unsigned long)ret; ret = 0; } break; @@ -513,26 +598,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - ff->func = fetch_memory; - ff->data = (void *)param; + f->fn = t->memory; + f->data = (void *)param; } else { ret = split_symbol_offset(arg + 1, &offset); if (ret) break; - ff->data = alloc_symbol_cache(arg + 1, offset); - if (ff->data) - ff->func = fetch_symbol; - else - ret = -EINVAL; + f->data = alloc_symbol_cache(arg + 1, offset); + if (f->data) + f->fn = t->symbol; } break; - case '+': /* indirect memory */ + case '+': /* deref memory */ case '-': tmp = strchr(arg, '('); - if (!tmp) { - ret = -EINVAL; + if (!tmp) break; - } *tmp = '\0'; ret = strict_strtol(arg + 1, 0, &offset); if (ret) @@ -542,38 +623,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { - struct indirect_fetch_data *id; + struct deref_fetch_param *dprm; + const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - id = kzalloc(sizeof(struct indirect_fetch_data), - GFP_KERNEL); - if (!id) + dprm = kzalloc(sizeof(struct deref_fetch_param), + GFP_KERNEL); + if (!dprm) return -ENOMEM; - id->offset = offset; - ret = __parse_probe_arg(arg, &id->orig, is_return); + dprm->offset = offset; + ret = __parse_probe_arg(arg, t2, &dprm->orig, + is_return); if (ret) - kfree(id); + kfree(dprm); else { - ff->func = fetch_indirect; - ff->data = (void *)id; + f->fn = t->deref; + f->data = (void *)dprm; } - } else - ret = -EINVAL; + } break; - default: - /* TODO: support custom handler */ - ret = -EINVAL; } + if (!ret && !f->fn) + ret = -EINVAL; return ret; } /* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_arg(char *arg, struct trace_probe *tp, + struct probe_arg *parg, int is_return) { + const char *t; + if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; } - return __parse_probe_arg(arg, ff, is_return); + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = tp->size; + tp->size += parg->type->size; + return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); } /* Return 1 if name is reserved or already used by another argument */ @@ -603,15 +704,18 @@ static int create_trace_probe(int argc, char **argv) * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG - * Indirect memory fetch: + * Dereferencing memory fetch: * +|-offs(ARG) : fetch memory at ARG +|- offs address. * Alias name of args: * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_probe *tp; int i, ret = 0; int is_return = 0, is_delete = 0; - char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + char *symbol = NULL, *event = NULL, *group = NULL; + char *arg, *tmp; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -724,13 +828,6 @@ static int create_trace_probe(int argc, char **argv) else arg = argv[i]; - if (conflict_field_name(argv[i], tp->args, i)) { - pr_info("Argument%d name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); if (!tp->args[i].name) { pr_info("Failed to allocate argument%d name '%s'.\n", @@ -738,9 +835,19 @@ static int create_trace_probe(int argc, char **argv) ret = -ENOMEM; goto error; } + tmp = strchr(tp->args[i].name, ':'); + if (tmp) + *tmp = '_'; /* convert : to _ */ + + if (conflict_field_name(tp->args[i].name, tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } /* Parse fetch argument */ - ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { pr_info("Parse error at argument%d. (%d)\n", i, ret); kfree(tp->args[i].name); @@ -795,8 +902,7 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { struct trace_probe *tp = v; - int i, ret; - char buf[MAX_ARGSTR_LEN + 1]; + int i; seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); @@ -808,15 +914,10 @@ static int probes_seq_show(struct seq_file *m, void *v) else seq_printf(m, " %s", probe_symbol(tp)); - for (i = 0; i < tp->nr_args; i++) { - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); - if (ret < 0) { - pr_warning("Argument%d decoding error(%d).\n", i, ret); - return ret; - } - seq_printf(m, " %s=%s", tp->args[i].name, buf); - } + for (i = 0; i < tp->nr_args; i++) + seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); seq_printf(m, "\n"); + return 0; } @@ -946,9 +1047,10 @@ static const struct file_operations kprobe_profile_ops = { static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -958,7 +1060,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); @@ -966,10 +1068,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -980,9 +1082,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -990,7 +1093,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); @@ -998,11 +1101,11 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1013,12 +1116,13 @@ enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { - struct kprobe_trace_entry *field; + struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_probe *tp; + u8 *data; int i; - field = (struct kprobe_trace_entry *)iter->ent; + field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); if (!trace_seq_printf(s, "%s: (", tp->call.name)) @@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags, if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1047,12 +1152,13 @@ enum print_line_t print_kretprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { - struct kretprobe_trace_entry *field; + struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_probe *tp; + u8 *data; int i; - field = (struct kretprobe_trace_entry *)iter->ent; + field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); if (!trace_seq_printf(s, "%s: (", tp->call.name)) @@ -1070,9 +1176,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1126,29 +1233,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call) static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kprobe_trace_entry field; + struct kprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kretprobe_trace_entry field; + struct kretprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } @@ -1173,8 +1294,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", - tp->args[i].name); + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); @@ -1216,12 +1337,13 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; + u8 *data; int size, __size, i; unsigned long irq_flags; int rctx; - __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1233,10 +1355,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, if (!entry) return; - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); } @@ -1247,12 +1369,13 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; + u8 *data; int size, __size, i; unsigned long irq_flags; int rctx; - __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1264,11 +1387,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, if (!entry) return; - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags, regs); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index d59cd687947..8eaf00749b6 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -34,12 +34,6 @@ #include <asm/atomic.h> -/* - * For now, let us restrict the no. of symbols traced simultaneously to number - * of available hardware breakpoint registers. - */ -#define KSYM_TRACER_MAX HBP_NUM - #define KSYM_TRACER_OP_LEN 3 /* rw- */ struct trace_ksym { @@ -53,7 +47,6 @@ struct trace_ksym { static struct trace_array *ksym_trace_array; -static unsigned int ksym_filter_entry_count; static unsigned int ksym_tracing_enabled; static HLIST_HEAD(ksym_filter_head); @@ -181,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) struct trace_ksym *entry; int ret = -ENOMEM; - if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { - printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" - " new requests for tracing can be accepted now.\n", - KSYM_TRACER_MAX); - return -ENOSPC; - } - entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); if (!entry) return -ENOMEM; @@ -203,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) if (IS_ERR(entry->ksym_hbp)) { ret = PTR_ERR(entry->ksym_hbp); - printk(KERN_INFO "ksym_tracer request failed. Try again" - " later!!\n"); + if (ret == -ENOSPC) { + printk(KERN_ERR "ksym_tracer: Maximum limit reached." + " No new requests for tracing can be accepted now.\n"); + } else { + printk(KERN_INFO "ksym_tracer request failed. Try again" + " later!!\n"); + } goto err; } hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); - ksym_filter_entry_count++; return 0; @@ -265,7 +255,6 @@ static void __ksym_trace_reset(void) hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, ksym_hlist) { unregister_wide_hw_breakpoint(entry->ksym_hbp); - ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry); @@ -338,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file, goto out_unlock; } /* Error or "symbol:---" case: drop it */ - ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); kfree(entry); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 6a9d36ddfcf..250e7f9bd2f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -17,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_HW_BRANCHES: case TRACE_KSYM: return 1; } @@ -756,62 +755,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_HW_BRANCH_TRACER -int -trace_selftest_startup_hw_branches(struct tracer *trace, - struct trace_array *tr) -{ - struct trace_iterator *iter; - struct tracer tracer; - unsigned long count; - int ret; - - if (!trace->open) { - printk(KERN_CONT "missing open function..."); - return -1; - } - - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* - * The hw-branch tracer needs to collect the trace from the various - * cpu trace buffers - before tracing is stopped. - */ - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - memcpy(&tracer, trace, sizeof(tracer)); - - iter->trace = &tracer; - iter->tr = tr; - iter->pos = -1; - mutex_init(&iter->mutex); - - trace->open(iter); - - mutex_destroy(&iter->mutex); - kfree(iter); - - tracing_stop(); - - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT "no entries found.."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_HW_BRANCH_TRACER */ - #ifdef CONFIG_KSYM_TRACER static int ksym_selftest_dummy; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dee48658805..5bfb213984b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -774,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(keventd_wq, get_cpu()); + cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); __queue_work(cwq, &dwork->work); put_cpu(); } |