From 935055856458a05c43f518bf9ed406f67c090f0a Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 14 May 2012 15:04:50 -0700
Subject: workqueue: skip nr_running sanity check in worker_enter_idle() if trustee is active

commit 544ecf310f0e7f51fa057ac2a295fc1b3b35a9d3 upstream.

worker_enter_idle() has a WARN_ON_ONCE() which triggers if nr_running
isn't zero when every worker is idle.  This can trigger spuriously
while a cpu is going down due to the way the trustee sets
%WORKER_ROGUE and zaps nr_running.

It first sets %WORKER_ROGUE on all workers without updating
nr_running, releases gcwq->lock, schedules, regrabs gcwq->lock and
then zaps nr_running.  If the last running worker enters idle in
between, it would see stale nr_running which hasn't been zapped yet
and trigger the WARN_ON_ONCE().

Fix it by performing the sanity check iff the trustee is idle.

Signed-off-by: Tejun Heo
Reported-by: "Paul E. McKenney"
Signed-off-by: Greg Kroah-Hartman
---
 kernel/workqueue.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1456daba9a0..ee1845b8d69 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1214,8 +1214,13 @@ static void worker_enter_idle(struct worker *worker)
 	} else
 		wake_up_all(&gcwq->trustee_wait);
 
-	/* sanity check nr_running */
-	WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
+	/*
+	 * Sanity check nr_running.  Because trustee releases gcwq->lock
+	 * between setting %WORKER_ROGUE and zapping nr_running, the
+	 * warning may trigger spuriously.  Check iff trustee is idle.
+	 */
+	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+		     gcwq->nr_workers == gcwq->nr_idle &&
 		     atomic_read(get_gcwq_nr_running(gcwq->cpu)));
 }
-- 
cgit v1.2.3

From b1c7ba1bab7363fee6dc5d4ee5be4e916adcf691 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 17 Jul 2012 12:39:26 -0700
Subject: workqueue: perform cpu down operations from low priority cpu_notifier()

commit 6575820221f7a4dd6eadecf7bf83cdd154335eda upstream.

Currently, all workqueue cpu hotplug operations run off
CPU_PRI_WORKQUEUE which is higher than normal notifiers.  This is to
ensure that workqueue is up and running while bringing up a CPU before
other notifiers try to use workqueue on the CPU.

Per-cpu workqueues are supposed to remain working and bound to the CPU
for normal CPU_DOWN_PREPARE notifiers.  This holds mostly true even
with workqueue offlining running with higher priority because
workqueue CPU_DOWN_PREPARE only creates a bound trustee thread which
runs the per-cpu workqueue without concurrency management and without
explicitly detaching the existing workers.

However, if the trustee needs to create new workers, it creates
unbound workers which may wander off to other CPUs while
CPU_DOWN_PREPARE notifiers are in progress.  Furthermore, if the CPU
down is cancelled, the per-CPU workqueue may end up with workers which
aren't bound to the CPU.

While reliably reproducible with a convoluted artificial test-case
involving scheduling and flushing CPU burning work items from CPU down
notifiers, this isn't very likely to happen in the wild, and, even
when it happens, the effects are likely to be hidden by the following
successful CPU down.

Fix it by using different priorities for up and down notifiers - high
priority for up operations and low priority for down operations.

Workqueue cpu hotplug operations will soon go through further cleanup.

Signed-off-by: Tejun Heo
Acked-by: "Rafael J. Wysocki"
Signed-off-by: Greg Kroah-Hartman
---
 kernel/workqueue.c | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee1845b8d69..e88c924fc6b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3561,6 +3561,41 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	return notifier_from_errno(0);
 }
 
+/*
+ * Workqueues should be brought up before normal priority CPU notifiers.
+ * This will be registered high priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+					       unsigned long action,
+					       void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_CANCELED:
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE:
+		return workqueue_cpu_callback(nfb, action, hcpu);
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as low priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+						 unsigned long action,
+						 void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+	case CPU_DYING:
+	case CPU_POST_DEAD:
+		return workqueue_cpu_callback(nfb, action, hcpu);
+	}
+	return NOTIFY_OK;
+}
+
 #ifdef CONFIG_SMP
 
 struct work_for_cpu {
@@ -3754,7 +3789,8 @@ static int __init init_workqueues(void)
 	unsigned int cpu;
 	int i;
 
-	cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
+	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
+	cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
 
 	/* initialize gcwqs */
 	for_each_gcwq_cpu(cpu) {
-- 
cgit v1.2.3

From 3d45db6b5158a7f09f8be651577416ef6a4dcbd4 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 18 Sep 2012 12:48:43 -0700
Subject: workqueue: reimplement work_on_cpu() using system_wq

commit ed48ece27cd3d5ee0354c32bbaec0f3e1d4715c3 upstream.

The existing work_on_cpu() implementation is hugely inefficient.  It
creates a new kthread, executes that single function and then lets the
kthread die, on each invocation.  Now that system_wq can handle
concurrent executions, there's no advantage to doing this.
Reimplement work_on_cpu() using system_wq, which makes it simpler and
far more efficient.

stable: While this isn't a fix in itself, it's needed to fix a
workqueue related bug in cpufreq/powernow-k8.  AFAICS, this shouldn't
break other existing users.

Signed-off-by: Tejun Heo
Acked-by: Jiri Kosina
Cc: Linus Torvalds
Cc: Bjorn Helgaas
Cc: Len Brown
Cc: Rafael J. Wysocki
Signed-off-by: Greg Kroah-Hartman
---
 kernel/workqueue.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e88c924fc6b..ebd96393ae5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3599,18 +3599,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
 #ifdef CONFIG_SMP
 
 struct work_for_cpu {
-	struct completion completion;
+	struct work_struct work;
 	long (*fn)(void *);
 	void *arg;
 	long ret;
 };
 
-static int do_work_for_cpu(void *_wfc)
+static void work_for_cpu_fn(struct work_struct *work)
 {
-	struct work_for_cpu *wfc = _wfc;
+	struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+
 	wfc->ret = wfc->fn(wfc->arg);
-	complete(&wfc->completion);
-	return 0;
 }
 
 /**
@@ -3625,19 +3624,11 @@ static int do_work_for_cpu(void *_wfc)
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
-	struct task_struct *sub_thread;
-	struct work_for_cpu wfc = {
-		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
-		.fn = fn,
-		.arg = arg,
-	};
+	struct work_for_cpu wfc = { .fn = fn, .arg = arg };
 
-	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
-	if (IS_ERR(sub_thread))
-		return PTR_ERR(sub_thread);
-	kthread_bind(sub_thread, cpu);
-	wake_up_process(sub_thread);
-	wait_for_completion(&wfc.completion);
+	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+	schedule_work_on(cpu, &wfc.work);
+	flush_work(&wfc.work);
 	return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
-- 
cgit v1.2.3
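The caller-visible contract of work_on_cpu() is unchanged by the
reimplementation: it runs @fn with @arg on the given CPU and returns
the function's return value, sleeping until completion.  A minimal
caller-side sketch, assuming a hypothetical callback (work_on_cpu()
and smp_processor_id() are the real interfaces):

	#include <linux/workqueue.h>
	#include <linux/smp.h>
	#include <linux/kernel.h>

	/* Hypothetical callback: executed on the target CPU via system_wq. */
	static long which_cpu(void *arg)
	{
		return (long)smp_processor_id();
	}

	/* Process context only -- work_on_cpu() sleeps in flush_work(). */
	static void example_caller(void)
	{
		long cpu_id = work_on_cpu(3, which_cpu, NULL);

		pr_info("callback ran on cpu %ld\n", cpu_id);
	}

One consequence of the new implementation worth noting: since @fn now
executes from system_wq rather than a dedicated kthread, flushing
system_wq from within @fn could deadlock.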
Wysocki" Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee1845b8d69..e88c924fc6b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3561,6 +3561,41 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, return notifier_from_errno(0); } +/* + * Workqueues should be brought up before normal priority CPU notifiers. + * This will be registered high priority CPU notifier. + */ +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + return workqueue_cpu_callback(nfb, action, hcpu); + } + return NOTIFY_OK; +} + +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. + */ +static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + case CPU_DYING: + case CPU_POST_DEAD: + return workqueue_cpu_callback(nfb, action, hcpu); + } + return NOTIFY_OK; +} + #ifdef CONFIG_SMP struct work_for_cpu { @@ -3754,7 +3789,8 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); + cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { -- cgit v1.2.3 From 3d45db6b5158a7f09f8be651577416ef6a4dcbd4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Sep 2012 12:48:43 -0700 Subject: workqueue: reimplement work_on_cpu() using system_wq commit ed48ece27cd3d5ee0354c32bbaec0f3e1d4715c3 upstream. The existing work_on_cpu() implementation is hugely inefficient. It creates a new kthread, execute that single function and then let the kthread die on each invocation. Now that system_wq can handle concurrent executions, there's no advantage of doing this. Reimplement work_on_cpu() using system_wq which makes it simpler and way more efficient. stable: While this isn't a fix in itself, it's needed to fix a workqueue related bug in cpufreq/powernow-k8. AFAICS, this shouldn't break other existing users. Signed-off-by: Tejun Heo Acked-by: Jiri Kosina Cc: Linus Torvalds Cc: Bjorn Helgaas Cc: Len Brown Cc: Rafael J. 
From 21de4eb26ec0b1b9c484da823fbcd1d3a48afec9 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 3 Aug 2012 10:30:45 -0700
Subject: workqueue: add missing smp_wmb() in process_one_work()

commit 959d1af8cffc8fd38ed53e8be1cf4ab8782f9c00 upstream.

WORK_STRUCT_PENDING is used to claim ownership of a work item and
process_one_work() releases it before starting execution.  When
someone else grabs PENDING, all pre-release updates to the work item
should be visible and all updates made by the new owner should happen
afterwards.

Grabbing PENDING uses test_and_set_bit() and thus has a full barrier;
however, clearing doesn't have a matching wmb.  Given the preceding
spin_unlock and use of clear_bit, I don't believe this can be a
problem on an actual machine and there hasn't been any related report,
but it still is theoretically possible for clear_pending to permeate
upwards and happen before the work->entry update.

Add an explicit smp_wmb() before work_clear_pending().

Signed-off-by: Tejun Heo
Cc: Oleg Nesterov
Signed-off-by: Greg Kroah-Hartman
---
 kernel/workqueue.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 00c0bad5060..aef94527595 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1868,7 +1868,9 @@ __acquires(&gcwq->lock)
 
 	spin_unlock_irq(&gcwq->lock);
 
+	smp_wmb();	/* paired with test_and_set_bit(PENDING) */
 	work_clear_pending(work);
+
 	lock_map_acquire_read(&cwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
 	trace_workqueue_execute_start(work);
-- 
cgit v1.2.3
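The barrier pairing, reduced to a sketch of the ordering contract (the
identifiers are the real ones from workqueue.h; the surrounding code
is abbreviated): the releasing owner must make its updates to the work
item visible before PENDING can be observed clear, and the next
owner's test_and_set_bit() supplies the matching full barrier on the
acquire side:

	/* releasing owner (process_one_work) */
	/* ... pre-release updates, e.g. unlinking work->entry ... */
	smp_wmb();			/* order the updates ... */
	work_clear_pending(work);	/* ... before releasing PENDING */

	/* next owner (queueing path) */
	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
		/*
		 * test_and_set_bit() implies a full barrier: every
		 * pre-release update above is visible from this point on.
		 */
	}

work_clear_pending() boils down to clear_bit(), which carries no
ordering guarantee of its own; that is why the release side needs the
explicit smp_wmb().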
From cc3c85dfa1a7e8d32c6e625176269f9dff88447d Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Wed, 28 Nov 2012 07:17:18 +0100
Subject: workqueue: exit rescuer_thread() as TASK_RUNNING

commit 412d32e6c98527078779e5b515823b2810e40324 upstream.

A rescue thread exiting TASK_INTERRUPTIBLE can lead to a task
scheduling off, never to be seen again.  In the case where this
occurred, an exiting thread hit reiserfs' homebrew conditional resched
while holding a mutex, bringing the box to its knees.

PID: 18105  TASK: ffff8807fd412180  CPU: 5  COMMAND: "kdmflush"
 #0 [ffff8808157e7670] schedule at ffffffff8143f489
 #1 [ffff8808157e77b8] reiserfs_get_block at ffffffffa038ab2d [reiserfs]
 #2 [ffff8808157e79a8] __block_write_begin at ffffffff8117fb14
 #3 [ffff8808157e7a98] reiserfs_write_begin at ffffffffa0388695 [reiserfs]
 #4 [ffff8808157e7ad8] generic_perform_write at ffffffff810ee9e2
 #5 [ffff8808157e7b58] generic_file_buffered_write at ffffffff810eeb41
 #6 [ffff8808157e7ba8] __generic_file_aio_write at ffffffff810f1a3a
 #7 [ffff8808157e7c58] generic_file_aio_write at ffffffff810f1c88
 #8 [ffff8808157e7cc8] do_sync_write at ffffffff8114f850
 #9 [ffff8808157e7dd8] do_acct_process at ffffffff810a268f
    [exception RIP: kernel_thread_helper]
    RIP: ffffffff8144a5c0  RSP: ffff8808157e7f58  RFLAGS: 00000202
    RAX: 0000000000000000  RBX: 0000000000000000  RCX: 0000000000000000
    RDX: 0000000000000000  RSI: ffffffff8107af60  RDI: ffff8803ee491d18
    RBP: 0000000000000000  R8:  0000000000000000  R9:  0000000000000000
    R10: 0000000000000000  R11: 0000000000000000  R12: 0000000000000000
    R13: 0000000000000000  R14: 0000000000000000  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018

Signed-off-by: Mike Galbraith
Signed-off-by: Tejun Heo
Signed-off-by: Greg Kroah-Hartman
---
 kernel/workqueue.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index aef94527595..472cfe38520 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2044,8 +2044,10 @@ static int rescuer_thread(void *__wq)
 repeat:
 	set_current_state(TASK_INTERRUPTIBLE);
 
-	if (kthread_should_stop())
+	if (kthread_should_stop()) {
+		__set_current_state(TASK_RUNNING);
 		return 0;
+	}
 
 	/*
 	 * See whether any cpu is asking for help.  Unbounded
-- 
cgit v1.2.3

From 3ff0aeceb49aca3f99c37189e949bf43bfd393a0 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 4 Dec 2012 07:40:39 -0800
Subject: workqueue: convert BUG_ON()s in __queue_delayed_work() to WARN_ON_ONCE()s

commit fc4b514f2727f74a4587c31db87e0e93465518c3 upstream.

8852aac25e ("workqueue: mod_delayed_work_on() shouldn't queue timer on
0 delay") unexpectedly uncovered a very nasty abuse of delayed_work in
megaraid - it allocated a work_struct, cast it to delayed_work and
then passed that into queue_delayed_work().

Previously, this was okay because a 0 @delay short-circuited to
queue_work() before doing anything with the delayed_work.  8852aac25e
moved the 0 @delay test into __queue_delayed_work() after the sanity
check on the delayed_work, making megaraid trigger the BUG_ON().

Although megaraid is already fixed by c1d390d8e6 ("megaraid: fix
BUG_ON() from incorrect use of delayed work"), this patch converts the
BUG_ON()s in __queue_delayed_work() to WARN_ON_ONCE()s so that such
abusers, if there are more, trigger a warning but don't crash the
machine.
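The rule the fix restores is generic to any kthread using the
set-state-then-check idiom: whoever sets TASK_INTERRUPTIBLE must put
the task back to TASK_RUNNING before leaving the loop, or any later
call into schedule() - here, reiserfs' homebrew conditional resched in
the trace above - parks the exiting task with nobody left to wake it.
A minimal sketch of the canonical loop, assuming a hypothetical
have_work() predicate (the state and kthread calls are real):

	#include <linux/kthread.h>
	#include <linux/sched.h>

	static int my_rescuer(void *data)
	{
		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (kthread_should_stop()) {
				/* undo the state change before returning */
				__set_current_state(TASK_RUNNING);
				return 0;
			}
			if (!have_work())
				schedule();	/* sleep until woken */
			else
				__set_current_state(TASK_RUNNING);
			/* ... handle work while TASK_RUNNING ... */
		}
	}

The wakeup-safe ordering matters too: the state is set before the stop
check, so a kthread_stop() arriving between the two either is seen by
the check or wakes the subsequent schedule().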
Signed-off-by: Tejun Heo
Cc: Xiaotian Feng
Signed-off-by: Shuah Khan
Signed-off-by: Greg Kroah-Hartman
---
 kernel/workqueue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 472cfe38520..dc8438d2aa6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1145,8 +1145,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 		unsigned int lcpu;
 
-		BUG_ON(timer_pending(timer));
-		BUG_ON(!list_empty(&work->entry));
+		WARN_ON_ONCE(timer_pending(timer));
+		WARN_ON_ONCE(!list_empty(&work->entry));
 
 		timer_stats_timer_set_start_info(&dwork->timer);
-- 
cgit v1.2.3
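The abuse these checks catch is a bare work_struct cast to
delayed_work, which hands queue_delayed_work() an uninitialized
embedded timer.  The safe pattern is to embed a full delayed_work.  A
minimal sketch, assuming hypothetical my_dev/my_dwork_fn names (the
workqueue API calls are real):

	#include <linux/workqueue.h>
	#include <linux/jiffies.h>

	struct my_dev {
		struct delayed_work dwork;	/* full delayed_work, embeds the timer */
	};

	static void my_dwork_fn(struct work_struct *work)
	{
		struct my_dev *dev = container_of(to_delayed_work(work),
						  struct my_dev, dwork);
		/* ... do the deferred work ... */
	}

	static void my_dev_start(struct my_dev *dev)
	{
		INIT_DELAYED_WORK(&dev->dwork, my_dwork_fn);
		/* a 0 delay is fine too; it degenerates to immediate queueing */
		queue_delayed_work(system_wq, &dev->dwork,
				   msecs_to_jiffies(100));
	}

INIT_DELAYED_WORK() initializes both the work_struct and the embedded
timer, which is precisely what the casting trick skipped.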