diff options
| author | Andy McFadden <fadden@android.com> | 2009-06-17 16:29:30 -0700 |
|---|---|---|
| committer | Andy McFadden <fadden@android.com> | 2009-07-01 15:00:28 -0700 |
| commit | d5a9659f74fc3ac23cecf1db70886b919bf98ff9 (patch) | |
| tree | 766f43e3db733a3e9a48a1195a94548eaab97c51 /vm/Thread.c | |
| parent | fea44bae077d2d5b4d5197132b2556fd3882d241 (diff) | |
| download | android_dalvik-d5a9659f74fc3ac23cecf1db70886b919bf98ff9.tar.gz android_dalvik-d5a9659f74fc3ac23cecf1db70886b919bf98ff9.tar.bz2 android_dalvik-d5a9659f74fc3ac23cecf1db70886b919bf98ff9.zip | |
Reduce VM aborts during high CPU stress.
(This was cherry-picked from master 2aa43610 for internal bug 1952616.)
The VM has some timeouts that are meant to kill the current process if
something gets stuck (e.g. a thread grabs a lock and then manages to die
while the rest of the process continues on). These were tripping a
little too easily during some high-load situations.
This changes the order of operations so that we now unlock the "thread
suspend" lock before sending a wakeup broadcast to the condition variable
that threads sleep on. This should make it less likely for a thread to
be running for an extended period while the lock is held. (Relates to
internal bug 1664687.)
This also wraps a couple of things (pthread_create, dlopen) with a state
change to VMWAIT. During high load situations these can take a while to
complete, and we would (with the K-Means Visualizer load generator
running) very occasionally time out.
Augmented the debug output in a couple of minor ways. Updated comments.
Diffstat (limited to 'vm/Thread.c')
| -rw-r--r-- | vm/Thread.c | 78 |
1 files changed, 67 insertions, 11 deletions
diff --git a/vm/Thread.c b/vm/Thread.c index 7116dfd76..ee195eb6c 100644 --- a/vm/Thread.c +++ b/vm/Thread.c @@ -516,9 +516,15 @@ static void lockThreadSuspend(const char* who, SuspendCause why) if (cc != 0) { if (!dvmCheckSuspendPending(NULL)) { /* - * Could be we hit the window as the suspend or resume - * was started (i.e. the lock has been grabbed but the - * other thread hasn't yet set our "please suspend" flag). + * Could be that a resume-all is in progress, and something + * grabbed the CPU when the wakeup was broadcast. The thread + * performing the resume hasn't had a chance to release the + * thread suspend lock. (Should no longer be an issue -- + * we now release before broadcast.) + * + * Could be we hit the window as a suspend was started, + * and the lock has been grabbed but the suspend counts + * haven't been incremented yet. * * Could be an unusual JNI thread-attach thing. * @@ -540,6 +546,7 @@ static void lockThreadSuspend(const char* who, SuspendCause why) LOGE("threadid=%d: couldn't get thread-suspend lock (%s:%s)," " bailing\n", dvmThreadSelf()->threadId, who, getSuspendCauseStr(why)); + /* threads are not suspended, thread dump could crash */ dvmDumpAllThreads(false); dvmAbort(); } @@ -1244,9 +1251,13 @@ bool dvmCreateInterpThread(Object* threadObj, int reqStackSize) */ dvmUnlockThreadList(); - if (pthread_create(&threadHandle, &threadAttr, interpThreadStart, - newThread) != 0) - { + int cc, oldStatus; + oldStatus = dvmChangeStatus(self, THREAD_VMWAIT); + cc = pthread_create(&threadHandle, &threadAttr, interpThreadStart, + newThread); + oldStatus = dvmChangeStatus(self, oldStatus); + + if (cc != 0) { /* * Failure generally indicates that we have exceeded system * resource limits. VirtualMachineError is probably too severe, @@ -1291,7 +1302,8 @@ bool dvmCreateInterpThread(Object* threadObj, int reqStackSize) * * The easiest way to deal with this is to prevent the new thread from * running until the parent says it's okay. This results in the - * following sequence of events for a "badly timed" GC: + * following (correct) sequence of events for a "badly timed" GC + * (where '-' is us, 'o' is the child, and '+' is some other thread): * * - call pthread_create() * - lock thread list @@ -2335,6 +2347,7 @@ static void waitForThreadSuspend(Thread* self, Thread* thread) { const int kMaxRetries = 10; const int kSpinSleepTime = 750*1000; /* 0.75s */ + bool complained = false; int sleepIter = 0; int retryCount = 0; @@ -2349,6 +2362,7 @@ static void waitForThreadSuspend(Thread* self, Thread* thread) self->threadId, (int)self->handle, thread->threadId, (int)thread->handle); dumpWedgedThread(thread); + complained = true; // keep going; could be slow due to valgrind sleepIter = 0; @@ -2361,6 +2375,11 @@ static void waitForThreadSuspend(Thread* self, Thread* thread) } } } + + if (complained) { + LOGW("threadid=%d: spin on suspend resolved\n", self->threadId); + //dvmDumpThread(thread, false); /* suspended, so dump is safe */ + } } /* @@ -2497,7 +2516,9 @@ void dvmResumeAllThreads(SuspendCause why) /* debugger events don't suspend JDWP thread */ if ((why == SUSPEND_FOR_DEBUG || why == SUSPEND_FOR_DEBUG_EVENT) && thread->handle == dvmJdwpGetDebugThread(gDvm.jdwpState)) + { continue; + } if (thread->suspendCount > 0) { thread->suspendCount--; @@ -2512,6 +2533,43 @@ void dvmResumeAllThreads(SuspendCause why) dvmUnlockThreadList(); /* + * In some ways it makes sense to continue to hold the thread-suspend + * lock while we issue the wakeup broadcast. It allows us to complete + * one operation before moving on to the next, which simplifies the + * thread activity debug traces. + * + * This approach caused us some difficulty under Linux, because the + * condition variable broadcast not only made the threads runnable, + * but actually caused them to execute, and it was a while before + * the thread performing the wakeup had an opportunity to release the + * thread-suspend lock. + * + * This is a problem because, when a thread tries to acquire that + * lock, it times out after 3 seconds. If at some point the thread + * is told to suspend, the clock resets; but since the VM is still + * theoretically mid-resume, there's no suspend pending. If, for + * example, the GC was waking threads up while the SIGQUIT handler + * was trying to acquire the lock, we would occasionally time out on + * a busy system and SignalCatcher would abort. + * + * We now perform the unlock before the wakeup broadcast. The next + * suspend can't actually start until the broadcast completes and + * returns, because we're holding the thread-suspend-count lock, but the + * suspending thread is now able to make progress and we avoid the abort. + * + * (Technically there is a narrow window between when we release + * the thread-suspend lock and grab the thread-suspend-count lock. + * This could cause us to send a broadcast to threads with nonzero + * suspend counts, but this is expected and they'll all just fall + * right back to sleep. It's probably safe to grab the suspend-count + * lock before releasing thread-suspend, since we're still following + * the correct order of acquisition, but it feels weird.) + */ + + LOG_THREAD("threadid=%d: ResumeAll waking others\n", self->threadId); + unlockThreadSuspend(); + + /* * Broadcast a notification to all suspended threads, some or all of * which may choose to wake up. No need to wait for them. */ @@ -2520,8 +2578,6 @@ void dvmResumeAllThreads(SuspendCause why) assert(cc == 0); unlockThreadSuspendCount(); - unlockThreadSuspend(); - LOG_THREAD("threadid=%d: ResumeAll complete\n", self->threadId); } @@ -2989,9 +3045,9 @@ void dvmDumpThreadEx(const DebugOutputTarget* target, Thread* thread, threadName, isDaemon ? " daemon" : "", priority, thread->threadId, kStatusNames[thread->status]); dvmPrintDebugMessage(target, - " | group=\"%s\" sCount=%d dsCount=%d s=%c obj=%p\n", + " | group=\"%s\" sCount=%d dsCount=%d s=%c obj=%p self=%p\n", groupName, thread->suspendCount, thread->dbgSuspendCount, - thread->isSuspended ? 'Y' : 'N', thread->threadObj); + thread->isSuspended ? 'Y' : 'N', thread->threadObj, thread); dvmPrintDebugMessage(target, " | sysTid=%d nice=%d sched=%d/%d handle=%d\n", thread->systemTid, getpriority(PRIO_PROCESS, thread->systemTid), |
