Diffstat (limited to 'vm/compiler/codegen/x86/CodegenInterface.cpp')
| -rw-r--r-- | vm/compiler/codegen/x86/CodegenInterface.cpp | 1532 |
1 file changed, 1532 insertions, 0 deletions
diff --git a/vm/compiler/codegen/x86/CodegenInterface.cpp b/vm/compiler/codegen/x86/CodegenInterface.cpp new file mode 100644 index 000000000..aade180e0 --- /dev/null +++ b/vm/compiler/codegen/x86/CodegenInterface.cpp @@ -0,0 +1,1532 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <sys/mman.h> +#include "Dalvik.h" +#include "libdex/DexOpcodes.h" +#include "compiler/Compiler.h" +#include "compiler/CompilerIR.h" +#include "interp/Jit.h" +#include "libdex/DexFile.h" +#include "Lower.h" +#include "NcgAot.h" +#include "compiler/codegen/CompilerCodegen.h" + +/* Init values when a predicted chain is initially assembled */ +/* E7FE is branch to self */ +#define PREDICTED_CHAIN_BX_PAIR_INIT 0xe7fe + +/* Target-specific save/restore */ +extern "C" void dvmJitCalleeSave(double *saveArea); +extern "C" void dvmJitCalleeRestore(double *saveArea); + +/* + * Determine the initial instruction set to be used for this trace. + * Later components may decide to change this. + */ +//JitInstructionSetType dvmCompilerInstructionSet(CompilationUnit *cUnit) +JitInstructionSetType dvmCompilerInstructionSet(void) +{ + return DALVIK_JIT_IA32; +} + +JitInstructionSetType dvmCompilerGetInterpretTemplateSet() +{ + return DALVIK_JIT_IA32; +} + +/* we don't use template for IA32 */ +void *dvmCompilerGetInterpretTemplate() +{ + return NULL; +} + +/* Track the number of times that the code cache is patched */ +#if defined(WITH_JIT_TUNING) +#define UPDATE_CODE_CACHE_PATCHES() (gDvmJit.codeCachePatches++) +#else +#define UPDATE_CODE_CACHE_PATCHES() +#endif + +bool dvmCompilerArchInit() { + /* Target-specific configuration */ + gDvmJit.jitTableSize = 1 << 12; + gDvmJit.jitTableMask = gDvmJit.jitTableSize - 1; + gDvmJit.threshold = 255; + gDvmJit.codeCacheSize = 512*1024; + gDvmJit.optLevel = kJitOptLevelO1; + +#if defined(WITH_SELF_VERIFICATION) + /* Force into blocking mode */ + gDvmJit.blockingMode = true; + gDvm.nativeDebuggerActive = true; +#endif + + // Make sure all threads have current values + dvmJitUpdateThreadStateAll(); + + return true; +} + +void dvmCompilerPatchInlineCache(void) +{ + int i; + PredictedChainingCell *minAddr, *maxAddr; + + /* Nothing to be done */ + if (gDvmJit.compilerICPatchIndex == 0) return; + + /* + * Since all threads are already stopped we don't really need to acquire + * the lock. But race condition can be easily introduced in the future w/o + * paying attention so we still acquire the lock here. 
+ */ + dvmLockMutex(&gDvmJit.compilerICPatchLock); + + UNPROTECT_CODE_CACHE(gDvmJit.codeCache, gDvmJit.codeCacheByteUsed); + + //ALOGD("Number of IC patch work orders: %d", gDvmJit.compilerICPatchIndex); + + /* Initialize the min/max address range */ + minAddr = (PredictedChainingCell *) + ((char *) gDvmJit.codeCache + gDvmJit.codeCacheSize); + maxAddr = (PredictedChainingCell *) gDvmJit.codeCache; + + for (i = 0; i < gDvmJit.compilerICPatchIndex; i++) { + ICPatchWorkOrder *workOrder = &gDvmJit.compilerICPatchQueue[i]; + PredictedChainingCell *cellAddr = workOrder->cellAddr; + PredictedChainingCell *cellContent = &workOrder->cellContent; + ClassObject *clazz = dvmFindClassNoInit(workOrder->classDescriptor, + workOrder->classLoader); + + assert(clazz->serialNumber == workOrder->serialNumber); + + /* Use the newly resolved clazz pointer */ + cellContent->clazz = clazz; + + if (cellAddr->clazz == NULL) { + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: predicted chain %p to %s (%s) initialized", + cellAddr, + cellContent->clazz->descriptor, + cellContent->method->name)); + } else { + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: predicted chain %p from %s to %s (%s) " + "patched", + cellAddr, + cellAddr->clazz->descriptor, + cellContent->clazz->descriptor, + cellContent->method->name)); + } + + /* Patch the chaining cell */ + *cellAddr = *cellContent; + minAddr = (cellAddr < minAddr) ? cellAddr : minAddr; + maxAddr = (cellAddr > maxAddr) ? cellAddr : maxAddr; + } + + PROTECT_CODE_CACHE(gDvmJit.codeCache, gDvmJit.codeCacheByteUsed); + + gDvmJit.compilerICPatchIndex = 0; + dvmUnlockMutex(&gDvmJit.compilerICPatchLock); +} + +/* Target-specific cache clearing */ +void dvmCompilerCacheClear(char *start, size_t size) +{ + /* "0xFF 0xFF" is an invalid opcode for x86. */ + memset(start, 0xFF, size); +} + +/* for JIT debugging, to be implemented */ +void dvmJitCalleeSave(double *saveArea) { +} + +void dvmJitCalleeRestore(double *saveArea) { +} + +void dvmJitToInterpSingleStep() { +} + +JitTraceDescription *dvmCopyTraceDescriptor(const u2 *pc, + const JitEntry *knownEntry) { + return NULL; +} + +void dvmCompilerCodegenDump(CompilationUnit *cUnit) //in ArchUtility.c +{ +} + +void dvmCompilerArchDump(void) +{ +} + +char *getTraceBase(const JitEntry *p) +{ + return NULL; +} + +void dvmCompilerAssembleLIR(CompilationUnit *cUnit, JitTranslationInfo* info) +{ +} + +void dvmJitInstallClassObjectPointers(CompilationUnit *cUnit, char *codeAddress) +{ +} + +void dvmCompilerMethodMIR2LIR(CompilationUnit *cUnit) +{ + ALOGE("Method-based JIT not supported for the x86 target"); + dvmAbort(); +} + +void dvmJitScanAllClassPointers(void (*callback)(void *)) +{ +} + +/* Handy function to retrieve the profile count */ +static inline int getProfileCount(const JitEntry *entry) +{ + if (entry->dPC == 0 || entry->codeAddress == 0) + return 0; + u4 *pExecutionCount = (u4 *) getTraceBase(entry); + + return pExecutionCount ? *pExecutionCount : 0; +} + +/* qsort callback function */ +static int sortTraceProfileCount(const void *entry1, const void *entry2) +{ + const JitEntry *jitEntry1 = (const JitEntry *)entry1; + const JitEntry *jitEntry2 = (const JitEntry *)entry2; + + JitTraceCounter_t count1 = getProfileCount(jitEntry1); + JitTraceCounter_t count2 = getProfileCount(jitEntry2); + return (count1 == count2) ? 0 : ((count1 > count2) ? 
-1 : 1); +} + +/* Sort the trace profile counts and dump them */ +void dvmCompilerSortAndPrintTraceProfiles() //in Assemble.c +{ + JitEntry *sortedEntries; + int numTraces = 0; + unsigned long counts = 0; + unsigned int i; + + /* Make sure that the table is not changing */ + dvmLockMutex(&gDvmJit.tableLock); + + /* Sort the entries by descending order */ + sortedEntries = (JitEntry *)malloc(sizeof(JitEntry) * gDvmJit.jitTableSize); + if (sortedEntries == NULL) + goto done; + memcpy(sortedEntries, gDvmJit.pJitEntryTable, + sizeof(JitEntry) * gDvmJit.jitTableSize); + qsort(sortedEntries, gDvmJit.jitTableSize, sizeof(JitEntry), + sortTraceProfileCount); + + /* Dump the sorted entries */ + for (i=0; i < gDvmJit.jitTableSize; i++) { + if (sortedEntries[i].dPC != 0) { + numTraces++; + } + } + if (numTraces == 0) + numTraces = 1; + ALOGI("JIT: Average execution count -> %d",(int)(counts / numTraces)); + + free(sortedEntries); +done: + dvmUnlockMutex(&gDvmJit.tableLock); + return; +} + +void jumpWithRelOffset(char* instAddr, int relOffset) { + stream = instAddr; + OpndSize immSize = estOpndSizeFromImm(relOffset); + relOffset -= getJmpCallInstSize(immSize, JmpCall_uncond); + dump_imm(Mnemonic_JMP, immSize, relOffset); +} + +// works whether instructions for target basic block are generated or not +LowOp* jumpToBasicBlock(char* instAddr, int targetId) { + stream = instAddr; + bool unknown; + OpndSize size; + int relativeNCG = targetId; + relativeNCG = getRelativeNCG(targetId, JmpCall_uncond, &unknown, &size); + unconditional_jump_int(relativeNCG, size); + return NULL; +} + +LowOp* condJumpToBasicBlock(char* instAddr, ConditionCode cc, int targetId) { + stream = instAddr; + bool unknown; + OpndSize size; + int relativeNCG = targetId; + relativeNCG = getRelativeNCG(targetId, JmpCall_cond, &unknown, &size); + conditional_jump_int(cc, relativeNCG, size); + return NULL; +} + +/* + * Attempt to enqueue a work order to patch an inline cache for a predicted + * chaining cell for virtual/interface calls. + */ +static bool inlineCachePatchEnqueue(PredictedChainingCell *cellAddr, + PredictedChainingCell *newContent) +{ + bool result = true; + + /* + * Make sure only one thread gets here since updating the cell (ie fast + * path and queueing the request (ie the queued path) have to be done + * in an atomic fashion. + */ + dvmLockMutex(&gDvmJit.compilerICPatchLock); + + /* Fast path for uninitialized chaining cell */ + if (cellAddr->clazz == NULL && + cellAddr->branch == PREDICTED_CHAIN_BX_PAIR_INIT) { + UNPROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr)); + + cellAddr->method = newContent->method; + cellAddr->branch = newContent->branch; + cellAddr->branch2 = newContent->branch2; + + /* + * The update order matters - make sure clazz is updated last since it + * will bring the uninitialized chaining cell to life. 
+ */ + android_atomic_release_store((int32_t)newContent->clazz, + (volatile int32_t *)(void*) &cellAddr->clazz); + //cacheflush((intptr_t) cellAddr, (intptr_t) (cellAddr+1), 0); + UPDATE_CODE_CACHE_PATCHES(); + + PROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr)); + +#if 0 + MEM_BARRIER(); + cellAddr->clazz = newContent->clazz; + //cacheflush((intptr_t) cellAddr, (intptr_t) (cellAddr+1), 0); +#endif +#if defined(IA_JIT_TUNING) + gDvmJit.icPatchInit++; +#endif + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: FAST predicted chain %p to method %s%s %p", + cellAddr, newContent->clazz->descriptor, newContent->method->name, newContent->method)); + /* Check if this is a frequently missed clazz */ + } else if (cellAddr->stagedClazz != newContent->clazz) { + /* Not proven to be frequent yet - build up the filter cache */ + UNPROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr)); + + cellAddr->stagedClazz = newContent->clazz; + + UPDATE_CODE_CACHE_PATCHES(); + PROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr)); + +#if defined(WITH_JIT_TUNING) + gDvmJit.icPatchRejected++; +#endif + /* + * Different classes but same method implementation - it is safe to just + * patch the class value without the need to stop the world. + */ + } else if (cellAddr->method == newContent->method) { + UNPROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr)); + + cellAddr->clazz = newContent->clazz; + /* No need to flush the cache here since the branch is not patched */ + UPDATE_CODE_CACHE_PATCHES(); + + PROTECT_CODE_CACHE(cellAddr, sizeof(*cellAddr)); + +#if defined(WITH_JIT_TUNING) + gDvmJit.icPatchLockFree++; +#endif + /* + * Cannot patch the chaining cell inline - queue it until the next safe + * point. + */ + } else if (gDvmJit.compilerICPatchIndex < COMPILER_IC_PATCH_QUEUE_SIZE) { + int index = gDvmJit.compilerICPatchIndex++; + const ClassObject *clazz = newContent->clazz; + + gDvmJit.compilerICPatchQueue[index].cellAddr = cellAddr; + gDvmJit.compilerICPatchQueue[index].cellContent = *newContent; + gDvmJit.compilerICPatchQueue[index].classDescriptor = clazz->descriptor; + gDvmJit.compilerICPatchQueue[index].classLoader = clazz->classLoader; + /* For verification purpose only */ + gDvmJit.compilerICPatchQueue[index].serialNumber = clazz->serialNumber; + +#if defined(WITH_JIT_TUNING) + gDvmJit.icPatchQueued++; +#endif + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: QUEUE predicted chain %p to method %s%s", + cellAddr, newContent->clazz->descriptor, newContent->method->name)); + } else { + /* Queue is full - just drop this patch request */ +#if defined(WITH_JIT_TUNING) + gDvmJit.icPatchDropped++; +#endif + + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: DROP predicted chain %p to method %s%s", + cellAddr, newContent->clazz->descriptor, newContent->method->name)); + } + + dvmUnlockMutex(&gDvmJit.compilerICPatchLock); + return result; +} + +/* + * This method is called from the invoke templates for virtual and interface + * methods to speculatively setup a chain to the callee. The templates are + * written in assembly and have setup method, cell, and clazz at r0, r2, and + * r3 respectively, so there is a unused argument in the list. Upon return one + * of the following three results may happen: + * 1) Chain is not setup because the callee is native. Reset the rechain + * count to a big number so that it will take a long time before the next + * rechain attempt to happen. + * 2) Chain is not setup because the callee has not been created yet. Reset + * the rechain count to a small number and retry in the near future. 
+ * 3) Ask all other threads to stop before patching this chaining cell. + * This is required because another thread may have passed the class check + * but hasn't reached the chaining cell yet to follow the chain. If we + * patch the content before halting the other thread, there could be a + * small window for race conditions to happen that it may follow the new + * but wrong chain to invoke a different method. + */ +const Method *dvmJitToPatchPredictedChain(const Method *method, + Thread *self, + PredictedChainingCell *cell, + const ClassObject *clazz) +{ + int newRechainCount = PREDICTED_CHAIN_COUNTER_RECHAIN; + /* Don't come back here for a long time if the method is native */ + if (dvmIsNativeMethod(method)) { + UNPROTECT_CODE_CACHE(cell, sizeof(*cell)); + + /* + * Put a non-zero/bogus value in the clazz field so that it won't + * trigger immediate patching and will continue to fail to match with + * a real clazz pointer. + */ + cell->clazz = (ClassObject *) PREDICTED_CHAIN_FAKE_CLAZZ; + + UPDATE_CODE_CACHE_PATCHES(); + PROTECT_CODE_CACHE(cell, sizeof(*cell)); + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: predicted chain %p to native method %s ignored", + cell, method->name)); + goto done; + } + { + int tgtAddr = (int) dvmJitGetTraceAddr(method->insns); + + /* + * Compilation not made yet for the callee. Reset the counter to a small + * value and come back to check soon. + */ + if ((tgtAddr == 0) || + ((void*)tgtAddr == dvmCompilerGetInterpretTemplate())) { + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: predicted chain %p to method %s%s delayed", + cell, method->clazz->descriptor, method->name)); + goto done; + } + + PredictedChainingCell newCell; + + if (cell->clazz == NULL) { + newRechainCount = self->icRechainCount; + } + + int relOffset = (int) tgtAddr - (int)cell; + OpndSize immSize = estOpndSizeFromImm(relOffset); + int jumpSize = getJmpCallInstSize(immSize, JmpCall_uncond); + relOffset -= jumpSize; + COMPILER_TRACE_CHAINING( + ALOGI("inlineCachePatchEnqueue chain %p to method %s%s inst size %d", + cell, method->clazz->descriptor, method->name, jumpSize)); + //can't use stream here since it is used by the compilation thread + dump_imm_with_codeaddr(Mnemonic_JMP, immSize, relOffset, (char*) (&newCell)); //update newCell.branch + + newCell.clazz = clazz; + newCell.method = method; + + /* + * Enter the work order to the queue and the chaining cell will be patched + * the next time a safe point is entered. + * + * If the enqueuing fails reset the rechain count to a normal value so that + * it won't get indefinitely delayed. + */ + inlineCachePatchEnqueue(cell, &newCell); + } +done: + self->icRechainCount = newRechainCount; + return method; +} + +/* + * Unchain a trace given the starting address of the translation + * in the code cache. Refer to the diagram in dvmCompilerAssembleLIR. + * For ARM, it returns the address following the last cell unchained. + * For IA, it returns NULL since cacheflush is not required for IA. 
+ */ +u4* dvmJitUnchain(void* codeAddr) +{ + /* codeAddr is 4-byte aligned, so is chain cell count offset */ + u2* pChainCellCountOffset = (u2*)((char*)codeAddr - 4); + u2 chainCellCountOffset = *pChainCellCountOffset; + /* chain cell counts information is 4-byte aligned */ + ChainCellCounts *pChainCellCounts = + (ChainCellCounts*)((char*)codeAddr + chainCellCountOffset); + u2* pChainCellOffset = (u2*)((char*)codeAddr - 2); + u2 chainCellOffset = *pChainCellOffset; + u1* pChainCells; + int i,j; + PredictedChainingCell *predChainCell; + int padding; + + /* Locate the beginning of the chain cell region */ + pChainCells = (u1 *)((char*)codeAddr + chainCellOffset); + + /* The cells are sorted in order - walk through them and reset */ + for (i = 0; i < kChainingCellGap; i++) { + /* for hot, normal, singleton chaining: + nop //padding. + jmp 0 + mov imm32, reg1 + mov imm32, reg2 + call reg2 + after chaining: + nop + jmp imm + mov imm32, reg1 + mov imm32, reg2 + call reg2 + after unchaining: + nop + jmp 0 + mov imm32, reg1 + mov imm32, reg2 + call reg2 + Space occupied by the chaining cell in bytes: nop is for padding, + jump 0, the target 0 is 4 bytes aligned. + Space for predicted chaining: 5 words = 20 bytes + */ + int elemSize = 0; + if (i == kChainingCellInvokePredicted) { + elemSize = 20; + } + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: unchaining type %d count %d", i, pChainCellCounts->u.count[i])); + + for (j = 0; j < pChainCellCounts->u.count[i]; j++) { + switch(i) { + case kChainingCellNormal: + case kChainingCellHot: + case kChainingCellInvokeSingleton: + case kChainingCellBackwardBranch: + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: unchaining of normal, hot, or singleton")); + pChainCells = (u1*) (((uint)pChainCells + 4)&(~0x03)); + elemSize = 4+5+5+2; + memset(pChainCells, 0, 4); + break; + case kChainingCellInvokePredicted: + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: unchaining of predicted")); + /* 4-byte aligned */ + padding = (4 - ((u4)pChainCells & 3)) & 3; + pChainCells += padding; + predChainCell = (PredictedChainingCell *) pChainCells; + /* + * There could be a race on another mutator thread to use + * this particular predicted cell and the check has passed + * the clazz comparison. So we cannot safely wipe the + * method and branch but it is safe to clear the clazz, + * which serves as the key. + */ + predChainCell->clazz = PREDICTED_CHAIN_CLAZZ_INIT; + break; + default: + ALOGE("Unexpected chaining type: %d", i); + dvmAbort(); // dvmAbort OK here - can't safely recover + } + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: unchaining 0x%x", (int)pChainCells)); + pChainCells += elemSize; /* Advance by a fixed number of bytes */ + } + } + return NULL; +} + +/* Unchain all translation in the cache. 
*/ +void dvmJitUnchainAll() +{ + ALOGV("Jit Runtime: unchaining all"); + if (gDvmJit.pJitEntryTable != NULL) { + COMPILER_TRACE_CHAINING(ALOGI("Jit Runtime: unchaining all")); + dvmLockMutex(&gDvmJit.tableLock); + + UNPROTECT_CODE_CACHE(gDvmJit.codeCache, gDvmJit.codeCacheByteUsed); + + for (size_t i = 0; i < gDvmJit.jitTableSize; i++) { + if (gDvmJit.pJitEntryTable[i].dPC && + !gDvmJit.pJitEntryTable[i].u.info.isMethodEntry && + gDvmJit.pJitEntryTable[i].codeAddress) { + dvmJitUnchain(gDvmJit.pJitEntryTable[i].codeAddress); + } + } + + PROTECT_CODE_CACHE(gDvmJit.codeCache, gDvmJit.codeCacheByteUsed); + + dvmUnlockMutex(&gDvmJit.tableLock); + gDvmJit.translationChains = 0; + } + gDvmJit.hasNewChain = false; +} + +#define P_GPR_1 PhysicalReg_EBX +/* Add an additional jump instruction, keep jump target 4 bytes aligned.*/ +static void insertJumpHelp() +{ + int rem = (uint)stream % 4; + int nop_size = 3 - rem; + dump_nop(nop_size); + unconditional_jump_int(0, OpndSize_32); + return; +} + +/* Chaining cell for code that may need warmup. */ +/* ARM assembly: ldr r0, [r6, #76] (why a single instruction to access member of glue structure?) + blx r0 + data 0xb23a //bytecode address: 0x5115b23a + data 0x5115 + IA32 assembly: + jmp 0 //5 bytes + movl address, %ebx + movl dvmJitToInterpNormal, %eax + call %eax + <-- return address +*/ +static void handleNormalChainingCell(CompilationUnit *cUnit, + unsigned int offset, int blockId, LowOpBlockLabel* labelList) +{ + ALOGV("in handleNormalChainingCell for method %s block %d BC offset %x NCG offset %x", + cUnit->method->name, blockId, offset, stream - streamMethodStart); + if(dump_x86_inst) + ALOGI("LOWER NormalChainingCell at offsetPC %x offsetNCG %x @%p", + offset, stream - streamMethodStart, stream); + /* Add one additional "jump 0" instruction, it may be modified during jit chaining. This helps + * reslove the multithreading issue. + */ + insertJumpHelp(); + move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true); + scratchRegs[0] = PhysicalReg_EAX; + call_dvmJitToInterpNormal(); + //move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true); /* used when unchaining */ +} + +/* + * Chaining cell for instructions that immediately following already translated + * code. + */ +static void handleHotChainingCell(CompilationUnit *cUnit, + unsigned int offset, int blockId, LowOpBlockLabel* labelList) +{ + ALOGV("in handleHotChainingCell for method %s block %d BC offset %x NCG offset %x", + cUnit->method->name, blockId, offset, stream - streamMethodStart); + if(dump_x86_inst) + ALOGI("LOWER HotChainingCell at offsetPC %x offsetNCG %x @%p", + offset, stream - streamMethodStart, stream); + /* Add one additional "jump 0" instruction, it may be modified during jit chaining. This helps + * reslove the multithreading issue. 
+ */ + insertJumpHelp(); + move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true); + scratchRegs[0] = PhysicalReg_EAX; + call_dvmJitToInterpTraceSelect(); + //move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true); /* used when unchaining */ +} + +/* Chaining cell for branches that branch back into the same basic block */ +static void handleBackwardBranchChainingCell(CompilationUnit *cUnit, + unsigned int offset, int blockId, LowOpBlockLabel* labelList) +{ + ALOGV("in handleBackwardBranchChainingCell for method %s block %d BC offset %x NCG offset %x", + cUnit->method->name, blockId, offset, stream - streamMethodStart); + if(dump_x86_inst) + ALOGI("LOWER BackwardBranchChainingCell at offsetPC %x offsetNCG %x @%p", + offset, stream - streamMethodStart, stream); + /* Add one additional "jump 0" instruction, it may be modified during jit chaining. This helps + * reslove the multithreading issue. + */ + insertJumpHelp(); + move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true); + scratchRegs[0] = PhysicalReg_EAX; + call_dvmJitToInterpNormal(); + //move_imm_to_reg(OpndSize_32, (int) (cUnit->method->insns + offset), P_GPR_1, true); /* used when unchaining */ +} + +/* Chaining cell for monomorphic method invocations. */ +static void handleInvokeSingletonChainingCell(CompilationUnit *cUnit, + const Method *callee, int blockId, LowOpBlockLabel* labelList) +{ + ALOGV("in handleInvokeSingletonChainingCell for method %s block %d callee %s NCG offset %x", + cUnit->method->name, blockId, callee->name, stream - streamMethodStart); + if(dump_x86_inst) + ALOGI("LOWER InvokeSingletonChainingCell at block %d offsetNCG %x @%p", + blockId, stream - streamMethodStart, stream); + /* Add one additional "jump 0" instruction, it may be modified during jit chaining. This helps + * reslove the multithreading issue. + */ + insertJumpHelp(); + move_imm_to_reg(OpndSize_32, (int) (callee->insns), P_GPR_1, true); + scratchRegs[0] = PhysicalReg_EAX; + call_dvmJitToInterpTraceSelect(); + //move_imm_to_reg(OpndSize_32, (int) (callee->insns), P_GPR_1, true); /* used when unchaining */ +} +#undef P_GPR_1 + +/* Chaining cell for monomorphic method invocations. */ +static void handleInvokePredictedChainingCell(CompilationUnit *cUnit, int blockId) +{ + if(dump_x86_inst) + ALOGI("LOWER InvokePredictedChainingCell at block %d offsetNCG %x @%p", + blockId, stream - streamMethodStart, stream); +#ifndef PREDICTED_CHAINING + //assume rPC for callee->insns in %ebx + scratchRegs[0] = PhysicalReg_EAX; + call_dvmJitToInterpTraceSelectNoChain(); +#else + /* make sure section for predicited chaining cell is 4-byte aligned */ + //int padding = (4 - ((u4)stream & 3)) & 3; + //stream += padding; + int* streamData = (int*)stream; + /* Should not be executed in the initial state */ + streamData[0] = PREDICTED_CHAIN_BX_PAIR_INIT; + streamData[1] = 0; + /* To be filled: class */ + streamData[2] = PREDICTED_CHAIN_CLAZZ_INIT; + /* To be filled: method */ + streamData[3] = PREDICTED_CHAIN_METHOD_INIT; + /* + * Rechain count. The initial value of 0 here will trigger chaining upon + * the first invocation of this callsite. 
+ */ + streamData[4] = PREDICTED_CHAIN_COUNTER_INIT; +#if 0 + ALOGI("--- DATA @ %p: %x %x %x %x", stream, *((int*)stream), *((int*)(stream+4)), + *((int*)(stream+8)), *((int*)(stream+12))); +#endif + stream += 20; //5 *4 +#endif +} + +/* Load the Dalvik PC into r0 and jump to the specified target */ +static void handlePCReconstruction(CompilationUnit *cUnit, + LowOpBlockLabel *targetLabel) +{ +#if 0 + LowOp **pcrLabel = + (LowOp **) cUnit->pcReconstructionList.elemList; + int numElems = cUnit->pcReconstructionList.numUsed; + int i; + for (i = 0; i < numElems; i++) { + dvmCompilerAppendLIR(cUnit, (LIR *) pcrLabel[i]); + /* r0 = dalvik PC */ + loadConstant(cUnit, r0, pcrLabel[i]->operands[0]); + genUnconditionalBranch(cUnit, targetLabel); + } +#endif +} + +//use O0 code generator for hoisted checks outside of the loop +/* + * vA = arrayReg; + * vB = idxReg; + * vC = endConditionReg; + * arg[0] = maxC + * arg[1] = minC + * arg[2] = loopBranchConditionCode + */ +#define P_GPR_1 PhysicalReg_EBX +#define P_GPR_2 PhysicalReg_ECX +static void genHoistedChecksForCountUpLoop(CompilationUnit *cUnit, MIR *mir) +{ + /* + * NOTE: these synthesized blocks don't have ssa names assigned + * for Dalvik registers. However, because they dominate the following + * blocks we can simply use the Dalvik name w/ subscript 0 as the + * ssa name. + */ + DecodedInstruction *dInsn = &mir->dalvikInsn; + const int maxC = dInsn->arg[0]; + + /* assign array in virtual register to P_GPR_1 */ + get_virtual_reg(mir->dalvikInsn.vA, OpndSize_32, P_GPR_1, true); + /* assign index in virtual register to P_GPR_2 */ + get_virtual_reg(mir->dalvikInsn.vC, OpndSize_32, P_GPR_2, true); + export_pc(); + compare_imm_reg(OpndSize_32, 0, P_GPR_1, true); + condJumpToBasicBlock(stream, Condition_E, cUnit->exceptionBlockId); + int delta = maxC; + /* + * If the loop end condition is ">=" instead of ">", then the largest value + * of the index is "endCondition - 1". 
+ */ + if (dInsn->arg[2] == OP_IF_GE) { + delta--; + } + + if (delta < 0) { //+delta + //if P_GPR_2 is mapped to a VR, we can't do this + alu_binary_imm_reg(OpndSize_32, sub_opc, -delta, P_GPR_2, true); + } else if(delta > 0) { + alu_binary_imm_reg(OpndSize_32, add_opc, delta, P_GPR_2, true); + } + compare_mem_reg(OpndSize_32, offArrayObject_length, P_GPR_1, true, P_GPR_2, true); + condJumpToBasicBlock(stream, Condition_NC, cUnit->exceptionBlockId); +} + +/* + * vA = arrayReg; + * vB = idxReg; + * vC = endConditionReg; + * arg[0] = maxC + * arg[1] = minC + * arg[2] = loopBranchConditionCode + */ +static void genHoistedChecksForCountDownLoop(CompilationUnit *cUnit, MIR *mir) +{ + DecodedInstruction *dInsn = &mir->dalvikInsn; + const int maxC = dInsn->arg[0]; + + /* assign array in virtual register to P_GPR_1 */ + get_virtual_reg(mir->dalvikInsn.vA, OpndSize_32, P_GPR_1, true); + /* assign index in virtual register to P_GPR_2 */ + get_virtual_reg(mir->dalvikInsn.vB, OpndSize_32, P_GPR_2, true); + export_pc(); + compare_imm_reg(OpndSize_32, 0, P_GPR_1, true); + condJumpToBasicBlock(stream, Condition_E, cUnit->exceptionBlockId); + + if (maxC < 0) { + //if P_GPR_2 is mapped to a VR, we can't do this + alu_binary_imm_reg(OpndSize_32, sub_opc, -maxC, P_GPR_2, true); + } else if(maxC > 0) { + alu_binary_imm_reg(OpndSize_32, add_opc, maxC, P_GPR_2, true); + } + compare_mem_reg(OpndSize_32, offArrayObject_length, P_GPR_1, true, P_GPR_2, true); + condJumpToBasicBlock(stream, Condition_NC, cUnit->exceptionBlockId); + +} +#undef P_GPR_1 +#undef P_GPR_2 + +/* + * vA = idxReg; + * vB = minC; + */ +#define P_GPR_1 PhysicalReg_ECX +static void genHoistedLowerBoundCheck(CompilationUnit *cUnit, MIR *mir) +{ + DecodedInstruction *dInsn = &mir->dalvikInsn; + const int minC = dInsn->vB; + get_virtual_reg(mir->dalvikInsn.vA, OpndSize_32, P_GPR_1, true); //array + export_pc(); + compare_imm_reg(OpndSize_32, -minC, P_GPR_1, true); + condJumpToBasicBlock(stream, Condition_C, cUnit->exceptionBlockId); +} +#undef P_GPR_1 + +#ifdef WITH_JIT_INLINING +static void genValidationForPredictedInline(CompilationUnit *cUnit, MIR *mir) +{ + CallsiteInfo *callsiteInfo = mir->meta.callsiteInfo; + if(gDvm.executionMode == kExecutionModeNcgO0) { + get_virtual_reg(mir->dalvikInsn.vC, OpndSize_32, PhysicalReg_EBX, true); + move_imm_to_reg(OpndSize_32, (int) callsiteInfo->clazz, PhysicalReg_ECX, true); + compare_imm_reg(OpndSize_32, 0, PhysicalReg_EBX, true); + export_pc(); //use %edx + conditional_jump_global_API(, Condition_E, "common_errNullObject", false); + move_mem_to_reg(OpndSize_32, offObject_clazz, PhysicalReg_EBX, true, PhysicalReg_EAX, true); + compare_reg_reg(PhysicalReg_ECX, true, PhysicalReg_EAX, true); + } else { + get_virtual_reg(mir->dalvikInsn.vC, OpndSize_32, 5, false); + move_imm_to_reg(OpndSize_32, (int) callsiteInfo->clazz, 4, false); + nullCheck(5, false, 1, mir->dalvikInsn.vC); + move_mem_to_reg(OpndSize_32, offObject_clazz, 5, false, 6, false); + compare_reg_reg(4, false, 6, false); + } + + //immdiate will be updated later in genLandingPadForMispredictedCallee + streamMisPred = stream; + callsiteInfo->misPredBranchOver = (LIR*)conditional_jump_int(Condition_NE, 0, OpndSize_8); +} +#endif + +/* Extended MIR instructions like PHI */ +void handleExtendedMIR(CompilationUnit *cUnit, MIR *mir) +{ + ExecutionMode origMode = gDvm.executionMode; + gDvm.executionMode = kExecutionModeNcgO0; + switch ((ExtendedMIROpcode)mir->dalvikInsn.opcode) { + case kMirOpPhi: { + break; + } + case kMirOpNullNRangeUpCheck: { + 
genHoistedChecksForCountUpLoop(cUnit, mir); + break; + } + case kMirOpNullNRangeDownCheck: { + genHoistedChecksForCountDownLoop(cUnit, mir); + break; + } + case kMirOpLowerBound: { + genHoistedLowerBoundCheck(cUnit, mir); + break; + } + case kMirOpPunt: { + break; + } +#ifdef WITH_JIT_INLINING + case kMirOpCheckInlinePrediction: { //handled in ncg_o1_data.c + genValidationForPredictedInline(cUnit, mir); + break; + } +#endif + default: + break; + } + gDvm.executionMode = origMode; +} + +static void setupLoopEntryBlock(CompilationUnit *cUnit, BasicBlock *entry, + int bodyId) +{ + /* + * Next, create two branches - one branch over to the loop body and the + * other branch to the PCR cell to punt. + */ + //LowOp* branchToBody = jumpToBasicBlock(stream, bodyId); + //setupResourceMasks(branchToBody); + //cUnit->loopAnalysis->branchToBody = ((LIR*)branchToBody); + +#if 0 + LowOp *branchToPCR = dvmCompilerNew(sizeof(ArmLIR), true); + branchToPCR->opCode = kThumbBUncond; + branchToPCR->generic.target = (LIR *) pcrLabel; + setupResourceMasks(branchToPCR); + cUnit->loopAnalysis->branchToPCR = (LIR *) branchToPCR; +#endif +} + +/* check whether we can merge the block at index i with its target block */ +bool mergeBlock(BasicBlock *bb) { + if(bb->blockType == kDalvikByteCode && + bb->firstMIRInsn != NULL && + (bb->lastMIRInsn->dalvikInsn.opcode == OP_GOTO_16 || + bb->lastMIRInsn->dalvikInsn.opcode == OP_GOTO || + bb->lastMIRInsn->dalvikInsn.opcode == OP_GOTO_32) && + bb->fallThrough == NULL) {// && + //cUnit->hasLoop) { + //ALOGI("merge blocks ending with goto at index %d", i); + MIR* prevInsn = bb->lastMIRInsn->prev; + if(bb->taken == NULL) return false; + MIR* mergeInsn = bb->taken->firstMIRInsn; + if(mergeInsn == NULL) return false; + if(prevInsn == NULL) {//the block has a single instruction + bb->firstMIRInsn = mergeInsn; + } else { + prevInsn->next = mergeInsn; //remove goto from the chain + } + mergeInsn->prev = prevInsn; + bb->lastMIRInsn = bb->taken->lastMIRInsn; + bb->taken->firstMIRInsn = NULL; //block being merged in + bb->fallThrough = bb->taken->fallThrough; + bb->taken = bb->taken->taken; + return true; + } + return false; +} + +static int genTraceProfileEntry(CompilationUnit *cUnit) +{ + cUnit->headerSize = 6; + if ((gDvmJit.profileMode == kTraceProfilingContinuous) || + (gDvmJit.profileMode == kTraceProfilingDisabled)) { + return 12; + } else { + return 4; + } + +} + +#define PRINT_BUFFER_LEN 1024 +/* Print the code block in code cache in the range of [startAddr, endAddr) + * in readable format. 
+ */ +void printEmittedCodeBlock(unsigned char *startAddr, unsigned char *endAddr) +{ + char strbuf[PRINT_BUFFER_LEN]; + unsigned char *addr; + unsigned char *next_addr; + int n; + + if (gDvmJit.printBinary) { + // print binary in bytes + n = 0; + for (addr = startAddr; addr < endAddr; addr++) { + n += snprintf(&strbuf[n], PRINT_BUFFER_LEN-n, "0x%x, ", *addr); + if (n > PRINT_BUFFER_LEN - 10) { + ALOGD("## %s", strbuf); + n = 0; + } + } + if (n > 0) + ALOGD("## %s", strbuf); + } + + // print disassembled instructions + addr = startAddr; + while (addr < endAddr) { + next_addr = reinterpret_cast<unsigned char*> + (decoder_disassemble_instr(reinterpret_cast<char*>(addr), + strbuf, PRINT_BUFFER_LEN)); + if (addr != next_addr) { + ALOGD("** %p: %s", addr, strbuf); + } else { // check whether this is nop padding + if (addr[0] == 0x90) { + ALOGD("** %p: NOP (1 byte)", addr); + next_addr += 1; + } else if (addr[0] == 0x66 && addr[1] == 0x90) { + ALOGD("** %p: NOP (2 bytes)", addr); + next_addr += 2; + } else if (addr[0] == 0x0f && addr[1] == 0x1f && addr[2] == 0x00) { + ALOGD("** %p: NOP (3 bytes)", addr); + next_addr += 3; + } else { + ALOGD("** unable to decode binary at %p", addr); + break; + } + } + addr = next_addr; + } +} + +/* 4 is the number of additional bytes needed for chaining information for trace: + * 2 bytes for chaining cell count offset and 2 bytes for chaining cell offset */ +#define EXTRA_BYTES_FOR_CHAINING 4 + +/* Entry function to invoke the backend of the JIT compiler */ +void dvmCompilerMIR2LIR(CompilationUnit *cUnit, JitTranslationInfo *info) +{ + dump_x86_inst = cUnit->printMe; + /* Used to hold the labels of each block */ + LowOpBlockLabel *labelList = + (LowOpBlockLabel *)dvmCompilerNew(sizeof(LowOpBlockLabel) * cUnit->numBlocks, true); //Utility.c + LowOp *headLIR = NULL; + GrowableList chainingListByType[kChainingCellLast]; + unsigned int i, padding; + + /* + * Initialize various types chaining lists. + */ + for (i = 0; i < kChainingCellLast; i++) { + dvmInitGrowableList(&chainingListByType[i], 2); + } + + /* Clear the visited flag for each block */ + dvmCompilerDataFlowAnalysisDispatcher(cUnit, dvmCompilerClearVisitedFlag, + kAllNodes, false /* isIterative */); + + GrowableListIterator iterator; + dvmGrowableListIteratorInit(&cUnit->blockList, &iterator); + + /* Traces start with a profiling entry point. Generate it here */ + cUnit->profileCodeSize = genTraceProfileEntry(cUnit); + + //BasicBlock **blockList = cUnit->blockList; + GrowableList *blockList = &cUnit->blockList; + BasicBlock *bb; + + info->codeAddress = NULL; + stream = (char*)gDvmJit.codeCache + gDvmJit.codeCacheByteUsed; + + // TODO: compile into a temporary buffer and then copy into the code cache. + // That would let us leave the code cache unprotected for a shorter time. + size_t unprotected_code_cache_bytes = + gDvmJit.codeCacheSize - gDvmJit.codeCacheByteUsed - CODE_CACHE_PADDING; + UNPROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes); + + streamStart = stream; /* trace start before alignment */ + stream += EXTRA_BYTES_FOR_CHAINING; /* This is needed for chaining. 
Add the bytes before the alignment */ + stream = (char*)(((unsigned int)stream + 0xF) & ~0xF); /* Align trace to 16-bytes */ + streamMethodStart = stream; /* code start */ + for (i = 0; i < ((unsigned int) cUnit->numBlocks); i++) { + labelList[i].lop.generic.offset = -1; + } + cUnit->exceptionBlockId = -1; + for (i = 0; i < blockList->numUsed; i++) { + bb = (BasicBlock *) blockList->elemList[i]; + if(bb->blockType == kExceptionHandling) + cUnit->exceptionBlockId = i; + } + startOfTrace(cUnit->method, labelList, cUnit->exceptionBlockId, cUnit); + if(gDvm.executionMode == kExecutionModeNcgO1) { + //merge blocks ending with "goto" with the fall through block + if (cUnit->jitMode != kJitLoop) + for (i = 0; i < blockList->numUsed; i++) { + bb = (BasicBlock *) blockList->elemList[i]; + bool merged = mergeBlock(bb); + while(merged) merged = mergeBlock(bb); + } + for (i = 0; i < blockList->numUsed; i++) { + bb = (BasicBlock *) blockList->elemList[i]; + if(bb->blockType == kDalvikByteCode && + bb->firstMIRInsn != NULL) { + preprocessingBB(bb); + } + } + preprocessingTrace(); + } + + /* Handle the content in each basic block */ + for (i = 0; ; i++) { + MIR *mir; + bb = (BasicBlock *) dvmGrowableListIteratorNext(&iterator); + if (bb == NULL) break; + if (bb->visited == true) continue; + + labelList[i].immOpnd.value = bb->startOffset; + + if (bb->blockType >= kChainingCellLast) { + /* + * Append the label pseudo LIR first. Chaining cells will be handled + * separately afterwards. + */ + dvmCompilerAppendLIR(cUnit, (LIR *) &labelList[i]); + } + + if (bb->blockType == kEntryBlock) { + labelList[i].lop.opCode2 = ATOM_PSEUDO_ENTRY_BLOCK; + if (bb->firstMIRInsn == NULL) { + continue; + } else { + setupLoopEntryBlock(cUnit, bb, bb->fallThrough->id); + //&labelList[blockList[i]->fallThrough->id]); + } + } else if (bb->blockType == kExitBlock) { + labelList[i].lop.opCode2 = ATOM_PSEUDO_EXIT_BLOCK; + labelList[i].lop.generic.offset = (stream - streamMethodStart); + goto gen_fallthrough; + } else if (bb->blockType == kDalvikByteCode) { + if (bb->hidden == true) continue; + labelList[i].lop.opCode2 = ATOM_PSEUDO_NORMAL_BLOCK_LABEL; + /* Reset the register state */ +#if 0 + resetRegisterScoreboard(cUnit); +#endif + } else { + switch (bb->blockType) { + case kChainingCellNormal: + labelList[i].lop.opCode2 = ATOM_PSEUDO_CHAINING_CELL_NORMAL; + /* handle the codegen later */ + dvmInsertGrowableList( + &chainingListByType[kChainingCellNormal], i); + break; + case kChainingCellInvokeSingleton: + labelList[i].lop.opCode2 = + ATOM_PSEUDO_CHAINING_CELL_INVOKE_SINGLETON; + labelList[i].immOpnd.value = + (int) bb->containingMethod; + /* handle the codegen later */ + dvmInsertGrowableList( + &chainingListByType[kChainingCellInvokeSingleton], i); + break; + case kChainingCellInvokePredicted: + labelList[i].lop.opCode2 = + ATOM_PSEUDO_CHAINING_CELL_INVOKE_PREDICTED; + /* + * Move the cached method pointer from operand 1 to 0. + * Operand 0 was clobbered earlier in this routine to store + * the block starting offset, which is not applicable to + * predicted chaining cell. 
+ */ + //TODO + //labelList[i].operands[0] = labelList[i].operands[1]; + + /* handle the codegen later */ + dvmInsertGrowableList( + &chainingListByType[kChainingCellInvokePredicted], i); + break; + case kChainingCellHot: + labelList[i].lop.opCode2 = + ATOM_PSEUDO_CHAINING_CELL_HOT; + /* handle the codegen later */ + dvmInsertGrowableList( + &chainingListByType[kChainingCellHot], i); + break; + case kPCReconstruction: + /* Make sure exception handling block is next */ + labelList[i].lop.opCode2 = + ATOM_PSEUDO_PC_RECONSTRUCTION_BLOCK_LABEL; + //assert (i == cUnit->numBlocks - 2); + labelList[i].lop.generic.offset = (stream - streamMethodStart); + handlePCReconstruction(cUnit, + &labelList[cUnit->puntBlock->id]); + break; + case kExceptionHandling: + labelList[i].lop.opCode2 = ATOM_PSEUDO_EH_BLOCK_LABEL; + labelList[i].lop.generic.offset = (stream - streamMethodStart); + //if (cUnit->pcReconstructionList.numUsed) { + scratchRegs[0] = PhysicalReg_EAX; + jumpToInterpPunt(); + //call_dvmJitToInterpPunt(); + //} + break; + case kChainingCellBackwardBranch: + labelList[i].lop.opCode2 = ATOM_PSEUDO_CHAINING_CELL_BACKWARD_BRANCH; + /* handle the codegen later */ + dvmInsertGrowableList( + &chainingListByType[kChainingCellBackwardBranch], + i); + break; + default: + break; + } + continue; + } + { + //LowOp *headLIR = NULL; + const DexCode *dexCode = dvmGetMethodCode(cUnit->method); + const u2 *startCodePtr = dexCode->insns; + const u2 *codePtr; + labelList[i].lop.generic.offset = (stream - streamMethodStart); + ALOGV("get ready to handle JIT bb %d type %d hidden %d", + bb->id, bb->blockType, bb->hidden); + for (BasicBlock *nextBB = bb; nextBB != NULL; nextBB = cUnit->nextCodegenBlock) { + bb = nextBB; + bb->visited = true; + cUnit->nextCodegenBlock = NULL; + + if(gDvm.executionMode == kExecutionModeNcgO1 && + bb->blockType != kEntryBlock && + bb->firstMIRInsn != NULL) { + startOfBasicBlock(bb); + int cg_ret = codeGenBasicBlockJit(cUnit->method, bb); + endOfBasicBlock(bb); + if(cg_ret < 0) { + endOfTrace(true/*freeOnly*/); + cUnit->baseAddr = NULL; + ALOGI("codeGenBasicBlockJit returns negative number"); + PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes); + return; + } + } else { + for (mir = bb->firstMIRInsn; mir; mir = mir->next) { + startOfBasicBlock(bb); //why here for O0 + Opcode dalvikOpCode = mir->dalvikInsn.opcode; + if((int)dalvikOpCode >= (int)kMirOpFirst) { + handleExtendedMIR(cUnit, mir); + continue; + } + InstructionFormat dalvikFormat = + dexGetFormatFromOpcode(dalvikOpCode); + ALOGV("ready to handle bytecode at offset %x: opcode %d format %d", + mir->offset, dalvikOpCode, dalvikFormat); + LowOpImm *boundaryLIR = dump_special(ATOM_PSEUDO_DALVIK_BYTECODE_BOUNDARY, mir->offset); + /* Remember the first LIR for this block */ + if (headLIR == NULL) { + headLIR = (LowOp*)boundaryLIR; + } + bool notHandled = true; + /* + * Debugging: screen the opcode first to see if it is in the + * do[-not]-compile list + */ + bool singleStepMe = + gDvmJit.includeSelectedOp != + ((gDvmJit.opList[dalvikOpCode >> 3] & + (1 << (dalvikOpCode & 0x7))) != + 0); + if (singleStepMe || cUnit->allSingleStep) { + } else { + codePtr = startCodePtr + mir->offset; + //lower each byte code, update LIR + notHandled = lowerByteCodeJit(cUnit->method, cUnit->method->insns+mir->offset, mir); + if(gDvmJit.codeCacheByteUsed + (stream - streamStart) + + CODE_CACHE_PADDING > gDvmJit.codeCacheSize) { + ALOGI("JIT code cache full after lowerByteCodeJit (trace uses %uB)", (stream - streamStart)); + gDvmJit.codeCacheFull = 
true; + cUnit->baseAddr = NULL; + endOfTrace(true/*freeOnly*/); + PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes); + return; + } + } + if (notHandled) { + ALOGE("%#06x: Opcode 0x%x (%s) / Fmt %d not handled", + mir->offset, + dalvikOpCode, dexGetOpcodeName(dalvikOpCode), + dalvikFormat); + dvmAbort(); + break; + } + } // end for + } // end else //JIT + O0 code generator + } + } // end for + /* Eliminate redundant loads/stores and delay stores into later slots */ +#if 0 + dvmCompilerApplyLocalOptimizations(cUnit, (LIR *) headLIR, + cUnit->lastLIRInsn); +#endif + if (headLIR) headLIR = NULL; +gen_fallthrough: + /* + * Check if the block is terminated due to trace length constraint - + * insert an unconditional branch to the chaining cell. + */ + if (bb->needFallThroughBranch) { + jumpToBasicBlock(stream, bb->fallThrough->id); + } + + } + + char* streamChainingStart = (char*)stream; + /* Handle the chaining cells in predefined order */ + for (i = 0; i < kChainingCellGap; i++) { + size_t j; + int *blockIdList = (int *) chainingListByType[i].elemList; + + cUnit->numChainingCells[i] = chainingListByType[i].numUsed; + + /* No chaining cells of this type */ + if (cUnit->numChainingCells[i] == 0) + continue; + + /* Record the first LIR for a new type of chaining cell */ + cUnit->firstChainingLIR[i] = (LIR *) &labelList[blockIdList[0]]; + for (j = 0; j < chainingListByType[i].numUsed; j++) { + int blockId = blockIdList[j]; + BasicBlock *chainingBlock = + (BasicBlock *) dvmGrowableListGetElement(&cUnit->blockList, + blockId); + + labelList[blockId].lop.generic.offset = (stream - streamMethodStart); + + /* Align this chaining cell first */ +#if 0 + newLIR0(cUnit, ATOM_PSEUDO_ALIGN4); +#endif + /* Insert the pseudo chaining instruction */ + dvmCompilerAppendLIR(cUnit, (LIR *) &labelList[blockId]); + + + switch (chainingBlock->blockType) { + case kChainingCellNormal: + handleNormalChainingCell(cUnit, + chainingBlock->startOffset, blockId, labelList); + break; + case kChainingCellInvokeSingleton: + handleInvokeSingletonChainingCell(cUnit, + chainingBlock->containingMethod, blockId, labelList); + break; + case kChainingCellInvokePredicted: + handleInvokePredictedChainingCell(cUnit, blockId); + break; + case kChainingCellHot: + handleHotChainingCell(cUnit, + chainingBlock->startOffset, blockId, labelList); + break; + case kChainingCellBackwardBranch: + handleBackwardBranchChainingCell(cUnit, + chainingBlock->startOffset, blockId, labelList); + break; + default: + ALOGE("Bad blocktype %d", chainingBlock->blockType); + dvmAbort(); + break; + } + + if (gDvmJit.codeCacheByteUsed + (stream - streamStart) + CODE_CACHE_PADDING > gDvmJit.codeCacheSize) { + ALOGI("JIT code cache full after ChainingCell (trace uses %uB)", (stream - streamStart)); + gDvmJit.codeCacheFull = true; + cUnit->baseAddr = NULL; + endOfTrace(true); /* need to free structures */ + PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes); + return; + } + } + } +#if 0 + dvmCompilerApplyGlobalOptimizations(cUnit); +#endif + endOfTrace(false); + + if (gDvmJit.codeCacheFull) { + /* We hit code cache size limit inside endofTrace(false). + * Bail out for this trace! 
+ */ + ALOGI("JIT code cache full after endOfTrace (trace uses %uB)", (stream - streamStart)); + cUnit->baseAddr = NULL; + PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes); + return; + } + + /* dump section for chaining cell counts, make sure it is 4-byte aligned */ + padding = (4 - ((u4)stream & 3)) & 3; + stream += padding; + ChainCellCounts chainCellCounts; + /* Install the chaining cell counts */ + for (i=0; i< kChainingCellGap; i++) { + chainCellCounts.u.count[i] = cUnit->numChainingCells[i]; + } + char* streamCountStart = (char*)stream; + memcpy((char*)stream, &chainCellCounts, sizeof(chainCellCounts)); + stream += sizeof(chainCellCounts); + + cUnit->baseAddr = streamMethodStart; + cUnit->totalSize = (stream - streamStart); + if(gDvmJit.codeCacheByteUsed + cUnit->totalSize + CODE_CACHE_PADDING > gDvmJit.codeCacheSize) { + ALOGI("JIT code cache full after ChainingCellCounts (trace uses %uB)", (stream - streamStart)); + gDvmJit.codeCacheFull = true; + cUnit->baseAddr = NULL; + PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes); + return; + } + + /* write chaining cell count offset & chaining cell offset */ + u2* pOffset = (u2*)(streamMethodStart - EXTRA_BYTES_FOR_CHAINING); /* space was already allocated for this purpose */ + *pOffset = streamCountStart - streamMethodStart; /* from codeAddr */ + pOffset[1] = streamChainingStart - streamMethodStart; + + PROTECT_CODE_CACHE(stream, unprotected_code_cache_bytes); + + gDvmJit.codeCacheByteUsed += (stream - streamStart); + if (cUnit->printMe) { + unsigned char* codeBaseAddr = (unsigned char *) cUnit->baseAddr; + unsigned char* codeBaseAddrNext = ((unsigned char *) gDvmJit.codeCache) + gDvmJit.codeCacheByteUsed; + ALOGD("-------- Built trace for %s%s, JIT code [%p, %p) cache start %p", + cUnit->method->clazz->descriptor, cUnit->method->name, + codeBaseAddr, codeBaseAddrNext, gDvmJit.codeCache); + ALOGD("** %s%s@0x%x:", cUnit->method->clazz->descriptor, + cUnit->method->name, cUnit->traceDesc->trace[0].info.frag.startOffset); + printEmittedCodeBlock(codeBaseAddr, codeBaseAddrNext); + } + ALOGV("JIT CODE after trace %p to %p size %x START %p", cUnit->baseAddr, + (char *) gDvmJit.codeCache + gDvmJit.codeCacheByteUsed, + cUnit->totalSize, gDvmJit.codeCache); + + gDvmJit.numCompilations++; + + info->codeAddress = (char*)cUnit->baseAddr;// + cUnit->headerSize; +} + +/* + * Perform translation chain operation. + */ +void* dvmJitChain(void* tgtAddr, u4* branchAddr) +{ +#ifdef JIT_CHAIN + int relOffset = (int) tgtAddr - (int)branchAddr; + + if ((gDvmJit.pProfTable != NULL) && (gDvm.sumThreadSuspendCount == 0) && + (gDvmJit.codeCacheFull == false)) { + + gDvmJit.translationChains++; + + //OpndSize immSize = estOpndSizeFromImm(relOffset); + //relOffset -= getJmpCallInstSize(immSize, JmpCall_uncond); + /* Hard coded the jump opnd size to 32 bits, This instruction will replace the "jump 0" in + * the original code sequence. + */ + OpndSize immSize = OpndSize_32; + relOffset -= 5; + //can't use stream here since it is used by the compilation thread + UNPROTECT_CODE_CACHE(branchAddr, sizeof(*branchAddr)); + dump_imm_with_codeaddr(Mnemonic_JMP, immSize, relOffset, (char*)branchAddr); //dump to branchAddr + PROTECT_CODE_CACHE(branchAddr, sizeof(*branchAddr)); + + gDvmJit.hasNewChain = true; + + COMPILER_TRACE_CHAINING( + ALOGI("Jit Runtime: chaining 0x%x to %p with relOffset %x", + (int) branchAddr, tgtAddr, relOffset)); + } +#endif + return tgtAddr; +} + +/* + * Accept the work and start compiling. Returns true if compilation + * is attempted. 
+ */ +bool dvmCompilerDoWork(CompilerWorkOrder *work) +{ + JitTraceDescription *desc; + bool isCompile; + bool success = true; + + if (gDvmJit.codeCacheFull) { + return false; + } + + switch (work->kind) { + case kWorkOrderTrace: + isCompile = true; + /* Start compilation with maximally allowed trace length */ + desc = (JitTraceDescription *)work->info; + success = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result, + work->bailPtr, 0 /* no hints */); + break; + case kWorkOrderTraceDebug: { + bool oldPrintMe = gDvmJit.printMe; + gDvmJit.printMe = true; + isCompile = true; + /* Start compilation with maximally allowed trace length */ + desc = (JitTraceDescription *)work->info; + success = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result, + work->bailPtr, 0 /* no hints */); + gDvmJit.printMe = oldPrintMe; + break; + } + case kWorkOrderProfileMode: + dvmJitChangeProfileMode((TraceProfilingModes)(int)work->info); + isCompile = false; + break; + default: + isCompile = false; + ALOGE("Jit: unknown work order type"); + assert(0); // Bail if debug build, discard otherwise + } + if (!success) + work->result.codeAddress = NULL; + return isCompile; +} + +void dvmCompilerCacheFlush(long start, long end, long flags) { + /* cacheflush is needed for ARM, but not for IA32 (coherent icache) */ +} + +//#endif |
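The two u2 words written just before the trace entry (the EXTRA_BYTES_FOR_CHAINING slot filled at the end of dvmCompilerMIR2LIR) are what dvmJitUnchain reads back at codeAddr - 4 and codeAddr - 2. A minimal standalone sketch of that lookup, assuming plain uint16_t in place of the u2 typedef and only a forward declaration of ChainCellCounts (the real definition lives in compiler/CompilerIR.h):

    #include <stdint.h>

    struct ChainCellCounts;   // real definition: compiler/CompilerIR.h

    // [codeAddr - 4]: u2 offset from codeAddr to the ChainCellCounts record
    // [codeAddr - 2]: u2 offset from codeAddr to the first chaining cell
    static inline const ChainCellCounts* traceChainCellCounts(const void* codeAddr)
    {
        const uint16_t* p =
            reinterpret_cast<const uint16_t*>(static_cast<const char*>(codeAddr) - 4);
        return reinterpret_cast<const ChainCellCounts*>(
            static_cast<const char*>(codeAddr) + p[0]);
    }

    static inline const char* traceChainCellRegion(const void* codeAddr)
    {
        const uint16_t* p =
            reinterpret_cast<const uint16_t*>(static_cast<const char*>(codeAddr) - 4);
        return static_cast<const char*>(codeAddr) + p[1];
    }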

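The chaining performed by dvmJitChain amounts to overwriting the 5-byte "jmp 0" placeholder emitted by insertJumpHelp with a JMP rel32 whose displacement is measured from the end of the instruction, which is why the code subtracts 5 from relOffset before encoding. A minimal sketch of that arithmetic, with a hypothetical patchJmpRel32 helper standing in for dump_imm_with_codeaddr and the UNPROTECT_CODE_CACHE/PROTECT_CODE_CACHE wrapping used by the real code:

    #include <stdint.h>
    #include <string.h>

    static void patchJmpRel32(uint8_t* branchAddr, const void* tgtAddr)
    {
        // Displacement is relative to the byte after the 5-byte JMP instruction.
        int32_t relOffset =
            (int32_t)((intptr_t)tgtAddr - (intptr_t)branchAddr) - 5;
        branchAddr[0] = 0xE9;                  // JMP rel32 opcode
        memcpy(branchAddr + 1, &relOffset, 4); // little-endian imm32 on x86
    }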