diff options
author | Nicolas Geoffray <ngeoffray@google.com> | 2015-01-26 10:10:46 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2015-01-26 10:10:47 +0000 |
commit | 2dadc9df0ffb822870a150f81257792b83241c77 (patch) | |
tree | ee8650cc14ec18ce0d7abf089c7d2e0dfc9e079d | |
parent | 336247fa6deba2948f5ede1df806f48cf67c790a (diff) | |
parent | 4dee636d21d9ce54386cdfbb824e5eb2a9c1af0d (diff) | |
download | android_art-2dadc9df0ffb822870a150f81257792b83241c77.tar.gz android_art-2dadc9df0ffb822870a150f81257792b83241c77.tar.bz2 android_art-2dadc9df0ffb822870a150f81257792b83241c77.zip |
Merge "Support callee-save registers on ARM."
-rw-r--r-- | compiler/optimizing/code_generator.cc | 4 | ||||
-rw-r--r-- | compiler/optimizing/code_generator.h | 14 | ||||
-rw-r--r-- | compiler/optimizing/code_generator_arm.cc | 102 | ||||
-rw-r--r-- | compiler/optimizing/code_generator_arm.h | 2 | ||||
-rw-r--r-- | compiler/optimizing/code_generator_x86_64.cc | 16 |
5 files changed, 88 insertions, 50 deletions
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 43fd8bb668..0a405c4bbe 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -140,9 +140,7 @@ void CodeGenerator::ComputeFrameSize(size_t number_of_spill_slots, size_t maximum_number_of_live_core_registers, size_t maximum_number_of_live_fp_registers, size_t number_of_out_slots) { - core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_; - DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved"; - fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_; + ComputeSpillMask(); first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize; SetFrameSize(RoundUp( diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index 85d18c0b43..45f02e53dc 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -129,6 +129,20 @@ class CodeGenerator { size_t GetNumberOfFloatingPointRegisters() const { return number_of_fpu_registers_; } virtual void SetupBlockedRegisters(bool is_baseline) const = 0; + virtual void ComputeSpillMask() { + core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_; + DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved"; + fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_; + } + + static uint32_t ComputeRegisterMask(const int* registers, size_t length) { + uint32_t mask = 0; + for (size_t i = 0, e = length; i < e; ++i) { + mask |= (1 << registers[i]); + } + return mask; + } + virtual void DumpCoreRegister(std::ostream& stream, int reg) const = 0; virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const = 0; virtual InstructionSet GetInstructionSet() const = 0; diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index f4e4f5a74a..824663a3a6 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -50,6 +50,13 @@ static constexpr size_t kRuntimeParameterCoreRegistersLength = static constexpr SRegister kRuntimeParameterFpuRegisters[] = { S0, S1, S2, S3 }; static constexpr size_t kRuntimeParameterFpuRegistersLength = arraysize(kRuntimeParameterFpuRegisters); +// We unconditionally allocate R5 to ensure we can do long operations +// with baseline. +static constexpr Register kCoreSavedRegisterForBaseline = R5; +static constexpr Register kCoreCalleeSaves[] = + { R5, R6, R7, R8, R10, R11, PC }; +static constexpr SRegister kFpuCalleeSaves[] = + { S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31 }; class InvokeRuntimeCallingConvention : public CallingConvention<Register, SRegister> { public: @@ -374,20 +381,27 @@ size_t CodeGeneratorARM::RestoreFloatingPointRegister(size_t stack_index, uint32 CodeGeneratorARM::CodeGeneratorARM(HGraph* graph, const ArmInstructionSetFeatures& isa_features, const CompilerOptions& compiler_options) - : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters, - kNumberOfRegisterPairs, (1 << R6) | (1 << R7) | (1 << LR), 0, compiler_options), + : CodeGenerator(graph, + kNumberOfCoreRegisters, + kNumberOfSRegisters, + kNumberOfRegisterPairs, + ComputeRegisterMask(reinterpret_cast<const int*>(kCoreCalleeSaves), + arraysize(kCoreCalleeSaves)), + ComputeRegisterMask(reinterpret_cast<const int*>(kFpuCalleeSaves), + arraysize(kFpuCalleeSaves)), + compiler_options), block_labels_(graph->GetArena(), 0), location_builder_(graph, this), instruction_visitor_(graph, this), move_resolver_(graph->GetArena(), this), assembler_(true), isa_features_(isa_features) { - // We unconditionally allocate R6 and R7 to ensure we can do long operations - // with baseline. - AddAllocatedRegister(Location::RegisterLocation(R6)); - AddAllocatedRegister(Location::RegisterLocation(R7)); - // Save the link register to mimic Quick. - AddAllocatedRegister(Location::RegisterLocation(LR)); + // Save one extra register for baseline. Note that on thumb2, there is no easy + // instruction to restore just the PC, so this actually helps both baseline + // and non-baseline to save and restore at least two registers at entry and exit. + AddAllocatedRegister(Location::RegisterLocation(kCoreSavedRegisterForBaseline)); + // Save the PC register to mimic Quick. + AddAllocatedRegister(Location::RegisterLocation(PC)); } Location CodeGeneratorARM::AllocateFreeRegister(Primitive::Type type) const { @@ -456,31 +470,17 @@ void CodeGeneratorARM::SetupBlockedRegisters(bool is_baseline ATTRIBUTE_UNUSED) // Reserve temp register. blocked_core_registers_[IP] = true; - // TODO: We currently don't use Quick's callee saved registers. - // We always save and restore R6 and R7 to make sure we can use three - // register pairs for long operations. - blocked_core_registers_[R4] = true; - blocked_core_registers_[R5] = true; - blocked_core_registers_[R8] = true; - blocked_core_registers_[R10] = true; - blocked_core_registers_[R11] = true; - - blocked_fpu_registers_[S16] = true; - blocked_fpu_registers_[S17] = true; - blocked_fpu_registers_[S18] = true; - blocked_fpu_registers_[S19] = true; - blocked_fpu_registers_[S20] = true; - blocked_fpu_registers_[S21] = true; - blocked_fpu_registers_[S22] = true; - blocked_fpu_registers_[S23] = true; - blocked_fpu_registers_[S24] = true; - blocked_fpu_registers_[S25] = true; - blocked_fpu_registers_[S26] = true; - blocked_fpu_registers_[S27] = true; - blocked_fpu_registers_[S28] = true; - blocked_fpu_registers_[S29] = true; - blocked_fpu_registers_[S30] = true; - blocked_fpu_registers_[S31] = true; + if (is_baseline) { + for (size_t i = 0; i < arraysize(kCoreCalleeSaves); ++i) { + blocked_core_registers_[kCoreCalleeSaves[i]] = true; + } + + blocked_core_registers_[kCoreSavedRegisterForBaseline] = false; + + for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) { + blocked_fpu_registers_[kFpuCalleeSaves[i]] = true; + } + } UpdateBlockedPairRegisters(); } @@ -501,6 +501,28 @@ InstructionCodeGeneratorARM::InstructionCodeGeneratorARM(HGraph* graph, CodeGene assembler_(codegen->GetAssembler()), codegen_(codegen) {} +static uint32_t LeastSignificantBit(uint32_t mask) { + // ffs starts at 1. + return ffs(mask) - 1; +} + +void CodeGeneratorARM::ComputeSpillMask() { + core_spill_mask_ = allocated_registers_.GetCoreRegisters() & core_callee_save_mask_; + DCHECK_NE(core_spill_mask_, 0u) << "At least the return address register must be saved"; + fpu_spill_mask_ = allocated_registers_.GetFloatingPointRegisters() & fpu_callee_save_mask_; + // We use vpush and vpop for saving and restoring floating point registers, which take + // a SRegister and the number of registers to save/restore after that SRegister. We + // therefore update the `fpu_spill_mask_` to also contain those registers not allocated, + // but in the range. + if (fpu_spill_mask_ != 0) { + uint32_t least_significant_bit = LeastSignificantBit(fpu_spill_mask_); + uint32_t most_significant_bit = MostSignificantBit(fpu_spill_mask_); + for (uint32_t i = least_significant_bit + 1 ; i < most_significant_bit; ++i) { + fpu_spill_mask_ |= (1 << i); + } + } +} + void CodeGeneratorARM::GenerateFrameEntry() { bool skip_overflow_check = IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kArm); @@ -511,14 +533,24 @@ void CodeGeneratorARM::GenerateFrameEntry() { RecordPcInfo(nullptr, 0); } - __ PushList(core_spill_mask_); + // PC is in the list of callee-save to mimic Quick, but we need to push + // LR at entry instead. + __ PushList((core_spill_mask_ & (~(1 << PC))) | 1 << LR); + if (fpu_spill_mask_ != 0) { + SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_)); + __ vpushs(start_register, POPCOUNT(fpu_spill_mask_)); + } __ AddConstant(SP, -(GetFrameSize() - FrameEntrySpillSize())); __ StoreToOffset(kStoreWord, R0, SP, 0); } void CodeGeneratorARM::GenerateFrameExit() { __ AddConstant(SP, GetFrameSize() - FrameEntrySpillSize()); - __ PopList((core_spill_mask_ & (~(1 << LR))) | 1 << PC); + if (fpu_spill_mask_ != 0) { + SRegister start_register = SRegister(LeastSignificantBit(fpu_spill_mask_)); + __ vpops(start_register, POPCOUNT(fpu_spill_mask_)); + } + __ PopList(core_spill_mask_); } void CodeGeneratorARM::Bind(HBasicBlock* block) { diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index 46accfdaf0..dd69e4dd9c 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -245,6 +245,8 @@ class CodeGeneratorARM : public CodeGenerator { return type == Primitive::kPrimDouble || type == Primitive::kPrimLong; } + void ComputeSpillMask() OVERRIDE; + private: // Labels for each block that will be compiled. GrowableArray<Label> block_labels_; diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index 8cc0678bae..6bc28ff247 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -401,14 +401,6 @@ size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uin return kX86_64WordSize; } -static uint32_t ComputeCalleeSaveMask(const int* registers, size_t length) { - uint32_t mask = 0; - for (size_t i = 0, e = length; i < e; ++i) { - mask |= (1 << registers[i]); - } - return mask; -} - static constexpr int kNumberOfCpuRegisterPairs = 0; // Use a fake return address register to mimic Quick. static constexpr Register kFakeReturnRegister = Register(kLastCpuRegister + 1); @@ -417,11 +409,11 @@ CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& c kNumberOfCpuRegisters, kNumberOfFloatRegisters, kNumberOfCpuRegisterPairs, - ComputeCalleeSaveMask(reinterpret_cast<const int*>(kCoreCalleeSaves), - arraysize(kCoreCalleeSaves)) + ComputeRegisterMask(reinterpret_cast<const int*>(kCoreCalleeSaves), + arraysize(kCoreCalleeSaves)) | (1 << kFakeReturnRegister), - ComputeCalleeSaveMask(reinterpret_cast<const int*>(kFpuCalleeSaves), - arraysize(kFpuCalleeSaves)), + ComputeRegisterMask(reinterpret_cast<const int*>(kFpuCalleeSaves), + arraysize(kFpuCalleeSaves)), compiler_options), block_labels_(graph->GetArena(), 0), location_builder_(graph, this), |