Diffstat (limited to 'compiler/dex/quick/x86/target_x86.cc')
-rwxr-xr-x | compiler/dex/quick/x86/target_x86.cc | 658
1 file changed, 433 insertions, 225 deletions
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index 68c1633c2a..ffe6702bb8 100755 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -24,6 +24,7 @@ #include "dex/reg_storage_eq.h" #include "mirror/array.h" #include "mirror/string.h" +#include "oat.h" #include "x86_lir.h" #include "utils/dwarf_cfi.h" @@ -454,7 +455,7 @@ RegStorage X86Mir2Lir::AllocateByteRegister() { } RegStorage X86Mir2Lir::Get128BitRegister(RegStorage reg) { - return GetRegInfo(reg)->FindMatchingView(RegisterInfo::k128SoloStorageMask)->GetReg(); + return GetRegInfo(reg)->Master()->GetReg(); } bool X86Mir2Lir::IsByteRegister(RegStorage reg) { @@ -689,8 +690,11 @@ int X86Mir2Lir::VectorRegisterSize() { return 128; } -int X86Mir2Lir::NumReservableVectorRegisters(bool fp_used) { - return fp_used ? 5 : 7; +int X86Mir2Lir::NumReservableVectorRegisters(bool long_or_fp) { + int num_vector_temps = cu_->target64 ? xp_temps_64.size() : xp_temps_32.size(); + + // Leave a few temps for use by backend as scratch. + return long_or_fp ? num_vector_temps - 2 : num_vector_temps - 1; } void X86Mir2Lir::SpillCoreRegs() { @@ -864,9 +868,6 @@ X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* rX86_RET1 = rDX; rX86_INVOKE_TGT = rAX; rX86_COUNT = rCX; - - // Initialize the number of reserved vector registers - num_reserved_vector_regs_ = -1; } Mir2Lir* X86CodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph, @@ -1022,19 +1023,18 @@ void X86Mir2Lir::InstallLiteralPools() { DCHECK(method_literal_list_ == nullptr); DCHECK(class_literal_list_ == nullptr); - // Align to 16 byte boundary. We have implicit knowledge that the start of the method is - // on a 4 byte boundary. How can I check this if it changes (other than aligned loads - // will fail at runtime)? - if (const_vectors_ != nullptr) { - int align_size = (16-4) - (code_buffer_.size() & 0xF); - if (align_size < 0) { - align_size += 16; - } - while (align_size > 0) { + if (const_vectors_ != nullptr) { + // Vector literals must be 16-byte aligned. The header that is placed + // in the code section causes misalignment so we take it into account. + // Otherwise, we are sure that for x86 method is aligned to 16. 
+ DCHECK_EQ(GetInstructionSetAlignment(cu_->instruction_set), 16u); + uint32_t bytes_to_fill = (0x10 - ((code_buffer_.size() + sizeof(OatQuickMethodHeader)) & 0xF)) & 0xF; + while (bytes_to_fill > 0) { code_buffer_.push_back(0); - align_size--; + bytes_to_fill--; } + for (LIR *p = const_vectors_; p != nullptr; p = p->next) { PushWord(&code_buffer_, p->operands[0]); PushWord(&code_buffer_, p->operands[1]); @@ -1489,7 +1489,7 @@ void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) { ReserveVectorRegisters(mir); break; case kMirOpReturnVectorRegisters: - ReturnVectorRegisters(); + ReturnVectorRegisters(mir); break; case kMirOpConstVector: GenConst128(bb, mir); @@ -1536,17 +1536,19 @@ void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) { case kMirOpMemBarrier: GenMemBarrier(static_cast<MemBarrierKind>(mir->dalvikInsn.vA)); break; + case kMirOpPackedArrayGet: + GenPackedArrayGet(bb, mir); + break; + case kMirOpPackedArrayPut: + GenPackedArrayPut(bb, mir); + break; default: break; } } void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) { - // We should not try to reserve twice without returning the registers - DCHECK_NE(num_reserved_vector_regs_, -1); - - int num_vector_reg = mir->dalvikInsn.vA; - for (int i = 0; i < num_vector_reg; i++) { + for (uint32_t i = mir->dalvikInsn.vA; i <= mir->dalvikInsn.vB; i++) { RegStorage xp_reg = RegStorage::Solo128(i); RegisterInfo *xp_reg_info = GetRegInfo(xp_reg); Clobber(xp_reg); @@ -1561,13 +1563,10 @@ void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) { } } } - - num_reserved_vector_regs_ = num_vector_reg; } -void X86Mir2Lir::ReturnVectorRegisters() { - // Return all the reserved registers - for (int i = 0; i < num_reserved_vector_regs_; i++) { +void X86Mir2Lir::ReturnVectorRegisters(MIR* mir) { + for (uint32_t i = mir->dalvikInsn.vA; i <= mir->dalvikInsn.vB; i++) { RegStorage xp_reg = RegStorage::Solo128(i); RegisterInfo *xp_reg_info = GetRegInfo(xp_reg); @@ -1581,17 +1580,12 @@ void X86Mir2Lir::ReturnVectorRegisters() { } } } - - // We don't have anymore reserved vector registers - num_reserved_vector_regs_ = -1; } void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) { - store_method_addr_used_ = true; - int type_size = mir->dalvikInsn.vB; - // We support 128 bit vectors. - DCHECK_EQ(type_size & 0xFFFF, 128); RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest); + uint32_t *args = mir->dalvikInsn.arg; int reg = rs_dest.GetReg(); // Check for all 0 case. @@ -1601,14 +1595,24 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) { } // Append the mov const vector to reg opcode. - AppendOpcodeWithConst(kX86MovupsRM, reg, mir); + AppendOpcodeWithConst(kX86MovdqaRM, reg, mir); } void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) { - // Okay, load it from the constant vector area. - LIR *data_target = ScanVectorLiteral(mir); + // The literal pool needs position independent logic. + store_method_addr_used_ = true; + + // To deal with correct memory ordering, reverse order of constants. + int32_t constants[4]; + constants[3] = mir->dalvikInsn.arg[0]; + constants[2] = mir->dalvikInsn.arg[1]; + constants[1] = mir->dalvikInsn.arg[2]; + constants[0] = mir->dalvikInsn.arg[3]; + + // Search if there is already a constant in pool with this value. + LIR *data_target = ScanVectorLiteral(constants); if (data_target == nullptr) { - data_target = AddVectorLiteral(mir); + data_target = AddVectorLiteral(constants); } // Address the start of the method. 
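As an editor's illustration (not part of the patch), the padding rule in the InstallLiteralPools hunk above can be checked in isolation: pad the code buffer so that, once a method header of some size is prepended, the vector literals land on a 16-byte boundary. The helper name BytesToFill and the header-size constant below are placeholders standing in for sizeof(OatQuickMethodHeader).

    #include <cassert>
    #include <cstddef>

    constexpr std::size_t kHeaderSize = 24;  // placeholder for sizeof(OatQuickMethodHeader)

    std::size_t BytesToFill(std::size_t code_size) {
      // Same formula as the patch: fill up to the next 16-byte boundary,
      // counting the header that will precede the code section.
      return (0x10 - ((code_size + kHeaderSize) & 0xF)) & 0xF;
    }

    int main() {
      for (std::size_t size = 0; size < 256; ++size) {
        assert((size + BytesToFill(size) + kHeaderSize) % 16 == 0);
      }
      return 0;
    }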
@@ -1624,7 +1628,7 @@ void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) { // 4 byte offset. We will fix this up in the assembler later to have the right // value. ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral); - LIR *load = NewLIR2(opcode, reg, rl_method.reg.GetReg()); + LIR *load = NewLIR3(opcode, reg, rl_method.reg.GetReg(), 256 /* bogus */); load->flags.fixup = kFixupLoad; load->target = data_target; } @@ -1633,16 +1637,12 @@ void X86Mir2Lir::GenMoveVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest); RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB); - NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg()); + NewLIR2(kX86MovdqaRR, rs_dest.GetReg(), rs_src.GetReg()); } -void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) { - const int BYTE_SIZE = 8; - RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); - RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); - RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempWide()); - +void X86Mir2Lir::GenMultiplyVectorSignedByte(RegStorage rs_dest_src1, RegStorage rs_src2) { /* * Emulate the behavior of a kSignedByte by separating out the 16 values in the two XMM * and multiplying 8 at a time before recombining back into one XMM register. @@ -1660,29 +1660,100 @@ void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) { */ // Copy xmm1. - NewLIR2(kX86Mova128RR, rs_src1_high_tmp.GetReg(), rs_dest_src1.GetReg()); + RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempDouble()); + RegStorage rs_dest_high_tmp = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86MovdqaRR, rs_src1_high_tmp.GetReg(), rs_src2.GetReg()); + NewLIR2(kX86MovdqaRR, rs_dest_high_tmp.GetReg(), rs_dest_src1.GetReg()); // Multiply low bits. + // x7 *= x3 NewLIR2(kX86PmullwRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); // xmm1 now has low bits. AndMaskVectorRegister(rs_dest_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF); // Prepare high bits for multiplication. - NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), BYTE_SIZE); - AndMaskVectorRegister(rs_src2, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00); + NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), 0x8); + AndMaskVectorRegister(rs_dest_high_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00); // Multiply high bits and xmm2 now has high bits. - NewLIR2(kX86PmullwRR, rs_src2.GetReg(), rs_src1_high_tmp.GetReg()); + NewLIR2(kX86PmullwRR, rs_src1_high_tmp.GetReg(), rs_dest_high_tmp.GetReg()); // Combine back into dest XMM register. - NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); + NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src1_high_tmp.GetReg()); +} + +void X86Mir2Lir::GenMultiplyVectorLong(RegStorage rs_dest_src1, RegStorage rs_src2) { + /* + * We need to emulate the packed long multiply. + * For kMirOpPackedMultiply xmm1, xmm0: + * - xmm1 is src/dest + * - xmm0 is src + * - Get xmm2 and xmm3 as temp + * - Idea is to multiply the lower 32 of each operand with the higher 32 of the other. + * - Then add the two results. + * - Move it to the upper 32 of the destination + * - Then multiply the lower 32-bits of the operands and add the result to the destination. 
+ * + * (op dest src ) + * movdqa %xmm2, %xmm1 + * movdqa %xmm3, %xmm0 + * psrlq %xmm3, $0x20 + * pmuludq %xmm3, %xmm2 + * psrlq %xmm1, $0x20 + * pmuludq %xmm1, %xmm0 + * paddq %xmm1, %xmm3 + * psllq %xmm1, $0x20 + * pmuludq %xmm2, %xmm0 + * paddq %xmm1, %xmm2 + * + * When both the operands are the same, then we need to calculate the lower-32 * higher-32 + * calculation only once. Thus we don't need the xmm3 temp above. That sequence becomes: + * + * (op dest src ) + * movdqa %xmm2, %xmm1 + * psrlq %xmm1, $0x20 + * pmuludq %xmm1, %xmm0 + * paddq %xmm1, %xmm1 + * psllq %xmm1, $0x20 + * pmuludq %xmm2, %xmm0 + * paddq %xmm1, %xmm2 + * + */ + + bool both_operands_same = (rs_dest_src1.GetReg() == rs_src2.GetReg()); + + RegStorage rs_tmp_vector_1; + RegStorage rs_tmp_vector_2; + rs_tmp_vector_1 = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86MovdqaRR, rs_tmp_vector_1.GetReg(), rs_dest_src1.GetReg()); + + if (both_operands_same == false) { + rs_tmp_vector_2 = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86MovdqaRR, rs_tmp_vector_2.GetReg(), rs_src2.GetReg()); + NewLIR2(kX86PsrlqRI, rs_tmp_vector_2.GetReg(), 0x20); + NewLIR2(kX86PmuludqRR, rs_tmp_vector_2.GetReg(), rs_tmp_vector_1.GetReg()); + } + + NewLIR2(kX86PsrlqRI, rs_dest_src1.GetReg(), 0x20); + NewLIR2(kX86PmuludqRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); + + if (both_operands_same == false) { + NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_tmp_vector_2.GetReg()); + } else { + NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_dest_src1.GetReg()); + } + + NewLIR2(kX86PsllqRI, rs_dest_src1.GetReg(), 0x20); + NewLIR2(kX86PmuludqRR, rs_tmp_vector_1.GetReg(), rs_src2.GetReg()); + NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_tmp_vector_1.GetReg()); } void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); int opcode = 0; switch (opsize) { @@ -1700,7 +1771,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) { break; case kSignedByte: // HW doesn't support 16x16 byte multiplication so emulate it. 
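The pmuludq/psrlq/psllq/paddq sequence documented above builds each 64-bit lane product out of 32x32-to-64-bit multiplies. A scalar sketch of the same identity (editor's illustration, not the emitted code; the helper name MulQuad is made up):

    #include <cassert>
    #include <cstdint>

    uint64_t MulQuad(uint64_t a, uint64_t b) {
      uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
      uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
      // The two pmuludq on shifted halves produce the cross terms; psllq moves
      // their sum into the upper 32 bits, then the low-half product is added.
      uint64_t cross = (a_lo * b_hi + a_hi * b_lo) << 32;
      return cross + a_lo * b_lo;
    }

    int main() {
      uint64_t a = 0x123456789ABCDEF0ull, b = 0x0FEDCBA987654321ull;
      assert(MulQuad(a, b) == a * b);  // both sides wrap modulo 2^64
      return 0;
    }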
- GenMultiplyVectorSignedByte(bb, mir); + GenMultiplyVectorSignedByte(rs_dest_src1, rs_src2); + return; + case k64: + GenMultiplyVectorLong(rs_dest_src1, rs_src2); return; default: LOG(FATAL) << "Unsupported vector multiply " << opsize; @@ -1713,12 +1787,16 @@ void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); int opcode = 0; switch (opsize) { case k32: opcode = kX86PadddRR; break; + case k64: + opcode = kX86PaddqRR; + break; case kSignedHalf: case kUnsignedHalf: opcode = kX86PaddwRR; @@ -1744,12 +1822,16 @@ void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); int opcode = 0; switch (opsize) { case k32: opcode = kX86PsubdRR; break; + case k64: + opcode = kX86PsubqRR; + break; case kSignedHalf: case kUnsignedHalf: opcode = kX86PsubwRR; @@ -1772,58 +1854,54 @@ void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) { } void X86Mir2Lir::GenShiftByteVector(BasicBlock *bb, MIR *mir) { + // Destination does not need clobbered because it has already been as part + // of the general packed shift handler (caller of this method). RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); - RegStorage rs_tmp = Get128BitRegister(AllocTempWide()); int opcode = 0; - int imm = mir->dalvikInsn.vB; - switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) { case kMirOpPackedShiftLeft: opcode = kX86PsllwRI; break; case kMirOpPackedSignedShiftRight: - opcode = kX86PsrawRI; - break; case kMirOpPackedUnsignedShiftRight: - opcode = kX86PsrlwRI; - break; + // TODO Add support for emulated byte shifts. default: LOG(FATAL) << "Unsupported shift operation on byte vector " << opcode; break; } - /* - * xmm1 will have low bits - * xmm2 will have high bits - * - * xmm2 = xmm1 - * xmm1 = xmm1 .<< N - * xmm2 = xmm2 && 0xFF00FF00FF00FF00FF00FF00FF00FF00 - * xmm2 = xmm2 .<< N - * xmm1 = xmm1 | xmm2 - */ - - // Copy xmm1. - NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_dest_src1.GetReg()); + // Clear xmm register and return if shift more than byte length. + int imm = mir->dalvikInsn.vB; + if (imm >= 8) { + NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_dest_src1.GetReg()); + return; + } // Shift lower values. NewLIR2(opcode, rs_dest_src1.GetReg(), imm); - // Mask bottom bits. - AndMaskVectorRegister(rs_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00); - - // Shift higher values. - NewLIR2(opcode, rs_tmp.GetReg(), imm); + /* + * The above shift will shift the whole word, but that means + * both the bytes will shift as well. To emulate a byte level + * shift, we can just throw away the lower (8 - N) bits of the + * upper byte, and we are done. + */ + uint8_t byte_mask = 0xFF << imm; + uint32_t int_mask = byte_mask; + int_mask = int_mask << 8 | byte_mask; + int_mask = int_mask << 8 | byte_mask; + int_mask = int_mask << 8 | byte_mask; - // Combine back into dest XMM register. 
- NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_tmp.GetReg()); + // And the destination with the mask + AndMaskVectorRegister(rs_dest_src1, int_mask, int_mask, int_mask, int_mask); } void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); int imm = mir->dalvikInsn.vB; int opcode = 0; switch (opsize) { @@ -1852,6 +1930,7 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); int imm = mir->dalvikInsn.vB; int opcode = 0; switch (opsize) { @@ -1866,6 +1945,8 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) { case kUnsignedByte: GenShiftByteVector(bb, mir); return; + case k64: + // TODO Implement emulated shift algorithm. default: LOG(FATAL) << "Unsupported vector signed shift right " << opsize; break; @@ -1877,6 +1958,7 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); int imm = mir->dalvikInsn.vB; int opcode = 0; switch (opsize) { @@ -1905,6 +1987,7 @@ void X86Mir2Lir::GenAndVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); } @@ -1913,6 +1996,7 @@ void X86Mir2Lir::GenOrVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); } @@ -1921,6 +2005,7 @@ void X86Mir2Lir::GenXorVector(BasicBlock *bb, MIR *mir) { // We only support 128 bit registers. DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA); + Clobber(rs_dest_src1); RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB); NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg()); } @@ -1945,134 +2030,240 @@ void X86Mir2Lir::MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32 void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) { OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); - RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); - RegLocation rl_dest = mir_graph_->GetDest(mir); - RegStorage rs_tmp; - - int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8; - int vec_unit_size = 0; - int opcode = 0; - int extr_opcode = 0; - RegLocation rl_result; + RegStorage vector_src = RegStorage::Solo128(mir->dalvikInsn.vB); + bool is_wide = opsize == k64 || opsize == kDouble; + + // Get the location of the virtual register. Since this bytecode is overloaded + // for different types (and sizes), we need different logic for each path. + // The design of bytecode uses same VR for source and destination. 
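The byte-shift emulation in GenShiftByteVector above shifts whole 16-bit words (psllw) and then masks off the bits that crossed a byte boundary with the replicated mask 0xFF << imm. A scalar sketch of one word, for the left-shift path only (editor's illustration; ShiftLeftBytePair is a made-up name):

    #include <cassert>
    #include <cstdint>

    uint16_t ShiftLeftBytePair(uint16_t word, int imm) {
      uint16_t shifted = static_cast<uint16_t>(word << imm);    // psllw on the whole word
      uint8_t byte_mask = static_cast<uint8_t>(0xFF << imm);    // mask the patch replicates
      uint16_t word_mask = static_cast<uint16_t>((byte_mask << 8) | byte_mask);
      return shifted & word_mask;                               // the final AndMaskVectorRegister
    }

    int main() {
      for (int imm = 0; imm < 8; ++imm) {
        uint16_t w = 0xABCD;
        uint8_t lo = static_cast<uint8_t>((w & 0xFF) << imm);
        uint8_t hi = static_cast<uint8_t>((w >> 8) << imm);
        assert(ShiftLeftBytePair(w, imm) == static_cast<uint16_t>((hi << 8) | lo));
      }
      return 0;
    }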
+ RegLocation rl_src, rl_dest, rl_result; + if (is_wide) { + rl_src = mir_graph_->GetSrcWide(mir, 0); + rl_dest = mir_graph_->GetDestWide(mir); + } else { + rl_src = mir_graph_->GetSrc(mir, 0); + rl_dest = mir_graph_->GetDest(mir); + } - switch (opsize) { - case k32: - extr_opcode = kX86PextrdRRI; - opcode = kX86PhadddRR; - vec_unit_size = 4; - break; - case kSignedByte: - case kUnsignedByte: - extr_opcode = kX86PextrbRRI; - opcode = kX86PhaddwRR; - vec_unit_size = 2; - break; - case kSignedHalf: - case kUnsignedHalf: - extr_opcode = kX86PextrwRRI; - opcode = kX86PhaddwRR; - vec_unit_size = 2; - break; - case kSingle: - rl_result = EvalLoc(rl_dest, kFPReg, true); - vec_unit_size = 4; - for (int i = 0; i < 3; i++) { - NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg()); - NewLIR3(kX86ShufpsRRI, rs_src1.GetReg(), rs_src1.GetReg(), 0x39); - } - NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg()); - StoreValue(rl_dest, rl_result); + // We need a temp for byte and short values + RegStorage temp; - // For single-precision floats, we are done here - return; - default: - LOG(FATAL) << "Unsupported vector add reduce " << opsize; - break; - } + // There is a different path depending on type and size. + if (opsize == kSingle) { + // Handle float case. + // TODO Add support for fast math (not value safe) and do horizontal add in that case. - int elems = vec_bytes / vec_unit_size; + rl_src = LoadValue(rl_src, kFPReg); + rl_result = EvalLoc(rl_dest, kFPReg, true); - // Emulate horizontal add instruction by reducing 2 vectors with 8 values before adding them again - // TODO is overflow handled correctly? - if (opsize == kSignedByte || opsize == kUnsignedByte) { - rs_tmp = Get128BitRegister(AllocTempWide()); + // Since we are doing an add-reduce, we move the reg holding the VR + // into the result so we include it in result. + OpRegCopy(rl_result.reg, rl_src.reg); + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); - // tmp = xmm1 .>> 8. - NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_src1.GetReg()); - NewLIR2(kX86PsrlwRI, rs_tmp.GetReg(), 8); + // Since FP must keep order of operation for value safety, we shift to low + // 32-bits and add to result. + for (int i = 0; i < 3; i++) { + NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39); + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); + } - // Zero extend low bits in xmm1. - AndMaskVectorRegister(rs_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF); - } + StoreValue(rl_dest, rl_result); + } else if (opsize == kDouble) { + // Handle double case. + rl_src = LoadValueWide(rl_src, kFPReg); + rl_result = EvalLocWide(rl_dest, kFPReg, true); + LOG(FATAL) << "Unsupported vector add reduce for double."; + } else if (opsize == k64) { + /* + * Handle long case: + * 1) Reduce the vector register to lower half (with addition). + * 1-1) Get an xmm temp and fill it with vector register. + * 1-2) Shift the xmm temp by 8-bytes. + * 1-3) Add the xmm temp to vector register that is being reduced. + * 2) Allocate temp GP / GP pair. + * 2-1) In 64-bit case, use movq to move result to a 64-bit GP. + * 2-2) In 32-bit case, use movd twice to move to 32-bit GP pair. + * 3) Finish the add reduction by doing what add-long/2addr does, + * but instead of having a VR as one of the sources, we have our temp GP. 
+ */ + RegStorage rs_tmp_vector = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86MovdqaRR, rs_tmp_vector.GetReg(), vector_src.GetReg()); + NewLIR2(kX86PsrldqRI, rs_tmp_vector.GetReg(), 8); + NewLIR2(kX86PaddqRR, vector_src.GetReg(), rs_tmp_vector.GetReg()); + FreeTemp(rs_tmp_vector); + + // We would like to be able to reuse the add-long implementation, so set up a fake + // register location to pass it. + RegLocation temp_loc = mir_graph_->GetBadLoc(); + temp_loc.core = 1; + temp_loc.wide = 1; + temp_loc.location = kLocPhysReg; + temp_loc.reg = AllocTempWide(); + + if (cu_->target64) { + DCHECK(!temp_loc.reg.IsPair()); + NewLIR2(kX86MovqrxRR, temp_loc.reg.GetReg(), vector_src.GetReg()); + } else { + NewLIR2(kX86MovdrxRR, temp_loc.reg.GetLowReg(), vector_src.GetReg()); + NewLIR2(kX86PsrlqRI, vector_src.GetReg(), 0x20); + NewLIR2(kX86MovdrxRR, temp_loc.reg.GetHighReg(), vector_src.GetReg()); + } - while (elems > 1) { - if (opsize == kSignedByte || opsize == kUnsignedByte) { - NewLIR2(opcode, rs_tmp.GetReg(), rs_tmp.GetReg()); + GenArithOpLong(Instruction::ADD_LONG_2ADDR, rl_dest, temp_loc, temp_loc); + } else if (opsize == kSignedByte || opsize == kUnsignedByte) { + RegStorage rs_tmp = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86PxorRR, rs_tmp.GetReg(), rs_tmp.GetReg()); + NewLIR2(kX86PsadbwRR, vector_src.GetReg(), rs_tmp.GetReg()); + NewLIR3(kX86PshufdRRI, rs_tmp.GetReg(), vector_src.GetReg(), 0x4e); + NewLIR2(kX86PaddbRR, vector_src.GetReg(), rs_tmp.GetReg()); + // Move to a GPR + temp = AllocTemp(); + NewLIR2(kX86MovdrxRR, temp.GetReg(), vector_src.GetReg()); + } else { + // Handle and the int and short cases together + + // Initialize as if we were handling int case. Below we update + // the opcode if handling byte or short. + int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8; + int vec_unit_size; + int horizontal_add_opcode; + int extract_opcode; + + if (opsize == kSignedHalf || opsize == kUnsignedHalf) { + extract_opcode = kX86PextrwRRI; + horizontal_add_opcode = kX86PhaddwRR; + vec_unit_size = 2; + } else if (opsize == k32) { + vec_unit_size = 4; + horizontal_add_opcode = kX86PhadddRR; + extract_opcode = kX86PextrdRRI; + } else { + LOG(FATAL) << "Unsupported vector add reduce " << opsize; + return; } - NewLIR2(opcode, rs_src1.GetReg(), rs_src1.GetReg()); - elems >>= 1; - } - // Combine the results if we separated them. - if (opsize == kSignedByte || opsize == kUnsignedByte) { - NewLIR2(kX86PaddbRR, rs_src1.GetReg(), rs_tmp.GetReg()); - } + int elems = vec_bytes / vec_unit_size; - // We need to extract to a GPR. - RegStorage temp = AllocTemp(); - NewLIR3(extr_opcode, temp.GetReg(), rs_src1.GetReg(), 0); + while (elems > 1) { + NewLIR2(horizontal_add_opcode, vector_src.GetReg(), vector_src.GetReg()); + elems >>= 1; + } - // Can we do this directly into memory? - rl_result = UpdateLocTyped(rl_dest, kCoreReg); - if (rl_result.location == kLocPhysReg) { - // Ensure res is in a core reg - rl_result = EvalLoc(rl_dest, kCoreReg, true); - OpRegReg(kOpAdd, rl_result.reg, temp); - StoreFinalValue(rl_dest, rl_result); - } else { - OpMemReg(kOpAdd, rl_result, temp.GetReg()); - } + // Handle this as arithmetic unary case. + ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - FreeTemp(temp); + // Extract to a GP register because this is integral typed. 
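The integral add-reduce path above repeatedly applies a horizontal add (phaddw/phaddd) with the register as both operands, halving the live element count each pass until lane 0 holds the full sum. A scalar sketch of that reduction for the four-lane phaddd case (editor's illustration; HorizontalReduce is a made-up name):

    #include <array>
    #include <cassert>
    #include <cstdint>

    int32_t HorizontalReduce(std::array<int32_t, 4> lanes) {
      int elems = 4;
      while (elems > 1) {
        for (int i = 0; i < elems / 2; ++i) {
          lanes[i] = lanes[2 * i] + lanes[2 * i + 1];  // adjacent-lane add per pass
        }
        elems >>= 1;
      }
      return lanes[0];
    }

    int main() {
      assert(HorizontalReduce({1, 2, 3, 4}) == 10);
      return 0;
    }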
+ temp = AllocTemp(); + NewLIR3(extract_opcode, temp.GetReg(), vector_src.GetReg(), 0); + } + + if (opsize != k64 && opsize != kSingle && opsize != kDouble) { + // The logic below looks very similar to the handling of ADD_INT_2ADDR + // except the rhs is not a VR but a physical register allocated above. + // No load of source VR is done because it assumes that rl_result will + // share physical register / memory location. + rl_result = UpdateLocTyped(rl_dest, kCoreReg); + if (rl_result.location == kLocPhysReg) { + // Ensure res is in a core reg. + rl_result = EvalLoc(rl_dest, kCoreReg, true); + OpRegReg(kOpAdd, rl_result.reg, temp); + StoreFinalValue(rl_dest, rl_result); + } else { + // Do the addition directly to memory. + OpMemReg(kOpAdd, rl_result, temp.GetReg()); + } + } } void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) { OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegLocation rl_dest = mir_graph_->GetDest(mir); - RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB); + RegStorage vector_src = RegStorage::Solo128(mir->dalvikInsn.vB); int extract_index = mir->dalvikInsn.arg[0]; int extr_opcode = 0; RegLocation rl_result; bool is_wide = false; - switch (opsize) { - case k32: - rl_result = UpdateLocTyped(rl_dest, kCoreReg); - extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI; - break; - case kSignedHalf: - case kUnsignedHalf: - rl_result= UpdateLocTyped(rl_dest, kCoreReg); - extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI; - break; - default: - LOG(FATAL) << "Unsupported vector add reduce " << opsize; - return; - break; - } + // There is a different path depending on type and size. + if (opsize == kSingle) { + // Handle float case. + // TODO Add support for fast math (not value safe) and do horizontal add in that case. - if (rl_result.location == kLocPhysReg) { - NewLIR3(extr_opcode, rl_result.reg.GetReg(), rs_src1.GetReg(), extract_index); - if (is_wide == true) { - StoreFinalValue(rl_dest, rl_result); + rl_result = EvalLoc(rl_dest, kFPReg, true); + NewLIR2(kX86PxorRR, rl_result.reg.GetReg(), rl_result.reg.GetReg()); + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); + + // Since FP must keep order of operation for value safety, we shift to low + // 32-bits and add to result. + for (int i = 0; i < 3; i++) { + NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39); + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); + } + + StoreValue(rl_dest, rl_result); + } else if (opsize == kDouble) { + // TODO Handle double case. + LOG(FATAL) << "Unsupported add reduce for double."; + } else if (opsize == k64) { + /* + * Handle long case: + * 1) Reduce the vector register to lower half (with addition). + * 1-1) Get an xmm temp and fill it with vector register. + * 1-2) Shift the xmm temp by 8-bytes. + * 1-3) Add the xmm temp to vector register that is being reduced. + * 2) Evaluate destination to a GP / GP pair. + * 2-1) In 64-bit case, use movq to move result to a 64-bit GP. + * 2-2) In 32-bit case, use movd twice to move to 32-bit GP pair. + * 3) Store the result to the final destination. 
+ */ + RegStorage rs_tmp_vector = Get128BitRegister(AllocTempDouble()); + NewLIR2(kX86MovdqaRR, rs_tmp_vector.GetReg(), vector_src.GetReg()); + NewLIR2(kX86PsrldqRI, rs_tmp_vector.GetReg(), 8); + NewLIR2(kX86PaddqRR, vector_src.GetReg(), rs_tmp_vector.GetReg()); + FreeTemp(rs_tmp_vector); + + rl_result = EvalLocWide(rl_dest, kCoreReg, true); + if (cu_->target64) { + DCHECK(!rl_result.reg.IsPair()); + NewLIR2(kX86MovqrxRR, rl_result.reg.GetReg(), vector_src.GetReg()); } else { - StoreFinalValueWide(rl_dest, rl_result); + NewLIR2(kX86MovdrxRR, rl_result.reg.GetLowReg(), vector_src.GetReg()); + NewLIR2(kX86PsrlqRI, vector_src.GetReg(), 0x20); + NewLIR2(kX86MovdrxRR, rl_result.reg.GetHighReg(), vector_src.GetReg()); } + + StoreValueWide(rl_dest, rl_result); } else { - int displacement = SRegOffset(rl_result.s_reg_low); - LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, rs_src1.GetReg()); - AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */); - AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */); + // Handle the rest of integral types now. + switch (opsize) { + case k32: + rl_result = UpdateLocTyped(rl_dest, kCoreReg); + extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI; + break; + case kSignedHalf: + case kUnsignedHalf: + rl_result= UpdateLocTyped(rl_dest, kCoreReg); + extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI; + break; + default: + LOG(FATAL) << "Unsupported vector reduce " << opsize; + return; + } + + if (rl_result.location == kLocPhysReg) { + NewLIR3(extr_opcode, rl_result.reg.GetReg(), vector_src.GetReg(), extract_index); + if (is_wide == true) { + StoreFinalValue(rl_dest, rl_result); + } else { + StoreFinalValueWide(rl_dest, rl_result); + } + } else { + int displacement = SRegOffset(rl_result.s_reg_low); + LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, vector_src.GetReg()); + AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */); + AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */); + } } } @@ -2080,96 +2271,113 @@ void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) { DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U); OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16); RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA); - int op_low = 0, op_high = 0, imm = 0, op_mov = kX86MovdxrRR; + Clobber(rs_dest); + int op_shuffle = 0, op_shuffle_high = 0, op_mov = kX86MovdxrRR; RegisterClass reg_type = kCoreReg; + bool is_wide = false; switch (opsize) { case k32: - op_low = kX86PshufdRRI; + op_shuffle = kX86PshufdRRI; break; case kSingle: - op_low = kX86PshufdRRI; - op_mov = kX86Mova128RR; + op_shuffle = kX86PshufdRRI; + op_mov = kX86MovdqaRR; reg_type = kFPReg; break; case k64: - op_low = kX86PshufdRRI; - imm = 0x44; - break; - case kDouble: - op_low = kX86PshufdRRI; - op_mov = kX86Mova128RR; - reg_type = kFPReg; - imm = 0x44; + op_shuffle = kX86PunpcklqdqRR; + op_mov = kX86MovqrxRR; + is_wide = true; break; case kSignedByte: case kUnsignedByte: - // Shuffle 8 bit value into 16 bit word. - // We set val = val + (val << 8) below and use 16 bit shuffle. + // We will have the source loaded up in a + // double-word before we use this shuffle + op_shuffle = kX86PshufdRRI; + break; case kSignedHalf: case kUnsignedHalf: // Handles low quadword. - op_low = kX86PshuflwRRI; + op_shuffle = kX86PshuflwRRI; // Handles upper quadword. 
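In the 32-bit branch of the long reduce above, a 64-bit XMM lane is moved into a GP pair with movd (low half), a psrlq by 32, and a second movd (high half). A scalar sketch of that split (editor's illustration; SplitQuad is a made-up name):

    #include <cassert>
    #include <cstdint>

    void SplitQuad(uint64_t lane, uint32_t* lo, uint32_t* hi) {
      *lo = static_cast<uint32_t>(lane);   // first movd (kX86MovdrxRR)
      lane >>= 32;                         // psrlq $0x20
      *hi = static_cast<uint32_t>(lane);   // second movd
    }

    int main() {
      uint32_t lo = 0, hi = 0;
      SplitQuad(0x1122334455667788ull, &lo, &hi);
      assert(lo == 0x55667788u && hi == 0x11223344u);
      return 0;
    }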
- op_high = kX86PshufdRRI; + op_shuffle_high = kX86PshufdRRI; break; default: LOG(FATAL) << "Unsupported vector set " << opsize; break; } - RegLocation rl_src = mir_graph_->GetSrc(mir, 0); - - // Load the value from the VR into the reg. - if (rl_src.wide == 0) { + // Load the value from the VR into a physical register. + RegLocation rl_src; + if (!is_wide) { + rl_src = mir_graph_->GetSrc(mir, 0); rl_src = LoadValue(rl_src, reg_type); } else { + rl_src = mir_graph_->GetSrcWide(mir, 0); rl_src = LoadValueWide(rl_src, reg_type); } + RegStorage reg_to_shuffle = rl_src.reg; - // If opsize is 8 bits wide then double value and use 16 bit shuffle instead. - if (opsize == kSignedByte || opsize == kUnsignedByte) { - RegStorage temp = AllocTemp(); - // val = val + (val << 8). - NewLIR2(kX86Mov32RR, temp.GetReg(), rl_src.reg.GetReg()); - NewLIR2(kX86Sal32RI, temp.GetReg(), 8); - NewLIR2(kX86Or32RR, rl_src.reg.GetReg(), temp.GetReg()); - FreeTemp(temp); + // Load the value into the XMM register. + if (!cu_->target64 && opsize == k64) { + // Logic assumes that longs are loaded in GP register pairs. + NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), reg_to_shuffle.GetLowReg()); + RegStorage r_tmp = AllocTempDouble(); + NewLIR2(kX86MovdxrRR, r_tmp.GetReg(), reg_to_shuffle.GetHighReg()); + NewLIR2(kX86PunpckldqRR, rs_dest.GetReg(), r_tmp.GetReg()); + FreeTemp(r_tmp); + } else { + NewLIR2(op_mov, rs_dest.GetReg(), reg_to_shuffle.GetReg()); } - // Load the value into the XMM register. - NewLIR2(op_mov, rs_dest.GetReg(), rl_src.reg.GetReg()); + if (opsize == kSignedByte || opsize == kUnsignedByte) { + // In the byte case, first duplicate it to be a word + // Then duplicate it to be a double-word + NewLIR2(kX86PunpcklbwRR, rs_dest.GetReg(), rs_dest.GetReg()); + NewLIR2(kX86PunpcklwdRR, rs_dest.GetReg(), rs_dest.GetReg()); + } // Now shuffle the value across the destination. - NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), imm); + if (op_shuffle == kX86PunpcklqdqRR) { + NewLIR2(op_shuffle, rs_dest.GetReg(), rs_dest.GetReg()); + } else { + NewLIR3(op_shuffle, rs_dest.GetReg(), rs_dest.GetReg(), 0); + } // And then repeat as needed. 
- if (op_high != 0) { - NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), imm); + if (op_shuffle_high != 0) { + NewLIR3(op_shuffle_high, rs_dest.GetReg(), rs_dest.GetReg(), 0); } } -LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) { - int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg); +void X86Mir2Lir::GenPackedArrayGet(BasicBlock *bb, MIR *mir) { + UNIMPLEMENTED(FATAL) << "Extended opcode kMirOpPackedArrayGet not supported."; +} + +void X86Mir2Lir::GenPackedArrayPut(BasicBlock *bb, MIR *mir) { + UNIMPLEMENTED(FATAL) << "Extended opcode kMirOpPackedArrayPut not supported."; +} + +LIR* X86Mir2Lir::ScanVectorLiteral(int32_t* constants) { for (LIR *p = const_vectors_; p != nullptr; p = p->next) { - if (args[0] == p->operands[0] && args[1] == p->operands[1] && - args[2] == p->operands[2] && args[3] == p->operands[3]) { + if (constants[0] == p->operands[0] && constants[1] == p->operands[1] && + constants[2] == p->operands[2] && constants[3] == p->operands[3]) { return p; } } return nullptr; } -LIR *X86Mir2Lir::AddVectorLiteral(MIR *mir) { +LIR* X86Mir2Lir::AddVectorLiteral(int32_t* constants) { LIR* new_value = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocData)); - int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg); - new_value->operands[0] = args[0]; - new_value->operands[1] = args[1]; - new_value->operands[2] = args[2]; - new_value->operands[3] = args[3]; + new_value->operands[0] = constants[0]; + new_value->operands[1] = constants[1]; + new_value->operands[2] = constants[2]; + new_value->operands[3] = constants[3]; new_value->next = const_vectors_; if (const_vectors_ == nullptr) { - estimated_native_code_size_ += 12; // Amount needed to align to 16 byte boundary. + estimated_native_code_size_ += 12; // Maximum needed to align to 16 byte boundary. } estimated_native_code_size_ += 16; // Space for one vector. const_vectors_ = new_value; |
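For the byte case of GenSetVector above, the scalar value is widened before the broadcast: punpcklbw with itself duplicates the byte into a word, punpcklwd duplicates that word into a double-word, and the pshufd with immediate 0 then replicates the double-word across the register. A scalar sketch of the widening step (editor's illustration; SplatByteToDword is a made-up name):

    #include <cassert>
    #include <cstdint>

    uint32_t SplatByteToDword(uint8_t value) {
      uint16_t word = static_cast<uint16_t>(value | (value << 8));                 // punpcklbw dst, dst
      return static_cast<uint32_t>(word) | (static_cast<uint32_t>(word) << 16);    // punpcklwd dst, dst
    }

    int main() {
      assert(SplatByteToDword(0xAB) == 0xABABABABu);
      return 0;
    }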