-rw-r--r--  compiler/dex/backend.h                  |  19
-rw-r--r--  compiler/dex/compiler_enums.h           |  92
-rw-r--r--  compiler/dex/mir_dataflow.cc            |  48
-rw-r--r--  compiler/dex/mir_graph.cc               |   5
-rw-r--r--  compiler/dex/quick/x86/assemble_x86.cc  |   9
-rw-r--r--  compiler/dex/quick/x86/codegen_x86.h    |  33
-rwxr-xr-x  compiler/dex/quick/x86/target_x86.cc    | 460
-rw-r--r--  compiler/dex/quick/x86/x86_lir.h        |   3
-rw-r--r--  disassembler/disassembler_x86.cc        |  20
9 files changed, 567 insertions, 122 deletions
diff --git a/compiler/dex/backend.h b/compiler/dex/backend.h
index 596b3c9802..1f24849257 100644
--- a/compiler/dex/backend.h
+++ b/compiler/dex/backend.h
@@ -28,6 +28,25 @@ class Backend {
   virtual void Materialize() = 0;
   virtual CompiledMethod* GetCompiledMethod() = 0;
 
+  // Queries for backend support for vectors
+  /*
+   * Return the number of bits in a vector register.
+   * @return 0 if vector registers are not supported, or the
+   * number of bits in the vector register if supported.
+   */
+  virtual int VectorRegisterSize() { return 0; }
+
+  /*
+   * Return the number of reservable vector registers supported
+   * @param fp_used 'true' if floating point computations will be
+   * executed while vector registers are reserved.
+   * @return the number of vector registers that are available
+   * @note The backend should ensure that sufficient vector registers
+   * are held back to generate scalar code without exhausting vector
+   * registers, if scalar code also uses the vector registers.
+   */
+  virtual int NumReservableVectorRegisters(bool fp_used) { return 0; }
+
 protected:
   explicit Backend(ArenaAllocator* arena) : arena_(arena) {}
   ArenaAllocator* const arena_;
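These queries let target-independent passes size their work before emitting vector MIRs. A minimal sketch of a hypothetical caller (only the two Backend methods come from this change; everything else is illustrative):

    // Hypothetical caller: decide whether a loop can be vectorized.
    bool CanVectorizeLoop(Backend* backend, int regs_needed, bool loop_uses_fp) {
      // A backend that reports 0 bits has no vector support at all.
      if (backend->VectorRegisterSize() == 0) {
        return false;
      }
      // Respect the @note above: only reserve what the backend can spare
      // while scalar code still shares the vector register file.
      return backend->NumReservableVectorRegisters(loop_uses_fp) >= regs_needed;
    }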
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index caecb7a48e..799a742032 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -133,91 +133,101 @@ enum ExtendedMIROpcode {
   // could be supported by using a bit in TypeSize and arg[0] where needed.
 
   // @brief MIR to move constant data to a vector register
-  // vA: number of bits in register
-  // vB: destination
+  // vA: destination
+  // vB: number of bits in register
   // args[0]~args[3]: up to 128 bits of data for initialization
   kMirOpConstVector,
 
   // @brief MIR to move a vectorized register to another
-  // vA: TypeSize
-  // vB: destination
-  // vC: source
+  // vA: destination
+  // vB: source
+  // vC: TypeSize
   kMirOpMoveVector,
 
   // @brief Packed multiply of units in two vector registers: vB = vB .* vC using vA to know the type of the vector.
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedMultiply,
 
   // @brief Packed addition of units in two vector registers: vB = vB .+ vC using vA to know the type of the vector.
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedAddition,
 
   // @brief Packed subtraction of units in two vector registers: vB = vB .- vC using vA to know the type of the vector.
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedSubtract,
 
   // @brief Packed shift left of units in two vector registers: vB = vB .<< vC using vA to know the type of the vector.
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: immediate
+  // vA: destination and source
+  // vB: amount to shift
+  // vC: TypeSize
   kMirOpPackedShiftLeft,
 
   // @brief Packed signed shift right of units in two vector registers: vB = vB .>> vC using vA to know the type of the vector.
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: immediate
+  // vA: destination and source
+  // vB: amount to shift
+  // vC: TypeSize
   kMirOpPackedSignedShiftRight,
 
   // @brief Packed unsigned shift right of units in two vector registers: vB = vB .>>> vC using vA to know the type of the vector.
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: immediate
+  // vA: destination and source
+  // vB: amount to shift
+  // vC: TypeSize
   kMirOpPackedUnsignedShiftRight,
 
   // @brief Packed bitwise and of units in two vector registers: vB = vB .& vC using vA to know the type of the vector.
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedAnd,
 
   // @brief Packed bitwise or of units in two vector registers: vB = vB .| vC using vA to know the type of the vector.
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedOr,
 
   // @brief Packed bitwise xor of units in two vector registers: vB = vB .^ vC using vA to know the type of the vector.
-  // vA: TypeSize
-  // vB: destination and source
-  // vC: source
+  // vA: destination and source
+  // vB: source
+  // vC: TypeSize
   kMirOpPackedXor,
 
   // @brief Reduce a 128-bit packed element into a single VR by taking lower bits
   // @details Instruction does a horizontal addition of the packed elements and then adds it to VR
-  // vA: TypeSize
-  // vB: destination and source VR (not vector register)
-  // vC: source (vector register)
+  // vA: destination and source VR (not vector register)
+  // vB: source (vector register)
+  // vC: TypeSize
   kMirOpPackedAddReduce,
 
   // @brief Extract a packed element into a single VR.
-  // vA: TypeSize
-  // vB: destination VR (not vector register)
-  // vC: source (vector register)
+  // vA: destination VR (not vector register)
+  // vB: source (vector register)
+  // vC: TypeSize
   // arg[0]: The index to use for extraction from vector register (which packed element)
   kMirOpPackedReduce,
 
   // @brief Create a vector value, with all TypeSize values equal to vC
-  // vA: TypeSize
-  // vB: destination vector register
-  // vC: source VR (not vector register)
+  // vA: destination vector register
+  // vB: source VR (not vector register)
+  // vC: TypeSize
   kMirOpPackedSet,
 
+  // @brief Reserve N vector registers (named 0..N-1)
+  // vA: Number of registers
+  // @note: The backend may choose to map vector numbers used in vector opcodes.
+  //  Reserved registers are removed from the backend's temporary register pool.
+  kMirOpReserveVectorRegisters,
+
+  // @brief Free reserved vector registers
+  // @note: All currently reserved vector registers are returned to the temporary pool.
+  kMirOpReturnVectorRegisters,
+
   kMirOpLast,
 };
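Under the new layout every vector MIR keeps its destination in vA and packs the element type above the vector width in vC. A small illustrative snippet of filling in such a MIR, mirroring how MaskVectorRegister (further down in this patch) builds one by hand; the register numbers are arbitrary:

    // Illustrative only: xmm2 .+ xmm3 over four 32-bit lanes.
    MIR mir;
    mir.dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpPackedAddition);
    mir.dalvikInsn.vA = 2;   // destination and first source (vector reg 2)
    mir.dalvikInsn.vB = 3;   // second source (vector reg 3)
    // vC: element type in the high 16 bits, vector width in bits in the low 16,
    // matching the (vC >> 16) / (vC & 0xFFFF) decoding in target_x86.cc below.
    mir.dalvikInsn.vC = (static_cast<uint32_t>(k32) << 16) | 128;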
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index 9fea709568..bc99a272a6 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -840,6 +840,54 @@ const uint64_t MIRGraph::oat_data_flow_attributes_[kMirOpLast] = {
 
   // 113 MIR_SELECT
   DF_DA | DF_UB,
+
+  // 114 MirOpConstVector
+  DF_DA,
+
+  // 115 MirOpMoveVector
+  0,
+
+  // 116 MirOpPackedMultiply
+  0,
+
+  // 117 MirOpPackedAddition
+  0,
+
+  // 118 MirOpPackedSubtract
+  0,
+
+  // 119 MirOpPackedShiftLeft
+  0,
+
+  // 120 MirOpPackedSignedShiftRight
+  0,
+
+  // 121 MirOpPackedUnsignedShiftRight
+  0,
+
+  // 122 MirOpPackedAnd
+  0,
+
+  // 123 MirOpPackedOr
+  0,
+
+  // 124 MirOpPackedXor
+  0,
+
+  // 125 MirOpPackedAddReduce
+  DF_DA | DF_UA,
+
+  // 126 MirOpPackedReduce
+  DF_DA,
+
+  // 127 MirOpPackedSet
+  DF_UB,
+
+  // 128 MirOpReserveVectorRegisters
+  0,
+
+  // 129 MirOpReturnVectorRegisters
+  0,
 };
 
 /* Return the base virtual register for a SSA name */
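A sketch of how these masks are consumed (illustrative fragment assuming MIRGraph scope; the helper name is hypothetical, only the table and DF_* bits are real):

    // Hypothetical helper: DF_DA / DF_UA mark vA as a Dalvik VR that the MIR
    // defines / uses. Pure vector-register operands carry no DF_* bits, which
    // is why most entries above are 0.
    bool MIRGraph::DefinesDalvikReg(MIR* mir) {
      uint64_t df_attributes = oat_data_flow_attributes_[mir->dalvikInsn.opcode];
      // e.g. kMirOpPackedAddReduce is DF_DA | DF_UA: its vA VR is read-modify-write.
      return (df_attributes & DF_DA) != 0;
    }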
" | " : " "); } } diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc index 7baf2d9663..8267e021de 100644 --- a/compiler/dex/quick/x86/assemble_x86.cc +++ b/compiler/dex/quick/x86/assemble_x86.cc @@ -405,9 +405,12 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0, EXT_0F_ENCODING_MAP(Haddpd, 0x66, 0x7C, REG_DEF0_USE0), EXT_0F_ENCODING_MAP(Haddps, 0xF2, 0x7C, REG_DEF0_USE0), - { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" }, - { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, false }, "PextwRRI", "!0r,!1r,!2d" }, - { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" }, + { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1, false }, "PextbRRI", "!0r,!1r,!2d" }, + { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1, false }, "PextwRRI", "!0r,!1r,!2d" }, + { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "PextdRRI", "!0r,!1r,!2d" }, + { kX86PextrbMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrbMRI", "[!0r+!1d],!2r,!3d" }, + { kX86PextrwMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrwMRI", "[!0r+!1d],!2r,!3d" }, + { kX86PextrdMRI, kMemRegImm, IS_QUAD_OP | REG_USE02 | IS_STORE, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1, false }, "kX86PextrdMRI", "[!0r+!1d],!2r,!3d" }, { kX86PshuflwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0xF2, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuflwRRI", "!0r,!1r,!2d" }, { kX86PshufdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x70, 0, 0, 0, 1, false }, "PshuffRRI", "!0r,!1r,!2d" }, diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h index 123fe90d03..123087fe29 100644 --- a/compiler/dex/quick/x86/codegen_x86.h +++ b/compiler/dex/quick/x86/codegen_x86.h @@ -118,6 +118,8 @@ class X86Mir2Lir : public Mir2Lir { void FreeCallTemps(); void LockCallTemps(); void CompilerInitializeRegAlloc(); + int VectorRegisterSize(); + int NumReservableVectorRegisters(bool fp_used); // Required for target - miscellaneous. void AssembleLIR(); @@ -503,6 +505,11 @@ class X86Mir2Lir : public Mir2Lir { void GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1, int64_t val, ConditionCode ccode); void GenConstWide(RegLocation rl_dest, int64_t value); + void GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir); + void GenShiftByteVector(BasicBlock *bb, MIR *mir); + void AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4); + void MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4); + void AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir); static bool ProvidesFullMemoryBarrier(X86OpCode opcode); @@ -513,6 +520,12 @@ class X86Mir2Lir : public Mir2Lir { virtual RegStorage AllocateByteRegister(); /* + * @brief Use a wide temporary as a 128-bit register + * @returns a 128-bit temporary register. + */ + virtual RegStorage Get128BitRegister(RegStorage reg); + + /* * @brief Check if a register is byte addressable. * @returns true if a register is byte addressable. 
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 123fe90d03..123087fe29 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -118,6 +118,8 @@ class X86Mir2Lir : public Mir2Lir {
   void FreeCallTemps();
   void LockCallTemps();
   void CompilerInitializeRegAlloc();
+  int VectorRegisterSize();
+  int NumReservableVectorRegisters(bool fp_used);
 
   // Required for target - miscellaneous.
   void AssembleLIR();
@@ -503,6 +505,11 @@ class X86Mir2Lir : public Mir2Lir {
   void GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1, int64_t val,
                                 ConditionCode ccode);
   void GenConstWide(RegLocation rl_dest, int64_t value);
+  void GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir);
+  void GenShiftByteVector(BasicBlock *bb, MIR *mir);
+  void AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4);
+  void MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4);
+  void AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir);
 
   static bool ProvidesFullMemoryBarrier(X86OpCode opcode);
@@ -513,6 +520,12 @@ class X86Mir2Lir : public Mir2Lir {
   virtual RegStorage AllocateByteRegister();
 
   /*
+   * @brief Use a wide temporary as a 128-bit register
+   * @returns a 128-bit temporary register.
+   */
+  virtual RegStorage Get128BitRegister(RegStorage reg);
+
+  /*
    * @brief Check if a register is byte addressable.
    * @returns true if a register is byte addressable.
    */
@@ -527,6 +540,22 @@ class X86Mir2Lir : public Mir2Lir {
    */
   bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
 
+  /**
+   * @brief Reserve a fixed number of vector registers from the register pool
+   * @details The mir->dalvikInsn.vA specifies an N such that vector registers
+   * [0..N-1] are removed from the temporary pool. The caller must call
+   * ReturnVectorRegisters before calling ReserveVectorRegisters again.
+   * Also sets num_reserved_vector_regs_ to the specified value.
+   * @param mir whose vA specifies the number of registers to reserve
+   */
+  void ReserveVectorRegisters(MIR* mir);
+
+  /**
+   * @brief Return all the reserved vector registers to the temp pool
+   * @details Returns registers [0..num_reserved_vector_regs_-1]
+   */
+  void ReturnVectorRegisters();
+
   /*
    * @brief Load 128 bit constant into vector register.
    * @param bb The basic block in which the MIR is from.
@@ -900,6 +929,10 @@ class X86Mir2Lir : public Mir2Lir {
   LIR *AddVectorLiteral(MIR *mir);
 
   InToRegStorageMapping in_to_reg_storage_mapping_;
+
+ private:
+  // The number of vector registers [0..N-1] reserved by a call to ReserveVectorRegisters
+  int num_reserved_vector_regs_;
 };
 
 }  // namespace art
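A hedged sketch of the bracketing contract at the MIR level (the allocation and insertion helpers are stand-ins for a pass's own plumbing; only the opcodes and the meaning of vA come from this change):

    // Hypothetical pass code: bracket a vectorized region of a basic block.
    MIR* reserve = NewMirNoOperands();  // illustrative allocator
    reserve->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpReserveVectorRegisters);
    reserve->dalvikInsn.vA = 4;         // vector regs 0..3 leave the temp pool

    MIR* give_back = NewMirNoOperands();
    give_back->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpReturnVectorRegisters);

    bb->InsertMIRBefore(first_vector_mir, reserve);  // illustrative insertion
    bb->InsertMIRAfter(last_vector_mir, give_back);  // temps return to the pool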
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 72e47d06b1..7791e138fd 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -427,6 +427,10 @@ RegStorage X86Mir2Lir::AllocateByteRegister() {
   return reg;
 }
 
+RegStorage X86Mir2Lir::Get128BitRegister(RegStorage reg) {
+  return GetRegInfo(reg)->FindMatchingView(RegisterInfo::k128SoloStorageMask)->GetReg();
+}
+
 bool X86Mir2Lir::IsByteRegister(RegStorage reg) {
   return cu_->target64 || reg.GetRegNum() < rs_rX86_SP.GetRegNum();
 }
@@ -646,6 +650,14 @@ void X86Mir2Lir::CompilerInitializeRegAlloc() {
   reg_pool_->next_dp_reg_ = 1;
 }
 
+int X86Mir2Lir::VectorRegisterSize() {
+  return 128;
+}
+
+int X86Mir2Lir::NumReservableVectorRegisters(bool fp_used) {
+  return fp_used ? 5 : 7;
+}
+
 void X86Mir2Lir::SpillCoreRegs() {
   if (num_core_spills_ == 0) {
     return;
@@ -790,6 +802,9 @@ X86Mir2Lir::X86Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator*
   rX86_RET1 = rDX;
   rX86_INVOKE_TGT = rAX;
   rX86_COUNT = rCX;
+
+  // Initialize the number of reserved vector registers
+  num_reserved_vector_regs_ = -1;
 }
 
 Mir2Lir* X86CodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph,
@@ -1358,6 +1373,12 @@ std::vector<uint8_t>* X86Mir2Lir::ReturnCallFrameInformation() {
 
 void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
   switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
+    case kMirOpReserveVectorRegisters:
+      ReserveVectorRegisters(mir);
+      break;
+    case kMirOpReturnVectorRegisters:
+      ReturnVectorRegisters();
+      break;
     case kMirOpConstVector:
       GenConst128(bb, mir);
       break;
@@ -1405,11 +1426,57 @@ void X86Mir2Lir::GenMachineSpecificExtendedMethodMIR(BasicBlock* bb, MIR* mir) {
   }
 }
 
+void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) {
+  // We should not try to reserve twice without returning the registers.
+  DCHECK_EQ(num_reserved_vector_regs_, -1);
+
+  int num_vector_reg = mir->dalvikInsn.vA;
+  for (int i = 0; i < num_vector_reg; i++) {
+    RegStorage xp_reg = RegStorage::Solo128(i);
+    RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
+    Clobber(xp_reg);
+
+    for (RegisterInfo *info = xp_reg_info->GetAliasChain();
+         info != nullptr;
+         info = info->GetAliasChain()) {
+      if (info->GetReg().IsSingle()) {
+        reg_pool_->sp_regs_.Delete(info);
+      } else {
+        reg_pool_->dp_regs_.Delete(info);
+      }
+    }
+  }
+
+  num_reserved_vector_regs_ = num_vector_reg;
+}
+
+void X86Mir2Lir::ReturnVectorRegisters() {
+  // Return all the reserved registers.
+  for (int i = 0; i < num_reserved_vector_regs_; i++) {
+    RegStorage xp_reg = RegStorage::Solo128(i);
+    RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
+
+    for (RegisterInfo *info = xp_reg_info->GetAliasChain();
+         info != nullptr;
+         info = info->GetAliasChain()) {
+      if (info->GetReg().IsSingle()) {
+        reg_pool_->sp_regs_.Insert(info);
+      } else {
+        reg_pool_->dp_regs_.Insert(info);
+      }
+    }
+  }
+
+  // We no longer have any reserved vector registers.
+  num_reserved_vector_regs_ = -1;
+}
+
 void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
-  int type_size = mir->dalvikInsn.vA;
+  store_method_addr_used_ = true;
+  int type_size = mir->dalvikInsn.vB;
   // We support 128 bit vectors.
   DCHECK_EQ(type_size & 0xFFFF, 128);
-  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
   uint32_t *args = mir->dalvikInsn.arg;
   int reg = rs_dest.GetReg();
   // Check for all 0 case.
@@ -1417,6 +1484,12 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
     NewLIR2(kX86XorpsRR, reg, reg);
     return;
   }
+
+  // Append the mov const vector to reg opcode.
+  AppendOpcodeWithConst(kX86MovupsRM, reg, mir);
+}
+
+void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) {
   // Okay, load it from the constant vector area.
   LIR *data_target = ScanVectorLiteral(mir);
   if (data_target == nullptr) {
@@ -1436,24 +1509,66 @@ void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
   // 4 byte offset. We will fix this up in the assembler later to have the right
   // value.
   ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral);
-  LIR *load = NewLIR3(kX86Mova128RM, reg, rl_method.reg.GetReg(), 256 /* bogus */);
+  LIR *load = NewLIR2(opcode, reg, rl_method.reg.GetReg());
   load->flags.fixup = kFixupLoad;
   load->target = data_target;
 }
 
 void X86Mir2Lir::GenMoveVector(BasicBlock *bb, MIR *mir) {
   // We only support 128 bit registers.
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);
   NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg());
 }
 
+void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) {
+  const int BYTE_SIZE = 8;
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempWide());
+
+  /*
+   * Emulate the behavior of a kSignedByte by separating out the 16 values in the two XMM
+   * registers and multiplying 8 at a time before recombining back into one XMM register.
+   *
+   *   let xmm1, xmm2 be real srcs (keep low bits of 16bit lanes)
+   *       xmm3 is tmp             (operate on high bits of 16bit lanes)
+   *
+   *    xmm3 = xmm1
+   *    xmm1 = xmm1 .* xmm2
+   *    xmm1 = xmm1 & 0x00ff00ff00ff00ff00ff00ff00ff00ff  // xmm1 now has low bits
+   *    xmm3 = xmm3 .>> 8
+   *    xmm2 = xmm2 & 0xff00ff00ff00ff00ff00ff00ff00ff00
+   *    xmm2 = xmm2 .* xmm3                               // xmm2 now has high bits
+   *    xmm1 = xmm1 | xmm2                                // combine results
+   */
+
+  // Copy xmm1.
+  NewLIR2(kX86Mova128RR, rs_src1_high_tmp.GetReg(), rs_dest_src1.GetReg());
+
+  // Multiply low bits.
+  NewLIR2(kX86PmullwRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+
+  // xmm1 now has low bits.
+  AndMaskVectorRegister(rs_dest_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
+
+  // Prepare high bits for multiplication.
+  NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), BYTE_SIZE);
+  AndMaskVectorRegister(rs_src2, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
+
+  // Multiply high bits; xmm2 now has the high products.
+  NewLIR2(kX86PmullwRR, rs_src2.GetReg(), rs_src1_high_tmp.GetReg());
+
+  // Combine back into dest XMM register.
+  NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
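The lane arithmetic is easier to verify in scalar form. A model of one 16-bit lane holding two packed bytes, following exactly the emitted pmullw/mask/psrlw sequence (pure illustration, not compiler code):

    #include <cstdint>

    // One 16-bit lane holds bytes (a1:a0) and (b1:b0). pmullw keeps the low
    // 16 bits of each lane product, so the low byte of a0*b0 survives the
    // 0x00FF mask, and a1*b1 lands pre-shifted into the high byte.
    uint16_t MulPackedBytes(uint16_t a, uint16_t b) {
      uint16_t low  = static_cast<uint16_t>(a * b) & 0x00FFu;          // a0*b0, low byte
      uint16_t high = static_cast<uint16_t>((a >> 8) * (b & 0xFF00u)); // (a1*b1) << 8
      return low | high;  // the por at the end of the sequence
    }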
 
 void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1468,6 +1583,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
     case kDouble:
       opcode = kX86MulpdRR;
       break;
+    case kSignedByte:
+      // HW doesn't support 16x16 byte multiplication so emulate it.
+      GenMultiplyVectorSignedByte(bb, mir);
+      return;
     default:
       LOG(FATAL) << "Unsupported vector multiply " << opsize;
       break;
@@ -1476,10 +1595,10 @@ void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
 }
 
 void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1507,10 +1626,10 @@ void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) {
 }
 
 void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1537,11 +1656,60 @@ void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
   NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
 
+void X86Mir2Lir::GenShiftByteVector(BasicBlock *bb, MIR *mir) {
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_tmp = Get128BitRegister(AllocTempWide());
+
+  int opcode = 0;
+  int imm = mir->dalvikInsn.vB;
+
+  switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
+    case kMirOpPackedShiftLeft:
+      opcode = kX86PsllwRI;
+      break;
+    case kMirOpPackedSignedShiftRight:
+      opcode = kX86PsrawRI;
+      break;
+    case kMirOpPackedUnsignedShiftRight:
+      opcode = kX86PsrlwRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported shift operation on byte vector " << opcode;
+      break;
+  }
+
+  /*
+   * xmm1 will have low bits
+   * xmm2 will have high bits
+   *
+   *    xmm2 = xmm1
+   *    xmm1 = xmm1 .<< N
+   *    xmm2 = xmm2 && 0xFF00FF00FF00FF00FF00FF00FF00FF00
+   *    xmm2 = xmm2 .<< N
+   *    xmm1 = xmm1 | xmm2
+   */
+
+  // Copy xmm1.
+  NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_dest_src1.GetReg());
+
+  // Shift lower values.
+  NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+
+  // Mask bottom bits.
+  AndMaskVectorRegister(rs_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
+
+  // Shift higher values.
+  NewLIR2(opcode, rs_tmp.GetReg(), imm);
+
+  // Combine back into dest XMM register.
+  NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_tmp.GetReg());
+}
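SSE has 16-bit packed shifts but no per-byte shifts, hence the mask-plus-psllw/psraw/psrlw emulation above. A scalar model of the masking idea for the left-shift case (illustration only, not the emitted sequence; here both halves are masked so the cross-byte spill visibly drops out):

    #include <cstdint>

    // Emulate a per-byte left shift inside one 16-bit lane using only a
    // 16-bit shift and byte masks.
    uint16_t ShiftBytesLeft(uint16_t lane, int n) {
      uint16_t low  = static_cast<uint16_t>(lane << n) & 0x00FFu;            // low byte
      uint16_t high = static_cast<uint16_t>((lane & 0xFF00u) << n) & 0xFF00u; // high byte
      return high | low;
    }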
 
 void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int imm = mir->dalvikInsn.vC;
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  int imm = mir->dalvikInsn.vB;
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1554,6 +1722,10 @@ void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
     case kUnsignedHalf:
       opcode = kX86PsllwRI;
       break;
+    case kSignedByte:
+    case kUnsignedByte:
+      GenShiftByteVector(bb, mir);
+      return;
     default:
       LOG(FATAL) << "Unsupported vector shift left " << opsize;
       break;
@@ -1562,10 +1734,10 @@ void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
 }
 
 void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int imm = mir->dalvikInsn.vC;
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  int imm = mir->dalvikInsn.vB;
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1575,6 +1747,10 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
     case kUnsignedHalf:
       opcode = kX86PsrawRI;
       break;
+    case kSignedByte:
+    case kUnsignedByte:
+      GenShiftByteVector(bb, mir);
+      return;
     default:
       LOG(FATAL) << "Unsupported vector signed shift right " << opsize;
       break;
@@ -1583,10 +1759,10 @@ void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
 }
 
 void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int imm = mir->dalvikInsn.vC;
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  int imm = mir->dalvikInsn.vB;
   int opcode = 0;
   switch (opsize) {
     case k32:
@@ -1599,6 +1775,10 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
     case kUnsignedHalf:
       opcode = kX86PsrlwRI;
       break;
+    case kSignedByte:
+    case kUnsignedByte:
+      GenShiftByteVector(bb, mir);
+      return;
     default:
       LOG(FATAL) << "Unsupported vector unsigned shift right " << opsize;
       break;
@@ -1608,91 +1788,209 @@ void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
 }
 
 void X86Mir2Lir::GenAndVector(BasicBlock *bb, MIR *mir) {
   // We only support 128 bit registers.
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
 
 void X86Mir2Lir::GenOrVector(BasicBlock *bb, MIR *mir) {
   // We only support 128 bit registers.
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
 
 void X86Mir2Lir::GenXorVector(BasicBlock *bb, MIR *mir) {
   // We only support 128 bit registers.
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
 
+void X86Mir2Lir::AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3, uint32_t m4) {
+  MaskVectorRegister(kX86PandRM, rs_src1, m1, m2, m3, m4);
+}
+
+void X86Mir2Lir::MaskVectorRegister(X86OpCode opcode, RegStorage rs_src1, uint32_t m0, uint32_t m1, uint32_t m2, uint32_t m3) {
+  // Create temporary MIR as container for 128-bit binary mask.
+  MIR const_mir;
+  MIR* const_mirp = &const_mir;
+  const_mirp->dalvikInsn.opcode = static_cast<Instruction::Code>(kMirOpConstVector);
+  const_mirp->dalvikInsn.arg[0] = m0;
+  const_mirp->dalvikInsn.arg[1] = m1;
+  const_mirp->dalvikInsn.arg[2] = m2;
+  const_mirp->dalvikInsn.arg[3] = m3;
+
+  // Mask vector with const from the literal pool.
+  AppendOpcodeWithConst(opcode, rs_src1.GetReg(), const_mirp);
+}
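Concretely, the four words become one 128-bit literal in the constant area, which the masking opcode then reads memory-to-register. For example, the mask used throughout this patch to keep the low byte of every 16-bit lane:

    // Illustrative: and the vector register with a 128-bit literal built
    // from four 32-bit words staged through the temporary MIR above.
    AndMaskVectorRegister(rs_src, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
    // emits roughly: pand xmm<n>, [rip-relative 0x00FF00FF00FF00FF00FF00FF00FF00FF]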
 
 void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int imm = mir->dalvikInsn.vC;
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegLocation rl_dest = mir_graph_->GetDest(mir);
+  RegStorage rs_tmp;
+
+  int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8;
+  int vec_unit_size = 0;
   int opcode = 0;
+  int extr_opcode = 0;
+  RegLocation rl_result;
+
   switch (opsize) {
     case k32:
+      extr_opcode = kX86PextrdRRI;
       opcode = kX86PhadddRR;
+      vec_unit_size = 4;
+      break;
+    case kSignedByte:
+    case kUnsignedByte:
+      extr_opcode = kX86PextrbRRI;
+      opcode = kX86PhaddwRR;
+      vec_unit_size = 2;
       break;
     case kSignedHalf:
     case kUnsignedHalf:
+      extr_opcode = kX86PextrwRRI;
       opcode = kX86PhaddwRR;
+      vec_unit_size = 2;
       break;
+    case kSingle:
+      rl_result = EvalLoc(rl_dest, kFPReg, true);
+      vec_unit_size = 4;
+      for (int i = 0; i < 3; i++) {
+        NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
+        NewLIR3(kX86ShufpsRRI, rs_src1.GetReg(), rs_src1.GetReg(), 0x39);
+      }
+      NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
+      StoreValue(rl_dest, rl_result);
+
+      // For single-precision floats, we are done here.
+      return;
     default:
       LOG(FATAL) << "Unsupported vector add reduce " << opsize;
       break;
   }
-  NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+
+  int elems = vec_bytes / vec_unit_size;
+
+  // Emulate the horizontal add by splitting the 16 bytes into two vectors of
+  // 8 widened values and reducing those before recombining.
+  // TODO is overflow handled correctly?
+  if (opsize == kSignedByte || opsize == kUnsignedByte) {
+    rs_tmp = Get128BitRegister(AllocTempWide());
+
+    // tmp = xmm1 .>> 8.
+    NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_src1.GetReg());
+    NewLIR2(kX86PsrlwRI, rs_tmp.GetReg(), 8);
+
+    // Zero extend low bits in xmm1.
+    AndMaskVectorRegister(rs_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
+  }
+
+  while (elems > 1) {
+    if (opsize == kSignedByte || opsize == kUnsignedByte) {
+      NewLIR2(opcode, rs_tmp.GetReg(), rs_tmp.GetReg());
+    }
+    NewLIR2(opcode, rs_src1.GetReg(), rs_src1.GetReg());
+    elems >>= 1;
+  }
+
+  // Combine the results if we separated them.
+  if (opsize == kSignedByte || opsize == kUnsignedByte) {
+    NewLIR2(kX86PaddbRR, rs_src1.GetReg(), rs_tmp.GetReg());
+  }
+
+  // We need to extract to a GPR.
+  RegStorage temp = AllocTemp();
+  NewLIR3(extr_opcode, temp.GetReg(), rs_src1.GetReg(), 0);
+
+  // Can we do this directly into memory?
+  rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+  if (rl_result.location == kLocPhysReg) {
+    // Ensure res is in a core reg.
+    rl_result = EvalLoc(rl_dest, kCoreReg, true);
+    OpRegReg(kOpAdd, rl_result.reg, temp);
+    StoreFinalValue(rl_dest, rl_result);
+  } else {
+    OpMemReg(kOpAdd, rl_result, temp.GetReg());
+  }
+
+  FreeTemp(temp);
 }
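The while loop is a log2 reduction: each phaddw round sums adjacent lanes, halving the element count. A scalar model of what three rounds do to eight widened byte values (illustration only):

    #include <cstdint>

    // After log2(8) = 3 adjacent-pair rounds, lane 0 holds the sum of all
    // eight inputs; the compiler then extracts it with pextrw/pextrb.
    uint16_t HorizontalAdd(uint16_t v[8]) {
      for (int elems = 8; elems > 1; elems >>= 1) {
        for (int i = 0; i < elems / 2; i++) {
          v[i] = static_cast<uint16_t>(v[2 * i] + v[2 * i + 1]);
        }
      }
      return v[0];
    }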
 
 void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int index = mir->dalvikInsn.arg[0];
-  int opcode = 0;
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegLocation rl_dest = mir_graph_->GetDest(mir);
+  RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  int extract_index = mir->dalvikInsn.arg[0];
+  int extr_opcode = 0;
+  RegLocation rl_result;
+  bool is_wide = false;
+
   switch (opsize) {
     case k32:
-      opcode = kX86PextrdRRI;
+      rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+      extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdRRI : kX86PextrdMRI;
       break;
     case kSignedHalf:
     case kUnsignedHalf:
-      opcode = kX86PextrwRRI;
-      break;
-    case kUnsignedByte:
-    case kSignedByte:
-      opcode = kX86PextrbRRI;
+      rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+      extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwRRI : kX86PextrwMRI;
       break;
     default:
       LOG(FATAL) << "Unsupported vector reduce " << opsize;
+      return;
       break;
   }
-  // We need to extract to a GPR.
-  RegStorage temp = AllocTemp();
-  NewLIR3(opcode, temp.GetReg(), rs_src.GetReg(), index);
-  // Assume that the destination VR is in the def for the mir.
-  RegLocation rl_dest = mir_graph_->GetDest(mir);
-  RegLocation rl_temp =
-    {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, temp, INVALID_SREG, INVALID_SREG};
-  StoreValue(rl_dest, rl_temp);
+
+  if (rl_result.location == kLocPhysReg) {
+    NewLIR3(extr_opcode, rl_result.reg.GetReg(), rs_src1.GetReg(), extract_index);
+    if (is_wide) {
+      StoreFinalValueWide(rl_dest, rl_result);
+    } else {
+      StoreFinalValue(rl_dest, rl_result);
+    }
+  } else {
+    int displacement = SRegOffset(rl_result.s_reg_low);
+    LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, rs_src1.GetReg());
+    AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */);
+    AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */);
+  }
 }
 
 void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) {
-  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
-  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
-  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
-  int op_low = 0, op_high = 0;
+  DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+  int op_low = 0, op_high = 0, imm = 0, op_mov = kX86MovdxrRR;
+  RegisterClass reg_type = kCoreReg;
+
   switch (opsize) {
     case k32:
       op_low = kX86PshufdRRI;
       break;
+    case kSingle:
+      op_low = kX86PshufdRRI;
+      op_mov = kX86Mova128RR;
+      reg_type = kFPReg;
+      break;
+    case k64:
+      op_low = kX86PshufdRRI;
+      imm = 0x44;
+      break;
+    case kDouble:
+      op_low = kX86PshufdRRI;
+      op_mov = kX86Mova128RR;
+      reg_type = kFPReg;
+      imm = 0x44;
+      break;
+    case kSignedByte:
+    case kUnsignedByte:
+      // Shuffle 8 bit value into 16 bit word.
+      // We set val = val + (val << 8) below and use 16 bit shuffle.
     case kSignedHalf:
     case kUnsignedHalf:
       // Handles low quadword.
@@ -1705,23 +2003,37 @@ void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) {
       break;
   }
 
-  // Load the value from the VR into a GPR.
   RegLocation rl_src = mir_graph_->GetSrc(mir, 0);
-  rl_src = LoadValue(rl_src, kCoreReg);
+
+  // Load the value from the VR into the reg.
+  if (rl_src.wide == 0) {
+    rl_src = LoadValue(rl_src, reg_type);
+  } else {
+    rl_src = LoadValueWide(rl_src, reg_type);
+  }
+
+  // If opsize is 8 bits wide then double value and use 16 bit shuffle instead.
+  if (opsize == kSignedByte || opsize == kUnsignedByte) {
+    RegStorage temp = AllocTemp();
+    // val = val + (val << 8).
+    NewLIR2(kX86Mov32RR, temp.GetReg(), rl_src.reg.GetReg());
+    NewLIR2(kX86Sal32RI, temp.GetReg(), 8);
+    NewLIR2(kX86Or32RR, rl_src.reg.GetReg(), temp.GetReg());
+    FreeTemp(temp);
+  }
 
   // Load the value into the XMM register.
-  NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), rl_src.reg.GetReg());
+  NewLIR2(op_mov, rs_dest.GetReg(), rl_src.reg.GetReg());
 
   // Now shuffle the value across the destination.
-  NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+  NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), imm);
 
   // And then repeat as needed.
   if (op_high != 0) {
-    NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+    NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), imm);
   }
 }
-
 LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) {
   int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
   for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
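The shuffle immediates do the broadcasting: pshufd selects each destination dword through a two-bit field of the immediate, so 0 replicates dword 0 across the register and 0x44 (0b01000100, lanes {0,1,0,1}) duplicates the low quadword for the 64-bit cases. A scalar model of the selection (illustration only):

    #include <cstdint>

    // pshufd semantics: result lane i takes the source lane named by
    // bits [2i+1:2i] of the immediate.
    uint32_t PshufdLane(const uint32_t src[4], uint8_t imm, int result_lane) {
      return src[(imm >> (2 * result_lane)) & 3];
    }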
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index e271e9d100..d361be7e1b 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -569,6 +569,9 @@ enum X86OpCode {
   kX86PextrbRRI,                // Extract 8 bits from XMM into GPR
   kX86PextrwRRI,                // Extract 16 bits from XMM into GPR
   kX86PextrdRRI,                // Extract 32 bits from XMM into GPR
+  kX86PextrbMRI,                // Extract 8 bits from XMM into memory
+  kX86PextrwMRI,                // Extract 16 bits from XMM into memory
+  kX86PextrdMRI,                // Extract 32 bits from XMM into memory
   kX86PshuflwRRI,               // Shuffle 16 bits in lower 64 bits of XMM.
   kX86PshufdRRI,                // Shuffle 32 bits in XMM.
   kX86ShufpsRRI,                // FP Shuffle 32 bits in XMM.
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index e6cbf05744..80ddbd5809 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -426,6 +426,20 @@ DISASSEMBLER_ENTRY(cmp,
         instr++;
         if (prefix[2] == 0x66) {
           switch (*instr) {
+            case 0x01:
+              opcode << "phaddw";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            case 0x02:
+              opcode << "phaddd";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
             case 0x40:
               opcode << "pmulld";
               prefix[2] = 0;
@@ -449,7 +463,7 @@ DISASSEMBLER_ENTRY(cmp,
               prefix[2] = 0;
               has_modrm = true;
               store = true;
-              dst_reg_file = SSE;
+              src_reg_file = SSE;
               immediate_bytes = 1;
               break;
             case 0x16:
@@ -457,7 +471,7 @@ DISASSEMBLER_ENTRY(cmp,
               prefix[2] = 0;
               has_modrm = true;
               store = true;
-              dst_reg_file = SSE;
+              src_reg_file = SSE;
               immediate_bytes = 1;
               break;
             default:
@@ -742,7 +756,7 @@ DISASSEMBLER_ENTRY(cmp,
             prefix[2] = 0;
             has_modrm = true;
             store = true;
-            src_reg_file = dst_reg_file = SSE;
+            src_reg_file = SSE;
             immediate_bytes = 1;
           } else {
             opcode << StringPrintf("unknown opcode '0F %02X'", *instr);