aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorChris Lattner <sabre@nondot.org>2007-12-29 06:41:28 +0000
committerChris Lattner <sabre@nondot.org>2007-12-29 06:41:28 +0000
commit40758734ced401f1e71b4c840e67d1f81bb6c640 (patch)
tree65c56b163f38ef5fbd9d996e8d81c7e006ce31d6 /lib
parent447d8e8978ea81da25e7cac988a57998ce2cba6d (diff)
downloadexternal_llvm-40758734ced401f1e71b4c840e67d1f81bb6c640.tar.gz
external_llvm-40758734ced401f1e71b4c840e67d1f81bb6c640.tar.bz2
external_llvm-40758734ced401f1e71b4c840e67d1f81bb6c640.zip
avoid going through a stack slot to convert from fpstack to xmm reg
if we are just going to store it back anyway. This improves things like: double foo(); void bar(double *P) { *P = foo(); } git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@45399 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
-rw-r--r--lib/Target/X86/README.txt21
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp47
2 files changed, 39 insertions, 29 deletions
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 67eb2ce1a5..46f31164d5 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -1636,24 +1636,3 @@ a stride-4 IV, which would allow all the scales in the loop to go away.
This would result in smaller code and more efficient microops.
//===---------------------------------------------------------------------===//
-
-We should be smarter about conversion from fpstack to XMM regs.
-
-double foo();
-void bar(double *P) { *P = foo(); }
-
-We compile that to:
-
-_bar:
- subl $12, %esp
- call L_foo$stub
- fstpl (%esp)
- movl 16(%esp), %eax
- movsd (%esp), %xmm0
- movsd %xmm0, (%eax)
- addl $12, %esp
- ret
-
-for example. The magic to/from the stack is unneeded.
-
-//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 79aaaebb01..598536d8e3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -33,7 +33,6 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SSARegMap.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/SmallSet.h"
@@ -812,7 +811,6 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
-
SmallVector<SDOperand, 8> ResultVals;
// Copy all of the result registers out of their specified physreg.
@@ -838,17 +836,50 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
// an XMM register.
if ((X86ScalarSSEf32 && RVLocs[0].getValVT() == MVT::f32) ||
(X86ScalarSSEf64 && RVLocs[0].getValVT() == MVT::f64)) {
+ SDOperand StoreLoc;
+ const Value *SrcVal = 0;
+ int SrcValOffset = 0;
+
+ // Determine where to store the value. If the call result is directly
+ // used by a store, see if we can store directly into the location. In
+ // this case, we'll end up producing a fst + movss[load] + movss[store] to
+ // the same location, and the two movss's will be nuked as dead. This
+ // optimizes common things like "*D = atof(..)" to not need an
+ // intermediate stack slot.
+ if (SDOperand(TheCall, 0).hasOneUse() &&
+ SDOperand(TheCall, 1).hasOneUse()) {
+ // Ok, we have one use of the value and one use of the chain. See if
+ // they are the same node: a store.
+ if (StoreSDNode *N = dyn_cast<StoreSDNode>(*TheCall->use_begin())) {
+ if (N->getChain().Val == TheCall && N->getValue().Val == TheCall &&
+ !N->isVolatile() && !N->isTruncatingStore() &&
+ N->getAddressingMode() == ISD::UNINDEXED) {
+ StoreLoc = N->getBasePtr();
+ SrcVal = N->getSrcValue();
+ SrcValOffset = N->getSrcValueOffset();
+ }
+ }
+ }
+
+ // If we weren't able to optimize the result, just create a temporary
+ // stack slot.
+ if (StoreLoc.Val == 0) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+ StoreLoc = DAG.getFrameIndex(SSFI, getPointerTy());
+ }
+
// FIXME: Currently the FST is flagged to the FP_GET_RESULT. This
// shouldn't be necessary except that RFP cannot be live across
- // multiple blocks. When stackifier is fixed, they can be uncoupled.
- MachineFunction &MF = DAG.getMachineFunction();
- int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
- SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ // multiple blocks (which could happen if a select gets lowered into
+ // multiple blocks and scheduled in between them). When stackifier is
+ // fixed, they can be uncoupled.
SDOperand Ops[] = {
- Chain, RetVal, StackSlot, DAG.getValueType(RVLocs[0].getValVT()), InFlag
+ Chain, RetVal, StoreLoc, DAG.getValueType(RVLocs[0].getValVT()), InFlag
};
Chain = DAG.getNode(X86ISD::FST, MVT::Other, Ops, 5);
- RetVal = DAG.getLoad(RVLocs[0].getValVT(), Chain, StackSlot, NULL, 0);
+ RetVal = DAG.getLoad(RVLocs[0].getValVT(), Chain,
+ StoreLoc, SrcVal, SrcValOffset);
Chain = RetVal.getValue(1);
}
ResultVals.push_back(RetVal);