author     Pavel Chupin <pavel.v.chupin@intel.com>    2013-06-03 17:20:10 +0400
committer  Pavel Chupin <pavel.v.chupin@intel.com>    2013-06-04 12:31:31 +0400
commit     01b34967a57ca33621130d36e007214b93bdfeaa (patch)
tree       500dc83dc102a108e21ba6a4f57ca491343a30d1 /gcc-4.7
parent     41eff3d706b202f682f64fcc7773c64abd59ac45 (diff)
[4.7, 4.8] Extended Silvermont tuning.
Backport r199546 from trunk:

2013-05-31  Yuri Rumyantsev  <yuri.s.rumyantsev@intel.com>
            Igor Zamyatin  <igor.zamyatin@intel.com>

        Silvermont (SLM) architecture performance tuning.
        * config/i386/i386.h (enum ix86_tune_indices): Add
        X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS.
        (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS): New define.
        * config/i386/i386.c (initial_ix86_tune_features)
        <X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS>: Initialize.
        (ix86_lea_outperforms): Handle Silvermont tuning.
        (ix86_avoid_lea_for_add): Add new argument to ix86_lea_outperforms
        call.
        (ix86_use_lea_for_mov): Likewise.
        (ix86_avoid_lea_for_addr): Likewise.
        (ix86_lea_for_add_ok): Likewise.
        (exact_dependency_1): New function.
        (exact_store_load_dependency): Likewise.
        (ix86_adjust_cost): Handle Silvermont tuning.
        (do_reorder_for_imul): Likewise.
        (swap_top_of_ready_list): New function.
        (ix86_sched_reorder): Changed to handle Silvermont tuning.
        * config/i386/i386.md (peepholes that split memory operand in fp
        converts): New.

Also backport r199611 with fixes for the patch above and the previous
SLM patch:

2013-06-03  Yuri Rumyantsev  <yuri.s.rumyantsev@intel.com>

        * config/i386/i386.c (ix86_lea_outperforms): Fix formatting.
        (ix86_avoid_lea_for_addr): Likewise.
        (exact_dependency_1): Likewise.
        (ix86_adjust_cost): Likewise.
        (swap_top_of_ready_list): Fix formatting; remove !reload_completed
        check.
        (do_reorder_for_imul): Fix typo and formatting; remove
        !reload_completed check.
        (ix86_sched_reorder): Fix typo and formatting.
        (fold_builtin_cpu): Move M_INTEL_SLM to the end of the processor
        types list.
        * config/i386/cpuinfo.c (INTEL_SLM): New enum value.

Note that the [4.7] part of the patch does not contain some of the
optimizations (IMUL) because of missing dependencies. The [4.8] part of
this backport is complete.

Change-Id: I4b5f92b025aab217046f5b393527636f3cf25669
Signed-off-by: Pavel Chupin <pavel.v.chupin@intel.com>
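To illustrate the effect of the new i386.md peepholes, the sketch below shows a
float-to-double conversion whose source operand lives in memory. Under Silvermont
tuning the conversion is split into a plain load followed by a register-to-register
extend that targets the same SSE register. This is a minimal sketch, not part of
the patch: the function name and the exact instruction sequences are assumptions,
and actual code generation depends on compiler version, optimization level and
tuning flags.

    /* Hypothetical example; not part of the patch. */
    double
    widen (const float *p)
    {
      /* Without the peephole the convert may keep its memory operand:
             cvtss2sd (%rdi), %xmm0
         With Silvermont tuning (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS) the new
         peephole2 splits it into a load plus an in-register extend:
             movss    (%rdi), %xmm0
             cvtss2sd %xmm0, %xmm0  */
      return (double) *p;
    }

The second peephole applies the same idea in the reverse direction, for
double-to-float truncation from a memory operand.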
Diffstat (limited to 'gcc-4.7')
-rw-r--r--   gcc-4.7/gcc/config/i386/i386.c    129
-rw-r--r--   gcc-4.7/gcc/config/i386/i386.h      3
-rw-r--r--   gcc-4.7/gcc/config/i386/i386.md    24
3 files changed, 151 insertions(+), 5 deletions(-)
diff --git a/gcc-4.7/gcc/config/i386/i386.c b/gcc-4.7/gcc/config/i386/i386.c
index 5c2dd46c3..b9f9d7728 100644
--- a/gcc-4.7/gcc/config/i386/i386.c
+++ b/gcc-4.7/gcc/config/i386/i386.c
@@ -2252,6 +2252,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
during reassociation of fp computation. */
m_ATOM | m_SLM,
+
+ /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
+ fp converts to destination register. */
+ m_SLM
};
/* Feature tests against the various architecture variations. */
@@ -16938,10 +16942,24 @@ distance_agu_use (unsigned int regno0, rtx insn)
static bool
ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
- unsigned int regno2, int split_cost)
+ unsigned int regno2, int split_cost, bool has_scale)
{
int dist_define, dist_use;
+ /* For Silvermont if using a 2-source or 3-source LEA for
+ non-destructive destination purposes, or due to wanting
+ ability to use SCALE, the use of LEA is justified. */
+ if (ix86_tune == PROCESSOR_SLM)
+ {
+ if (has_scale)
+ return true;
+ if (split_cost < 1)
+ return false;
+ if (regno0 == regno1 || regno0 == regno2)
+ return false;
+ return true;
+ }
+
dist_define = distance_non_agu_define (regno1, regno2, insn);
dist_use = distance_agu_use (regno0, insn);
@@ -17028,7 +17046,7 @@ ix86_avoid_lea_for_add (rtx insn, rtx operands[])
if (regno0 == regno1 || regno0 == regno2)
return false;
else
- return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
+ return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
}
/* Return true if we should emit lea instruction instead of mov
@@ -17051,7 +17069,7 @@ ix86_use_lea_for_mov (rtx insn, rtx operands[])
regno0 = true_regnum (operands[0]);
regno1 = true_regnum (operands[1]);
- return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
+ return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
}
/* Return true if we need to split lea into a sequence of
@@ -17131,7 +17149,8 @@ ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
split_cost -= 1;
}
- return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
+ return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
+ parts.scale > 1);
}
/* Emit x86 binary operand CODE in mode MODE, where the first operand
@@ -17269,7 +17288,7 @@ ix86_lea_for_add_ok (rtx insn, rtx operands[])
if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
return false;
- return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
+ return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
}
/* Return true if destination reg of SET_BODY is shift count of
@@ -23894,6 +23913,73 @@ ix86_agi_dependent (rtx set_insn, rtx use_insn)
return false;
}
+/* Helper function for exact_store_load_dependency.
+ Return true if addr is found in insn. */
+static bool
+exact_dependency_1 (rtx addr, rtx insn)
+{
+ enum rtx_code code;
+ const char *format_ptr;
+ int i, j;
+
+ code = GET_CODE (insn);
+ switch (code)
+ {
+ case MEM:
+ if (rtx_equal_p (addr, insn))
+ return true;
+ break;
+ case REG:
+ CASE_CONST_ANY:
+ case SYMBOL_REF:
+ case CODE_LABEL:
+ case PC:
+ case CC0:
+ case EXPR_LIST:
+ return false;
+ default:
+ break;
+ }
+
+ format_ptr = GET_RTX_FORMAT (code);
+ for (i = 0; i < GET_RTX_LENGTH (code); i++)
+ {
+ switch (*format_ptr++)
+ {
+ case 'e':
+ if (exact_dependency_1 (addr, XEXP (insn, i)))
+ return true;
+ break;
+ case 'E':
+ for (j = 0; j < XVECLEN (insn, i); j++)
+ if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
+ return true;
+ break;
+ }
+ }
+ return false;
+}
+
+/* Return true if there exists exact dependency for store & load, i.e.
+ the same memory address is used in them. */
+static bool
+exact_store_load_dependency (rtx store, rtx load)
+{
+ rtx set1, set2;
+
+ set1 = single_set (store);
+ if (!set1)
+ return false;
+ if (!MEM_P (SET_DEST (set1)))
+ return false;
+ set2 = single_set (load);
+ if (!set2)
+ return false;
+ if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
+ return true;
+ return false;
+}
+
static int
ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
{
@@ -24043,6 +24129,39 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
else
cost = 0;
}
+ break;
+
+ case PROCESSOR_SLM:
+ if (!reload_completed)
+ return cost;
+
+ /* Increase cost of integer loads. */
+ memory = get_attr_memory (dep_insn);
+ if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ {
+ enum attr_unit unit = get_attr_unit (dep_insn);
+ if (unit == UNIT_INTEGER && cost == 1)
+ {
+ if (memory == MEMORY_LOAD)
+ cost = 3;
+ else
+ {
+ /* Increase cost of ld/st for short int types only
+ because of store forwarding issue. */
+ rtx set = single_set (dep_insn);
+ if (set && (GET_MODE (SET_DEST (set)) == QImode
+ || GET_MODE (SET_DEST (set)) == HImode))
+ {
+ /* Increase cost of store/load insn if exact
+ dependence exists and it is load insn. */
+ enum attr_memory insn_memory = get_attr_memory (insn);
+ if (insn_memory == MEMORY_LOAD
+ && exact_store_load_dependency (dep_insn, insn))
+ cost = 3;
+ }
+ }
+ }
+ }
default:
break;
diff --git a/gcc-4.7/gcc/config/i386/i386.h b/gcc-4.7/gcc/config/i386/i386.h
index e01cc4797..b286862cc 100644
--- a/gcc-4.7/gcc/config/i386/i386.h
+++ b/gcc-4.7/gcc/config/i386/i386.h
@@ -321,6 +321,7 @@ enum ix86_tune_indices {
X86_TUNE_AVX128_OPTIMAL,
X86_TUNE_REASSOC_INT_TO_PARALLEL,
X86_TUNE_REASSOC_FP_TO_PARALLEL,
+ X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS,
X86_TUNE_LAST
};
@@ -423,6 +424,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_REASSOC_INT_TO_PARALLEL]
#define TARGET_REASSOC_FP_TO_PARALLEL \
ix86_tune_features[X86_TUNE_REASSOC_FP_TO_PARALLEL]
+#define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \
+ ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
diff --git a/gcc-4.7/gcc/config/i386/i386.md b/gcc-4.7/gcc/config/i386/i386.md
index 1b630da8e..7809961e4 100644
--- a/gcc-4.7/gcc/config/i386/i386.md
+++ b/gcc-4.7/gcc/config/i386/i386.md
@@ -3887,6 +3887,18 @@
CONST0_RTX (V4SFmode), operands[1]));
})
+;; It's more profitable to split and then extend in the same register.
+(define_peephole2
+ [(set (match_operand:DF 0 "register_operand")
+ (float_extend:DF
+ (match_operand:SF 1 "memory_operand")))]
+ "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
+ && optimize_insn_for_speed_p ()
+ && SSE_REG_P (operands[0])"
+ [(set (match_dup 2) (match_dup 1))
+ (set (match_dup 0) (float_extend:DF (match_dup 2)))]
+ "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
+
(define_insn "*extendsfdf2_mixed"
[(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
(float_extend:DF
@@ -4031,6 +4043,18 @@
CONST0_RTX (V2DFmode), operands[1]));
})
+;; It's more profitable to split and then extend in the same register.
+(define_peephole2
+ [(set (match_operand:SF 0 "register_operand")
+ (float_truncate:SF
+ (match_operand:DF 1 "memory_operand")))]
+ "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
+ && optimize_insn_for_speed_p ()
+ && SSE_REG_P (operands[0])"
+ [(set (match_dup 2) (match_dup 1))
+ (set (match_dup 0) (float_truncate:SF (match_dup 2)))]
+ "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
+
(define_expand "truncdfsf2_with_temp"
[(parallel [(set (match_operand:SF 0 "" "")
(float_truncate:SF (match_operand:DF 1 "" "")))