aboutsummaryrefslogtreecommitdiffstats
path: root/gcc-4.8
diff options
context:
space:
mode:
authorAndrew Hsieh <andrewhsieh@google.com>2014-06-20 13:51:24 +0000
committerGerrit Code Review <noreply-gerritcodereview@google.com>2014-06-20 02:15:53 +0000
commit97b6d1e9575f96aaf20695c65aba9cbdfee54e8e (patch)
treea375824617f9d871e6943588672a084442152064 /gcc-4.8
parent599d6fef7c92dcdf37c72f3b5672622cf8fa2ee4 (diff)
parente9b7fc112190da54adcebdc2506214495d73b4d3 (diff)
downloadtoolchain_gcc-97b6d1e9575f96aaf20695c65aba9cbdfee54e8e.zip
toolchain_gcc-97b6d1e9575f96aaf20695c65aba9cbdfee54e8e.tar.gz
toolchain_gcc-97b6d1e9575f96aaf20695c65aba9cbdfee54e8e.tar.bz2
Merge "[4.8, REAPPLY] Extended Silvermont tuning."
Diffstat (limited to 'gcc-4.8')
-rw-r--r--gcc-4.8/gcc/config/i386/i386.c372
-rw-r--r--gcc-4.8/gcc/config/i386/i386.h3
-rw-r--r--gcc-4.8/gcc/config/i386/i386.md24
-rw-r--r--gcc-4.8/libgcc/config/i386/cpuinfo.c4
4 files changed, 323 insertions, 80 deletions
diff --git a/gcc-4.8/gcc/config/i386/i386.c b/gcc-4.8/gcc/config/i386/i386.c
index 99d1eb8..66724b6 100644
--- a/gcc-4.8/gcc/config/i386/i386.c
+++ b/gcc-4.8/gcc/config/i386/i386.c
@@ -2103,7 +2103,12 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
a conditional move. */
- m_ATOM
+ m_ATOM,
+
+ /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
+ fp converts to destination register. */
+ m_SLM
+
};
/* Feature tests against the various architecture variations. */
@@ -17272,10 +17277,24 @@ distance_agu_use (unsigned int regno0, rtx insn)
static bool
ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
- unsigned int regno2, int split_cost)
+ unsigned int regno2, int split_cost, bool has_scale)
{
int dist_define, dist_use;
+ /* For Silvermont if using a 2-source or 3-source LEA for
+ non-destructive destination purposes, or due to wanting
+ ability to use SCALE, the use of LEA is justified. */
+ if (ix86_tune == PROCESSOR_SLM)
+ {
+ if (has_scale)
+ return true;
+ if (split_cost < 1)
+ return false;
+ if (regno0 == regno1 || regno0 == regno2)
+ return false;
+ return true;
+ }
+
dist_define = distance_non_agu_define (regno1, regno2, insn);
dist_use = distance_agu_use (regno0, insn);
@@ -17364,7 +17383,7 @@ ix86_avoid_lea_for_add (rtx insn, rtx operands[])
if (regno0 == regno1 || regno0 == regno2)
return false;
else
- return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
+ return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
}
/* Return true if we should emit lea instruction instead of mov
@@ -17386,7 +17405,7 @@ ix86_use_lea_for_mov (rtx insn, rtx operands[])
regno0 = true_regnum (operands[0]);
regno1 = true_regnum (operands[1]);
- return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
+ return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
}
/* Return true if we need to split lea into a sequence of
@@ -17475,7 +17494,8 @@ ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
split_cost -= 1;
}
- return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
+ return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
+ parts.scale > 1);
}
/* Emit x86 binary operand CODE in mode MODE, where the first operand
@@ -17660,7 +17680,7 @@ ix86_lea_for_add_ok (rtx insn, rtx operands[])
if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
return false;
- return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
+ return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
}
/* Return true if destination reg of SET_BODY is shift count of
@@ -24269,6 +24289,73 @@ ix86_agi_dependent (rtx set_insn, rtx use_insn)
return false;
}
+/* Helper function for exact_store_load_dependency.
+ Return true if addr is found in insn. */
+static bool
+exact_dependency_1 (rtx addr, rtx insn)
+{
+ enum rtx_code code;
+ const char *format_ptr;
+ int i, j;
+
+ code = GET_CODE (insn);
+ switch (code)
+ {
+ case MEM:
+ if (rtx_equal_p (addr, insn))
+ return true;
+ break;
+ case REG:
+ CASE_CONST_ANY:
+ case SYMBOL_REF:
+ case CODE_LABEL:
+ case PC:
+ case CC0:
+ case EXPR_LIST:
+ return false;
+ default:
+ break;
+ }
+
+ format_ptr = GET_RTX_FORMAT (code);
+ for (i = 0; i < GET_RTX_LENGTH (code); i++)
+ {
+ switch (*format_ptr++)
+ {
+ case 'e':
+ if (exact_dependency_1 (addr, XEXP (insn, i)))
+ return true;
+ break;
+ case 'E':
+ for (j = 0; j < XVECLEN (insn, i); j++)
+ if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
+ return true;
+ break;
+ }
+ }
+ return false;
+}
+
+/* Return true if there exists exact dependency for store & load, i.e.
+ the same memory address is used in them. */
+static bool
+exact_store_load_dependency (rtx store, rtx load)
+{
+ rtx set1, set2;
+
+ set1 = single_set (store);
+ if (!set1)
+ return false;
+ if (!MEM_P (SET_DEST (set1)))
+ return false;
+ set2 = single_set (load);
+ if (!set2)
+ return false;
+ if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
+ return true;
+ return false;
+}
+
static int
ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
{
@@ -24420,6 +24507,39 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
else
cost = 0;
}
+ break;
+
+ case PROCESSOR_SLM:
+ if (!reload_completed)
+ return cost;
+
+ /* Increase cost of integer loads. */
+ memory = get_attr_memory (dep_insn);
+ if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+ {
+ enum attr_unit unit = get_attr_unit (dep_insn);
+ if (unit == UNIT_INTEGER && cost == 1)
+ {
+ if (memory == MEMORY_LOAD)
+ cost = 3;
+ else
+ {
+ /* Increase cost of ld/st for short int types only
+ because of store forwarding issue. */
+ rtx set = single_set (dep_insn);
+ if (set && (GET_MODE (SET_DEST (set)) == QImode
+ || GET_MODE (SET_DEST (set)) == HImode))
+ {
+ /* Increase cost of store/load insn if exact
+ dependence exists and it is load insn. */
+ enum attr_memory insn_memory = get_attr_memory (insn);
+ if (insn_memory == MEMORY_LOAD
+ && exact_store_load_dependency (dep_insn, insn))
+ cost = 3;
+ }
+ }
+ }
+ }
default:
break;
@@ -24466,110 +24586,204 @@ ia32_multipass_dfa_lookahead (void)
execution. It is applied if
(1) IMUL instruction is on the top of list;
(2) There exists the only producer of independent IMUL instruction in
- ready list;
- (3) Put found producer on the top of ready list.
- Returns issue rate. */
-
+ ready list.
+ Return index of IMUL producer if it was found and -1 otherwise. */
static int
-ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
- int clock_var ATTRIBUTE_UNUSED)
+do_reorder_for_imul (rtx *ready, int n_ready)
{
- static int issue_rate = -1;
- int n_ready = *pn_ready;
- rtx insn, insn1, insn2;
- int i;
+ rtx insn, set, insn1, insn2;
sd_iterator_def sd_it;
dep_t dep;
int index = -1;
+ int i;
- /* Set up issue rate. */
- issue_rate = ix86_issue_rate();
-
- /* Do reodering for Atom only. */
if (ix86_tune != PROCESSOR_ATOM)
- return issue_rate;
- /* Do not perform ready list reodering for pre-reload schedule pass. */
- if (!reload_completed)
- return issue_rate;
- /* Nothing to do if ready list contains only 1 instruction. */
- if (n_ready <= 1)
- return issue_rate;
+ return index;
/* Check that IMUL instruction is on the top of ready list. */
insn = ready[n_ready - 1];
- if (!NONDEBUG_INSN_P (insn))
- return issue_rate;
- insn = PATTERN (insn);
- if (GET_CODE (insn) == PARALLEL)
- insn = XVECEXP (insn, 0, 0);
- if (GET_CODE (insn) != SET)
- return issue_rate;
- if (!(GET_CODE (SET_SRC (insn)) == MULT
- && GET_MODE (SET_SRC (insn)) == SImode))
- return issue_rate;
+ set = single_set (insn);
+ if (!set)
+ return index;
+ if (!(GET_CODE (SET_SRC (set)) == MULT
+ && GET_MODE (SET_SRC (set)) == SImode))
+ return index;
/* Search for producer of independent IMUL instruction. */
- for (i = n_ready - 2; i>= 0; i--)
+ for (i = n_ready - 2; i >= 0; i--)
{
insn = ready[i];
if (!NONDEBUG_INSN_P (insn))
- continue;
+ continue;
/* Skip IMUL instruction. */
insn2 = PATTERN (insn);
if (GET_CODE (insn2) == PARALLEL)
- insn2 = XVECEXP (insn2, 0, 0);
+ insn2 = XVECEXP (insn2, 0, 0);
if (GET_CODE (insn2) == SET
- && GET_CODE (SET_SRC (insn2)) == MULT
- && GET_MODE (SET_SRC (insn2)) == SImode)
- continue;
+ && GET_CODE (SET_SRC (insn2)) == MULT
+ && GET_MODE (SET_SRC (insn2)) == SImode)
+ continue;
FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
- {
- rtx con;
+ {
+ rtx con;
con = DEP_CON (dep);
if (!NONDEBUG_INSN_P (con))
continue;
- insn1 = PATTERN (con);
- if (GET_CODE (insn1) == PARALLEL)
- insn1 = XVECEXP (insn1, 0, 0);
+ insn1 = PATTERN (con);
+ if (GET_CODE (insn1) == PARALLEL)
+ insn1 = XVECEXP (insn1, 0, 0);
- if (GET_CODE (insn1) == SET
- && GET_CODE (SET_SRC (insn1)) == MULT
- && GET_MODE (SET_SRC (insn1)) == SImode)
- {
- sd_iterator_def sd_it1;
- dep_t dep1;
- /* Check if there is no other dependee for IMUL. */
- index = i;
- FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
- {
- rtx pro;
- pro = DEP_PRO (dep1);
+ if (GET_CODE (insn1) == SET
+ && GET_CODE (SET_SRC (insn1)) == MULT
+ && GET_MODE (SET_SRC (insn1)) == SImode)
+ {
+ sd_iterator_def sd_it1;
+ dep_t dep1;
+ /* Check if there is no other dependee for IMUL. */
+ index = i;
+ FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
+ {
+ rtx pro;
+ pro = DEP_PRO (dep1);
if (!NONDEBUG_INSN_P (pro))
continue;
- if (pro != insn)
- index = -1;
- }
- if (index >= 0)
- break;
- }
- }
+ if (pro != insn)
+ index = -1;
+ }
+ if (index >= 0)
+ break;
+ }
+ }
if (index >= 0)
- break;
+ break;
}
- if (index < 0)
- return issue_rate; /* Didn't find IMUL producer. */
+ return index;
+}
- if (sched_verbose > 1)
- fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
- INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
+/* Try to find the best candidate on the top of ready list if two insns
+ have the same priority - candidate is best if its dependees were
+ scheduled earlier. Applied for Silvermont only.
+ Return true if top 2 insns must be interchanged. */
+static bool
+swap_top_of_ready_list (rtx *ready, int n_ready)
+{
+ rtx top = ready[n_ready - 1];
+ rtx next = ready[n_ready - 2];
+ rtx set;
+ sd_iterator_def sd_it;
+ dep_t dep;
+ int clock1 = -1;
+ int clock2 = -1;
+ #define INSN_TICK(INSN) (HID (INSN)->tick)
- /* Put IMUL producer (ready[index]) at the top of ready list. */
- insn1= ready[index];
- for (i = index; i < n_ready - 1; i++)
- ready[i] = ready[i + 1];
- ready[n_ready - 1] = insn1;
+ if (ix86_tune != PROCESSOR_SLM)
+ return false;
+ if (!NONDEBUG_INSN_P (top))
+ return false;
+ if (!NONJUMP_INSN_P (top))
+ return false;
+ if (!NONDEBUG_INSN_P (next))
+ return false;
+ if (!NONJUMP_INSN_P (next))
+ return false;
+ set = single_set (top);
+ if (!set)
+ return false;
+ set = single_set (next);
+ if (!set)
+ return false;
+
+ if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
+ {
+ if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
+ return false;
+ /* Determine winner more precise. */
+ FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
+ {
+ rtx pro;
+ pro = DEP_PRO (dep);
+ if (!NONDEBUG_INSN_P (pro))
+ continue;
+ if (INSN_TICK (pro) > clock1)
+ clock1 = INSN_TICK (pro);
+ }
+ FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
+ {
+ rtx pro;
+ pro = DEP_PRO (dep);
+ if (!NONDEBUG_INSN_P (pro))
+ continue;
+ if (INSN_TICK (pro) > clock2)
+ clock2 = INSN_TICK (pro);
+ }
+
+ if (clock1 == clock2)
+ {
+ /* Determine winner - load must win. */
+ enum attr_memory memory1, memory2;
+ memory1 = get_attr_memory (top);
+ memory2 = get_attr_memory (next);
+ if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
+ return true;
+ }
+ return (bool) (clock2 < clock1);
+ }
+ return false;
+ #undef INSN_TICK
+}
+
+/* Perform possible reodering of ready list for Atom/Silvermont only.
+ Return issue rate. */
+static int
+ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
+ int clock_var)
+{
+ int issue_rate = -1;
+ int n_ready = *pn_ready;
+ int i;
+ rtx insn;
+ int index = -1;
+
+ /* Set up issue rate. */
+ issue_rate = ix86_issue_rate ();
+
+ /* Do reodering for Atom/SLM only. */
+ if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
+ return issue_rate;
+
+ /* Nothing to do if ready list contains only 1 instruction. */
+ if (n_ready <= 1)
+ return issue_rate;
+
+ /* Do reodering for post-reload scheduler only. */
+ if (!reload_completed)
+ return issue_rate;
+
+ if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
+ {
+ if (sched_verbose > 1)
+ fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
+ INSN_UID (ready[index]));
+
+ /* Put IMUL producer (ready[index]) at the top of ready list. */
+ insn = ready[index];
+ for (i = index; i < n_ready - 1; i++)
+ ready[i] = ready[i + 1];
+ ready[n_ready - 1] = insn;
+ return issue_rate;
+ }
+ if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
+ {
+ if (sched_verbose > 1)
+ fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
+ INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
+ /* Swap 2 top elements of ready list. */
+ insn = ready[n_ready - 1];
+ ready[n_ready - 1] = ready[n_ready - 2];
+ ready[n_ready - 2] = insn;
+ }
return issue_rate;
}
@@ -29731,11 +29945,11 @@ fold_builtin_cpu (tree fndecl, tree *args)
M_AMD,
M_CPU_TYPE_START,
M_INTEL_ATOM,
- M_INTEL_SLM,
M_INTEL_CORE2,
M_INTEL_COREI7,
M_AMDFAM10H,
M_AMDFAM15H,
+ M_INTEL_SLM,
M_CPU_SUBTYPE_START,
M_INTEL_COREI7_NEHALEM,
M_INTEL_COREI7_WESTMERE,
diff --git a/gcc-4.8/gcc/config/i386/i386.h b/gcc-4.8/gcc/config/i386/i386.h
index eca7c4b..ac28ec1 100644
--- a/gcc-4.8/gcc/config/i386/i386.h
+++ b/gcc-4.8/gcc/config/i386/i386.h
@@ -332,6 +332,7 @@ enum ix86_tune_indices {
X86_TUNE_REASSOC_FP_TO_PARALLEL,
X86_TUNE_GENERAL_REGS_SSE_SPILL,
X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE,
+ X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS,
X86_TUNE_LAST
};
@@ -439,6 +440,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL]
#define TARGET_AVOID_MEM_OPND_FOR_CMOVE \
ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE]
+#define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \
+ ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
diff --git a/gcc-4.8/gcc/config/i386/i386.md b/gcc-4.8/gcc/config/i386/i386.md
index b390df4..f2de157 100644
--- a/gcc-4.8/gcc/config/i386/i386.md
+++ b/gcc-4.8/gcc/config/i386/i386.md
@@ -3965,6 +3965,18 @@
CONST0_RTX (V4SFmode), operands[1]));
})
+;; It's more profitable to split and then extend in the same register.
+(define_peephole2
+ [(set (match_operand:DF 0 "register_operand")
+ (float_extend:DF
+ (match_operand:SF 1 "memory_operand")))]
+ "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
+ && optimize_insn_for_speed_p ()
+ && SSE_REG_P (operands[0])"
+ [(set (match_dup 2) (match_dup 1))
+ (set (match_dup 0) (float_extend:DF (match_dup 2)))]
+ "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
+
(define_insn "*extendsfdf2_mixed"
[(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
(float_extend:DF
@@ -4106,6 +4118,18 @@
CONST0_RTX (V2DFmode), operands[1]));
})
+;; It's more profitable to split and then extend in the same register.
+(define_peephole2
+ [(set (match_operand:SF 0 "register_operand")
+ (float_truncate:SF
+ (match_operand:DF 1 "memory_operand")))]
+ "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
+ && optimize_insn_for_speed_p ()
+ && SSE_REG_P (operands[0])"
+ [(set (match_dup 2) (match_dup 1))
+ (set (match_dup 0) (float_truncate:SF (match_dup 2)))]
+ "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
+
(define_expand "truncdfsf2_with_temp"
[(parallel [(set (match_operand:SF 0)
(float_truncate:SF (match_operand:DF 1)))
diff --git a/gcc-4.8/libgcc/config/i386/cpuinfo.c b/gcc-4.8/libgcc/config/i386/cpuinfo.c
index f32ec17..1c744f1 100644
--- a/gcc-4.8/libgcc/config/i386/cpuinfo.c
+++ b/gcc-4.8/libgcc/config/i386/cpuinfo.c
@@ -52,14 +52,16 @@ enum processor_vendor
VENDOR_MAX
};
+/* Any new types or subtypes have to be inserted at the end. */
+
enum processor_types
{
INTEL_ATOM = 1,
- INTEL_SLM,
INTEL_CORE2,
INTEL_COREI7,
AMDFAM10H,
AMDFAM15H,
+ INTEL_SLM,
CPU_TYPE_MAX
};