diff options
author | Pavel Chupin <pavel.v.chupin@intel.com> | 2013-06-03 17:20:10 +0400 |
---|---|---|
committer | Pavel Chupin <pavel.v.chupin@intel.com> | 2014-06-19 22:39:12 +0400 |
commit | e9b7fc112190da54adcebdc2506214495d73b4d3 (patch) | |
tree | b80e246323666f457376aff35a2a47ea38ed0355 /gcc-4.8 | |
parent | 27f3b6e52df39b2bf4038d7fb92e1c642c4913dc (diff) | |
download | toolchain_gcc-e9b7fc112190da54adcebdc2506214495d73b4d3.tar.gz toolchain_gcc-e9b7fc112190da54adcebdc2506214495d73b4d3.tar.bz2 toolchain_gcc-e9b7fc112190da54adcebdc2506214495d73b4d3.zip |
[4.8, REAPPLY] Extended Silvermont tuning.
Reapplying https://android-review.googlesource.com/#/c/60083 after 4.8.3
merge
Change-Id: I93a985ae43d9349ed3fa0dd37e485f5477774f37
Signed-off-by: Pavel Chupin <pavel.v.chupin@intel.com>
Diffstat (limited to 'gcc-4.8')
-rw-r--r-- | gcc-4.8/gcc/config/i386/i386.c | 372 | ||||
-rw-r--r-- | gcc-4.8/gcc/config/i386/i386.h | 3 | ||||
-rw-r--r-- | gcc-4.8/gcc/config/i386/i386.md | 24 | ||||
-rw-r--r-- | gcc-4.8/libgcc/config/i386/cpuinfo.c | 4 |
4 files changed, 323 insertions, 80 deletions
diff --git a/gcc-4.8/gcc/config/i386/i386.c b/gcc-4.8/gcc/config/i386/i386.c index 99d1eb8e7..66724b60c 100644 --- a/gcc-4.8/gcc/config/i386/i386.c +++ b/gcc-4.8/gcc/config/i386/i386.c @@ -2103,7 +2103,12 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for a conditional move. */ - m_ATOM + m_ATOM, + + /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for + fp converts to destination register. */ + m_SLM + }; /* Feature tests against the various architecture variations. */ @@ -17272,10 +17277,24 @@ distance_agu_use (unsigned int regno0, rtx insn) static bool ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1, - unsigned int regno2, int split_cost) + unsigned int regno2, int split_cost, bool has_scale) { int dist_define, dist_use; + /* For Silvermont if using a 2-source or 3-source LEA for + non-destructive destination purposes, or due to wanting + ability to use SCALE, the use of LEA is justified. */ + if (ix86_tune == PROCESSOR_SLM) + { + if (has_scale) + return true; + if (split_cost < 1) + return false; + if (regno0 == regno1 || regno0 == regno2) + return false; + return true; + } + dist_define = distance_non_agu_define (regno1, regno2, insn); dist_use = distance_agu_use (regno0, insn); @@ -17364,7 +17383,7 @@ ix86_avoid_lea_for_add (rtx insn, rtx operands[]) if (regno0 == regno1 || regno0 == regno2) return false; else - return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1); + return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false); } /* Return true if we should emit lea instruction instead of mov @@ -17386,7 +17405,7 @@ ix86_use_lea_for_mov (rtx insn, rtx operands[]) regno0 = true_regnum (operands[0]); regno1 = true_regnum (operands[1]); - return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0); + return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false); } /* Return true if we need to split lea into a sequence of @@ -17475,7 +17494,8 @@ ix86_avoid_lea_for_addr (rtx insn, rtx operands[]) split_cost -= 1; } - return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost); + return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost, + parts.scale > 1); } /* Emit x86 binary operand CODE in mode MODE, where the first operand @@ -17660,7 +17680,7 @@ ix86_lea_for_add_ok (rtx insn, rtx operands[]) if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) return false; - return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0); + return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false); } /* Return true if destination reg of SET_BODY is shift count of @@ -24269,6 +24289,73 @@ ix86_agi_dependent (rtx set_insn, rtx use_insn) return false; } +/* Helper function for exact_store_load_dependency. + Return true if addr is found in insn. */ +static bool +exact_dependency_1 (rtx addr, rtx insn) +{ + enum rtx_code code; + const char *format_ptr; + int i, j; + + code = GET_CODE (insn); + switch (code) + { + case MEM: + if (rtx_equal_p (addr, insn)) + return true; + break; + case REG: + CASE_CONST_ANY: + case SYMBOL_REF: + case CODE_LABEL: + case PC: + case CC0: + case EXPR_LIST: + return false; + default: + break; + } + + format_ptr = GET_RTX_FORMAT (code); + for (i = 0; i < GET_RTX_LENGTH (code); i++) + { + switch (*format_ptr++) + { + case 'e': + if (exact_dependency_1 (addr, XEXP (insn, i))) + return true; + break; + case 'E': + for (j = 0; j < XVECLEN (insn, i); j++) + if (exact_dependency_1 (addr, XVECEXP (insn, i, j))) + return true; + break; + } + } + return false; +} + +/* Return true if there exists exact dependency for store & load, i.e. + the same memory address is used in them. */ +static bool +exact_store_load_dependency (rtx store, rtx load) +{ + rtx set1, set2; + + set1 = single_set (store); + if (!set1) + return false; + if (!MEM_P (SET_DEST (set1))) + return false; + set2 = single_set (load); + if (!set2) + return false; + if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2))) + return true; + return false; +} + static int ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) { @@ -24420,6 +24507,39 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) else cost = 0; } + break; + + case PROCESSOR_SLM: + if (!reload_completed) + return cost; + + /* Increase cost of integer loads. */ + memory = get_attr_memory (dep_insn); + if (memory == MEMORY_LOAD || memory == MEMORY_BOTH) + { + enum attr_unit unit = get_attr_unit (dep_insn); + if (unit == UNIT_INTEGER && cost == 1) + { + if (memory == MEMORY_LOAD) + cost = 3; + else + { + /* Increase cost of ld/st for short int types only + because of store forwarding issue. */ + rtx set = single_set (dep_insn); + if (set && (GET_MODE (SET_DEST (set)) == QImode + || GET_MODE (SET_DEST (set)) == HImode)) + { + /* Increase cost of store/load insn if exact + dependence exists and it is load insn. */ + enum attr_memory insn_memory = get_attr_memory (insn); + if (insn_memory == MEMORY_LOAD + && exact_store_load_dependency (dep_insn, insn)) + cost = 3; + } + } + } + } default: break; @@ -24466,110 +24586,204 @@ ia32_multipass_dfa_lookahead (void) execution. It is applied if (1) IMUL instruction is on the top of list; (2) There exists the only producer of independent IMUL instruction in - ready list; - (3) Put found producer on the top of ready list. - Returns issue rate. */ - + ready list. + Return index of IMUL producer if it was found and -1 otherwise. */ static int -ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready, - int clock_var ATTRIBUTE_UNUSED) +do_reorder_for_imul (rtx *ready, int n_ready) { - static int issue_rate = -1; - int n_ready = *pn_ready; - rtx insn, insn1, insn2; - int i; + rtx insn, set, insn1, insn2; sd_iterator_def sd_it; dep_t dep; int index = -1; + int i; - /* Set up issue rate. */ - issue_rate = ix86_issue_rate(); - - /* Do reodering for Atom only. */ if (ix86_tune != PROCESSOR_ATOM) - return issue_rate; - /* Do not perform ready list reodering for pre-reload schedule pass. */ - if (!reload_completed) - return issue_rate; - /* Nothing to do if ready list contains only 1 instruction. */ - if (n_ready <= 1) - return issue_rate; + return index; /* Check that IMUL instruction is on the top of ready list. */ insn = ready[n_ready - 1]; - if (!NONDEBUG_INSN_P (insn)) - return issue_rate; - insn = PATTERN (insn); - if (GET_CODE (insn) == PARALLEL) - insn = XVECEXP (insn, 0, 0); - if (GET_CODE (insn) != SET) - return issue_rate; - if (!(GET_CODE (SET_SRC (insn)) == MULT - && GET_MODE (SET_SRC (insn)) == SImode)) - return issue_rate; + set = single_set (insn); + if (!set) + return index; + if (!(GET_CODE (SET_SRC (set)) == MULT + && GET_MODE (SET_SRC (set)) == SImode)) + return index; /* Search for producer of independent IMUL instruction. */ - for (i = n_ready - 2; i>= 0; i--) + for (i = n_ready - 2; i >= 0; i--) { insn = ready[i]; if (!NONDEBUG_INSN_P (insn)) - continue; + continue; /* Skip IMUL instruction. */ insn2 = PATTERN (insn); if (GET_CODE (insn2) == PARALLEL) - insn2 = XVECEXP (insn2, 0, 0); + insn2 = XVECEXP (insn2, 0, 0); if (GET_CODE (insn2) == SET - && GET_CODE (SET_SRC (insn2)) == MULT - && GET_MODE (SET_SRC (insn2)) == SImode) - continue; + && GET_CODE (SET_SRC (insn2)) == MULT + && GET_MODE (SET_SRC (insn2)) == SImode) + continue; FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep) - { - rtx con; + { + rtx con; con = DEP_CON (dep); if (!NONDEBUG_INSN_P (con)) continue; - insn1 = PATTERN (con); - if (GET_CODE (insn1) == PARALLEL) - insn1 = XVECEXP (insn1, 0, 0); + insn1 = PATTERN (con); + if (GET_CODE (insn1) == PARALLEL) + insn1 = XVECEXP (insn1, 0, 0); - if (GET_CODE (insn1) == SET - && GET_CODE (SET_SRC (insn1)) == MULT - && GET_MODE (SET_SRC (insn1)) == SImode) - { - sd_iterator_def sd_it1; - dep_t dep1; - /* Check if there is no other dependee for IMUL. */ - index = i; - FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1) - { - rtx pro; - pro = DEP_PRO (dep1); + if (GET_CODE (insn1) == SET + && GET_CODE (SET_SRC (insn1)) == MULT + && GET_MODE (SET_SRC (insn1)) == SImode) + { + sd_iterator_def sd_it1; + dep_t dep1; + /* Check if there is no other dependee for IMUL. */ + index = i; + FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1) + { + rtx pro; + pro = DEP_PRO (dep1); if (!NONDEBUG_INSN_P (pro)) continue; - if (pro != insn) - index = -1; - } - if (index >= 0) - break; - } - } + if (pro != insn) + index = -1; + } + if (index >= 0) + break; + } + } if (index >= 0) - break; + break; } - if (index < 0) - return issue_rate; /* Didn't find IMUL producer. */ + return index; +} - if (sched_verbose > 1) - fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n", - INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1])); +/* Try to find the best candidate on the top of ready list if two insns + have the same priority - candidate is best if its dependees were + scheduled earlier. Applied for Silvermont only. + Return true if top 2 insns must be interchanged. */ +static bool +swap_top_of_ready_list (rtx *ready, int n_ready) +{ + rtx top = ready[n_ready - 1]; + rtx next = ready[n_ready - 2]; + rtx set; + sd_iterator_def sd_it; + dep_t dep; + int clock1 = -1; + int clock2 = -1; + #define INSN_TICK(INSN) (HID (INSN)->tick) - /* Put IMUL producer (ready[index]) at the top of ready list. */ - insn1= ready[index]; - for (i = index; i < n_ready - 1; i++) - ready[i] = ready[i + 1]; - ready[n_ready - 1] = insn1; + if (ix86_tune != PROCESSOR_SLM) + return false; + if (!NONDEBUG_INSN_P (top)) + return false; + if (!NONJUMP_INSN_P (top)) + return false; + if (!NONDEBUG_INSN_P (next)) + return false; + if (!NONJUMP_INSN_P (next)) + return false; + set = single_set (top); + if (!set) + return false; + set = single_set (next); + if (!set) + return false; + + if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next)) + { + if (INSN_PRIORITY (top) != INSN_PRIORITY (next)) + return false; + /* Determine winner more precise. */ + FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep) + { + rtx pro; + pro = DEP_PRO (dep); + if (!NONDEBUG_INSN_P (pro)) + continue; + if (INSN_TICK (pro) > clock1) + clock1 = INSN_TICK (pro); + } + FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep) + { + rtx pro; + pro = DEP_PRO (dep); + if (!NONDEBUG_INSN_P (pro)) + continue; + if (INSN_TICK (pro) > clock2) + clock2 = INSN_TICK (pro); + } + + if (clock1 == clock2) + { + /* Determine winner - load must win. */ + enum attr_memory memory1, memory2; + memory1 = get_attr_memory (top); + memory2 = get_attr_memory (next); + if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD) + return true; + } + return (bool) (clock2 < clock1); + } + return false; + #undef INSN_TICK +} + +/* Perform possible reodering of ready list for Atom/Silvermont only. + Return issue rate. */ +static int +ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready, + int clock_var) +{ + int issue_rate = -1; + int n_ready = *pn_ready; + int i; + rtx insn; + int index = -1; + + /* Set up issue rate. */ + issue_rate = ix86_issue_rate (); + + /* Do reodering for Atom/SLM only. */ + if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM) + return issue_rate; + + /* Nothing to do if ready list contains only 1 instruction. */ + if (n_ready <= 1) + return issue_rate; + + /* Do reodering for post-reload scheduler only. */ + if (!reload_completed) + return issue_rate; + + if ((index = do_reorder_for_imul (ready, n_ready)) >= 0) + { + if (sched_verbose > 1) + fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n", + INSN_UID (ready[index])); + + /* Put IMUL producer (ready[index]) at the top of ready list. */ + insn = ready[index]; + for (i = index; i < n_ready - 1; i++) + ready[i] = ready[i + 1]; + ready[n_ready - 1] = insn; + return issue_rate; + } + if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready)) + { + if (sched_verbose > 1) + fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n", + INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2])); + /* Swap 2 top elements of ready list. */ + insn = ready[n_ready - 1]; + ready[n_ready - 1] = ready[n_ready - 2]; + ready[n_ready - 2] = insn; + } return issue_rate; } @@ -29731,11 +29945,11 @@ fold_builtin_cpu (tree fndecl, tree *args) M_AMD, M_CPU_TYPE_START, M_INTEL_ATOM, - M_INTEL_SLM, M_INTEL_CORE2, M_INTEL_COREI7, M_AMDFAM10H, M_AMDFAM15H, + M_INTEL_SLM, M_CPU_SUBTYPE_START, M_INTEL_COREI7_NEHALEM, M_INTEL_COREI7_WESTMERE, diff --git a/gcc-4.8/gcc/config/i386/i386.h b/gcc-4.8/gcc/config/i386/i386.h index eca7c4bc8..ac28ec1aa 100644 --- a/gcc-4.8/gcc/config/i386/i386.h +++ b/gcc-4.8/gcc/config/i386/i386.h @@ -332,6 +332,7 @@ enum ix86_tune_indices { X86_TUNE_REASSOC_FP_TO_PARALLEL, X86_TUNE_GENERAL_REGS_SSE_SPILL, X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, + X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, X86_TUNE_LAST }; @@ -439,6 +440,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL] #define TARGET_AVOID_MEM_OPND_FOR_CMOVE \ ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE] +#define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \ + ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc-4.8/gcc/config/i386/i386.md b/gcc-4.8/gcc/config/i386/i386.md index b390df447..f2de157ce 100644 --- a/gcc-4.8/gcc/config/i386/i386.md +++ b/gcc-4.8/gcc/config/i386/i386.md @@ -3965,6 +3965,18 @@ CONST0_RTX (V4SFmode), operands[1])); }) +;; It's more profitable to split and then extend in the same register. +(define_peephole2 + [(set (match_operand:DF 0 "register_operand") + (float_extend:DF + (match_operand:SF 1 "memory_operand")))] + "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS + && optimize_insn_for_speed_p () + && SSE_REG_P (operands[0])" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (float_extend:DF (match_dup 2)))] + "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));") + (define_insn "*extendsfdf2_mixed" [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x") (float_extend:DF @@ -4106,6 +4118,18 @@ CONST0_RTX (V2DFmode), operands[1])); }) +;; It's more profitable to split and then extend in the same register. +(define_peephole2 + [(set (match_operand:SF 0 "register_operand") + (float_truncate:SF + (match_operand:DF 1 "memory_operand")))] + "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS + && optimize_insn_for_speed_p () + && SSE_REG_P (operands[0])" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (float_truncate:SF (match_dup 2)))] + "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));") + (define_expand "truncdfsf2_with_temp" [(parallel [(set (match_operand:SF 0) (float_truncate:SF (match_operand:DF 1))) diff --git a/gcc-4.8/libgcc/config/i386/cpuinfo.c b/gcc-4.8/libgcc/config/i386/cpuinfo.c index f32ec17aa..1c744f123 100644 --- a/gcc-4.8/libgcc/config/i386/cpuinfo.c +++ b/gcc-4.8/libgcc/config/i386/cpuinfo.c @@ -52,14 +52,16 @@ enum processor_vendor VENDOR_MAX }; +/* Any new types or subtypes have to be inserted at the end. */ + enum processor_types { INTEL_ATOM = 1, - INTEL_SLM, INTEL_CORE2, INTEL_COREI7, AMDFAM10H, AMDFAM15H, + INTEL_SLM, CPU_TYPE_MAX }; |