From f1fba0f5b24322265b9c3d37863a504c4b67cdc2 Mon Sep 17 00:00:00 2001 From: Andrew Hsieh Date: Wed, 18 Jun 2014 09:53:32 -0700 Subject: revert 01b34967a57ca33621130d36e007214b93bdfeaa For the purpose of merging gcc-4.8.3, will put it back. https://android-review.googlesource.com/#/c/60083 [4.7, 4.8] Extended Silvermont tuning Change-Id: If13a6989286c0ab57def1dd65f0bbd2f6ed8d807 --- gcc-4.8/gcc/config/i386/i386.c | 372 ++++++++--------------------------- gcc-4.8/gcc/config/i386/i386.h | 3 - gcc-4.8/gcc/config/i386/i386.md | 24 --- gcc-4.8/libgcc/config/i386/cpuinfo.c | 4 +- 4 files changed, 80 insertions(+), 323 deletions(-) (limited to 'gcc-4.8') diff --git a/gcc-4.8/gcc/config/i386/i386.c b/gcc-4.8/gcc/config/i386/i386.c index 7e8f653b6..0c5b57d3f 100644 --- a/gcc-4.8/gcc/config/i386/i386.c +++ b/gcc-4.8/gcc/config/i386/i386.c @@ -2103,12 +2103,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for a conditional move. */ - m_ATOM, - - /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for - fp converts to destination register. */ - m_SLM - + m_ATOM }; /* Feature tests against the various architecture variations. */ @@ -17317,24 +17312,10 @@ distance_agu_use (unsigned int regno0, rtx insn) static bool ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1, - unsigned int regno2, int split_cost, bool has_scale) + unsigned int regno2, int split_cost) { int dist_define, dist_use; - /* For Silvermont if using a 2-source or 3-source LEA for - non-destructive destination purposes, or due to wanting - ability to use SCALE, the use of LEA is justified. */ - if (ix86_tune == PROCESSOR_SLM) - { - if (has_scale) - return true; - if (split_cost < 1) - return false; - if (regno0 == regno1 || regno0 == regno2) - return false; - return true; - } - dist_define = distance_non_agu_define (regno1, regno2, insn); dist_use = distance_agu_use (regno0, insn); @@ -17423,7 +17404,7 @@ ix86_avoid_lea_for_add (rtx insn, rtx operands[]) if (regno0 == regno1 || regno0 == regno2) return false; else - return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false); + return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1); } /* Return true if we should emit lea instruction instead of mov @@ -17445,7 +17426,7 @@ ix86_use_lea_for_mov (rtx insn, rtx operands[]) regno0 = true_regnum (operands[0]); regno1 = true_regnum (operands[1]); - return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false); + return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0); } /* Return true if we need to split lea into a sequence of @@ -17524,8 +17505,7 @@ ix86_avoid_lea_for_addr (rtx insn, rtx operands[]) split_cost -= 1; } - return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost, - parts.scale > 1); + return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost); } /* Emit x86 binary operand CODE in mode MODE, where the first operand @@ -17710,7 +17690,7 @@ ix86_lea_for_add_ok (rtx insn, rtx operands[]) if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) return false; - return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false); + return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0); } /* Return true if destination reg of SET_BODY is shift count of @@ -24309,73 +24289,6 @@ ix86_agi_dependent (rtx set_insn, rtx use_insn) return false; } -/* Helper function for exact_store_load_dependency. - Return true if addr is found in insn. */ -static bool -exact_dependency_1 (rtx addr, rtx insn) -{ - enum rtx_code code; - const char *format_ptr; - int i, j; - - code = GET_CODE (insn); - switch (code) - { - case MEM: - if (rtx_equal_p (addr, insn)) - return true; - break; - case REG: - CASE_CONST_ANY: - case SYMBOL_REF: - case CODE_LABEL: - case PC: - case CC0: - case EXPR_LIST: - return false; - default: - break; - } - - format_ptr = GET_RTX_FORMAT (code); - for (i = 0; i < GET_RTX_LENGTH (code); i++) - { - switch (*format_ptr++) - { - case 'e': - if (exact_dependency_1 (addr, XEXP (insn, i))) - return true; - break; - case 'E': - for (j = 0; j < XVECLEN (insn, i); j++) - if (exact_dependency_1 (addr, XVECEXP (insn, i, j))) - return true; - break; - } - } - return false; -} - -/* Return true if there exists exact dependency for store & load, i.e. - the same memory address is used in them. */ -static bool -exact_store_load_dependency (rtx store, rtx load) -{ - rtx set1, set2; - - set1 = single_set (store); - if (!set1) - return false; - if (!MEM_P (SET_DEST (set1))) - return false; - set2 = single_set (load); - if (!set2) - return false; - if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2))) - return true; - return false; -} - static int ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) { @@ -24527,39 +24440,6 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) else cost = 0; } - break; - - case PROCESSOR_SLM: - if (!reload_completed) - return cost; - - /* Increase cost of integer loads. */ - memory = get_attr_memory (dep_insn); - if (memory == MEMORY_LOAD || memory == MEMORY_BOTH) - { - enum attr_unit unit = get_attr_unit (dep_insn); - if (unit == UNIT_INTEGER && cost == 1) - { - if (memory == MEMORY_LOAD) - cost = 3; - else - { - /* Increase cost of ld/st for short int types only - because of store forwarding issue. */ - rtx set = single_set (dep_insn); - if (set && (GET_MODE (SET_DEST (set)) == QImode - || GET_MODE (SET_DEST (set)) == HImode)) - { - /* Increase cost of store/load insn if exact - dependence exists and it is load insn. */ - enum attr_memory insn_memory = get_attr_memory (insn); - if (insn_memory == MEMORY_LOAD - && exact_store_load_dependency (dep_insn, insn)) - cost = 3; - } - } - } - } default: break; @@ -24606,204 +24486,110 @@ ia32_multipass_dfa_lookahead (void) execution. It is applied if (1) IMUL instruction is on the top of list; (2) There exists the only producer of independent IMUL instruction in - ready list. - Return index of IMUL producer if it was found and -1 otherwise. */ + ready list; + (3) Put found producer on the top of ready list. + Returns issue rate. */ + static int -do_reorder_for_imul (rtx *ready, int n_ready) +ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready, + int clock_var ATTRIBUTE_UNUSED) { - rtx insn, set, insn1, insn2; + static int issue_rate = -1; + int n_ready = *pn_ready; + rtx insn, insn1, insn2; + int i; sd_iterator_def sd_it; dep_t dep; int index = -1; - int i; + /* Set up issue rate. */ + issue_rate = ix86_issue_rate(); + + /* Do reodering for Atom only. */ if (ix86_tune != PROCESSOR_ATOM) - return index; + return issue_rate; + /* Do not perform ready list reodering for pre-reload schedule pass. */ + if (!reload_completed) + return issue_rate; + /* Nothing to do if ready list contains only 1 instruction. */ + if (n_ready <= 1) + return issue_rate; /* Check that IMUL instruction is on the top of ready list. */ insn = ready[n_ready - 1]; - set = single_set (insn); - if (!set) - return index; - if (!(GET_CODE (SET_SRC (set)) == MULT - && GET_MODE (SET_SRC (set)) == SImode)) - return index; + if (!NONDEBUG_INSN_P (insn)) + return issue_rate; + insn = PATTERN (insn); + if (GET_CODE (insn) == PARALLEL) + insn = XVECEXP (insn, 0, 0); + if (GET_CODE (insn) != SET) + return issue_rate; + if (!(GET_CODE (SET_SRC (insn)) == MULT + && GET_MODE (SET_SRC (insn)) == SImode)) + return issue_rate; /* Search for producer of independent IMUL instruction. */ - for (i = n_ready - 2; i >= 0; i--) + for (i = n_ready - 2; i>= 0; i--) { insn = ready[i]; if (!NONDEBUG_INSN_P (insn)) - continue; + continue; /* Skip IMUL instruction. */ insn2 = PATTERN (insn); if (GET_CODE (insn2) == PARALLEL) - insn2 = XVECEXP (insn2, 0, 0); + insn2 = XVECEXP (insn2, 0, 0); if (GET_CODE (insn2) == SET - && GET_CODE (SET_SRC (insn2)) == MULT - && GET_MODE (SET_SRC (insn2)) == SImode) - continue; + && GET_CODE (SET_SRC (insn2)) == MULT + && GET_MODE (SET_SRC (insn2)) == SImode) + continue; FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep) - { - rtx con; + { + rtx con; con = DEP_CON (dep); if (!NONDEBUG_INSN_P (con)) continue; - insn1 = PATTERN (con); - if (GET_CODE (insn1) == PARALLEL) - insn1 = XVECEXP (insn1, 0, 0); + insn1 = PATTERN (con); + if (GET_CODE (insn1) == PARALLEL) + insn1 = XVECEXP (insn1, 0, 0); - if (GET_CODE (insn1) == SET - && GET_CODE (SET_SRC (insn1)) == MULT - && GET_MODE (SET_SRC (insn1)) == SImode) - { - sd_iterator_def sd_it1; - dep_t dep1; - /* Check if there is no other dependee for IMUL. */ - index = i; - FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1) - { - rtx pro; - pro = DEP_PRO (dep1); + if (GET_CODE (insn1) == SET + && GET_CODE (SET_SRC (insn1)) == MULT + && GET_MODE (SET_SRC (insn1)) == SImode) + { + sd_iterator_def sd_it1; + dep_t dep1; + /* Check if there is no other dependee for IMUL. */ + index = i; + FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1) + { + rtx pro; + pro = DEP_PRO (dep1); if (!NONDEBUG_INSN_P (pro)) continue; - if (pro != insn) - index = -1; - } - if (index >= 0) - break; - } - } + if (pro != insn) + index = -1; + } + if (index >= 0) + break; + } + } if (index >= 0) - break; - } - return index; -} - -/* Try to find the best candidate on the top of ready list if two insns - have the same priority - candidate is best if its dependees were - scheduled earlier. Applied for Silvermont only. - Return true if top 2 insns must be interchanged. */ -static bool -swap_top_of_ready_list (rtx *ready, int n_ready) -{ - rtx top = ready[n_ready - 1]; - rtx next = ready[n_ready - 2]; - rtx set; - sd_iterator_def sd_it; - dep_t dep; - int clock1 = -1; - int clock2 = -1; - #define INSN_TICK(INSN) (HID (INSN)->tick) - - if (ix86_tune != PROCESSOR_SLM) - return false; - - if (!NONDEBUG_INSN_P (top)) - return false; - if (!NONJUMP_INSN_P (top)) - return false; - if (!NONDEBUG_INSN_P (next)) - return false; - if (!NONJUMP_INSN_P (next)) - return false; - set = single_set (top); - if (!set) - return false; - set = single_set (next); - if (!set) - return false; - - if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next)) - { - if (INSN_PRIORITY (top) != INSN_PRIORITY (next)) - return false; - /* Determine winner more precise. */ - FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep) - { - rtx pro; - pro = DEP_PRO (dep); - if (!NONDEBUG_INSN_P (pro)) - continue; - if (INSN_TICK (pro) > clock1) - clock1 = INSN_TICK (pro); - } - FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep) - { - rtx pro; - pro = DEP_PRO (dep); - if (!NONDEBUG_INSN_P (pro)) - continue; - if (INSN_TICK (pro) > clock2) - clock2 = INSN_TICK (pro); - } - - if (clock1 == clock2) - { - /* Determine winner - load must win. */ - enum attr_memory memory1, memory2; - memory1 = get_attr_memory (top); - memory2 = get_attr_memory (next); - if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD) - return true; - } - return (bool) (clock2 < clock1); + break; } - return false; - #undef INSN_TICK -} - -/* Perform possible reodering of ready list for Atom/Silvermont only. - Return issue rate. */ -static int -ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready, - int clock_var) -{ - int issue_rate = -1; - int n_ready = *pn_ready; - int i; - rtx insn; - int index = -1; - - /* Set up issue rate. */ - issue_rate = ix86_issue_rate (); + if (index < 0) + return issue_rate; /* Didn't find IMUL producer. */ - /* Do reodering for Atom/SLM only. */ - if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM) - return issue_rate; + if (sched_verbose > 1) + fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n", + INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1])); - /* Nothing to do if ready list contains only 1 instruction. */ - if (n_ready <= 1) - return issue_rate; + /* Put IMUL producer (ready[index]) at the top of ready list. */ + insn1= ready[index]; + for (i = index; i < n_ready - 1; i++) + ready[i] = ready[i + 1]; + ready[n_ready - 1] = insn1; - /* Do reodering for post-reload scheduler only. */ - if (!reload_completed) - return issue_rate; - - if ((index = do_reorder_for_imul (ready, n_ready)) >= 0) - { - if (sched_verbose > 1) - fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n", - INSN_UID (ready[index])); - - /* Put IMUL producer (ready[index]) at the top of ready list. */ - insn = ready[index]; - for (i = index; i < n_ready - 1; i++) - ready[i] = ready[i + 1]; - ready[n_ready - 1] = insn; - return issue_rate; - } - if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready)) - { - if (sched_verbose > 1) - fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n", - INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2])); - /* Swap 2 top elements of ready list. */ - insn = ready[n_ready - 1]; - ready[n_ready - 1] = ready[n_ready - 2]; - ready[n_ready - 2] = insn; - } return issue_rate; } @@ -29964,11 +29750,11 @@ fold_builtin_cpu (tree fndecl, tree *args) M_AMD, M_CPU_TYPE_START, M_INTEL_ATOM, + M_INTEL_SLM, M_INTEL_CORE2, M_INTEL_COREI7, M_AMDFAM10H, M_AMDFAM15H, - M_INTEL_SLM, M_CPU_SUBTYPE_START, M_INTEL_COREI7_NEHALEM, M_INTEL_COREI7_WESTMERE, diff --git a/gcc-4.8/gcc/config/i386/i386.h b/gcc-4.8/gcc/config/i386/i386.h index ebf62a557..6c1c33466 100644 --- a/gcc-4.8/gcc/config/i386/i386.h +++ b/gcc-4.8/gcc/config/i386/i386.h @@ -332,7 +332,6 @@ enum ix86_tune_indices { X86_TUNE_REASSOC_FP_TO_PARALLEL, X86_TUNE_GENERAL_REGS_SSE_SPILL, X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, - X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, X86_TUNE_LAST }; @@ -440,8 +439,6 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL] #define TARGET_AVOID_MEM_OPND_FOR_CMOVE \ ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE] -#define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \ - ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc-4.8/gcc/config/i386/i386.md b/gcc-4.8/gcc/config/i386/i386.md index 2d28d694c..6dcad7e02 100644 --- a/gcc-4.8/gcc/config/i386/i386.md +++ b/gcc-4.8/gcc/config/i386/i386.md @@ -3945,18 +3945,6 @@ CONST0_RTX (V4SFmode), operands[1])); }) -;; It's more profitable to split and then extend in the same register. -(define_peephole2 - [(set (match_operand:DF 0 "register_operand") - (float_extend:DF - (match_operand:SF 1 "memory_operand")))] - "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS - && optimize_insn_for_speed_p () - && SSE_REG_P (operands[0])" - [(set (match_dup 2) (match_dup 1)) - (set (match_dup 0) (float_extend:DF (match_dup 2)))] - "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));") - (define_insn "*extendsfdf2_mixed" [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x") (float_extend:DF @@ -4098,18 +4086,6 @@ CONST0_RTX (V2DFmode), operands[1])); }) -;; It's more profitable to split and then extend in the same register. -(define_peephole2 - [(set (match_operand:SF 0 "register_operand") - (float_truncate:SF - (match_operand:DF 1 "memory_operand")))] - "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS - && optimize_insn_for_speed_p () - && SSE_REG_P (operands[0])" - [(set (match_dup 2) (match_dup 1)) - (set (match_dup 0) (float_truncate:SF (match_dup 2)))] - "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));") - (define_expand "truncdfsf2_with_temp" [(parallel [(set (match_operand:SF 0) (float_truncate:SF (match_operand:DF 1))) diff --git a/gcc-4.8/libgcc/config/i386/cpuinfo.c b/gcc-4.8/libgcc/config/i386/cpuinfo.c index 1c744f123..f32ec17aa 100644 --- a/gcc-4.8/libgcc/config/i386/cpuinfo.c +++ b/gcc-4.8/libgcc/config/i386/cpuinfo.c @@ -52,16 +52,14 @@ enum processor_vendor VENDOR_MAX }; -/* Any new types or subtypes have to be inserted at the end. */ - enum processor_types { INTEL_ATOM = 1, + INTEL_SLM, INTEL_CORE2, INTEL_COREI7, AMDFAM10H, AMDFAM15H, - INTEL_SLM, CPU_TYPE_MAX }; -- cgit v1.2.3