From 01b34967a57ca33621130d36e007214b93bdfeaa Mon Sep 17 00:00:00 2001 From: Pavel Chupin Date: Mon, 3 Jun 2013 17:20:10 +0400 Subject: [4.7, 4.8] Extended Silvermont tuning. Backport r199546 from trunk: 2013-05-31 Yuri Rumyantsev Igor Zamyatin Silvermont (SLM) architecture performance tuning. * config/i386/i386.h (enum ix86_tune_indices): Add X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS. (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS): New define. * config/i386/i386.c (initial_ix86_tune_features) <X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS>: Initialize. (ix86_lea_outperforms): Handle Silvermont tuning. (ix86_avoid_lea_for_add): Add new argument to ix86_lea_outperforms call. (ix86_use_lea_for_mov): Likewise. (ix86_avoid_lea_for_addr): Likewise. (ix86_lea_for_add_ok): Likewise. (exact_dependency_1): New function. (exact_store_load_dependency): Likewise. (ix86_adjust_cost): Handle Silvermont tuning. (do_reoder_for_imul): Likewise. (swap_top_of_ready_list): New function. (ix86_sched_reorder): Changed to handle Silvermont tuning. * config/i386/i386.md (peepholes that split memory operand in fp converts): New. Also backport r199611 with fixes for the patch above and previous SLM patch: 2013-06-03 Yuri Rumyantsev * config/i386/i386.c (ix86_lea_outperforms): Fix formatting. (ix86_avoid_lea_for_addr): Likewise. (exact_dependency_1): Likewise. (ix86_adjust_cost): Likewise. (swap_top_of_ready_list): Fix formatting and !reload_completed check removed. (do_reorder_for_imul): Fix typo, formatting and !reload_completed check removed. (ix86_sched_reorder): Fix typo and formatting. (fold_builtin_cpu): Move M_INTEL_SLM at the end of processor types list. * config/i386/cpuinfo.c (INTEL_SLM): New enum value. Note that the [4.7] part of the patch doesn't contain some of the optimizations (IMUL) due to missing dependencies. The [4.8] part of this backport is complete. 
Change-Id: I4b5f92b025aab217046f5b393527636f3cf25669 Signed-off-by: Pavel Chupin --- gcc-4.7/gcc/config/i386/i386.c | 129 ++++++++++++++++++++++++++++++++++++++-- gcc-4.7/gcc/config/i386/i386.h | 3 + gcc-4.7/gcc/config/i386/i386.md | 24 ++++++++ 3 files changed, 151 insertions(+), 5 deletions(-) (limited to 'gcc-4.7') diff --git a/gcc-4.7/gcc/config/i386/i386.c b/gcc-4.7/gcc/config/i386/i386.c index 5c2dd46c3..b9f9d7728 100644 --- a/gcc-4.7/gcc/config/i386/i386.c +++ b/gcc-4.7/gcc/config/i386/i386.c @@ -2252,6 +2252,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations during reassociation of fp computation. */ m_ATOM | m_SLM, + + /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for + fp converts to destination register. */ + m_SLM }; /* Feature tests against the various architecture variations. */ @@ -16938,10 +16942,24 @@ distance_agu_use (unsigned int regno0, rtx insn) static bool ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1, - unsigned int regno2, int split_cost) + unsigned int regno2, int split_cost, bool has_scale) { int dist_define, dist_use; + /* For Silvermont if using a 2-source or 3-source LEA for + non-destructive destination purposes, or due to wanting + ability to use SCALE, the use of LEA is justified. 
*/ + if (ix86_tune == PROCESSOR_SLM) + { + if (has_scale) + return true; + if (split_cost < 1) + return false; + if (regno0 == regno1 || regno0 == regno2) + return false; + return true; + } + dist_define = distance_non_agu_define (regno1, regno2, insn); dist_use = distance_agu_use (regno0, insn); @@ -17028,7 +17046,7 @@ ix86_avoid_lea_for_add (rtx insn, rtx operands[]) if (regno0 == regno1 || regno0 == regno2) return false; else - return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1); + return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false); } /* Return true if we should emit lea instruction instead of mov @@ -17051,7 +17069,7 @@ ix86_use_lea_for_mov (rtx insn, rtx operands[]) regno0 = true_regnum (operands[0]); regno1 = true_regnum (operands[1]); - return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0); + return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false); } /* Return true if we need to split lea into a sequence of @@ -17131,7 +17149,8 @@ ix86_avoid_lea_for_addr (rtx insn, rtx operands[]) split_cost -= 1; } - return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost); + return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost, + parts.scale > 1); } /* Emit x86 binary operand CODE in mode MODE, where the first operand @@ -17269,7 +17288,7 @@ ix86_lea_for_add_ok (rtx insn, rtx operands[]) if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) return false; - return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0); + return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false); } /* Return true if destination reg of SET_BODY is shift count of @@ -23894,6 +23913,73 @@ ix86_agi_dependent (rtx set_insn, rtx use_insn) return false; } +/* Helper function for exact_store_load_dependency. + Return true if addr is found in insn. 
*/ +static bool +exact_dependency_1 (rtx addr, rtx insn) +{ + enum rtx_code code; + const char *format_ptr; + int i, j; + + code = GET_CODE (insn); + switch (code) + { + case MEM: + if (rtx_equal_p (addr, insn)) + return true; + break; + case REG: + CASE_CONST_ANY: + case SYMBOL_REF: + case CODE_LABEL: + case PC: + case CC0: + case EXPR_LIST: + return false; + default: + break; + } + + format_ptr = GET_RTX_FORMAT (code); + for (i = 0; i < GET_RTX_LENGTH (code); i++) + { + switch (*format_ptr++) + { + case 'e': + if (exact_dependency_1 (addr, XEXP (insn, i))) + return true; + break; + case 'E': + for (j = 0; j < XVECLEN (insn, i); j++) + if (exact_dependency_1 (addr, XVECEXP (insn, i, j))) + return true; + break; + } + } + return false; +} + +/* Return true if there exists exact dependency for store & load, i.e. + the same memory address is used in them. */ +static bool +exact_store_load_dependency (rtx store, rtx load) +{ + rtx set1, set2; + + set1 = single_set (store); + if (!set1) + return false; + if (!MEM_P (SET_DEST (set1))) + return false; + set2 = single_set (load); + if (!set2) + return false; + if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2))) + return true; + return false; +} + static int ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) { @@ -24043,6 +24129,39 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) else cost = 0; } + break; + + case PROCESSOR_SLM: + if (!reload_completed) + return cost; + + /* Increase cost of integer loads. */ + memory = get_attr_memory (dep_insn); + if (memory == MEMORY_LOAD || memory == MEMORY_BOTH) + { + enum attr_unit unit = get_attr_unit (dep_insn); + if (unit == UNIT_INTEGER && cost == 1) + { + if (memory == MEMORY_LOAD) + cost = 3; + else + { + /* Increase cost of ld/st for short int types only + because of store forwarding issue. 
*/ + rtx set = single_set (dep_insn); + if (set && (GET_MODE (SET_DEST (set)) == QImode + || GET_MODE (SET_DEST (set)) == HImode)) + { + /* Increase cost of store/load insn if exact + dependence exists and it is load insn. */ + enum attr_memory insn_memory = get_attr_memory (insn); + if (insn_memory == MEMORY_LOAD + && exact_store_load_dependency (dep_insn, insn)) + cost = 3; + } + } + } + } default: break; diff --git a/gcc-4.7/gcc/config/i386/i386.h b/gcc-4.7/gcc/config/i386/i386.h index e01cc4797..b286862cc 100644 --- a/gcc-4.7/gcc/config/i386/i386.h +++ b/gcc-4.7/gcc/config/i386/i386.h @@ -321,6 +321,7 @@ enum ix86_tune_indices { X86_TUNE_AVX128_OPTIMAL, X86_TUNE_REASSOC_INT_TO_PARALLEL, X86_TUNE_REASSOC_FP_TO_PARALLEL, + X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, X86_TUNE_LAST }; @@ -423,6 +424,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_REASSOC_INT_TO_PARALLEL] #define TARGET_REASSOC_FP_TO_PARALLEL \ ix86_tune_features[X86_TUNE_REASSOC_FP_TO_PARALLEL] +#define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \ + ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { diff --git a/gcc-4.7/gcc/config/i386/i386.md b/gcc-4.7/gcc/config/i386/i386.md index 1b630da8e..7809961e4 100644 --- a/gcc-4.7/gcc/config/i386/i386.md +++ b/gcc-4.7/gcc/config/i386/i386.md @@ -3887,6 +3887,18 @@ CONST0_RTX (V4SFmode), operands[1])); }) +;; It's more profitable to split and then extend in the same register. 
+(define_peephole2 + [(set (match_operand:DF 0 "register_operand") + (float_extend:DF + (match_operand:SF 1 "memory_operand")))] + "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS + && optimize_insn_for_speed_p () + && SSE_REG_P (operands[0])" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (float_extend:DF (match_dup 2)))] + "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));") + (define_insn "*extendsfdf2_mixed" [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x") (float_extend:DF @@ -4031,6 +4043,18 @@ CONST0_RTX (V2DFmode), operands[1])); }) +;; It's more profitable to split and then extend in the same register. +(define_peephole2 + [(set (match_operand:SF 0 "register_operand") + (float_truncate:SF + (match_operand:DF 1 "memory_operand")))] + "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS + && optimize_insn_for_speed_p () + && SSE_REG_P (operands[0])" + [(set (match_dup 2) (match_dup 1)) + (set (match_dup 0) (float_truncate:SF (match_dup 2)))] + "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));") + (define_expand "truncdfsf2_with_temp" [(parallel [(set (match_operand:SF 0 "" "") (float_truncate:SF (match_operand:DF 1 "" ""))) -- cgit v1.2.3