diff options
author | Pavel Chupin <pavel.v.chupin@intel.com> | 2013-05-30 15:35:10 +0400 |
---|---|---|
committer | Pavel Chupin <pavel.v.chupin@intel.com> | 2014-06-19 22:39:06 +0400 |
commit | 27f3b6e52df39b2bf4038d7fb92e1c642c4913dc (patch) | |
tree | 8da144c634a878118db78dbf604043a75e2fb0c5 /gcc-4.8/gcc/config/i386/i386.c | |
parent | f190d6284359da8ae8694b2d2e14b01602a959ed (diff) | |
download | toolchain_gcc-27f3b6e52df39b2bf4038d7fb92e1c642c4913dc.tar.gz toolchain_gcc-27f3b6e52df39b2bf4038d7fb92e1c642c4913dc.tar.bz2 toolchain_gcc-27f3b6e52df39b2bf4038d7fb92e1c642c4913dc.zip |
[4.8, REAPPLY] Release basic tuning for new Silvermont architecture
Reapplying https://android-review.googlesource.com/#/c/59726 after 4.8.3
merge
Change-Id: I855de6c963d423f68899f90aada1758ae6f6c0d9
Signed-off-by: Pavel Chupin <pavel.v.chupin@intel.com>
Diffstat (limited to 'gcc-4.8/gcc/config/i386/i386.c')
-rw-r--r-- | gcc-4.8/gcc/config/i386/i386.c | 127 |
1 file changed, 105 insertions(+), 22 deletions(-)
diff --git a/gcc-4.8/gcc/config/i386/i386.c b/gcc-4.8/gcc/config/i386/i386.c index 0569828f3..99d1eb8e7 100644 --- a/gcc-4.8/gcc/config/i386/i386.c +++ b/gcc-4.8/gcc/config/i386/i386.c @@ -1480,6 +1480,79 @@ struct processor_costs atom_cost = { 1, /* cond_not_taken_branch_cost. */ }; +static const +struct processor_costs slm_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (2)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {8, 8, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {8, 8, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. 
*/ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (8), /* cost of FMUL instruction. */ + COSTS_N_INSNS (20), /* cost of FDIV instruction. */ + COSTS_N_INSNS (8), /* cost of FABS instruction. */ + COSTS_N_INSNS (8), /* cost of FCHS instruction. */ + COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, + {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}, + {{libcall, {{8, loop, false}, {15, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{24, loop, false}, {32, unrolled_loop, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + /* Generic64 should produce code tuned for Nocona and K8. 
*/ static const struct processor_costs generic64_cost = { @@ -1733,6 +1806,7 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_HASWELL (1<<PROCESSOR_HASWELL) #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL) #define m_ATOM (1<<PROCESSOR_ATOM) +#define m_SLM (1<<PROCESSOR_SLM) #define m_GEODE (1<<PROCESSOR_GEODE) #define m_K6 (1<<PROCESSOR_K6) @@ -1776,7 +1850,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { m_486 | m_PENT, /* X86_TUNE_UNROLL_STRLEN */ - m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC, + m_486 | m_PENT | m_PPRO | m_ATOM | m_SLM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based on simulation result. But after P4 was made, no performance benefit @@ -1788,11 +1862,11 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { ~m_386, /* X86_TUNE_USE_SAHF */ - m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC, + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC, /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid partial dependencies. */ - m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC, + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial register stalls on Generic32 compilation setting as well. 
However @@ -1815,13 +1889,13 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { m_386 | m_486 | m_K6_GEODE, /* X86_TUNE_USE_SIMODE_FIOP */ - ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC), + ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC), /* X86_TUNE_USE_MOV0 */ m_K6, /* X86_TUNE_USE_CLTD */ - ~(m_PENT | m_ATOM | m_K6), + ~(m_PENT | m_ATOM | m_SLM | m_K6), /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */ m_PENT4, @@ -1836,7 +1910,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { ~(m_PENT | m_PPRO), /* X86_TUNE_PROMOTE_QIMODE */ - m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, + m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_FAST_PREFIX */ ~(m_386 | m_486 | m_PENT), @@ -1877,10 +1951,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred for DFmode copies */ - ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_ATOM | m_GENERIC), + ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC), /* X86_TUNE_PARTIAL_REG_DEPENDENCY */ - m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, + m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a conflict here in between PPro/Pentium4 based chips that thread 128bit @@ -1891,13 +1965,13 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { shows that disabling this option on P4 brings over 20% SPECfp regression, while enabling it on K8 brings roughly 2.4% regression that can be partly masked by careful scheduling of moves. 
*/ - m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC, + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10 | m_BDVER | m_GENERIC, /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */ - m_COREI7 | m_HASWELL | m_AMDFAM10 | m_BDVER | m_BTVER, + m_COREI7 | m_HASWELL | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM, /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */ - m_COREI7 | m_HASWELL| m_BDVER, + m_COREI7 | m_HASWELL | m_BDVER | m_SLM, /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */ m_BDVER , @@ -1915,7 +1989,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { m_PPRO | m_P4_NOCONA, /* X86_TUNE_MEMORY_MISMATCH_STALL */ - m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, + m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_PROLOGUE_USING_MOVE */ m_PPRO | m_ATHLON_K8, @@ -1937,16 +2011,16 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more than 4 branch instructions in the 16 byte window. 
*/ - m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_SCHEDULE */ - m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, + m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_USE_BT */ - m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, + m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_USE_INCDEC */ - ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC), + ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC), /* X86_TUNE_PAD_RETURNS */ m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC, @@ -1955,7 +2029,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { m_ATOM, /* X86_TUNE_EXT_80387_CONSTANTS */ - m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC, + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC, /* X86_TUNE_AVOID_VECTOR_DECODE */ m_CORE_ALL | m_K8 | m_GENERIC64, @@ -2000,7 +2074,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag will impact LEA instruction selection. */ - m_ATOM, + m_ATOM | m_SLM, /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector instructions. */ @@ -2021,7 +2095,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations during reassociation of fp computation. */ - m_ATOM | m_HASWELL, + m_ATOM | m_SLM | m_HASWELL, /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE regs instead of memory. 
*/ @@ -2055,10 +2129,10 @@ static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = { }; static const unsigned int x86_accumulate_outgoing_args - = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC; + = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC; static const unsigned int x86_arch_always_fancy_math_387 - = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; + = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC; static const unsigned int x86_avx256_split_unaligned_load = m_COREI7 | m_GENERIC; @@ -2433,6 +2507,7 @@ static const struct ptt processor_target_table[PROCESSOR_max] = {"corei7", &core_cost, 16, 10, 16, 10, 16}, {"core-avx2", &core_cost, 16, 10, 16, 10, 16}, {"atom", &atom_cost, 16, 15, 16, 7, 16}, + {"slm", &slm_cost, 16, 15, 16, 7, 16}, {"geode", &geode_cost, 0, 0, 0, 0, 0}, {"k6", &k6_cost, 32, 7, 32, 7, 32}, {"athlon", &athlon_cost, 16, 7, 16, 7, 16}, @@ -2892,6 +2967,10 @@ ix86_option_override_internal (bool main_args_p) {"atom", PROCESSOR_ATOM, CPU_ATOM, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR}, + {"slm", PROCESSOR_SLM, CPU_SLM, + PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 + | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE + | PTA_FXSR}, {"geode", PROCESSOR_GEODE, CPU_GEODE, PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW}, {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX}, @@ -24101,6 +24180,7 @@ ix86_issue_rate (void) { case PROCESSOR_PENTIUM: case PROCESSOR_ATOM: + case PROCESSOR_SLM: case PROCESSOR_K6: case PROCESSOR_BTVER2: return 2; @@ -24368,6 +24448,7 @@ ia32_multipass_dfa_lookahead (void) case PROCESSOR_COREI7: case PROCESSOR_HASWELL: case PROCESSOR_ATOM: + case PROCESSOR_SLM: /* Generally, we want haifa-sched:max_issue() to look ahead as far as many instructions can be executed on a cycle, i.e., issue_rate. 
I wonder why tuning for many CPUs does not do this. */ @@ -24468,7 +24549,7 @@ ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready, continue; if (pro != insn) index = -1; - } + } if (index >= 0) break; } @@ -29650,6 +29731,7 @@ fold_builtin_cpu (tree fndecl, tree *args) M_AMD, M_CPU_TYPE_START, M_INTEL_ATOM, + M_INTEL_SLM, M_INTEL_CORE2, M_INTEL_COREI7, M_AMDFAM10H, @@ -29676,6 +29758,7 @@ fold_builtin_cpu (tree fndecl, tree *args) {"amd", M_AMD}, {"intel", M_INTEL}, {"atom", M_INTEL_ATOM}, + {"slm", M_INTEL_SLM}, {"core2", M_INTEL_CORE2}, {"corei7", M_INTEL_COREI7}, {"nehalem", M_INTEL_COREI7_NEHALEM}, |