author     Andrew Hsieh <andrewhsieh@google.com>    2014-06-18 09:58:10 -0700
committer  Pavel Chupin <pavel.v.chupin@intel.com>  2014-06-19 22:38:26 +0400
commit     4ff2f42147bc128ce38789071d98e55844cd3a5e (patch)
tree       7d111051796048be35be22a8655bad3f665d673a /gcc-4.8/gcc/config/i386/i386.c
parent     f1fba0f5b24322265b9c3d37863a504c4b67cdc2 (diff)
revert 41eff3d706b202f682f64fcc7773c64abd59ac45

Reverted for the purpose of merging gcc-4.8.3; it will be put back afterwards.
https://android-review.googlesource.com/#/c/59726
Reverted change: "[4.7, 4.8] Release basic tuning for new Silvermont architecture"

Change-Id: Id2b9e714418b3cf0a7eef0f599c39ba81df47d13
Diffstat (limited to 'gcc-4.8/gcc/config/i386/i386.c')
-rw-r--r--  gcc-4.8/gcc/config/i386/i386.c | 130
1 file changed, 23 insertions(+), 107 deletions(-)
diff --git a/gcc-4.8/gcc/config/i386/i386.c b/gcc-4.8/gcc/config/i386/i386.c
index 0c5b57d3f..98dc2e41e 100644
--- a/gcc-4.8/gcc/config/i386/i386.c
+++ b/gcc-4.8/gcc/config/i386/i386.c
@@ -1480,79 +1480,6 @@ struct processor_costs atom_cost = {
1, /* cond_not_taken_branch_cost. */
};
-static const
-struct processor_costs slm_cost = {
- COSTS_N_INSNS (1), /* cost of an add instruction */
- COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
- COSTS_N_INSNS (1), /* variable shift costs */
- COSTS_N_INSNS (1), /* constant shift costs */
- {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
- COSTS_N_INSNS (4), /* HI */
- COSTS_N_INSNS (3), /* SI */
- COSTS_N_INSNS (4), /* DI */
- COSTS_N_INSNS (2)}, /* other */
- 0, /* cost of multiply per each bit set */
- {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
- COSTS_N_INSNS (26), /* HI */
- COSTS_N_INSNS (42), /* SI */
- COSTS_N_INSNS (74), /* DI */
- COSTS_N_INSNS (74)}, /* other */
- COSTS_N_INSNS (1), /* cost of movsx */
- COSTS_N_INSNS (1), /* cost of movzx */
- 8, /* "large" insn */
- 17, /* MOVE_RATIO */
- 4, /* cost for loading QImode using movzbl */
- {4, 4, 4}, /* cost of loading integer registers
- in QImode, HImode and SImode.
- Relative to reg-reg move (2). */
- {4, 4, 4}, /* cost of storing integer registers */
- 4, /* cost of reg,reg fld/fst */
- {12, 12, 12}, /* cost of loading fp registers
- in SFmode, DFmode and XFmode */
- {6, 6, 8}, /* cost of storing fp registers
- in SFmode, DFmode and XFmode */
- 2, /* cost of moving MMX register */
- {8, 8}, /* cost of loading MMX registers
- in SImode and DImode */
- {8, 8}, /* cost of storing MMX registers
- in SImode and DImode */
- 2, /* cost of moving SSE register */
- {8, 8, 8}, /* cost of loading SSE registers
- in SImode, DImode and TImode */
- {8, 8, 8}, /* cost of storing SSE registers
- in SImode, DImode and TImode */
- 5, /* MMX or SSE register to integer */
- 32, /* size of l1 cache. */
- 256, /* size of l2 cache. */
- 64, /* size of prefetch block */
- 6, /* number of parallel prefetches */
- 3, /* Branch cost */
- COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
- COSTS_N_INSNS (8), /* cost of FMUL instruction. */
- COSTS_N_INSNS (20), /* cost of FDIV instruction. */
- COSTS_N_INSNS (8), /* cost of FABS instruction. */
- COSTS_N_INSNS (8), /* cost of FCHS instruction. */
- COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
- {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
- {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
- {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
- {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
- {libcall, {{24, loop, false}, {32, unrolled_loop, false},
- {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
- 1, /* scalar_stmt_cost. */
- 1, /* scalar load_cost. */
- 1, /* scalar_store_cost. */
- 1, /* vec_stmt_cost. */
- 1, /* vec_to_scalar_cost. */
- 1, /* scalar_to_vec_cost. */
- 1, /* vec_align_load_cost. */
- 2, /* vec_unalign_load_cost. */
- 1, /* vec_store_cost. */
- 3, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
-};
-
/* Generic64 should produce code tuned for Nocona and K8. */
static const
struct processor_costs generic64_cost = {
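The slm_cost table removed above is expressed in COSTS_N_INSNS units. For
reference, a minimal sketch of those units, assuming the stock definition of
the macro in GCC's rtl.h (not part of this diff):

  /* rtl.h scales a cost of N instructions into quarter-insn units,
     so sub-instruction costs stay representable.  */
  #define COSTS_N_INSNS(N) ((N) * 4)

  /* Example: slm_cost priced a lea at COSTS_N_INSNS (1) + 1 = 5 units,
     slightly dearer than a plain add (4) but well under two insns (8).  */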
@@ -1806,7 +1733,6 @@ const struct processor_costs *ix86_cost = &pentium_cost;
#define m_HASWELL (1<<PROCESSOR_HASWELL)
#define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
#define m_ATOM (1<<PROCESSOR_ATOM)
-#define m_SLM (1<<PROCESSOR_SLM)
#define m_GEODE (1<<PROCESSOR_GEODE)
#define m_K6 (1<<PROCESSOR_K6)
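Each m_* macro is a one-bit mask keyed by a PROCESSOR_* enum value, so deleting
m_SLM here is what lets every feature mask below drop its | m_SLM term. For
context, a sketch of how these masks are consumed, reconstructed from the shape
of ix86_option_override_internal rather than quoted from this diff:

  /* Pick out the tuning bit for the CPU selected by -mtune, then
     record, per feature, whether that CPU has the feature enabled.  */
  unsigned int ix86_tune_mask = 1u << ix86_tune;
  for (int i = 0; i < X86_TUNE_LAST; ++i)
    ix86_tune_features[i]
      = !!(initial_ix86_tune_features[i] & ix86_tune_mask);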
@@ -1850,7 +1776,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
m_486 | m_PENT,
/* X86_TUNE_UNROLL_STRLEN */
- m_486 | m_PENT | m_PPRO | m_ATOM | m_SLM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
+ m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
on simulation result. But after P4 was made, no performance benefit
@@ -1862,11 +1788,11 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
~m_386,
/* X86_TUNE_USE_SAHF */
- m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
partial dependencies. */
- m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
register stalls on Generic32 compilation setting as well. However
@@ -1889,13 +1815,13 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
m_386 | m_486 | m_K6_GEODE,
/* X86_TUNE_USE_SIMODE_FIOP */
- ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC),
+ ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
/* X86_TUNE_USE_MOV0 */
m_K6,
/* X86_TUNE_USE_CLTD */
- ~(m_PENT | m_ATOM | m_SLM | m_K6),
+ ~(m_PENT | m_ATOM | m_K6),
/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
m_PENT4,
@@ -1910,7 +1836,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
~(m_PENT | m_PPRO),
/* X86_TUNE_PROMOTE_QIMODE */
- m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
+ m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_FAST_PREFIX */
~(m_386 | m_486 | m_PENT),
@@ -1951,10 +1877,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
for DFmode copies */
- ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
+ ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
/* X86_TUNE_PARTIAL_REG_DEPENDENCY */
- m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
+ m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
conflict here in between PPro/Pentium4 based chips that thread 128bit
@@ -1965,13 +1891,13 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
shows that disabling this option on P4 brings over 20% SPECfp regression,
while enabling it on K8 brings roughly 2.4% regression that can be partly
masked by careful scheduling of moves. */
- m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10 | m_BDVER | m_GENERIC,
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
- m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM,
+ m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
- m_COREI7 | m_BDVER | m_SLM,
+ m_COREI7 | m_BDVER,
/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
m_BDVER ,
@@ -1989,7 +1915,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
m_PPRO | m_P4_NOCONA,
/* X86_TUNE_MEMORY_MISMATCH_STALL */
- m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
+ m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_PROLOGUE_USING_MOVE */
m_PPRO | m_ATHLON_K8,
@@ -2011,16 +1937,16 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
than 4 branch instructions in the 16 byte window. */
- m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_SCHEDULE */
- m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
+ m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_USE_BT */
- m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC,
+ m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
/* X86_TUNE_USE_INCDEC */
- ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC),
+ ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC),
/* X86_TUNE_PAD_RETURNS */
m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
@@ -2029,7 +1955,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
m_ATOM,
/* X86_TUNE_EXT_80387_CONSTANTS */
- m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
+ m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
/* X86_TUNE_AVOID_VECTOR_DECODE */
m_CORE_ALL | m_K8 | m_GENERIC64,
@@ -2074,7 +2000,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
will impact LEA instruction selection. */
- m_ATOM | m_SLM,
+ m_ATOM,
/* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
instructions. */
@@ -2095,7 +2021,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
during reassociation of fp computation. */
- m_ATOM | m_SLM | m_HASWELL,
+ m_ATOM | m_HASWELL,
/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
regs instead of memory. */
@@ -2129,10 +2055,10 @@ static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
};
static const unsigned int x86_accumulate_outgoing_args
- = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
+ = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
static const unsigned int x86_arch_always_fancy_math_387
- = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
+ = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
static const unsigned int x86_avx256_split_unaligned_load
= m_COREI7 | m_GENERIC;
@@ -2517,8 +2443,7 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
{&bdver3_cost, 16, 10, 16, 7, 11},
{&btver1_cost, 16, 10, 16, 7, 11},
{&btver2_cost, 16, 10, 16, 7, 11},
- {&atom_cost, 16, 15, 16, 7, 16},
- {&slm_cost, 16, 15, 16, 7, 16}
+ {&atom_cost, 16, 15, 16, 7, 16}
};
static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
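Each row of processor_target_table above pairs a cost table with alignment
tuning, so the slm row is dropped together with its cost table. A sketch of the
assumed row layout (field order inferred from the six initializers per row;
treat as a reconstruction of struct ptt, not a quotation):

  struct ptt
  {
    const struct processor_costs *cost;  /* e.g. &atom_cost */
    const int align_loop;                /* loop alignment */
    const int align_loop_max_skip;
    const int align_jump;                /* jump-target alignment */
    const int align_jump_max_skip;
    const int align_func;                /* function alignment */
  };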
@@ -2539,7 +2464,6 @@ static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
"corei7",
"core-avx2",
"atom",
- "slm",
"geode",
"k6",
"k6-2",
@@ -3002,10 +2926,6 @@ ix86_option_override_internal (bool main_args_p)
{"atom", PROCESSOR_ATOM, CPU_ATOM,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
- {"slm", PROCESSOR_SLM, CPU_SLM,
- PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
- | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
- | PTA_FXSR},
{"geode", PROCESSOR_GEODE, CPU_GEODE,
PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
{"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
@@ -24200,7 +24120,6 @@ ix86_issue_rate (void)
{
case PROCESSOR_PENTIUM:
case PROCESSOR_ATOM:
- case PROCESSOR_SLM:
case PROCESSOR_K6:
case PROCESSOR_BTVER2:
return 2;
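ix86_issue_rate backs the TARGET_SCHED_ISSUE_RATE scheduler hook; the hunk
above removes Silvermont from the dual-issue group. A sketch of the surrounding
switch under that reading (the default arm is an assumption, not quoted):

  static int
  ix86_issue_rate (void)
  {
    switch (ix86_tune)
      {
      case PROCESSOR_PENTIUM:
      case PROCESSOR_ATOM:      /* SLM no longer has its own case */
      case PROCESSOR_K6:
      case PROCESSOR_BTVER2:
        return 2;               /* dual-issue cores */
      default:
        return 1;               /* assumed conservative fallback */
      }
  }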
@@ -24468,7 +24387,6 @@ ia32_multipass_dfa_lookahead (void)
case PROCESSOR_COREI7:
case PROCESSOR_HASWELL:
case PROCESSOR_ATOM:
- case PROCESSOR_SLM:
/* Generally, we want haifa-sched:max_issue() to look ahead as far
as many instructions can be executed on a cycle, i.e.,
issue_rate. I wonder why tuning for many CPUs does not do this. */
@@ -24569,7 +24487,7 @@ ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
continue;
if (pro != insn)
index = -1;
- }
+ }
if (index >= 0)
break;
}
@@ -29750,7 +29668,6 @@ fold_builtin_cpu (tree fndecl, tree *args)
M_AMD,
M_CPU_TYPE_START,
M_INTEL_ATOM,
- M_INTEL_SLM,
M_INTEL_CORE2,
M_INTEL_COREI7,
M_AMDFAM10H,
@@ -29777,7 +29694,6 @@ fold_builtin_cpu (tree fndecl, tree *args)
{"amd", M_AMD},
{"intel", M_INTEL},
{"atom", M_INTEL_ATOM},
- {"slm", M_INTEL_SLM},
{"core2", M_INTEL_CORE2},
{"corei7", M_INTEL_COREI7},
{"nehalem", M_INTEL_COREI7_NEHALEM},