From 220882082b32a048e224bc678f5a3b6fe87259b7 Mon Sep 17 00:00:00 2001
From: jgreenhalgh
Date: Fri, 16 May 2014 08:41:46 +0000
Subject: [4.9] Adjust several costs for AArch64.

Backport from trunk.

2014-05-16  James Greenhalgh

    Refactor aarch64_address_costs.

    gcc/
    * config/aarch64/aarch64-protos.h (scale_addr_mode_cost): New.
    (cpu_addrcost_table): Use it.
    * config/aarch64/aarch64.c (generic_addrcost_table): Initialize it.
    (aarch64_address_cost): Rewrite using aarch64_classify_address,
    move it.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210493 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh

    Add cost tables for Cortex-A57

    gcc/
    * config/aarch64/aarch64.c (cortexa57_addrcost_table): New.
    (cortexa57_vector_cost): Likewise.
    (cortexa57_tunings): Use them.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210494 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh

    Wrap aarch64_rtx_costs to dump verbose output

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs_wrapper): New.
    (TARGET_RTX_COSTS): Call it.

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Better estimate cost of building a constant

    gcc/
    * config/aarch64/aarch64.c (aarch64_build_constant): Conditionally
    emit instructions, return number of instructions which would be
    emitted.
    (aarch64_add_constant): Update call to aarch64_build_constant.
    (aarch64_output_mi_thunk): Likewise.
    (aarch64_rtx_costs): Estimate cost of a CONST_INT, cost a
    CONST_DOUBLE.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210496 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Factor out common MULT cases

    gcc/
    * config/aarch64/aarch64.c (aarch64_strip_shift_or_extend): Rename
    to...
    (aarch64_strip_extend): ...this, don't strip shifts, check RTX is
    well formed.
    (aarch64_rtx_mult_cost): New.
    (aarch64_rtx_costs): Use it, refactor as appropriate.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210497 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh

    Set default costs and handle vector modes.

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Set default costs.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210498 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Improve SET cost.

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Improve costing
    for SET RTX.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210499 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Cost memory accesses using address costs

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Use address costs
    when costing loads and stores to memory.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210500 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Better cost logical operations

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Improve cost for
    logical operations.
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210501 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Improve costs for sign/zero extend operations

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Cost ZERO_EXTEND
    and SIGN_EXTEND better.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210502 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Improve costs for rotate and shift operations.

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Improve costs for
    rotates and shifts.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210503 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Improve costs for sign/zero extracts

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Improve costs for
    SIGN/ZERO_EXTRACT.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210504 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh

    Improve costs for div/mod

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Improve costs for
    DIV/MOD.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210505 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Cost comparisons, flag setting operators and IF_THEN_ELSE

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Cost comparison
    operators.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210506 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Cost more Floating point RTX.

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Cost FMA,
    FLOAT_EXTEND, FLOAT_TRUNCATE, ABS, SMAX, and SMIN.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210507 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh
            Philipp Tomsich

    Cost TRUNCATE

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Cost TRUNCATE.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210508 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh

    Cost for SYMBOL_REF, HIGH and LO_SUM

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Cost SYMBOL_REF,
    HIGH, LO_SUM.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210509 138bc75d-0d04-0410-961f-82ee72b054a4

2014-05-16  James Greenhalgh

    Dump a message if we are unable to cost an insn.

    gcc/
    * config/aarch64/aarch64.c (aarch64_rtx_costs): Handle the case
    where we were unable to cost an RTX.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@210510 138bc75d-0d04-0410-961f-82ee72b054a4

2014-08-26  Evandro Menezes

    Fix typos in cost data structure.

    * config/aarch64/aarch64.c (generic_addrcost_table): Delete qi
    cost; add di cost.
    (cortexa57_addrcost_table): Likewise.

    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@214503 138bc75d-0d04-0410-961f-82ee72b054a4
---
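Reviewer note: the new scale_addr_mode_cost table in the aarch64-protos.h
hunk below charges a scaled (shifted register-offset) address by the size of
the accessed mode.  The following standalone C sketch, with hypothetical
names and illustrative values, mirrors the lookup that aarch64_address_cost
performs further down; it is not GCC code.

    #include <stdio.h>

    struct scale_addr_mode_cost
    {
      int hi, si, di, ti;
    };

    /* Pick the extra cost of a scaled address by access size, the same
       selection the patched aarch64_address_cost makes on
       GET_MODE_BITSIZE.  */
    static int
    scaled_address_extra_cost (const struct scale_addr_mode_cost *costs,
                               int mode_bits)
    {
      switch (mode_bits)
        {
        case 16: return costs->hi;
        case 32: return costs->si;
        case 64: return costs->di;
        default: return costs->ti;   /* Unknown, or a 128-bit vector.  */
        }
    }

    int
    main (void)
    {
      /* The Cortex-A57 values from the patch: 16-bit and 128-bit scaled
         accesses cost one extra unit, 32/64-bit ones are free.  */
      struct scale_addr_mode_cost a57 = { 1, 0, 0, 1 };
      printf ("ldrh delta: %d\n", scaled_address_extra_cost (&a57, 16));
      printf ("ldr  delta: %d\n", scaled_address_extra_cost (&a57, 64));
      return 0;
    }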
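The central trick of the aarch64_build_constant change below is that one walk
over the constant either emits a MOV/MOVK sequence or merely counts it, so
the same logic can back both code generation and CONST_INT costing.  A
minimal standalone sketch of that counting scheme (not GCC code; it ignores
the bitmask-immediate early exit, and all names are hypothetical):

    #include <stdint.h>
    #include <stdio.h>

    /* Count, and when GENERATE is nonzero also print, the MOVZ/MOVN +
       MOVK sequence needed to build VAL in a register.  */
    static int
    build_constant_insns (uint64_t val, int generate)
    {
      int zero_chunks = 0, ones_chunks = 0;
      int insns = 1;                    /* The initial MOVZ or MOVN.  */

      for (int i = 0; i < 64; i += 16)
        {
          uint16_t chunk = (uint16_t) (val >> i);
          if (chunk == 0)
            zero_chunks++;
          else if (chunk == 0xffff)
            ones_chunks++;
        }

      /* Start from whichever background (all-zeros or all-ones) leaves
         fewer 16-bit chunks to patch with MOVK.  */
      uint16_t background = (ones_chunks > zero_chunks) ? 0xffff : 0;

      if (generate)
        printf ("%s x0, #0x%04x\n", background ? "movn" : "movz",
                background ? (unsigned) (uint16_t) ~val
                           : (unsigned) (uint16_t) val);

      for (int i = 16; i < 64; i += 16)
        {
          uint16_t chunk = (uint16_t) (val >> i);
          if (chunk != background)
            {
              if (generate)
                printf ("movk x0, #0x%04x, lsl #%d\n", (unsigned) chunk, i);
              insns++;
            }
        }
      return insns;
    }

    int
    main (void)
    {
      /* Costing mode: nothing is "emitted", only counted -- the dual use
         the patched aarch64_build_constant serves.  */
      printf ("%d insns\n", build_constant_insns (0x1234000000005678ull, 0));
      (void) build_constant_insns (0x1234000000005678ull, 1);
      return 0;
    }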
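The new aarch64_rtx_mult_cost below special-cases a MULT by a power of two,
because such a multiply is canonicalized to a shift and must be costed as
LSL (or as an ADD with shifted register when it feeds a PLUS/MINUS), never
as a MUL.  A runnable sketch of that test, with illustrative cost units
rather than the real cost tables:

    #include <stdint.h>
    #include <stdio.h>

    /* Return n if x == 2^n, else -1; a stand-in for GCC's exact_log2.  */
    static int
    exact_log2_u64 (uint64_t x)
    {
      if (x == 0 || (x & (x - 1)) != 0)
        return -1;
      int n = 0;
      while ((x >>= 1) != 0)
        n++;
      return n;
    }

    int
    main (void)
    {
      const int shift_cost = 1, mul_cost = 3;  /* Illustrative only.  */
      uint64_t operands[] = { 8, 12 };

      for (int i = 0; i < 2; i++)
        {
          int n = exact_log2_u64 (operands[i]);
          if (n > 0)
            printf ("x * %llu -> lsl #%d, cost %d\n",
                    (unsigned long long) operands[i], n, shift_cost);
          else
            printf ("x * %llu -> mul, cost %d\n",
                    (unsigned long long) operands[i], mul_cost);
        }
      return 0;
    }

Note the n > 0 comparison: like the patch's exact_log2 (INTVAL (op1)) > 0,
a multiply by 1 is deliberately not treated as a shift.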
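The SYMBOL_REF case added below derives the cost of a symbolic address from
the code model, plus one extra GOT load under PIC.  A small sketch of that
decision table (hypothetical standalone names; the instruction counts simply
restate the comments in the hunk):

    #include <stdio.h>

    enum cmodel { CMODEL_TINY, CMODEL_SMALL, CMODEL_LARGE };

    static int
    symbol_ref_insns (enum cmodel model, int pic)
    {
      int insns = 0;
      switch (model)
        {
        case CMODEL_TINY:  insns = 1; break;  /* ADR.               */
        case CMODEL_SMALL: insns = 2; break;  /* ADRP + ADD :lo12.  */
        case CMODEL_LARGE: insns = 1; break;  /* Literal-pool LDR.  */
        }
      if (pic)
        insns += 1;                           /* Load through the GOT.  */
      return insns;
    }

    int
    main (void)
    {
      printf ("small, pic: %d insns\n", symbol_ref_insns (CMODEL_SMALL, 1));
      printf ("tiny:       %d insns\n", symbol_ref_insns (CMODEL_TINY, 0));
      return 0;
    }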
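One unit-conversion subtlety worth calling out: address costs are expressed
in whole instructions, while rtx costs are in COSTS_N_INSNS units, so the
new code wraps address costs in COSTS_N_INSNS () on the way in, and divides
a symbol's rtx cost by COSTS_N_INSNS (1) on the way out.  A sketch, assuming
GCC's definition of COSTS_N_INSNS as (N) * 4:

    #include <stdio.h>

    #define COSTS_N_INSNS(n) ((n) * 4)   /* As defined in GCC's rtl.h.  */

    int
    main (void)
    {
      int addr_units = 2;                          /* From the address table. */
      int rtx_units = COSTS_N_INSNS (addr_units);  /* Into rtx-cost units.  */
      int symbol_rtx_cost = COSTS_N_INSNS (3);
      int back_to_addr_units = symbol_rtx_cost / COSTS_N_INSNS (1);

      printf ("%d addr units -> %d rtx units\n", addr_units, rtx_units);
      printf ("%d rtx units  -> %d addr units\n", symbol_rtx_cost,
              back_to_addr_units);
      return 0;
    }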
 gcc-4.9/gcc/config/aarch64/aarch64-protos.h |   13 +
 gcc-4.9/gcc/config/aarch64/aarch64.c        | 1285 ++++++++++++++++++++++-----
 2 files changed, 1092 insertions(+), 206 deletions(-)

diff --git a/gcc-4.9/gcc/config/aarch64/aarch64-protos.h b/gcc-4.9/gcc/config/aarch64/aarch64-protos.h
index 8b0a70538..e78348ea0 100644
--- a/gcc-4.9/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc-4.9/gcc/config/aarch64/aarch64-protos.h
@@ -108,9 +108,22 @@ enum aarch64_symbol_type
    cost models and vectors for address cost calculations, register
    move costs and memory move costs.  */

+/* Scaled addressing modes can vary cost depending on the mode of the
+   value to be loaded/stored.  QImode values cannot use scaled
+   addressing modes.  */
+
+struct scale_addr_mode_cost
+{
+  const int hi;
+  const int si;
+  const int di;
+  const int ti;
+};
+
 /* Additional cost for addresses.  */
 struct cpu_addrcost_table
 {
+  const struct scale_addr_mode_cost addr_scale_costs;
   const int pre_modify;
   const int post_modify;
   const int register_offset;
diff --git a/gcc-4.9/gcc/config/aarch64/aarch64.c b/gcc-4.9/gcc/config/aarch64/aarch64.c
index 6d45984d0..d8895eeee 100644
--- a/gcc-4.9/gcc/config/aarch64/aarch64.c
+++ b/gcc-4.9/gcc/config/aarch64/aarch64.c
@@ -141,6 +141,7 @@ static bool aarch64_const_vec_all_same_int_p (rtx,
 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
                                                  const unsigned char *sel);
+static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);

 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = cortexa53;
@@ -171,6 +172,15 @@ __extension__
 #endif
 static const struct cpu_addrcost_table generic_addrcost_table =
 {
+#if HAVE_DESIGNATED_INITIALIZERS
+  .addr_scale_costs =
+#endif
+    {
+      NAMED_PARAM (hi, 0),
+      NAMED_PARAM (si, 0),
+      NAMED_PARAM (di, 0),
+      NAMED_PARAM (ti, 0),
+    },
   NAMED_PARAM (pre_modify, 0),
   NAMED_PARAM (post_modify, 0),
   NAMED_PARAM (register_offset, 0),
@@ -178,6 +188,27 @@ static const struct cpu_addrcost_table generic_addrcost_table =
   NAMED_PARAM (imm_offset, 0)
 };

+#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
+__extension__
+#endif
+static const struct cpu_addrcost_table cortexa57_addrcost_table =
+{
+#if HAVE_DESIGNATED_INITIALIZERS
+  .addr_scale_costs =
+#endif
+    {
+      NAMED_PARAM (hi, 1),
+      NAMED_PARAM (si, 0),
+      NAMED_PARAM (di, 0),
+      NAMED_PARAM (ti, 1),
+    },
+  NAMED_PARAM (pre_modify, 0),
+  NAMED_PARAM (post_modify, 0),
+  NAMED_PARAM (register_offset, 0),
+  NAMED_PARAM (register_extend, 0),
+  NAMED_PARAM (imm_offset, 0),
+};
+
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
 #endif
@@ -214,6 +245,26 @@ static const struct cpu_vector_cost generic_vector_cost =
   NAMED_PARAM (cond_not_taken_branch_cost, 1)
 };

+/* Generic costs for vector insn classes.  */
+#if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
+__extension__
+#endif
+static const struct cpu_vector_cost cortexa57_vector_cost =
+{
+  NAMED_PARAM (scalar_stmt_cost, 1),
+  NAMED_PARAM (scalar_load_cost, 4),
+  NAMED_PARAM (scalar_store_cost, 1),
+  NAMED_PARAM (vec_stmt_cost, 3),
+  NAMED_PARAM (vec_to_scalar_cost, 8),
+  NAMED_PARAM (scalar_to_vec_cost, 8),
+  NAMED_PARAM (vec_align_load_cost, 5),
+  NAMED_PARAM (vec_unalign_load_cost, 5),
+  NAMED_PARAM (vec_unalign_store_cost, 1),
+  NAMED_PARAM (vec_store_cost, 1),
+  NAMED_PARAM (cond_taken_branch_cost, 1),
+  NAMED_PARAM (cond_not_taken_branch_cost, 1)
+};
+
 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 __extension__
 #endif
@@ -240,9 +291,9 @@ static const struct tune_params cortexa53_tunings =
 static const struct tune_params cortexa57_tunings =
 {
   &cortexa57_extra_costs,
-  &generic_addrcost_table,
+  &cortexa57_addrcost_table,
   &generic_regmove_cost,
-  &generic_vector_cost,
+  &cortexa57_vector_cost,
   NAMED_PARAM (memmov_cost, 4),
   NAMED_PARAM (issue_rate, 3)
 };
@@ -446,7 +497,7 @@ aarch64_is_long_call_p (rtx sym)
    represent an expression that matches an extend operation.  The
    operands represent the paramters from

-  (extract (mult (reg) (mult_imm)) (extract_imm) (const_int 0)).  */
+  (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
 bool
 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
                                 rtx extract_imm)
@@ -2463,12 +2514,22 @@ aarch64_final_eh_return_addr (void)
                                        - 2 * UNITS_PER_WORD));
 }

-/* Output code to build up a constant in a register.  */
-static void
-aarch64_build_constant (int regnum, HOST_WIDE_INT val)
+/* Possibly output code to build up a constant in a register.  For
+   the benefit of the costs infrastructure, returns the number of
+   instructions which would be emitted.  GENERATE inhibits or
+   enables code generation.  */
+
+static int
+aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
 {
+  int insns = 0;
+
   if (aarch64_bitmask_imm (val, DImode))
-    emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
+    {
+      if (generate)
+        emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
+      insns = 1;
+    }
   else
     {
       int i;
@@ -2499,15 +2560,19 @@ aarch64_build_constant (int regnum, HOST_WIDE_INT val)
          the same.  */
       if (ncount < zcount)
        {
-         emit_move_insn (gen_rtx_REG (Pmode, regnum),
-                         GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
+         if (generate)
+           emit_move_insn (gen_rtx_REG (Pmode, regnum),
+                           GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
          tval = 0xffff;
+         insns++;
        }
       else
        {
-         emit_move_insn (gen_rtx_REG (Pmode, regnum),
-                         GEN_INT (val & 0xffff));
+         if (generate)
+           emit_move_insn (gen_rtx_REG (Pmode, regnum),
+                           GEN_INT (val & 0xffff));
          tval = 0;
+         insns++;
        }

       val >>= 16;
@@ -2515,11 +2580,17 @@ aarch64_build_constant (int regnum, HOST_WIDE_INT val)
       for (i = 16; i < 64; i += 16)
        {
          if ((val & 0xffff) != tval)
-           emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
-                                      GEN_INT (i), GEN_INT (val & 0xffff)));
+           {
+             if (generate)
+               emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
+                                          GEN_INT (i),
+                                          GEN_INT (val & 0xffff)));
+             insns++;
+           }
          val >>= 16;
        }
     }
+  return insns;
 }

 static void
@@ -2534,7 +2605,7 @@ aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)

   if (mdelta >= 4096 * 4096)
     {
-      aarch64_build_constant (scratchreg, delta);
+      (void) aarch64_build_constant (scratchreg, delta, true);
       emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
     }
   else if (mdelta > 0)
@@ -2608,7 +2679,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
        addr = plus_constant (Pmode, temp0, vcall_offset);
       else
        {
-         aarch64_build_constant (IP1_REGNUM, vcall_offset);
+         (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
          addr = gen_rtx_PLUS (Pmode, temp0, temp1);
        }

@@ -4471,12 +4542,12 @@ aarch64_strip_shift (rtx x)
   return x;
 }

-/* Helper function for rtx cost calculation.  Strip a shift or extend
+/* Helper function for rtx cost calculation.  Strip an extend
    expression from X.  Returns the inner operand if successful, or the
    original expression on failure.  We deal with a number of possible
    canonicalization variations here.  */
 static rtx
-aarch64_strip_shift_or_extend (rtx x)
+aarch64_strip_extend (rtx x)
 {
   rtx op = x;

@@ -4512,7 +4583,246 @@ aarch64_strip_shift_or_extend (rtx x)
   if (op != x)
     return op;

-  return aarch64_strip_shift (x);
+  return x;
+}
+
+/* Helper function for rtx cost calculation.  Calculate the cost of
+   a MULT, which may be part of a multiply-accumulate rtx.  Return
+   the calculated cost of the expression, recursing manually in to
+   operands where needed.  */
+
+static int
+aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
+{
+  rtx op0, op1;
+  const struct cpu_cost_table *extra_cost
+    = aarch64_tune_params->insn_extra_cost;
+  int cost = 0;
+  bool maybe_fma = (outer == PLUS || outer == MINUS);
+  enum machine_mode mode = GET_MODE (x);
+
+  gcc_checking_assert (code == MULT);
+
+  op0 = XEXP (x, 0);
+  op1 = XEXP (x, 1);
+
+  if (VECTOR_MODE_P (mode))
+    mode = GET_MODE_INNER (mode);
+
+  /* Integer multiply/fma.  */
+  if (GET_MODE_CLASS (mode) == MODE_INT)
+    {
+      /* The multiply will be canonicalized as a shift, cost it as such.  */
+      if (CONST_INT_P (op1)
+          && exact_log2 (INTVAL (op1)) > 0)
+        {
+          if (speed)
+            {
+              if (maybe_fma)
+                /* ADD (shifted register).  */
+                cost += extra_cost->alu.arith_shift;
+              else
+                /* LSL (immediate).  */
+                cost += extra_cost->alu.shift;
+            }
+
+          cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
+
+          return cost;
+        }
+
+      /* Integer multiplies or FMAs have zero/sign extending variants.  */
+      if ((GET_CODE (op0) == ZERO_EXTEND
+           && GET_CODE (op1) == ZERO_EXTEND)
+          || (GET_CODE (op0) == SIGN_EXTEND
+              && GET_CODE (op1) == SIGN_EXTEND))
+        {
+          cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
+                  + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
+
+          if (speed)
+            {
+              if (maybe_fma)
+                /* MADD/SMADDL/UMADDL.  */
+                cost += extra_cost->mult[0].extend_add;
+              else
+                /* MUL/SMULL/UMULL.  */
+                cost += extra_cost->mult[0].extend;
+            }
+
+          return cost;
+        }
+
+      /* This is either an integer multiply or an FMA.  In both cases
+         we want to recurse and cost the operands.  */
+      cost += rtx_cost (op0, MULT, 0, speed)
+              + rtx_cost (op1, MULT, 1, speed);
+
+      if (speed)
+        {
+          if (maybe_fma)
+            /* MADD.  */
+            cost += extra_cost->mult[mode == DImode].add;
+          else
+            /* MUL.  */
+            cost += extra_cost->mult[mode == DImode].simple;
+        }
+
+      return cost;
+    }
+  else
+    {
+      if (speed)
+        {
+          /* Floating-point FMA can also support negations of the
+             operands.  */
+          if (GET_CODE (op0) == NEG)
+            {
+              maybe_fma = true;
+              op0 = XEXP (op0, 0);
+            }
+          if (GET_CODE (op1) == NEG)
+            {
+              maybe_fma = true;
+              op1 = XEXP (op1, 0);
+            }
+
+          if (maybe_fma)
+            /* FMADD/FNMADD/FNMSUB/FMSUB.  */
+            cost += extra_cost->fp[mode == DFmode].fma;
+          else
+            /* FMUL.  */
+            cost += extra_cost->fp[mode == DFmode].mult;
+        }
+
+      cost += rtx_cost (op0, MULT, 0, speed)
+              + rtx_cost (op1, MULT, 1, speed);
+      return cost;
+    }
+}
+
+static int
+aarch64_address_cost (rtx x,
+                      enum machine_mode mode,
+                      addr_space_t as ATTRIBUTE_UNUSED,
+                      bool speed)
+{
+  enum rtx_code c = GET_CODE (x);
+  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
+  struct aarch64_address_info info;
+  int cost = 0;
+  info.shift = 0;
+
+  if (!aarch64_classify_address (&info, x, mode, c, false))
+    {
+      if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
+        {
+          /* This is a CONST or SYMBOL ref which will be split
+             in a different way depending on the code model in use.
+             Cost it through the generic infrastructure.  */
+          int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
+          /* Divide through by the cost of one instruction to
+             bring it to the same units as the address costs.  */
+          cost_symbol_ref /= COSTS_N_INSNS (1);
+          /* The cost is then the cost of preparing the address,
+             followed by an immediate (possibly 0) offset.  */
+          return cost_symbol_ref + addr_cost->imm_offset;
+        }
+      else
+        {
+          /* This is most likely a jump table from a case
+             statement.  */
+          return addr_cost->register_offset;
+        }
+    }
+
+  switch (info.type)
+    {
+      case ADDRESS_LO_SUM:
+      case ADDRESS_SYMBOLIC:
+      case ADDRESS_REG_IMM:
+        cost += addr_cost->imm_offset;
+        break;
+
+      case ADDRESS_REG_WB:
+        if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
+          cost += addr_cost->pre_modify;
+        else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
+          cost += addr_cost->post_modify;
+        else
+          gcc_unreachable ();
+
+        break;
+
+      case ADDRESS_REG_REG:
+        cost += addr_cost->register_offset;
+        break;
+
+      case ADDRESS_REG_UXTW:
+      case ADDRESS_REG_SXTW:
+        cost += addr_cost->register_extend;
+        break;
+
+      default:
+        gcc_unreachable ();
+    }
+
+
+  if (info.shift > 0)
+    {
+      /* For the sake of calculating the cost of the shifted register
+         component, we can treat same sized modes in the same way.  */
+      switch (GET_MODE_BITSIZE (mode))
+        {
+          case 16:
+            cost += addr_cost->addr_scale_costs.hi;
+            break;

+          case 32:
+            cost += addr_cost->addr_scale_costs.si;
+            break;

+          case 64:
+            cost += addr_cost->addr_scale_costs.di;
+            break;

+          /* We can't tell, or this is a 128-bit vector.  */
+          default:
+            cost += addr_cost->addr_scale_costs.ti;
+            break;
+        }
+    }
+
+  return cost;
+}
+
+/* Return true if the RTX X in mode MODE is a zero or sign extract
+   usable in an ADD or SUB (extended register) instruction.  */
+static bool
+aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
+{
+  /* Catch add with a sign extract.
+     This is add_<optab><mode>_multp2.  */
+  if (GET_CODE (x) == SIGN_EXTRACT
+      || GET_CODE (x) == ZERO_EXTRACT)
+    {
+      rtx op0 = XEXP (x, 0);
+      rtx op1 = XEXP (x, 1);
+      rtx op2 = XEXP (x, 2);
+
+      if (GET_CODE (op0) == MULT
+          && CONST_INT_P (op1)
+          && op2 == const0_rtx
+          && CONST_INT_P (XEXP (op0, 1))
+          && aarch64_is_extend_from_extract (mode,
+                                             XEXP (op0, 1),
+                                             op1))
+        {
+          return true;
+        }
+    }
+
+  return false;
 }

 /* Calculate the cost of calculating X, storing it in *COST.  Result
@@ -4521,13 +4831,31 @@ static bool
 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
 {
-  rtx op0, op1;
+  rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params->insn_extra_cost;
+  enum machine_mode mode = GET_MODE (x);
+
+  /* By default, assume that everything has equivalent cost to the
+     cheapest instruction.  Any additional costs are applied as a delta
+     above this default.  */
+  *cost = COSTS_N_INSNS (1);
+
+  /* TODO: The cost infrastructure currently does not handle
+     vector operations.  Assume that all vector operations
+     are equally expensive.  */
+  if (VECTOR_MODE_P (mode))
+    {
+      if (speed)
+        *cost += extra_cost->vect.alu;
+      return true;
+    }

   switch (code)
     {
     case SET:
+      /* The cost depends entirely on the operands to SET.  */
+      *cost = 0;
       op0 = SET_DEST (x);
       op1 = SET_SRC (x);

@@ -4535,25 +4863,47 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
        {
        case MEM:
          if (speed)
-           *cost += extra_cost->ldst.store;
+           {
+             rtx address = XEXP (op0, 0);
+             if (GET_MODE_CLASS (mode) == MODE_INT)
+               *cost += extra_cost->ldst.store;
+             else if (mode == SFmode)
+               *cost += extra_cost->ldst.storef;
+             else if (mode == DFmode)
+               *cost += extra_cost->ldst.stored;
+
+             *cost +=
+               COSTS_N_INSNS (aarch64_address_cost (address, mode,
+                                                    0, speed));
+           }

-         if (op1 != const0_rtx)
-           *cost += rtx_cost (op1, SET, 1, speed);
+         *cost += rtx_cost (op1, SET, 1, speed);
          return true;

        case SUBREG:
          if (! REG_P (SUBREG_REG (op0)))
            *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
+
          /* Fall through.  */
        case REG:
-         /* Cost is just the cost of the RHS of the set.  */
-         *cost += rtx_cost (op1, SET, 1, true);
+         /* const0_rtx is in general free, but we will use an
+            instruction to set a register to 0.  */
+         if (REG_P (op1) || op1 == const0_rtx)
+           {
+             /* The cost is 1 per register copied.  */
+             int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
+                             / UNITS_PER_WORD;
+             *cost = COSTS_N_INSNS (n_minus_1 + 1);
+           }
+         else
+           /* Cost is just the cost of the RHS of the set.  */
+           *cost += rtx_cost (op1, SET, 1, speed);

          return true;

-       case ZERO_EXTRACT: /* Bit-field insertion.  */
+       case ZERO_EXTRACT:
        case SIGN_EXTRACT:
-         /* Strip any redundant widening of the RHS to meet the width of
-            the target.  */
+         /* Bit-field insertion.  Strip any redundant widening of
+            the RHS to meet the width of the target.  */
          if (GET_CODE (op1) == SUBREG)
            op1 = SUBREG_REG (op1);
          if ((GET_CODE (op1) == ZERO_EXTEND
@@ -4562,24 +4912,138 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
              && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
                  >= INTVAL (XEXP (op0, 1))))
            op1 = XEXP (op1, 0);
-         *cost += rtx_cost (op1, SET, 1, speed);
+
+         if (CONST_INT_P (op1))
+           {
+             /* MOV immediate is assumed to always be cheap.  */
+             *cost = COSTS_N_INSNS (1);
+           }
+         else
+           {
+             /* BFM.  */
+             if (speed)
+               *cost += extra_cost->alu.bfi;
+             *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
+           }
+
          return true;

        default:
+         /* We can't make sense of this, assume default cost.  */
+         *cost = COSTS_N_INSNS (1);
          break;
        }
       return false;

+    case CONST_INT:
+      /* If an instruction can incorporate a constant within the
+        instruction, the instruction's expression avoids calling
+        rtx_cost() on the constant.  If rtx_cost() is called on a
+        constant, then it is usually because the constant must be
+        moved into a register by one or more instructions.
+
+        The exception is constant 0, which can be expressed
+        as XZR/WZR and is therefore free.  The exception to this is
+        if we have (set (reg) (const0_rtx)) in which case we must cost
+        the move.  However, we can catch that when we cost the SET, so
+        we don't need to consider that here.  */
+      if (x == const0_rtx)
+       *cost = 0;
+      else
+       {
+         /* To an approximation, building any other constant is
+            proportionally expensive to the number of instructions
+            required to build that constant.  This is true whether we
+            are compiling for SPEED or otherwise.  */
+         *cost = COSTS_N_INSNS (aarch64_build_constant (0,
+                                                        INTVAL (x),
+                                                        false));
+       }
+      return true;
+
+    case CONST_DOUBLE:
+      if (speed)
+       {
+         /* mov[df,sf]_aarch64.  */
+         if (aarch64_float_const_representable_p (x))
+           /* FMOV (scalar immediate).  */
+           *cost += extra_cost->fp[mode == DFmode].fpconst;
+         else if (!aarch64_float_const_zero_rtx_p (x))
+           {
+             /* This will be a load from memory.  */
+             if (mode == DFmode)
+               *cost += extra_cost->ldst.loadd;
+             else
+               *cost += extra_cost->ldst.loadf;
+           }
+         else
+           /* Otherwise this is +0.0.  We get this using MOVI d0, #0
+              or MOV v0.s[0], wzr - neither of which are modeled by the
+              cost tables.  Just use the default cost.  */
+           {
+           }
+       }
+
+      return true;
+
     case MEM:
       if (speed)
-       *cost += extra_cost->ldst.load;
+       {
+         /* For loads we want the base cost of a load, plus an
+            approximation for the additional cost of the addressing
+            mode.  */
+         rtx address = XEXP (x, 0);
+         if (GET_MODE_CLASS (mode) == MODE_INT)
+           *cost += extra_cost->ldst.load;
+         else if (mode == SFmode)
+           *cost += extra_cost->ldst.loadf;
+         else if (mode == DFmode)
+           *cost += extra_cost->ldst.loadd;
+
+         *cost +=
+           COSTS_N_INSNS (aarch64_address_cost (address, mode,
+                                                0, speed));
+       }

       return true;

     case NEG:
-      op0 = CONST0_RTX (GET_MODE (x));
-      op1 = XEXP (x, 0);
-      goto cost_minus;
+      op0 = XEXP (x, 0);
+
+      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
+       {
+         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
+             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
+           {
+             /* CSETM.  */
+             *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
+             return true;
+           }
+
+         /* Cost this as SUB wzr, X.  */
+         op0 = CONST0_RTX (GET_MODE (x));
+         op1 = XEXP (x, 0);
+         goto cost_minus;
+       }
+
+      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
+       {
+         /* Support (neg(fma...)) as a single instruction only if
+            sign of zeros is unimportant.  This matches the decision
+            making in aarch64.md.  */
+         if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
+           {
+             /* FNMADD.  */
+             *cost = rtx_cost (op0, NEG, 0, speed);
+             return true;
+           }
+         if (speed)
+           /* FNEG.  */
+           *cost += extra_cost->fp[mode == DFmode].neg;
+         return false;
+       }
+
+      return false;

     case COMPARE:
       op0 = XEXP (x, 0);
@@ -4592,94 +5056,208 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
          goto cost_logic;
        }

-      /* Comparisons can work if the order is swapped.
-        Canonicalization puts the more complex operation first, but
-        we want it in op1.  */
-      if (! (REG_P (op0)
-            || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
-       {
-         op0 = XEXP (x, 1);
-         op1 = XEXP (x, 0);
-       }
-      goto cost_minus;
+      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
+       {
+         /* TODO: A write to the CC flags possibly costs extra, this
+            needs encoding in the cost tables.  */
+
+         /* CC_ZESWPmode supports zero extend for free.  */
+         if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
+           op0 = XEXP (op0, 0);
+
+         /* ANDS.  */
+         if (GET_CODE (op0) == AND)
+           {
+             x = op0;
+             goto cost_logic;
+           }
+
+         if (GET_CODE (op0) == PLUS)
+           {
+             /* ADDS (and CMN alias).  */
+             x = op0;
+             goto cost_plus;
+           }
+
+         if (GET_CODE (op0) == MINUS)
+           {
+             /* SUBS.  */
+             x = op0;
+             goto cost_minus;
+           }
+
+         if (GET_CODE (op1) == NEG)
+           {
+             /* CMN.  */
+             if (speed)
+               *cost += extra_cost->alu.arith;
+
+             *cost += rtx_cost (op0, COMPARE, 0, speed);
+             *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
+             return true;
+           }
+
+         /* CMP.
+
+            Compare can freely swap the order of operands, and
+            canonicalization puts the more complex operation first.
+            But the integer MINUS logic expects the shift/extend
+            operation in op1.  */
+         if (! (REG_P (op0)
+                || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
+           {
+             op0 = XEXP (x, 1);
+             op1 = XEXP (x, 0);
+           }
+         goto cost_minus;
+       }
+
+      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
+       {
+         /* FCMP.  */
+         if (speed)
+           *cost += extra_cost->fp[mode == DFmode].compare;
+
+         if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
+           {
+             /* FCMP supports constant 0.0 for no extra cost.  */
+             return true;
+           }
+         return false;
+       }
+
+      return false;

     case MINUS:
-      op0 = XEXP (x, 0);
-      op1 = XEXP (x, 1);
+      {
+       op0 = XEXP (x, 0);
+       op1 = XEXP (x, 1);
+
+cost_minus:
+       /* Detect valid immediates.  */
+       if ((GET_MODE_CLASS (mode) == MODE_INT
+            || (GET_MODE_CLASS (mode) == MODE_CC
+                && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
+           && CONST_INT_P (op1)
+           && aarch64_uimm12_shift (INTVAL (op1)))
+         {
+           *cost += rtx_cost (op0, MINUS, 0, speed);
-    cost_minus:
-      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT
-         || (GET_MODE_CLASS (GET_MODE (x)) == MODE_CC
-             && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
-       {
-         if (op0 != const0_rtx)
+           if (speed)
+             /* SUB(S) (immediate).  */
+             *cost += extra_cost->alu.arith;
+           return true;
+
+         }
+
+       /* Look for SUB (extended register).  */
+       if (aarch64_rtx_arith_op_extract_p (op1, mode))
+         {
+           if (speed)
+             *cost += extra_cost->alu.arith_shift;
+
+           *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
+                              (enum rtx_code) GET_CODE (op1),
+                              0, speed);
+           return true;
+         }
+
+       rtx new_op1 = aarch64_strip_extend (op1);
+
+       /* Cost this as an FMA-alike operation.  */
+       if ((GET_CODE (new_op1) == MULT
+            || GET_CODE (new_op1) == ASHIFT)
+           && code != COMPARE)
+         {
+           *cost += aarch64_rtx_mult_cost (new_op1, MULT,
+                                           (enum rtx_code) code,
+                                           speed);
           *cost += rtx_cost (op0, MINUS, 0, speed);
+           return true;
+         }

-         if (CONST_INT_P (op1))
-           {
-             if (!aarch64_uimm12_shift (INTVAL (op1)))
-               *cost += rtx_cost (op1, MINUS, 1, speed);
-           }
-         else
-           {
-             op1 = aarch64_strip_shift_or_extend (op1);
-             *cost += rtx_cost (op1, MINUS, 1, speed);
-           }
-         return true;
-       }
+       *cost += rtx_cost (new_op1, MINUS, 1, speed);

-      return false;
+       if (speed)
+         {
+           if (GET_MODE_CLASS (mode) == MODE_INT)
+             /* SUB(S).  */
+             *cost += extra_cost->alu.arith;
+           else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+             /* FSUB.  */
+             *cost += extra_cost->fp[mode == DFmode].addsub;
+         }
+       return true;
+      }

     case PLUS:
-      op0 = XEXP (x, 0);
-      op1 = XEXP (x, 1);
+      {
+       rtx new_op0;

-      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
-       {
-         if (CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
-           {
-             *cost += rtx_cost (op0, PLUS, 0, speed);
-           }
-         else
-           {
-             rtx new_op0 = aarch64_strip_shift_or_extend (op0);
+       op0 = XEXP (x, 0);
+       op1 = XEXP (x, 1);

-             if (new_op0 == op0
-                 && GET_CODE (op0) == MULT)
-               {
-                 if ((GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
-                      && GET_CODE (XEXP (op0, 1)) == ZERO_EXTEND)
-                     || (GET_CODE (XEXP (op0, 0)) == SIGN_EXTEND
-                         && GET_CODE (XEXP (op0, 1)) == SIGN_EXTEND))
-                   {
-                     *cost += (rtx_cost (XEXP (XEXP (op0, 0), 0), MULT, 0,
-                                         speed)
-                               + rtx_cost (XEXP (XEXP (op0, 1), 0), MULT, 1,
-                                           speed)
-                               + rtx_cost (op1, PLUS, 1, speed));
-                     if (speed)
-                       *cost +=
-                         extra_cost->mult[GET_MODE (x) == DImode].extend_add;
-                     return true;
-                   }
+cost_plus:
+       if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
+           || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
+         {
+           /* CSINC.  */
+           *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
+           *cost += rtx_cost (op1, PLUS, 1, speed);
+           return true;
+         }

-                 *cost += (rtx_cost (XEXP (op0, 0), MULT, 0, speed)
-                           + rtx_cost (XEXP (op0, 1), MULT, 1, speed)
-                           + rtx_cost (op1, PLUS, 1, speed));
+       if (GET_MODE_CLASS (mode) == MODE_INT
+           && CONST_INT_P (op1)
+           && aarch64_uimm12_shift (INTVAL (op1)))
+         {
+           *cost += rtx_cost (op0, PLUS, 0, speed);

-                 if (speed)
-                   *cost += extra_cost->mult[GET_MODE (x) == DImode].add;
+           if (speed)
+             /* ADD (immediate).  */
+             *cost += extra_cost->alu.arith;
+           return true;
+         }

-                 return true;
-               }
+       /* Look for ADD (extended register).  */
+       if (aarch64_rtx_arith_op_extract_p (op0, mode))
+         {
+           if (speed)
+             *cost += extra_cost->alu.arith_shift;

-             *cost += (rtx_cost (new_op0, PLUS, 0, speed)
+           *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
+                              (enum rtx_code) GET_CODE (op0),
+                              0, speed);
+           return true;
+         }
-                       + rtx_cost (op1, PLUS, 1, speed));
-           }
-         return true;
-       }

-      return false;
+       /* Strip any extend, leave shifts behind as we will
+          cost them through mult_cost.  */
+       new_op0 = aarch64_strip_extend (op0);
+
+       if (GET_CODE (new_op0) == MULT
+           || GET_CODE (new_op0) == ASHIFT)
+         {
+           *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
+                                           speed);
+           *cost += rtx_cost (op1, PLUS, 1, speed);
+           return true;
+         }
+
+       *cost += (rtx_cost (new_op0, PLUS, 0, speed)
+                 + rtx_cost (op1, PLUS, 1, speed));
+
+       if (speed)
+         {
+           if (GET_MODE_CLASS (mode) == MODE_INT)
+             /* ADD.  */
+             *cost += extra_cost->alu.arith;
+           else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+             /* FADD.  */
+             *cost += extra_cost->fp[mode == DFmode].addsub;
+         }
+       return true;
+      }

     case IOR:
     case XOR:
     case AND:
@@ -4688,117 +5266,252 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
       op0 = XEXP (x, 0);
       op1 = XEXP (x, 1);

+      if (code == AND
+         && GET_CODE (op0) == MULT
+         && CONST_INT_P (XEXP (op0, 1))
+         && CONST_INT_P (op1)
+         && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
+                              INTVAL (op1)) != 0)
+       {
+         /* This is a UBFM/SBFM.  */
+         *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
+         if (speed)
+           *cost += extra_cost->alu.bfx;
+         return true;
+       }
+
       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
        {
+         /* We possibly get the immediate for free, this is not
+            modelled.  */
          if (CONST_INT_P (op1)
              && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
            {
-             *cost += rtx_cost (op0, AND, 0, speed);
+             *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
+
+             if (speed)
+               *cost += extra_cost->alu.logical;
+
+             return true;
            }
          else
            {
+             rtx new_op0 = op0;
+
+             /* Handle ORN, EON, or BIC.  */
              if (GET_CODE (op0) == NOT)
                op0 = XEXP (op0, 0);
-             op0 = aarch64_strip_shift (op0);
-             *cost += (rtx_cost (op0, AND, 0, speed)
-                       + rtx_cost (op1, AND, 1, speed));
+
+             new_op0 = aarch64_strip_shift (op0);
+
+             /* If we had a shift on op0 then this is a logical-shift-
+                by-register/immediate operation.  Otherwise, this is just
+                a logical operation.  */
+             if (speed)
+               {
+                 if (new_op0 != op0)
+                   {
+                     /* Shift by immediate.  */
+                     if (CONST_INT_P (XEXP (op0, 1)))
+                       *cost += extra_cost->alu.log_shift;
+                     else
+                       *cost += extra_cost->alu.log_shift_reg;
+                   }
+                 else
+                   *cost += extra_cost->alu.logical;
+               }
+
+             /* In both cases we want to cost both operands.  */
+             *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
+                      + rtx_cost (op1, (enum rtx_code) code, 1, speed);
+
+             return true;
            }
-         return true;
        }
       return false;

+    case NOT:
+      /* MVN.  */
+      if (speed)
+       *cost += extra_cost->alu.logical;
+
+      /* The logical instruction could have the shifted register form,
+        but the cost is the same if the shift is processed as a separate
+        instruction, so we don't bother with it here.  */
+      return false;
+
     case ZERO_EXTEND:
-      if ((GET_MODE (x) == DImode
-          && GET_MODE (XEXP (x, 0)) == SImode)
-         || GET_CODE (XEXP (x, 0)) == MEM)
+
+      op0 = XEXP (x, 0);
+      /* If a value is written in SI mode, then zero extended to DI
+        mode, the operation will in general be free as a write to
+        a 'w' register implicitly zeroes the upper bits of an 'x'
+        register.  However, if this is
+
+          (set (reg) (zero_extend (reg)))
+
+        we must cost the explicit register move.  */
+      if (mode == DImode
+         && GET_MODE (op0) == SImode
+         && outer == SET)
        {
-         *cost += rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
+         int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
+
+         if (!op_cost && speed)
+           /* MOV.  */
+           *cost += extra_cost->alu.extend;
+         else
+           /* Free, the cost is that of the SI mode operation.  */
+           *cost = op_cost;
+
          return true;
        }
+      else if (MEM_P (XEXP (x, 0)))
+       {
+         /* All loads can zero extend to any size for free.  */
+         *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
+         return true;
+       }
+
+      /* UXTB/UXTH.  */
+      if (speed)
+       *cost += extra_cost->alu.extend;
+
       return false;

     case SIGN_EXTEND:
-      if (GET_CODE (XEXP (x, 0)) == MEM)
+      if (MEM_P (XEXP (x, 0)))
        {
-         *cost += rtx_cost (XEXP (x, 0), SIGN_EXTEND, 0, speed);
+         /* LDRSH.  */
+         if (speed)
+           {
+             rtx address = XEXP (XEXP (x, 0), 0);
+             *cost += extra_cost->ldst.load_sign_extend;
+
+             *cost +=
+               COSTS_N_INSNS (aarch64_address_cost (address, mode,
+                                                    0, speed));
+           }
          return true;
        }
+
+      if (speed)
+       *cost += extra_cost->alu.extend;
       return false;

-    case ROTATE:
-      if (!CONST_INT_P (XEXP (x, 1)))
-       *cost += COSTS_N_INSNS (2);
-      /* Fall through.  */
-    case ROTATERT:
-    case LSHIFTRT:
     case ASHIFT:
-    case ASHIFTRT:
-
-      /* Shifting by a register often takes an extra cycle.  */
-      if (speed && !CONST_INT_P (XEXP (x, 1)))
-       *cost += extra_cost->alu.arith_shift_reg;
-
-      *cost += rtx_cost (XEXP (x, 0), ASHIFT, 0, speed);
-      return true;
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);

-    case HIGH:
-      if (!CONSTANT_P (XEXP (x, 0)))
-       *cost += rtx_cost (XEXP (x, 0), HIGH, 0, speed);
-      return true;
+      if (CONST_INT_P (op1))
+       {
+         /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
+            aliases.  */
+         if (speed)
+           *cost += extra_cost->alu.shift;

-    case LO_SUM:
-      if (!CONSTANT_P (XEXP (x, 1)))
-       *cost += rtx_cost (XEXP (x, 1), LO_SUM, 1, speed);
-      *cost += rtx_cost (XEXP (x, 0), LO_SUM, 0, speed);
-      return true;
+         /* We can incorporate zero/sign extend for free.  */
+         if (GET_CODE (op0) == ZERO_EXTEND
+             || GET_CODE (op0) == SIGN_EXTEND)
+           op0 = XEXP (op0, 0);

-    case ZERO_EXTRACT:
-    case SIGN_EXTRACT:
-      *cost += rtx_cost (XEXP (x, 0), ZERO_EXTRACT, 0, speed);
-      return true;
+         *cost += rtx_cost (op0, ASHIFT, 0, speed);
+         return true;
+       }
+      else
+       {
+         /* LSLV.  */
+         if (speed)
+           *cost += extra_cost->alu.shift_reg;

-    case MULT:
+         return false;  /* All arguments need to be in registers.  */
+       }
+
+    case ROTATE:
+    case ROTATERT:
+    case LSHIFTRT:
+    case ASHIFTRT:
       op0 = XEXP (x, 0);
       op1 = XEXP (x, 1);

-      *cost = COSTS_N_INSNS (1);
-      if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
+      if (CONST_INT_P (op1))
        {
-         if (CONST_INT_P (op1)
-             && exact_log2 (INTVAL (op1)) > 0)
-           {
-             *cost += rtx_cost (op0, ASHIFT, 0, speed);
-             return true;
-           }
+         /* ASR (immediate) and friends.  */
+         if (speed)
+           *cost += extra_cost->alu.shift;

-         if ((GET_CODE (op0) == ZERO_EXTEND
-              && GET_CODE (op1) == ZERO_EXTEND)
-             || (GET_CODE (op0) == SIGN_EXTEND
-                 && GET_CODE (op1) == SIGN_EXTEND))
-           {
-             *cost += (rtx_cost (XEXP (op0, 0), MULT, 0, speed)
-                       + rtx_cost (XEXP (op1, 0), MULT, 1, speed));
-             if (speed)
-               *cost += extra_cost->mult[GET_MODE (x) == DImode].extend;
-             return true;
-           }
+         *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
+         return true;
+       }
+      else
+       {
+         /* ASR (register) and friends.  */
          if (speed)
-           *cost += extra_cost->mult[GET_MODE (x) == DImode].simple;
+           *cost += extra_cost->alu.shift_reg;
+
+         return false;  /* All arguments need to be in registers.  */
        }
-      else if (speed)
+
+    case SYMBOL_REF:
+
+      if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
        {
-         if (GET_MODE (x) == DFmode)
-           *cost += extra_cost->fp[1].mult;
-         else if (GET_MODE (x) == SFmode)
-           *cost += extra_cost->fp[0].mult;
+         /* LDR.  */
+         if (speed)
+           *cost += extra_cost->ldst.load;
        }
+      else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
+              || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
+       {
+         /* ADRP, followed by ADD.  */
+         *cost += COSTS_N_INSNS (1);
+         if (speed)
+           *cost += 2 * extra_cost->alu.arith;
+       }
+      else if (aarch64_cmodel == AARCH64_CMODEL_TINY
+              || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
+       {
+         /* ADR.  */
+         if (speed)
+           *cost += extra_cost->alu.arith;
+       }

-      return false; /* All arguments need to be in registers.  */
+      if (flag_pic)
+       {
+         /* One extra load instruction, after accessing the GOT.  */
+         *cost += COSTS_N_INSNS (1);
+         if (speed)
+           *cost += extra_cost->ldst.load;
+       }
+      return true;
+
+    case HIGH:
+    case LO_SUM:
+      /* ADRP/ADD (immediate).  */
+      if (speed)
+       *cost += extra_cost->alu.arith;
+      return true;
+
+    case ZERO_EXTRACT:
+    case SIGN_EXTRACT:
+      /* UBFX/SBFX.  */
+      if (speed)
+       *cost += extra_cost->alu.bfx;
+
+      /* We can trust that the immediates used will be correct (there
+        are no by-register forms), so we need only cost op0.  */
+      *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
+      return true;
+
+    case MULT:
+      *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
+      /* aarch64_rtx_mult_cost always handles recursion to its
+        operands.  */
+      return true;

     case MOD:
     case UMOD:
-      *cost = COSTS_N_INSNS (2);
       if (speed)
        {
          if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
@@ -4815,53 +5528,213 @@ aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,

     case DIV:
     case UDIV:
-      *cost = COSTS_N_INSNS (1);
+    case SQRT:
       if (speed)
        {
-         if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
-           *cost += extra_cost->mult[GET_MODE (x) == DImode].idiv;
-         else if (GET_MODE (x) == DFmode)
-           *cost += extra_cost->fp[1].div;
-         else if (GET_MODE (x) == SFmode)
-           *cost += extra_cost->fp[0].div;
+         if (GET_MODE_CLASS (mode) == MODE_INT)
+           /* There is no integer SQRT, so only DIV and UDIV can get
+              here.  */
+           *cost += extra_cost->mult[mode == DImode].idiv;
+         else
+           *cost += extra_cost->fp[mode == DFmode].div;
        }
       return false; /* All arguments need to be in registers.  */

-    default:
-      break;
-    }
-  return false;
-}
+    case IF_THEN_ELSE:
+      op2 = XEXP (x, 2);
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);

-static int
-aarch64_address_cost (rtx x ATTRIBUTE_UNUSED,
-                     enum machine_mode mode ATTRIBUTE_UNUSED,
-                     addr_space_t as ATTRIBUTE_UNUSED, bool speed ATTRIBUTE_UNUSED)
-{
-  enum rtx_code c = GET_CODE (x);
-  const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
+      if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
+       {
+         /* Conditional branch.  */
+         if (GET_MODE_CLASS (GET_MODE (XEXP (op0, 0))) == MODE_CC)
+           return true;
+         else
+           {
+             if (GET_CODE (op0) == NE
+                 || GET_CODE (op0) == EQ)
+               {
+                 rtx inner = XEXP (op0, 0);
+                 rtx comparator = XEXP (op0, 1);

-  if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
-    return addr_cost->pre_modify;
+                 if (comparator == const0_rtx)
+                   {
+                     /* TBZ/TBNZ/CBZ/CBNZ.  */
+                     if (GET_CODE (inner) == ZERO_EXTRACT)
+                       /* TBZ/TBNZ.  */
+                       *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
+                                          0, speed);
+                     else
+                       /* CBZ/CBNZ.  */
+                       *cost += rtx_cost (inner, GET_CODE (op0), 0, speed);

-  if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
-    return addr_cost->post_modify;
+                     return true;
+                   }
+               }
+             else if (GET_CODE (op0) == LT
+                      || GET_CODE (op0) == GE)
+               {
+                 rtx comparator = XEXP (op0, 1);

-  if (c == PLUS)
-    {
-      if (GET_CODE (XEXP (x, 1)) == CONST_INT)
-       return addr_cost->imm_offset;
-      else if (GET_CODE (XEXP (x, 0)) == MULT
-              || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND
-              || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
-       return addr_cost->register_extend;
+                 /* TBZ/TBNZ.  */
+                 if (comparator == const0_rtx)
+                   return true;
+               }
+           }
+       }
+      else if (GET_MODE_CLASS (GET_MODE (XEXP (op0, 0))) == MODE_CC)
+       {
+         /* It's a conditional operation based on the status flags,
+            so it must be some flavor of CSEL.  */
+
+         /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
+         if (GET_CODE (op1) == NEG
+             || GET_CODE (op1) == NOT
+             || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
+           op1 = XEXP (op1, 0);

-      return addr_cost->register_offset;
-    }
-  else if (c == MEM || c == LABEL_REF || c == SYMBOL_REF)
-    return addr_cost->imm_offset;
+         *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
+         *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
+         return true;
+       }

-  return 0;
-}
+      /* We don't know what this is, cost all operands.  */
+      return false;
+
+    case EQ:
+    case NE:
+    case GT:
+    case GTU:
+    case LT:
+    case LTU:
+    case GE:
+    case GEU:
+    case LE:
+    case LEU:
+
+      return false; /* All arguments must be in registers.  */
+
+    case FMA:
+      op0 = XEXP (x, 0);
+      op1 = XEXP (x, 1);
+      op2 = XEXP (x, 2);
+
+      if (speed)
+       *cost += extra_cost->fp[mode == DFmode].fma;
+
+      /* FMSUB, FNMADD, and FNMSUB are free.  */
+      if (GET_CODE (op0) == NEG)
+       op0 = XEXP (op0, 0);
+
+      if (GET_CODE (op2) == NEG)
+       op2 = XEXP (op2, 0);
+
+      /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
+        and the by-element operand as operand 0.  */
+      if (GET_CODE (op1) == NEG)
+       op1 = XEXP (op1, 0);
+
+      /* Catch vector-by-element operations.  The by-element operand can
+        either be (vec_duplicate (vec_select (x))) or just
+        (vec_select (x)), depending on whether we are multiplying by
+        a vector or a scalar.
+
+        Canonicalization is not very good in these cases, FMA4 will put the
+        by-element operand as operand 0, FNMA4 will have it as operand 1.  */
+      if (GET_CODE (op0) == VEC_DUPLICATE)
+       op0 = XEXP (op0, 0);
+      else if (GET_CODE (op1) == VEC_DUPLICATE)
+       op1 = XEXP (op1, 0);
+
+      if (GET_CODE (op0) == VEC_SELECT)
+       op0 = XEXP (op0, 0);
+      else if (GET_CODE (op1) == VEC_SELECT)
+       op1 = XEXP (op1, 0);
+
+      /* If the remaining parameters are not registers,
+        get the cost to put them into registers.  */
+      *cost += rtx_cost (op0, FMA, 0, speed);
+      *cost += rtx_cost (op1, FMA, 1, speed);
+      *cost += rtx_cost (op2, FMA, 2, speed);
+      return true;
+
+    case FLOAT_EXTEND:
+      if (speed)
+       *cost += extra_cost->fp[mode == DFmode].widen;
+      return false;
+
+    case FLOAT_TRUNCATE:
+      if (speed)
+       *cost += extra_cost->fp[mode == DFmode].narrow;
+      return false;
+
+    case ABS:
+      if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+       {
+         /* FABS and FNEG are analogous.  */
+         if (speed)
+           *cost += extra_cost->fp[mode == DFmode].neg;
+       }
+      else
+       {
+         /* Integer ABS will either be split to
+            two arithmetic instructions, or will be an ABS
+            (scalar), which we don't model.  */
+         *cost = COSTS_N_INSNS (2);
+         if (speed)
+           *cost += 2 * extra_cost->alu.arith;
+       }
+      return false;
+
+    case SMAX:
+    case SMIN:
+      if (speed)
+       {
+         /* FMAXNM/FMINNM/FMAX/FMIN.
+            TODO: This may not be accurate for all implementations, but
+            we do not model this in the cost tables.  */
+         *cost += extra_cost->fp[mode == DFmode].addsub;
+       }
+      return false;
+
+    case TRUNCATE:
+
+      /* Decompose muldi3_highpart.  */
+      if (/* (truncate:DI  */
+         mode == DImode
+         /*   (lshiftrt:TI  */
+         && GET_MODE (XEXP (x, 0)) == TImode
+         && GET_CODE (XEXP (x, 0)) == LSHIFTRT
+         /*      (mult:TI  */
+         && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
+         /*        (ANY_EXTEND:TI (reg:DI))
+                   (ANY_EXTEND:TI (reg:DI)))  */
+         && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
+              && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
+             || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
+                 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
+         && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
+         && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
+         /*     (const_int 64)  */
+         && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+         && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
+       {
+         /* UMULH/SMULH.  */
+         if (speed)
+           *cost += extra_cost->mult[mode == DImode].extend;
+         *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
+                            MULT, 0, speed);
+         *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
+                            MULT, 1, speed);
+         return true;
+       }
+
+      /* Fall through.  */
+    default:
+      return true;
+    }
+
+  return false;
+}

 static int
--
cgit v1.2.3