Initial checkin of GCC 4.9.0 from trunk (r208799).

Change-Id: I48a3c08bb98542aa215912a75f03c0890e497dba
author: Ben Cheng <bccheng@google.com> 2014-03-25 22:37:19 -0700
committer: Ben Cheng <bccheng@google.com> 2014-03-25 22:37:19 -0700
commit: 1bc5aee63eb72b341f506ad058502cd0361f0d10 (patch)
tree: c607e8252f3405424ff15bc2d00aa38dadbb2518 /gcc-4.9/gcc/config/i386
parent: 283a0bf58fcf333c58a2a92c3ebbc41fb9eb1fdb (diff)
download: toolchain_gcc-1bc5aee63eb72b341f506ad058502cd0361f0d10.tar.gz
toolchain_gcc-1bc5aee63eb72b341f506ad058502cd0361f0d10.tar.bz2
toolchain_gcc-1bc5aee63eb72b341f506ad058502cd0361f0d10.zip
167 files changed, 133481 insertions, 0 deletions
diff --git a/gcc-4.9/gcc/config/i386/adxintrin.h b/gcc-4.9/gcc/config/i386/adxintrin.h
new file mode 100644
index 000000000..611890044
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/adxintrin.h
@@ -0,0 +1,49 @@
+/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <adxintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _ADXINTRIN_H_INCLUDED
+#define _ADXINTRIN_H_INCLUDED
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_addcarryx_u32 (unsigned char __CF, unsigned int __X,
+		unsigned int __Y, unsigned int *__P)
+{
+    return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_addcarryx_u64 (unsigned char __CF, unsigned long __X,
+		unsigned long __Y, unsigned long long *__P)
+{
+    return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P);
+}
+#endif
+
+#endif /* _ADXINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/ammintrin.h b/gcc-4.9/gcc/config/i386/ammintrin.h
new file mode 100644
index 000000000..a89b2046d
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/ammintrin.h
@@ -0,0 +1,93 @@
+/* Copyright (C) 2007-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the AMD Programmers
+   Manual Update, version 2.x */
+
+#ifndef _AMMINTRIN_H_INCLUDED
+#define _AMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE3, SSE2 and SSE header files*/
+#include <pmmintrin.h>
+
+#ifndef __SSE4A__
+#pragma GCC push_options
+#pragma GCC target("sse4a")
+#define __DISABLE_SSE4A__
+#endif /* __SSE4A__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_sd (double * __P, __m128d __Y)
+{
+  __builtin_ia32_movntsd (__P, (__v2df) __Y);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ss (float * __P, __m128 __Y)
+{
+  __builtin_ia32_movntss (__P, (__v4sf) __Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_si64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extracti_si64 (__m128i __X, unsigned const int __I, unsigned const int __L)
+{
+  return (__m128i) __builtin_ia32_extrqi ((__v2di) __X, __I, __L);
+}
+#else
+#define _mm_extracti_si64(X, I, L)					\
+  ((__m128i) __builtin_ia32_extrqi ((__v2di)(__m128i)(X),		\
+				    (unsigned int)(I), (unsigned int)(L)))
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_si64 (__m128i __X,__m128i __Y)
+{
+  return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I, unsigned const int __L)
+{
+  return (__m128i) __builtin_ia32_insertqi ((__v2di)__X, (__v2di)__Y, __I, __L);
+}
+#else
+#define _mm_inserti_si64(X, Y, I, L)					\
+  ((__m128i) __builtin_ia32_insertqi ((__v2di)(__m128i)(X),		\
+				      (__v2di)(__m128i)(Y),		\
+				      (unsigned int)(I), (unsigned int)(L)))
+#endif
+
+#ifdef __DISABLE_SSE4A__
+#undef __DISABLE_SSE4A__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4A__ */
+
+#endif /* _AMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/athlon.md b/gcc-4.9/gcc/config/i386/athlon.md
new file mode 100644
index 000000000..b207d882f
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/athlon.md
@@ -0,0 +1,1186 @@
+;; Copyright (C) 2002-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; AMD Athlon Scheduling
+;;
+;; The Athlon does contain three pipelined FP units, three integer units and
+;; three address generation units.
+;;
+;; The predecode logic is determining boundaries of instructions in the 64
+;; byte cache line. So the cache line straddling problem of K6 might be issue
+;; here as well, but it is not noted in the documentation.
+;;
+;; Three DirectPath instructions decoders and only one VectorPath decoder
+;; is available. They can decode three DirectPath instructions or one VectorPath
+;; instruction per cycle.
+;; Decoded macro instructions are then passed to 72 entry instruction control
+;; unit, that passes
+;; it to the specialized integer (18 entry) and fp (36 entry) schedulers.
+;;
+;; The load/store queue unit is not attached to the schedulers but
+;; communicates with all the execution units separately instead.
+
+(define_attr "athlon_decode" "direct,vector,double"
+  (cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,leave")
+	   (const_string "vector")
+         (and (eq_attr "type" "push")
+              (match_operand 1 "memory_operand"))
+	   (const_string "vector")
+         (and (eq_attr "type" "fmov")
+	      (and (eq_attr "memory" "load,store")
+		   (eq_attr "mode" "XF")))
+	   (const_string "vector")]
+	(const_string "direct")))
+
+(define_attr "amdfam10_decode" "direct,vector,double"
+  (const_string "direct"))
+;;
+;;           decode0 decode1 decode2
+;;                 \    |   /
+;;    instruction control unit (72 entry scheduler)
+;;                |                        |
+;;      integer scheduler (18)         stack map
+;;     /  |    |    |    |   \        stack rename
+;;  ieu0 agu0 ieu1 agu1 ieu2 agu2      scheduler
+;;    |  agu0  |   agu1      agu2    register file
+;;    |      \ |    |       /         |     |     |
+;;     \      /\    |     /         fadd  fmul  fstore
+;;       \  /    \  |   /           fadd  fmul  fstore
+;;       imul  load/store (2x)      fadd  fmul  fstore
+
+(define_automaton "athlon,athlon_load,athlon_mult,athlon_fp")
+(define_cpu_unit "athlon-decode0" "athlon")
+(define_cpu_unit "athlon-decode1" "athlon")
+(define_cpu_unit "athlon-decode2" "athlon")
+(define_cpu_unit "athlon-decodev" "athlon")
+;; Model the fact that double decoded instruction may take 2 cycles
+;; to decode when decoder2 and decoder0 in next cycle
+;; is used (this is needed to allow troughput of 1.5 double decoded
+;; instructions per cycle).
+;;
+;; In order to avoid dependence between reservation of decoder
+;; and other units, we model decoder as two stage fully pipelined unit
+;; and only double decoded instruction may occupy unit in the first cycle.
+;; With this scheme however two double instructions can be issued cycle0.
+;;
+;; Avoid this by using presence set requiring decoder0 to be allocated
+;; too. Vector decoded instructions then can't be issued when
+;; modeled as consuming decoder0+decoder1+decoder2.
+;; We solve that by specialized vector decoder unit and exclusion set.
+(presence_set "athlon-decode2" "athlon-decode0")
+(exclusion_set "athlon-decodev" "athlon-decode0,athlon-decode1,athlon-decode2")
+(define_reservation "athlon-vector" "nothing,athlon-decodev")
+(define_reservation "athlon-direct0" "nothing,athlon-decode0")
+(define_reservation "athlon-direct" "nothing,
+				     (athlon-decode0 | athlon-decode1
+				     | athlon-decode2)")
+;; Double instructions behaves like two direct instructions.
+(define_reservation "athlon-double" "((athlon-decode2, athlon-decode0)
+				     | (nothing,(athlon-decode0 + athlon-decode1))
+				     | (nothing,(athlon-decode1 + athlon-decode2)))")
+
+;; Agu and ieu unit results in extremely large automatons and
+;; in our approximation they are hardly filled in.  Only ieu
+;; unit can, as issue rate is 3 and agu unit is always used
+;; first in the insn reservations.  Skip the models.
+
+;(define_cpu_unit "athlon-ieu0" "athlon_ieu")
+;(define_cpu_unit "athlon-ieu1" "athlon_ieu")
+;(define_cpu_unit "athlon-ieu2" "athlon_ieu")
+;(define_reservation "athlon-ieu" "(athlon-ieu0 | athlon-ieu1 | athlon-ieu2)")
+(define_reservation "athlon-ieu" "nothing")
+(define_cpu_unit "athlon-ieu0" "athlon")
+;(define_cpu_unit "athlon-agu0" "athlon_agu")
+;(define_cpu_unit "athlon-agu1" "athlon_agu")
+;(define_cpu_unit "athlon-agu2" "athlon_agu")
+;(define_reservation "athlon-agu" "(athlon-agu0 | athlon-agu1 | athlon-agu2)")
+(define_reservation "athlon-agu" "nothing")
+
+(define_cpu_unit "athlon-mult" "athlon_mult")
+
+(define_cpu_unit "athlon-load0" "athlon_load")
+(define_cpu_unit "athlon-load1" "athlon_load")
+(define_reservation "athlon-load" "athlon-agu,
+				   (athlon-load0 | athlon-load1),nothing")
+;; 128bit SSE instructions issue two loads at once
+(define_reservation "athlon-load2" "athlon-agu,
+				   (athlon-load0 + athlon-load1),nothing")
+
+(define_reservation "athlon-store" "(athlon-load0 | athlon-load1)")
+;; 128bit SSE instructions issue two stores at once
+(define_reservation "athlon-store2" "(athlon-load0 + athlon-load1)")
+
+
+;; The FP operations start to execute at stage 12 in the pipeline, while
+;; integer operations start to execute at stage 9 for Athlon and 11 for K8
+;; Compensate the difference for Athlon because it results in significantly
+;; smaller automata.
+(define_reservation "athlon-fpsched" "nothing,nothing,nothing")
+;; The floating point loads.
+(define_reservation "athlon-fpload" "(athlon-fpsched + athlon-load)")
+(define_reservation "athlon-fpload2" "(athlon-fpsched + athlon-load2)")
+(define_reservation "athlon-fploadk8" "(athlon-fpsched + athlon-load)")
+(define_reservation "athlon-fpload2k8" "(athlon-fpsched + athlon-load2)")
+
+
+;; The three fp units are fully pipelined with latency of 3
+(define_cpu_unit "athlon-fadd" "athlon_fp")
+(define_cpu_unit "athlon-fmul" "athlon_fp")
+(define_cpu_unit "athlon-fstore" "athlon_fp")
+(define_reservation "athlon-fany" "(athlon-fstore | athlon-fmul | athlon-fadd)")
+(define_reservation "athlon-faddmul" "(athlon-fadd | athlon-fmul)")
+
+;; Vector operations usually consume many of pipes.
+(define_reservation "athlon-fvector" "(athlon-fadd + athlon-fmul + athlon-fstore)")
+
+
+;; Jump instructions are executed in the branch unit completely transparent to us
+(define_insn_reservation "athlon_branch" 0
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (eq_attr "type" "ibr"))
+			 "athlon-direct,athlon-ieu")
+(define_insn_reservation "athlon_call" 0
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (eq_attr "type" "call,callv"))
+			 "athlon-vector,athlon-ieu")
+(define_insn_reservation "athlon_call_amdfam10" 0
+			 (and (eq_attr "cpu" "amdfam10")
+			      (eq_attr "type" "call,callv"))
+			 "athlon-double,athlon-ieu")
+
+;; Latency of push operation is 3 cycles, but ESP value is available
+;; earlier
+(define_insn_reservation "athlon_push" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (eq_attr "type" "push"))
+			 "athlon-direct,athlon-agu,athlon-store")
+(define_insn_reservation "athlon_pop" 4
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (eq_attr "type" "pop"))
+			 "athlon-vector,athlon-load,athlon-ieu")
+(define_insn_reservation "athlon_pop_k8" 3
+			 (and (eq_attr "cpu" "k8,generic")
+			      (eq_attr "type" "pop"))
+			 "athlon-double,(athlon-ieu+athlon-load)")
+(define_insn_reservation "athlon_pop_amdfam10" 3
+			 (and (eq_attr "cpu" "amdfam10")
+			      (eq_attr "type" "pop"))
+			 "athlon-direct,(athlon-ieu+athlon-load)")
+(define_insn_reservation "athlon_leave" 3
+			 (and (eq_attr "cpu" "athlon")
+			      (eq_attr "type" "leave"))
+			 "athlon-vector,(athlon-ieu+athlon-load)")
+(define_insn_reservation "athlon_leave_k8" 3
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (eq_attr "type" "leave"))
+			 "athlon-double,(athlon-ieu+athlon-load)")
+
+;; Lea executes in AGU unit with 2 cycles latency.
+(define_insn_reservation "athlon_lea" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (eq_attr "type" "lea"))
+			 "athlon-direct,athlon-agu,nothing")
+;; Lea executes in AGU unit with 1 cycle latency on AMDFAM10
+(define_insn_reservation "athlon_lea_amdfam10" 1
+			 (and (eq_attr "cpu" "amdfam10")
+			      (eq_attr "type" "lea"))
+			 "athlon-direct,athlon-agu,nothing")
+
+;; Mul executes in special multiplier unit attached to IEU0
+(define_insn_reservation "athlon_imul" 5
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "none,unknown")))
+			 "athlon-vector,athlon-ieu0,athlon-mult,nothing,nothing,athlon-ieu0")
+;; ??? Widening multiply is vector or double.
+(define_insn_reservation "athlon_imul_k8_DI" 4
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "imul")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "memory" "none,unknown"))))
+			 "athlon-direct0,athlon-ieu0,athlon-mult,nothing,athlon-ieu0")
+(define_insn_reservation "athlon_imul_k8" 3
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "none,unknown")))
+			 "athlon-direct0,athlon-ieu0,athlon-mult,athlon-ieu0")
+(define_insn_reservation "athlon_imul_amdfam10_HI" 4
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "imul")
+				   (and (eq_attr "mode" "HI")
+					(eq_attr "memory" "none,unknown"))))
+			 "athlon-vector,athlon-ieu0,athlon-mult,nothing,athlon-ieu0")
+(define_insn_reservation "athlon_imul_mem" 8
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "load,both")))
+			 "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,nothing,athlon-ieu")
+(define_insn_reservation "athlon_imul_mem_k8_DI" 7
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "imul")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "memory" "load,both"))))
+			 "athlon-vector,athlon-load,athlon-ieu,athlon-mult,nothing,athlon-ieu")
+(define_insn_reservation "athlon_imul_mem_k8" 6
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "load,both")))
+			 "athlon-vector,athlon-load,athlon-ieu,athlon-mult,athlon-ieu")
+
+;; Idiv cannot execute in parallel with other instructions.  Dealing with it
+;; as with short latency vector instruction is good approximation avoiding
+;; scheduler from trying too hard to can hide it's latency by overlap with
+;; other instructions.
+;; ??? Experiments show that the idiv can overlap with roughly 6 cycles
+;; of the other code
+;; Using the same heuristics for amdfam10 as K8 with idiv
+
+(define_insn_reservation "athlon_idiv" 6
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "type" "idiv")
+				   (eq_attr "memory" "none,unknown")))
+			 "athlon-vector,(athlon-ieu0*6+(athlon-fpsched,athlon-fvector))")
+(define_insn_reservation "athlon_idiv_mem" 9
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "type" "idiv")
+				   (eq_attr "memory" "load,both")))
+			 "athlon-vector,((athlon-load,athlon-ieu0*6)+(athlon-fpsched,athlon-fvector))")
+;; The parallelism of string instructions is not documented.  Model it same way
+;; as idiv to create smaller automata.  This probably does not matter much.
+;; Using the same heuristics for amdfam10 as K8 with idiv
+(define_insn_reservation "athlon_str" 6
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "type" "str")
+				   (eq_attr "memory" "load,both,store")))
+			 "athlon-vector,athlon-load,athlon-ieu0*6")
+
+(define_insn_reservation "athlon_idirect" 1
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "athlon_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "none,unknown"))))
+			 "athlon-direct,athlon-ieu")
+(define_insn_reservation "athlon_idirect_amdfam10" 1
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "amdfam10_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "none,unknown"))))
+			 "athlon-direct,athlon-ieu")
+(define_insn_reservation "athlon_ivector" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "athlon_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "none,unknown"))))
+			 "athlon-vector,athlon-ieu,athlon-ieu")
+(define_insn_reservation "athlon_ivector_amdfam10" 2
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "amdfam10_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "none,unknown"))))
+			 "athlon-vector,athlon-ieu,athlon-ieu")
+
+(define_insn_reservation "athlon_idirect_loadmov" 3
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-load")
+
+(define_insn_reservation "athlon_idirect_load" 4
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "athlon_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "load"))))
+			 "athlon-direct,athlon-load,athlon-ieu")
+(define_insn_reservation "athlon_idirect_load_amdfam10" 4
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "amdfam10_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "load"))))
+			 "athlon-direct,athlon-load,athlon-ieu")
+(define_insn_reservation "athlon_ivector_load" 6
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "athlon_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "load"))))
+			 "athlon-vector,athlon-load,athlon-ieu,athlon-ieu")
+(define_insn_reservation "athlon_ivector_load_amdfam10" 6
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "amdfam10_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "load"))))
+			 "athlon-vector,athlon-load,athlon-ieu,athlon-ieu")
+
+(define_insn_reservation "athlon_idirect_movstore" 1
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "store")))
+			 "athlon-direct,athlon-agu,athlon-store")
+
+(define_insn_reservation "athlon_idirect_both" 4
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "athlon_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "both"))))
+			 "athlon-direct,athlon-load,
+			  athlon-ieu,athlon-store,
+			  athlon-store")
+(define_insn_reservation "athlon_idirect_both_amdfam10" 4
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "amdfam10_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "both"))))
+			 "athlon-direct,athlon-load,
+			  athlon-ieu,athlon-store,
+			  athlon-store")
+
+(define_insn_reservation "athlon_ivector_both" 6
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "athlon_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "both"))))
+			 "athlon-vector,athlon-load,
+			  athlon-ieu,
+			  athlon-ieu,
+			  athlon-store")
+(define_insn_reservation "athlon_ivector_both_amdfam10" 6
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "amdfam10_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "both"))))
+			 "athlon-vector,athlon-load,
+			  athlon-ieu,
+			  athlon-ieu,
+			  athlon-store")
+
+(define_insn_reservation "athlon_idirect_store" 1
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "athlon_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "store"))))
+			 "athlon-direct,(athlon-ieu+athlon-agu),
+			  athlon-store")
+(define_insn_reservation "athlon_idirect_store_amdfam10" 1
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "amdfam10_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "store"))))
+			 "athlon-direct,(athlon-ieu+athlon-agu),
+			  athlon-store")
+
+(define_insn_reservation "athlon_ivector_store" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "athlon_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "store"))))
+			 "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu,
+			  athlon-store")
+(define_insn_reservation "athlon_ivector_store_amdfam10" 2
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "amdfam10_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "store"))))
+			 "athlon-vector,(athlon-ieu+athlon-agu),athlon-ieu,
+			  athlon-store")
+
+;; Athlon floatin point unit
+(define_insn_reservation "athlon_fldxf" 12
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "fmov")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "mode" "XF"))))
+			 "athlon-vector,athlon-fpload2,athlon-fvector*9")
+(define_insn_reservation "athlon_fldxf_k8" 13
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "fmov")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "mode" "XF"))))
+			 "athlon-vector,athlon-fpload2k8,athlon-fvector*9")
+;; Assume superforwarding to take place so effective latency of fany op is 0.
+(define_insn_reservation "athlon_fld" 0
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "fmov")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fpload,athlon-fany")
+(define_insn_reservation "athlon_fld_k8" 2
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "fmov")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fstore")
+
+(define_insn_reservation "athlon_fstxf" 10
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "fmov")
+				   (and (eq_attr "memory" "store,both")
+					(eq_attr "mode" "XF"))))
+			 "athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*7))")
+(define_insn_reservation "athlon_fstxf_k8" 8
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "fmov")
+				   (and (eq_attr "memory" "store,both")
+					(eq_attr "mode" "XF"))))
+			 "athlon-vector,(athlon-fpsched+athlon-agu),(athlon-store2+(athlon-fvector*6))")
+(define_insn_reservation "athlon_fst" 4
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "fmov")
+				   (eq_attr "memory" "store,both")))
+			 "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
+(define_insn_reservation "athlon_fst_k8" 2
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "fmov")
+				   (eq_attr "memory" "store,both")))
+			 "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
+(define_insn_reservation "athlon_fist" 4
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (eq_attr "type" "fistp,fisttp"))
+			 "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
+(define_insn_reservation "athlon_fmov" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (eq_attr "type" "fmov"))
+			 "athlon-direct,athlon-fpsched,athlon-faddmul")
+(define_insn_reservation "athlon_fadd_load" 4
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "fop")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fpload,athlon-fadd")
+(define_insn_reservation "athlon_fadd_load_k8" 6
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "fop")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fadd")
+(define_insn_reservation "athlon_fadd" 4
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (eq_attr "type" "fop"))
+			 "athlon-direct,athlon-fpsched,athlon-fadd")
+(define_insn_reservation "athlon_fmul_load" 4
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "fmul")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fpload,athlon-fmul")
+(define_insn_reservation "athlon_fmul_load_k8" 6
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "fmul")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fmul")
+(define_insn_reservation "athlon_fmul" 4
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (eq_attr "type" "fmul"))
+			 "athlon-direct,athlon-fpsched,athlon-fmul")
+(define_insn_reservation "athlon_fsgn" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (eq_attr "type" "fsgn"))
+			 "athlon-direct,athlon-fpsched,athlon-fmul")
+(define_insn_reservation "athlon_fdiv_load" 24
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "fdiv")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fpload,athlon-fmul")
+(define_insn_reservation "athlon_fdiv_load_k8" 13
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "fdiv")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fmul")
+(define_insn_reservation "athlon_fdiv" 24
+			 (and (eq_attr "cpu" "athlon")
+			      (eq_attr "type" "fdiv"))
+			 "athlon-direct,athlon-fpsched,athlon-fmul")
+(define_insn_reservation "athlon_fdiv_k8" 11
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (eq_attr "type" "fdiv"))
+			 "athlon-direct,athlon-fpsched,athlon-fmul")
+(define_insn_reservation "athlon_fpspc_load" 103
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "type" "fpspc")
+				   (eq_attr "memory" "load")))
+			 "athlon-vector,athlon-fpload,athlon-fvector")
+(define_insn_reservation "athlon_fpspc" 100
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (eq_attr "type" "fpspc"))
+			 "athlon-vector,athlon-fpsched,athlon-fvector")
+(define_insn_reservation "athlon_fcmov_load" 7
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "fcmov")
+				   (eq_attr "memory" "load")))
+			 "athlon-vector,athlon-fpload,athlon-fvector")
+(define_insn_reservation "athlon_fcmov" 7
+			 (and (eq_attr "cpu" "athlon")
+			      (eq_attr "type" "fcmov"))
+			 "athlon-vector,athlon-fpsched,athlon-fvector")
+(define_insn_reservation "athlon_fcmov_load_k8" 17
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "fcmov")
+				   (eq_attr "memory" "load")))
+			 "athlon-vector,athlon-fploadk8,athlon-fvector")
+(define_insn_reservation "athlon_fcmov_k8" 15
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (eq_attr "type" "fcmov"))
+			 "athlon-vector,athlon-fpsched,athlon-fvector")
+;; fcomi is vector decoded by uses only one pipe.
+(define_insn_reservation "athlon_fcomi_load" 3
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "fcmp")
+				   (and (eq_attr "athlon_decode" "vector")
+				        (eq_attr "memory" "load"))))
+			 "athlon-vector,athlon-fpload,athlon-fadd")
+(define_insn_reservation "athlon_fcomi_load_k8" 5
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "fcmp")
+				   (and (eq_attr "athlon_decode" "vector")
+				        (eq_attr "memory" "load"))))
+			 "athlon-vector,athlon-fploadk8,athlon-fadd")
+(define_insn_reservation "athlon_fcomi" 3
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "athlon_decode" "vector")
+				   (eq_attr "type" "fcmp")))
+			 "athlon-vector,athlon-fpsched,athlon-fadd")
+(define_insn_reservation "athlon_fcom_load" 2
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "fcmp")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fpload,athlon-fadd")
+(define_insn_reservation "athlon_fcom_load_k8" 4
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "fcmp")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fadd")
+(define_insn_reservation "athlon_fcom" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (eq_attr "type" "fcmp"))
+			 "athlon-direct,athlon-fpsched,athlon-fadd")
+;; Never seen by the scheduler because we still don't do post reg-stack
+;; scheduling.
+;(define_insn_reservation "athlon_fxch" 2
+;			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+;			      (eq_attr "type" "fxch"))
+;			 "athlon-direct,athlon-fpsched,athlon-fany")
+
+;; Athlon handle MMX operations in the FPU unit with shorter latencies
+
+(define_insn_reservation "athlon_movlpd_load" 0
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssemov")
+				   (match_operand:DF 1 "memory_operand")))
+			 "athlon-direct,athlon-fpload,athlon-fany")
+(define_insn_reservation "athlon_movlpd_load_k8" 2
+			 (and (eq_attr "cpu" "k8")
+			      (and (eq_attr "type" "ssemov")
+				   (match_operand:DF 1 "memory_operand")))
+			 "athlon-direct,athlon-fploadk8,athlon-fstore")
+(define_insn_reservation "athlon_movsd_load_generic" 2
+			 (and (eq_attr "cpu" "generic")
+			      (and (eq_attr "type" "ssemov")
+				   (match_operand:DF 1 "memory_operand")))
+			 "athlon-double,athlon-fploadk8,(athlon-fstore+athlon-fmul)")
+(define_insn_reservation "athlon_movaps_load_k8" 2
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V4SF,V2DF,TI")
+					(eq_attr "memory" "load"))))
+			 "athlon-double,athlon-fpload2k8,athlon-fstore,athlon-fstore")
+(define_insn_reservation "athlon_movaps_load" 0
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V4SF,V2DF,TI")
+					(eq_attr "memory" "load"))))
+			 "athlon-vector,athlon-fpload2,(athlon-fany+athlon-fany)")
+(define_insn_reservation "athlon_movss_load" 1
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "SF,DI")
+					(eq_attr "memory" "load"))))
+			 "athlon-vector,athlon-fpload,(athlon-fany*2)")
+(define_insn_reservation "athlon_movss_load_k8" 1
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "SF,DI")
+					(eq_attr "memory" "load"))))
+			 "athlon-double,athlon-fploadk8,(athlon-fstore+athlon-fany)")
+(define_insn_reservation "athlon_mmxsseld" 0
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fpload,athlon-fany")
+(define_insn_reservation "athlon_mmxsseld_k8" 2
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fstore")
+;; On AMDFAM10 all double, single and integer packed and scalar SSEx data
+;; loads  generated are direct path, latency of 2 and do not use any FP
+;; executions units. No separate entries for movlpx/movhpx loads, which
+;; are direct path, latency of 4 and use the FADD/FMUL FP execution units,
+;; as they will not be generated.
+(define_insn_reservation "athlon_sseld_amdfam10" 2
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssemov")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8")
+;; On AMDFAM10 MMX data loads  generated are direct path, latency of 4
+;; and can use any  FP executions units
+(define_insn_reservation "athlon_mmxld_amdfam10" 4
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "mmxmov")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8, athlon-fany")
+(define_insn_reservation "athlon_mmxssest" 3
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (and (eq_attr "mode" "V4SF,V2DF,TI")
+					(eq_attr "memory" "store,both"))))
+			 "athlon-vector,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store2)*2)")
+(define_insn_reservation "athlon_mmxssest_k8" 3
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (and (eq_attr "mode" "V4SF,V2DF,TI")
+					(eq_attr "memory" "store,both"))))
+			 "athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store2)*2)")
+(define_insn_reservation "athlon_mmxssest_short" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (eq_attr "memory" "store,both")))
+			 "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
+;; On AMDFAM10 all double, single and integer packed SSEx data stores
+;; generated are all double path, latency of 2 and use the FSTORE FP
+;; execution unit. No entries separate for movupx/movdqu, which are
+;; vector path, latency of 3 and use the FSTORE*2 FP execution unit,
+;; as they will not be generated.
+(define_insn_reservation "athlon_ssest_amdfam10" 2
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V4SF,V2DF,TI")
+					(eq_attr "memory" "store,both"))))
+			 "athlon-double,(athlon-fpsched+athlon-agu),((athlon-fstore+athlon-store)*2)")
+;; On AMDFAM10 all double, single and integer scalar SSEx and MMX
+;; data stores generated are all direct path, latency of 2 and use
+;; the FSTORE FP execution unit
+(define_insn_reservation "athlon_mmxssest_short_amdfam10" 2
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (eq_attr "memory" "store,both")))
+			 "athlon-direct,(athlon-fpsched+athlon-agu),(athlon-fstore+athlon-store)")
+(define_insn_reservation "athlon_movaps_k8" 2
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "ssemov")
+				   (eq_attr "mode" "V4SF,V2DF,TI")))
+			 "athlon-double,athlon-fpsched,((athlon-faddmul+athlon-faddmul) | (athlon-faddmul, athlon-faddmul))")
+(define_insn_reservation "athlon_movaps" 2
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssemov")
+				   (eq_attr "mode" "V4SF,V2DF,TI")))
+			 "athlon-vector,athlon-fpsched,(athlon-faddmul+athlon-faddmul)")
+(define_insn_reservation "athlon_mmxssemov" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (eq_attr "type" "mmxmov,ssemov"))
+			 "athlon-direct,athlon-fpsched,athlon-faddmul")
+(define_insn_reservation "athlon_mmxmul_load" 4
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "type" "mmxmul")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fpload,athlon-fmul")
+(define_insn_reservation "athlon_mmxmul" 3
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (eq_attr "type" "mmxmul"))
+			 "athlon-direct,athlon-fpsched,athlon-fmul")
+(define_insn_reservation "athlon_mmx_load" 3
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "unit" "mmx")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fpload,athlon-faddmul")
+(define_insn_reservation "athlon_mmx" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (eq_attr "unit" "mmx"))
+			 "athlon-direct,athlon-fpsched,athlon-faddmul")
+;; SSE operations are handled by the i387 unit as well.  The latency
+;; is same as for i387 operations for scalar operations
+
+(define_insn_reservation "athlon_sselog_load" 3
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1")
+				   (eq_attr "memory" "load")))
+			 "athlon-vector,athlon-fpload2,(athlon-fmul*2)")
+(define_insn_reservation "athlon_sselog_load_k8" 5
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1")
+				   (eq_attr "memory" "load")))
+			 "athlon-double,athlon-fpload2k8,(athlon-fmul*2)")
+(define_insn_reservation "athlon_sselog_load_amdfam10" 4
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,(athlon-fadd|athlon-fmul)")
+(define_insn_reservation "athlon_sselog" 3
+			 (and (eq_attr "cpu" "athlon")
+			      (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))
+			 "athlon-vector,athlon-fpsched,athlon-fmul*2")
+(define_insn_reservation "athlon_sselog_k8" 3
+			 (and (eq_attr "cpu" "k8,generic")
+			      (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))
+			 "athlon-double,athlon-fpsched,athlon-fmul")
+(define_insn_reservation "athlon_sselog_amdfam10" 2
+			 (and (eq_attr "cpu" "amdfam10")
+			      (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))
+			 "athlon-direct,athlon-fpsched,(athlon-fadd|athlon-fmul)")
+
+;; ??? pcmp executes in addmul, probably not worthwhile to bother about that.
+(define_insn_reservation "athlon_ssecmp_load" 2
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssecmp")
+				   (and (eq_attr "mode" "SF,DF,DI")
+					(eq_attr "memory" "load"))))
+			 "athlon-direct,athlon-fpload,athlon-fadd")
+(define_insn_reservation "athlon_ssecmp_load_k8" 4
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "ssecmp")
+				   (and (eq_attr "mode" "SF,DF,DI,TI")
+					(eq_attr "memory" "load"))))
+			 "athlon-direct,athlon-fploadk8,athlon-fadd")
+(define_insn_reservation "athlon_ssecmp" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "type" "ssecmp")
+				   (eq_attr "mode" "SF,DF,DI,TI")))
+			 "athlon-direct,athlon-fpsched,athlon-fadd")
+(define_insn_reservation "athlon_ssecmpvector_load" 3
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssecmp")
+				   (eq_attr "memory" "load")))
+			 "athlon-vector,athlon-fpload2,(athlon-fadd*2)")
+(define_insn_reservation "athlon_ssecmpvector_load_k8" 5
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "ssecmp")
+				   (eq_attr "memory" "load")))
+			 "athlon-double,athlon-fpload2k8,(athlon-fadd*2)")
+(define_insn_reservation "athlon_ssecmpvector_load_amdfam10" 4
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssecmp")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fadd")
+(define_insn_reservation "athlon_ssecmpvector" 3
+			 (and (eq_attr "cpu" "athlon")
+			      (eq_attr "type" "ssecmp"))
+			 "athlon-vector,athlon-fpsched,(athlon-fadd*2)")
+(define_insn_reservation "athlon_ssecmpvector_k8" 3
+			 (and (eq_attr "cpu" "k8,generic")
+			      (eq_attr "type" "ssecmp"))
+			 "athlon-double,athlon-fpsched,(athlon-fadd*2)")
+(define_insn_reservation "athlon_ssecmpvector_amdfam10" 2
+			 (and (eq_attr "cpu" "amdfam10")
+			      (eq_attr "type" "ssecmp"))
+			 "athlon-direct,athlon-fpsched,athlon-fadd")
+(define_insn_reservation "athlon_ssecomi_load" 4
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssecomi")
+				   (eq_attr "memory" "load")))
+			 "athlon-vector,athlon-fpload,athlon-fadd")
+(define_insn_reservation "athlon_ssecomi_load_k8" 6
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "ssecomi")
+				   (eq_attr "memory" "load")))
+			 "athlon-vector,athlon-fploadk8,athlon-fadd")
+(define_insn_reservation "athlon_ssecomi_load_amdfam10" 5
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssecomi")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fadd")
+(define_insn_reservation "athlon_ssecomi" 4
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (eq_attr "type" "ssecomi"))
+			 "athlon-vector,athlon-fpsched,athlon-fadd")
+(define_insn_reservation "athlon_ssecomi_amdfam10" 3
+			 (and (eq_attr "cpu" "amdfam10")
+;; It seems athlon_ssecomi has a bug in the attr_type, fixed for amdfam10
+			      (eq_attr "type" "ssecomi"))
+			 "athlon-direct,athlon-fpsched,athlon-fadd")
+(define_insn_reservation "athlon_sseadd_load" 4
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "sseadd,sseadd1")
+				   (and (eq_attr "mode" "SF,DF,DI")
+					(eq_attr "memory" "load"))))
+			 "athlon-direct,athlon-fpload,athlon-fadd")
+(define_insn_reservation "athlon_sseadd_load_k8" 6
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "sseadd,sseadd1")
+				   (and (eq_attr "mode" "SF,DF,DI")
+					(eq_attr "memory" "load"))))
+			 "athlon-direct,athlon-fploadk8,athlon-fadd")
+(define_insn_reservation "athlon_sseadd" 4
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "type" "sseadd,sseadd1")
+				   (eq_attr "mode" "SF,DF,DI")))
+			 "athlon-direct,athlon-fpsched,athlon-fadd")
+(define_insn_reservation "athlon_sseaddvector_load" 5
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "sseadd,sseadd1")
+				   (eq_attr "memory" "load")))
+			 "athlon-vector,athlon-fpload2,(athlon-fadd*2)")
+(define_insn_reservation "athlon_sseaddvector_load_k8" 7
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "sseadd,sseadd1")
+				   (eq_attr "memory" "load")))
+			 "athlon-double,athlon-fpload2k8,(athlon-fadd*2)")
+(define_insn_reservation "athlon_sseaddvector_load_amdfam10" 6
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "sseadd,sseadd1")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fadd")
+(define_insn_reservation "athlon_sseaddvector" 5
+			 (and (eq_attr "cpu" "athlon")
+			      (eq_attr "type" "sseadd,sseadd1"))
+			 "athlon-vector,athlon-fpsched,(athlon-fadd*2)")
+(define_insn_reservation "athlon_sseaddvector_k8" 5
+			 (and (eq_attr "cpu" "k8,generic")
+			      (eq_attr "type" "sseadd,sseadd1"))
+			 "athlon-double,athlon-fpsched,(athlon-fadd*2)")
+(define_insn_reservation "athlon_sseaddvector_amdfam10" 4
+			 (and (eq_attr "cpu" "amdfam10")
+			      (eq_attr "type" "sseadd,sseadd1"))
+			 "athlon-direct,athlon-fpsched,athlon-fadd")
+
+;; Conversions behaves very irregularly and the scheduling is critical here.
+;; Take each instruction separately.  Assume that the mode is always set to the
+;; destination one and athlon_decode is set to the K8 versions.
+
+;; cvtss2sd
+(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_k8" 4
+			 (and (eq_attr "cpu" "k8,athlon,generic")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "athlon_decode" "direct")
+					(and (eq_attr "mode" "DF")
+					     (eq_attr "memory" "load")))))
+			 "athlon-direct,athlon-fploadk8,athlon-fstore")
+(define_insn_reservation "athlon_ssecvt_cvtss2sd_load_amdfam10" 7
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "amdfam10_decode" "double")
+					(and (eq_attr "mode" "DF")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
+(define_insn_reservation "athlon_ssecvt_cvtss2sd" 2
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "athlon_decode" "direct")
+					(eq_attr "mode" "DF"))))
+			 "athlon-direct,athlon-fpsched,athlon-fstore")
+(define_insn_reservation "athlon_ssecvt_cvtss2sd_amdfam10" 7
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "amdfam10_decode" "vector")
+					(eq_attr "mode" "DF"))))
+			 "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)")
+;; cvtps2pd.  Model same way the other double decoded FP conversions.
+(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_k8" 5
+			 (and (eq_attr "cpu" "k8,athlon,generic")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "athlon_decode" "double")
+					(and (eq_attr "mode" "V2DF,V4SF,TI")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fpload2k8,(athlon-fstore*2)")
+(define_insn_reservation "athlon_ssecvt_cvtps2pd_load_amdfam10" 4
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "amdfam10_decode" "direct")
+					(and (eq_attr "mode" "V2DF,V4SF,TI")
+					     (eq_attr "memory" "load")))))
+			 "athlon-direct,athlon-fploadk8,athlon-fstore")
+(define_insn_reservation "athlon_ssecvt_cvtps2pd_k8" 3
+			 (and (eq_attr "cpu" "k8,athlon,generic")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "athlon_decode" "double")
+					(eq_attr "mode" "V2DF,V4SF,TI"))))
+			 "athlon-double,athlon-fpsched,athlon-fstore,athlon-fstore")
+(define_insn_reservation "athlon_ssecvt_cvtps2pd_amdfam10" 2
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "amdfam10_decode" "direct")
+					(eq_attr "mode" "V2DF,V4SF,TI"))))
+			 "athlon-direct,athlon-fpsched,athlon-fstore")
+;; cvtsi2sd mem,reg is directpath path  (cvtsi2sd reg,reg is doublepath)
+;; cvtsi2sd has troughput 1 and is executed in store unit with latency of 6
+(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load" 6
+			 (and (eq_attr "cpu" "athlon,k8")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "athlon_decode" "direct")
+					(and (eq_attr "mode" "SF,DF")
+					     (eq_attr "memory" "load")))))
+			 "athlon-direct,athlon-fploadk8,athlon-fstore")
+(define_insn_reservation "athlon_sseicvt_cvtsi2sd_load_amdfam10" 9
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "amdfam10_decode" "double")
+					(and (eq_attr "mode" "SF,DF")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
+;; cvtsi2ss mem, reg is doublepath
+(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load" 9
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "athlon_decode" "double")
+					(and (eq_attr "mode" "SF,DF")
+					     (eq_attr "memory" "load")))))
+			 "athlon-vector,athlon-fpload,(athlon-fstore*2)")
+(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_k8" 9
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "athlon_decode" "double")
+					(and (eq_attr "mode" "SF,DF")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fploadk8,(athlon-fstore*2)")
+(define_insn_reservation "athlon_sseicvt_cvtsi2ss_load_amdfam10" 9
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "amdfam10_decode" "double")
+					(and (eq_attr "mode" "SF,DF")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
+;; cvtsi2sd reg,reg is double decoded (vector on Athlon)
+(define_insn_reservation "athlon_sseicvt_cvtsi2sd_k8" 11
+			 (and (eq_attr "cpu" "k8,athlon,generic")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "athlon_decode" "double")
+					(and (eq_attr "mode" "SF,DF")
+					     (eq_attr "memory" "none")))))
+			 "athlon-double,athlon-fploadk8,athlon-fstore")
+(define_insn_reservation "athlon_sseicvt_cvtsi2sd_amdfam10" 14
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "amdfam10_decode" "vector")
+					(and (eq_attr "mode" "SF,DF")
+					     (eq_attr "memory" "none")))))
+			 "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
+;; cvtsi2ss reg, reg is doublepath
+(define_insn_reservation "athlon_sseicvt_cvtsi2ss" 14
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "athlon_decode" "vector")
+					(and (eq_attr "mode" "SF,DF")
+					     (eq_attr "memory" "none")))))
+			 "athlon-vector,athlon-fploadk8,(athlon-fvector*2)")
+(define_insn_reservation "athlon_sseicvt_cvtsi2ss_amdfam10" 14
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "amdfam10_decode" "vector")
+					(and (eq_attr "mode" "SF,DF")
+					     (eq_attr "memory" "none")))))
+			 "athlon-vector,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
+;; cvtsd2ss mem,reg is doublepath, troughput unknown, latency 9
+(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_k8" 9
+			 (and (eq_attr "cpu" "k8,athlon,generic")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "athlon_decode" "double")
+					(and (eq_attr "mode" "SF")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fploadk8,(athlon-fstore*3)")
+(define_insn_reservation "athlon_ssecvt_cvtsd2ss_load_amdfam10" 9
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "amdfam10_decode" "double")
+					(and (eq_attr "mode" "SF")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
+;; cvtsd2ss reg,reg is vectorpath, troughput unknown, latency 12
+(define_insn_reservation "athlon_ssecvt_cvtsd2ss" 12
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "athlon_decode" "vector")
+					(and (eq_attr "mode" "SF")
+					     (eq_attr "memory" "none")))))
+			 "athlon-vector,athlon-fpsched,(athlon-fvector*3)")
+(define_insn_reservation "athlon_ssecvt_cvtsd2ss_amdfam10" 8
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "amdfam10_decode" "vector")
+					(and (eq_attr "mode" "SF")
+					     (eq_attr "memory" "none")))))
+			 "athlon-vector,athlon-fpsched,athlon-faddmul,(athlon-fstore*2)")
+(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_k8" 8
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "athlon_decode" "vector")
+					(and (eq_attr "mode" "V4SF,V2DF,TI")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fpload2k8,(athlon-fstore*3)")
+(define_insn_reservation "athlon_ssecvt_cvtpd2ps_load_amdfam10" 9
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "amdfam10_decode" "double")
+					(and (eq_attr "mode" "V4SF,V2DF,TI")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
+;; cvtpd2ps mem,reg is vectorpath, troughput unknown, latency 10
+;; ??? Why it is fater than cvtsd2ss?
+(define_insn_reservation "athlon_ssecvt_cvtpd2ps" 8
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "athlon_decode" "vector")
+					(and (eq_attr "mode" "V4SF,V2DF,TI")
+					     (eq_attr "memory" "none")))))
+			 "athlon-vector,athlon-fpsched,athlon-fvector*2")
+(define_insn_reservation "athlon_ssecvt_cvtpd2ps_amdfam10" 7
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "amdfam10_decode" "double")
+					(and (eq_attr "mode" "V4SF,V2DF,TI")
+					     (eq_attr "memory" "none")))))
+			 "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)")
+;; cvtsd2si mem,reg is doublepath, troughput 1, latency 9
+(define_insn_reservation "athlon_secvt_cvtsX2si_load" 9
+			 (and (eq_attr "cpu" "athlon,k8,generic")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "athlon_decode" "vector")
+					(and (eq_attr "mode" "SI,DI")
+					     (eq_attr "memory" "load")))))
+			 "athlon-vector,athlon-fploadk8,athlon-fvector")
+(define_insn_reservation "athlon_secvt_cvtsX2si_load_amdfam10" 10
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "amdfam10_decode" "double")
+					(and (eq_attr "mode" "SI,DI")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fploadk8,(athlon-fadd+athlon-fstore)")
+;; cvtsd2si reg,reg is doublepath, troughput 1, latency 9
+(define_insn_reservation "athlon_ssecvt_cvtsX2si" 9
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "athlon_decode" "double")
+					(and (eq_attr "mode" "SI,DI")
+					     (eq_attr "memory" "none")))))
+			 "athlon-vector,athlon-fpsched,athlon-fvector")
+(define_insn_reservation "athlon_ssecvt_cvtsX2si_k8" 9
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "athlon_decode" "double")
+					(and (eq_attr "mode" "SI,DI")
+					     (eq_attr "memory" "none")))))
+			 "athlon-double,athlon-fpsched,athlon-fstore")
+(define_insn_reservation "athlon_ssecvt_cvtsX2si_amdfam10" 8
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "amdfam10_decode" "double")
+					(and (eq_attr "mode" "SI,DI")
+					     (eq_attr "memory" "none")))))
+			 "athlon-double,athlon-fpsched,(athlon-fadd+athlon-fstore)")
+;; cvtpd2dq reg,mem is doublepath, troughput 1, latency 9 on amdfam10
+(define_insn_reservation "athlon_sseicvt_cvtpd2dq_load_amdfam10" 9
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "amdfam10_decode" "double")
+					(and (eq_attr "mode" "TI")
+					     (eq_attr "memory" "load")))))
+			 "athlon-double,athlon-fploadk8,(athlon-faddmul+athlon-fstore)")
+;; cvtpd2dq reg,mem is doublepath, troughput 1, latency 7 on amdfam10
+(define_insn_reservation "athlon_sseicvt_cvtpd2dq_amdfam10" 7
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "amdfam10_decode" "double")
+					(and (eq_attr "mode" "TI")
+					     (eq_attr "memory" "none")))))
+			 "athlon-double,athlon-fpsched,(athlon-faddmul+athlon-fstore)")
+
+
+(define_insn_reservation "athlon_ssemul_load" 4
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssemul")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "load"))))
+			 "athlon-direct,athlon-fpload,athlon-fmul")
+(define_insn_reservation "athlon_ssemul_load_k8" 6
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "ssemul")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "load"))))
+			 "athlon-direct,athlon-fploadk8,athlon-fmul")
+(define_insn_reservation "athlon_ssemul" 4
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "type" "ssemul")
+				   (eq_attr "mode" "SF,DF")))
+			 "athlon-direct,athlon-fpsched,athlon-fmul")
+(define_insn_reservation "athlon_ssemulvector_load" 5
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssemul")
+				   (eq_attr "memory" "load")))
+			 "athlon-vector,athlon-fpload2,(athlon-fmul*2)")
+(define_insn_reservation "athlon_ssemulvector_load_k8" 7
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "ssemul")
+				   (eq_attr "memory" "load")))
+			 "athlon-double,athlon-fpload2k8,(athlon-fmul*2)")
+(define_insn_reservation "athlon_ssemulvector_load_amdfam10" 6
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssemul")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fmul")
+(define_insn_reservation "athlon_ssemulvector" 5
+			 (and (eq_attr "cpu" "athlon")
+			      (eq_attr "type" "ssemul"))
+			 "athlon-vector,athlon-fpsched,(athlon-fmul*2)")
+(define_insn_reservation "athlon_ssemulvector_k8" 5
+			 (and (eq_attr "cpu" "k8,generic")
+			      (eq_attr "type" "ssemul"))
+			 "athlon-double,athlon-fpsched,(athlon-fmul*2)")
+(define_insn_reservation "athlon_ssemulvector_amdfam10" 4
+			 (and (eq_attr "cpu" "amdfam10")
+			      (eq_attr "type" "ssemul"))
+			 "athlon-direct,athlon-fpsched,athlon-fmul")
+;; divsd timings.  divss is faster
+(define_insn_reservation "athlon_ssediv_load" 20
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "load"))))
+			 "athlon-direct,athlon-fpload,athlon-fmul*17")
+(define_insn_reservation "athlon_ssediv_load_k8" 22
+			 (and (eq_attr "cpu" "k8,generic,amdfam10")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "load"))))
+			 "athlon-direct,athlon-fploadk8,athlon-fmul*17")
+(define_insn_reservation "athlon_ssediv" 20
+			 (and (eq_attr "cpu" "athlon,k8,generic,amdfam10")
+			      (and (eq_attr "type" "ssediv")
+				   (eq_attr "mode" "SF,DF")))
+			 "athlon-direct,athlon-fpsched,athlon-fmul*17")
+(define_insn_reservation "athlon_ssedivvector_load" 39
+			 (and (eq_attr "cpu" "athlon")
+			      (and (eq_attr "type" "ssediv")
+				   (eq_attr "memory" "load")))
+			 "athlon-vector,athlon-fpload2,athlon-fmul*34")
+(define_insn_reservation "athlon_ssedivvector_load_k8" 35
+			 (and (eq_attr "cpu" "k8,generic")
+			      (and (eq_attr "type" "ssediv")
+				   (eq_attr "memory" "load")))
+			 "athlon-double,athlon-fpload2k8,athlon-fmul*34")
+(define_insn_reservation "athlon_ssedivvector_load_amdfam10" 22
+			 (and (eq_attr "cpu" "amdfam10")
+			      (and (eq_attr "type" "ssediv")
+				   (eq_attr "memory" "load")))
+			 "athlon-direct,athlon-fploadk8,athlon-fmul*17")
+(define_insn_reservation "athlon_ssedivvector" 39
+			 (and (eq_attr "cpu" "athlon")
+			      (eq_attr "type" "ssediv"))
+			 "athlon-vector,athlon-fmul*34")
+(define_insn_reservation "athlon_ssedivvector_k8" 39
+			 (and (eq_attr "cpu" "k8,generic")
+			      (eq_attr "type" "ssediv"))
+			 "athlon-double,athlon-fmul*34")
+(define_insn_reservation "athlon_ssedivvector_amdfam10" 20
+			 (and (eq_attr "cpu" "amdfam10")
+			      (eq_attr "type" "ssediv"))
+			 "athlon-direct,athlon-fmul*17")
+(define_insn_reservation "athlon_sseins_amdfam10" 5
+                         (and (eq_attr "cpu" "amdfam10")
+                              (and (eq_attr "type" "sseins")
+                                   (eq_attr "mode" "TI")))
+                         "athlon-vector,athlon-fpsched,athlon-faddmul")
diff --git a/gcc-4.9/gcc/config/i386/atom.md b/gcc-4.9/gcc/config/i386/atom.md
new file mode 100644
index 000000000..7102df123
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/atom.md
@@ -0,0 +1,794 @@
+;; Atom Scheduling
+;; Copyright (C) 2009-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; Atom is an in-order core with two integer pipelines.
+
+
+(define_attr "atom_unit" "sishuf,simul,jeu,complex,other" 
+  (const_string "other"))
+
+(define_attr "atom_sse_attr" "rcp,movdup,lfence,fence,prefetch,sqrt,mxcsr,other"
+  (const_string "other"))
+
+(define_automaton "atom")
+
+;;  Atom has two ports: port 0 and port 1 connecting to all execution units
+(define_cpu_unit "atom-port-0,atom-port-1" "atom")
+
+;;  EU: Execution Unit
+;;  Atom EUs are connected by port 0 or port 1. 
+
+(define_cpu_unit "atom-eu-0, atom-eu-1,
+                  atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4"
+                  "atom")
+
+;; Some EUs have duplicated copied and can be accessed via either
+;; port 0 or port 1
+;; (define_reservation "atom-port-either" "(atom-port-0 | atom-port-1)")
+
+;;; Some instructions is dual-pipe execution, need both ports
+;;; Complex multi-op macro-instructoins need both ports and all EUs
+(define_reservation "atom-port-dual" "(atom-port-0 + atom-port-1)")
+(define_reservation "atom-all-eu" "(atom-eu-0 + atom-eu-1 + 
+                                    atom-imul-1 + atom-imul-2 + atom-imul-3 +
+                                    atom-imul-4)")
+
+;;; Most of simple instructions have 1 cycle latency. Some of them
+;;; issue in port 0, some in port 0 and some in either port.
+(define_reservation "atom-simple-0" "(atom-port-0 + atom-eu-0)")
+(define_reservation "atom-simple-1" "(atom-port-1 + atom-eu-1)")
+(define_reservation "atom-simple-either" "(atom-simple-0 | atom-simple-1)")
+
+;;; Some insn issues in port 0 with 3 cycle latency and 1 cycle tput
+(define_reservation "atom-eu-0-3-1" "(atom-port-0 + atom-eu-0, nothing*2)")
+
+;;; fmul insn can have 4 or 5 cycles latency
+(define_reservation "atom-fmul-5c" "(atom-port-0 + atom-eu-0), nothing*4")
+(define_reservation "atom-fmul-4c" "(atom-port-0 + atom-eu-0), nothing*3")
+
+;;; fadd can has 5 cycles latency depends on instruction forms
+(define_reservation "atom-fadd-5c" "(atom-port-1 + atom-eu-1), nothing*5")
+
+;;; imul insn has 5 cycles latency
+(define_reservation "atom-imul-32" 
+                    "atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4, 
+                     atom-port-0")
+;;; imul instruction excludes other non-FP instructions.
+(exclusion_set "atom-eu-0, atom-eu-1" 
+               "atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4")
+
+;;; dual-execution instructions can have 1,2,4,5 cycles latency depends on 
+;;; instruction forms
+(define_reservation "atom-dual-1c" "(atom-port-dual + atom-eu-0 + atom-eu-1)")
+(define_reservation "atom-dual-2c"
+                    "(atom-port-dual + atom-eu-0 + atom-eu-1, nothing)")
+(define_reservation "atom-dual-5c"
+                    "(atom-port-dual + atom-eu-0 + atom-eu-1, nothing*4)")
+
+;;; Complex macro-instruction has variants of latency, and uses both ports.
+(define_reservation "atom-complex" "(atom-port-dual + atom-all-eu)")
+
+(define_insn_reservation  "atom_other" 9
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "other")
+            (eq_attr "atom_unit" "!jeu")))
+  "atom-complex, atom-all-eu*8")
+
+;; return has type "other" with atom_unit "jeu"
+(define_insn_reservation  "atom_other_2" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "other")
+            (eq_attr "atom_unit" "jeu")))
+  "atom-dual-1c")
+
+(define_insn_reservation  "atom_multi" 9
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "multi"))
+  "atom-complex, atom-all-eu*8")
+
+;; Normal alu insns without carry
+(define_insn_reservation  "atom_alu" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "alu")
+            (and (eq_attr "memory" "none")
+                 (eq_attr "use_carry" "0"))))
+  "atom-simple-either")
+
+;; Normal alu insns without carry
+(define_insn_reservation  "atom_alu_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "alu")
+            (and (eq_attr "memory" "!none")
+                 (eq_attr "use_carry" "0"))))
+  "atom-simple-either")
+
+;; Alu insn consuming CF, such as add/sbb
+(define_insn_reservation  "atom_alu_carry" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "alu")
+            (and (eq_attr "memory" "none")
+                 (eq_attr "use_carry" "1"))))
+  "atom-simple-either")
+
+;; Alu insn consuming CF, such as add/sbb
+(define_insn_reservation  "atom_alu_carry_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "alu")
+            (and (eq_attr "memory" "!none")
+                (eq_attr "use_carry" "1"))))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_alu1" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "alu1")
+            (eq_attr "memory" "none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_alu1_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "alu1")
+            (eq_attr "memory" "!none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_negnot" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "negnot")
+            (eq_attr "memory" "none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_negnot_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "negnot")
+            (eq_attr "memory" "!none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_imov" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "imov")
+            (eq_attr "memory" "none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_imov_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "imov")
+            (eq_attr "memory" "!none")))
+  "atom-simple-either")
+
+;; 16<-16, 32<-32
+(define_insn_reservation  "atom_imovx" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "imovx")
+            (and (eq_attr "memory" "none")
+                 (ior (and (match_operand:HI 0 "register_operand")
+                           (match_operand:HI 1 "general_operand"))
+                      (and (match_operand:SI 0 "register_operand")
+                           (match_operand:SI 1 "general_operand"))))))
+  "atom-simple-either")
+
+;; 16<-16, 32<-32, mem
+(define_insn_reservation  "atom_imovx_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "imovx")
+            (and (eq_attr "memory" "!none")
+                 (ior (and (match_operand:HI 0 "register_operand")
+                           (match_operand:HI 1 "general_operand"))
+                      (and (match_operand:SI 0 "register_operand")
+                           (match_operand:SI 1 "general_operand"))))))
+  "atom-simple-either")
+
+;; 32<-16, 32<-8, 64<-16, 64<-8, 64<-32, 8<-8
+(define_insn_reservation  "atom_imovx_2" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "imovx")
+            (and (eq_attr "memory" "none")
+                 (ior (match_operand:QI 0 "register_operand")
+                      (ior (and (match_operand:SI 0 "register_operand")
+                                (not (match_operand:SI 1 "general_operand")))
+                           (match_operand:DI 0 "register_operand"))))))
+  "atom-simple-0")
+
+;; 32<-16, 32<-8, 64<-16, 64<-8, 64<-32, 8<-8, mem
+(define_insn_reservation  "atom_imovx_2_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "imovx")
+            (and (eq_attr "memory" "!none")
+                 (ior (match_operand:QI 0 "register_operand")
+                      (ior (and (match_operand:SI 0 "register_operand")
+                                (not (match_operand:SI 1 "general_operand")))
+                           (match_operand:DI 0 "register_operand"))))))
+  "atom-simple-0")
+
+;; 16<-8
+(define_insn_reservation  "atom_imovx_3" 3
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "imovx")
+            (and (match_operand:HI 0 "register_operand")
+                 (match_operand:QI 1 "general_operand"))))
+  "atom-complex, atom-all-eu*2")
+
+(define_insn_reservation  "atom_lea" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "lea")
+            (eq_attr "mode" "!HI")))
+  "atom-simple-either")
+
+;; lea 16bit address is complex insn
+(define_insn_reservation  "atom_lea_2" 2
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "lea")
+            (eq_attr "mode" "HI")))
+  "atom-complex, atom-all-eu")
+
+(define_insn_reservation  "atom_incdec" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "incdec")
+            (eq_attr "memory" "none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_incdec_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "incdec")
+            (eq_attr "memory" "!none")))
+  "atom-simple-either")
+
+;; simple shift instruction use SHIFT eu, none memory
+(define_insn_reservation  "atom_ishift" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ishift")
+            (and (eq_attr "memory" "none") (eq_attr "prefix_0f" "0"))))
+  "atom-simple-0")
+
+;; simple shift instruction use SHIFT eu, memory
+(define_insn_reservation  "atom_ishift_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ishift")
+            (and (eq_attr "memory" "!none") (eq_attr "prefix_0f" "0"))))
+  "atom-simple-0")
+
+;; DF shift (prefixed with 0f) is complex insn with latency of 7 cycles
+(define_insn_reservation  "atom_ishift_3" 7
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ishift")
+            (eq_attr "prefix_0f" "1")))
+  "atom-complex, atom-all-eu*6")
+
+(define_insn_reservation  "atom_ishift1" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ishift1")
+            (eq_attr "memory" "none")))
+  "atom-simple-0")
+
+(define_insn_reservation  "atom_ishift1_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ishift1")
+            (eq_attr "memory" "!none")))
+  "atom-simple-0")
+
+(define_insn_reservation  "atom_rotate" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "rotate")
+            (eq_attr "memory" "none")))
+  "atom-simple-0")
+
+(define_insn_reservation  "atom_rotate_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "rotate")
+            (eq_attr "memory" "!none")))
+  "atom-simple-0")
+
+(define_insn_reservation  "atom_rotate1" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "rotate1")
+            (eq_attr "memory" "none")))
+  "atom-simple-0")
+
+(define_insn_reservation  "atom_rotate1_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "rotate1")
+            (eq_attr "memory" "!none")))
+  "atom-simple-0")
+
+(define_insn_reservation  "atom_imul" 5
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "imul")
+            (and (eq_attr "memory" "none") (eq_attr "mode" "SI"))))
+  "atom-imul-32")
+
+(define_insn_reservation  "atom_imul_mem" 5
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "imul")
+            (and (eq_attr "memory" "!none") (eq_attr "mode" "SI"))))
+  "atom-imul-32")
+
+;; latency set to 10 as common 64x64 imul
+(define_insn_reservation  "atom_imul_3" 10
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "imul")
+            (eq_attr "mode" "!SI")))
+  "atom-complex, atom-all-eu*9")
+
+(define_insn_reservation  "atom_idiv" 65
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "idiv"))
+  "atom-complex, atom-all-eu*32, nothing*32")
+
+(define_insn_reservation  "atom_icmp" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "icmp")
+            (eq_attr "memory" "none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_icmp_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "icmp")
+            (eq_attr "memory" "!none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_test" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "test")
+            (eq_attr "memory" "none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_test_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "test")
+            (eq_attr "memory" "!none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_ibr" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ibr")
+            (eq_attr "memory" "!load")))
+  "atom-simple-1")
+
+;; complex if jump target is from address
+(define_insn_reservation  "atom_ibr_2" 2
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ibr")
+            (eq_attr "memory" "load")))
+  "atom-complex, atom-all-eu")
+
+(define_insn_reservation  "atom_setcc" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "setcc")
+            (eq_attr "memory" "!store")))
+  "atom-simple-either")
+
+;; 2 cycles complex if target is in memory
+(define_insn_reservation  "atom_setcc_2" 2
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "setcc")
+            (eq_attr "memory" "store")))
+  "atom-complex, atom-all-eu")
+
+(define_insn_reservation  "atom_icmov" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "icmov")
+            (eq_attr "memory" "none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_icmov_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "icmov")
+            (eq_attr "memory" "!none")))
+  "atom-simple-either")
+
+;; UCODE if segreg, ignored
+(define_insn_reservation  "atom_push" 2
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "push"))
+  "atom-dual-2c")
+
+;; pop r64 is 1 cycle. UCODE if segreg, ignored
+(define_insn_reservation  "atom_pop" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "pop")
+            (eq_attr "mode" "DI")))
+  "atom-dual-1c")
+
+;; pop non-r64 is 2 cycles. UCODE if segreg, ignored
+(define_insn_reservation  "atom_pop_2" 2
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "pop")
+            (eq_attr "mode" "!DI")))
+  "atom-dual-2c")
+
+;; UCODE if segreg, ignored
+(define_insn_reservation  "atom_call" 1
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "call"))
+  "atom-dual-1c")
+
+(define_insn_reservation  "atom_callv" 1
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "callv"))
+  "atom-dual-1c")
+
+(define_insn_reservation  "atom_leave" 3
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "leave"))
+  "atom-complex, atom-all-eu*2")
+
+(define_insn_reservation  "atom_str" 3
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "str"))
+  "atom-complex, atom-all-eu*2")
+
+(define_insn_reservation  "atom_sselog" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sselog,sseshuf")
+            (eq_attr "memory" "none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_sselog_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sselog,sseshuf")
+            (eq_attr "memory" "!none")))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_sselog1" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sselog1,sseshuf1")
+            (eq_attr "memory" "none")))
+  "atom-simple-0")
+
+(define_insn_reservation  "atom_sselog1_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sselog1,sseshuf1")
+            (eq_attr "memory" "!none")))
+  "atom-simple-0")
+
+;; not pmad, not psad
+(define_insn_reservation  "atom_sseiadd" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseiadd")
+            (and (not (match_operand:V2DI 0 "register_operand"))
+                 (and (eq_attr "atom_unit" "!simul")
+                      (eq_attr "atom_unit" "!complex")))))
+  "atom-simple-either")
+
+;; pmad, psad and 64
+(define_insn_reservation  "atom_sseiadd_2" 4
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseiadd")
+            (and (not (match_operand:V2DI 0 "register_operand"))
+                 (and (eq_attr "atom_unit" "simul" )
+                      (eq_attr "mode" "DI")))))
+  "atom-fmul-4c")
+
+;; pmad, psad and 128
+(define_insn_reservation  "atom_sseiadd_3" 5
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseiadd")
+            (and (not (match_operand:V2DI 0 "register_operand"))
+                 (and (eq_attr "atom_unit" "simul" )
+                      (eq_attr "mode" "TI")))))
+  "atom-fmul-5c")
+
+;; if paddq(64 bit op), phadd/phsub
+(define_insn_reservation  "atom_sseiadd_4" 6
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseiadd")
+            (ior (match_operand:V2DI 0 "register_operand")
+                 (eq_attr "atom_unit" "complex"))))
+  "atom-complex, atom-all-eu*5")
+
+;; if immediate op. 
+(define_insn_reservation  "atom_sseishft" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseishft")
+            (and (eq_attr "atom_unit" "!sishuf")
+                 (match_operand 2 "immediate_operand"))))
+  "atom-simple-either")
+
+;; if palignr or psrldq
+(define_insn_reservation  "atom_sseishft_2" 1
+  (and (eq_attr "cpu" "atom")
+       (ior (eq_attr "type" "sseishft1")
+	    (and (eq_attr "type" "sseishft")
+		 (and (eq_attr "atom_unit" "sishuf")
+		      (match_operand 2 "immediate_operand")))))
+  "atom-simple-0")
+
+;; if reg/mem op
+(define_insn_reservation  "atom_sseishft_3" 2
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseishft")
+            (not (match_operand 2 "immediate_operand"))))
+  "atom-complex, atom-all-eu")
+
+(define_insn_reservation  "atom_sseimul" 1
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "sseimul"))
+  "atom-simple-0")
+
+;; rcpss or rsqrtss
+(define_insn_reservation  "atom_sse" 4
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sse")
+            (and (eq_attr "atom_sse_attr" "rcp") (eq_attr "mode" "SF"))))
+  "atom-fmul-4c")
+
+;; movshdup, movsldup. Suggest to type sseishft
+(define_insn_reservation  "atom_sse_2" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sse")
+            (eq_attr "atom_sse_attr" "movdup")))
+  "atom-simple-0")
+
+;; lfence
+(define_insn_reservation  "atom_sse_3" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sse")
+            (eq_attr "atom_sse_attr" "lfence")))
+  "atom-simple-either")
+
+;; sfence,clflush,mfence, prefetch
+(define_insn_reservation  "atom_sse_4" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sse")
+            (eq_attr "atom_sse_attr" "fence,prefetch")))
+  "atom-simple-0")
+
+;; rcpps, rsqrtss, sqrt, ldmxcsr
+(define_insn_reservation  "atom_sse_5" 7
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sse")
+            (ior (eq_attr "atom_sse_attr" "sqrt,mxcsr")
+                 (and (eq_attr "atom_sse_attr" "rcp")
+                      (eq_attr "mode" "V4SF")))))
+  "atom-complex, atom-all-eu*6")
+
+;; xmm->xmm
+(define_insn_reservation  "atom_ssemov" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ssemov")
+            (and (match_operand 0 "register_operand" "xy") (match_operand 1 "register_operand" "xy"))))
+  "atom-simple-either")
+
+;; reg->xmm
+(define_insn_reservation  "atom_ssemov_2" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ssemov")
+            (and (match_operand 0 "register_operand" "xy") (match_operand 1 "register_operand" "r"))))
+  "atom-simple-0")
+
+;; xmm->reg
+(define_insn_reservation  "atom_ssemov_3" 3
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ssemov")
+            (and (match_operand 0 "register_operand" "r") (match_operand 1 "register_operand" "xy"))))
+  "atom-eu-0-3-1")
+
+;; mov mem
+(define_insn_reservation  "atom_ssemov_4" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ssemov")
+            (and (eq_attr "movu" "0") (eq_attr "memory" "!none"))))
+  "atom-simple-0")
+
+;; movu mem
+(define_insn_reservation  "atom_ssemov_5" 2
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ssemov")
+            (ior (eq_attr "movu" "1") (eq_attr "memory" "!none"))))
+  "atom-complex, atom-all-eu")
+
+;; no memory simple
+(define_insn_reservation  "atom_sseadd" 5
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseadd,sseadd1")
+            (and (eq_attr "memory" "none")
+                 (and (eq_attr "mode" "!V2DF")
+                      (eq_attr "atom_unit" "!complex")))))
+  "atom-fadd-5c")
+
+;; memory simple
+(define_insn_reservation  "atom_sseadd_mem" 5
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseadd,sseadd1")
+            (and (eq_attr "memory" "!none")
+                 (and (eq_attr "mode" "!V2DF")
+                      (eq_attr "atom_unit" "!complex")))))
+  "atom-dual-5c")
+
+;; maxps, minps, *pd, hadd, hsub
+(define_insn_reservation  "atom_sseadd_3" 8
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseadd,sseadd1")
+            (ior (eq_attr "mode" "V2DF") (eq_attr "atom_unit" "complex"))))
+  "atom-complex, atom-all-eu*7")
+
+;; Except dppd/dpps
+(define_insn_reservation  "atom_ssemul" 5
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ssemul")
+            (eq_attr "mode" "!SF")))
+  "atom-fmul-5c")
+
+;; Except dppd/dpps, 4 cycle if mulss
+(define_insn_reservation  "atom_ssemul_2" 4
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ssemul")
+            (eq_attr "mode" "SF")))
+  "atom-fmul-4c")
+
+(define_insn_reservation  "atom_ssecmp" 1
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "ssecmp"))
+  "atom-simple-either")
+
+(define_insn_reservation  "atom_ssecomi" 10
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "ssecomi"))
+  "atom-complex, atom-all-eu*9")
+
+;; no memory and cvtpi2ps, cvtps2pi, cvttps2pi
+(define_insn_reservation  "atom_ssecvt" 5
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ssecvt")
+            (ior (and (match_operand:V2SI 0 "register_operand")
+                      (match_operand:V4SF 1 "register_operand"))
+                 (and (match_operand:V4SF 0 "register_operand")
+                      (match_operand:V2SI 1 "register_operand")))))
+  "atom-fadd-5c")
+
+;; memory and cvtpi2ps, cvtps2pi, cvttps2pi
+(define_insn_reservation  "atom_ssecvt_2" 5
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ssecvt")
+            (ior (and (match_operand:V2SI 0 "register_operand")
+                      (match_operand:V4SF 1 "memory_operand"))
+                 (and (match_operand:V4SF 0 "register_operand")
+                      (match_operand:V2SI 1 "memory_operand")))))
+  "atom-dual-5c")
+
+;; otherwise. 7 cycles average for cvtss2sd
+(define_insn_reservation  "atom_ssecvt_3" 7
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "ssecvt")
+            (not (ior (and (match_operand:V2SI 0 "register_operand")
+                           (match_operand:V4SF 1 "nonimmediate_operand"))
+                      (and (match_operand:V4SF 0 "register_operand")
+                           (match_operand:V2SI 1 "nonimmediate_operand"))))))
+  "atom-complex, atom-all-eu*6")
+
+;; memory and cvtsi2sd
+(define_insn_reservation  "atom_sseicvt" 5
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseicvt")
+            (and (match_operand:V2DF 0 "register_operand")
+                 (match_operand:SI 1 "memory_operand"))))
+  "atom-dual-5c")
+
+;; otherwise. 8 cycles average for cvtsd2si
+(define_insn_reservation  "atom_sseicvt_2" 8
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "sseicvt")
+            (not (and (match_operand:V2DF 0 "register_operand")
+                      (match_operand:SI 1 "memory_operand")))))
+  "atom-complex, atom-all-eu*7")
+
+(define_insn_reservation  "atom_ssediv" 62
+  (and (eq_attr "cpu" "atom")
+       (eq_attr "type" "ssediv"))
+  "atom-complex, atom-all-eu*12, nothing*49")
+
+;; simple for fmov
+(define_insn_reservation  "atom_fmov" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "fmov")
+            (eq_attr "memory" "none")))
+  "atom-simple-either")
+
+;; simple for fmov
+(define_insn_reservation  "atom_fmov_mem" 1
+  (and (eq_attr "cpu" "atom")
+       (and (eq_attr "type" "fmov")
+            (eq_attr "memory" "!none")))
+  "atom-simple-either")
+
+;; Define bypass here
+
+;; There will be no stall from lea to non-mem EX insns
+(define_bypass 0 "atom_lea"
+                 "atom_alu_carry,
+                  atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx,
+                  atom_incdec, atom_setcc, atom_icmov, atom_pop")
+
+(define_bypass 0 "atom_lea"
+                 "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+                  atom_imovx_mem, atom_imovx_2_mem,
+                  atom_imov_mem, atom_icmov_mem, atom_fmov_mem"
+                 "!ix86_agi_dependent")
+
+;; There will be 3 cycles stall from EX insns to AGAN insns LEA
+(define_bypass 4 "atom_alu_carry,
+                  atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx,
+                  atom_incdec,atom_ishift,atom_ishift1,atom_rotate,
+                  atom_rotate1, atom_setcc, atom_icmov, atom_pop,
+                  atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+                  atom_imovx_mem, atom_imovx_2_mem,
+                  atom_imov_mem, atom_icmov_mem, atom_fmov_mem"
+                 "atom_lea")
+
+;; There will be 3 cycles stall from EX insns to insns need addr calculation
+(define_bypass 4 "atom_alu_carry,
+                  atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx,
+                  atom_incdec,atom_ishift,atom_ishift1,atom_rotate,
+                  atom_rotate1, atom_setcc, atom_icmov, atom_pop,
+                  atom_imovx_mem, atom_imovx_2_mem,
+                  atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+                  atom_imov_mem, atom_icmov_mem, atom_fmov_mem"
+                 "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+                  atom_negnot_mem, atom_imov_mem, atom_incdec_mem,
+                  atom_imovx_mem, atom_imovx_2_mem,
+                  atom_imul_mem, atom_icmp_mem,
+                  atom_test_mem, atom_icmov_mem, atom_sselog_mem,
+                  atom_sselog1_mem, atom_fmov_mem, atom_sseadd_mem,
+                  atom_ishift_mem, atom_ishift1_mem, 
+                  atom_rotate_mem, atom_rotate1_mem"
+                  "ix86_agi_dependent")
+
+;; Stall from imul to lea is 8 cycles.
+(define_bypass 9 "atom_imul, atom_imul_mem" "atom_lea")
+
+;; Stall from imul to memory address is 8 cycles.
+(define_bypass 9 "atom_imul, atom_imul_mem" 
+                 "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+                  atom_negnot_mem, atom_imov_mem, atom_incdec_mem,
+                  atom_ishift_mem, atom_ishift1_mem, atom_rotate_mem,
+                  atom_rotate1_mem, atom_imul_mem, atom_icmp_mem,
+                  atom_test_mem, atom_icmov_mem, atom_sselog_mem,
+                  atom_sselog1_mem, atom_fmov_mem, atom_sseadd_mem"
+                  "ix86_agi_dependent")
+
+;; There will be 0 cycle stall from cmp/test to jcc
+
+;; There will be 1 cycle stall from flag producer to cmov and adc/sbb
+(define_bypass 2 "atom_icmp, atom_test, atom_alu, atom_alu_carry,
+                  atom_alu1, atom_negnot, atom_incdec, atom_ishift,
+                  atom_ishift1, atom_rotate, atom_rotate1"
+                 "atom_icmov, atom_alu_carry")
+
+;; lea to shift count stall is 2 cycles
+(define_bypass 3 "atom_lea"
+                 "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1,
+                  atom_ishift_mem, atom_ishift1_mem, 
+                  atom_rotate_mem, atom_rotate1_mem"
+                 "ix86_dep_by_shift_count")
+
+;; lea to shift source stall is 1 cycle
+(define_bypass 2 "atom_lea"
+                 "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1"
+                 "!ix86_dep_by_shift_count")
+
+;; non-lea to shift count stall is 1 cycle
+(define_bypass 2 "atom_alu_carry,
+                  atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx,
+                  atom_incdec,atom_ishift,atom_ishift1,atom_rotate,
+                  atom_rotate1, atom_setcc, atom_icmov, atom_pop,
+                  atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem,
+                  atom_imovx_mem, atom_imovx_2_mem,
+                  atom_imov_mem, atom_icmov_mem, atom_fmov_mem"
+                 "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1,
+                  atom_ishift_mem, atom_ishift1_mem, 
+                  atom_rotate_mem, atom_rotate1_mem"
+                 "ix86_dep_by_shift_count")
diff --git a/gcc-4.9/gcc/config/i386/att.h b/gcc-4.9/gcc/config/i386/att.h
new file mode 100644
index 000000000..f559a83bb
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/att.h
@@ -0,0 +1,91 @@
+/* Definitions for AT&T assembler syntax for the Intel 80386.
+   Copyright (C) 1988-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+/* Define the syntax of instructions and addresses.  */
+
+/* Prefix for internally generated assembler labels.  */
+#define LPREFIX ".L"
+
+/* Assembler pseudos to introduce constants of various size.  */
+
+#define ASM_BYTE "\t.byte\t"
+#define ASM_SHORT "\t.value\t"
+#define ASM_LONG "\t.long\t"
+#define ASM_QUAD "\t.quad\t"  /* Should not be used for 32bit compilation.  */
+
+/* How to output an ASCII string constant.  */
+
+#undef ASM_OUTPUT_ASCII
+#define ASM_OUTPUT_ASCII(FILE, PTR, SIZE)			\
+do								\
+{ size_t i = 0, limit = (SIZE); 				\
+  while (i < limit)						\
+    { if (i%10 == 0) { if (i!=0) putc ('\n', (FILE));		\
+		       fputs (ASM_BYTE, (FILE)); }		\
+      else putc (',', (FILE));					\
+      fprintf ((FILE), "0x%x", ((PTR)[i++] & 0377)) ;}		\
+      putc ('\n', (FILE));					\
+} while (0)
+
+/* Output at beginning of assembler file.  */
+#define TARGET_ASM_FILE_START_FILE_DIRECTIVE true
+
+/* This is how to output an assembler line
+   that says to advance the location counter
+   to a multiple of 2**LOG bytes.  */
+
+#define ASM_OUTPUT_ALIGN(FILE,LOG)	\
+    if ((LOG)!=0) fprintf ((FILE), "\t.align %d\n", 1<<(LOG))
+
+/* This is how to output an assembler line
+   that says to advance the location counter by SIZE bytes.  */
+
+#undef ASM_OUTPUT_SKIP
+#define ASM_OUTPUT_SKIP(FILE,SIZE)  \
+  fprintf ((FILE), "\t.set .,.+%u\n", (int)(SIZE))
+
+/* Can't use ASM_OUTPUT_SKIP in text section; it doesn't leave 0s.  */
+
+#define ASM_NO_SKIP_IN_TEXT 1
+
+/* Define the syntax of labels and symbol definitions/declarations.  */
+
+/* The prefix to add for compiler private assembler symbols.  */
+#undef LOCAL_LABEL_PREFIX
+#define LOCAL_LABEL_PREFIX "."
+
+/* This is how to store into the string BUF
+   the symbol_ref name of an internal numbered label where
+   PREFIX is the class of label and NUM is the number within the class.
+   This is suitable for output with `assemble_name'.  */
+
+#undef ASM_GENERATE_INTERNAL_LABEL
+#define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER)	\
+  sprintf ((BUF), LOCAL_LABEL_PREFIX "%s%ld", (PREFIX), (long)(NUMBER))
+
+/* The prefix to add to user-visible assembler symbols.  */
+
+#undef USER_LABEL_PREFIX
+#define USER_LABEL_PREFIX ""
diff --git a/gcc-4.9/gcc/config/i386/avx2intrin.h b/gcc-4.9/gcc/config/i386/avx2intrin.h
new file mode 100644
index 000000000..d04c972ed
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/avx2intrin.h
@@ -0,0 +1,1889 @@
+/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX2INTRIN_H_INCLUDED
+#define _AVX2INTRIN_H_INCLUDED
+
+#ifndef __AVX2__
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#define __DISABLE_AVX2__
+#endif /* __AVX2__ */
+
+/* Sum absolute 8-bit integer difference of adjacent groups of 4
+   byte integers in the first 2 operands.  Starting offsets within
+   operands are determined by the 3rd mask operand.  */
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
+					      (__v32qi)__Y, __M);
+}
+#else
+#define _mm256_mpsadbw_epu8(X, Y, M)					\
+  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
+					(__v32qi)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi8 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi16 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi32 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packs_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packs_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packus_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packus_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
+{
+  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
+					      (__v4di)__B,
+					      __N * 8);
+}
+#else
+/* In that case (__N*8) will be in vreg, and insn will not be matched. */
+/* Use define instead */
+#define _mm256_alignr_epi8(A, B, N)				   \
+  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),	   \
+					(__v4di)(__m256i)(B),	   \
+					(int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avg_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avg_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
+{
+  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
+					       (__v32qi)__Y,
+					       (__v32qi)__M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
+					      (__v16hi)__Y,
+					       __M);
+}
+#else
+#define _mm256_blend_epi16(X, Y, M)					\
+  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
+					(__v16hi)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
+					     (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
+					     (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
+					     (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
+					     (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
+					      (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
+					     (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
+					      (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
+						(__v32qi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_madd_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
+					     (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_epi8 (__m256i __A)
+{
+  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
+					       (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sad_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
+					     (__v32qi)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_epi32 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
+}
+#else
+#define _mm256_shuffle_epi32(A, N) \
+  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
+#define _mm256_shufflehi_epi16(A, N) \
+  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
+#define _mm256_shufflelo_epi16(A, N) \
+  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_si256 (__m256i __A, const int __N)
+{
+  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
+}
+#else
+#define _mm256_slli_si256(A, N) \
+  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi64 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi64 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_si256 (__m256i __A, const int __N)
+{
+  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
+}
+#else
+#define _mm256_srli_si256(A, N) \
+  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi64 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi64 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_load_si256 (__m256i const *__X)
+{
+  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastss_ps (__m128 __X)
+{
+  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastss_ps (__m128 __X)
+{
+  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastsd_pd (__m128d __X)
+{
+  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastsi128_si256 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
+					      (__v4si)__Y,
+					      __M);
+}
+#else
+#define _mm_blend_epi32(X, Y, M)					\
+  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
+					(__v4si)(__m128i)(Y), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
+					      (__v8si)__Y,
+					      __M);
+}
+#else
+#define _mm256_blend_epi32(X, Y, M)					\
+  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
+					(__v8si)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastb_epi8 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastw_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastd_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastq_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastb_epi8 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastw_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastd_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastq_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_pd (__m256d __X, const int __M)
+{
+  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
+}
+#else
+#define _mm256_permute4x64_pd(X, M)			       \
+  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
+#endif
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_epi64 (__m256i __X, const int __M)
+{
+  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_permute4x64_epi64(X, M)			       \
+  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
+
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
+}
+#else
+#define _mm256_permute2x128_si256(X, Y, M)				\
+  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti128_si256 (__m256i __X, const int __M)
+{
+  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_extracti128_si256(X, M)				\
+  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
+}
+#else
+#define _mm256_inserti128_si256(X, Y, M)			 \
+  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
+					   (__v2di)(__m128i)(Y), \
+					   (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi32 (int const *__X, __m256i __M )
+{
+  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
+						(__v8si)__M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi64 (long long const *__X, __m256i __M )
+{
+  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
+						(__v4di)__M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi32 (int const *__X, __m128i __M )
+{
+  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
+					     (__v4si)__M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi64 (long long const *__X, __m128i __M )
+{
+  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
+					     (__v2di)__M);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
+{
+  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
+{
+  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
+{
+  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
+{
+  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srav_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_pd (double const *base, __m128i index, const int scale)
+{
+  __v2df zero = _mm_setzero_pd ();
+  __v2df mask = _mm_cmpeq_pd (zero, zero);
+
+  return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
+						base,
+						(__v4si)index,
+						mask,
+						scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
+		       __m128d mask, const int scale)
+{
+  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
+						base,
+						(__v4si)index,
+						(__v2df)mask,
+						scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
+{
+  __v4df zero = _mm256_setzero_pd ();
+  __v4df mask = _mm256_cmp_pd (zero, zero, _CMP_EQ_OQ);
+
+  return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
+						base,
+						(__v4si)index,
+						mask,
+						scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_pd (__m256d src, double const *base,
+			  __m128i index, __m256d mask, const int scale)
+{
+  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
+						base,
+						(__v4si)index,
+						(__v4df)mask,
+						scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_pd (double const *base, __m128i index, const int scale)
+{
+  __v2df src = _mm_setzero_pd ();
+  __v2df mask = _mm_cmpeq_pd (src, src);
+
+  return (__m128d) __builtin_ia32_gatherdiv2df (src,
+						base,
+						(__v2di)index,
+						mask,
+						scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
+		       __m128d mask, const int scale)
+{
+  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
+						base,
+						(__v2di)index,
+						(__v2df)mask,
+						scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
+{
+  __v4df src = _mm256_setzero_pd ();
+  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
+
+  return (__m256d) __builtin_ia32_gatherdiv4df (src,
+						base,
+						(__v4di)index,
+						mask,
+						scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_pd (__m256d src, double const *base,
+			  __m256i index, __m256d mask, const int scale)
+{
+  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
+						base,
+						(__v4di)index,
+						(__v4df)mask,
+						scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_ps (float const *base, __m128i index, const int scale)
+{
+  __v4sf src = _mm_setzero_ps ();
+  __v4sf mask = _mm_cmpeq_ps (src, src);
+
+  return (__m128) __builtin_ia32_gathersiv4sf (src,
+					       base,
+					       (__v4si)index,
+					       mask,
+					       scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
+		       __m128 mask, const int scale)
+{
+  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
+					       base,
+					       (__v4si)index,
+					       (__v4sf)mask,
+					       scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
+{
+  __v8sf src = _mm256_setzero_ps ();
+  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
+
+  return (__m256) __builtin_ia32_gathersiv8sf (src,
+					       base,
+					       (__v8si)index,
+					       mask,
+					       scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_ps (__m256 src, float const *base,
+			  __m256i index, __m256 mask, const int scale)
+{
+  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
+					       base,
+					       (__v8si)index,
+					       (__v8sf)mask,
+					       scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_ps (float const *base, __m128i index, const int scale)
+{
+  __v4sf src = _mm_setzero_ps ();
+  __v4sf mask = _mm_cmpeq_ps (src, src);
+
+  return (__m128) __builtin_ia32_gatherdiv4sf (src,
+					       base,
+					       (__v2di)index,
+					       mask,
+					       scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
+		       __m128 mask, const int scale)
+{
+  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
+						base,
+						(__v2di)index,
+						(__v4sf)mask,
+						scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
+{
+  __v4sf src = _mm_setzero_ps ();
+  __v4sf mask = _mm_cmpeq_ps (src, src);
+
+  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
+						  base,
+						  (__v4di)index,
+						  mask,
+						  scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_ps (__m128 src, float const *base,
+			  __m256i index, __m128 mask, const int scale)
+{
+  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
+						  base,
+						  (__v4di)index,
+						  (__v4sf)mask,
+						  scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi64 (long long int const *base,
+		     __m128i index, const int scale)
+{
+  __v2di src = __extension__ (__v2di){ 0, 0 };
+  __v2di mask = __extension__ (__v2di){ ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gathersiv2di (src,
+						base,
+						(__v4si)index,
+						mask,
+						scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
+			  __m128i index, __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
+						base,
+						(__v4si)index,
+						(__v2di)mask,
+						scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi64 (long long int const *base,
+			__m128i index, const int scale)
+{
+  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
+  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gathersiv4di (src,
+						base,
+						(__v4si)index,
+						mask,
+						scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
+			     __m128i index, __m256i mask, const int scale)
+{
+  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
+						base,
+						(__v4si)index,
+						(__v4di)mask,
+						scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi64 (long long int const *base,
+		     __m128i index, const int scale)
+{
+  __v2di src = __extension__ (__v2di){ 0, 0 };
+  __v2di mask = __extension__ (__v2di){ ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv2di (src,
+						base,
+						(__v2di)index,
+						mask,
+						scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
+			  __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
+						base,
+						(__v2di)index,
+						(__v2di)mask,
+						scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi64 (long long int const *base,
+			__m256i index, const int scale)
+{
+  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
+  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gatherdiv4di (src,
+						base,
+						(__v4di)index,
+						mask,
+						scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
+			     __m256i index, __m256i mask, const int scale)
+{
+  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
+						base,
+						(__v4di)index,
+						(__v4di)mask,
+						scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
+{
+  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gathersiv4si (src,
+					       base,
+					       (__v4si)index,
+					       mask,
+					       scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
+			  __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
+						base,
+						(__v4si)index,
+						(__v4si)mask,
+						scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
+{
+  __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
+  __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gathersiv8si (src,
+						base,
+						(__v8si)index,
+						mask,
+						scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi32 (__m256i src, int const *base,
+			     __m256i index, __m256i mask, const int scale)
+{
+  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
+						base,
+						(__v8si)index,
+						(__v8si)mask,
+						scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
+{
+  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv4si (src,
+						base,
+						(__v2di)index,
+						mask,
+						scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
+			  __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
+						base,
+						(__v2di)index,
+						(__v4si)mask,
+						scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
+{
+  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
+						  base,
+						  (__v4di)index,
+						  mask,
+						  scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi32 (__m128i src, int const *base,
+			     __m256i index, __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
+						   base,
+						   (__v4di)index,
+						   (__v4si)mask,
+						   scale);
+}
+#else /* __OPTIMIZE__ */
+#define _mm_i32gather_pd(BASE, INDEX, SCALE)				\
+  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),	\
+					 (double const *)BASE,		\
+					 (__v4si)(__m128i)INDEX,	\
+					 (__v2df)_mm_set1_pd(		\
+					   (double)(long long int) -1), \
+					 (int)SCALE)
+
+#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
+  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,	 \
+					 (double const *)BASE,	 \
+					 (__v4si)(__m128i)INDEX, \
+					 (__v2df)(__m128d)MASK,	 \
+					 (int)SCALE)
+
+#define _mm256_i32gather_pd(BASE, INDEX, SCALE)				\
+  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),	\
+					 (double const *)BASE,		\
+					 (__v4si)(__m128i)INDEX,	\
+					 (__v4df)_mm256_set1_pd(	\
+					   (double)(long long int) -1), \
+					 (int)SCALE)
+
+#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
+  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,	 \
+					 (double const *)BASE,	 \
+					 (__v4si)(__m128i)INDEX, \
+					 (__v4df)(__m256d)MASK,	 \
+					 (int)SCALE)
+
+#define _mm_i64gather_pd(BASE, INDEX, SCALE)				\
+  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),	\
+					 (double const *)BASE,		\
+					 (__v2di)(__m128i)INDEX,	\
+					 (__v2df)_mm_set1_pd(		\
+					   (double)(long long int) -1), \
+					 (int)SCALE)
+
+#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
+  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,	 \
+					 (double const *)BASE,	 \
+					 (__v2di)(__m128i)INDEX, \
+					 (__v2df)(__m128d)MASK,	 \
+					 (int)SCALE)
+
+#define _mm256_i64gather_pd(BASE, INDEX, SCALE)				\
+  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),	\
+					 (double const *)BASE,		\
+					 (__v4di)(__m256i)INDEX,	\
+					 (__v4df)_mm256_set1_pd(	\
+					   (double)(long long int) -1), \
+					 (int)SCALE)
+
+#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
+  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,	 \
+					 (double const *)BASE,	 \
+					 (__v4di)(__m256i)INDEX, \
+					 (__v4df)(__m256d)MASK,	 \
+					 (int)SCALE)
+
+#define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
+  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
+					(float const *)BASE,		\
+					(__v4si)(__m128i)INDEX,		\
+					_mm_set1_ps ((float)(int) -1),	\
+					(int)SCALE)
+
+#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
+  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC,	 \
+					(float const *)BASE,	 \
+					(__v4si)(__m128i)INDEX,	 \
+					(__v4sf)(__m128d)MASK,	 \
+					(int)SCALE)
+
+#define _mm256_i32gather_ps(BASE, INDEX, SCALE)			       \
+  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
+					(float const *)BASE,	       \
+					(__v8si)(__m256i)INDEX,	       \
+					(__v8sf)_mm256_set1_ps (       \
+					  (float)(int) -1),	       \
+					(int)SCALE)
+
+#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,	\
+					(float const *)BASE,	\
+					(__v8si)(__m256i)INDEX, \
+					(__v8sf)(__m256d)MASK,	\
+					(int)SCALE)
+
+#define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
+  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (),	\
+					(float const *)BASE,		\
+					(__v2di)(__m128i)INDEX,		\
+					(__v4sf)_mm_set1_ps (		\
+					  (float)(int) -1),		\
+					(int)SCALE)
+
+#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
+  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,	 \
+					(float const *)BASE,	 \
+					(__v2di)(__m128i)INDEX,	 \
+					(__v4sf)(__m128d)MASK,	 \
+					(int)SCALE)
+
+#define _mm256_i64gather_ps(BASE, INDEX, SCALE)				\
+  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	\
+					   (float const *)BASE,		\
+					   (__v4di)(__m256i)INDEX,	\
+					   (__v4sf)_mm_set1_ps(		\
+					     (float)(int) -1),		\
+					   (int)SCALE)
+
+#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	   \
+  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,	   \
+					   (float const *)BASE,	   \
+					   (__v4di)(__m256i)INDEX, \
+					   (__v4sf)(__m128)MASK,   \
+					   (int)SCALE)
+
+#define _mm_i32gather_epi64(BASE, INDEX, SCALE)				\
+  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
+					 (long long const *)BASE,	\
+					 (__v4si)(__m128i)INDEX,	\
+					 (__v2di)_mm_set1_epi64x (-1),	\
+					 (int)SCALE)
+
+#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
+  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC,	  \
+					 (long long const *)BASE, \
+					 (__v4si)(__m128i)INDEX,  \
+					 (__v2di)(__m128i)MASK,	  \
+					 (int)SCALE)
+
+#define _mm256_i32gather_epi64(BASE, INDEX, SCALE)			   \
+  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
+					 (long long const *)BASE,	   \
+					 (__v4si)(__m128i)INDEX,	   \
+					 (__v4di)_mm256_set1_epi64x (-1),  \
+					 (int)SCALE)
+
+#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC,	   \
+					 (long long const *)BASE,  \
+					 (__v4si)(__m128i)INDEX,   \
+					 (__v4di)(__m256i)MASK,	   \
+					 (int)SCALE)
+
+#define _mm_i64gather_epi64(BASE, INDEX, SCALE)				\
+  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
+					 (long long const *)BASE,	\
+					 (__v2di)(__m128i)INDEX,	\
+					 (__v2di)_mm_set1_epi64x (-1),	\
+					 (int)SCALE)
+
+#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
+  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC,	  \
+					 (long long const *)BASE, \
+					 (__v2di)(__m128i)INDEX,  \
+					 (__v2di)(__m128i)MASK,	  \
+					 (int)SCALE)
+
+#define _mm256_i64gather_epi64(BASE, INDEX, SCALE)			   \
+  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
+					 (long long const *)BASE,	   \
+					 (__v4di)(__m256i)INDEX,	   \
+					 (__v4di)_mm256_set1_epi64x (-1),  \
+					 (int)SCALE)
+
+#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC,	   \
+					 (long long const *)BASE,  \
+					 (__v4di)(__m256i)INDEX,   \
+					 (__v4di)(__m256i)MASK,	   \
+					 (int)SCALE)
+
+#define _mm_i32gather_epi32(BASE, INDEX, SCALE)				\
+  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (),	\
+					 (int const *)BASE,		\
+					 (__v4si)(__m128i)INDEX,	\
+					 (__v4si)_mm_set1_epi32 (-1),	\
+					 (int)SCALE)
+
+#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC,	\
+					(int const *)BASE,	\
+					(__v4si)(__m128i)INDEX, \
+					(__v4si)(__m128i)MASK,	\
+					(int)SCALE)
+
+#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)			   \
+  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
+					 (int const *)BASE,		   \
+					 (__v8si)(__m256i)INDEX,	   \
+					 (__v8si)_mm256_set1_epi32 (-1),   \
+					 (int)SCALE)
+
+#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC,	   \
+					(int const *)BASE,	   \
+					(__v8si)(__m256i)INDEX,	   \
+					(__v8si)(__m256i)MASK,	   \
+					(int)SCALE)
+
+#define _mm_i64gather_epi32(BASE, INDEX, SCALE)				\
+  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (),	\
+					 (int const *)BASE,		\
+					 (__v2di)(__m128i)INDEX,	\
+					 (__v4si)_mm_set1_epi32 (-1),	\
+					 (int)SCALE)
+
+#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC,	\
+					(int const *)BASE,	\
+					(__v2di)(__m128i)INDEX, \
+					(__v4si)(__m128i)MASK,	\
+					(int)SCALE)
+
+#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)			   \
+  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
+					    (int const *)BASE,		   \
+					    (__v4di)(__m256i)INDEX,	   \
+					    (__v4si)_mm_set1_epi32(-1),	   \
+					    (int)SCALE)
+
+#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC,  \
+					   (int const *)BASE,	   \
+					   (__v4di)(__m256i)INDEX, \
+					   (__v4si)(__m128i)MASK,  \
+					   (int)SCALE)
+#endif  /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_AVX2__
+#undef __DISABLE_AVX2__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX2__ */
+
+#endif /* _AVX2INTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/avx512cdintrin.h b/gcc-4.9/gcc/config/i386/avx512cdintrin.h
new file mode 100644
index 000000000..a4939f7ac
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/avx512cdintrin.h
@@ -0,0 +1,184 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512CDINTRIN_H_INCLUDED
+#define _AVX512CDINTRIN_H_INCLUDED
+
+#ifndef __AVX512CD__
+#pragma GCC push_options
+#pragma GCC target("avx512cd")
+#define __DISABLE_AVX512CD__
+#endif /* __AVX512CD__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+typedef unsigned char  __mmask8;
+typedef unsigned short __mmask16;
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_conflict_epi32 (__m512i __A)
+{
+  return (__m512i)
+	 __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+					       (__v16si) _mm512_setzero_si512 (),
+					       (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+							 (__v16si) __W,
+							 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i)
+	 __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+					       (__v16si) _mm512_setzero_si512 (),
+					       (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_conflict_epi64 (__m512i __A)
+{
+  return (__m512i)
+	 __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+					       (__v8di) _mm512_setzero_si512 (),
+					       (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+							 (__v8di) __W,
+							 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i)
+	 __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+					       (__v8di) _mm512_setzero_si512 (),
+					       (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+  return (__m512i)
+	 __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+					   (__v8di) _mm512_setzero_si512 (),
+					   (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+						     (__v8di) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i)
+	 __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+					   (__v8di) _mm512_setzero_si512 (),
+					   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+  return (__m512i)
+	 __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+					   (__v16si) _mm512_setzero_si512 (),
+					   (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+						     (__v16si) __W,
+						     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i)
+	 __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+					   (__v16si) _mm512_setzero_si512 (),
+					   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmb512 (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmw512 (__A);
+}
+
+#ifdef __DISABLE_AVX512CD__
+#undef __DISABLE_AVX512CD__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512CD__ */
+
+#endif /* _AVX512CDINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/avx512erintrin.h b/gcc-4.9/gcc/config/i386/avx512erintrin.h
new file mode 100644
index 000000000..f6870a5f7
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/avx512erintrin.h
@@ -0,0 +1,394 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512ERINTRIN_H_INCLUDED
+#define _AVX512ERINTRIN_H_INCLUDED
+
+#ifndef __AVX512ER__
+#pragma GCC push_options
+#pragma GCC target("avx512er")
+#define __DISABLE_AVX512ER__
+#endif /* __AVX512ER__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef double __v8df __attribute__ ((__vector_size__ (64)));
+typedef float __v16sf __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+typedef unsigned char  __mmask8;
+typedef unsigned short __mmask16;
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_exp2a23_round_pd (__m512d __A, int __R)
+{
+  __m512d __W;
+  return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
+					       (__v8df) __W,
+					       (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_exp2a23_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
+					       (__v8df) __W,
+					       (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_exp2a23_round_pd (__mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
+					       (__v8df) _mm512_setzero_pd (),
+					       (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_exp2a23_round_ps (__m512 __A, int __R)
+{
+  __m512 __W;
+  return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
+					      (__v16sf) __W,
+					      (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_exp2a23_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
+					      (__v16sf) __W,
+					      (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_exp2a23_round_ps (__mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
+					      (__v16sf) _mm512_setzero_ps (),
+					      (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp28_round_pd (__m512d __A, int __R)
+{
+  __m512d __W;
+  return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
+						(__v8df) __W,
+						(__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
+						(__v8df) __W,
+						(__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp28_round_pd (__mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
+						(__v8df) _mm512_setzero_pd (),
+						(__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp28_round_ps (__m512 __A, int __R)
+{
+  __m512 __W;
+  return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
+					       (__v16sf) __W,
+					       (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
+					       (__v16sf) __W,
+					       (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp28_round_ps (__mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
+					       (__v16sf) _mm512_setzero_ps (),
+					       (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R)
+{
+  return (__m128d) __builtin_ia32_rcp28sd_round ((__v2df) __B,
+						 (__v2df) __A,
+						 __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R)
+{
+  return (__m128) __builtin_ia32_rcp28ss_round ((__v4sf) __B,
+						(__v4sf) __A,
+						__R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt28_round_pd (__m512d __A, int __R)
+{
+  __m512d __W;
+  return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
+						  (__v8df) __W,
+						  (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
+						  (__v8df) __W,
+						  (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt28_round_pd (__mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
+						  (__v8df) _mm512_setzero_pd (),
+						  (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt28_round_ps (__m512 __A, int __R)
+{
+  __m512 __W;
+  return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
+						 (__v16sf) __W,
+						 (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
+						 (__v16sf) __W,
+						 (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
+						 (__v16sf) _mm512_setzero_ps (),
+						 (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R)
+{
+  return (__m128d) __builtin_ia32_rsqrt28sd_round ((__v2df) __B,
+						   (__v2df) __A,
+						   __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R)
+{
+  return (__m128) __builtin_ia32_rsqrt28ss_round ((__v4sf) __B,
+						  (__v4sf) __A,
+						  __R);
+}
+
+#else
+#define _mm512_exp2a23_round_pd(A, C)            \
+    __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
+
+#define _mm512_mask_exp2a23_round_pd(W, U, A, C) \
+    __builtin_ia32_exp2pd_mask(A, W, U, C)
+
+#define _mm512_maskz_exp2a23_round_pd(U, A, C)   \
+    __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_exp2a23_round_ps(A, C)            \
+    __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
+
+#define _mm512_mask_exp2a23_round_ps(W, U, A, C) \
+    __builtin_ia32_exp2ps_mask(A, W, U, C)
+
+#define _mm512_maskz_exp2a23_round_ps(U, A, C)   \
+    __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_rcp28_round_pd(A, C)            \
+    __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
+
+#define _mm512_mask_rcp28_round_pd(W, U, A, C) \
+    __builtin_ia32_rcp28pd_mask(A, W, U, C)
+
+#define _mm512_maskz_rcp28_round_pd(U, A, C)   \
+    __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_rcp28_round_ps(A, C)            \
+    __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
+
+#define _mm512_mask_rcp28_round_ps(W, U, A, C) \
+    __builtin_ia32_rcp28ps_mask(A, W, U, C)
+
+#define _mm512_maskz_rcp28_round_ps(U, A, C)   \
+    __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_rsqrt28_round_pd(A, C)            \
+    __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
+
+#define _mm512_mask_rsqrt28_round_pd(W, U, A, C) \
+    __builtin_ia32_rsqrt28pd_mask(A, W, U, C)
+
+#define _mm512_maskz_rsqrt28_round_pd(U, A, C)   \
+    __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_rsqrt28_round_ps(A, C)            \
+    __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
+
+#define _mm512_mask_rsqrt28_round_ps(W, U, A, C) \
+    __builtin_ia32_rsqrt28ps_mask(A, W, U, C)
+
+#define _mm512_maskz_rsqrt28_round_ps(U, A, C)   \
+    __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_rcp28_round_sd(A, B, R)	\
+    __builtin_ia32_rcp28sd_round(A, B, R)
+
+#define _mm_rcp28_round_ss(A, B, R)	\
+    __builtin_ia32_rcp28ss_round(A, B, R)
+
+#define _mm_rsqrt28_round_sd(A, B, R)	\
+    __builtin_ia32_rsqrt28sd_round(A, B, R)
+
+#define _mm_rsqrt28_round_ss(A, B, R)	\
+    __builtin_ia32_rsqrt28ss_round(A, B, R)
+
+#endif
+
+#define _mm512_exp2a23_pd(A)                    \
+    _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(W, U, A)   \
+    _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(U, A)     \
+    _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_ps(A)                    \
+    _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(W, U, A)   \
+    _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(U, A)     \
+    _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_pd(A)                    \
+    _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(W, U, A)   \
+    _mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(U, A)     \
+    _mm512_maskz_rcp28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_ps(A)                    \
+    _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(W, U, A)   \
+    _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(U, A)     \
+    _mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_pd(A)                    \
+    _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(W, U, A)   \
+    _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(U, A)     \
+    _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_ps(A)                    \
+    _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(W, U, A)   \
+    _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(U, A)     \
+    _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_sd(A, B)	\
+    __builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_ss(A, B)	\
+    __builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_sd(A, B)	\
+    __builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_ss(A, B)	\
+    __builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#ifdef __DISABLE_AVX512ER__
+#undef __DISABLE_AVX512ER__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512ER__ */
+
+#endif /* _AVX512ERINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/avx512fintrin.h b/gcc-4.9/gcc/config/i386/avx512fintrin.h
new file mode 100644
index 000000000..960286618
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/avx512fintrin.h
@@ -0,0 +1,12832 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512FINTRIN_H_INCLUDED
+#define _AVX512FINTRIN_H_INCLUDED
+
+#ifndef __AVX512F__
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define __DISABLE_AVX512F__
+#endif /* __AVX512F__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef double __v8df __attribute__ ((__vector_size__ (64)));
+typedef float __v16sf __attribute__ ((__vector_size__ (64)));
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+typedef unsigned char  __mmask8;
+typedef unsigned short __mmask16;
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_epi64 (long long __A, long long __B, long long __C,
+		  long long __D, long long __E, long long __F,
+		  long long __G, long long __H)
+{
+  return __extension__ (__m512i) (__v8di)
+	 { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+/* Create the vector [A B C D E F G H I J K L M N O P].  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_epi32 (int __A, int __B, int __C, int __D,
+		  int __E, int __F, int __G, int __H,
+		  int __I, int __J, int __K, int __L,
+		  int __M, int __N, int __O, int __P)
+{
+  return __extension__ (__m512i)(__v16si)
+	 { __P, __O, __N, __M, __L, __K, __J, __I,
+	   __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_pd (double __A, double __B, double __C, double __D,
+	       double __E, double __F, double __G, double __H)
+{
+  return __extension__ (__m512d)
+	 { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_ps (float __A, float __B, float __C, float __D,
+	       float __E, float __F, float __G, float __H,
+	       float __I, float __J, float __K, float __L,
+	       float __M, float __N, float __O, float __P)
+{
+  return __extension__ (__m512)
+	 { __P, __O, __N, __M, __L, __K, __J, __I,
+	   __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7)			      \
+  _mm512_set_epi64(e7,e6,e5,e4,e3,e2,e1,e0)
+
+#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7,			      \
+			  e8,e9,e10,e11,e12,e13,e14,e15)		      \
+  _mm512_set_epi32(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0)
+
+#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7)				      \
+  _mm512_set_pd(e7,e6,e5,e4,e3,e2,e1,e0)
+
+#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
+  _mm512_set_ps(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0)
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_ps (void)
+{
+  __m512 __Y = __Y;
+  return __Y;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_pd (void)
+{
+  __m512d __Y = __Y;
+  return __Y;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_si512 (void)
+{
+  __m512i __Y = __Y;
+  return __Y;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_pd (double __A)
+{
+  return (__m512d) __builtin_ia32_broadcastsd512 (__extension__
+						  (__v2df) { __A, },
+						  (__v8df)
+						  _mm512_undefined_pd (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_ps (float __A)
+{
+  return (__m512) __builtin_ia32_broadcastss512 (__extension__
+						 (__v4sf) { __A, },
+						 (__v16sf)
+						 _mm512_undefined_ps (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_ps (void)
+{
+  return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+				 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_pd (void)
+{
+  return __extension__ (__m512d) { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_si512 (void)
+{
+  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A,
+						  (__v8df) __W,
+						  (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A,
+						  (__v8df)
+						  _mm512_setzero_pd (),
+						  (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A,
+						 (__v16sf) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A,
+						 (__v16sf)
+						 _mm512_setzero_ps (),
+						 (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_pd (void const *__P)
+{
+  return *(__m512d *) __P;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
+						   (__v8df) __W,
+						   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
+						   (__v8df)
+						   _mm512_setzero_pd (),
+						   (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_pd (void *__P, __m512d __A)
+{
+  *(__m512d *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeapd512_mask ((__v8df *) __P, (__v8df) __A,
+				   (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_ps (void const *__P)
+{
+  return *(__m512 *) __P;
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
+						  (__v16sf) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_ps (__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
+						  (__v16sf)
+						  _mm512_setzero_ps (),
+						  (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_ps (void *__P, __m512 __A)
+{
+  *(__m512 *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeaps512_mask ((__v16sf *) __P, (__v16sf) __A,
+				   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A,
+						     (__v8di) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A,
+						     (__v8di)
+						     _mm512_setzero_si512 (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_epi64 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+							(__v8di) __W,
+							(__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+							(__v8di)
+							_mm512_setzero_si512 (),
+							(__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_epi64 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
+					(__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A,
+						     (__v16si) __W,
+						     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_si512 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_epi32 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+							(__v16si) __W,
+							(__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+							(__v16si)
+							_mm512_setzero_si512 (),
+							(__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_si512 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_epi32 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
+					(__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mullo_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sllv_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srav_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srlv_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sllv_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+						 (__v8di) __Y,
+						 (__v8di)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+						 (__v8di) __Y,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+						 (__v8di) __Y,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srav_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+						 (__v8di) __Y,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+						 (__v8di) __Y,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+						 (__v8di) __Y,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+						 (__v8di) __Y,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+						 (__v8di) __Y,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+						 (__v8di) __Y,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si)
+						 _mm512_setzero_si512 (),
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+						  (__v16si) __Y,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si)
+						 _mm512_setzero_si512 (),
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_epu32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+						   (__v16si) __Y,
+						   (__v8di)
+						   _mm512_undefined_si512 (),
+						   (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+						   (__v16si) __Y,
+						   (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+						   (__v16si) __Y,
+						   (__v8di)
+						   _mm512_setzero_si512 (),
+						   __M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_slli_epi64 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_slli_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+			unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
+						  (__v8di) __W,
+						  (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_slli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  (__mmask8) __U);
+}
+#else
+#define _mm512_slli_epi64(X, C)						   \
+  ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_undefined_si512 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_slli_epi64(W, U, X, C)				   \
+  ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_slli_epi64(U, X, C)                                   \
+  ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask8)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sll_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+						 (__v2di) __B,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+						 (__v2di) __B,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+						 (__v2di) __B,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srli_epi64 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srli_epi64 (__m512i __W, __mmask8 __U,
+			__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
+						  (__v8di) __W,
+						  (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  (__mmask8) __U);
+}
+#else
+#define _mm512_srli_epi64(X, C)						   \
+  ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_undefined_si512 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_srli_epi64(W, U, X, C)				   \
+  ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_srli_epi64(U, X, C)                                   \
+  ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask8)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srl_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+						 (__v2di) __B,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+						 (__v2di) __B,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+						 (__v2di) __B,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srai_epi64 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srai_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+			unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
+						  (__v8di) __W,
+						  (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srai_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  (__mmask8) __U);
+}
+#else
+#define _mm512_srai_epi64(X, C)						   \
+  ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_undefined_si512 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_srai_epi64(W, U, X, C)				   \
+  ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_srai_epi64(U, X, C)				   \
+  ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask8)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sra_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+						 (__v2di) __B,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+						 (__v2di) __B,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+						 (__v2di) __B,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_slli_epi32 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_slli_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+			unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_slli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+#else
+#define _mm512_slli_epi32(X, C)						    \
+  ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_undefined_si512 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_slli_epi32(W, U, X, C)                                  \
+  ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_slli_epi32(U, X, C)                                    \
+  ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sll_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+						 (__v4si) __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+						 (__v4si) __B,
+						 (__v16si) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+						 (__v4si) __B,
+						 (__v16si)
+						 _mm512_setzero_si512 (),
+						 (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srli_epi32 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srli_epi32 (__m512i __W, __mmask16 __U,
+			__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+#else
+#define _mm512_srli_epi32(X, C)						    \
+  ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_undefined_si512 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_srli_epi32(W, U, X, C)                                  \
+  ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_srli_epi32(U, X, C)				    \
+  ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srl_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+						 (__v4si) __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+						 (__v4si) __B,
+						 (__v16si) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+						 (__v4si) __B,
+						 (__v16si)
+						 _mm512_setzero_si512 (),
+						 (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srai_epi32 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srai_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+			unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srai_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+#else
+#define _mm512_srai_epi32(X, C)						    \
+  ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_undefined_si512 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_srai_epi32(W, U, X, C)				    \
+  ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_srai_epi32(U, X, C)				    \
+  ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sra_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+						 (__v4si) __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+						 (__v4si) __B,
+						 (__v16si) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+						 (__v4si) __B,
+						 (__v16si)
+						 _mm512_setzero_si512 (),
+						 (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+					       (__v2df) __B,
+					       __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+					      (__v4sf) __B,
+					      __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+					       (__v2df) __B,
+					       __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+					      (__v4sf) __B,
+					      __R);
+}
+
+#else
+#define _mm_add_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_addsd_round(A, B, C)
+
+#define _mm_add_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_addss_round(A, B, C)
+
+#define _mm_sub_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_subsd_round(A, B, C)
+
+#define _mm_sub_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_subss_round(A, B, C)
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C, const int imm)
+{
+  return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A,
+						     (__v8di) __B,
+						     (__v8di) __C, imm,
+						     (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B,
+				__m512i __C, const int imm)
+{
+  return (__m512i) __builtin_ia32_pternlogq512_mask ((__v8di) __A,
+						     (__v8di) __B,
+						     (__v8di) __C, imm,
+						     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
+				 __m512i __C, const int imm)
+{
+  return (__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di) __A,
+						      (__v8di) __B,
+						      (__v8di) __C,
+						      imm, (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ternarylogic_epi32 (__m512i __A, __m512i __B, __m512i __C, const int imm)
+{
+  return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A,
+						     (__v16si) __B,
+						     (__v16si) __C,
+						     imm, (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ternarylogic_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+				__m512i __C, const int imm)
+{
+  return (__m512i) __builtin_ia32_pternlogd512_mask ((__v16si) __A,
+						     (__v16si) __B,
+						     (__v16si) __C,
+						     imm, (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ternarylogic_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+				 __m512i __C, const int imm)
+{
+  return (__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si) __A,
+						      (__v16si) __B,
+						      (__v16si) __C,
+						      imm, (__mmask16) __U);
+}
+#else
+#define _mm512_ternarylogic_epi64(A, B, C, I)				\
+  ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A),	\
+    (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)-1))
+#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I)			\
+  ((__m512i) __builtin_ia32_pternlogq512_mask ((__v8di)(__m512i)(A),	\
+    (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U)))
+#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I)			\
+  ((__m512i) __builtin_ia32_pternlogq512_maskz ((__v8di)(__m512i)(A),	\
+    (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), (int)(I), (__mmask8)(U)))
+#define _mm512_ternarylogic_epi32(A, B, C, I)				\
+  ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A),	\
+    (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I),		\
+    (__mmask16)-1))
+#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I)			\
+  ((__m512i) __builtin_ia32_pternlogd512_mask ((__v16si)(__m512i)(A),	\
+    (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I),		\
+    (__mmask16)(U)))
+#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I)			\
+  ((__m512i) __builtin_ia32_pternlogd512_maskz ((__v16si)(__m512i)(A),	\
+    (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), (int)(I),		\
+    (__mmask16)(U)))
+#endif
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp14_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+						   (__v8df)
+						   _mm512_undefined_pd (),
+						   (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+						   (__v8df) __W,
+						   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+						   (__v8df)
+						   _mm512_setzero_pd (),
+						   (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp14_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+						  (__v16sf)
+						  _mm512_undefined_ps (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+						  (__v16sf) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+						  (__v16sf)
+						  _mm512_setzero_ps (),
+						  (__mmask16) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __B,
+					   (__v2df) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __B,
+					  (__v4sf) __A);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt14_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+						     (__v8df)
+						     _mm512_undefined_pd (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+						     (__v8df) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+						     (__v8df)
+						     _mm512_setzero_pd (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt14_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+						    (__v16sf)
+						    _mm512_undefined_ps (),
+						    (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+						    (__v16sf) __W,
+						    (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+						    (__v16sf)
+						    _mm512_setzero_ps (),
+						    (__mmask16) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __B,
+					     (__v2df) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __B,
+					    (__v4sf) __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_round_pd (__m512d __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+						  (__v8df)
+						  _mm512_undefined_pd (),
+						  (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+			   const int __R)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+						  (__v8df) __W,
+						  (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_round_pd (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+						  (__v8df)
+						  _mm512_setzero_pd (),
+						  (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_round_ps (__m512 __A, const int __R)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+						 (__v16sf)
+						 _mm512_undefined_ps (),
+						 (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_round_ps (__m512 __W, __mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+						 (__v16sf) __W,
+						 (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_round_ps (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+						 (__v16sf)
+						 _mm512_setzero_ps (),
+						 (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_sqrtsd_round ((__v2df) __B,
+						(__v2df) __A,
+						__R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_sqrtss_round ((__v4sf) __B,
+					       (__v4sf) __A,
+					       __R);
+}
+#else
+#define _mm512_sqrt_round_pd(A, C)            \
+    (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_sqrt_round_pd(W, U, A, C) \
+    (__m512d)__builtin_ia32_sqrtpd512_mask(A, W, U, C)
+
+#define _mm512_maskz_sqrt_round_pd(U, A, C)   \
+    (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_sqrt_round_ps(A, C)            \
+    (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_sqrt_round_ps(W, U, A, C) \
+    (__m512)__builtin_ia32_sqrtps512_mask(A, W, U, C)
+
+#define _mm512_maskz_sqrt_round_ps(U, A, C)   \
+    (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_sqrt_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_sqrtsd_round(A, B, C)
+
+#define _mm_sqrt_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_sqrtss_round(A, B, C)
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi8_epi32 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+						    (__v16si)
+						    _mm512_undefined_si512 (),
+						    (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+						    (__v16si) __W,
+						    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+						    (__v16si)
+						    _mm512_setzero_si512 (),
+						    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi8_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+						    (__v8di)
+						    _mm512_undefined_si512 (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+						    (__v8di) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+						    (__v8di)
+						    _mm512_setzero_si512 (),
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_epi32 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+						    (__v16si)
+						    _mm512_undefined_si512 (),
+						    (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+						    (__v16si) __W,
+						    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+						    (__v16si)
+						    _mm512_setzero_si512 (),
+						    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+						    (__v8di)
+						    _mm512_undefined_si512 (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+						    (__v8di) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+						    (__v8di)
+						    _mm512_setzero_si512 (),
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_epi64 (__m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+						    (__v8di)
+						    _mm512_undefined_si512 (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+						    (__v8di) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+						    (__v8di)
+						    _mm512_setzero_si512 (),
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu8_epi32 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+						    (__v16si)
+						    _mm512_undefined_si512 (),
+						    (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+						    (__v16si) __W,
+						    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+						    (__v16si)
+						    _mm512_setzero_si512 (),
+						    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu8_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+						    (__v8di)
+						    _mm512_undefined_si512 (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+						    (__v8di) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+						    (__v8di)
+						    _mm512_setzero_si512 (),
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_epi32 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+						    (__v16si)
+						    _mm512_undefined_si512 (),
+						    (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+						    (__v16si) __W,
+						    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+						    (__v16si)
+						    _mm512_setzero_si512 (),
+						    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+						    (__v8di)
+						    _mm512_undefined_si512 (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+						    (__v8di) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+						    (__v8di)
+						    _mm512_setzero_si512 (),
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_epi64 (__m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+						    (__v8di)
+						    _mm512_undefined_si512 (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+						    (__v8di) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+						    (__v8di)
+						    _mm512_setzero_si512 (),
+						    (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+			  __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df) __W,
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			   const int __R)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+			  __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+			  __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df) __W,
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			   const int __R)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+			  __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U, __R);
+}
+#else
+#define _mm512_add_round_pd(A, B, C)            \
+    (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_add_round_pd(W, U, A, B, C) \
+    (__m512d)__builtin_ia32_addpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_add_round_pd(U, A, B, C)   \
+    (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_add_round_ps(A, B, C)            \
+    (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_add_round_ps(W, U, A, B, C) \
+    (__m512)__builtin_ia32_addps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_add_round_ps(U, A, B, C)   \
+    (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_sub_round_pd(A, B, C)            \
+    (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_sub_round_pd(W, U, A, B, C) \
+    (__m512d)__builtin_ia32_subpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_sub_round_pd(U, A, B, C)   \
+    (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_sub_round_ps(A, B, C)            \
+    (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_sub_round_ps(W, U, A, B, C) \
+    (__m512)__builtin_ia32_subps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_sub_round_ps(U, A, B, C)   \
+    (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+			  __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df) __W,
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			   const int __R)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+			  __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_pd (__m512d __M, __m512d __V, const int __R)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+						 (__v8df) __V,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_pd (__m512d __W, __mmask8 __U, __m512d __M,
+			  __m512d __V, const int __R)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+						 (__v8df) __V,
+						 (__v8df) __W,
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_pd (__mmask8 __U, __m512d __M, __m512d __V,
+			   const int __R)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+						 (__v8df) __V,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+			  __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+					       (__v2df) __B,
+					       __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+					      (__v4sf) __B,
+					      __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+					       (__v2df) __B,
+					       __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+					      (__v4sf) __B,
+					      __R);
+}
+
+#else
+#define _mm512_mul_round_pd(A, B, C)            \
+    (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_mul_round_pd(W, U, A, B, C) \
+    (__m512d)__builtin_ia32_mulpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_mul_round_pd(U, A, B, C)   \
+    (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_mul_round_ps(A, B, C)            \
+    (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_mul_round_ps(W, U, A, B, C) \
+    (__m512)__builtin_ia32_mulps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_mul_round_ps(U, A, B, C)   \
+    (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_div_round_pd(A, B, C)            \
+    (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_div_round_pd(W, U, A, B, C) \
+    (__m512d)__builtin_ia32_divpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_div_round_pd(U, A, B, C)   \
+    (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_div_round_ps(A, B, C)            \
+    (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_div_round_ps(W, U, A, B, C) \
+    (__m512)__builtin_ia32_divps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_div_round_ps(U, A, B, C)   \
+    (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_mul_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_mulsd_round(A, B, C)
+
+#define _mm_mul_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_mulss_round(A, B, C)
+
+#define _mm_div_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_divsd_round(A, B, C)
+
+#define _mm_div_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_divss_round(A, B, C)
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+			  __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df) __W,
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			   const int __R)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+			  __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+			  __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df) __W,
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			   const int __R)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+			  __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U, __R);
+}
+#else
+#define _mm512_max_round_pd(A, B,  R) \
+    (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R)
+
+#define _mm512_mask_max_round_pd(W, U,  A, B, R) \
+    (__m512d)__builtin_ia32_maxpd512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_max_round_pd(U, A,  B, R) \
+    (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R)
+
+#define _mm512_max_round_ps(A, B,  R) \
+    (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_undefined_pd(), -1, R)
+
+#define _mm512_mask_max_round_ps(W, U,  A, B, R) \
+    (__m512)__builtin_ia32_maxps512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_max_round_ps(U, A,  B, R) \
+    (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R)
+
+#define _mm512_min_round_pd(A, B,  R) \
+    (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R)
+
+#define _mm512_mask_min_round_pd(W, U,  A, B, R) \
+    (__m512d)__builtin_ia32_minpd512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_min_round_pd(U, A,  B, R) \
+    (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R)
+
+#define _mm512_min_round_ps(A, B, R) \
+    (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, R)
+
+#define _mm512_mask_min_round_ps(W, U,  A, B, R) \
+    (__m512)__builtin_ia32_minps512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_min_round_ps(U, A,  B, R) \
+    (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R)
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df)
+						    _mm512_undefined_pd (),
+						    (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+			     __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df) __W,
+						    (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			      const int __R)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df)
+						    _mm512_setzero_pd (),
+						    (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf)
+						   _mm512_undefined_ps (),
+						   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+			     __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf) __W,
+						   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+			      const int __R)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf)
+						   _mm512_setzero_ps (),
+						   (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_scalefsd_round ((__v2df) __A,
+						  (__v2df) __B,
+						  __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_scalefss_round ((__v4sf) __A,
+						 (__v4sf) __B,
+						 __R);
+}
+#else
+#define _mm512_scalef_round_pd(A, B, C)            \
+    (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
+    (__m512d)__builtin_ia32_scalefpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_scalef_round_pd(U, A, B, C)   \
+    (__m512d)__builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_scalef_round_ps(A, B, C)            \
+    (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
+    (__m512)__builtin_ia32_scalefps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_scalef_round_ps(U, A, B, C)   \
+    (__m512)__builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_scalef_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_scalefsd_round(A, B, C)
+
+#define _mm_scalef_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_scalefss_round(A, B, C)
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df) __C,
+						    (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+			    __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df) __C,
+						    (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
+			     __mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			     __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf) __C,
+						   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+			    __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf) __C,
+						   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
+			     __mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+			     __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    -(__v8df) __C,
+						    (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+			    __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    -(__v8df) __C,
+						    (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
+			     __mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			     __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+						     (__v8df) __B,
+						     -(__v8df) __C,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   -(__v16sf) __C,
+						   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+			    __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   -(__v16sf) __C,
+						   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
+			     __mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+			     __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+						    (__v16sf) __B,
+						    -(__v16sf) __C,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+						       (__v8df) __B,
+						       (__v8df) __C,
+						       (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+			       __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+						       (__v8df) __B,
+						       (__v8df) __C,
+						       (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
+				__mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+							(__v8df) __B,
+							(__v8df) __C,
+							(__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+				__m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+							(__v8df) __B,
+							(__v8df) __C,
+							(__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+						      (__v16sf) __B,
+						      (__v16sf) __C,
+						      (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+			       __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+						      (__v16sf) __B,
+						      (__v16sf) __C,
+						      (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
+				__mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+						       (__v16sf) __B,
+						       (__v16sf) __C,
+						       (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+				__m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+						       (__v16sf) __B,
+						       (__v16sf) __C,
+						       (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+						       (__v8df) __B,
+						       -(__v8df) __C,
+						       (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+			       __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+						       (__v8df) __B,
+						       -(__v8df) __C,
+						       (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
+				__mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+							(__v8df) __B,
+							(__v8df) __C,
+							(__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+				__m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+							(__v8df) __B,
+							-(__v8df) __C,
+							(__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+						      (__v16sf) __B,
+						      -(__v16sf) __C,
+						      (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+			       __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+						      (__v16sf) __B,
+						      -(__v16sf) __C,
+						      (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
+				__mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+						       (__v16sf) __B,
+						       (__v16sf) __C,
+						       (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+				__m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+						       (__v16sf) __B,
+						       -(__v16sf) __C,
+						       (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df) __C,
+						    (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+			     __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
+			      __mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			      __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf) __C,
+						   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+			     __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
+			      __mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+			      __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+						    (__v8df) __B,
+						    -(__v8df) __C,
+						    (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+			     __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
+			      __mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+						      (__v8df) __B,
+						      (__v8df) __C,
+						      (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			      __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+						     (__v8df) __B,
+						     -(__v8df) __C,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+						   (__v16sf) __B,
+						   -(__v16sf) __C,
+						   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+			     __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
+			      __mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+						     (__v16sf) __B,
+						     (__v16sf) __C,
+						     (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+			      __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+						    (__v16sf) __B,
+						    -(__v16sf) __C,
+						    (__mmask16) __U, __R);
+}
+#else
+#define _mm512_fmadd_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmadd_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fmadd_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmadd_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfmaddps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, C, U, R)
+
+#define _mm512_fmsub_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, -(C), -1, R)
+
+#define _mm512_mask_fmsub_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, -(C), U, R)
+
+#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fmsub_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, -(C), -1, R)
+
+#define _mm512_mask_fmsub_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, -(C), U, R)
+
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfmsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fmaddsub_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fmaddsub_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, C, U, R)
+
+#define _mm512_fmsubadd_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), -1, R)
+
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), U, R)
+
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfmsubaddpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fmsubadd_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), -1, R)
+
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), U, R)
+
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfmsubaddps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fnmadd_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(-(A), B, C, -1, R)
+
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfnmaddpd512_mask(-(A), B, C, U, R)
+
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(A), B, C, U, R)
+
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(A), B, C, U, R)
+
+#define _mm512_fnmadd_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfmaddps512_mask(-(A), B, C, -1, R)
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfnmaddps512_mask(-(A), B, C, U, R)
+
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfmaddps512_mask3(-(A), B, C, U, R)
+
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfmaddps512_maskz(-(A), B, C, U, R)
+
+#define _mm512_fnmsub_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(-(A), B, -(C), -1, R)
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfnmsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(A), B, -(C), U, R)
+
+#define _mm512_fnmsub_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfmaddps512_mask(-(A), B, -(C), -1, R)
+
+#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfnmsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfmaddps512_maskz(-(A), B, -(C), U, R)
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+						 (__v16si) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+						 (__v16si)
+						 _mm512_setzero_si512 (),
+						 (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastss_ps (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+						 (__v16sf)
+						 _mm512_undefined_ps (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+						 (__v16sf) __O, __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+						 (__v16sf)
+						 _mm512_setzero_ps (),
+						 __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastsd_pd (__m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+						  (__v8df)
+						  _mm512_undefined_pd (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+						  (__v8df) __O, __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+						  (__v8df)
+						  _mm512_setzero_pd (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastd_epi32 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+						  (__v16si) __O, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_epi32 (int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
+							   (__v16si)
+							   _mm512_undefined_si512 (),
+							   (__mmask16)(-1));
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O,
+							   __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_set1_epi32 (__mmask16 __M, int __A)
+{
+  return (__m512i)
+	 __builtin_ia32_pbroadcastd512_gpr_mask (__A,
+						 (__v16si) _mm512_setzero_si512 (),
+						 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastq_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+						  (__v8di) __O, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_epi64 (long long __A)
+{
+#ifdef TARGET_64BIT
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
+							   (__v8di)
+							   _mm512_undefined_si512 (),
+							   (__mmask8)(-1));
+#else
+  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
+							   (__v8di)
+							   _mm512_undefined_si512 (),
+							   (__mmask8)(-1));
+#endif
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
+{
+#ifdef TARGET_64BIT
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
+							   __M);
+#else
+  return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A, (__v8di) __O,
+							   __M);
+#endif
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+#ifdef TARGET_64BIT
+  return (__m512i)
+	 __builtin_ia32_pbroadcastq512_gpr_mask (__A,
+						 (__v8di) _mm512_setzero_si512 (),
+						 __M);
+#else
+  return (__m512i)
+	 __builtin_ia32_pbroadcastq512_mem_mask (__A,
+						 (__v8di) _mm512_setzero_si512 (),
+						 __M);
+#endif
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f32x4 (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+						     (__v16sf)
+						     _mm512_undefined_ps (),
+						     (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+						     (__v16sf) __O,
+						     __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+						     (__v16sf)
+						     _mm512_setzero_ps (),
+						     __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i32x4 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+						      (__v16si)
+						      _mm512_undefined_si512 (),
+						      (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+						      (__v16si) __O,
+						      __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+						      (__v16si)
+						      _mm512_setzero_si512 (),
+						      __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f64x4 (__m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+						      (__v8df)
+						      _mm512_undefined_pd (),
+						      (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+						      (__v8df) __O,
+						      __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+						      (__v8df)
+						      _mm512_setzero_pd (),
+						      __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i64x4 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+						      (__v8di)
+						      _mm512_undefined_si512 (),
+						      (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+						      (__v8di) __O,
+						      __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+						      (__v8di)
+						      _mm512_setzero_si512 (),
+						      __M);
+}
+
+typedef enum
+{
+  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
+  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
+  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
+  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
+  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
+  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
+  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
+  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
+  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
+  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
+  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
+  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
+  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
+  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
+  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
+  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
+  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
+  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
+  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
+  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
+  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
+  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
+  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
+  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
+  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
+  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
+  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
+  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
+  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
+  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
+  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
+  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
+  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
+  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
+  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
+  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
+  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
+  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
+  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
+  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
+  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
+  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
+  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
+  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
+  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
+  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
+  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
+  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
+  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
+  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
+  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
+  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
+  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
+  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
+  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
+  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
+  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
+  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
+  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
+  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
+  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
+  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
+  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
+  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
+  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
+  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
+  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
+  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
+  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
+  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
+  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
+  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
+  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
+  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
+  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
+  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
+  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
+  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
+  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
+  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
+  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
+  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
+  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
+  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
+  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
+  _MM_PERM_DDDD = 0xFF
+} _MM_PERM_ENUM;
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_epi32 (__m512i __A, _MM_PERM_ENUM __mask)
+{
+  return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
+						  __mask,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+			   _MM_PERM_ENUM __mask)
+{
+  return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
+						  __mask,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_epi32 (__mmask16 __U, __m512i __A, _MM_PERM_ENUM __mask)
+{
+  return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
+						  __mask,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_i64x2 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
+						   (__v8di) __B, __imm,
+						   (__v8di)
+						   _mm512_undefined_si512 (),
+						   (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_i64x2 (__m512i __W, __mmask8 __U, __m512i __A,
+			   __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
+						   (__v8di) __B, __imm,
+						   (__v8di) __W,
+						   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_i64x2 (__mmask8 __U, __m512i __A, __m512i __B,
+			    const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
+						   (__v8di) __B, __imm,
+						   (__v8di)
+						   _mm512_setzero_si512 (),
+						   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_i32x4 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
+						   (__v16si) __B,
+						   __imm,
+						   (__v16si)
+						   _mm512_undefined_si512 (),
+						   (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_i32x4 (__m512i __W, __mmask16 __U, __m512i __A,
+			   __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
+						   (__v16si) __B,
+						   __imm,
+						   (__v16si) __W,
+						   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_i32x4 (__mmask16 __U, __m512i __A, __m512i __B,
+			    const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
+						   (__v16si) __B,
+						   __imm,
+						   (__v16si)
+						   _mm512_setzero_si512 (),
+						   (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_f64x2 (__m512d __A, __m512d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
+						   (__v8df) __B, __imm,
+						   (__v8df)
+						   _mm512_undefined_pd (),
+						   (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_f64x2 (__m512d __W, __mmask8 __U, __m512d __A,
+			   __m512d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
+						   (__v8df) __B, __imm,
+						   (__v8df) __W,
+						   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_f64x2 (__mmask8 __U, __m512d __A, __m512d __B,
+			    const int __imm)
+{
+  return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
+						   (__v8df) __B, __imm,
+						   (__v8df)
+						   _mm512_setzero_pd (),
+						   (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_f32x4 (__m512 __A, __m512 __B, const int __imm)
+{
+  return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
+						  (__v16sf) __B, __imm,
+						  (__v16sf)
+						  _mm512_undefined_ps (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_f32x4 (__m512 __W, __mmask16 __U, __m512 __A,
+			   __m512 __B, const int __imm)
+{
+  return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
+						  (__v16sf) __B, __imm,
+						  (__v16sf) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B,
+			    const int __imm)
+{
+  return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
+						  (__v16sf) __B, __imm,
+						  (__v16sf)
+						  _mm512_setzero_ps (),
+						  (__mmask16) __U);
+}
+
+#else
+#define _mm512_shuffle_epi32(X, C)                                      \
+  ((__m512i)  __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_undefined_si512 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_shuffle_epi32(W, U, X, C)                           \
+  ((__m512i)  __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_epi32(U, X, C)                             \
+  ((__m512i)  __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+
+#define _mm512_shuffle_i64x2(X, Y, C)                                   \
+  ((__m512i)  __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X),     \
+      (__v8di)(__m512i)(Y), (int)(C),\
+    (__v8di)(__m512i)_mm512_undefined_si512 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_shuffle_i64x2(W, U, X, Y, C)                        \
+  ((__m512i)  __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X),     \
+      (__v8di)(__m512i)(Y), (int)(C),\
+    (__v8di)(__m512i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_shuffle_i64x2(U, X, Y, C)                          \
+  ((__m512i)  __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X),     \
+      (__v8di)(__m512i)(Y), (int)(C),\
+    (__v8di)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask8)(U)))
+
+#define _mm512_shuffle_i32x4(X, Y, C)                                   \
+  ((__m512i)  __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X),    \
+      (__v16si)(__m512i)(Y), (int)(C),\
+    (__v16si)(__m512i)_mm512_undefined_si512 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_shuffle_i32x4(W, U, X, Y, C)                        \
+  ((__m512i)  __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X),    \
+      (__v16si)(__m512i)(Y), (int)(C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_i32x4(U, X, Y, C)                          \
+  ((__m512i)  __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X),    \
+      (__v16si)(__m512i)(Y), (int)(C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+
+#define _mm512_shuffle_f64x2(X, Y, C)                                   \
+  ((__m512d)  __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X),     \
+      (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)_mm512_undefined_pd(),\
+    (__mmask8)-1))
+
+#define _mm512_mask_shuffle_f64x2(W, U, X, Y, C)                        \
+  ((__m512d)  __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X),     \
+      (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_shuffle_f64x2(U, X, Y, C)                         \
+  ((__m512d)  __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X),    \
+      (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)_mm512_setzero_pd(),\
+    (__mmask8)(U)))
+
+#define _mm512_shuffle_f32x4(X, Y, C)                                  \
+  ((__m512)  __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X),     \
+      (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)_mm512_undefined_ps(),\
+    (__mmask16)-1))
+
+#define _mm512_mask_shuffle_f32x4(W, U, X, Y, C)                       \
+  ((__m512)  __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X),     \
+      (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_f32x4(U, X, Y, C)                         \
+  ((__m512)  __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X),     \
+      (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)_mm512_setzero_ps(),\
+    (__mmask16)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rolv_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rorv_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rolv_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di) __W,
+						  (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rorv_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di) __W,
+						  (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epi32 (__m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+						     (__v8si)
+						     _mm256_undefined_si256 (),
+						     (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A,
+				const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+						     (__v8si) __W,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+						     (__v8si)
+						     _mm256_setzero_si256 (),
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epu32 (__m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+						      (__v8si)
+						      _mm256_undefined_si256 (),
+						      (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A,
+				const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+						      (__v8si) __W,
+						      (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+						      (__v8si)
+						      _mm256_setzero_si256 (),
+						      (__mmask8) __U, __R);
+}
+#else
+#define _mm512_cvtt_roundpd_epi32(A, B)		     \
+    ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, B)   \
+    ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundpd_epi32(U, A, B)     \
+    ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+
+#define _mm512_cvtt_roundpd_epu32(A, B)		     \
+    ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, B)   \
+    ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundpd_epu32(U, A, B)     \
+    ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epi32 (__m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+						    (__v8si)
+						    _mm256_undefined_si256 (),
+						    (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A,
+			       const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+						    (__v8si) __W,
+						    (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+						    (__v8si)
+						    _mm256_setzero_si256 (),
+						    (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epu32 (__m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+						     (__v8si)
+						     _mm256_undefined_si256 (),
+						     (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A,
+			       const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+						     (__v8si) __W,
+						     (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+						     (__v8si)
+						     _mm256_setzero_si256 (),
+						     (__mmask8) __U, __R);
+}
+#else
+#define _mm512_cvt_roundpd_epi32(A, B)		    \
+    ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvt_roundpd_epi32(W, U, A, B)   \
+    ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundpd_epi32(U, A, B)     \
+    ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+
+#define _mm512_cvt_roundpd_epu32(A, B)		    \
+    ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvt_roundpd_epu32(W, U, A, B)   \
+    ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundpd_epu32(U, A, B)     \
+    ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epi32 (__m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+						     (__v16si)
+						     _mm512_undefined_si512 (),
+						     (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A,
+				const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+						     (__v16si) __W,
+						     (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epu32 (__m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+						      (__v16si)
+						      _mm512_undefined_si512 (),
+						      (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A,
+				const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+						      (__v16si) __W,
+						      (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+						      (__v16si)
+						      _mm512_setzero_si512 (),
+						      (__mmask16) __U, __R);
+}
+#else
+#define _mm512_cvtt_roundps_epi32(A, B)		     \
+    ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_undefined_si512 (), -1, B))
+
+#define _mm512_mask_cvtt_roundps_epi32(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundps_epi32(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+
+#define _mm512_cvtt_roundps_epu32(A, B)		     \
+    ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_undefined_si512 (), -1, B))
+
+#define _mm512_mask_cvtt_roundps_epu32(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundps_epu32(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epi32 (__m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+						    (__v16si)
+						    _mm512_undefined_si512 (),
+						    (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A,
+			       const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+						    (__v16si) __W,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+						    (__v16si)
+						    _mm512_setzero_si512 (),
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epu32 (__m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+						     (__v16si)
+						     _mm512_undefined_si512 (),
+						     (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A,
+			       const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+						     (__v16si) __W,
+						     (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     (__mmask16) __U, __R);
+}
+#else
+#define _mm512_cvt_roundps_epi32(A, B)		    \
+    ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_undefined_si512 (), -1, B))
+
+#define _mm512_mask_cvt_roundps_epi32(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundps_epi32(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+
+#define _mm512_cvt_roundps_epu32(A, B)		    \
+    ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_undefined_si512 (), -1, B))
+
+#define _mm512_mask_cvt_roundps_epu32(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundps_epu32(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+#endif
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_sd (__m128d __A, unsigned __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
+}
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_sd (__m128d __A, unsigned long long __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_sd (__m128d __A, long long __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi64_sd (__m128d __A, long long __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu64_sd(A, B, C)   \
+    (__m128d)__builtin_ia32_cvtusi2sd64(A, B, C)
+
+#define _mm_cvt_roundi64_sd(A, B, C)   \
+    (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C)
+
+#define _mm_cvt_roundsi64_sd(A, B, C)   \
+    (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C)
+#endif
+
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu32_ss (__m128 __A, unsigned __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi32_ss (__m128 __A, int __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi32_ss (__m128 __A, int __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu32_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtusi2ss32(A, B, C)
+
+#define _mm_cvt_roundi32_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtsi2ss32(A, B, C)
+
+#define _mm_cvt_roundsi32_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtsi2ss32(A, B, C)
+#endif
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_ss (__m128 __A, unsigned long long __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi64_ss (__m128 __A, long long __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_ss (__m128 __A, long long __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu64_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtusi2ss64(A, B, C)
+
+#define _mm_cvt_roundi64_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtsi2ss64(A, B, C)
+
+#define _mm_cvt_roundsi64_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtsi2ss64(A, B, C)
+#endif
+
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+						  (__v16qi)
+						  _mm_undefined_si128 (),
+						  (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+						  (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+						  (__v16qi)
+						  _mm_setzero_si128 (),
+						  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+						   (__v16qi)
+						   _mm_undefined_si128 (),
+						   (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+						   (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+						   (__v16qi)
+						   _mm_setzero_si128 (),
+						   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+						    (__v16qi)
+						    _mm_undefined_si128 (),
+						    (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+						    (__v16qi) __O,
+						    __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+						    (__v16qi)
+						    _mm_setzero_si128 (),
+						    __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+						  (__v16hi)
+						  _mm256_undefined_si256 (),
+						  (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+						  (__v16hi) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+						  (__v16hi)
+						  _mm256_setzero_si256 (),
+						  __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+						   (__v16hi)
+						   _mm256_undefined_si256 (),
+						   (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+						   (__v16hi) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+						   (__v16hi)
+						   _mm256_setzero_si256 (),
+						   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+						    (__v16hi)
+						    _mm256_undefined_si256 (),
+						    (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+						    (__v16hi) __O,
+						    __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+						    (__v16hi)
+						    _mm256_setzero_si256 (),
+						    __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+						  (__v8si)
+						  _mm256_undefined_si256 (),
+						  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+						  (__v8si) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+						  (__v8si)
+						  _mm256_setzero_si256 (),
+						  __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi64_epi32 (__m512i __A)
+{
+  __v8si __O;
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+						   (__v8si)
+						   _mm256_undefined_si256 (),
+						   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+						   (__v8si) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+						   (__v8si)
+						   _mm256_setzero_si256 (),
+						   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+						    (__v8si)
+						    _mm256_undefined_si256 (),
+						    (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+						    (__v8si) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+						    (__v8si)
+						    _mm256_setzero_si256 (),
+						    __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+						  (__v8hi)
+						  _mm_undefined_si128 (),
+						  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+						  (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+						  (__v8hi)
+						  _mm_setzero_si128 (),
+						  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+						   (__v8hi)
+						   _mm_undefined_si128 (),
+						   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+						   (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+						   (__v8hi)
+						   _mm_setzero_si128 (),
+						   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+						    (__v8hi)
+						    _mm_undefined_si128 (),
+						    (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+						    (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+						    (__v8hi)
+						    _mm_setzero_si128 (),
+						    __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+						  (__v16qi)
+						  _mm_undefined_si128 (),
+						  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+						  (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+						  (__v16qi)
+						  _mm_setzero_si128 (),
+						  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+						   (__v16qi)
+						   _mm_undefined_si128 (),
+						   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+						   (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+						   (__v16qi)
+						   _mm_setzero_si128 (),
+						   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+						    (__v16qi)
+						    _mm_undefined_si128 (),
+						    (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+						    (__v16qi) __O,
+						    __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+						    (__v16qi)
+						    _mm_setzero_si128 (),
+						    __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_pd (__m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+						    (__v8df)
+						    _mm512_undefined_pd (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+						    (__v8df) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+						    (__v8df)
+						    _mm512_setzero_pd (),
+						    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_pd (__m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+						     (__v8df)
+						     _mm512_undefined_pd (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+						     (__v8df) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+						     (__v8df)
+						     _mm512_setzero_pd (),
+						     (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi32_ps (__m512i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+						   (__v16sf)
+						   _mm512_undefined_ps (),
+						   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi32_ps (__m512 __W, __mmask16 __U, __m512i __A,
+			       const int __R)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+						   (__v16sf) __W,
+						   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi32_ps (__mmask16 __U, __m512i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+						   (__v16sf)
+						   _mm512_setzero_ps (),
+						   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu32_ps (__m512i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+						    (__v16sf)
+						    _mm512_undefined_ps (),
+						    (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu32_ps (__m512 __W, __mmask16 __U, __m512i __A,
+			       const int __R)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+						    (__v16sf) __W,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu32_ps (__mmask16 __U, __m512i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+						    (__v16sf)
+						    _mm512_setzero_ps (),
+						    (__mmask16) __U, __R);
+}
+
+#else
+#define _mm512_cvt_roundepi32_ps(A, B)        \
+    (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundepi32_ps(W, U, A, B)   \
+    (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), W, U, B)
+
+#define _mm512_maskz_cvt_roundepi32_ps(U, A, B)      \
+    (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B)
+
+#define _mm512_cvt_roundepu32_ps(A, B)        \
+    (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundepu32_ps(W, U, A, B)   \
+    (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), W, U, B)
+
+#define _mm512_maskz_cvt_roundepu32_ps(U, A, B)      \
+    (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B)
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extractf64x4_pd (__m512d __A, const int __imm)
+{
+  return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
+						     __imm,
+						     (__v4df)
+						     _mm256_undefined_pd (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extractf64x4_pd (__m256d __W, __mmask8 __U, __m512d __A,
+			     const int __imm)
+{
+  return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
+						     __imm,
+						     (__v4df) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extractf64x4_pd (__mmask8 __U, __m512d __A, const int __imm)
+{
+  return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
+						     __imm,
+						     (__v4df)
+						     _mm256_setzero_pd (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extractf32x4_ps (__m512 __A, const int __imm)
+{
+  return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
+						    __imm,
+						    (__v4sf)
+						    _mm_undefined_ps (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m512 __A,
+			     const int __imm)
+{
+  return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
+						    __imm,
+						    (__v4sf) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extractf32x4_ps (__mmask8 __U, __m512 __A, const int __imm)
+{
+  return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
+						    __imm,
+						    (__v4sf)
+						    _mm_setzero_ps (),
+						    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extracti64x4_epi64 (__m512i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
+						     __imm,
+						     (__v4di)
+						     _mm256_undefined_si256 (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extracti64x4_epi64 (__m256i __W, __mmask8 __U, __m512i __A,
+				const int __imm)
+{
+  return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
+						     __imm,
+						     (__v4di) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extracti64x4_epi64 (__mmask8 __U, __m512i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
+						     __imm,
+						     (__v4di)
+						     _mm256_setzero_si256 (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extracti32x4_epi32 (__m512i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
+						     __imm,
+						     (__v4si)
+						     _mm_undefined_si128 (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m512i __A,
+				const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
+						     __imm,
+						     (__v4si) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extracti32x4_epi32 (__mmask8 __U, __m512i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
+						     __imm,
+						     (__v4si)
+						     _mm_setzero_si128 (),
+						     (__mmask8) __U);
+}
+#else
+
+#define _mm512_extractf64x4_pd(X, C)                                    \
+  ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X),   \
+    (int) (C),\
+    (__v4df)(__m256d)_mm256_undefined_pd(),\
+    (__mmask8)-1))
+
+#define _mm512_mask_extractf64x4_pd(W, U, X, C)                         \
+  ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X),   \
+    (int) (C),\
+    (__v4df)(__m256d)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_extractf64x4_pd(U, X, C)                           \
+  ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X),   \
+    (int) (C),\
+    (__v4df)(__m256d)_mm256_setzero_pd(),\
+    (__mmask8)(U)))
+
+#define _mm512_extractf32x4_ps(X, C)                                    \
+  ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X),    \
+    (int) (C),\
+    (__v4sf)(__m128)_mm_undefined_ps(),\
+    (__mmask8)-1))
+
+#define _mm512_mask_extractf32x4_ps(W, U, X, C)                         \
+  ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X),    \
+    (int) (C),\
+    (__v4sf)(__m128)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_extractf32x4_ps(U, X, C)                           \
+  ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X),    \
+    (int) (C),\
+    (__v4sf)(__m128)_mm_setzero_ps(),\
+    (__mmask8)(U)))
+
+#define _mm512_extracti64x4_epi64(X, C)                                 \
+  ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X),   \
+    (int) (C),\
+    (__v4di)(__m256i)_mm256_undefined_si256 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_extracti64x4_epi64(W, U, X, C)                      \
+  ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X),   \
+    (int) (C),\
+    (__v4di)(__m256i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_extracti64x4_epi64(U, X, C)                        \
+  ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X),   \
+    (int) (C),\
+    (__v4di)(__m256i)_mm256_setzero_si256 (),\
+    (__mmask8)(U)))
+
+#define _mm512_extracti32x4_epi32(X, C)                                 \
+  ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X),  \
+    (int) (C),\
+    (__v4si)(__m128i)_mm_undefined_si128 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_extracti32x4_epi32(W, U, X, C)                      \
+  ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X),  \
+    (int) (C),\
+    (__v4si)(__m128i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_extracti32x4_epi32(U, X, C)                        \
+  ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X),  \
+    (int) (C),\
+    (__v4si)(__m128i)_mm_setzero_si128 (),\
+    (__mmask8)(U)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_inserti32x4 (__m512i __A, __m128i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __A,
+						    (__v4si) __B,
+						    __imm,
+						    (__v16si) __A, -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_insertf32x4 (__m512 __A, __m128 __B, const int __imm)
+{
+  return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __A,
+						   (__v4sf) __B,
+						   __imm,
+						   (__v16sf) __A, -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_inserti64x4 (__m512i __A, __m256i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
+						    (__v4di) __B,
+						    __imm,
+						    (__v8di)
+						    _mm512_undefined_si512 (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_inserti64x4 (__m512i __W, __mmask8 __U, __m512i __A,
+			 __m256i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
+						    (__v4di) __B,
+						    __imm,
+						    (__v8di) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_inserti64x4 (__mmask8 __U, __m512i __A, __m256i __B,
+			  const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
+						    (__v4di) __B,
+						    __imm,
+						    (__v8di)
+						    _mm512_setzero_si512 (),
+						    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_insertf64x4 (__m512d __A, __m256d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
+						    (__v4df) __B,
+						    __imm,
+						    (__v8df)
+						    _mm512_undefined_pd (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_insertf64x4 (__m512d __W, __mmask8 __U, __m512d __A,
+			 __m256d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
+						    (__v4df) __B,
+						    __imm,
+						    (__v8df) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_insertf64x4 (__mmask8 __U, __m512d __A, __m256d __B,
+			  const int __imm)
+{
+  return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
+						    (__v4df) __B,
+						    __imm,
+						    (__v8df)
+						    _mm512_setzero_pd (),
+						    (__mmask8) __U);
+}
+#else
+#define _mm512_insertf32x4(X, Y, C)                                     \
+  ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X),     \
+    (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (X), (__mmask16)(-1)))
+
+#define _mm512_inserti32x4(X, Y, C)                                     \
+  ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X),   \
+    (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (X), (__mmask16)(-1)))
+
+#define _mm512_insertf64x4(X, Y, C)                                     \
+  ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X),    \
+    (__v4df)(__m256d) (Y), (int) (C),					\
+    (__v8df)(__m512d)_mm512_undefined_pd(),				\
+    (__mmask8)-1))
+
+#define _mm512_mask_insertf64x4(W, U, X, Y, C)                          \
+  ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X),    \
+    (__v4df)(__m256d) (Y), (int) (C),					\
+    (__v8df)(__m512d)(W),						\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_insertf64x4(U, X, Y, C)                            \
+  ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X),    \
+    (__v4df)(__m256d) (Y), (int) (C),					\
+    (__v8df)(__m512d)_mm512_setzero_pd(),				\
+    (__mmask8)(U)))
+
+#define _mm512_inserti64x4(X, Y, C)                                     \
+  ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X),    \
+    (__v4di)(__m256i) (Y), (int) (C),					\
+    (__v8di)(__m512i)_mm512_undefined_si512 (),				\
+    (__mmask8)-1))
+
+#define _mm512_mask_inserti64x4(W, U, X, Y, C)                          \
+  ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X),    \
+    (__v4di)(__m256i) (Y), (int) (C),\
+    (__v8di)(__m512i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_inserti64x4(U, X, Y, C)                            \
+  ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X),    \
+    (__v4di)(__m256i) (Y), (int) (C),					\
+    (__v8di)(__m512i)_mm512_setzero_si512 (),				\
+    (__mmask8)(U)))
+#endif
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_pd (void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *) __P,
+						   (__v8df)
+						   _mm512_undefined_pd (),
+						   (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *) __P,
+						   (__v8df) __W,
+						   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const __v8df *) __P,
+						   (__v8df)
+						   _mm512_setzero_pd (),
+						   (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_pd (void *__P, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask ((__v8df *) __P, (__v8df) __A,
+				   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask ((__v8df *) __P, (__v8df) __A,
+				   (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_ps (void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *) __P,
+						  (__v16sf)
+						  _mm512_undefined_ps (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *) __P,
+						  (__v16sf) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_ps (__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const __v16sf *) __P,
+						  (__v16sf)
+						  _mm512_setzero_ps (),
+						  (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_ps (void *__P, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask ((__v16sf *) __P, (__v16sf) __A,
+				   (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask ((__v16sf *) __P, (__v16sf) __A,
+				   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *) __P,
+						     (__v8di) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const __v8di *) __P,
+						     (__v8di)
+						     _mm512_setzero_si512 (),
+						     (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_storedqudi512_mask ((__v8di *) __P, (__v8di) __A,
+				     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_si512 (void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *) __P,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *) __P,
+						     (__v16si) __W,
+						     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_epi32 (__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const __v16si *) __P,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_si512 (void *__P, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((__v16si *) __P, (__v16si) __A,
+				     (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((__v16si *) __P, (__v16si) __A,
+				     (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutevar_pd (__m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+							(__v8di) __C,
+							(__v8df)
+							_mm512_undefined_pd (),
+							(__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+							(__v8di) __C,
+							(__v8df) __W,
+							(__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+							(__v8di) __C,
+							(__v8df)
+							_mm512_setzero_pd (),
+							(__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutevar_ps (__m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+						       (__v16si) __C,
+						       (__v16sf)
+						       _mm512_undefined_ps (),
+						       (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+						       (__v16si) __C,
+						       (__v16sf) __W,
+						       (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+						       (__v16si) __C,
+						       (__v16sf)
+						       _mm512_setzero_ps (),
+						       (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+						       /* idx */ ,
+						       (__v8di) __A,
+						       (__v8di) __B,
+						       (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I,
+				__m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+						       /* idx */ ,
+						       (__v8di) __A,
+						       (__v8di) __B,
+						       (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
+				 __mmask8 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A,
+						       (__v8di) __I
+						       /* idx */ ,
+						       (__v8di) __B,
+						       (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
+				 __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I
+							/* idx */ ,
+							(__v8di) __A,
+							(__v8di) __B,
+							(__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_epi32 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+						       /* idx */ ,
+						       (__v16si) __A,
+						       (__v16si) __B,
+						       (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U,
+				__m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+						       /* idx */ ,
+						       (__v16si) __A,
+						       (__v16si) __B,
+						       (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
+				 __mmask16 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A,
+						       (__v16si) __I
+						       /* idx */ ,
+						       (__v16si) __B,
+						       (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A,
+				 __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I
+							/* idx */ ,
+							(__v16si) __A,
+							(__v16si) __B,
+							(__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+							/* idx */ ,
+							(__v8df) __A,
+							(__v8df) __B,
+							(__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I,
+			     __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+							/* idx */ ,
+							(__v8df) __A,
+							(__v8df) __B,
+							(__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
+			      __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A,
+							(__v8di) __I
+							/* idx */ ,
+							(__v8df) __B,
+							(__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I,
+			      __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I
+							 /* idx */ ,
+							 (__v8df) __A,
+							 (__v8df) __B,
+							 (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+						       /* idx */ ,
+						       (__v16sf) __A,
+						       (__v16sf) __B,
+						       (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+						       /* idx */ ,
+						       (__v16sf) __A,
+						       (__v16sf) __B,
+						       (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U,
+			      __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A,
+						       (__v16si) __I
+						       /* idx */ ,
+						       (__v16sf) __B,
+						       (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I,
+			      __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I
+							/* idx */ ,
+							(__v16sf) __A,
+							(__v16sf) __B,
+							(__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permute_pd (__m512d __X, const int __C)
+{
+  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+						     (__v8df)
+						     _mm512_undefined_pd (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permute_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __C)
+{
+  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+						     (__v8df) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permute_pd (__mmask8 __U, __m512d __X, const int __C)
+{
+  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+						     (__v8df)
+						     _mm512_setzero_pd (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permute_ps (__m512 __X, const int __C)
+{
+  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+						    (__v16sf)
+						    _mm512_undefined_ps (),
+						    (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permute_ps (__m512 __W, __mmask16 __U, __m512 __X, const int __C)
+{
+  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+						    (__v16sf) __W,
+						    (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permute_ps (__mmask16 __U, __m512 __X, const int __C)
+{
+  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+						    (__v16sf)
+						    _mm512_setzero_ps (),
+						    (__mmask16) __U);
+}
+#else
+#define _mm512_permute_pd(X, C)							    \
+  ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C),	    \
+					      (__v8df)(__m512d)_mm512_undefined_pd(),\
+					      (__mmask8)(-1)))
+
+#define _mm512_mask_permute_pd(W, U, X, C)					    \
+  ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C),	    \
+					      (__v8df)(__m512d)(W),		    \
+					      (__mmask8)(U)))
+
+#define _mm512_maskz_permute_pd(U, X, C)					    \
+  ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C),	    \
+					      (__v8df)(__m512d)_mm512_setzero_pd(), \
+					      (__mmask8)(U)))
+
+#define _mm512_permute_ps(X, C)							    \
+  ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C),	    \
+					      (__v16sf)(__m512)_mm512_undefined_ps(),\
+					      (__mmask16)(-1)))
+
+#define _mm512_mask_permute_ps(W, U, X, C)					    \
+  ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C),	    \
+					      (__v16sf)(__m512)(W),		    \
+					      (__mmask16)(U)))
+
+#define _mm512_maskz_permute_ps(U, X, C)					    \
+  ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C),	    \
+					      (__v16sf)(__m512)_mm512_setzero_ps(), \
+					      (__mmask16)(U)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex_epi64 (__m512i __X, const int __I)
+{
+  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) (-1));
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex_epi64 (__m512i __W, __mmask8 __M,
+			    __m512i __X, const int __I)
+{
+  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+						  (__v8di) __W,
+						  (__mmask8) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex_epi64 (__mmask8 __M, __m512i __X, const int __I)
+{
+  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  (__mmask8) __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex_pd (__m512d __X, const int __M)
+{
+  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+						  (__v8df)
+						  _mm512_undefined_pd (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __M)
+{
+  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+						  (__v8df) __W,
+						  (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex_pd (__mmask8 __U, __m512d __X, const int __M)
+{
+  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+						  (__v8df)
+						  _mm512_setzero_pd (),
+						  (__mmask8) __U);
+}
+#else
+#define _mm512_permutex_pd(X, M)						\
+  ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M),	\
+					    (__v8df)(__m512d)_mm512_undefined_pd(),\
+					    (__mmask8)-1))
+
+#define _mm512_mask_permutex_pd(W, U, X, M)					\
+  ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M),	\
+					    (__v8df)(__m512d)(W), (__mmask8)(U)))
+
+#define _mm512_maskz_permutex_pd(U, X, M)					\
+  ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M),	\
+					    (__v8df)(__m512d)_mm512_setzero_pd(),\
+					    (__mmask8)(U)))
+
+#define _mm512_permutex_epi64(X, I)			          \
+  ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
+					    (int)(I),             \
+					    (__v8di)(__m512i)	  \
+					    (_mm512_undefined_si512 ()),\
+					    (__mmask8)(-1)))
+
+#define _mm512_maskz_permutex_epi64(M, X, I)                 \
+  ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
+					    (int)(I),             \
+					    (__v8di)(__m512i)     \
+					    (_mm512_setzero_si512 ()),\
+					    (__mmask8)(M)))
+
+#define _mm512_mask_permutex_epi64(W, M, X, I)               \
+  ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
+					    (int)(I),             \
+					    (__v8di)(__m512i)(W), \
+					    (__mmask8)(M)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+						     (__v8di) __X,
+						     (__v8di)
+						     _mm512_setzero_si512 (),
+						     __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+						     (__v8di) __X,
+						     (__v8di)
+						     _mm512_undefined_si512 (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
+			       __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+						     (__v8di) __X,
+						     (__v8di) __W,
+						     __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+						     (__v16si) __X,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+						     (__v16si) __X,
+						     (__v16si)
+						     _mm512_undefined_si512 (),
+						     (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
+			       __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+						     (__v16si) __X,
+						     (__v16si) __W,
+						     __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+						     (__v8di) __X,
+						     (__v8df)
+						     _mm512_undefined_pd (),
+						     (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+						     (__v8di) __X,
+						     (__v8df) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+						     (__v8di) __X,
+						     (__v8df)
+						     _mm512_setzero_pd (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+						    (__v16si) __X,
+						    (__v16sf)
+						    _mm512_undefined_ps (),
+						    (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+						    (__v16si) __X,
+						    (__v16sf) __W,
+						    (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+						    (__v16si) __X,
+						    (__v16sf)
+						    _mm512_setzero_ps (),
+						    (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_ps (__m512 __M, __m512 __V, const int __imm)
+{
+  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+						 (__v16sf) __V, __imm,
+						 (__v16sf)
+						 _mm512_undefined_ps (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_ps (__m512 __W, __mmask16 __U, __m512 __M,
+			__m512 __V, const int __imm)
+{
+  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+						 (__v16sf) __V, __imm,
+						 (__v16sf) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_ps (__mmask16 __U, __m512 __M, __m512 __V, const int __imm)
+{
+  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+						 (__v16sf) __V, __imm,
+						 (__v16sf)
+						 _mm512_setzero_ps (),
+						 (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_pd (__m512d __M, __m512d __V, const int __imm)
+{
+  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+						  (__v8df) __V, __imm,
+						  (__v8df)
+						  _mm512_undefined_pd (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_pd (__m512d __W, __mmask8 __U, __m512d __M,
+			__m512d __V, const int __imm)
+{
+  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+						  (__v8df) __V, __imm,
+						  (__v8df) __W,
+						  (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_pd (__mmask8 __U, __m512d __M, __m512d __V,
+			 const int __imm)
+{
+  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+						  (__v8df) __V, __imm,
+						  (__v8df)
+						  _mm512_setzero_pd (),
+						  (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_round_pd (__m512d __A, __m512d __B, __m512i __C,
+			  const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+						      (__v8df) __B,
+						      (__v8di) __C,
+						      __imm,
+						      (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+			       __m512i __C, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+						      (__v8df) __B,
+						      (__v8di) __C,
+						      __imm,
+						      (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+				__m512i __C, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A,
+						       (__v8df) __B,
+						       (__v8di) __C,
+						       __imm,
+						       (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_round_ps (__m512 __A, __m512 __B, __m512i __C,
+			  const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+						     (__v16sf) __B,
+						     (__v16si) __C,
+						     __imm,
+						     (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+			       __m512i __C, const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+						     (__v16sf) __B,
+						     (__v16si) __C,
+						     __imm,
+						     (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+				__m512i __C, const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A,
+						      (__v16sf) __B,
+						      (__v16si) __C,
+						      __imm,
+						      (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_round_sd (__m128d __A, __m128d __B, __m128i __C,
+		       const int __imm, const int __R)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+						   (__v2df) __B,
+						   (__v2di) __C, __imm,
+						   (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_round_sd (__m128d __A, __mmask8 __U, __m128d __B,
+			    __m128i __C, const int __imm, const int __R)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+						   (__v2df) __B,
+						   (__v2di) __C, __imm,
+						   (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+			     __m128i __C, const int __imm, const int __R)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
+						    (__v2df) __B,
+						    (__v2di) __C,
+						    __imm,
+						    (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_round_ss (__m128 __A, __m128 __B, __m128i __C,
+		       const int __imm, const int __R)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+						  (__v4sf) __B,
+						  (__v4si) __C, __imm,
+						  (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_round_ss (__m128 __A, __mmask8 __U, __m128 __B,
+			    __m128i __C, const int __imm, const int __R)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+						  (__v4sf) __B,
+						  (__v4si) __C, __imm,
+						  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+			     __m128i __C, const int __imm, const int __R)
+{
+  return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
+						   (__v4sf) __B,
+						   (__v4si) __C, __imm,
+						   (__mmask8) __U, __R);
+}
+
+#else
+#define _mm512_shuffle_pd(X, Y, C)                                      \
+    ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X),           \
+        (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)_mm512_undefined_pd(),\
+    (__mmask8)-1))
+
+#define _mm512_mask_shuffle_pd(W, U, X, Y, C)                           \
+    ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X),           \
+        (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_shuffle_pd(U, X, Y, C)                             \
+    ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X),           \
+        (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)_mm512_setzero_pd(),\
+    (__mmask8)(U)))
+
+#define _mm512_shuffle_ps(X, Y, C)                                      \
+    ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X),            \
+        (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)_mm512_undefined_ps(),\
+    (__mmask16)-1))
+
+#define _mm512_mask_shuffle_ps(W, U, X, Y, C)                           \
+    ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X),            \
+        (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_ps(U, X, Y, C)                             \
+    ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X),            \
+        (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)_mm512_setzero_ps(),\
+    (__mmask16)(U)))
+
+#define _mm512_fixupimm_round_pd(X, Y, Z, C, R)					\
+  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),	\
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),		\
+      (__mmask8)(-1), (R)))
+
+#define _mm512_mask_fixupimm_round_pd(X, U, Y, Z, C, R)                          \
+  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),    \
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
+      (__mmask8)(U), (R)))
+
+#define _mm512_maskz_fixupimm_round_pd(U, X, Y, Z, C, R)                         \
+  ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X),   \
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
+      (__mmask8)(U), (R)))
+
+#define _mm512_fixupimm_round_ps(X, Y, Z, C, R)					\
+  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),	\
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),		\
+    (__mmask16)(-1), (R)))
+
+#define _mm512_mask_fixupimm_round_ps(X, U, Y, Z, C, R)                          \
+  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),     \
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
+    (__mmask16)(U), (R)))
+
+#define _mm512_maskz_fixupimm_round_ps(U, X, Y, Z, C, R)                         \
+  ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X),    \
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
+    (__mmask16)(U), (R)))
+
+#define _mm_fixupimm_round_sd(X, Y, Z, C, R)					\
+    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),	\
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(-1), (R)))
+
+#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R)				\
+    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),	\
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(U), (R)))
+
+#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R)				\
+    ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X),	\
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(U), (R)))
+
+#define _mm_fixupimm_round_ss(X, Y, Z, C, R)					\
+    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),	\
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(-1), (R)))
+
+#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R)				\
+    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),	\
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(U), (R)))
+
+#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R)				\
+    ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X),	\
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(U), (R)))
+#endif
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movehdup_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+						   (__v16sf)
+						   _mm512_undefined_ps (),
+						   (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+						   (__v16sf) __W,
+						   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+						   (__v16sf)
+						   _mm512_setzero_ps (),
+						   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_moveldup_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+						   (__v16sf)
+						   _mm512_undefined_ps (),
+						   (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+						   (__v16sf) __W,
+						   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+						   (__v16sf)
+						   _mm512_setzero_ps (),
+						   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
+						(__v16si) __B,
+						(__v16si)
+						_mm512_undefined_si512 (),
+						(__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
+						(__v16si) __B,
+						(__v16si)
+						_mm512_undefined_si512 (),
+						(__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
+						(__v16si) __B,
+						(__v16si) __W,
+						(__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
+						(__v16si) __B,
+						(__v16si)
+						_mm512_setzero_si512 (),
+						(__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
+						(__v8di) __B,
+						(__v8di)
+						_mm512_undefined_si512 (),
+						(__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
+						(__v8di) __B,
+						(__v8di) __W,
+						(__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
+						(__v8di) __B,
+						(__v8di)
+						_mm512_setzero_si512 (),
+						(__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si)
+						 _mm512_setzero_si512 (),
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_epi64 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_epi64 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rol_epi32 (__m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rol_epi32 (__m512i __W, __mmask16 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+						 (__v16si) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rol_epi32 (__mmask16 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+						 (__v16si)
+						 _mm512_setzero_si512 (),
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ror_epi32 (__m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ror_epi32 (__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+						 (__v16si) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ror_epi32 (__mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+						 (__v16si)
+						 _mm512_setzero_si512 (),
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rol_epi64 (__m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rol_epi64 (__m512i __W, __mmask8 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rol_epi64 (__mmask8 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ror_epi64 (__m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ror_epi64 (__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+						 (__v8di) __W,
+						 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+						 (__v8di)
+						 _mm512_setzero_si512 (),
+						 (__mmask8) __U);
+}
+
+#else
+#define _mm512_rol_epi32(A, B)						  \
+    ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v16si)_mm512_undefined_si512 (), \
+					    (__mmask16)(-1)))
+#define _mm512_mask_rol_epi32(W, U, A, B)				  \
+    ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v16si)(__m512i)(W),	  \
+					    (__mmask16)(U)))
+#define _mm512_maskz_rol_epi32(U, A, B)					  \
+    ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v16si)_mm512_setzero_si512 (), \
+					    (__mmask16)(U)))
+#define _mm512_ror_epi32(A, B)						  \
+    ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v16si)_mm512_undefined_si512 (), \
+					    (__mmask16)(-1)))
+#define _mm512_mask_ror_epi32(W, U, A, B)				  \
+    ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v16si)(__m512i)(W),	  \
+					    (__mmask16)(U)))
+#define _mm512_maskz_ror_epi32(U, A, B)					  \
+    ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v16si)_mm512_setzero_si512 (), \
+					    (__mmask16)(U)))
+#define _mm512_rol_epi64(A, B)						  \
+    ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v8di)_mm512_undefined_si512 (),  \
+					    (__mmask8)(-1)))
+#define _mm512_mask_rol_epi64(W, U, A, B)				  \
+    ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v8di)(__m512i)(W),	  \
+					    (__mmask8)(U)))
+#define _mm512_maskz_rol_epi64(U, A, B)					  \
+    ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v8di)_mm512_setzero_si512 (),  \
+					    (__mmask8)(U)))
+
+#define _mm512_ror_epi64(A, B)						  \
+    ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v8di)_mm512_undefined_si512 (),  \
+					    (__mmask8)(-1)))
+#define _mm512_mask_ror_epi64(W, U, A, B)				  \
+    ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v8di)(__m512i)(W),	  \
+					    (__mmask8)(U)))
+#define _mm512_maskz_ror_epi64(U, A, B)					  \
+    ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A),	  \
+					    (int)(B),			  \
+					    (__v8di)_mm512_setzero_si512 (),  \
+					    (__mmask8)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si)
+						 _mm512_undefined_si512 (),
+						 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si) __W,
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
+						 (__v16si) __B,
+						 (__v16si)
+						 _mm512_setzero_si512 (),
+						 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di)
+						 _mm512_undefined_si512 (),
+						 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di) __W, __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
+						 (__v8di) __B,
+						 (__v8di)
+						 _mm512_setzero_pd (),
+						 __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di) __W, __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_setzero_pd (),
+						  __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+						(__v16si) __B,
+						(__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+						(__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
+					       (__v8di) __B,
+					       (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+						 (__v16si) __B,
+						 (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+						 (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+						(__v8di) __B,
+						(__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+						(__v8di) __B, __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+						     (__v16si) __B,
+						     (__v16si)
+						     _mm512_undefined_si512 (),
+						     (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+			    __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+						     (__v16si) __B,
+						     (__v16si) __W,
+						     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+						     (__v16si) __B,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+						      (__v8di) __B,
+						      (__v8di)
+						      _mm512_undefined_si512 (),
+						      (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+						      (__v8di) __B,
+						      (__v8di) __W,
+						      (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+						      (__v8di) __B,
+						      (__v8di)
+						      _mm512_setzero_si512 (),
+						      (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+						     (__v16si) __B,
+						     (__v16si)
+						     _mm512_undefined_si512 (),
+						     (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+			    __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+						     (__v16si) __B,
+						     (__v16si) __W,
+						     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+						     (__v16si) __B,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+						      (__v8di) __B,
+						      (__v8di)
+						      _mm512_undefined_si512 (),
+						      (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+						      (__v8di) __B,
+						      (__v8di) __W,
+						      (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+						      (__v8di) __B,
+						      (__v8di)
+						      _mm512_setzero_si512 (),
+						      (__mmask8) __U);
+}
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_u64 (__m128 __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_si64 (__m128 __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_i64 (__m128 __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_u64 (__m128 __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_i64 (__m128 __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_si64 (__m128 __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R);
+}
+#else
+#define _mm_cvt_roundss_u64(A, B)   \
+    ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B))
+
+#define _mm_cvt_roundss_si64(A, B)   \
+    ((long long)__builtin_ia32_vcvtss2si64(A, B))
+
+#define _mm_cvt_roundss_i64(A, B)   \
+    ((long long)__builtin_ia32_vcvtss2si64(A, B))
+
+#define _mm_cvtt_roundss_u64(A, B)  \
+    ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B))
+
+#define _mm_cvtt_roundss_i64(A, B)  \
+    ((long long)__builtin_ia32_vcvttss2si64(A, B))
+
+#define _mm_cvtt_roundss_si64(A, B)  \
+    ((long long)__builtin_ia32_vcvttss2si64(A, B))
+#endif
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_u32 (__m128 __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_si32 (__m128 __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_i32 (__m128 __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_u32 (__m128 __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_i32 (__m128 __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_si32 (__m128 __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R);
+}
+#else
+#define _mm_cvt_roundss_u32(A, B)   \
+    ((unsigned)__builtin_ia32_vcvtss2usi32(A, B))
+
+#define _mm_cvt_roundss_si32(A, B)   \
+    ((int)__builtin_ia32_vcvtss2si32(A, B))
+
+#define _mm_cvt_roundss_i32(A, B)   \
+    ((int)__builtin_ia32_vcvtss2si32(A, B))
+
+#define _mm_cvtt_roundss_u32(A, B)  \
+    ((unsigned)__builtin_ia32_vcvttss2usi32(A, B))
+
+#define _mm_cvtt_roundss_si32(A, B)  \
+    ((int)__builtin_ia32_vcvttss2si32(A, B))
+
+#define _mm_cvtt_roundss_i32(A, B)  \
+    ((int)__builtin_ia32_vcvttss2si32(A, B))
+#endif
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_u64 (__m128d __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_si64 (__m128d __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_i64 (__m128d __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_u64 (__m128d __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_si64 (__m128d __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_i64 (__m128d __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R);
+}
+#else
+#define _mm_cvt_roundsd_u64(A, B)   \
+    ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B))
+
+#define _mm_cvt_roundsd_si64(A, B)   \
+    ((long long)__builtin_ia32_vcvtsd2si64(A, B))
+
+#define _mm_cvt_roundsd_i64(A, B)   \
+    ((long long)__builtin_ia32_vcvtsd2si64(A, B))
+
+#define _mm_cvtt_roundsd_u64(A, B)   \
+    ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B))
+
+#define _mm_cvtt_roundsd_si64(A, B)   \
+    ((long long)__builtin_ia32_vcvttsd2si64(A, B))
+
+#define _mm_cvtt_roundsd_i64(A, B)   \
+    ((long long)__builtin_ia32_vcvttsd2si64(A, B))
+#endif
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_u32 (__m128d __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_si32 (__m128d __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_i32 (__m128d __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_u32 (__m128d __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_i32 (__m128d __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_si32 (__m128d __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R);
+}
+#else
+#define _mm_cvt_roundsd_u32(A, B)   \
+    ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B))
+
+#define _mm_cvt_roundsd_si32(A, B)   \
+    ((int)__builtin_ia32_vcvtsd2si32(A, B))
+
+#define _mm_cvt_roundsd_i32(A, B)   \
+    ((int)__builtin_ia32_vcvtsd2si32(A, B))
+
+#define _mm_cvtt_roundsd_u32(A, B)   \
+    ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B))
+
+#define _mm_cvtt_roundsd_si32(A, B)   \
+    ((int)__builtin_ia32_vcvttsd2si32(A, B))
+
+#define _mm_cvtt_roundsd_i32(A, B)   \
+    ((int)__builtin_ia32_vcvttsd2si32(A, B))
+#endif
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movedup_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
+						   (__v8df)
+						   _mm512_undefined_pd (),
+						   (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
+						   (__v8df) __W,
+						   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
+						   (__v8df)
+						   _mm512_setzero_pd (),
+						   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df)
+						    _mm512_undefined_pd (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df)
+						    _mm512_setzero_pd (),
+						    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df)
+						    _mm512_undefined_pd (),
+						    (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df)
+						    _mm512_setzero_pd (),
+						    (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf)
+						   _mm512_undefined_ps (),
+						   (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf) __W,
+						   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf)
+						   _mm512_setzero_ps (),
+						   (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_pd (__m256 __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+						    (__v8df)
+						    _mm512_undefined_pd (),
+						    (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_pd (__m512d __W, __mmask8 __U, __m256 __A,
+			    const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+						    (__v8df) __W,
+						    (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_pd (__mmask8 __U, __m256 __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+						    (__v8df)
+						    _mm512_setzero_pd (),
+						    (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_ps (__m256i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+						    (__v16sf)
+						    _mm512_undefined_ps (),
+						    (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_ps (__m512 __W, __mmask16 __U, __m256i __A,
+			    const int __R)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+						    (__v16sf) __W,
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_ps (__mmask16 __U, __m256i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+						    (__v16sf)
+						    _mm512_setzero_ps (),
+						    (__mmask16) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_ph (__m512 __A, const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+						     __I,
+						     (__v16hi)
+						     _mm256_undefined_si256 (),
+						     -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_ph (__m512 __A, const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+						     __I,
+						     (__v16hi)
+						     _mm256_undefined_si256 (),
+						     -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_ph (__m256i __U, __mmask16 __W, __m512 __A,
+			    const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+						     __I,
+						     (__v16hi) __U,
+						     (__mmask16) __W);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_ph (__m256i __U, __mmask16 __W, __m512 __A, const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+						     __I,
+						     (__v16hi) __U,
+						     (__mmask16) __W);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_ph (__mmask16 __W, __m512 __A, const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+						     __I,
+						     (__v16hi)
+						     _mm256_setzero_si256 (),
+						     (__mmask16) __W);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_ph (__mmask16 __W, __m512 __A, const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+						     __I,
+						     (__v16hi)
+						     _mm256_setzero_si256 (),
+						     (__mmask16) __W);
+}
+#else
+#define _mm512_cvt_roundps_pd(A, B)		 \
+    (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, B)
+
+#define _mm512_mask_cvt_roundps_pd(W, U, A, B)   \
+    (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)(W), U, B)
+
+#define _mm512_maskz_cvt_roundps_pd(U, A, B)     \
+    (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), U, B)
+
+#define _mm512_cvt_roundph_ps(A, B)		 \
+    (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundph_ps(W, U, A, B)   \
+    (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)(W), U, B)
+
+#define _mm512_maskz_cvt_roundph_ps(U, A, B)     \
+    (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_setzero_ps(), U, B)
+
+#define _mm512_cvt_roundps_ph(A, I)						 \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\
+    (__v16hi)_mm256_undefined_si256 (), -1))
+#define _mm512_cvtps_ph(A, I)						 \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\
+    (__v16hi)_mm256_undefined_si256 (), -1))
+#define _mm512_mask_cvt_roundps_ph(U, W, A, I)				 \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\
+    (__v16hi)(__m256i)(U), (__mmask16) (W)))
+#define _mm512_mask_cvtps_ph(U, W, A, I)				 \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\
+    (__v16hi)(__m256i)(U), (__mmask16) (W)))
+#define _mm512_maskz_cvt_roundps_ph(W, A, I)					 \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\
+    (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W)))
+#define _mm512_maskz_cvtps_ph(W, A, I)					 \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) A, (int) (I),\
+    (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_ps (__m512d __A, const int __R)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+						   (__v8sf)
+						   _mm256_undefined_ps (),
+						   (__mmask8) -1, __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_ps (__m256 __W, __mmask8 __U, __m512d __A,
+			    const int __R)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+						   (__v8sf) __W,
+						   (__mmask8) __U, __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_ps (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+						   (__v8sf)
+						   _mm256_setzero_ps (),
+						   (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsd2ss_round ((__v4sf) __A,
+						 (__v2df) __B,
+						 __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_cvtss2sd_round ((__v2df) __A,
+						  (__v4sf) __B,
+						  __R);
+}
+#else
+#define _mm512_cvt_roundpd_ps(A, B)		 \
+    (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundpd_ps(W, U, A, B)   \
+    (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)(W), U, B)
+
+#define _mm512_maskz_cvt_roundpd_ps(U, A, B)     \
+    (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B)
+
+#define _mm_cvt_roundsd_ss(A, B, C)		 \
+    (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C)
+
+#define _mm_cvt_roundss_sd(A, B, C)		 \
+    (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C)
+#endif
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_si512 (__m512i * __P, __m512i __A)
+{
+  __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_ps (float *__P, __m512 __A)
+{
+  __builtin_ia32_movntps512 (__P, (__v16sf) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_pd (double *__P, __m512d __A)
+{
+  __builtin_ia32_movntpd512 (__P, (__v8df) __A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_load_si512 (void *__P)
+{
+  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A,
+						    (__v4sf) __B,
+						    __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A,
+						     (__v2df) __B,
+						     __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_round_ps (__m512 __A, const int __R)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+						   (__v16sf)
+						   _mm512_undefined_ps (),
+						   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+			     const int __R)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+						   (__v16sf) __W,
+						   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_round_ps (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+						   (__v16sf)
+						   _mm512_setzero_ps (),
+						   (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_round_pd (__m512d __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+						    (__v8df)
+						    _mm512_undefined_pd (),
+						    (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+			     const int __R)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+						    (__v8df) __W,
+						    (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_round_pd (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+						    (__v8df)
+						    _mm512_setzero_pd (),
+						    (__mmask8) __U, __R);
+}
+
+/* Constants for mantissa extraction */
+typedef enum
+{
+  _MM_MANT_NORM_1_2,		/* interval [1, 2)      */
+  _MM_MANT_NORM_p5_2,		/* interval [0.5, 2)    */
+  _MM_MANT_NORM_p5_1,		/* interval [0.5, 1)    */
+  _MM_MANT_NORM_p75_1p5		/* interval [0.75, 1.5) */
+} _MM_MANTISSA_NORM_ENUM;
+
+typedef enum
+{
+  _MM_MANT_SIGN_src,		/* sign = sign(SRC)     */
+  _MM_MANT_SIGN_zero,		/* sign = 0             */
+  _MM_MANT_SIGN_nan		/* DEST = NaN if sign(SRC) = 1 */
+} _MM_MANTISSA_SIGN_ENUM;
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_round_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
+			 _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+						     (__C << 2) | __B,
+						     _mm512_undefined_pd (),
+						     (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+			      _MM_MANTISSA_NORM_ENUM __B,
+			      _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+						     (__C << 2) | __B,
+						     (__v8df) __W, __U,
+						     __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_round_pd (__mmask8 __U, __m512d __A,
+			       _MM_MANTISSA_NORM_ENUM __B,
+			       _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+						     (__C << 2) | __B,
+						     (__v8df)
+						     _mm512_setzero_pd (),
+						     __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_round_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
+			 _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+						    (__C << 2) | __B,
+						    _mm512_undefined_ps (),
+						    (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+			      _MM_MANTISSA_NORM_ENUM __B,
+			      _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+						    (__C << 2) | __B,
+						    (__v16sf) __W, __U,
+						    __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_round_ps (__mmask16 __U, __m512 __A,
+			       _MM_MANTISSA_NORM_ENUM __B,
+			       _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+						    (__C << 2) | __B,
+						    (__v16sf)
+						    _mm512_setzero_ps (),
+						    __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_sd (__m128d __A, __m128d __B,
+		      _MM_MANTISSA_NORM_ENUM __C,
+		      _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A,
+						  (__v2df) __B,
+						  (__D << 2) | __C,
+						   __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_ss (__m128 __A, __m128 __B,
+		      _MM_MANTISSA_NORM_ENUM __C,
+		      _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A,
+						  (__v4sf) __B,
+						  (__D << 2) | __C,
+						  __R);
+}
+
+#else
+#define _mm512_getmant_round_pd(X, B, C, R)                                                  \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)(__m512d)_mm512_undefined_pd(), \
+                                              (__mmask8)-1,\
+					      (R)))
+
+#define _mm512_mask_getmant_round_pd(W, U, X, B, C, R)                                       \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)(__m512d)(W),                 \
+                                              (__mmask8)(U),\
+					      (R)))
+
+#define _mm512_maskz_getmant_round_pd(U, X, B, C, R)                                         \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)(__m512d)_mm512_setzero_pd(), \
+                                              (__mmask8)(U),\
+					      (R)))
+#define _mm512_getmant_round_ps(X, B, C, R)                                                  \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)(__m512)_mm512_undefined_ps(), \
+                                             (__mmask16)-1,\
+					     (R)))
+
+#define _mm512_mask_getmant_round_ps(W, U, X, B, C, R)                                       \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)(__m512)(W),                  \
+                                             (__mmask16)(U),\
+					     (R)))
+
+#define _mm512_maskz_getmant_round_ps(U, X, B, C, R)                                         \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)(__m512)_mm512_setzero_ps(),  \
+                                             (__mmask16)(U),\
+					     (R)))
+#define _mm_getmant_round_sd(X, Y, C, D, R)                                                  \
+  ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X),                    \
+					    (__v2df)(__m128d)(Y),	\
+					    (int)(((D)<<2) | (C)),	\
+					    (R)))
+
+#define _mm_getmant_round_ss(X, Y, C, D, R)                                                  \
+  ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X),                      \
+					   (__v4sf)(__m128)(Y),		\
+					   (int)(((D)<<2) | (C)),	\
+					   (R)))
+
+#define _mm_getexp_round_ss(A, B, R)						      \
+  ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), R))
+
+#define _mm_getexp_round_sd(A, B, R)						       \
+  ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), R))
+
+#define _mm512_getexp_round_ps(A, R)						\
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
+  (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, R))
+
+#define _mm512_mask_getexp_round_ps(W, U, A, R)					\
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
+  (__v16sf)(__m512)(W), (__mmask16)(U), R))
+
+#define _mm512_maskz_getexp_round_ps(U, A, R)					\
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
+  (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), R))
+
+#define _mm512_getexp_round_pd(A, R)						\
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
+  (__v8df)_mm512_undefined_pd(), (__mmask8)-1, R))
+
+#define _mm512_mask_getexp_round_pd(W, U, A, R)					\
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
+  (__v8df)(__m512d)(W), (__mmask8)(U), R))
+
+#define _mm512_maskz_getexp_round_pd(U, A, R)					\
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
+  (__v8df)_mm512_setzero_pd(), (__mmask8)(U), R))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_ps (__m512 __A, const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
+						  (__v16sf)
+						  _mm512_undefined_ps (),
+						  -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_ps (__m512 __A, __mmask16 __B, __m512 __C,
+				 const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm,
+						  (__v16sf) __A,
+						  (__mmask16) __B, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_ps (__mmask16 __A, __m512 __B,
+				  const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B,
+						  __imm,
+						  (__v16sf)
+						  _mm512_setzero_ps (),
+						  (__mmask16) __A, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_pd (__m512d __A, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
+						   (__v8df)
+						   _mm512_undefined_pd (),
+						   -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_pd (__m512d __A, __mmask8 __B,
+				 __m512d __C, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm,
+						   (__v8df) __A,
+						   (__mmask8) __B, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_pd (__mmask8 __A, __m512d __B,
+				  const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B,
+						   __imm,
+						   (__v8df)
+						   _mm512_setzero_pd (),
+						   (__mmask8) __A, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_ss (__m128 __A, __m128 __B, const int __imm, const int __R)
+{
+  return (__m128) __builtin_ia32_rndscaless_round ((__v4sf) __A,
+						   (__v4sf) __B, __imm, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_sd (__m128d __A, __m128d __B, const int __imm,
+			 const int __R)
+{
+  return (__m128d) __builtin_ia32_rndscalesd_round ((__v2df) __A,
+						    (__v2df) __B, __imm, __R);
+}
+
+#else
+#define _mm512_roundscale_round_ps(A, B, R) \
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\
+    (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), R))
+#define _mm512_mask_roundscale_round_ps(A, B, C, D, R)				\
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C),	\
+					    (int)(D),			\
+					    (__v16sf)(__m512)(A),	\
+					    (__mmask16)(B), R))
+#define _mm512_maskz_roundscale_round_ps(A, B, C, R)				\
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B),	\
+					    (int)(C),			\
+					    (__v16sf)_mm512_setzero_ps(),\
+					    (__mmask16)(A), R))
+#define _mm512_roundscale_round_pd(A, B, R) \
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\
+    (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), R))
+#define _mm512_mask_roundscale_round_pd(A, B, C, D, R)				\
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C),	\
+					     (int)(D),			\
+					     (__v8df)(__m512d)(A),	\
+					     (__mmask8)(B), R))
+#define _mm512_maskz_roundscale_round_pd(A, B, C, R)				\
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B),	\
+					     (int)(C),			\
+					     (__v8df)_mm512_setzero_pd(),\
+					     (__mmask8)(A), R))
+#define _mm_roundscale_round_ss(A, B, C, R)					\
+  ((__m128) __builtin_ia32_rndscaless_round ((__v4sf)(__m128)(A),	\
+    (__v4sf)(__m128)(B), (int)(C), R))
+#define _mm_roundscale_round_sd(A, B, C, R)					\
+  ((__m128d) __builtin_ia32_rndscalesd_round ((__v2df)(__m128d)(A),	\
+    (__v2df)(__m128d)(B), (int)(C), R))
+#endif
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_floor_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+						  _MM_FROUND_FLOOR,
+						  (__v16sf) __A, -1,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_floor_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+						   _MM_FROUND_FLOOR,
+						   (__v8df) __A, -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ceil_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+						  _MM_FROUND_CEIL,
+						  (__v16sf) __A, -1,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ceil_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+						   _MM_FROUND_CEIL,
+						   (__v8df) __A, -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+						  _MM_FROUND_FLOOR,
+						  (__v16sf) __W, __U,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+						   _MM_FROUND_FLOOR,
+						   (__v8df) __W, __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+						  _MM_FROUND_CEIL,
+						  (__v16sf) __W, __U,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+						   _MM_FROUND_CEIL,
+						   (__v8df) __W, __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_alignr_epi32 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+						  (__v16si) __B, __imm,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_alignr_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+			  __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+						  (__v16si) __B, __imm,
+						  (__v16si) __W,
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_alignr_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+			   const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+						  (__v16si) __B, __imm,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_alignr_epi64 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+						  (__v8di) __B, __imm,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_alignr_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+			  __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+						  (__v8di) __B, __imm,
+						  (__v8di) __W,
+						  (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_alignr_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
+			   const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+						  (__v8di) __B, __imm,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  (__mmask8) __U);
+}
+#else
+#define _mm512_alignr_epi32(X, Y, C)                                        \
+    ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X),         \
+        (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_undefined_si512 (),\
+        (__mmask16)-1))
+
+#define _mm512_mask_alignr_epi32(W, U, X, Y, C)                             \
+    ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X),         \
+        (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(W),             \
+        (__mmask16)(U)))
+
+#define _mm512_maskz_alignr_epi32(U, X, Y, C)                               \
+    ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X),         \
+        (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_setzero_si512 (),\
+        (__mmask16)(U)))
+
+#define _mm512_alignr_epi64(X, Y, C)                                        \
+    ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X),          \
+        (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_undefined_si512 (),  \
+	(__mmask8)-1))
+
+#define _mm512_mask_alignr_epi64(W, U, X, Y, C)                             \
+    ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X),          \
+        (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U)))
+
+#define _mm512_maskz_alignr_epi64(U, X, Y, C)                               \
+    ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X),          \
+        (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_setzero_si512 (),\
+        (__mmask8)(U)))
+#endif
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A,
+						     (__v16si) __B,
+						     (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A,
+						     (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A,
+						    (__v8di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A,
+						    (__v8di) __B,
+						    (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A,
+						     (__v16si) __B,
+						     (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A,
+						     (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A,
+						    (__v8di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A,
+						    (__v8di) __B,
+						    (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+						    (__v16si) __Y, 5,
+						    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+						    (__v16si) __Y, 5,
+						    (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+						    (__v8di) __Y, 5,
+						    (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+						    (__v8di) __Y, 5,
+						    (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+						    (__v16si) __Y, 2,
+						    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+						    (__v16si) __Y, 2,
+						    (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+						    (__v8di) __Y, 2,
+						    (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+						    (__v8di) __Y, 2,
+						    (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+						    (__v16si) __Y, 1,
+						    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+						    (__v16si) __Y, 1,
+						    (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+						    (__v8di) __Y, 1,
+						    (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+						    (__v8di) __Y, 1,
+						    (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+						    (__v16si) __Y, 4,
+						    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+						    (__v16si) __Y, 4,
+						    (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+						    (__v8di) __Y, 4,
+						    (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+						    (__v8di) __Y, 4,
+						    (__mmask8) -1);
+}
+
+#define _MM_CMPINT_EQ	    0x0
+#define _MM_CMPINT_LT	    0x1
+#define _MM_CMPINT_LE	    0x2
+#define _MM_CMPINT_UNUSED   0x3
+#define _MM_CMPINT_NE	    0x4
+#define _MM_CMPINT_NLT	    0x5
+#define _MM_CMPINT_GE	    0x5
+#define _MM_CMPINT_NLE	    0x6
+#define _MM_CMPINT_GT	    0x6
+
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi64_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+						 (__v8di) __Y, __P,
+						 (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi32_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+						  (__v16si) __Y, __P,
+						  (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu64_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+						  (__v8di) __Y, __P,
+						  (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu32_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+						   (__v16si) __Y, __P,
+						   (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_pd_mask (__m512d __X, __m512d __Y, const int __P,
+			  const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+						  (__v8df) __Y, __P,
+						  (__mmask8) -1, __R);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_ps_mask (__m512 __X, __m512 __Y, const int __P, const int __R)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+						   (__v16sf) __Y, __P,
+						   (__mmask16) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi64_mask (__mmask8 __U, __m512i __X, __m512i __Y,
+			    const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+						 (__v8di) __Y, __P,
+						 (__mmask8) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi32_mask (__mmask16 __U, __m512i __X, __m512i __Y,
+			    const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+						  (__v16si) __Y, __P,
+						  (__mmask16) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu64_mask (__mmask8 __U, __m512i __X, __m512i __Y,
+			    const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+						  (__v8di) __Y, __P,
+						  (__mmask8) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu32_mask (__mmask16 __U, __m512i __X, __m512i __Y,
+			    const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+						   (__v16si) __Y, __P,
+						   (__mmask16) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y,
+			       const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+						  (__v8df) __Y, __P,
+						  (__mmask8) __U, __R);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y,
+			       const int __P, const int __R)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+						   (__v16sf) __Y, __P,
+						   (__mmask16) __U, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_sd_mask (__m128d __X, __m128d __Y, const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+					       (__v2df) __Y, __P,
+					       (__mmask8) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y,
+			    const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+					       (__v2df) __Y, __P,
+					       (__mmask8) __M, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_ss_mask (__m128 __X, __m128 __Y, const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+					       (__v4sf) __Y, __P,
+					       (__mmask8) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y,
+			    const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+					       (__v4sf) __Y, __P,
+					       (__mmask8) __M, __R);
+}
+
+#else
+#define _mm512_cmp_epi64_mask(X, Y, P)					\
+  ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X),	\
+					   (__v8di)(__m512i)(Y), (int)(P),\
+					   (__mmask8)-1))
+
+#define _mm512_cmp_epi32_mask(X, Y, P)					\
+  ((__mmask8) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X),	\
+					   (__v16si)(__m512i)(Y), (int)(P),\
+					   (__mmask16)-1))
+
+#define _mm512_cmp_epu64_mask(X, Y, P)					\
+  ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X),	\
+					    (__v8di)(__m512i)(Y), (int)(P),\
+					    (__mmask8)-1))
+
+#define _mm512_cmp_epu32_mask(X, Y, P)					\
+  ((__mmask8) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X),	\
+					    (__v16si)(__m512i)(Y), (int)(P),\
+					    (__mmask16)-1))
+
+#define _mm512_cmp_round_pd_mask(X, Y, P, R)					\
+  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),	\
+					    (__v8df)(__m512d)(Y), (int)(P),\
+					    (__mmask8)-1, R))
+
+#define _mm512_cmp_round_ps_mask(X, Y, P, R)					\
+  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),	\
+					     (__v16sf)(__m512)(Y), (int)(P),\
+					     (__mmask16)-1, R))
+
+#define _mm512_mask_cmp_epi64_mask(M, X, Y, P)					\
+  ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X),	\
+					   (__v8di)(__m512i)(Y), (int)(P),\
+					   (__mmask8)M))
+
+#define _mm512_mask_cmp_epi32_mask(M, X, Y, P)					\
+  ((__mmask8) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X),	\
+					   (__v16si)(__m512i)(Y), (int)(P),\
+					   (__mmask16)M))
+
+#define _mm512_mask_cmp_epu64_mask(M, X, Y, P)					\
+  ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X),	\
+					    (__v8di)(__m512i)(Y), (int)(P),\
+					    (__mmask8)M))
+
+#define _mm512_mask_cmp_epu32_mask(M, X, Y, P)					\
+  ((__mmask8) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X),	\
+					    (__v16si)(__m512i)(Y), (int)(P),\
+					    (__mmask16)M))
+
+#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R)					\
+  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),	\
+					    (__v8df)(__m512d)(Y), (int)(P),\
+					    (__mmask8)M, R))
+
+#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R)					\
+  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),	\
+					     (__v16sf)(__m512)(Y), (int)(P),\
+					     (__mmask16)M, R))
+
+#define _mm_cmp_round_sd_mask(X, Y, P, R)					\
+  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),		\
+					 (__v2df)(__m128d)(Y), (int)(P),\
+					 (__mmask8)-1, R))
+
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R)					\
+  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),		\
+					 (__v2df)(__m128d)(Y), (int)(P),\
+					 (M), R))
+
+#define _mm_cmp_round_ss_mask(X, Y, P, R)					\
+  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),		\
+					 (__v4sf)(__m128)(Y), (int)(P), \
+					 (__mmask8)-1, R))
+
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R)					\
+  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),		\
+					 (__v4sf)(__m128)(Y), (int)(P), \
+					 (M), R))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_ps (__m512i __index, float const *__addr, int __scale)
+{
+  __m512 v1_old = _mm512_undefined_ps ();
+  __mmask16 mask = 0xFFFF;
+
+  return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) v1_old,
+						__addr,
+						(__v16si) __index,
+						mask, __scale);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_ps (__m512 v1_old, __mmask16 __mask,
+			  __m512i __index, float const *__addr, int __scale)
+{
+  return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) v1_old,
+						__addr,
+						(__v16si) __index,
+						__mask, __scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_pd (__m256i __index, double const *__addr, int __scale)
+{
+  __m512d v1_old = _mm512_undefined_pd ();
+  __mmask8 mask = 0xFF;
+
+  return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) v1_old,
+						__addr,
+						(__v8si) __index, mask,
+						__scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_pd (__m512d __v1_old, __mmask8 __mask,
+			  __m256i __index, double const *__addr, int __scale)
+{
+  return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old,
+						__addr,
+						(__v8si) __index,
+						__mask, __scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_ps (__m512i __index, float const *__addr, int __scale)
+{
+  __m256 v1_old = _mm256_undefined_ps ();
+  __mmask8 mask = 0xFF;
+
+  return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) v1_old,
+						__addr,
+						(__v8di) __index, mask,
+						__scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_ps (__m256 __v1_old, __mmask8 __mask,
+			  __m512i __index, float const *__addr, int __scale)
+{
+  return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,
+						__addr,
+						(__v8di) __index,
+						__mask, __scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_pd (__m512i __index, double const *__addr, int __scale)
+{
+  __m512d v1_old = _mm512_undefined_pd ();
+  __mmask8 mask = 0xFF;
+
+  return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) v1_old,
+						__addr,
+						(__v8di) __index, mask,
+						__scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_pd (__m512d __v1_old, __mmask8 __mask,
+			  __m512i __index, double const *__addr, int __scale)
+{
+  return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old,
+						__addr,
+						(__v8di) __index,
+						__mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_epi32 (__m512i __index, int const *__addr, int __scale)
+{
+  __m512i v1_old = _mm512_undefined_si512 ();
+  __mmask16 mask = 0xFFFF;
+
+  return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) v1_old,
+						 __addr,
+						 (__v16si) __index,
+						 mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_epi32 (__m512i __v1_old, __mmask16 __mask,
+			     __m512i __index, int const *__addr, int __scale)
+{
+  return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old,
+						 __addr,
+						 (__v16si) __index,
+						 __mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_epi64 (__m256i __index, long long const *__addr, int __scale)
+{
+  __m512i v1_old = _mm512_undefined_si512 ();
+  __mmask8 mask = 0xFF;
+
+  return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) v1_old,
+						__addr,
+						(__v8si) __index, mask,
+						__scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_epi64 (__m512i __v1_old, __mmask8 __mask,
+			     __m256i __index, long long const *__addr,
+			     int __scale)
+{
+  return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old,
+						__addr,
+						(__v8si) __index,
+						__mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_epi32 (__m512i __index, int const *__addr, int __scale)
+{
+  __m256i v1_old = _mm256_undefined_si256 ();
+  __mmask8 mask = 0xFF;
+
+  return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) v1_old,
+						 __addr,
+						 (__v8di) __index,
+						 mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_epi32 (__m256i __v1_old, __mmask8 __mask,
+			     __m512i __index, int const *__addr, int __scale)
+{
+  return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old,
+						 __addr,
+						 (__v8di) __index,
+						 __mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_epi64 (__m512i __index, long long const *__addr, int __scale)
+{
+  __m512i v1_old = _mm512_undefined_si512 ();
+  __mmask8 mask = 0xFF;
+
+  return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) v1_old,
+						__addr,
+						(__v8di) __index, mask,
+						__scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_epi64 (__m512i __v1_old, __mmask8 __mask,
+			     __m512i __index, long long const *__addr,
+			     int __scale)
+{
+  return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old,
+						__addr,
+						(__v8di) __index,
+						__mask, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_ps (float *__addr, __m512i __index, __m512 __v1, int __scale)
+{
+  __builtin_ia32_scattersiv16sf (__addr, (__mmask16) 0xFFFF,
+				 (__v16si) __index, (__v16sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_ps (float *__addr, __mmask16 __mask,
+			   __m512i __index, __m512 __v1, int __scale)
+{
+  __builtin_ia32_scattersiv16sf (__addr, __mask, (__v16si) __index,
+				 (__v16sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_pd (double *__addr, __m256i __index, __m512d __v1,
+		      int __scale)
+{
+  __builtin_ia32_scattersiv8df (__addr, (__mmask8) 0xFF,
+				(__v8si) __index, (__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_pd (double *__addr, __mmask8 __mask,
+			   __m256i __index, __m512d __v1, int __scale)
+{
+  __builtin_ia32_scattersiv8df (__addr, __mask, (__v8si) __index,
+				(__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_ps (float *__addr, __m512i __index, __m256 __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv16sf (__addr, (__mmask8) 0xFF,
+				 (__v8di) __index, (__v8sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_ps (float *__addr, __mmask8 __mask,
+			   __m512i __index, __m256 __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv16sf (__addr, __mask, (__v8di) __index,
+				 (__v8sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_pd (double *__addr, __m512i __index, __m512d __v1,
+		      int __scale)
+{
+  __builtin_ia32_scatterdiv8df (__addr, (__mmask8) 0xFF,
+				(__v8di) __index, (__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_pd (double *__addr, __mmask8 __mask,
+			   __m512i __index, __m512d __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv8df (__addr, __mask, (__v8di) __index,
+				(__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_epi32 (int *__addr, __m512i __index,
+			 __m512i __v1, int __scale)
+{
+  __builtin_ia32_scattersiv16si (__addr, (__mmask16) 0xFFFF,
+				 (__v16si) __index, (__v16si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_epi32 (int *__addr, __mmask16 __mask,
+			      __m512i __index, __m512i __v1, int __scale)
+{
+  __builtin_ia32_scattersiv16si (__addr, __mask, (__v16si) __index,
+				 (__v16si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_epi64 (long long *__addr, __m256i __index,
+			 __m512i __v1, int __scale)
+{
+  __builtin_ia32_scattersiv8di (__addr, (__mmask8) 0xFF,
+				(__v8si) __index, (__v8di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_epi64 (long long *__addr, __mmask8 __mask,
+			      __m256i __index, __m512i __v1, int __scale)
+{
+  __builtin_ia32_scattersiv8di (__addr, __mask, (__v8si) __index,
+				(__v8di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_epi32 (int *__addr, __m512i __index,
+			 __m256i __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv16si (__addr, (__mmask8) 0xFF,
+				 (__v8di) __index, (__v8si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_epi32 (int *__addr, __mmask8 __mask,
+			      __m512i __index, __m256i __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv16si (__addr, __mask, (__v8di) __index,
+				 (__v8si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_epi64 (long long *__addr, __m512i __index,
+			 __m512i __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv8di (__addr, (__mmask8) 0xFF,
+				(__v8di) __index, (__v8di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_epi64 (long long *__addr, __mmask8 __mask,
+			      __m512i __index, __m512i __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv8di (__addr, __mask, (__v8di) __index,
+				(__v8di) __v1, __scale);
+}
+#else
+#define _mm512_i32gather_ps(INDEX, ADDR, SCALE)				\
+  (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)_mm512_undefined_ps(),\
+					 (float const *)ADDR,		\
+					 (__v16si)(__m512i)INDEX,	\
+					 (__mmask16)0xFFFF, (int)SCALE)
+
+#define _mm512_mask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE)	\
+  (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)(__m512)V1OLD,	\
+					 (float const *)ADDR,		\
+					 (__v16si)(__m512i)INDEX,	\
+					 (__mmask16)MASK, (int)SCALE)
+
+#define _mm512_i32gather_pd(INDEX, ADDR, SCALE)				\
+  (__m512d) __builtin_ia32_gathersiv8df ((__v8df)_mm512_undefined_pd(),	\
+					 (double const *)ADDR,		\
+					 (__v8si)(__m256i)INDEX,	\
+					 (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE)	\
+  (__m512d) __builtin_ia32_gathersiv8df ((__v8df)(__m512d)V1OLD,	\
+					 (double const *)ADDR,		\
+					 (__v8si)(__m256i)INDEX,	\
+					 (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i64gather_ps(INDEX, ADDR, SCALE)				\
+  (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)_mm256_undefined_ps(),	\
+					 (float const *)ADDR,		\
+					 (__v8di)(__m512i)INDEX,	\
+					 (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE)	\
+  (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)(__m256)V1OLD,		\
+					 (float const *)ADDR,		\
+					 (__v8di)(__m512i)INDEX,	\
+					 (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i64gather_pd(INDEX, ADDR, SCALE)				\
+  (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)_mm512_undefined_pd(),	\
+					 (double const *)ADDR,		\
+					 (__v8di)(__m512i)INDEX,	\
+					 (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE)	\
+  (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)(__m512d)V1OLD,	\
+					 (double const *)ADDR,		\
+					 (__v8di)(__m512i)INDEX,	\
+					 (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i32gather_epi32(INDEX, ADDR, SCALE)			\
+  (__m512i) __builtin_ia32_gathersiv16si ((__v16si)_mm512_undefined_si512 (),	\
+					  (int const *)ADDR,		\
+					  (__v16si)(__m512i)INDEX,	\
+					  (__mmask16)0xFFFF, (int)SCALE)
+
+#define _mm512_mask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE)	\
+  (__m512i) __builtin_ia32_gathersiv16si ((__v16si)(__m512i)V1OLD,	\
+					  (int const *)ADDR,		\
+					  (__v16si)(__m512i)INDEX,	\
+					  (__mmask16)MASK, (int)SCALE)
+
+#define _mm512_i32gather_epi64(INDEX, ADDR, SCALE)			\
+  (__m512i) __builtin_ia32_gathersiv8di ((__v8di)_mm512_undefined_si512 (),	\
+					 (long long const *)ADDR,	\
+					 (__v8si)(__m256i)INDEX,	\
+					 (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE)	\
+  (__m512i) __builtin_ia32_gathersiv8di ((__v8di)(__m512i)V1OLD,	\
+					 (long long const *)ADDR,	\
+					 (__v8si)(__m256i)INDEX,	\
+					 (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i64gather_epi32(INDEX, ADDR, SCALE)			  \
+  (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)_mm256_undefined_si256(), \
+					  (int const *)ADDR,		  \
+					  (__v8di)(__m512i)INDEX,	  \
+					  (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE)	\
+  (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)(__m256i)V1OLD,	\
+					  (int const *)ADDR,		\
+					  (__v8di)(__m512i)INDEX,	\
+					  (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i64gather_epi64(INDEX, ADDR, SCALE)			\
+  (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)_mm512_undefined_si512 (),	\
+					 (long long const *)ADDR,	\
+					 (__v8di)(__m512i)INDEX,	\
+					 (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE)	\
+  (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)(__m512i)V1OLD,	\
+					 (long long const *)ADDR,	\
+					 (__v8di)(__m512i)INDEX,	\
+					 (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i32scatter_ps(ADDR, INDEX, V1, SCALE)			\
+  __builtin_ia32_scattersiv16sf ((float *)ADDR, (__mmask16)0xFFFF,	\
+				 (__v16si)(__m512i)INDEX,		\
+				 (__v16sf)(__m512)V1, (int)SCALE)
+
+#define _mm512_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE)		\
+  __builtin_ia32_scattersiv16sf ((float *)ADDR, (__mmask16)MASK,		\
+				 (__v16si)(__m512i)INDEX,		\
+				 (__v16sf)(__m512)V1, (int)SCALE)
+
+#define _mm512_i32scatter_pd(ADDR, INDEX, V1, SCALE)			\
+  __builtin_ia32_scattersiv8df ((double *)ADDR, (__mmask8)0xFF,		\
+				(__v8si)(__m256i)INDEX,			\
+				(__v8df)(__m512d)V1, (int)SCALE)
+
+#define _mm512_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE)		\
+  __builtin_ia32_scattersiv8df ((double *)ADDR, (__mmask8)MASK,		\
+				(__v8si)(__m256i)INDEX,			\
+				(__v8df)(__m512d)V1, (int)SCALE)
+
+#define _mm512_i64scatter_ps(ADDR, INDEX, V1, SCALE)			\
+  __builtin_ia32_scatterdiv16sf ((float *)ADDR, (__mmask8)0xFF,		\
+				 (__v8di)(__m512i)INDEX,		\
+				 (__v8sf)(__m256)V1, (int)SCALE)
+
+#define _mm512_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE)		\
+  __builtin_ia32_scatterdiv16sf ((float *)ADDR, (__mmask16)MASK,		\
+				 (__v8di)(__m512i)INDEX,		\
+				 (__v8sf)(__m256)V1, (int)SCALE)
+
+#define _mm512_i64scatter_pd(ADDR, INDEX, V1, SCALE)			\
+  __builtin_ia32_scatterdiv8df ((double *)ADDR, (__mmask8)0xFF,		\
+				(__v8di)(__m512i)INDEX,			\
+				(__v8df)(__m512d)V1, (int)SCALE)
+
+#define _mm512_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE)		\
+  __builtin_ia32_scatterdiv8df ((double *)ADDR, (__mmask8)MASK,		\
+				(__v8di)(__m512i)INDEX,			\
+				(__v8df)(__m512d)V1, (int)SCALE)
+
+#define _mm512_i32scatter_epi32(ADDR, INDEX, V1, SCALE)			\
+  __builtin_ia32_scattersiv16si ((int *)ADDR, (__mmask16)0xFFFF,	\
+				 (__v16si)(__m512i)INDEX,		\
+				 (__v16si)(__m512i)V1, (int)SCALE)
+
+#define _mm512_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE)	\
+  __builtin_ia32_scattersiv16si ((int *)ADDR, (__mmask16)MASK,		\
+				 (__v16si)(__m512i)INDEX,		\
+				 (__v16si)(__m512i)V1, (int)SCALE)
+
+#define _mm512_i32scatter_epi64(ADDR, INDEX, V1, SCALE)			\
+  __builtin_ia32_scattersiv8di ((long long *)ADDR, (__mmask8)0xFF,	\
+				(__v8si)(__m256i)INDEX,			\
+				(__v8di)(__m512i)V1, (int)SCALE)
+
+#define _mm512_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE)	\
+  __builtin_ia32_scattersiv8di ((long long *)ADDR, (__mmask8)MASK,	\
+				(__v8si)(__m256i)INDEX,			\
+				(__v8di)(__m512i)V1, (int)SCALE)
+
+#define _mm512_i64scatter_epi32(ADDR, INDEX, V1, SCALE)			\
+  __builtin_ia32_scatterdiv16si ((int *)ADDR, (__mmask8)0xFF,		\
+				 (__v8di)(__m512i)INDEX,		\
+				 (__v8si)(__m256i)V1, (int)SCALE)
+
+#define _mm512_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE)	\
+  __builtin_ia32_scatterdiv16si ((int *)ADDR, (__mmask8)MASK,		\
+				 (__v8di)(__m512i)INDEX,		\
+				 (__v8si)(__m256i)V1, (int)SCALE)
+
+#define _mm512_i64scatter_epi64(ADDR, INDEX, V1, SCALE)			\
+  __builtin_ia32_scatterdiv8di ((long long *)ADDR, (__mmask8)0xFF,	\
+				(__v8di)(__m512i)INDEX,			\
+				(__v8di)(__m512i)V1, (int)SCALE)
+
+#define _mm512_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE)	\
+  __builtin_ia32_scatterdiv8di ((long long *)ADDR, (__mmask8)MASK,	\
+				(__v8di)(__m512i)INDEX,			\
+				(__v8di)(__m512i)V1, (int)SCALE)
+#endif
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+						      (__v8df) __W,
+						      (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+						      (__v8df)
+						      _mm512_setzero_pd (),
+						      (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
+					  (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+						     (__v16sf) __W,
+						     (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+						     (__v16sf)
+						     _mm512_setzero_ps (),
+						     (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
+					  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+						      (__v8di) __W,
+						      (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+						      (__v8di)
+						      _mm512_setzero_si512 (),
+						      (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
+					  (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+						      (__v16si) __W,
+						      (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+						      (__v16si)
+						      _mm512_setzero_si512 (),
+						      (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
+					  (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+						    (__v8df) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_maskz ((__v8df) __A,
+						     (__v8df)
+						     _mm512_setzero_pd (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *) __P,
+							(__v8df) __W,
+							(__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_maskz ((const __v8df *) __P,
+							 (__v8df)
+							 _mm512_setzero_pd (),
+							 (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+						   (__v16sf) __W,
+						   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_maskz ((__v16sf) __A,
+						    (__v16sf)
+						    _mm512_setzero_ps (),
+						    (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *) __P,
+						       (__v16sf) __W,
+						       (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_ps (__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_maskz ((const __v16sf *) __P,
+							(__v16sf)
+							_mm512_setzero_ps (),
+							(__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+						    (__v8di) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_maskz ((__v8di) __A,
+						     (__v8di)
+						     _mm512_setzero_si512 (),
+						     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *) __P,
+							(__v8di) __W,
+							(__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m512i)
+	 __builtin_ia32_expandloaddi512_maskz ((const __v8di *) __P,
+					       (__v8di)
+					       _mm512_setzero_si512 (),
+					       (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+						    (__v16si) __W,
+						    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_maskz ((__v16si) __A,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *) __P,
+							(__v16si) __W,
+							(__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_maskz ((const __v16si *) __P,
+							 (__v16si)
+							 _mm512_setzero_si512
+							 (), (__mmask16) __U);
+}
+
+/* Mask arithmetic operations */
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kand (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kandn (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kortestz (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kortestzhi ((__mmask16) __A,
+						(__mmask16) __B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kortestc (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kortestchi ((__mmask16) __A,
+						(__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kxnor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kxor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_knot (__mmask16 __A)
+{
+  return (__mmask16) __builtin_ia32_knothi ((__mmask16) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_inserti32x4 (__mmask16 __B, __m512i __C, __m128i __D,
+			  const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C,
+						    (__v4si) __D,
+						    __imm,
+						    (__v16si)
+						    _mm512_setzero_si512 (),
+						    __B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_insertf32x4 (__mmask16 __B, __m512 __C, __m128 __D,
+			  const int __imm)
+{
+  return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C,
+						   (__v4sf) __D,
+						   __imm,
+						   (__v16sf)
+						   _mm512_setzero_ps (), __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_inserti32x4 (__m512i __A, __mmask16 __B, __m512i __C,
+			 __m128i __D, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C,
+						    (__v4si) __D,
+						    __imm,
+						    (__v16si) __A,
+						    __B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_insertf32x4 (__m512 __A, __mmask16 __B, __m512 __C,
+			 __m128 __D, const int __imm)
+{
+  return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C,
+						   (__v4sf) __D,
+						   __imm,
+						   (__v16sf) __A, __B);
+}
+#else
+#define _mm512_maskz_insertf32x4(A, X, Y, C)                            \
+  ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X),     \
+    (__v4sf)(__m128) (Y), (int) (C), (__v16sf)_mm512_setzero_ps(),      \
+    (__mmask8)(A)))
+
+#define _mm512_maskz_inserti32x4(A, X, Y, C)                            \
+  ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X),   \
+    (__v4si)(__m128i) (Y), (int) (C), (__v16si)_mm512_setzero_si512 (),     \
+    (__mmask8)(A)))
+
+#define _mm512_mask_insertf32x4(A, B, X, Y, C)                          \
+  ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X),     \
+    (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (A),             \
+					     (__mmask8)(B)))
+
+#define _mm512_mask_inserti32x4(A, B, X, Y, C)                          \
+  ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X),   \
+    (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (A),           \
+					      (__mmask8)(B)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epu64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epu64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_undefined_si512 (),
+						  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+						  (__v8di) __B,
+						  (__v8di)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epu32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epu32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_undefined_si512 (),
+						  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si)
+						  _mm512_setzero_si512 (),
+						  __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+						  (__v16si) __B,
+						  (__v16si) __W, __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf)
+						   _mm512_undefined_ps (),
+						   (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf) __W,
+						   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf)
+						   _mm512_setzero_ps (),
+						   (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+					       (__v2df) __B,
+					       __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+					      (__v4sf) __B,
+					      __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+					       (__v2df) __B,
+					       __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+					      (__v4sf) __B,
+					      __R);
+}
+
+#else
+#define _mm_max_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_addsd_round(A, B, C)
+
+#define _mm_max_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_addss_round(A, B, C)
+
+#define _mm_min_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_subsd_round(A, B, C)
+
+#define _mm_min_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_subss_round(A, B, C)
+#endif
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_pd (__mmask8 __U, __m512d __A, __m512d __W)
+{
+  return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
+						     (__v8df) __W,
+						     (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_ps (__mmask16 __U, __m512 __A, __m512 __W)
+{
+  return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
+						    (__v16sf) __W,
+						    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi64 (__mmask8 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
+						    (__v8di) __W,
+						    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi32 (__mmask16 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
+						    (__v16si) __W,
+						    (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+						   (__v2df) __A,
+						   (__v2df) __B,
+						   __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+						  (__v4sf) __A,
+						  (__v4sf) __B,
+						  __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+						   (__v2df) __A,
+						   -(__v2df) __B,
+						   __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+						  (__v4sf) __A,
+						  -(__v4sf) __B,
+						  __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+						   -(__v2df) __A,
+						   (__v2df) __B,
+						   __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+						  -(__v4sf) __A,
+						  (__v4sf) __B,
+						  __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+						   -(__v2df) __A,
+						   -(__v2df) __B,
+						   __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+						  -(__v4sf) __A,
+						  -(__v4sf) __B,
+						  __R);
+}
+#else
+#define _mm_fmadd_round_sd(A, B, C, R)            \
+    (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, C, R)
+
+#define _mm_fmadd_round_ss(A, B, C, R)            \
+    (__m128)__builtin_ia32_vfmaddss3_round(A, B, C, R)
+
+#define _mm_fmsub_round_sd(A, B, C, R)            \
+    (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, -(C), R)
+
+#define _mm_fmsub_round_ss(A, B, C, R)            \
+    (__m128)__builtin_ia32_vfmaddss3_round(A, B, -(C), R)
+
+#define _mm_fnmadd_round_sd(A, B, C, R)            \
+    (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), C, R)
+
+#define _mm_fnmadd_round_ss(A, B, C, R)            \
+   (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), C, R)
+
+#define _mm_fnmsub_round_sd(A, B, C, R)            \
+    (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R)
+
+#define _mm_fnmsub_round_ss(A, B, C, R)            \
+    (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), -(C), R)
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comi_round_ss (__m128 __A, __m128 __B, const int __P, const int __R)
+{
+  return __builtin_ia32_vcomiss ((__v4sf) __A, (__v4sf) __B, __P, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comi_round_sd (__m128d __A, __m128d __B, const int __P, const int __R)
+{
+  return __builtin_ia32_vcomisd ((__v2df) __A, (__v2df) __B, __P, __R);
+}
+#else
+#define _mm_comi_round_ss(A, B, C, D)\
+__builtin_ia32_vcomiss(A, B, C, D)
+#define _mm_comi_round_sd(A, B, C, D)\
+__builtin_ia32_vcomisd(A, B, C, D)
+#endif
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+						  (__v8df)
+						  _mm512_undefined_pd (),
+						  (__mmask8) -1,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+						  (__v8df) __W,
+						  (__mmask8) __U,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+						  (__v8df)
+						  _mm512_setzero_pd (),
+						  (__mmask8) __U,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+						 (__v16sf)
+						 _mm512_undefined_ps (),
+						 (__mmask16) -1,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+						 (__v16sf) __W,
+						 (__mmask16) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+						 (__v16sf)
+						 _mm512_setzero_ps (),
+						 (__mmask16) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df) __W,
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df) __W,
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df) __W,
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_pd (__m512d __M, __m512d __V)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+						 (__v8df) __V,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+						 (__v8df) __V,
+						 (__v8df) __W,
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_pd (__mmask8 __U, __m512d __M, __m512d __V)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+						 (__v8df) __V,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df) __W,
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_undefined_pd (),
+						 (__mmask8) -1,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df) __W,
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+						 (__v8df) __B,
+						 (__v8df)
+						 _mm512_setzero_pd (),
+						 (__mmask8) __U,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_undefined_ps (),
+						(__mmask16) -1,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf) __W,
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+						(__v16sf) __B,
+						(__v16sf)
+						_mm512_setzero_ps (),
+						(__mmask16) __U,
+						_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df)
+						    _mm512_undefined_pd (),
+						    (__mmask8) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df) __W,
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df)
+						    _mm512_setzero_pd (),
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf)
+						   _mm512_undefined_ps (),
+						   (__mmask16) -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf) __W,
+						   (__mmask16) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf)
+						   _mm512_setzero_ps (),
+						   (__mmask16) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_scalefsd_round ((__v2df) __A,
+						  (__v2df) __B,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_scalefss_round ((__v4sf) __A,
+						 (__v4sf) __B,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df) __C,
+						    (__mmask8) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df) __C,
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf) __C,
+						   (__mmask16) -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf) __C,
+						   (__mmask16) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    -(__v8df) __C,
+						    (__mmask8) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+						    (__v8df) __B,
+						    -(__v8df) __C,
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+						     (__v8df) __B,
+						     -(__v8df) __C,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   -(__v16sf) __C,
+						   (__mmask16) -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+						   (__v16sf) __B,
+						   -(__v16sf) __C,
+						   (__mmask16) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+						    (__v16sf) __B,
+						    -(__v16sf) __C,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+						       (__v8df) __B,
+						       (__v8df) __C,
+						       (__mmask8) -1,
+						       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+						       (__v8df) __B,
+						       (__v8df) __C,
+						       (__mmask8) __U,
+						       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+							(__v8df) __B,
+							(__v8df) __C,
+							(__mmask8) __U,
+							_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+							(__v8df) __B,
+							(__v8df) __C,
+							(__mmask8) __U,
+							_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+						      (__v16sf) __B,
+						      (__v16sf) __C,
+						      (__mmask16) -1,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+						      (__v16sf) __B,
+						      (__v16sf) __C,
+						      (__mmask16) __U,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+						       (__v16sf) __B,
+						       (__v16sf) __C,
+						       (__mmask16) __U,
+						       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+						       (__v16sf) __B,
+						       (__v16sf) __C,
+						       (__mmask16) __U,
+						       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+						       (__v8df) __B,
+						       -(__v8df) __C,
+						       (__mmask8) -1,
+						       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+						       (__v8df) __B,
+						       -(__v8df) __C,
+						       (__mmask8) __U,
+						       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+							(__v8df) __B,
+							(__v8df) __C,
+							(__mmask8) __U,
+							_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+							(__v8df) __B,
+							-(__v8df) __C,
+							(__mmask8) __U,
+							_MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+						      (__v16sf) __B,
+						      -(__v16sf) __C,
+						      (__mmask16) -1,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+						      (__v16sf) __B,
+						      -(__v16sf) __C,
+						      (__mmask16) __U,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+						       (__v16sf) __B,
+						       (__v16sf) __C,
+						       (__mmask16) __U,
+						       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+						       (__v16sf) __B,
+						       -(__v16sf) __C,
+						       (__mmask16) __U,
+						       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+						    (__v8df) __B,
+						    (__v8df) __C,
+						    (__mmask8) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+						   (__v16sf) __B,
+						   (__v16sf) __C,
+						   (__mmask16) -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
+						    (__v8df) __B,
+						    -(__v8df) __C,
+						    (__mmask8) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+						     (__v8df) __B,
+						     (__v8df) __C,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+						      (__v8df) __B,
+						      (__v8df) __C,
+						      (__mmask8) __U,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+						     (__v8df) __B,
+						     -(__v8df) __C,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
+						   (__v16sf) __B,
+						   -(__v16sf) __C,
+						   (__mmask16) -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+						    (__v16sf) __B,
+						    (__v16sf) __C,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+						     (__v16sf) __B,
+						     (__v16sf) __C,
+						     (__mmask16) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+						    (__v16sf) __B,
+						    -(__v16sf) __C,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epi32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+						     (__v8si)
+						     _mm256_undefined_si256 (),
+						     (__mmask8) -1,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+						     (__v8si) __W,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+						     (__v8si)
+						     _mm256_setzero_si256 (),
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+						      (__v8si)
+						      _mm256_undefined_si256 (),
+						      (__mmask8) -1,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+						      (__v8si) __W,
+						      (__mmask8) __U,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+						      (__v8si)
+						      _mm256_setzero_si256 (),
+						      (__mmask8) __U,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epi32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+						    (__v8si)
+						    _mm256_undefined_si256 (),
+						    (__mmask8) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+						    (__v8si) __W,
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+						    (__v8si)
+						    _mm256_setzero_si256 (),
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+						     (__v8si)
+						     _mm256_undefined_si256 (),
+						     (__mmask8) -1,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+						     (__v8si) __W,
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+						     (__v8si)
+						     _mm256_setzero_si256 (),
+						     (__mmask8) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epi32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+						     (__v16si)
+						     _mm512_undefined_si512 (),
+						     (__mmask16) -1,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+						     (__v16si) __W,
+						     (__mmask16) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     (__mmask16) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epu32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+						      (__v16si)
+						      _mm512_undefined_si512 (),
+						      (__mmask16) -1,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+						      (__v16si) __W,
+						      (__mmask16) __U,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+						      (__v16si)
+						      _mm512_setzero_si512 (),
+						      (__mmask16) __U,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epi32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+						    (__v16si)
+						    _mm512_undefined_si512 (),
+						    (__mmask16) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+						    (__v16si) __W,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+						    (__v16si)
+						    _mm512_setzero_si512 (),
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epu32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+						     (__v16si)
+						     _mm512_undefined_si512 (),
+						     (__mmask16) -1,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+						     (__v16si) __W,
+						     (__mmask16) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+						     (__v16si)
+						     _mm512_setzero_si512 (),
+						     (__mmask16) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
+					      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
+					       _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_ss (__m128 __A, unsigned __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
+					      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_ps (__m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+						   (__v16sf)
+						   _mm512_undefined_ps (),
+						   (__mmask16) -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+						   (__v16sf) __W,
+						   (__mmask16) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+						   (__v16sf)
+						   _mm512_setzero_ps (),
+						   (__mmask16) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_ps (__m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+						    (__v16sf)
+						    _mm512_undefined_ps (),
+						    (__mmask16) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+						    (__v16sf) __W,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+						    (__v16sf)
+						    _mm512_setzero_ps (),
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_pd (__m512d __A, __m512d __B, __m512i __C, const int __imm)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+						      (__v8df) __B,
+						      (__v8di) __C,
+						      __imm,
+						      (__mmask8) -1,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_pd (__m512d __A, __mmask8 __U, __m512d __B,
+			 __m512i __C, const int __imm)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+						      (__v8df) __B,
+						      (__v8di) __C,
+						      __imm,
+						      (__mmask8) __U,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_pd (__mmask8 __U, __m512d __A, __m512d __B,
+			  __m512i __C, const int __imm)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A,
+						       (__v8df) __B,
+						       (__v8di) __C,
+						       __imm,
+						       (__mmask8) __U,
+						       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_ps (__m512 __A, __m512 __B, __m512i __C, const int __imm)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+						     (__v16sf) __B,
+						     (__v16si) __C,
+						     __imm,
+						     (__mmask16) -1,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_ps (__m512 __A, __mmask16 __U, __m512 __B,
+			 __m512i __C, const int __imm)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+						     (__v16sf) __B,
+						     (__v16si) __C,
+						     __imm,
+						     (__mmask16) __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_ps (__mmask16 __U, __m512 __A, __m512 __B,
+			  __m512i __C, const int __imm)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A,
+						      (__v16sf) __B,
+						      (__v16si) __C,
+						      __imm,
+						      (__mmask16) __U,
+						      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_sd (__m128d __A, __m128d __B, __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+						   (__v2df) __B,
+						   (__v2di) __C, __imm,
+						   (__mmask8) -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_sd (__m128d __A, __mmask8 __U, __m128d __B,
+		      __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+						   (__v2df) __B,
+						   (__v2di) __C, __imm,
+						   (__mmask8) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_sd (__mmask8 __U, __m128d __A, __m128d __B,
+		       __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
+						    (__v2df) __B,
+						    (__v2di) __C,
+						    __imm,
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_ss (__m128 __A, __m128 __B, __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+						  (__v4sf) __B,
+						  (__v4si) __C, __imm,
+						  (__mmask8) -1,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_ss (__m128 __A, __mmask8 __U, __m128 __B,
+		      __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+						  (__v4sf) __B,
+						  (__v4si) __C, __imm,
+						  (__mmask8) __U,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B,
+		       __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
+						   (__v4sf) __B,
+						   (__v4si) __C, __imm,
+						   (__mmask8) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+#else
+#define _mm512_fixupimm_pd(X, Y, Z, C)					\
+  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),	\
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),		\
+      (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_fixupimm_pd(X, U, Y, Z, C)                          \
+  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),    \
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_fixupimm_pd(U, X, Y, Z, C)                         \
+  ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X),   \
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_fixupimm_ps(X, Y, Z, C)					\
+  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),	\
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),		\
+    (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_fixupimm_ps(X, U, Y, Z, C)                          \
+  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),     \
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
+    (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_fixupimm_ps(U, X, Y, Z, C)                         \
+  ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X),    \
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
+    (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_fixupimm_sd(X, Y, Z, C)					\
+    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),	\
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_fixupimm_sd(X, U, Y, Z, C)				\
+    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),	\
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C)				\
+    ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X),	\
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_fixupimm_ss(X, Y, Z, C)					\
+    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),	\
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_fixupimm_ss(X, U, Y, Z, C)				\
+    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),	\
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C)				\
+    ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X),	\
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),		\
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+#endif
+
+#ifdef __x86_64__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
+							   __A,
+							   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
+							    __A,
+							    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_i64 (__m128 __A)
+{
+  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+#endif /* __x86_64__ */
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_i32 (__m128 __A)
+{
+  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
+					    _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
+							   __A,
+							   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
+							    __A,
+							    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_i64 (__m128d __A)
+{
+  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+#endif /* __x86_64__ */
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
+						 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_i32 (__m128d __A)
+{
+  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
+					    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_pd (__m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+						    (__v8df)
+						    _mm512_undefined_pd (),
+						    (__mmask8) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+						    (__v8df) __W,
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+						    (__v8df)
+						    _mm512_setzero_pd (),
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_ps (__m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+						    (__v16sf)
+						    _mm512_undefined_ps (),
+						    (__mmask16) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+						    (__v16sf) __W,
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+						    (__v16sf)
+						    _mm512_setzero_ps (),
+						    (__mmask16) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_ps (__m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+						   (__v8sf)
+						   _mm256_undefined_ps (),
+						   (__mmask8) -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+						   (__v8sf) __W,
+						   (__mmask8) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+						   (__v8sf)
+						   _mm256_setzero_ps (),
+						   (__mmask8) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+						   (__v16sf)
+						   _mm512_undefined_ps (),
+						   (__mmask16) -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+						   (__v16sf) __W,
+						   (__mmask16) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+						   (__v16sf)
+						   _mm512_setzero_ps (),
+						   (__mmask16) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+						    (__v8df)
+						    _mm512_undefined_pd (),
+						    (__mmask8) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+						    (__v8df) __W,
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+						    (__v8df)
+						    _mm512_setzero_pd (),
+						    (__mmask8) __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A,
+						    (__v4sf) __B,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A,
+						     (__v2df) __B,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
+		   _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+						     (__C << 2) | __B,
+						     _mm512_undefined_pd (),
+						     (__mmask8) -1,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_pd (__m512d __W, __mmask8 __U, __m512d __A,
+			_MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+						     (__C << 2) | __B,
+						     (__v8df) __W, __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_pd (__mmask8 __U, __m512d __A,
+			 _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+						     (__C << 2) | __B,
+						     (__v8df)
+						     _mm512_setzero_pd (),
+						     __U,
+						     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
+		   _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+						    (__C << 2) | __B,
+						    _mm512_undefined_ps (),
+						    (__mmask16) -1,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_ps (__m512 __W, __mmask16 __U, __m512 __A,
+			_MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+						    (__C << 2) | __B,
+						    (__v16sf) __W, __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_ps (__mmask16 __U, __m512 __A,
+			 _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+						    (__C << 2) | __B,
+						    (__v16sf)
+						    _mm512_setzero_ps (),
+						    __U,
+						    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_sd (__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
+		_MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A,
+						   (__v2df) __B,
+						   (__D << 2) | __C,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_ss (__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
+		_MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A,
+						  (__v4sf) __B,
+						  (__D << 2) | __C,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+#else
+#define _mm512_getmant_pd(X, B, C)                                                  \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)_mm512_undefined_pd(),        \
+                                              (__mmask8)-1,\
+					      _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_pd(W, U, X, B, C)                                       \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)(__m512d)(W),                 \
+                                              (__mmask8)(U),\
+					      _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_pd(U, X, B, C)                                         \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)_mm512_setzero_pd(),          \
+                                              (__mmask8)(U),\
+					      _MM_FROUND_CUR_DIRECTION))
+#define _mm512_getmant_ps(X, B, C)                                                  \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)_mm512_undefined_ps(),        \
+                                             (__mmask16)-1,\
+					     _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_ps(W, U, X, B, C)                                       \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)(__m512)(W),                  \
+                                             (__mmask16)(U),\
+					     _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_ps(U, X, B, C)                                         \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)_mm512_setzero_ps(),          \
+                                             (__mmask16)(U),\
+					     _MM_FROUND_CUR_DIRECTION))
+#define _mm_getmant_sd(X, Y, C, D)                                                  \
+  ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X),                    \
+                                           (__v2df)(__m128d)(Y),                    \
+                                           (int)(((D)<<2) | (C)),                   \
+					   _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getmant_ss(X, Y, C, D)                                                  \
+  ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X),                      \
+                                          (__v4sf)(__m128)(Y),                      \
+                                          (int)(((D)<<2) | (C)),                    \
+					  _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getexp_ss(A, B)						      \
+  ((__m128)__builtin_ia32_getexpss128_mask((__v4sf)(__m128)(A), (__v4sf)(__m128)(B),  \
+					   _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getexp_sd(A, B)						       \
+  ((__m128d)__builtin_ia32_getexpsd128_mask((__v2df)(__m128d)(A), (__v2df)(__m128d)(B),\
+					    _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getexp_ps(A)						\
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
+  (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getexp_ps(W, U, A)					\
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
+  (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getexp_ps(U, A)					\
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),		\
+  (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getexp_pd(A)						\
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
+  (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getexp_pd(W, U, A)					\
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
+  (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getexp_pd(U, A)					\
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),		\
+  (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_ps (__m512 __A, const int __imm)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
+						  (__v16sf)
+						  _mm512_undefined_ps (),
+						  -1,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_ps (__m512 __A, __mmask16 __B, __m512 __C,
+			   const int __imm)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm,
+						  (__v16sf) __A,
+						  (__mmask16) __B,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_ps (__mmask16 __A, __m512 __B, const int __imm)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B,
+						  __imm,
+						  (__v16sf)
+						  _mm512_setzero_ps (),
+						  (__mmask16) __A,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_pd (__m512d __A, const int __imm)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
+						   (__v8df)
+						   _mm512_undefined_pd (),
+						   -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_pd (__m512d __A, __mmask8 __B, __m512d __C,
+			   const int __imm)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm,
+						   (__v8df) __A,
+						   (__mmask8) __B,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_pd (__mmask8 __A, __m512d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B,
+						   __imm,
+						   (__v8df)
+						   _mm512_setzero_pd (),
+						   (__mmask8) __A,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_ss (__m128 __A, __m128 __B, const int __imm)
+{
+  return (__m128) __builtin_ia32_rndscaless_round ((__v4sf) __A,
+						   (__v4sf) __B, __imm,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_sd (__m128d __A, __m128d __B, const int __imm)
+{
+  return (__m128d) __builtin_ia32_rndscalesd_round ((__v2df) __A,
+						    (__v2df) __B, __imm,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+#else
+#define _mm512_roundscale_ps(A, B) \
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\
+    (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_mask_roundscale_ps(A, B, C, D)				\
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C),	\
+					    (int)(D),			\
+					    (__v16sf)(__m512)(A),	\
+					    (__mmask16)(B), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_maskz_roundscale_ps(A, B, C)				\
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B),	\
+					    (int)(C),			\
+					    (__v16sf)_mm512_setzero_ps(),\
+					    (__mmask16)(A), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_roundscale_pd(A, B) \
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\
+    (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_mask_roundscale_pd(A, B, C, D)				\
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C),	\
+					     (int)(D),			\
+					     (__v8df)(__m512d)(A),	\
+					     (__mmask8)(B), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_maskz_roundscale_pd(A, B, C)				\
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B),	\
+					     (int)(C),			\
+					     (__v8df)_mm512_setzero_pd(),\
+					     (__mmask8)(A), _MM_FROUND_CUR_DIRECTION))
+#define _mm_roundscale_ss(A, B, C)					\
+  ((__m128) __builtin_ia32_rndscaless_round ((__v4sf)(__m128)(A),	\
+  (__v4sf)(__m128)(B), (int)(C), _MM_FROUND_CUR_DIRECTION))
+#define _mm_roundscale_sd(A, B, C)					\
+  ((__m128d) __builtin_ia32_rndscalesd_round ((__v2df)(__m128d)(A),	\
+    (__v2df)(__m128d)(B), (int)(C), _MM_FROUND_CUR_DIRECTION))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_pd_mask (__m512d __X, __m512d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+						  (__v8df) __Y, __P,
+						  (__mmask8) -1,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_ps_mask (__m512 __X, __m512 __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+						   (__v16sf) __Y, __P,
+						   (__mmask16) -1,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+						   (__v16sf) __Y, __P,
+						   (__mmask16) __U,
+						   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+						  (__v8df) __Y, __P,
+						  (__mmask8) __U,
+						  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+					       (__v2df) __Y, __P,
+					       (__mmask8) -1,
+					       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+					       (__v2df) __Y, __P,
+					       (__mmask8) __M,
+					       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+					       (__v4sf) __Y, __P,
+					       (__mmask8) -1,
+					       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+					       (__v4sf) __Y, __P,
+					       (__mmask8) __M,
+					       _MM_FROUND_CUR_DIRECTION);
+}
+
+#else
+#define _mm512_cmp_pd_mask(X, Y, P)					\
+  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),	\
+					    (__v8df)(__m512d)(Y), (int)(P),\
+					    (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_cmp_ps_mask(X, Y, P)					\
+  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),	\
+					     (__v16sf)(__m512)(Y), (int)(P),\
+					     (__mmask16)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_cmp_pd_mask(M, X, Y, P)					\
+  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),	\
+					    (__v8df)(__m512d)(Y), (int)(P),\
+					    (__mmask8)M, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_cmp_ps_mask(M, X, Y, P)					\
+  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),	\
+					     (__v16sf)(__m512)(Y), (int)(P),\
+					     (__mmask16)M,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_cmp_sd_mask(X, Y, P)					\
+  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),		\
+					 (__v2df)(__m128d)(Y), (int)(P),\
+					 (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_cmp_sd_mask(M, X, Y, P)					\
+  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),		\
+					 (__v2df)(__m128d)(Y), (int)(P),\
+					 M,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_cmp_ss_mask(X, Y, P)					\
+  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),		\
+					 (__v4sf)(__m128)(Y), (int)(P), \
+					 (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_cmp_ss_mask(M, X, Y, P)					\
+  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),		\
+					 (__v4sf)(__m128)(Y), (int)(P), \
+					 M,_MM_FROUND_CUR_DIRECTION))
+#endif
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kmov (__mmask16 __A)
+{
+  return __builtin_ia32_kmov16 (__A);
+}
+
+#ifdef __DISABLE_AVX512F__
+#undef __DISABLE_AVX512F__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512F__ */
+
+#endif /* _AVX512FINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/avx512pfintrin.h b/gcc-4.9/gcc/config/i386/avx512pfintrin.h
new file mode 100644
index 000000000..bc7598e7a
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/avx512pfintrin.h
@@ -0,0 +1,212 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512PFINTRIN_H_INCLUDED
+#define _AVX512PFINTRIN_H_INCLUDED
+
+#ifndef __AVX512PF__
+#pragma GCC push_options
+#pragma GCC target("avx512pf")
+#define __DISABLE_AVX512PF__
+#endif /* __AVX512PF__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+
+typedef unsigned char  __mmask8;
+typedef unsigned short __mmask16;
+
+#ifdef __OPTIMIZE__
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32gather_pd (__m256i index, __mmask8 mask,
+				   void *addr, int scale, int hint)
+{
+  __builtin_ia32_gatherpfdpd (mask, (__v8si) index, (long long const *) addr,
+			      scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32gather_ps (__m512i index, __mmask16 mask,
+				   void *addr, int scale, int hint)
+{
+  __builtin_ia32_gatherpfdps (mask, (__v16si) index, (int const *) addr,
+			      scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64gather_pd (__m512i index, __mmask8 mask,
+				   void *addr, int scale, int hint)
+{
+  __builtin_ia32_gatherpfqpd (mask, (__v8di) index, (long long const *) addr,
+			      scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64gather_ps (__m512i index, __mmask8 mask,
+				   void *addr, int scale, int hint)
+{
+  __builtin_ia32_gatherpfqps (mask, (__v8di) index, (int const *) addr,
+			      scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32scatter_pd (void *addr, __m256i index, int scale,
+			       int hint)
+{
+  __builtin_ia32_scatterpfdpd ((__mmask8) 0xFF, (__v8si) index, 
+			       (long long const *)addr, scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32scatter_ps (void *addr, __m512i index, int scale,
+			       int hint)
+{
+  __builtin_ia32_scatterpfdps ((__mmask16) 0xFFFF, (__v16si) index, (int const *) addr,
+			       scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32scatter_pd (void *addr, __mmask8 mask,
+				    __m256i index, int scale, int hint)
+{
+  __builtin_ia32_scatterpfdpd (mask, (__v8si) index, (long long const *) addr,
+			       scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32scatter_ps (void *addr, __mmask16 mask,
+				    __m512i index, int scale, int hint)
+{
+  __builtin_ia32_scatterpfdps (mask, (__v16si) index, (int const *) addr,
+			       scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64scatter_pd (void *addr, __m512i index, int scale,
+			       int hint)
+{
+  __builtin_ia32_scatterpfqpd ((__mmask8) 0xFF, (__v8di) index, (long long const *) addr,
+			       scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64scatter_ps (void *addr, __m512i index, int scale,
+			       int hint)
+{
+  __builtin_ia32_scatterpfqps ((__mmask8) 0xFF, (__v8di) index, (int const *) addr,
+			       scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64scatter_pd (void *addr, __mmask16 mask,
+				    __m512i index, int scale, int hint)
+{
+  __builtin_ia32_scatterpfqpd (mask, (__v8di) index, (long long const *) addr,
+			       scale, hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64scatter_ps (void *addr, __mmask16 mask,
+				    __m512i index, int scale, int hint)
+{
+  __builtin_ia32_scatterpfqps (mask, (__v8di) index, (int const *) addr,
+			       scale, hint);
+}
+
+#else
+#define _mm512_mask_prefetch_i32gather_pd(INDEX, MASK, ADDR, SCALE, HINT)    \
+  __builtin_ia32_gatherpfdpd ((__mmask8)MASK, (__v8si)(__m256i)INDEX,	     \
+			      (long long const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i32gather_ps(INDEX, MASK, ADDR, SCALE, HINT)    \
+  __builtin_ia32_gatherpfdps ((__mmask16)MASK, (__v16si)(__m512i)INDEX,      \
+			      (int const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i64gather_pd(INDEX, MASK, ADDR, SCALE, HINT)    \
+  __builtin_ia32_gatherpfqpd ((__mmask8)MASK, (__v8di)(__m512i)INDEX,	     \
+			      (long long const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i64gather_ps(INDEX, MASK, ADDR, SCALE, HINT)    \
+  __builtin_ia32_gatherpfqps ((__mmask8)MASK, (__v8di)(__m512i)INDEX,	     \
+			      (int const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_prefetch_i32scatter_pd(ADDR, INDEX, SCALE, HINT)              \
+  __builtin_ia32_scatterpfdpd ((__mmask8)0xFF, (__v8si)(__m256i)INDEX,       \
+			       (long long const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_prefetch_i32scatter_ps(ADDR, INDEX, SCALE, HINT)              \
+  __builtin_ia32_scatterpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX,   \
+			       (int const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i32scatter_pd(ADDR, MASK, INDEX, SCALE, HINT)   \
+  __builtin_ia32_scatterpfdpd ((__mmask8)MASK, (__v8si)(__m256i)INDEX,       \
+			       (long long const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i32scatter_ps(ADDR, MASK, INDEX, SCALE, HINT)   \
+  __builtin_ia32_scatterpfdps ((__mmask16)MASK, (__v16si)(__m512i)INDEX,     \
+			       (int const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_prefetch_i64scatter_pd(ADDR, INDEX, SCALE, HINT)              \
+  __builtin_ia32_scatterpfqpd ((__mmask8)0xFF, (__v8di)(__m512i)INDEX,	     \
+			       (long long const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_prefetch_i64scatter_ps(ADDR, INDEX, SCALE, HINT)              \
+  __builtin_ia32_scatterpfqps ((__mmask8)0xFF, (__v8di)(__m512i)INDEX,	     \
+			       (int const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i64scatter_pd(ADDR, MASK, INDEX, SCALE, HINT)   \
+  __builtin_ia32_scatterpfqpd ((__mmask8)MASK, (__v8di)(__m512i)INDEX,	     \
+			       (long long const *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i64scatter_ps(ADDR, MASK, INDEX, SCALE, HINT)   \
+  __builtin_ia32_scatterpfqps ((__mmask8)MASK, (__v8di)(__m512i)INDEX,	     \
+			       (int const *)ADDR, (int)SCALE, (int)HINT)
+#endif
+
+#ifdef __DISABLE_AVX512PF__
+#undef __DISABLE_AVX512PF__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512PF__ */
+
+#endif /* _AVX512PFINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/avxintrin.h b/gcc-4.9/gcc/config/i386/avxintrin.h
new file mode 100644
index 000000000..2ea327c5a
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/avxintrin.h
@@ -0,0 +1,1463 @@
+/* Copyright (C) 2008-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 11.0.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXINTRIN_H_INCLUDED
+#define _AVXINTRIN_H_INCLUDED
+
+#ifndef __AVX__
+#pragma GCC push_options
+#pragma GCC target("avx")
+#define __DISABLE_AVX__
+#endif /* __AVX__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef float __m256 __attribute__ ((__vector_size__ (32),
+				     __may_alias__));
+typedef long long __m256i __attribute__ ((__vector_size__ (32),
+					  __may_alias__));
+typedef double __m256d __attribute__ ((__vector_size__ (32),
+				       __may_alias__));
+
+/* Compare predicates for scalar and packed compare intrinsics.  */
+
+/* Equal (ordered, non-signaling)  */
+#define _CMP_EQ_OQ	0x00
+/* Less-than (ordered, signaling)  */
+#define _CMP_LT_OS	0x01
+/* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_LE_OS	0x02
+/* Unordered (non-signaling)  */
+#define _CMP_UNORD_Q	0x03
+/* Not-equal (unordered, non-signaling)  */
+#define _CMP_NEQ_UQ	0x04
+/* Not-less-than (unordered, signaling)  */
+#define _CMP_NLT_US	0x05
+/* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_NLE_US	0x06
+/* Ordered (nonsignaling)   */
+#define _CMP_ORD_Q	0x07
+/* Equal (unordered, non-signaling)  */
+#define _CMP_EQ_UQ	0x08
+/* Not-greater-than-or-equal (unordered, signaling)  */
+#define _CMP_NGE_US	0x09
+/* Not-greater-than (unordered, signaling)  */
+#define _CMP_NGT_US	0x0a
+/* False (ordered, non-signaling)  */
+#define _CMP_FALSE_OQ	0x0b
+/* Not-equal (ordered, non-signaling)  */
+#define _CMP_NEQ_OQ	0x0c
+/* Greater-than-or-equal (ordered, signaling)  */
+#define _CMP_GE_OS	0x0d
+/* Greater-than (ordered, signaling)  */
+#define _CMP_GT_OS	0x0e
+/* True (unordered, non-signaling)  */
+#define _CMP_TRUE_UQ	0x0f
+/* Equal (ordered, signaling)  */
+#define _CMP_EQ_OS	0x10
+/* Less-than (ordered, non-signaling)  */
+#define _CMP_LT_OQ	0x11
+/* Less-than-or-equal (ordered, non-signaling)  */
+#define _CMP_LE_OQ	0x12
+/* Unordered (signaling)  */
+#define _CMP_UNORD_S	0x13
+/* Not-equal (unordered, signaling)  */
+#define _CMP_NEQ_US	0x14
+/* Not-less-than (unordered, non-signaling)  */
+#define _CMP_NLT_UQ	0x15
+/* Not-less-than-or-equal (unordered, non-signaling)  */
+#define _CMP_NLE_UQ	0x16
+/* Ordered (signaling)  */
+#define _CMP_ORD_S	0x17
+/* Equal (unordered, signaling)  */
+#define _CMP_EQ_US	0x18
+/* Not-greater-than-or-equal (unordered, non-signaling)  */
+#define _CMP_NGE_UQ	0x19
+/* Not-greater-than (unordered, non-signaling)  */
+#define _CMP_NGT_UQ	0x1a
+/* False (ordered, signaling)  */
+#define _CMP_FALSE_OS	0x1b
+/* Not-equal (ordered, signaling)  */
+#define _CMP_NEQ_OS	0x1c
+/* Greater-than-or-equal (ordered, non-signaling)  */
+#define _CMP_GE_OQ	0x1d
+/* Greater-than (ordered, non-signaling)  */
+#define _CMP_GT_OQ	0x1e
+/* True (unordered, signaling)  */
+#define _CMP_TRUE_US	0x1f
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_addsub_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_addsub_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+/* Double/single precision floating point blend instructions - select
+   data from 2 sources using constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
+{
+  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
+					      (__v4df)__Y,
+					      __M);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
+{
+  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
+					     (__v8sf)__Y,
+					     __M);
+}
+#else
+#define _mm256_blend_pd(X, Y, M)					\
+  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),		\
+					(__v4df)(__m256d)(Y), (int)(M)))
+
+#define _mm256_blend_ps(X, Y, M)					\
+  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),		\
+				       (__v8sf)(__m256)(Y), (int)(M)))
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
+{
+  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
+					       (__v4df)__Y,
+					       (__v4df)__M);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
+{
+  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
+					      (__v8sf)__Y,
+					      (__v8sf)__M);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_div_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_div_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+/* Dot product instructions with mask-defined summing and zeroing parts
+   of result.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
+{
+  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
+					  (__v8sf)__Y,
+					  __M);
+}
+#else
+#define _mm256_dp_ps(X, Y, M)						\
+  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),		\
+				    (__v8sf)(__m256)(Y), (int)(M)))
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_pd (__m256d __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_ps (__m256 __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_pd (__m256d __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_ps (__m256 __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
+{
+  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
+					     __mask);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
+{
+  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
+					    __mask);
+}
+#else
+#define _mm256_shuffle_pd(A, B, N)					\
+  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),		\
+				      (__v4df)(__m256d)(B), (int)(N)))
+
+#define _mm256_shuffle_ps(A, B, N)					\
+  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
+				      (__v8sf)(__m256)(B), (int)(N)))
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
+{
+  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
+{
+  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
+{
+  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
+					    __P);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
+{
+  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
+					   __P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
+{
+  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
+{
+  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
+}
+#else
+#define _mm_cmp_pd(X, Y, P)						\
+  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),		\
+				   (__v2df)(__m128d)(Y), (int)(P)))
+
+#define _mm_cmp_ps(X, Y, P)						\
+  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),			\
+				  (__v4sf)(__m128)(Y), (int)(P)))
+
+#define _mm256_cmp_pd(X, Y, P)						\
+  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),		\
+				      (__v4df)(__m256d)(Y), (int)(P)))
+
+#define _mm256_cmp_ps(X, Y, P)						\
+  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),		\
+				     (__v8sf)(__m256)(Y), (int)(P)))
+
+#define _mm_cmp_sd(X, Y, P)						\
+  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),		\
+				   (__v2df)(__m128d)(Y), (int)(P)))
+
+#define _mm_cmp_ss(X, Y, P)						\
+  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),			\
+				  (__v4sf)(__m128)(Y), (int)(P)))
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_pd (__m128i __A)
+{
+  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_ps (__m256i __A)
+{
+  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_ps (__m256d __A)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_epi32 (__m256 __A)
+{
+  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_pd (__m128 __A)
+{
+  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttpd_epi32 (__m256d __A)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_epi32 (__m256d __A)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttps_epi32 (__m256 __A)
+{
+  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf128_pd (__m256d __X, const int __N)
+{
+  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf128_ps (__m256 __X, const int __N)
+{
+  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf128_si256 (__m256i __X, const int __N)
+{
+  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi32 (__m256i __X, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
+  return _mm_extract_epi32 (__Y, __N % 4);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi16 (__m256i __X, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
+  return _mm_extract_epi16 (__Y, __N % 8);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi8 (__m256i __X, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
+  return _mm_extract_epi8 (__Y, __N % 16);
+}
+
+#ifdef __x86_64__
+extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi64 (__m256i __X, const int __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
+  return _mm_extract_epi64 (__Y, __N % 2);
+}
+#endif
+#else
+#define _mm256_extractf128_pd(X, N)					\
+  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),	\
+						(int)(N)))
+
+#define _mm256_extractf128_ps(X, N)					\
+  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),	\
+					       (int)(N)))
+
+#define _mm256_extractf128_si256(X, N)					\
+  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),	\
+						(int)(N)))
+
+#define _mm256_extract_epi32(X, N)					\
+  (__extension__							\
+   ({									\
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
+      _mm_extract_epi32 (__Y, (N) % 4);					\
+    }))
+
+#define _mm256_extract_epi16(X, N)					\
+  (__extension__							\
+   ({									\
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
+      _mm_extract_epi16 (__Y, (N) % 8);					\
+    }))
+
+#define _mm256_extract_epi8(X, N)					\
+  (__extension__							\
+   ({									\
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
+      _mm_extract_epi8 (__Y, (N) % 16);					\
+    }))
+
+#ifdef __x86_64__
+#define _mm256_extract_epi64(X, N)					\
+  (__extension__							\
+   ({									\
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
+      _mm_extract_epi64 (__Y, (N) % 2);					\
+    }))
+#endif
+#endif
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zeroall (void)
+{
+  __builtin_ia32_vzeroall ();
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zeroupper (void)
+{
+  __builtin_ia32_vzeroupper ();
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutevar_pd (__m128d __A, __m128i __C)
+{
+  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
+						(__v2di)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar_pd (__m256d __A, __m256i __C)
+{
+  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
+						   (__v4di)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutevar_ps (__m128 __A, __m128i __C)
+{
+  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
+					       (__v4si)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar_ps (__m256 __A, __m256i __C)
+{
+  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
+						  (__v8si)__C);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute_pd (__m128d __X, const int __C)
+{
+  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute_pd (__m256d __X, const int __C)
+{
+  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute_ps (__m128 __X, const int __C)
+{
+  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute_ps (__m256 __X, const int __C)
+{
+  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
+}
+#else
+#define _mm_permute_pd(X, C)						\
+  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))
+
+#define _mm256_permute_pd(X, C)						\
+  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),	(int)(C)))
+
+#define _mm_permute_ps(X, C)						\
+  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))
+
+#define _mm256_permute_ps(X, C)						\
+  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
+{
+  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
+						    (__v4df)__Y,
+						    __C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
+{
+  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
+						   (__v8sf)__Y,
+						   __C);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
+{
+  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
+						    (__v8si)__Y,
+						    __C);
+}
+#else
+#define _mm256_permute2f128_pd(X, Y, C)					\
+  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),	\
+					      (__v4df)(__m256d)(Y),	\
+					      (int)(C)))
+
+#define _mm256_permute2f128_ps(X, Y, C)					\
+  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),	\
+					     (__v8sf)(__m256)(Y),	\
+					     (int)(C)))
+
+#define _mm256_permute2f128_si256(X, Y, C)				\
+  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),	\
+					      (__v8si)(__m256i)(Y),	\
+					      (int)(C)))
+#endif
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcast_ss (float const *__X)
+{
+  return (__m128) __builtin_ia32_vbroadcastss (__X);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_sd (double const *__X)
+{
+  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_ss (float const *__X)
+{
+  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_pd (__m128d const *__X)
+{
+  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_ps (__m128 const *__X)
+{
+  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
+{
+  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
+						     (__v2df)__Y,
+						     __O);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
+{
+  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
+						    (__v4sf)__Y,
+						    __O);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
+{
+  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
+						     (__v4si)__Y,
+						     __O);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
+  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
+  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
+  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
+  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
+  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
+  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
+}
+
+#ifdef __x86_64__
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
+  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
+  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
+}
+#endif
+#else
+#define _mm256_insertf128_pd(X, Y, O)					\
+  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
+					       (__v2df)(__m128d)(Y),	\
+					       (int)(O)))
+
+#define _mm256_insertf128_ps(X, Y, O)					\
+  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
+					      (__v4sf)(__m128)(Y),  	\
+					      (int)(O)))
+
+#define _mm256_insertf128_si256(X, Y, O)				\
+  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
+					       (__v4si)(__m128i)(Y),	\
+					       (int)(O)))
+
+#define _mm256_insert_epi32(X, D, N)					\
+  (__extension__							\
+   ({									\
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);		\
+      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
+      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);			\
+    }))
+
+#define _mm256_insert_epi16(X, D, N)					\
+  (__extension__							\
+   ({									\
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);		\
+      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
+      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);			\
+    }))
+
+#define _mm256_insert_epi8(X, D, N)					\
+  (__extension__							\
+   ({									\
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);		\
+      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
+      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);			\
+    }))
+
+#ifdef __x86_64__
+#define _mm256_insert_epi64(X, D, N)					\
+  (__extension__							\
+   ({									\
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);		\
+      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
+      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);			\
+    }))
+#endif
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_pd (double const *__P)
+{
+  return *(__m256d *)__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_pd (double *__P, __m256d __A)
+{
+  *(__m256d *)__P = __A;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_ps (float const *__P)
+{
+  return *(__m256 *)__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_ps (float *__P, __m256 __A)
+{
+  *(__m256 *)__P = __A;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_pd (double const *__P)
+{
+  return (__m256d) __builtin_ia32_loadupd256 (__P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_pd (double *__P, __m256d __A)
+{
+  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_ps (float const *__P)
+{
+  return (__m256) __builtin_ia32_loadups256 (__P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_ps (float *__P, __m256 __A)
+{
+  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_si256 (__m256i const *__P)
+{
+  return *__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_si256 (__m256i *__P, __m256i __A)
+{
+  *__P = __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_si256 (__m256i const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_si256 (__m256i *__P, __m256i __A)
+{
+  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_pd (double const *__P, __m128i __M)
+{
+  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
+					      (__v2di)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
+{
+  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_pd (double const *__P, __m256i __M)
+{
+  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
+						 (__v4di)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
+{
+  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_ps (float const *__P, __m128i __M)
+{
+  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
+					     (__v4si)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
+{
+  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_ps (float const *__P, __m256i __M)
+{
+  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
+						(__v8si)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
+{
+  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movehdup_ps (__m256 __X)
+{
+  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_moveldup_ps (__m256 __X)
+{
+  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movedup_pd (__m256d __X)
+{
+  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_lddqu_si256 (__m256i const *__P)
+{
+  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_si256 (__m256i *__A, __m256i __B)
+{
+  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_pd (double *__A, __m256d __B)
+{
+  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_ps (float *__P, __m256 __A)
+{
+  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rcp_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_round_pd (__m256d __V, const int __M)
+{
+  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_round_ps (__m256 __V, const int __M)
+{
+  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
+}
+#else
+#define _mm256_round_pd(V, M) \
+  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))
+
+#define _mm256_round_ps(V, M) \
+  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
+#endif
+
+#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
+#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
+#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
+#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_pd (__m128d __M, __m128d __V)
+{
+  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_pd (__m128d __M, __m128d __V)
+{
+  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_pd (__m128d __M, __m128d __V)
+{
+  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_ps (__m128 __M, __m128 __V)
+{
+  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_ps (__m128 __M, __m128 __V)
+{
+  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_ps (__m128 __M, __m128 __V)
+{
+  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testz_pd (__m256d __M, __m256d __V)
+{
+  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testc_pd (__m256d __M, __m256d __V)
+{
+  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testnzc_pd (__m256d __M, __m256d __V)
+{
+  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testz_ps (__m256 __M, __m256 __V)
+{
+  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testc_ps (__m256 __M, __m256 __V)
+{
+  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testnzc_ps (__m256 __M, __m256 __V)
+{
+  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testz_si256 (__m256i __M, __m256i __V)
+{
+  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testc_si256 (__m256i __M, __m256i __V)
+{
+  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testnzc_si256 (__m256i __M, __m256i __V)
+{
+  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_pd (__m256d __A)
+{
+  return __builtin_ia32_movmskpd256 ((__v4df)__A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_ps (__m256 __A)
+{
+  return __builtin_ia32_movmskps256 ((__v8sf)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_pd (void)
+{
+  __m256d __Y = __Y;
+  return __Y;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_ps (void)
+{
+  __m256 __Y = __Y;
+  return __Y;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_si256 (void)
+{
+  __m256i __Y = __Y;
+  return __Y;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_pd (void)
+{
+  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_ps (void)
+{
+  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
+				 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_si256 (void)
+{
+  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
+}
+
+/* Create the vector [A B C D].  */
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_pd (double __A, double __B, double __C, double __D)
+{
+  return __extension__ (__m256d){ __D, __C, __B, __A };
+}
+
+/* Create the vector [A B C D E F G H].  */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_ps (float __A, float __B, float __C, float __D,
+	       float __E, float __F, float __G, float __H)
+{
+  return __extension__ (__m256){ __H, __G, __F, __E,
+				 __D, __C, __B, __A };
+}
+
+/* Create the vector [A B C D E F G H].  */
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi32 (int __A, int __B, int __C, int __D,
+		  int __E, int __F, int __G, int __H)
+{
+  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
+					  __D, __C, __B, __A };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
+		  short __q11, short __q10, short __q09, short __q08,
+		  short __q07, short __q06, short __q05, short __q04,
+		  short __q03, short __q02, short __q01, short __q00)
+{
+  return __extension__ (__m256i)(__v16hi){
+    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
+  };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
+		  char __q27, char __q26, char __q25, char __q24,
+		  char __q23, char __q22, char __q21, char __q20,
+		  char __q19, char __q18, char __q17, char __q16,
+		  char __q15, char __q14, char __q13, char __q12,
+		  char __q11, char __q10, char __q09, char __q08,
+		  char __q07, char __q06, char __q05, char __q04,
+		  char __q03, char __q02, char __q01, char __q00)
+{
+  return __extension__ (__m256i)(__v32qi){
+    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
+    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
+    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
+  };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi64x (long long __A, long long __B, long long __C,
+		   long long __D)
+{
+  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
+}
+
+/* Create a vector with all elements equal to A.  */
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_pd (double __A)
+{
+  return __extension__ (__m256d){ __A, __A, __A, __A };
+}
+
+/* Create a vector with all elements equal to A.  */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_ps (float __A)
+{
+  return __extension__ (__m256){ __A, __A, __A, __A,
+				 __A, __A, __A, __A };
+}
+
+/* Create a vector with all elements equal to A.  */
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi32 (int __A)
+{
+  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
+					  __A, __A, __A, __A };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi16 (short __A)
+{
+  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
+			   __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi8 (char __A)
+{
+  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
+			  __A, __A, __A, __A, __A, __A, __A, __A,
+			  __A, __A, __A, __A, __A, __A, __A, __A,
+			  __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi64x (long long __A)
+{
+  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
+}
+
+/* Create vectors of elements in the reversed order from the
+   _mm256_set_XXX functions.  */
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_pd (double __A, double __B, double __C, double __D)
+{
+  return _mm256_set_pd (__D, __C, __B, __A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_ps (float __A, float __B, float __C, float __D,
+		float __E, float __F, float __G, float __H)
+{
+  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
+		   int __E, int __F, int __G, int __H)
+{
+  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
+		   short __q11, short __q10, short __q09, short __q08,
+		   short __q07, short __q06, short __q05, short __q04,
+		   short __q03, short __q02, short __q01, short __q00)
+{
+  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
+			   __q04, __q05, __q06, __q07,
+			   __q08, __q09, __q10, __q11,
+			   __q12, __q13, __q14, __q15);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
+		   char __q27, char __q26, char __q25, char __q24,
+		   char __q23, char __q22, char __q21, char __q20,
+		   char __q19, char __q18, char __q17, char __q16,
+		   char __q15, char __q14, char __q13, char __q12,
+		   char __q11, char __q10, char __q09, char __q08,
+		   char __q07, char __q06, char __q05, char __q04,
+		   char __q03, char __q02, char __q01, char __q00)
+{
+  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
+			  __q04, __q05, __q06, __q07,
+			  __q08, __q09, __q10, __q11,
+			  __q12, __q13, __q14, __q15,
+			  __q16, __q17, __q18, __q19,
+			  __q20, __q21, __q22, __q23,
+			  __q24, __q25, __q26, __q27,
+			  __q28, __q29, __q30, __q31);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi64x (long long __A, long long __B, long long __C,
+		    long long __D)
+{
+  return _mm256_set_epi64x (__D, __C, __B, __A);
+}
+
+/* Casts between various SP, DP, INT vector types.  Note that these do no
+   conversion of values, they just change the type.  */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_ps (__m256d __A)
+{
+  return (__m256) __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_si256 (__m256d __A)
+{
+  return (__m256i) __A;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_pd (__m256 __A)
+{
+  return (__m256d) __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_si256(__m256 __A)
+{
+  return (__m256i) __A;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_ps (__m256i __A)
+{
+  return (__m256) __A;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_pd (__m256i __A)
+{
+  return (__m256d) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd256_pd128 (__m256d __A)
+{
+  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps256_ps128 (__m256 __A)
+{
+  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_si128 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
+}
+
+/* When cast is done from a 128 to 256-bit type, the low 128 bits of
+   the 256-bit result contain source parameter value and the upper 128
+   bits of the result are undefined.  Those intrinsics shouldn't
+   generate any extra moves.  */
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd128_pd256 (__m128d __A)
+{
+  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps128_ps256 (__m128 __A)
+{
+  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi128_si256 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
+}
+
+#ifdef __DISABLE_AVX__
+#undef __DISABLE_AVX__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX__ */
+
+#endif /* _AVXINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/avxmath.h b/gcc-4.9/gcc/config/i386/avxmath.h
new file mode 100644
index 000000000..e444993a0
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/avxmath.h
@@ -0,0 +1,28 @@
+/* Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#undef TARGET_FPMATH_DEFAULT
+#define TARGET_FPMATH_DEFAULT FPMATH_SSE
+
+#undef TARGET_SUBTARGET_ISA_DEFAULT
+#define TARGET_SUBTARGET_ISA_DEFAULT					\
+  (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2	\
+   | OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSSE3			\
+   | OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2			\
+   | OPTION_MASK_ISA_AVX)
+
diff --git a/gcc-4.9/gcc/config/i386/bdver1.md b/gcc-4.9/gcc/config/i386/bdver1.md
new file mode 100644
index 000000000..578bb3b21
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/bdver1.md
@@ -0,0 +1,800 @@
+;; Copyright (C) 2010-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; AMD bdver1 Scheduling
+;;
+;; The bdver1 contains four pipelined FP units, two integer units and
+;; two address generation units.
+;;
+;; The predecode logic is determining boundaries of instructions in the 64
+;; byte cache line.  So the cache line straddling problem of K6 might be issue
+;; here as well, but it is not noted in the documentation.
+;;
+;; Three DirectPath instructions decoders and only one VectorPath decoder
+;; is available.  They can decode three DirectPath instructions or one
+;; VectorPath instruction per cycle.
+;;
+;; The load/store queue unit is not attached to the schedulers but
+;; communicates with all the execution units separately instead.
+
+
+(define_attr "bdver1_decode" "direct,vector,double"
+  (const_string "direct"))
+
+(define_automaton "bdver1,bdver1_ieu,bdver1_load,bdver1_fp,bdver1_agu")
+
+(define_cpu_unit "bdver1-decode0" "bdver1")
+(define_cpu_unit "bdver1-decode1" "bdver1")
+(define_cpu_unit "bdver1-decode2" "bdver1")
+(define_cpu_unit "bdver1-decodev" "bdver1")
+
+;; Model the fact that double decoded instruction may take 2 cycles
+;; to decode when decoder2 and decoder0 in next cycle
+;; is used (this is needed to allow throughput of 1.5 double decoded
+;; instructions per cycle).
+;;
+;; In order to avoid dependence between reservation of decoder
+;; and other units, we model decoder as two stage fully pipelined unit
+;; and only double decoded instruction may occupy unit in the first cycle.
+;; With this scheme however two double instructions can be issued cycle0.
+;;
+;; Avoid this by using presence set requiring decoder0 to be allocated
+;; too.  Vector decoded instructions then can't be issued when modeled
+;; as consuming decoder0+decoder1+decoder2.
+;; We solve that by specialized vector decoder unit and exclusion set.
+(presence_set "bdver1-decode2" "bdver1-decode0")
+(exclusion_set "bdver1-decodev" "bdver1-decode0,bdver1-decode1,bdver1-decode2")
+
+(define_reservation "bdver1-vector" "nothing,bdver1-decodev")
+(define_reservation "bdver1-direct1" "nothing,bdver1-decode1")
+(define_reservation "bdver1-direct" "nothing,
+				     (bdver1-decode0 | bdver1-decode1
+				     | bdver1-decode2)")
+;; Double instructions behaves like two direct instructions.
+(define_reservation "bdver1-double" "((bdver1-decode2,bdver1-decode0)
+				     | (nothing,(bdver1-decode0 + bdver1-decode1))
+				     | (nothing,(bdver1-decode1 + bdver1-decode2)))")
+
+
+(define_cpu_unit "bdver1-ieu0" "bdver1_ieu")
+(define_cpu_unit "bdver1-ieu1" "bdver1_ieu")
+(define_reservation "bdver1-ieu" "(bdver1-ieu0 | bdver1-ieu1)")
+
+(define_cpu_unit "bdver1-agu0" "bdver1_agu")
+(define_cpu_unit "bdver1-agu1" "bdver1_agu")
+(define_reservation "bdver1-agu" "(bdver1-agu0 | bdver1-agu1)")
+
+(define_cpu_unit "bdver1-load0" "bdver1_load")
+(define_cpu_unit "bdver1-load1" "bdver1_load")
+(define_reservation "bdver1-load" "bdver1-agu,
+				   (bdver1-load0 | bdver1-load1),nothing")
+;; 128bit SSE instructions issue two loads at once.
+(define_reservation "bdver1-load2" "bdver1-agu,
+				   (bdver1-load0 + bdver1-load1),nothing")
+
+(define_reservation "bdver1-store" "(bdver1-load0 | bdver1-load1)")
+;; 128bit SSE instructions issue two stores at once.
+(define_reservation "bdver1-store2" "(bdver1-load0 + bdver1-load1)")
+
+;; vectorpath (microcoded) instructions are single issue instructions.
+;; So, they occupy all the integer units.
+(define_reservation "bdver1-ivector" "bdver1-ieu0+bdver1-ieu1+
+                                      bdver1-agu0+bdver1-agu1+
+                                      bdver1-load0+bdver1-load1")
+
+;; The FP operations start to execute at stage 12 in the pipeline, while
+;; integer operations start to execute at stage 9 for athlon and 11 for K8
+;; Compensate the difference for athlon because it results in significantly
+;; smaller automata.
+;; NOTE: the above information was just copied from athlon.md, and was not
+;; actually verified for bdver1.
+(define_reservation "bdver1-fpsched" "nothing,nothing,nothing")
+;; The floating point loads.
+(define_reservation "bdver1-fpload" "(bdver1-fpsched + bdver1-load)")
+(define_reservation "bdver1-fpload2" "(bdver1-fpsched + bdver1-load2)")
+
+;; Four FP units.
+(define_cpu_unit "bdver1-ffma0" "bdver1_fp")
+(define_cpu_unit "bdver1-ffma1" "bdver1_fp")
+(define_cpu_unit "bdver1-fmal0" "bdver1_fp")
+(define_cpu_unit "bdver1-fmal1" "bdver1_fp")
+
+(define_reservation "bdver1-ffma"     "(bdver1-ffma0 | bdver1-ffma1)")
+(define_reservation "bdver1-fcvt"     "bdver1-ffma0")
+(define_reservation "bdver1-fmma"     "bdver1-ffma0")
+(define_reservation "bdver1-fxbar"    "bdver1-ffma1")
+(define_reservation "bdver1-fmal"     "(bdver1-fmal0 | bdver1-fmal1)")
+(define_reservation "bdver1-fsto"     "bdver1-fmal1")
+
+;; Vector operations usually consume many of pipes.
+(define_reservation "bdver1-fvector"  "(bdver1-ffma0 + bdver1-ffma1
+					+ bdver1-fmal0 + bdver1-fmal1)")
+
+;; Jump instructions are executed in the branch unit completely transparent to us.
+(define_insn_reservation "bdver1_call" 0
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "call,callv"))
+			 "bdver1-double,bdver1-agu")
+;; PUSH mem is double path.
+(define_insn_reservation "bdver1_push" 1
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "push"))
+			 "bdver1-direct,bdver1-agu,bdver1-store")
+;; POP r16/mem are double path.
+(define_insn_reservation "bdver1_pop" 1
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "pop"))
+			 "bdver1-direct,bdver1-ivector")
+;; LEAVE no latency info so far, assume same with amdfam10.
+(define_insn_reservation "bdver1_leave" 3
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "leave"))
+			 "bdver1-vector,bdver1-ivector")
+;; LEA executes in AGU unit with 1 cycle latency on BDVER1.
+(define_insn_reservation "bdver1_lea" 1
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "lea"))
+			 "bdver1-direct,bdver1-agu")
+
+;; MUL executes in special multiplier unit attached to IEU1.
+(define_insn_reservation "bdver1_imul_DI" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "imul")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "memory" "none,unknown"))))
+			 "bdver1-direct1,bdver1-ieu1")
+(define_insn_reservation "bdver1_imul" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "none,unknown")))
+			 "bdver1-direct1,bdver1-ieu1")
+(define_insn_reservation "bdver1_imul_mem_DI" 10
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "imul")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "memory" "load,both"))))
+                         "bdver1-direct1,bdver1-load,bdver1-ieu1")
+(define_insn_reservation "bdver1_imul_mem" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "load,both")))
+			 "bdver1-direct1,bdver1-load,bdver1-ieu1")
+
+;; IDIV cannot execute in parallel with other instructions.  Dealing with it
+;; as with short latency vector instruction is good approximation avoiding
+;; scheduler from trying too hard to can hide it's latency by overlap with
+;; other instructions.
+;; ??? Experiments show that the IDIV can overlap with roughly 6 cycles
+;; of the other code.
+(define_insn_reservation "bdver1_idiv" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "idiv")
+				   (eq_attr "memory" "none,unknown")))
+			 "bdver1-vector,(bdver1-ieu0*6+(bdver1-fpsched,bdver1-fvector))")
+
+(define_insn_reservation "bdver1_idiv_mem" 10
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "idiv")
+				   (eq_attr "memory" "load,both")))
+			 "bdver1-vector,((bdver1-load,bdver1-ieu0*6)+(bdver1-fpsched,bdver1-fvector))")
+
+;; The parallelism of string instructions is not documented.  Model it same way
+;; as IDIV to create smaller automata.  This probably does not matter much.
+;; Using the same heuristics for bdver1 as amdfam10 and K8 with IDIV.
+(define_insn_reservation "bdver1_str" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "str")
+				   (eq_attr "memory" "load,both,store")))
+			 "bdver1-vector,bdver1-load,bdver1-ieu0*6")
+
+;; Integer instructions.
+(define_insn_reservation "bdver1_idirect" 1
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "none,unknown"))))
+			 "bdver1-direct,bdver1-ieu")
+(define_insn_reservation "bdver1_ivector" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "bdver1_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "none,unknown"))))
+			 "bdver1-vector,bdver1-ieu,bdver1-ieu")
+(define_insn_reservation "bdver1_idirect_loadmov" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-load")
+(define_insn_reservation "bdver1_idirect_load" 5
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "load"))))
+			 "bdver1-direct,bdver1-load,bdver1-ieu")
+(define_insn_reservation "bdver1_ivector_load" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "bdver1_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "load"))))
+			 "bdver1-vector,bdver1-load,bdver1-ieu,bdver1-ieu")
+(define_insn_reservation "bdver1_idirect_movstore" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "store")))
+			 "bdver1-direct,bdver1-agu,bdver1-store")
+(define_insn_reservation "bdver1_idirect_both" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "both"))))
+			 "bdver1-direct,bdver1-load,
+			  bdver1-ieu,bdver1-store,
+			  bdver1-store")
+(define_insn_reservation "bdver1_ivector_both" 5
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "bdver1_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "both"))))
+			 "bdver1-vector,bdver1-load,
+			  bdver1-ieu,
+			  bdver1-ieu,
+			  bdver1-store")
+(define_insn_reservation "bdver1_idirect_store" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "store"))))
+			 "bdver1-direct,(bdver1-ieu+bdver1-agu),
+			  bdver1-store")
+(define_insn_reservation "bdver1_ivector_store" 5
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "bdver1_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "store"))))
+			 "bdver1-vector,(bdver1-ieu+bdver1-agu),bdver1-ieu,
+			  bdver1-store")
+
+;; BDVER1 floating point units.
+(define_insn_reservation "bdver1_fldxf" 13
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fmov")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "mode" "XF"))))
+			 "bdver1-vector,bdver1-fpload2,bdver1-fvector*9")
+(define_insn_reservation "bdver1_fld" 5
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fmov")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-fpload,bdver1-ffma")
+(define_insn_reservation "bdver1_fstxf" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fmov")
+				   (and (eq_attr "memory" "store,both")
+					(eq_attr "mode" "XF"))))
+			 "bdver1-vector,(bdver1-fpsched+bdver1-agu),(bdver1-store2+(bdver1-fvector*6))")
+(define_insn_reservation "bdver1_fst" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fmov")
+				   (eq_attr "memory" "store,both")))
+			 "bdver1-double,(bdver1-fpsched+bdver1-agu),(bdver1-fsto+bdver1-store)")
+(define_insn_reservation "bdver1_fist" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "fistp,fisttp"))
+			 "bdver1-double,(bdver1-fpsched+bdver1-agu),(bdver1-fsto+bdver1-store)")
+(define_insn_reservation "bdver1_fmov_bdver1" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "fmov"))
+			 "bdver1-direct,bdver1-fpsched,bdver1-ffma")
+(define_insn_reservation "bdver1_fadd_load" 10
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fop")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-fpload,bdver1-ffma")
+(define_insn_reservation "bdver1_fadd" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "fop"))
+			 "bdver1-direct,bdver1-fpsched,bdver1-ffma")
+(define_insn_reservation "bdver1_fmul_load" 10
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fmul")
+				   (eq_attr "memory" "load")))
+			 "bdver1-double,bdver1-fpload,bdver1-ffma")
+(define_insn_reservation "bdver1_fmul" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "fmul"))
+			 "bdver1-direct,bdver1-fpsched,bdver1-ffma")
+(define_insn_reservation "bdver1_fsgn" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "fsgn"))
+			 "bdver1-direct,bdver1-fpsched,bdver1-ffma")
+(define_insn_reservation "bdver1_fdiv_load" 46
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fdiv")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-fpload,bdver1-ffma")
+(define_insn_reservation "bdver1_fdiv" 42
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "fdiv"))
+			 "bdver1-direct,bdver1-fpsched,bdver1-ffma")
+(define_insn_reservation "bdver1_fpspc_load" 103
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fpspc")
+				   (eq_attr "memory" "load")))
+			 "bdver1-vector,bdver1-fpload,bdver1-fvector")
+(define_insn_reservation "bdver1_fpspc" 100
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fpspc")
+				   (eq_attr "memory" "load")))
+			 "bdver1-vector,bdver1-fpload,bdver1-fvector")
+(define_insn_reservation "bdver1_fcmov_load" 17
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fcmov")
+				   (eq_attr "memory" "load")))
+			 "bdver1-vector,bdver1-fpload,bdver1-fvector")
+(define_insn_reservation "bdver1_fcmov" 15
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "fcmov"))
+			 "bdver1-vector,bdver1-fpsched,bdver1-fvector")
+(define_insn_reservation "bdver1_fcomi_load" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fcmp")
+				   (and (eq_attr "bdver1_decode" "double")
+					(eq_attr "memory" "load"))))
+			 "bdver1-double,bdver1-fpload,(bdver1-ffma | bdver1-fsto)")
+(define_insn_reservation "bdver1_fcomi" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "bdver1_decode" "double")
+				   (eq_attr "type" "fcmp")))
+			 "bdver1-double,bdver1-fpsched,(bdver1-ffma | bdver1-fsto)")
+(define_insn_reservation "bdver1_fcom_load" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "fcmp")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-fpload,bdver1-ffma")
+(define_insn_reservation "bdver1_fcom" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "fcmp"))
+			 "bdver1-direct,bdver1-fpsched,bdver1-ffma")
+(define_insn_reservation "bdver1_fxch" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "fxch"))
+			 "bdver1-direct,bdver1-fpsched,bdver1-ffma")
+
+;; SSE loads.
+(define_insn_reservation "bdver1_ssevector_avx128_unaligned_load" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "prefix" "vex")
+					(and (eq_attr "movu" "1")
+					     (and (eq_attr "mode" "V4SF,V2DF")
+						  (eq_attr "memory" "load"))))))
+			 "bdver1-direct,bdver1-fpload")
+(define_insn_reservation "bdver1_ssevector_avx256_unaligned_load" 5
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "movu" "1")
+				        (and (eq_attr "mode" "V8SF,V4DF")
+				             (eq_attr "memory" "load")))))
+			 "bdver1-double,bdver1-fpload")
+(define_insn_reservation "bdver1_ssevector_sse128_unaligned_load" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "movu" "1")
+				        (and (eq_attr "mode" "V4SF,V2DF")
+				             (eq_attr "memory" "load")))))
+			 "bdver1-direct,bdver1-fpload,bdver1-fmal")
+(define_insn_reservation "bdver1_ssevector_avx128_load" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "prefix" "vex")
+				        (and (eq_attr "mode" "V4SF,V2DF,TI")
+				             (eq_attr "memory" "load")))))
+			 "bdver1-direct,bdver1-fpload,bdver1-fmal")
+(define_insn_reservation "bdver1_ssevector_avx256_load" 5
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V8SF,V4DF,OI")
+				        (eq_attr "memory" "load"))))
+			 "bdver1-double,bdver1-fpload,bdver1-fmal")
+(define_insn_reservation "bdver1_ssevector_sse128_load" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V4SF,V2DF,TI")
+				        (eq_attr "memory" "load"))))
+			 "bdver1-direct,bdver1-fpload")
+(define_insn_reservation "bdver1_ssescalar_movq_load" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "DI")
+				        (eq_attr "memory" "load"))))
+			 "bdver1-direct,bdver1-fpload,bdver1-fmal")
+(define_insn_reservation "bdver1_ssescalar_vmovss_load" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "prefix" "vex")
+				        (and (eq_attr "mode" "SF")
+				             (eq_attr "memory" "load")))))
+			 "bdver1-direct,bdver1-fpload")
+(define_insn_reservation "bdver1_ssescalar_sse128_load" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "SF,DF")
+				        (eq_attr "memory" "load"))))
+			 "bdver1-direct,bdver1-fpload, bdver1-ffma")
+(define_insn_reservation "bdver1_mmxsse_load" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-fpload, bdver1-fmal")
+
+;; SSE stores.
+(define_insn_reservation "bdver1_sse_store_avx256" 5
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V8SF,V4DF,OI")
+					(eq_attr "memory" "store,both"))))
+			 "bdver1-double,(bdver1-fpsched+bdver1-agu),((bdver1-fsto+bdver1-store)*2)")
+(define_insn_reservation "bdver1_sse_store" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V4SF,V2DF,TI")
+					(eq_attr "memory" "store,both"))))
+			 "bdver1-direct,(bdver1-fpsched+bdver1-agu),((bdver1-fsto+bdver1-store)*2)")
+(define_insn_reservation "bdver1_mmxsse_store_short" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (eq_attr "memory" "store,both")))
+			 "bdver1-direct,(bdver1-fpsched+bdver1-agu),(bdver1-fsto+bdver1-store)")
+
+;; Register moves.
+(define_insn_reservation "bdver1_ssevector_avx256" 3
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V8SF,V4DF,OI")
+					(eq_attr "memory" "none"))))
+			 "bdver1-double,bdver1-fpsched,bdver1-fmal")
+(define_insn_reservation "bdver1_movss_movsd" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "SF,DF")
+                                        (eq_attr "memory" "none"))))
+			 "bdver1-direct,bdver1-fpsched,bdver1-ffma")
+(define_insn_reservation "bdver1_mmxssemov" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (eq_attr "memory" "none")))
+			 "bdver1-direct,bdver1-fpsched,bdver1-fmal")
+;; SSE logs.
+(define_insn_reservation "bdver1_sselog_load_256" 7
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1")
+				   (and (eq_attr "mode" "V8SF")
+				   (eq_attr "memory" "load"))))
+			 "bdver1-double,bdver1-fpload,bdver1-fmal")
+(define_insn_reservation "bdver1_sselog_256" 3
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1")
+                                   (eq_attr "mode" "V8SF")))
+			 "bdver1-double,bdver1-fpsched,bdver1-fmal")
+(define_insn_reservation "bdver1_sselog_load" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-fpload,bdver1-fxbar")
+(define_insn_reservation "bdver1_sselog" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))
+			 "bdver1-direct,bdver1-fpsched,bdver1-fxbar")
+
+;; PCMP actually executes in FMAL.
+(define_insn_reservation "bdver1_ssecmp_load" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecmp")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-fpload,bdver1-ffma")
+(define_insn_reservation "bdver1_ssecmp" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "ssecmp"))
+			 "bdver1-direct,bdver1-fpsched,bdver1-ffma")
+(define_insn_reservation "bdver1_ssecomi_load" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecomi")
+				   (eq_attr "memory" "load")))
+			 "bdver1-double,bdver1-fpload,(bdver1-ffma | bdver1-fsto)")
+(define_insn_reservation "bdver1_ssecomi" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (eq_attr "type" "ssecomi"))
+			 "bdver1-double,bdver1-fpsched,(bdver1-ffma | bdver1-fsto)")
+
+;; Conversions behaves very irregularly and the scheduling is critical here.
+;; Take each instruction separately.
+
+;; 256 bit conversion.
+(define_insn_reservation "bdver1_vcvtX2Y_avx256_load" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+					(ior (ior (match_operand:V4DF 0 "register_operand")
+					          (ior (match_operand:V8SF 0 "register_operand")
+						       (match_operand:V8SI 0 "register_operand")))
+					     (ior (match_operand:V4DF 1 "nonimmediate_operand")
+						  (ior (match_operand:V8SF 1 "nonimmediate_operand")
+						       (match_operand:V8SI 1 "nonimmediate_operand")))))))
+			 "bdver1-vector,bdver1-fpload,bdver1-fvector")
+(define_insn_reservation "bdver1_vcvtX2Y_avx256" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+					(ior (ior (match_operand:V4DF 0 "register_operand")
+					          (ior (match_operand:V8SF 0 "register_operand")
+						       (match_operand:V8SI 0 "register_operand")))
+					     (ior (match_operand:V4DF 1 "nonimmediate_operand")
+						  (ior (match_operand:V8SF 1 "nonimmediate_operand")
+						       (match_operand:V8SI 1 "nonimmediate_operand")))))))
+			 "bdver1-vector,bdver1-fpsched,bdver1-fvector")
+;; CVTSS2SD, CVTSD2SS.
+(define_insn_reservation "bdver1_ssecvt_cvtss2sd_load" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "load"))))
+			 "bdver1-direct,bdver1-fpload,bdver1-fcvt")
+(define_insn_reservation "bdver1_ssecvt_cvtss2sd" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "none"))))
+			 "bdver1-direct,bdver1-fpsched,bdver1-fcvt")
+;; CVTSI2SD, CVTSI2SS, CVTSI2SDQ, CVTSI2SSQ.
+(define_insn_reservation "bdver1_sseicvt_cvtsi2sd_load" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "load"))))
+			 "bdver1-direct,bdver1-fpload,bdver1-fcvt")
+(define_insn_reservation "bdver1_sseicvt_cvtsi2sd" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "none"))))
+			 "bdver1-double,bdver1-fpsched,(nothing | bdver1-fcvt)")
+;; CVTPD2PS.
+(define_insn_reservation "bdver1_ssecvt_cvtpd2ps_load" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+                                        (and (match_operand:V4SF 0 "register_operand")
+					     (match_operand:V2DF 1 "nonimmediate_operand")))))
+			 "bdver1-double,bdver1-fpload,(bdver1-fxbar | bdver1-fcvt)")
+(define_insn_reservation "bdver1_ssecvt_cvtpd2ps" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+                                        (and (match_operand:V4SF 0 "register_operand")
+					     (match_operand:V2DF 1 "nonimmediate_operand")))))
+			 "bdver1-double,bdver1-fpsched,(bdver1-fxbar | bdver1-fcvt)")
+;; CVTPI2PS, CVTDQ2PS.
+(define_insn_reservation "bdver1_ssecvt_cvtdq2ps_load" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+                                        (and (match_operand:V4SF 0 "register_operand")
+					     (ior (match_operand:V2SI 1 "nonimmediate_operand")
+					          (match_operand:V4SI 1 "nonimmediate_operand"))))))
+			 "bdver1-direct,bdver1-fpload,bdver1-fcvt")
+(define_insn_reservation "bdver1_ssecvt_cvtdq2ps" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+                                        (and (match_operand:V4SF 0 "register_operand")
+					     (ior (match_operand:V2SI 1 "nonimmediate_operand")
+					          (match_operand:V4SI 1 "nonimmediate_operand"))))))
+			 "bdver1-direct,bdver1-fpsched,bdver1-fcvt")
+;; CVTDQ2PD.
+(define_insn_reservation "bdver1_ssecvt_cvtdq2pd_load" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+                                        (and (match_operand:V2DF 0 "register_operand")
+					     (match_operand:V4SI 1 "nonimmediate_operand")))))
+			 "bdver1-double,bdver1-fpload,(bdver1-fxbar | bdver1-fcvt)")
+(define_insn_reservation "bdver1_ssecvt_cvtdq2pd" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+                                        (and (match_operand:V2DF 0 "register_operand")
+					     (match_operand:V4SI 1 "nonimmediate_operand")))))
+			 "bdver1-double,bdver1-fpsched,(bdver1-fxbar | bdver1-fcvt)")
+;; CVTPS2PD, CVTPI2PD.
+(define_insn_reservation "bdver1_ssecvt_cvtps2pd_load" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+                                        (and (match_operand:V2DF 0 "register_operand")
+					     (ior (match_operand:V2SI 1 "nonimmediate_operand")
+					          (match_operand:V4SF 1 "nonimmediate_operand"))))))
+			 "bdver1-double,bdver1-fpload,(bdver1-fxbar | bdver1-fcvt)")
+(define_insn_reservation "bdver1_ssecvt_cvtps2pd" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+                                        (and (match_operand:V2DF 0 "register_operand")
+					     (ior (match_operand:V2SI 1 "nonimmediate_operand")
+					          (match_operand:V4SF 1 "nonimmediate_operand"))))))
+			 "bdver1-double,bdver1-fpsched,(bdver1-fxbar | bdver1-fcvt)")
+;; CVTSD2SI, CVTSD2SIQ, CVTSS2SI, CVTSS2SIQ, CVTTSD2SI, CVTTSD2SIQ, CVTTSS2SI, CVTTSS2SIQ.
+(define_insn_reservation "bdver1_ssecvt_cvtsX2si_load" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "mode" "SI,DI")
+					(eq_attr "memory" "load"))))
+			 "bdver1-double,bdver1-fpload,(bdver1-fcvt | bdver1-fsto)")
+(define_insn_reservation "bdver1_ssecvt_cvtsX2si" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "mode" "SI,DI")
+					(eq_attr "memory" "none"))))
+			 "bdver1-double,bdver1-fpsched,(bdver1-fcvt | bdver1-fsto)")
+;; CVTPD2PI, CVTTPD2PI.
+(define_insn_reservation "bdver1_ssecvt_cvtpd2pi_load" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+				        (and (match_operand:V2DF 1 "nonimmediate_operand")
+					     (match_operand:V2SI 0 "register_operand")))))
+			 "bdver1-double,bdver1-fpload,(bdver1-fcvt | bdver1-fxbar)")
+(define_insn_reservation "bdver1_ssecvt_cvtpd2pi" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+				        (and (match_operand:V2DF 1 "nonimmediate_operand")
+					     (match_operand:V2SI 0 "register_operand")))))
+			 "bdver1-double,bdver1-fpsched,(bdver1-fcvt | bdver1-fxbar)")
+;; CVTPD2DQ, CVTTPD2DQ.
+(define_insn_reservation "bdver1_ssecvt_cvtpd2dq_load" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+				        (and (match_operand:V2DF 1 "nonimmediate_operand")
+					     (match_operand:V4SI 0 "register_operand")))))
+			 "bdver1-double,bdver1-fpload,(bdver1-fcvt | bdver1-fxbar)")
+(define_insn_reservation "bdver1_ssecvt_cvtpd2dq" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+				        (and (match_operand:V2DF 1 "nonimmediate_operand")
+					     (match_operand:V4SI 0 "register_operand")))))
+			 "bdver1-double,bdver1-fpsched,(bdver1-fcvt | bdver1-fxbar)")
+;; CVTPS2PI, CVTTPS2PI, CVTPS2DQ, CVTTPS2DQ.
+(define_insn_reservation "bdver1_ssecvt_cvtps2pi_load" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+                                   (and (eq_attr "memory" "load")
+				        (and (match_operand:V4SF 1 "nonimmediate_operand")
+				             (ior (match_operand: V2SI 0 "register_operand")
+						  (match_operand: V4SI 0 "register_operand"))))))
+			 "bdver1-direct,bdver1-fpload,bdver1-fcvt")
+(define_insn_reservation "bdver1_ssecvt_cvtps2pi" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+				        (and (match_operand:V4SF 1 "nonimmediate_operand")
+				             (ior (match_operand: V2SI 0 "register_operand")
+						  (match_operand: V4SI 0 "register_operand"))))))
+			 "bdver1-direct,bdver1-fpsched,bdver1-fcvt")
+
+;; SSE MUL, ADD, and MULADD.
+(define_insn_reservation "bdver1_ssemuladd_load_256" 11
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemul,sseadd,sseadd1,ssemuladd")
+				   (and (eq_attr "mode" "V8SF,V4DF")
+					(eq_attr "memory" "load"))))
+			 "bdver1-double,bdver1-fpload,bdver1-ffma")
+(define_insn_reservation "bdver1_ssemuladd_256" 7
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemul,sseadd,sseadd1,ssemuladd")
+				   (and (eq_attr "mode" "V8SF,V4DF")
+					(eq_attr "memory" "none"))))
+			 "bdver1-double,bdver1-fpsched,bdver1-ffma")
+(define_insn_reservation "bdver1_ssemuladd_load" 10
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemul,sseadd,sseadd1,ssemuladd")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-fpload,bdver1-ffma")
+(define_insn_reservation "bdver1_ssemuladd" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssemul,sseadd,sseadd1,ssemuladd")
+				   (eq_attr "memory" "none")))
+			 "bdver1-direct,bdver1-fpsched,bdver1-ffma")
+(define_insn_reservation "bdver1_sseimul_load" 8
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sseimul")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-fpload,bdver1-fmma")
+(define_insn_reservation "bdver1_sseimul" 4
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sseimul")
+				   (eq_attr "memory" "none")))
+			 "bdver1-direct,bdver1-fpsched,bdver1-fmma")
+(define_insn_reservation "bdver1_sseiadd_load" 6
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sseiadd")
+				   (eq_attr "memory" "load")))
+			 "bdver1-direct,bdver1-fpload,bdver1-fmal")
+(define_insn_reservation "bdver1_sseiadd" 2
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "sseiadd")
+				   (eq_attr "memory" "none")))
+			 "bdver1-direct,bdver1-fpsched,bdver1-fmal")
+
+;; SSE DIV: no throughput information (assume same as amdfam10).
+(define_insn_reservation "bdver1_ssediv_double_load_256" 31
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "V4DF")
+				        (eq_attr "memory" "load"))))
+			 "bdver1-double,bdver1-fpload,(bdver1-ffma0*17 | bdver1-ffma1*17)")
+(define_insn_reservation "bdver1_ssediv_double_256" 27
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "V4DF")
+				        (eq_attr "memory" "none"))))
+			 "bdver1-double,bdver1-fpsched,(bdver1-ffma0*17 | bdver1-ffma1*17)")
+(define_insn_reservation "bdver1_ssediv_single_load_256" 28
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "V8SF")
+				        (eq_attr "memory" "load"))))
+			 "bdver1-double,bdver1-fpload,(bdver1-ffma0*17 | bdver1-ffma1*17)")
+(define_insn_reservation "bdver1_ssediv_single_256" 24
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "V8SF")
+				        (eq_attr "memory" "none"))))
+			 "bdver1-double,bdver1-fpsched,(bdver1-ffma0*17 | bdver1-ffma1*17)")
+(define_insn_reservation "bdver1_ssediv_double_load" 31
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "DF,V2DF")
+					(eq_attr "memory" "load"))))
+			 "bdver1-direct,bdver1-fpload,(bdver1-ffma0*17 | bdver1-ffma1*17)")
+(define_insn_reservation "bdver1_ssediv_double" 27
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "DF,V2DF")
+					(eq_attr "memory" "none"))))
+			 "bdver1-direct,bdver1-fpsched,(bdver1-ffma0*17 | bdver1-ffma1*17)")
+(define_insn_reservation "bdver1_ssediv_single_load" 28
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "SF,V4SF")
+					(eq_attr "memory" "load"))))
+			 "bdver1-direct,bdver1-fpload,(bdver1-ffma0*17 | bdver1-ffma1*17)")
+(define_insn_reservation "bdver1_ssediv_single" 24
+			 (and (eq_attr "cpu" "bdver1,bdver2")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "SF,V4SF")
+					(eq_attr "memory" "none"))))
+			 "bdver1-direct,bdver1-fpsched,(bdver1-ffma0*17 | bdver1-ffma1*17)")
+
+(define_insn_reservation "bdver1_sseins" 3
+                         (and (eq_attr "cpu" "bdver1,bdver2")
+                              (and (eq_attr "type" "sseins")
+                                   (eq_attr "mode" "TI")))
+                         "bdver1-direct,bdver1-fpsched,bdver1-fxbar")
+
diff --git a/gcc-4.9/gcc/config/i386/bdver3.md b/gcc-4.9/gcc/config/i386/bdver3.md
new file mode 100644
index 000000000..4c85ade31
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/bdver3.md
@@ -0,0 +1,748 @@
+;; Copyright (C) 2012-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; AMD bdver3 and bdver4 Scheduling
+;;
+;; The bdver3 and bdver4 contains three pipelined FP units and two integer
+;; units. ;; Fetching and decoding logic is different from previous fam15 
+;; processors. Fetching is done every two cycles rather than every cycle 
+;; and two decode units are available. The decode units therefore decode
+;; four instructions in two cycles.
+;;
+;; The load/store queue unit is not attached to the schedulers but
+;; communicates with all the execution units separately instead.
+;;
+;; bdver3 and bdver4 belong to fam15 processors. We use the same insn 
+;; attribute that was used for bdver1 decoding scheme.
+
+(define_automaton "bdver3,bdver3_ieu,bdver3_load,bdver3_fp,bdver3_agu")
+
+(define_cpu_unit "bdver3-decode0" "bdver3")
+(define_cpu_unit "bdver3-decode1" "bdver3")
+(define_cpu_unit "bdver3-decode2" "bdver3")
+(define_cpu_unit "bdver3-decode3" "bdver3")
+
+;; Double decoded instructions take two cycles whereas
+;; direct instructions take one cycle.
+;; Vectorpath instructions are single issue instructions.
+;; So, we engage all units vector instructions.
+(define_reservation "bdver3-vector" "bdver3-decode0+bdver3-decode1+bdver3-decode2+bdver3-decode3")
+
+;; Direct instructions can be issued to any of the four decoders
+(define_reservation "bdver3-direct" "(bdver3-decode0|bdver3-decode1|bdver3-decode2|bdver3-decode3)")
+
+;; Double instructions take two cycles to decode.
+(define_reservation "bdver3-double" "(bdver3-decode0,bdver3-decode0)|
+               (bdver3-decode1,bdver3-decode1)| (bdver3-decode2,bdver3-decode2)|
+               (bdver3-decode3,bdver3-decode3)")
+
+(define_cpu_unit "bdver3-ieu0" "bdver3_ieu")
+(define_cpu_unit "bdver3-ieu1" "bdver3_ieu")
+(define_reservation "bdver3-ieu" "(bdver3-ieu0|bdver3-ieu1)")
+
+(define_cpu_unit "bdver3-agu0" "bdver3_agu")
+(define_cpu_unit "bdver3-agu1" "bdver3_agu")
+(define_reservation "bdver3-agu" "(bdver3-agu0|bdver3-agu1)")
+
+(define_cpu_unit "bdver3-load0" "bdver3_load")
+(define_cpu_unit "bdver3-load1" "bdver3_load")
+(define_reservation "bdver3-load" "bdver3-agu,
+				   (bdver3-load0|bdver3-load1),nothing")
+;; 128bit SSE instructions issue two loads at once.
+(define_reservation "bdver3-load2" "bdver3-agu,
+				   (bdver3-load0+bdver3-load1),nothing")
+
+(define_reservation "bdver3-store" "(bdver3-load0 | bdver3-load1)")
+;; 128bit SSE instructions issue two stores at once.
+(define_reservation "bdver3-store2" "(bdver3-load0+bdver3-load1)")
+
+;; vectorpath (microcoded) instructions are single issue instructions.
+;; So, they occupy all the integer units.
+(define_reservation "bdver3-ivector" "bdver3-ieu0+bdver3-ieu1+
+                                      bdver3-agu0+bdver3-agu1+
+                                      bdver3-load0+bdver3-load1")
+
+(define_reservation "bdver3-fpsched" "nothing,nothing,nothing")
+
+;; The floating point loads.
+(define_reservation "bdver3-fpload" "(bdver3-fpsched + bdver3-load)")
+(define_reservation "bdver3-fpload2" "(bdver3-fpsched + bdver3-load2)")
+
+;; Three FP units.
+(define_cpu_unit "bdver3-ffma0" "bdver3_fp")
+(define_cpu_unit "bdver3-ffma1" "bdver3_fp")
+(define_cpu_unit "bdver3-fpsto" "bdver3_fp")
+
+(define_reservation "bdver3-fvector" "bdver3-ffma0+bdver3-ffma1+
+                                      bdver3-fpsto+bdver3-load0+
+                                      bdver3-load1")
+
+(define_reservation "bdver3-ffma"     "(bdver3-ffma0 | bdver3-ffma1)")
+(define_reservation "bdver3-fcvt"     "bdver3-ffma0")
+(define_reservation "bdver3-fmma"     "bdver3-ffma0")
+(define_reservation "bdver3-fxbar"    "bdver3-ffma1")
+(define_reservation "bdver3-fmal"     "(bdver3-ffma0 | bdver3-fpsto)")
+(define_reservation "bdver3-fsto"     "bdver3-fpsto")
+(define_reservation "bdver3-fpshuf"    "bdver3-fpsto")
+
+;; Jump instructions are executed in the branch unit completely transparent to us.
+(define_insn_reservation "bdver3_call" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "call,callv"))
+			 "bdver3-double,(bdver3-agu | bdver3-ieu),nothing")
+;; PUSH mem is double path.
+(define_insn_reservation "bdver3_push" 1
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "push"))
+			 "bdver3-direct,bdver3-ieu,bdver3-store")
+;; POP r16/mem are double path.
+(define_insn_reservation "bdver3_pop" 1
+                         (and (eq_attr "cpu" "bdver3,bdver4")
+                              (eq_attr "type" "pop"))
+                         "bdver3-direct,bdver3-ivector")
+;; LEAVE no latency info so far, assume same with amdfam10.
+(define_insn_reservation "bdver3_leave" 3
+                         (and (eq_attr "cpu" "bdver3,bdver4")
+                              (eq_attr "type" "leave"))
+                         "bdver3-vector,bdver3-ivector")
+;; LEA executes in AGU unit with 1 cycle latency on BDVER3.
+(define_insn_reservation "bdver3_lea" 1
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "lea"))
+			 "bdver3-direct,bdver3-ieu")
+;; MUL executes in special multiplier unit attached to IEU1.
+(define_insn_reservation "bdver3_imul_DI" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "imul")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "memory" "none,unknown"))))
+			 "bdver3-direct,bdver3-ieu1")
+(define_insn_reservation "bdver3_imul" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "none,unknown")))
+			 "bdver3-direct,bdver3-ieu1")
+(define_insn_reservation "bdver3_imul_mem_DI" 10
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "imul")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "memory" "load,both"))))
+			 "bdver3-direct,bdver3-load,bdver3-ieu1")
+(define_insn_reservation "bdver3_imul_mem" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "load,both")))
+			 "bdver3-direct,bdver3-load,bdver3-ieu1")
+
+(define_insn_reservation "bdver3_str" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "str")
+				   (eq_attr "memory" "load,both,store")))
+			 "bdver3-vector,bdver3-load,bdver3-ivector")
+
+;; Integer instructions.
+(define_insn_reservation "bdver3_idirect" 1
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "none,unknown"))))
+			 "bdver3-direct,(bdver3-ieu|bdver3-agu)")
+(define_insn_reservation "bdver3_ivector" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "bdver1_decode" "vector")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "none,unknown"))))
+			 "bdver3-vector,bdver3-ivector")
+(define_insn_reservation "bdver3_idirect_loadmov" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-load")
+(define_insn_reservation "bdver3_idirect_load" 5
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "load"))))
+			 "bdver3-direct,bdver3-load,bdver3-ieu")
+(define_insn_reservation "bdver3_idirect_movstore" 5
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "store")))
+			 "bdver3-direct,bdver3-ieu,bdver3-store")
+(define_insn_reservation "bdver3_idirect_both" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "both"))))
+			 "bdver3-direct,bdver3-load,
+			  bdver3-ieu,bdver3-store,
+			  bdver3-store")
+(define_insn_reservation "bdver3_idirect_store" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "store"))))
+			 "bdver3-direct,(bdver3-ieu+bdver3-agu),
+			  bdver3-store")
+;; BDVER3 floating point units.
+(define_insn_reservation "bdver3_fldxf" 13
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fmov")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "mode" "XF"))))
+			 "bdver3-vector,bdver3-fpload2,bdver3-fvector*9")
+(define_insn_reservation "bdver3_fld" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fmov")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-fpload,bdver3-ffma")
+(define_insn_reservation "bdver3_fstxf" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fmov")
+				   (and (eq_attr "memory" "store,both")
+					(eq_attr "mode" "XF"))))
+			 "bdver3-vector,(bdver3-fpsched+bdver3-agu),(bdver3-store2+(bdver3-fvector*6))")
+(define_insn_reservation "bdver3_fst" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fmov")
+				   (eq_attr "memory" "store,both")))
+			 "bdver3-double,(bdver3-fpsched),(bdver3-fsto+bdver3-store)")
+(define_insn_reservation "bdver3_fist" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "fistp,fisttp"))
+			 "bdver3-double,(bdver3-fpsched),(bdver3-fsto+bdver3-store)")
+(define_insn_reservation "bdver3_fmov_bdver3" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "fmov"))
+			 "bdver3-direct,bdver3-fpsched,bdver3-ffma")
+(define_insn_reservation "bdver3_fadd_load" 10
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fop")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-fpload,bdver3-ffma")
+(define_insn_reservation "bdver3_fadd" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "fop"))
+			 "bdver3-direct,bdver3-fpsched,bdver3-ffma")
+(define_insn_reservation "bdver3_fmul_load" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fmul")
+				   (eq_attr "memory" "load")))
+			 "bdver3-double,bdver3-fpload,bdver3-ffma")
+(define_insn_reservation "bdver3_fmul" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "fmul"))
+			 "bdver3-direct,bdver3-fpsched,bdver3-ffma")
+(define_insn_reservation "bdver3_fsgn" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "fsgn"))
+			 "bdver3-direct,bdver3-fpsched,bdver3-ffma")
+(define_insn_reservation "bdver3_fdiv_load" 42
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fdiv")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-fpload,bdver3-ffma")
+(define_insn_reservation "bdver3_fdiv" 42
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "fdiv"))
+			 "bdver3-direct,bdver3-fpsched,bdver3-ffma")
+(define_insn_reservation "bdver3_fpspc_load" 143
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fpspc")
+				   (eq_attr "memory" "load")))
+			 "bdver3-vector,bdver3-fpload,bdver3-fvector")
+(define_insn_reservation "bdver3_fcmov_load" 17
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fcmov")
+				   (eq_attr "memory" "load")))
+			 "bdver3-vector,bdver3-fpload,bdver3-fvector")
+(define_insn_reservation "bdver3_fcmov" 15
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "fcmov"))
+			 "bdver3-vector,bdver3-fpsched,bdver3-fvector")
+(define_insn_reservation "bdver3_fcomi_load" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fcmp")
+				   (and (eq_attr "bdver1_decode" "double")
+					(eq_attr "memory" "load"))))
+			 "bdver3-double,bdver3-fpload,(bdver3-ffma | bdver3-fsto)")
+(define_insn_reservation "bdver3_fcomi" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "bdver1_decode" "double")
+				   (eq_attr "type" "fcmp")))
+			 "bdver3-double,bdver3-fpsched,(bdver3-ffma | bdver3-fsto)")
+(define_insn_reservation "bdver3_fcom_load" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "fcmp")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-fpload,bdver3-ffma")
+(define_insn_reservation "bdver3_fcom" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "fcmp"))
+			 "bdver3-direct,bdver3-fpsched,bdver3-ffma")
+(define_insn_reservation "bdver3_fxch" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "fxch"))
+			 "bdver3-direct,bdver3-fpsched,bdver3-ffma")
+
+;; SSE loads.
+(define_insn_reservation "bdver3_ssevector_avx128_unaligned_load" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "prefix" "vex")
+					(and (eq_attr "movu" "1")
+					     (and (eq_attr "mode" "V4SF,V2DF")
+						  (eq_attr "memory" "load"))))))
+			 "bdver3-direct,bdver3-fpload")
+(define_insn_reservation "bdver3_ssevector_avx256_unaligned_load" 5
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "movu" "1")
+				        (and (eq_attr "mode" "V8SF,V4DF")
+				             (eq_attr "memory" "load")))))
+			 "bdver3-double,bdver3-fpload")
+(define_insn_reservation "bdver3_ssevector_sse128_unaligned_load" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "movu" "1")
+				        (and (eq_attr "mode" "V4SF,V2DF")
+				             (eq_attr "memory" "load")))))
+			 "bdver3-direct,bdver3-fpload,bdver3-fmal")
+(define_insn_reservation "bdver3_ssevector_avx128_load" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "prefix" "vex")
+				        (and (eq_attr "mode" "V4SF,V2DF,TI")
+				             (eq_attr "memory" "load")))))
+			 "bdver3-direct,bdver3-fpload,bdver3-fmal")
+(define_insn_reservation "bdver3_ssevector_avx256_load" 5
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V8SF,V4DF,OI")
+				        (eq_attr "memory" "load"))))
+			 "bdver3-double,bdver3-fpload,bdver3-fmal")
+(define_insn_reservation "bdver3_ssevector_sse128_load" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V4SF,V2DF,TI")
+				        (eq_attr "memory" "load"))))
+			 "bdver3-direct,bdver3-fpload")
+(define_insn_reservation "bdver3_ssescalar_movq_load" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "DI")
+				        (eq_attr "memory" "load"))))
+			 "bdver3-direct,bdver3-fpload,bdver3-fmal")
+(define_insn_reservation "bdver3_ssescalar_vmovss_load" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "prefix" "vex")
+				        (and (eq_attr "mode" "SF")
+				             (eq_attr "memory" "load")))))
+			 "bdver3-direct,bdver3-fpload")
+(define_insn_reservation "bdver3_ssescalar_sse128_load" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "SF,DF")
+				        (eq_attr "memory" "load"))))
+			 "bdver3-direct,bdver3-fpload, bdver3-ffma")
+(define_insn_reservation "bdver3_mmxsse_load" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-fpload, bdver3-fmal")
+
+;; SSE stores.
+(define_insn_reservation "bdver3_sse_store_avx256" 5
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V8SF,V4DF,OI")
+					(eq_attr "memory" "store,both"))))
+			 "bdver3-double,bdver3-fpsched,((bdver3-fsto+bdver3-store)*2)")
+(define_insn_reservation "bdver3_sse_store" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V4SF,V2DF,TI")
+					(eq_attr "memory" "store,both"))))
+			 "bdver3-direct,bdver3-fpsched,((bdver3-fsto+bdver3-store)*2)")
+(define_insn_reservation "bdver3_mmxsse_store_short" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (eq_attr "memory" "store,both")))
+			 "bdver3-direct,bdver3-fpsched,(bdver3-fsto+bdver3-store)")
+
+;; Register moves.
+(define_insn_reservation "bdver3_ssevector_avx256" 3
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "V8SF,V4DF,OI")
+					(eq_attr "memory" "none"))))
+			 "bdver3-double,bdver3-fpsched,bdver3-fmal")
+(define_insn_reservation "bdver3_movss_movsd" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemov")
+				   (and (eq_attr "mode" "SF,DF")
+                                        (eq_attr "memory" "none"))))
+			 "bdver3-direct,bdver3-fpsched,bdver3-ffma")
+(define_insn_reservation "bdver3_mmxssemov" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "mmxmov,ssemov")
+				   (eq_attr "memory" "none")))
+			 "bdver3-direct,bdver3-fpsched,bdver3-fmal")
+;; SSE logs.
+(define_insn_reservation "bdver3_sselog_load_256" 7
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sselog,sselog1")
+				   (and (eq_attr "mode" "V8SF")
+				   (eq_attr "memory" "load"))))
+			 "bdver3-double,bdver3-fpload,bdver3-fmal")
+(define_insn_reservation "bdver3_sselog_256" 3
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sselog,sselog1")
+                                   (eq_attr "mode" "V8SF")))
+			 "bdver3-double,bdver3-fpsched,bdver3-fmal")
+(define_insn_reservation "bdver3_sselog_load" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sselog,sselog1")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-fpload,bdver3-fxbar")
+(define_insn_reservation "bdver3_sselog" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "sselog,sselog1"))
+			 "bdver3-direct,bdver3-fpsched,bdver3-fxbar")
+
+;; SSE Shuffles
+(define_insn_reservation "bdver3_sseshuf_load_256" 7
+                         (and (eq_attr "cpu" "bdver3,bdver4")
+                              (and (eq_attr "type" "sseshuf,sseshuf1")
+                                   (and (eq_attr "mode" "V8SF")
+                                   (eq_attr "memory" "load"))))
+                         "bdver3-double,bdver3-fpload,bdver3-fpshuf")
+(define_insn_reservation "bdver3_sseshuf_load" 6
+                         (and (eq_attr "cpu" "bdver3,bdver4")
+                              (and (eq_attr "type" "sseshuf,sseshuf1")
+                                   (eq_attr "memory" "load")))
+                         "bdver3-direct,bdver3-fpload,bdver3-fpshuf")
+
+(define_insn_reservation "bdver3_sseshuf_256" 3
+                         (and (eq_attr "cpu" "bdver3,bdver4")
+                              (and (eq_attr "type" "sseshuf")
+                                   (eq_attr "mode" "V8SF")))
+                         "bdver3-double,bdver3-fpsched,bdver3-fpshuf")
+(define_insn_reservation "bdver3_sseshuf" 2
+                         (and (eq_attr "cpu" "bdver3,bdver4")
+                              (eq_attr "type" "sseshuf,sseshuf1"))
+                         "bdver3-direct,bdver3-fpsched,bdver3-fpshuf")
+
+;; PCMP actually executes in FMAL.
+(define_insn_reservation "bdver3_ssecmp_load" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecmp")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-fpload,bdver3-ffma")
+(define_insn_reservation "bdver3_ssecmp" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "ssecmp"))
+			 "bdver3-direct,bdver3-fpsched,bdver3-ffma")
+(define_insn_reservation "bdver3_ssecomi_load" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecomi")
+				   (eq_attr "memory" "load")))
+			 "bdver3-double,bdver3-fpload,(bdver3-ffma | bdver3-fsto)")
+(define_insn_reservation "bdver3_ssecomi" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (eq_attr "type" "ssecomi"))
+			 "bdver3-double,bdver3-fpsched,(bdver3-ffma | bdver3-fsto)")
+
+;; Conversions behaves very irregularly and the scheduling is critical here.
+;; Take each instruction separately.
+
+;; 256 bit conversion.
+(define_insn_reservation "bdver3_vcvtX2Y_avx256_load" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+					(ior (ior (match_operand:V4DF 0 "register_operand")
+					          (ior (match_operand:V8SF 0 "register_operand")
+						       (match_operand:V8SI 0 "register_operand")))
+					     (ior (match_operand:V4DF 1 "nonimmediate_operand")
+						  (ior (match_operand:V8SF 1 "nonimmediate_operand")
+						       (match_operand:V8SI 1 "nonimmediate_operand")))))))
+			 "bdver3-vector,bdver3-fpload,bdver3-fvector")
+(define_insn_reservation "bdver3_vcvtX2Y_avx256" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+					(ior (ior (match_operand:V4DF 0 "register_operand")
+					          (ior (match_operand:V8SF 0 "register_operand")
+						       (match_operand:V8SI 0 "register_operand")))
+					     (ior (match_operand:V4DF 1 "nonimmediate_operand")
+						  (ior (match_operand:V8SF 1 "nonimmediate_operand")
+						       (match_operand:V8SI 1 "nonimmediate_operand")))))))
+			 "bdver3-vector,bdver3-fpsched,bdver3-fvector")
+;; CVTSS2SD, CVTSD2SS.
+(define_insn_reservation "bdver3_ssecvt_cvtss2sd_load" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "load"))))
+			 "bdver3-direct,bdver3-fpload,bdver3-fcvt")
+(define_insn_reservation "bdver3_ssecvt_cvtss2sd" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "none"))))
+			 "bdver3-direct,bdver3-fpsched,bdver3-fcvt")
+;; CVTSI2SD, CVTSI2SS, CVTSI2SDQ, CVTSI2SSQ.
+(define_insn_reservation "bdver3_sseicvt_cvtsi2sd_load" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "load"))))
+			 "bdver3-direct,bdver3-fpload,bdver3-fcvt")
+(define_insn_reservation "bdver3_sseicvt_cvtsi2sd" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "mode" "SF,DF")
+					(eq_attr "memory" "none"))))
+			 "bdver3-double,bdver3-fpsched,(nothing | bdver3-fcvt)")
+;; CVTPD2PS.
+(define_insn_reservation "bdver3_ssecvt_cvtpd2ps_load" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+                                        (and (match_operand:V4SF 0 "register_operand")
+					     (match_operand:V2DF 1 "nonimmediate_operand")))))
+			 "bdver3-double,bdver3-fpload,(bdver3-fxbar | bdver3-fcvt)")
+(define_insn_reservation "bdver3_ssecvt_cvtpd2ps" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+                                        (and (match_operand:V4SF 0 "register_operand")
+					     (match_operand:V2DF 1 "nonimmediate_operand")))))
+			 "bdver3-double,bdver3-fpsched,(bdver3-fxbar | bdver3-fcvt)")
+;; CVTPI2PS, CVTDQ2PS.
+(define_insn_reservation "bdver3_ssecvt_cvtdq2ps_load" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+                                        (and (match_operand:V4SF 0 "register_operand")
+					     (ior (match_operand:V2SI 1 "nonimmediate_operand")
+					          (match_operand:V4SI 1 "nonimmediate_operand"))))))
+			 "bdver3-direct,bdver3-fpload,bdver3-fcvt")
+(define_insn_reservation "bdver3_ssecvt_cvtdq2ps" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+                                        (and (match_operand:V4SF 0 "register_operand")
+					     (ior (match_operand:V2SI 1 "nonimmediate_operand")
+					          (match_operand:V4SI 1 "nonimmediate_operand"))))))
+			 "bdver3-direct,bdver3-fpsched,bdver3-fcvt")
+;; CVTDQ2PD.
+(define_insn_reservation "bdver3_ssecvt_cvtdq2pd_load" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+                                        (and (match_operand:V2DF 0 "register_operand")
+					     (match_operand:V4SI 1 "nonimmediate_operand")))))
+			 "bdver3-double,bdver3-fpload,(bdver3-fxbar | bdver3-fcvt)")
+(define_insn_reservation "bdver3_ssecvt_cvtdq2pd" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+                                        (and (match_operand:V2DF 0 "register_operand")
+					     (match_operand:V4SI 1 "nonimmediate_operand")))))
+			 "bdver3-double,bdver3-fpsched,(bdver3-fxbar | bdver3-fcvt)")
+;; CVTPS2PD, CVTPI2PD.
+(define_insn_reservation "bdver3_ssecvt_cvtps2pd_load" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+                                        (and (match_operand:V2DF 0 "register_operand")
+					     (ior (match_operand:V2SI 1 "nonimmediate_operand")
+					          (match_operand:V4SF 1 "nonimmediate_operand"))))))
+			 "bdver3-double,bdver3-fpload,(bdver3-fxbar | bdver3-fcvt)")
+(define_insn_reservation "bdver3_ssecvt_cvtps2pd" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+                                        (and (match_operand:V2DF 0 "register_operand")
+					     (ior (match_operand:V2SI 1 "nonimmediate_operand")
+					          (match_operand:V4SF 1 "nonimmediate_operand"))))))
+			 "bdver3-double,bdver3-fpsched,(bdver3-fxbar | bdver3-fcvt)")
+;; CVTSD2SI, CVTSD2SIQ, CVTSS2SI, CVTSS2SIQ, CVTTSD2SI, CVTTSD2SIQ, CVTTSS2SI, CVTTSS2SIQ.
+(define_insn_reservation "bdver3_ssecvt_cvtsX2si_load" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "mode" "SI,DI")
+					(eq_attr "memory" "load"))))
+			 "bdver3-double,bdver3-fpload,(bdver3-fcvt | bdver3-fsto)")
+(define_insn_reservation "bdver3_ssecvt_cvtsX2si" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sseicvt")
+				   (and (eq_attr "mode" "SI,DI")
+					(eq_attr "memory" "none"))))
+			 "bdver3-double,bdver3-fpsched,(bdver3-fcvt | bdver3-fsto)")
+;; CVTPD2PI, CVTTPD2PI.
+(define_insn_reservation "bdver3_ssecvt_cvtpd2pi_load" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+				        (and (match_operand:V2DF 1 "nonimmediate_operand")
+					     (match_operand:V2SI 0 "register_operand")))))
+			 "bdver3-double,bdver3-fpload,(bdver3-fcvt | bdver3-fxbar)")
+(define_insn_reservation "bdver3_ssecvt_cvtpd2pi" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+				        (and (match_operand:V2DF 1 "nonimmediate_operand")
+					     (match_operand:V2SI 0 "register_operand")))))
+			 "bdver3-double,bdver3-fpsched,(bdver3-fcvt | bdver3-fxbar)")
+;; CVTPD2DQ, CVTTPD2DQ.
+(define_insn_reservation "bdver3_ssecvt_cvtpd2dq_load" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "load")
+				        (and (match_operand:V2DF 1 "nonimmediate_operand")
+					     (match_operand:V4SI 0 "register_operand")))))
+			 "bdver3-double,bdver3-fpload,(bdver3-fcvt | bdver3-fxbar)")
+(define_insn_reservation "bdver3_ssecvt_cvtpd2dq" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+				        (and (match_operand:V2DF 1 "nonimmediate_operand")
+					     (match_operand:V4SI 0 "register_operand")))))
+			 "bdver3-double,bdver3-fpsched,(bdver3-fcvt | bdver3-fxbar)")
+;; CVTPS2PI, CVTTPS2PI, CVTPS2DQ, CVTTPS2DQ.
+(define_insn_reservation "bdver3_ssecvt_cvtps2pi_load" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+                                   (and (eq_attr "memory" "load")
+				        (and (match_operand:V4SF 1 "nonimmediate_operand")
+				             (ior (match_operand: V2SI 0 "register_operand")
+						  (match_operand: V4SI 0 "register_operand"))))))
+			 "bdver3-direct,bdver3-fpload,bdver3-fcvt")
+(define_insn_reservation "bdver3_ssecvt_cvtps2pi" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssecvt")
+				   (and (eq_attr "memory" "none")
+				        (and (match_operand:V4SF 1 "nonimmediate_operand")
+				             (ior (match_operand: V2SI 0 "register_operand")
+						  (match_operand: V4SI 0 "register_operand"))))))
+			 "bdver3-direct,bdver3-fpsched,bdver3-fcvt")
+
+;; SSE MUL, ADD, and MULADD.
+(define_insn_reservation "bdver3_ssemuladd_load_256" 11
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemul,sseadd,sseadd1,ssemuladd")
+				   (and (eq_attr "mode" "V8SF,V4DF")
+					(eq_attr "memory" "load"))))
+			 "bdver3-double,bdver3-fpload,bdver3-ffma")
+(define_insn_reservation "bdver3_ssemuladd_256" 7
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemul,sseadd,sseadd1,ssemuladd")
+				   (and (eq_attr "mode" "V8SF,V4DF")
+					(eq_attr "memory" "none"))))
+			 "bdver3-double,bdver3-fpsched,bdver3-ffma")
+(define_insn_reservation "bdver3_ssemuladd_load" 10
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemul,sseadd,sseadd1,ssemuladd")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-fpload,bdver3-ffma")
+(define_insn_reservation "bdver3_ssemuladd" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssemul,sseadd,sseadd1,ssemuladd")
+				   (eq_attr "memory" "none")))
+			 "bdver3-direct,bdver3-fpsched,bdver3-ffma")
+(define_insn_reservation "bdver3_sseimul_load" 8
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sseimul")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-fpload,bdver3-fmma")
+(define_insn_reservation "bdver3_sseimul" 4
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sseimul")
+				   (eq_attr "memory" "none")))
+			 "bdver3-direct,bdver3-fpsched,bdver3-fmma")
+(define_insn_reservation "bdver3_sseiadd_load" 6
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sseiadd")
+				   (eq_attr "memory" "load")))
+			 "bdver3-direct,bdver3-fpload,bdver3-fmal")
+(define_insn_reservation "bdver3_sseiadd" 2
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "sseiadd")
+				   (eq_attr "memory" "none")))
+			 "bdver3-direct,bdver3-fpsched,bdver3-fmal")
+
+;; SSE DIV: no throughput information (assume same as amdfam10).
+(define_insn_reservation "bdver3_ssediv_double_load_256" 27
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "V4DF")
+				        (eq_attr "memory" "load"))))
+			 "bdver3-double,bdver3-fpload,(bdver3-ffma0*17 | bdver3-ffma1*17)")
+(define_insn_reservation "bdver3_ssediv_double_256" 27
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "V4DF")
+				        (eq_attr "memory" "none"))))
+			 "bdver3-double,bdver3-fpsched,(bdver3-ffma0*17 | bdver3-ffma1*17)")
+(define_insn_reservation "bdver3_ssediv_single_load_256" 27
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "V8SF")
+				        (eq_attr "memory" "load"))))
+			 "bdver3-double,bdver3-fpload,(bdver3-ffma0*17 | bdver3-ffma1*17)")
+(define_insn_reservation "bdver3_ssediv_single_256" 24
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "V8SF")
+				        (eq_attr "memory" "none"))))
+			 "bdver3-double,bdver3-fpsched,(bdver3-ffma0*17 | bdver3-ffma1*17)")
+(define_insn_reservation "bdver3_ssediv_double_load" 27
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "DF,V2DF")
+					(eq_attr "memory" "load"))))
+			 "bdver3-direct,bdver3-fpload,(bdver3-ffma0*17 | bdver3-ffma1*17)")
+(define_insn_reservation "bdver3_ssediv_double" 27
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "DF,V2DF")
+					(eq_attr "memory" "none"))))
+			 "bdver3-direct,bdver3-fpsched,(bdver3-ffma0*17 | bdver3-ffma1*17)")
+(define_insn_reservation "bdver3_ssediv_single_load" 27 
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "SF,V4SF")
+					(eq_attr "memory" "load"))))
+			 "bdver3-direct,bdver3-fpload,(bdver3-ffma0*17 | bdver3-ffma1*17)")
+(define_insn_reservation "bdver3_ssediv_single" 24
+			 (and (eq_attr "cpu" "bdver3,bdver4")
+			      (and (eq_attr "type" "ssediv")
+				   (and (eq_attr "mode" "SF,V4SF")
+					(eq_attr "memory" "none"))))
+			 "bdver3-direct,bdver3-fpsched,(bdver3-ffma0*17 | bdver3-ffma1*17)")
+
+(define_insn_reservation "bdver3_sseins" 3
+                         (and (eq_attr "cpu" "bdver3,bdver4")
+                              (and (eq_attr "type" "sseins")
+                                   (eq_attr "mode" "TI")))
+                         "bdver3-direct,bdver3-fpsched,bdver3-fxbar")
+
diff --git a/gcc-4.9/gcc/config/i386/biarch64.h b/gcc-4.9/gcc/config/i386/biarch64.h
new file mode 100644
index 000000000..c2c816401
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/biarch64.h
@@ -0,0 +1,29 @@
+/* Make configure files to produce biarch compiler defaulting to 64bit mode.
+   This file must be included very first, while the OS specific file later
+   to overwrite otherwise wrong defaults. 
+   Copyright (C) 2001-2014 Free Software Foundation, Inc.
+   Contributed by Bo Thorsen <bo@suse.de>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#define TARGET_64BIT_DEFAULT (OPTION_MASK_ISA_64BIT | OPTION_MASK_ABI_64)
+#define TARGET_BI_ARCH 1
diff --git a/gcc-4.9/gcc/config/i386/biarchx32.h b/gcc-4.9/gcc/config/i386/biarchx32.h
new file mode 100644
index 000000000..941b93b3d
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/biarchx32.h
@@ -0,0 +1,28 @@
+/* Make configure files to produce biarch compiler defaulting to x32 mode.
+   This file must be included very first, while the OS specific file later
+   to overwrite otherwise wrong defaults.
+   Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#define TARGET_64BIT_DEFAULT (OPTION_MASK_ISA_64BIT | OPTION_MASK_ABI_X32)
+#define TARGET_BI_ARCH 2
diff --git a/gcc-4.9/gcc/config/i386/bmi2intrin.h b/gcc-4.9/gcc/config/i386/bmi2intrin.h
new file mode 100644
index 000000000..ff962962e
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/bmi2intrin.h
@@ -0,0 +1,109 @@
+/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+#ifndef __BMI2__
+#pragma GCC push_options
+#pragma GCC target("bmi2")
+#define __DISABLE_BMI2__
+#endif /* __BMI2__ */
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si (__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
+
+#else /* !__x86_64__ */
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#ifdef __DISABLE_BMI2__
+#undef __DISABLE_BMI2__
+#pragma GCC pop_options
+#endif /* __DISABLE_BMI2__ */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/bmiintrin.h b/gcc-4.9/gcc/config/i386/bmiintrin.h
new file mode 100644
index 000000000..b86adf179
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/bmiintrin.h
@@ -0,0 +1,138 @@
+/* Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _BMIINTRIN_H_INCLUDED
+#define _BMIINTRIN_H_INCLUDED
+
+#ifndef __BMI__
+#pragma GCC push_options
+#pragma GCC target("bmi")
+#define __DISABLE_BMI__
+#endif /* __BMI__ */
+
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u16 (unsigned short __X)
+{
+  return __builtin_ctzs (__X);
+}
+
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__andn_u32 (unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextr_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32 (__X, __Y);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bextr_u32 (unsigned int __X, unsigned int __Y, unsigned __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsi_u32 (unsigned int __X)
+{
+  return __X & -__X;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsmsk_u32 (unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsr_u32 (unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u32 (unsigned int __X)
+{
+  return __builtin_ctz (__X);
+}
+
+
+#ifdef  __x86_64__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextr_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64 (__X, __Y);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bextr_u64 (unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsi_u64 (unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsmsk_u64 (unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsr_u64 (unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u64 (unsigned long long __X)
+{
+  return __builtin_ctzll (__X);
+}
+
+#endif /* __x86_64__  */
+
+#ifdef __DISABLE_BMI__
+#undef __DISABLE_BMI__
+#pragma GCC pop_options
+#endif /* __DISABLE_BMI__ */
+
+#endif /* _BMIINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/bmmintrin.h b/gcc-4.9/gcc/config/i386/bmmintrin.h
new file mode 100644
index 000000000..24cf26e85
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/bmmintrin.h
@@ -0,0 +1,29 @@
+/* Copyright (C) 2007-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _BMMINTRIN_H_INCLUDED
+#define _BMMINTRIN_H_INCLUDED
+
+# error "SSE5 instruction set removed from compiler"
+
+#endif /* _BMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/bsd.h b/gcc-4.9/gcc/config/i386/bsd.h
new file mode 100644
index 000000000..54715a853
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/bsd.h
@@ -0,0 +1,99 @@
+/* Definitions for BSD assembler syntax for Intel 386
+   (actually AT&T syntax for insns and operands,
+   adapted to BSD conventions for symbol names and debugging.)
+   Copyright (C) 1988-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Use the Sequent Symmetry assembler syntax.  */
+
+/* Define the syntax of pseudo-ops, labels and comments.  */
+
+/* Prefix for internally generated assembler labels.  If we aren't using
+   underscores, we are using prefix `.'s to identify labels that should
+   be ignored, as in `i386/gas.h' --karl@cs.umb.edu  */
+
+#define LPREFIX "L"
+
+/* Assembler pseudos to introduce constants of various size.  */
+
+#define ASM_BYTE "\t.byte\t"
+#define ASM_SHORT "\t.word\t"
+#define ASM_LONG "\t.long\t"
+#define ASM_QUAD "\t.quad\t"  /* Should not be used for 32bit compilation.  */
+
+/* This was suggested, but it shouldn't be right for DBX output. -- RMS
+   #define ASM_OUTPUT_SOURCE_FILENAME(FILE, NAME) */
+
+
+/* Define the syntax of labels and symbol definitions/declarations.  */
+
+/* This is how to output an assembler line
+   that says to advance the location counter by SIZE bytes.  */
+
+#define ASM_OUTPUT_SKIP(FILE,SIZE)  \
+  fprintf (FILE, "\t.space "HOST_WIDE_INT_PRINT_UNSIGNED"\n", (SIZE))
+
+/* Define the syntax of labels and symbol definitions/declarations.  */
+
+/* This says how to output an assembler line
+   to define a global common symbol.  */
+
+#define ASM_OUTPUT_COMMON(FILE, NAME, SIZE, ROUNDED)  \
+( fputs (".comm ", (FILE)),			\
+  assemble_name ((FILE), (NAME)),		\
+  fprintf ((FILE), ",%u\n", (int)(ROUNDED)))
+
+/* This says how to output an assembler line
+   to define a local common symbol.  */
+
+#define ASM_OUTPUT_LOCAL(FILE, NAME, SIZE, ROUNDED)  \
+( fputs (".lcomm ", (FILE)),			\
+  assemble_name ((FILE), (NAME)),		\
+  fprintf ((FILE), ",%u\n", (int)(ROUNDED)))
+
+#ifdef HAVE_GAS_LCOMM_WITH_ALIGNMENT
+#define ASM_OUTPUT_ALIGNED_LOCAL(FILE, NAME, SIZE, ALIGNMENT)  \
+( fputs (".lcomm ", (FILE)),			\
+  assemble_name ((FILE), (NAME)),		\
+  fprintf ((FILE), ",%u,%u\n", (int)(SIZE), (int)(ALIGNMENT) / BITS_PER_UNIT))
+#endif
+
+/* This is how to output an assembler line
+   that says to advance the location counter
+   to a multiple of 2**LOG bytes.  */
+
+#define ASM_OUTPUT_ALIGN(FILE,LOG)	\
+  if ((LOG)!=0) fprintf ((FILE), "\t.align %d\n", (LOG))
+
+/* This is how to store into the string BUF
+   the symbol_ref name of an internal numbered label where
+   PREFIX is the class of label and NUM is the number within the class.
+   This is suitable for output with `assemble_name'.  */
+
+#define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER)	\
+    sprintf ((BUF), "*%s%ld", (PREFIX), (long)(NUMBER))
+
+/* The prefix to add to user-visible assembler symbols.  */
+
+#define USER_LABEL_PREFIX "_"
+
+/* Sequent has some changes in the format of DBX symbols.  */
+#define DBX_NO_XREFS 1
+
+/* Don't split DBX symbols into continuations.  */
+#define DBX_CONTIN_LENGTH 0
diff --git a/gcc-4.9/gcc/config/i386/btver2.md b/gcc-4.9/gcc/config/i386/btver2.md
new file mode 100644
index 000000000..06a97cb95
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/btver2.md
@@ -0,0 +1,1391 @@
+;; Copyright (C) 2012-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; AMD btver2 scheduling
+
+;; Instructions decoded are that are classifed as direct (fast path single),
+;; double (fast path double) and vector instructions.
+;; Direct instrucions are decoded and convereted into 1 cop
+;; Double instrucions are decoded and converetd into 2 cops
+;; Vector instrucions are microcoded and they generated converted to 
+;; 3 or more cops.
+
+(define_attr "btver2_decode" "direct,vector,double"
+  (const_string "direct"))
+
+(define_attr "btver2_sse_attr" "other,rcp,sqrt,maxmin"
+  (const_string "other"))
+
+(define_automaton "btver2,btver2_int,btver2_agu,btver2_fp")
+
+;; Decoder decodes up to two insns (2 fastpath singles) or 
+;;(2 fastpath doubles) or combination of both at a cycle.
+;; In case of vector (microded) instruction decoder decodes only one insn
+;; at a cycle .To model that we have 2 "decoder" units.
+
+(define_cpu_unit "btver2-decode0" "btver2")
+(define_cpu_unit "btver2-decode1" "btver2")
+
+;; "me" unit converts the decoded insn into cops.
+;; It can generate upto 2 cops from two fast path singles in cycle x+1, 
+;; to model we have two "mes". In case of fast path double it converts
+;; them to 2 cops in cycle x+1. Vector instructions are modelled to block
+;; all decoder units. 
+
+(define_cpu_unit "me0" "btver2")
+(define_cpu_unit "me1" "btver2")
+
+(define_reservation "btver2-direct" "(btver2-decode0|btver2-decode1),(me0|me1)")
+
+(define_reservation "btver2-double" "(btver2-decode0|btver2-decode1),(me0+me1)")
+
+(define_reservation "btver2-vector" "(btver2-decode0+btver2-decode1),(me0+me1)")
+
+;; Integer operations 
+;; There are 2 ALU pipes 
+
+(define_cpu_unit "btver2-ieu0" "btver2_int")
+(define_cpu_unit "btver2-ieu1" "btver2_int")
+
+;; There are 2 AGU pipes one for load and one for store.
+
+(define_cpu_unit "btver2-load"  "btver2_agu")
+(define_cpu_unit "btver2-store" "btver2_agu")
+
+;; ALU operations can take place in ALU pipe0 or pipe1. 
+(define_reservation "btver2-alu" "(btver2-ieu0|btver2-ieu1)")
+
+;; MUL and DIV operations can take place in to ALU pipe1.
+(define_reservation "btver2-mul" "btver2-ieu1")
+(define_reservation "btver2-div" "btver2-ieu1")
+
+;; vectorpath (microcoded) instructions are single issue instructions.
+;; So, they occupy all the integer units.
+(define_reservation "btver2-ivector" "btver2-ieu0+btver2-ieu1+
+                                      btver2-load+btver2-store")
+
+;;Floating point pipes.
+(define_cpu_unit "btver2-fp0" "btver2_fp")
+(define_cpu_unit "btver2-fp1" "btver2_fp")
+
+(define_reservation "btver2-fpa" "btver2-fp0")
+(define_reservation "btver2-vimul" "btver2-fp0")
+(define_reservation "btver2-valu" "btver2-fp0|btver2-fp1")
+(define_reservation "btver2-stc" "btver2-fp1")
+(define_reservation "btver2-fpm" "btver2-fp1")
+
+;; vectorpath (microcoded) instructions are single issue instructions.
+;; So, they occupy all the fp units.
+(define_reservation "btver2-fvector" "btver2-fp0+btver2-fp1+
+                                      btver2-load+btver2-store")
+
+;; Call instruction
+(define_insn_reservation "btver2_call" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "call,callv"))
+			 "btver2-double,btver2-load")
+
+;; General instructions
+;;
+
+(define_insn_reservation "btver2_push_mem" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "push")))
+			 "btver2-direct,btver2-load,btver2-alu")
+
+(define_insn_reservation "btver2_push" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "push"))
+			 "btver2-direct,btver2-alu")
+
+(define_insn_reservation "btver2_pop_mem" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "pop")))
+			 "btver2-direct,btver2-load,btver2-alu")
+
+(define_insn_reservation "btver2_pop" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "pop"))
+			 "btver2-direct,btver2-alu")
+
+(define_insn_reservation "btver2_leave" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "leave"))
+			 "btver2-double,btver2-alu")
+
+(define_insn_reservation "btver2_lea" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "lea"))
+			 "btver2-direct,btver2-alu")
+
+;; Integer  
+(define_insn_reservation "btver2_imul_DI" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "imul")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "memory" "none,unknown"))))
+			 "btver2-direct,btver2-mul*4")
+
+(define_insn_reservation "btver2_imul" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "none,unknown")))
+			 "btver2-direct,btver2-mul")
+
+(define_insn_reservation "btver2_imul_mem_DI" 9
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "imul")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "memory" "load,both"))))
+			 "btver2-direct,btver2-load,btver2-mul*4")
+
+(define_insn_reservation "btver2_imul_mem" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "load,both")))
+			 "btver2-direct,btver2-load,btver2-mul")
+
+(define_insn_reservation "btver2_idiv_DI" 41
+			    (and (eq_attr "cpu" "btver2")
+				 (and (eq_attr "type" "idiv")
+				      (and (eq_attr "mode" "DI")
+					   (eq_attr "memory" "none,unknown"))))
+			 "btver2-double,btver2-div")
+
+(define_insn_reservation "btver2_idiv_mem_DI" 44
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "idiv")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "memory" "load"))))
+			 "btver2-double,btver2-load,btver2-div")
+
+(define_insn_reservation "btver2_idiv_SI" 25
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "idiv")
+				   (and (eq_attr "mode" "SI")
+					(eq_attr "memory" "none,unknown"))))
+			 "btver2-double,btver2-div*25")
+
+(define_insn_reservation "btver2_idiv_mem_SI" 28
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "idiv")
+				   (and (eq_attr "mode" "SI")
+					(eq_attr "memory" "load"))))
+			 "btver2-double,btver2-load,btver2-div*25")
+
+(define_insn_reservation "btver2_idiv_HI" 17
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "idiv")
+				   (and (eq_attr "mode" "HI")
+					(eq_attr "memory" "none,unknown"))))
+			 "btver2-double,btver2-div*17")
+
+(define_insn_reservation "btver2_idiv_mem_HI" 20
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "idiv")
+				   (and (eq_attr "mode" "HI")
+					(eq_attr "memory" "load"))))
+			 "btver2-double,btver2-load,btver2-div*17")
+
+(define_insn_reservation "btver2_idiv_QI" 12
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "idiv")
+				   (and (eq_attr "mode" "SI")
+					(eq_attr "memory" "none,unknown"))))
+			 "btver2-direct,btver2-div*12")
+
+(define_insn_reservation "btver2_idiv_mem_QI" 15
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "idiv")
+				   (and (eq_attr "mode" "SI")
+					(eq_attr "memory" "load"))))
+			 "btver2-direct,btver2-load,btver2-div*12")
+
+(define_insn_reservation "btver2_str" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "str")
+				   (eq_attr "memory" "load,both,store")))
+			 "btver2-vector,btver2-ivector")
+
+(define_insn_reservation "btver2_idirect_loadmov" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "load")))
+			 "btver2-direct,btver2-load,btver2-alu")
+
+(define_insn_reservation "btver2_idirect_load" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "load"))))
+			 "btver2-direct,btver2-load,btver2-alu")
+
+(define_insn_reservation "btver2_idirect_movstore" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "store")))
+			 "btver2-direct,btver2-alu,btver2-store")
+
+(define_insn_reservation "btver2_idirect_both" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "both"))))
+			 "btver2-direct,btver2-load,btver2-alu,btver2-store")
+
+(define_insn_reservation "btver2_idirect_store" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "bdver1_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "store"))))
+			 "btver2-direct,btver2-alu,btver2-store")
+
+;; Other integer instrucions 
+(define_insn_reservation "btver2_idirect" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "btver2_decode" "direct")
+				   (and (eq_attr "unit" "integer,unknown")
+					(eq_attr "memory" "none,unknown"))))
+			 "btver2-direct,btver2-alu")
+
+;; Floating point instructions 
+(define_insn_reservation "btver2_fldxf" 19
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "fmov")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "mode" "XF"))))
+			 "btver2-vector,btver2-load,btver2-fvector*5")
+
+(define_insn_reservation "btver2_fld" 11
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "fmov")
+				   (eq_attr "memory" "load")))
+			 "btver2-direct,btver2-load,(btver2-fp0|btver2-fp1)")
+
+(define_insn_reservation "btver2_fstxf" 24
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "fmov")
+				   (and (eq_attr "memory" "both")
+					(eq_attr "mode" "XF"))))
+			 "btver2-vector,btver2-fvector*9,btver2-store")
+
+(define_insn_reservation "btver2_fst" 11
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "fmov")
+				   (eq_attr "memory" "store,both")))
+			 "btver2-direct,btver2-fp1,btver2-store")
+
+(define_insn_reservation "btver2_fist" 9
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "fistp,fisttp"))
+			 "btver2-direct,btver2-load,btver2-fp1")
+
+(define_insn_reservation "btver2_fmov" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "fmov"))
+			 "btver2-direct,(btver2-fp0|btver2-fp1)")
+
+(define_insn_reservation "btver2_fadd_load" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "fop")
+				   (eq_attr "memory" "load")))
+			 "btver2-direct,btver2-load,btver2-fp0")
+
+(define_insn_reservation "btver2_fadd" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "fop"))
+			 "btver2-direct,btver2-fp0")
+
+(define_insn_reservation "btver2_fmul_load" 10
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "fmul")
+				   (eq_attr "memory" "load")))
+			 "btver2-direct,btver2-load,btver2-fp1*3")
+
+(define_insn_reservation "btver2_fmul" 5
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "fmul"))
+			 "btver2-direct,(btver2-fp1*3)")
+
+(define_insn_reservation "btver2_fsgn" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "fsgn"))
+			 "btver2-direct,btver2-fp1*2")
+
+(define_insn_reservation "btver2_fdiv_load" 24
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "fdiv")
+				   (eq_attr "memory" "load")))
+			 "btver2-direct,btver2-load,btver2-fp1*19")
+
+(define_insn_reservation "btver2_fdiv" 19
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "fdiv"))
+			 "btver2-direct,btver2-fp1*19")
+
+(define_insn_reservation "btver2_fcmov_load" 12
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "fcmov")
+				   (eq_attr "memory" "load")))
+			 "btver2-vector,btver2-load,(btver2-fp0|btver2-fp1)*7")
+
+(define_insn_reservation "btver2_fcmov" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "fcmov"))
+			 "btver2-vector,(btver2-fp0|btver2-fp1)*7")
+
+(define_insn_reservation "btver2_fcomi_load" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "fcmp")
+				   (and (eq_attr "bdver1_decode" "double")
+					(eq_attr "memory" "load"))))
+			 "btver2-direct,btver2-load,btver2-fp0*2")
+
+(define_insn_reservation "btver2_fcomi" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "bdver1_decode" "double")
+				   (eq_attr "type" "fcmp")))
+			 "btver2-direct, btver2-fp0*2")
+
+(define_insn_reservation "btver2_fcom_load" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "type" "fcmp")
+				   (eq_attr "memory" "load")))
+			 "btver2-direct,btver2-load,btver2-fp0")
+
+(define_insn_reservation "btver2_fcom" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "fcmp"))
+			  "btver2-direct,btver2-fp0")
+
+(define_insn_reservation "btver2_fxch" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (eq_attr "type" "fxch"))
+			 "btver2-direct,btver2-fp1")
+
+;; SSE AVX maxmin,rcp,sqrt
+(define_insn_reservation "btver2_sse_maxmin" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF,V4DF,V2DF,V4SF,SF,DF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_sse_attr" "maxmin")
+					     (eq_attr "type" "sse,sseadd")))))
+			 "btver2-direct,btver2-fpa")
+
+(define_insn_reservation "btver2_sse_maxmin_mem" 7 
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF,V4DF,V2DF,V4SF,SF,DF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_sse_attr" "maxmin")
+					     (eq_attr "type" "sse,sseadd")))))
+			 "btver2-direct,btver2-load,btver2-fpa")
+
+(define_insn_reservation "btver2_sse_rcp" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4SF,SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_sse_attr" "rcp")
+					     (eq_attr "type" "sse")))))
+			 "btver2-direct,btver2-fpm")
+
+(define_insn_reservation "btver2_sse_rcp_mem" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4SF,SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_sse_attr" "rcp")
+					     (eq_attr "type" "sse")))))
+			 "btver2-direct,btver2-load,btver2-fpm")
+
+(define_insn_reservation "btver2_avx_rcp" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_sse_attr" "rcp")
+					     (eq_attr "type" "sse")))))
+			 "btver2-double,btver2-fpm*2")
+
+(define_insn_reservation "btver2_avx_rcp_mem" 7 
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_sse_attr" "rcp")
+					     (eq_attr "type" "sse")))))
+			 "btver2-double,btver2-load,btver2-fpm*2")
+
+(define_insn_reservation "btver2_sse_sqrt_v4sf" 21
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_sse_attr" "sqrt")
+					     (eq_attr "type" "sse")))))
+			 "btver2-direct,btver2-fpm*21")
+
+(define_insn_reservation "btver2_sse_sqrt_v4sf_mem" 26
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_sse_attr" "sqrt")
+					     (eq_attr "type" "sse")))))
+			 "btver2-direct,btver2-load,btver2-fpm*21")
+
+(define_insn_reservation "btver2_sse_sqrt_v4df" 54
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_sse_attr" "sqrt")
+					     (eq_attr "type" "sse")))))
+			 "btver2-double,btver2-fpm*54")
+
+(define_insn_reservation "btver2_sse_sqrt_v4df_mem" 59
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_sse_attr" "sqrt")
+					     (eq_attr "type" "sse")))))
+			 "btver2-double,btver2-load,btver2-fpm*54")
+
+(define_insn_reservation "btver2_sse_sqrt_sf" 16
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_sse_attr" "sqrt")
+					     (eq_attr "type" "sse")))))
+			 "btver2-direct,btver2-fpm*16")
+
+(define_insn_reservation "btver2_sse_sqrt_sf_mem" 21
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_sse_attr" "sqrt")
+					     (eq_attr "type" "sse")))))
+			 "btver2-direct,btver2-load,btver2-fpm*16")
+
+(define_insn_reservation "btver2_sse_sqrt_df" 27
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,DF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_sse_attr" "sqrt")
+					     (eq_attr "type" "sse")))))
+			 "btver2-direct,btver2-fpm*27")
+
+(define_insn_reservation "btver2_sse_sqrt_df_mem" 32
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,DF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_sse_attr" "sqrt")
+					     (eq_attr "type" "sse")))))
+			 "btver2-direct,btver2-load,btver2-fpm*27")
+
+(define_insn_reservation "btver2_sse_sqrt_v8sf" 42
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_sse_attr" "sqrt")
+					     (eq_attr "type" "sse")))))
+			 "btver2-double,btver2-fpm*42")
+
+(define_insn_reservation "btver2_sse_sqrt_v8sf_mem" 42
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_sse_attr" "sqrt")
+					     (eq_attr "type" "sse")))))
+			 "btver2-double,btver2-load,btver2-fpm*42")
+
+;; Bitmanipulation instrucions BMI LZCNT POPCNT 
+(define_insn_reservation "btver2_bmi_reg_direct"   1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "btver2_decode" "direct")
+				   (and (eq_attr "memory" "none")
+					(eq_attr "type" "bitmanip"))))
+			 "btver2-direct,btver2-alu")
+
+(define_insn_reservation "btver2_bmi_mem_direct" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "btver2_decode" "direct")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "type" "bitmanip"))))
+			 "btver2-direct,btver2-load,btver2-alu")
+
+(define_insn_reservation "btver2_bmi_reg_double"  2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "btver2_decode" "double")
+				   (and (eq_attr "memory" "none")
+					(eq_attr "type" "bitmanip,alu1"))))
+			 "btver2-double,btver2-alu")
+
+(define_insn_reservation "btver2_bmi_double_store"  5
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "store")
+				   (and (eq_attr "btver2_decode" "double")
+					(eq_attr "type" "bitmanip,alu1"))))
+			 "btver2-double,btver2-alu,btver2-store")
+
+(define_insn_reservation "btver2_bmi_double_load" 4 
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "btver2_decode" "double")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "type" "bitmanip,alu1"))))
+			 "btver2-double,btver2-load,btver2-alu")
+
+;; F16C converts
+(define_insn_reservation "btver2_ssecvt_load_direct" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF,V4SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-direct,btver2-load,btver2-stc")
+
+(define_insn_reservation "btver2_ssecvt_store_direct" 8
+			 (and (eq_attr "cpu" "btver2")
+			     (and (eq_attr "mode" "V8SF,V4SF")
+				  (and (eq_attr "memory" "store")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-direct,btver2-stc,btver2-store")
+
+(define_insn_reservation "btver2_ssecvt_reg_direct" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF,V4SF")
+				   (and (eq_attr "btver2_decode" "direct")
+					(eq_attr "type" "ssecvt"))))
+			 "btver2-direct,btver2-stc")
+
+(define_insn_reservation "btver2_ssecvt_load_double" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF,V4SF")
+				  (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "double")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-double,btver2-load,btver2-stc*2")
+
+(define_insn_reservation "btver2_ssecvt_reg_double" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF,V4SF")
+				   (and (eq_attr "btver2_decode" "double")
+					(eq_attr "type" "ssecvt"))))
+			 "btver2-double,btver2-stc*2")
+
+(define_insn_reservation "btver2_ssecvt_store_vector" 11
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF,V4SF")
+				   (and (eq_attr "memory" "store")
+					(and (eq_attr "btver2_decode" "vector")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-vector,btver2-stc,(btver2-fpa|btver2-fpm),btver2-store")
+
+(define_insn_reservation "btver2_ssecvt_reg_vector" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF,V4SF")
+				   (and (eq_attr "btver2_decode" "vector")
+					(eq_attr "type" "ssecvt"))))
+			 "btver2-vector,btver2-stc,(btver2-fpa|btver2-fpm)")
+
+;; avx256 adds
+(define_insn_reservation "btver2_avx_add_load_256" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "type" "sseadd,sseadd1"))))
+			 "btver2-double,btver2-load,btver2-fpa")
+
+(define_insn_reservation "btver2_avx_add_reg_256" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(eq_attr "type" "sseadd,sseadd1"))))
+			 "btver2-double,btver2-fpa")
+
+;; avx256 logs 
+(define_insn_reservation "btver2_avx_load_log" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "sselog,sselog1")))))
+			 "btver2-double,btver2-load,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_avx_reg_log" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "sselog,sselog1")))))
+			 "btver2-double,(btver2-fpa|btver2-fpm)")
+
+;; avx256 sse
+
+(define_insn_reservation "btver2_avx_load_sse" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "sse")))))
+			 "btver2-double,btver2-load,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_avx_reg_sse" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "sse")))))
+			 "btver2-double,(btver2-fpa|btver2-fpm)")
+
+;; avx256 moves
+(define_insn_reservation "btver2_avx_load_int_mov" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "OI")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "type" "ssemov"))))
+			 "btver2-double,btver2-load,btver2-valu")
+
+(define_insn_reservation "btver2_avx_store_int_mov" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "OI")
+				   (and (eq_attr "memory" "store")
+					(eq_attr "type" "ssemov"))))
+			 "btver2-double,btver2-valu,btver2-store")
+
+(define_insn_reservation "btver2_avx_int_mov" 1 
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "OI")
+				   (and (eq_attr "memory" "none,unknown")
+					(eq_attr "type" "ssemov"))))
+			 "btver2-double,btver2-valu")
+
+(define_insn_reservation "btver2_avx_load_from_vectors" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF,V4DF")
+				   (and (ior ( match_operand:V4SF 1 "memory_operand")
+					     ( match_operand:V2DF 1 "memory_operand"))
+					(and (eq_attr "memory" "load")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-double,btver2-load,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_avx_loads_from_scalar" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF,V4DF")
+				   (and (ior ( match_operand:SF 1 "memory_operand")
+					     ( match_operand:DF 1 "memory_operand"))
+					(and (eq_attr "memory" "load")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-double,btver2-load,(btver2-fpa|btver2-fpm)*2")
+
+(define_insn_reservation "btver2_avx_store_move" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "store")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-double,(btver2-fpa|btver2-fpm),btver2-store")
+
+(define_insn_reservation "btver2_avx_load_move" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-double,btver2-load,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_avx_reg_move" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-double,(btver2-fpa|btver2-fpm)")
+;; avx256 cmps
+(define_insn_reservation "btver2_avx_load_cmp" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "type" "ssecmp"))))
+			 "btver2-double,btver2-load,(btver2-fpa|btver2-fpm)*2")
+
+(define_insn_reservation "btver2_avx_cmp" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(eq_attr "type" "ssecmp"))))
+			 "btver2-double,(btver2-fpa|btver2-fpm)*2")
+
+;; ssecvts 256 
+(define_insn_reservation "btver2_ssecvt_256_load" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,OI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-double,btver2-load,btver2-stc*2")
+
+(define_insn_reservation "btver2_ssecvt_256" 3 
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,OI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-double,btver2-stc*2")
+
+(define_insn_reservation "btver2_ssecvt_256_vector_load" 11
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,OI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "vector")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-vector,btver2-load,btver2-stc*2,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_ssecvt_256_vector" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,OI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "vector")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-vector,btver2-stc*2,(btver2-fpa|btver2-fpm)")
+
+;; avx256 divides
+(define_insn_reservation "btver2_avx_load_div" 43
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssediv")))))
+			 "btver2-double,btver2-load,btver2-fpm*38")
+
+(define_insn_reservation "btver2_avx_div" 38
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF,V8SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssediv")))))
+			 "btver2-double,btver2-fpm*38")
+
+;; avx256  multiply
+
+(define_insn_reservation "btver2_avx_mul_load_pd" 9
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssemul")))))
+			"btver2-double,btver2-load,btver2-fpm*4")
+
+(define_insn_reservation "btver2_avx_mul_load_ps" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssemul")))))
+			 "btver2-double,btver2-load,btver2-fpm*2")
+
+
+(define_insn_reservation "btver2_avx_mul_256_pd" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4DF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssemul")))))
+			 "btver2-double,btver2-fpm*4")
+
+(define_insn_reservation "btver2_avx_mul_256_ps" 2	
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssemul")))))
+			 "btver2-double,btver2-fpm*2")
+
+(define_insn_reservation "btver2_avx_dpps_load_ps" 17
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "vector")
+					     (eq_attr "type" "ssemul")))))
+			 "btver2-vector,btver2-fpm*6,btver2-fpa*6")
+
+(define_insn_reservation "btver2_avx_dpps_ps" 12
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V8SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "vector")
+					     (eq_attr "type" "ssemul")))))
+			 "btver2-vector,btver2-fpm*6,btver2-fpa*6")
+
+;; AES/CLMUL
+
+(define_insn_reservation "btver2_aes_double" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (match_operand:V2DI 0 "register_operand")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "double")
+					     (eq_attr "type" "sselog1")))))
+			 "btver2-double,btver2-valu,btver2-vimul")
+
+(define_insn_reservation "btver2_aes_direct" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (match_operand:V2DI 0 "register_operand")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sselog1")))))
+			 "btver2-direct,btver2-vimul")
+
+;; AVX128 SSE4* SSSE3 SSE3* SSE2 SSE instructions 
+
+(define_insn_reservation "btver2_sseint_load_direct" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sse,ssecmp,sseiadd")))))
+			 "btver2-direct,btver2-load,btver2-valu")
+
+(define_insn_reservation "btver2_sseint_direct" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sse,ssecmp,sseiadd")))))
+			 "btver2-direct,btver2-valu")
+
+(define_insn_reservation "btver2_sselog_direct" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,V4SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sse,sselog")))))
+			 "btver2-direct,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_sselog_load_direct" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,V4SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sse,sselog")))))
+			 "btver2-direct,btver2-load,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_intext_reg_128" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SF,QI,SI,HI,SI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sselog")))))
+			 "btver2-direct,btver2-fpa")
+
+(define_insn_reservation "btver2_sse_mov_direct" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,V4SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-direct,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_sse_mov_vector" 2 
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,V4SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "vector")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-vector,(btver2-fpa|btver2-fpm)*2")
+
+(define_insn_reservation "btver2_ssecomi_load_128" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssecomi")))))
+			 "btver2-direct,btver2-load,btver2-fpa")
+
+(define_insn_reservation "btver2_ssecomi_reg_128" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "!vector")
+					     (eq_attr "type" "ssecomi")))))
+			 "btver2-direct,btver2-fpa")
+
+(define_insn_reservation "btver2_ssemul_load_v2df" 14
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "vector")
+					     (eq_attr "type" "ssemul")))))
+			 "btver2-vector,btver2-load,btver2-fpm*2,btver2-fpa")
+
+(define_insn_reservation "btver2_ssemul_reg_v2df" 9
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "vector")
+					     (eq_attr "type" "ssemul")))))
+			 "btver2-vector,btver2-fpm*2,btver2-fpa")
+
+(define_insn_reservation "btver2_ssemul_load_v4sf" 16
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "vector")
+					     (eq_attr "type" "ssemul")))))
+			"btver2-vector,btver2-load,btver2-fpm*3,btver2-fpa*2")
+
+(define_insn_reservation "btver2_ssemul_reg_v4sf" 11
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "vector")
+					     (eq_attr "type" "ssemul")))))
+			 "btver2-vector,btver2-fpm*3,btver2-fpa*2")
+
+(define_insn_reservation "btver2_sse_store_vectmov" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "btver2_decode" "vector")
+					(eq_attr "type" "ssemov"))))
+			"btver2-vector,btver2-valu*3,btver2-store")
+
+(define_insn_reservation "btver2_sse_load_vectmov" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "btver2_decode" "vector")
+					(eq_attr "type" "ssemov"))))
+			 "btver2-vector,btver2-load,btver2-valu*3")
+
+(define_insn_reservation "btver2_sse_vectmov" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "none,unknown")
+				   (and (eq_attr "btver2_decode" "vector")
+					(eq_attr "type" "ssemov"))))
+			 "btver2-vector,btver2-valu*3")
+
+
+(define_insn_reservation "btver2_sseimul" 2 
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "none,unknown")
+				   (and (eq_attr "btver2_decode" "direct")
+					(eq_attr "type" "sseimul"))))
+			 "btver2-direct,btver2-vimul")
+
+(define_insn_reservation "btver2_sseimul_load" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "btver2_decode" "direct")
+					(eq_attr "type" "sseimul"))))
+			 "btver2-direct,btver2-load,btver2-vimul")
+
+(define_insn_reservation "btver2_sseimul_load_vect" 9
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "btver2_decode" "vector")
+					(eq_attr "type" "sseimul"))))
+			 "btver2-vector,btver2-load,btver2-vimul*2,btver2-valu")
+
+(define_insn_reservation "btver2_sseimul_vect" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "none,unknown")
+				   (and (eq_attr "btver2_decode" "vector")
+					(eq_attr "type" "sseimul"))))
+			 "btver2-vector,btver2-vimul*2,btver2-valu")
+
+(define_insn_reservation "btver2_sseins" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "none,unknown")
+				   (eq_attr "type" "sseins")))
+			 "btver2-vector,btver2-valu*3")
+
+(define_insn_reservation "btver2_sseishft_load" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "btver2_decode" "direct")
+					(eq_attr "type" "sseishft"))))
+			 "btver2-direct,btver2-load,btver2-valu")
+
+(define_insn_reservation "btver2_sseishft_direct" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "none,unknown")
+				   (and (eq_attr "btver2_decode" "direct") 
+					(eq_attr "type" "sseishft"))))
+			 "btver2-direct,btver2-valu")
+
+(define_insn_reservation "btver2_sselog1_load" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode"  "!V8SF,!V4DF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sselog1")))))
+			 "btver2-direct,btver2-load,btver2-valu")
+
+(define_insn_reservation "btver2_sselog1_direct" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode"  "!V8SF,!V4DF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sselog1")))))
+			 "btver2-direct,btver2-valu")
+
+(define_insn_reservation "btver2_sselog1_vector_load" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "btver2_decode" "vector")
+					(eq_attr "type" "sselog1"))))
+			 "btver2-vector,btver2-valu*2")
+
+(define_insn_reservation "btver2_sselog1_vector" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "none,unknown")
+				   (and (eq_attr "btver2_decode" "vector")
+					(eq_attr "type" "sselog1"))))
+			 "btver2-vector,btver2-valu*2")
+
+(define_insn_reservation "btver2_sseadd_load" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4SF,V2DF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sseadd,sseadd1")))))
+			 "btver2-direct,btver2-load,btver2-fpa")
+
+(define_insn_reservation "btver2_sseadd_reg" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V4SF,V2DF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sseadd,sseadd1")))))
+			 "btver2-direct,btver2-fpa")
+
+;;SSE2 SSEint SSEfp SSE
+
+(define_insn_reservation "btver2_sseint_to_scalar_move_with_load" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SI,DI")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-direct,btver2-load,btver2-fpa")
+
+(define_insn_reservation "btver2_sseint_to_scalar_move_with_store" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SI,DI")
+				   (and (eq_attr "memory" "store")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-direct,btver2-fpa,btver2-store")
+
+
+(define_insn_reservation "btver2_scalar_to_sseint_move_with_load" 11
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI")
+				   (and (ior ( match_operand:SI 1 "memory_operand")
+					     ( match_operand:DI 1 "memory_operand"))
+					(eq_attr "type" "ssemov"))))
+			 "btver2-direct,btver2-load,btver2-stc,btver2-valu")
+
+(define_insn_reservation "btver2_sseint_to_scalar" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SI,DI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-direct,btver2-fpa")
+
+(define_insn_reservation "btver2_scalar_to_sseint" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI")
+				   (and (ior ( match_operand:SI 1 "register_operand")
+					     ( match_operand:DI 1 "register_operand"))
+					(eq_attr "type" "ssemov"))))
+			    "btver2-direct,btver2-stc,btver2-valu")
+
+(define_insn_reservation "btver2_sse_int_load" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssemov,sselog,sseishft1")))))
+			 "btver2-direct,btver2-load,btver2-valu")
+
+(define_insn_reservation "btver2_sse_int_direct" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct") 
+					     (eq_attr "type" "ssemov,sselog,sseishft1")))))
+			 "btver2-direct,btver2-valu")
+
+(define_insn_reservation "btver2_sse_int_cvt_load" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "DI")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sseicvt")))))
+			 "btver2-direct,btver2-load,btver2-valu")
+
+(define_insn_reservation "btver2_sse_int_cvt" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "DI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct") 
+					     (eq_attr "type" "sseicvt")))))
+			 "btver2-direct,btver2-valu")
+
+(define_insn_reservation "btver2_sse_int_32_move" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SI,DI")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssemov")))))
+			 "btver2-direct,btver2-fpa")
+
+(define_insn_reservation "btver2_int_32_sse_move" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI")
+				   (and (ior ( match_operand:SI 1 "register_operand")
+					     ( match_operand:DI 1 "register_operand"))
+					(eq_attr "type" "ssemov"))))
+			 "btver2-direct,btver2-stc,btver2-valu")
+
+(define_insn_reservation "btver2_sse2cvt_load_direct" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI,V4SF,V2DF,DI")
+				   (and (eq_attr "memory" "load") 
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-direct,btver2-load,btver2-stc")
+
+(define_insn_reservation "btver2_sse2cvt_reg_direct" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "TI,V4SF,V2DF,DI") 
+				   (and (eq_attr "btver2_decode" "direct")
+					(eq_attr "type" "ssecvt"))))
+			 "btver2-direct,btver2-stc")
+
+(define_insn_reservation "btver2_sseicvt_load_si" 11
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SI")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "double")
+					     (eq_attr "type" "sseicvt")))))
+			 "btver2-double,btver2-load,btver2-stc,btver2-fpa")
+
+(define_insn_reservation "btver2_sseicvt_si" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SI")
+				   (and (eq_attr "btver2_decode" "double")
+					(eq_attr "type" "sseicvt"))))
+			 "btver2-double,btver2-stc,btver2-fpa")
+
+(define_insn_reservation "btver2_ssecvt_load_df" 11
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "DF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "double")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-double,btver2-load,btver2-stc*2")
+
+
+(define_insn_reservation "btver2_ssecvt_df" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "DF")
+				   (and (eq_attr "btver2_decode" "double")
+					(eq_attr "type" "ssecvt"))))
+			 "btver2-double,btver2-stc*2")
+
+(define_insn_reservation "btver2_ssecvt_load_sf" 12
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "double")
+					     (eq_attr "type" "ssecvt")))))
+			 "btver2-double,btver2-load,btver2-stc*2")
+
+(define_insn_reservation "btver2_ssecvt_sf" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SF")
+				   (and (eq_attr "btver2_decode" "double")
+					(eq_attr "type" "ssecvt"))))
+			 "btver2-double,btver2-stc*2")
+
+(define_insn_reservation "btver2_sseicvt_load_df" 14
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "DF,SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "double")
+					     (eq_attr "type" "sseicvt")))))
+			 "btver2-double,btver2-load,btver2-stc")
+;;st,ld-stc
+(define_insn_reservation "btver2_sseicvt_df" 9
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "DF,SF")
+				   (and (eq_attr "btver2_decode" "double")
+					(eq_attr "type" "sseicvt"))))
+			 "btver2-double,btver2-stc")
+
+
+(define_insn_reservation "btver2_scalar_sse_load_add" 8
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "DF,SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sseadd")))))
+			 "btver2-direct,btver2-load,btver2-fpa")
+
+(define_insn_reservation "btver2_scalar_sse_add" 3
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "DF,SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "sseadd")))))
+			 "btver2-direct,btver2-fpa")
+
+(define_insn_reservation "btver2_int_sse_cmp_load" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,V4SF,DF,SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssecmp")))))
+			 "btver2-direct,btver2-load,btver2-fpa")
+
+(define_insn_reservation "btver2_int_sse_cmp" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,V4SF,DF,SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssecmp")))))
+			 "btver2-direct,btver2-fpa")
+
+(define_insn_reservation "btver2_int_sse_comsi_load" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "DF,SF")
+				   (and (eq_attr "memory" "load")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssecomi")))))
+			 "btver2-direct,btver2-fpa")
+
+(define_insn_reservation "btver2_int_sse_comsi" 2
+			 (and (eq_attr "cpu" "btver2")
+			     (and (eq_attr "mode" "DF,SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(and (eq_attr "btver2_decode" "direct")
+					     (eq_attr "type" "ssecomi")))))
+			 "btver2-direct,btver2-fpa")
+
+(define_insn_reservation "btver2_ssemmx_mov_load_default" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "btver2_decode" "direct")
+					(eq_attr "type" "ssemov,mmxmov"))))
+			 "btver2-direct,btver2-load,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_ssemmx_mov_store_default" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "store,both")
+				   (and (eq_attr "btver2_decode" "direct")
+					(eq_attr "type" "ssemov,mmxmov"))))
+			 "btver2-direct,(btver2-fpa|btver2-fpm),btver2-store")
+
+(define_insn_reservation "btver2_sse_mov_default" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "none,unknown")
+				   (and (eq_attr "btver2_decode" "direct")
+					(eq_attr "type" "ssemov,mmxmov"))))
+			 "btver2-direct,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_sse_shuf_double" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "none,unknown")
+				   (and (eq_attr "mode" "V4DF,V8SF")     
+					(eq_attr "type" "sseshuf"))))
+			 "btver2-double,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_sse_shuf_direct" 1
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "none,unknown")
+				   (and (eq_attr "mode" "V2DF,V4SF")
+					(eq_attr "type" "sseshuf,sseshuf1"))))
+			 "btver2-direct,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_sse_shuf_double_load" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "V4DF,V8SF")
+					(eq_attr "type" "sseshuf"))))
+			 "btver2-double,btver2-load,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_sse_shuf_direct_load" 6
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "V2DF,V4SF")
+					(eq_attr "type" "sseshuf,sseshuf1"))))
+			 "btver2-direct,btver2-load,(btver2-fpa|btver2-fpm)")
+
+(define_insn_reservation "btver2_sse_div" 19
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,DF,V4SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(eq_attr "type" "ssediv"))))
+			 "btver2-direct,btver2-fpm*19")
+
+(define_insn_reservation "btver2_sse_div_sf" 14
+			 (and (eq_attr "cpu" "btver2")
+			     (and (eq_attr "mode" "SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(eq_attr "type" "ssediv"))))
+			 "btver2-direct,btver2-fpm*14")
+
+(define_insn_reservation "btver2_sse_mul" 4
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,DF,V4SF,SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(eq_attr "type" "ssemul"))))
+			 "btver2-direct,btver2-fpm*2")
+
+(define_insn_reservation "btver2_sse_mul_sf" 2
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,DF,V4SF,SF")
+				   (and (eq_attr "memory" "none,unknown")
+					(eq_attr "type" "ssemul"))))
+			 "btver2-direct,btver2-fpm")
+
+(define_insn_reservation "btver2_sse_div_load" 24
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,DF,V4SF")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "type" "ssediv"))))
+			 "btver2-direct,btver2-load,btver2-fpm*19")
+
+(define_insn_reservation "btver2_sse_div_sf_load" 19
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "SF")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "type" "ssediv"))))
+			 "btver2-direct,btver2-load,btver2-fpm*14")
+
+(define_insn_reservation "btver2_sse_mul_load" 9
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,DF,V4SF,SF")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "type" "ssemul"))))
+			 "btver2-direct,btver2-load,btver2-fpm*2")
+
+(define_insn_reservation "btver2_sse_mul_sf_load" 7
+			 (and (eq_attr "cpu" "btver2")
+			      (and (eq_attr "mode" "V2DF,DF,V4SF,SF")
+				   (and (eq_attr "memory" "load")
+					(eq_attr "type" "ssemul"))))
+			 "btver2-direct,btver2-load,btver2-fpm")
+
diff --git a/gcc-4.9/gcc/config/i386/constraints.md b/gcc-4.9/gcc/config/i386/constraints.md
new file mode 100644
index 000000000..65335f128
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/constraints.md
@@ -0,0 +1,246 @@
+;; Constraint definitions for IA-32 and x86-64.
+;; Copyright (C) 2006-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;;; Unused letters:
+;;;     B     H
+;;;           h j
+
+;; Integer register constraints.
+;; It is not necessary to define 'r' here.
+(define_register_constraint "R" "LEGACY_REGS"
+ "Legacy register---the eight integer registers available on all
+  i386 processors (@code{a}, @code{b}, @code{c}, @code{d},
+  @code{si}, @code{di}, @code{bp}, @code{sp}).")
+
+(define_register_constraint "q" "TARGET_64BIT ? GENERAL_REGS : Q_REGS"
+ "Any register accessible as @code{@var{r}l}.  In 32-bit mode, @code{a},
+  @code{b}, @code{c}, and @code{d}; in 64-bit mode, any integer register.")
+
+(define_register_constraint "Q" "Q_REGS"
+ "Any register accessible as @code{@var{r}h}: @code{a}, @code{b},
+  @code{c}, and @code{d}.")
+
+(define_register_constraint "l" "INDEX_REGS"
+ "@internal Any register that can be used as the index in a base+index
+  memory access: that is, any general register except the stack pointer.")
+
+(define_register_constraint "a" "AREG"
+ "The @code{a} register.")
+
+(define_register_constraint "b" "BREG"
+ "The @code{b} register.")
+
+(define_register_constraint "c" "CREG"
+ "The @code{c} register.")
+
+(define_register_constraint "d" "DREG"
+ "The @code{d} register.")
+
+(define_register_constraint "S" "SIREG"
+ "The @code{si} register.")
+
+(define_register_constraint "D" "DIREG"
+ "The @code{di} register.")
+
+(define_register_constraint "A" "AD_REGS"
+ "The @code{a} and @code{d} registers, as a pair (for instructions
+  that return half the result in one and half in the other).")
+
+(define_register_constraint "U" "CLOBBERED_REGS"
+ "The call-clobbered integer registers.")
+
+;; Floating-point register constraints.
+(define_register_constraint "f"
+ "TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387 ? FLOAT_REGS : NO_REGS"
+ "Any 80387 floating-point (stack) register.")
+
+(define_register_constraint "t"
+ "TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387 ? FP_TOP_REG : NO_REGS"
+ "Top of 80387 floating-point stack (@code{%st(0)}).")
+
+(define_register_constraint "u"
+ "TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387 ? FP_SECOND_REG : NO_REGS"
+ "Second from top of 80387 floating-point stack (@code{%st(1)}).")
+
+(define_register_constraint "Yk" "TARGET_AVX512F ? MASK_EVEX_REGS : NO_REGS"
+"@internal Any mask register that can be used as predicate, i.e. k1-k7.")
+
+(define_register_constraint "k" "TARGET_AVX512F ? MASK_REGS : NO_REGS"
+"@internal Any mask register.")
+
+;; Vector registers (also used for plain floating point nowadays).
+(define_register_constraint "y" "TARGET_MMX ? MMX_REGS : NO_REGS"
+ "Any MMX register.")
+
+(define_register_constraint "x" "TARGET_SSE ? SSE_REGS : NO_REGS"
+ "Any SSE register.")
+
+;; We use the Y prefix to denote any number of conditional register sets:
+;;  z	First SSE register.
+;;  i	SSE2 inter-unit moves to SSE register enabled
+;;  j	SSE2 inter-unit moves from SSE register enabled
+;;  m	MMX inter-unit moves to MMX register enabled
+;;  n	MMX inter-unit moves from MMX register enabled
+;;  a	Integer register when zero extensions with AND are disabled
+;;  p	Integer register when TARGET_PARTIAL_REG_STALL is disabled
+;;  d	Integer register when integer DFmode moves are enabled
+;;  x	Integer register when integer XFmode moves are enabled
+;;  f	x87 register when 80387 floating point arithmetic is enabled
+
+(define_register_constraint "Yz" "TARGET_SSE ? SSE_FIRST_REG : NO_REGS"
+ "First SSE register (@code{%xmm0}).")
+
+(define_register_constraint "Yi"
+ "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC ? ALL_SSE_REGS : NO_REGS"
+ "@internal Any SSE register, when SSE2 and inter-unit moves to vector registers are enabled.")
+
+(define_register_constraint "Yj"
+ "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC ? ALL_SSE_REGS : NO_REGS"
+ "@internal Any SSE register, when SSE2 and inter-unit moves from vector registers are enabled.")
+
+(define_register_constraint "Ym"
+ "TARGET_MMX && TARGET_INTER_UNIT_MOVES_TO_VEC ? MMX_REGS : NO_REGS"
+ "@internal Any MMX register, when inter-unit moves to vector registers are enabled.")
+
+(define_register_constraint "Yn"
+ "TARGET_MMX && TARGET_INTER_UNIT_MOVES_FROM_VEC ? MMX_REGS : NO_REGS"
+ "@internal Any MMX register, when inter-unit moves from vector registers are enabled.")
+
+(define_register_constraint "Yp"
+ "TARGET_PARTIAL_REG_STALL ? NO_REGS : GENERAL_REGS"
+ "@internal Any integer register when TARGET_PARTIAL_REG_STALL is disabled.")
+
+(define_register_constraint "Ya"
+ "TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun)
+  ? NO_REGS : GENERAL_REGS"
+ "@internal Any integer register when zero extensions with AND are disabled.")
+
+(define_register_constraint "Yd"
+ "TARGET_INTEGER_DFMODE_MOVES && optimize_function_for_speed_p (cfun)
+  ? GENERAL_REGS : NO_REGS"
+ "@internal Any integer register when integer DFmode moves are enabled.")
+
+(define_register_constraint "Yx"
+ "optimize_function_for_speed_p (cfun) ? GENERAL_REGS : NO_REGS"
+ "@internal Any integer register when integer XFmode moves are enabled.")
+
+(define_register_constraint "Yf"
+ "(ix86_fpmath & FPMATH_387) ? FLOAT_REGS : NO_REGS"
+ "@internal Any x87 register when 80387 FP arithmetic is enabled.")
+
+(define_register_constraint "v" "TARGET_SSE ? ALL_SSE_REGS : NO_REGS"
+ "Any EVEX encodable SSE register (@code{%xmm0-%xmm31}).")
+
+(define_constraint "z"
+  "@internal Constant call address operand."
+  (match_operand 0 "constant_call_address_operand"))
+
+(define_constraint "w"
+  "@internal Call memory operand."
+  (and (not (match_test "TARGET_X32"))
+       (match_operand 0 "memory_operand")))
+
+;; Integer constant constraints.
+(define_constraint "I"
+  "Integer constant in the range 0 @dots{} 31, for 32-bit shifts."
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (ival, 0, 31)")))
+
+(define_constraint "J"
+  "Integer constant in the range 0 @dots{} 63, for 64-bit shifts."
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (ival, 0, 63)")))
+
+(define_constraint "K"
+  "Signed 8-bit integer constant."
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (ival, -128, 127)")))
+
+(define_constraint "L"
+  "@code{0xFF}, @code{0xFFFF} or @code{0xFFFFFFFF}
+   for AND as a zero-extending move."
+  (and (match_code "const_int")
+       (match_test "ival == 0xff || ival == 0xffff
+		    || ival == (HOST_WIDE_INT) 0xffffffff")))
+
+(define_constraint "M"
+  "0, 1, 2, or 3 (shifts for the @code{lea} instruction)."
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (ival, 0, 3)")))
+
+(define_constraint "N"
+  "Unsigned 8-bit integer constant (for @code{in} and @code{out}
+   instructions)."
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (ival, 0, 255)")))
+
+(define_constraint "O"
+  "@internal Integer constant in the range 0 @dots{} 127, for 128-bit shifts."
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (ival, 0, 127)")))
+
+;; Floating-point constant constraints.
+;; We allow constants even if TARGET_80387 isn't set, because the
+;; stack register converter may need to load 0.0 into the function
+;; value register (top of stack).
+(define_constraint "G"
+  "Standard 80387 floating point constant."
+  (and (match_code "const_double")
+       (match_test "standard_80387_constant_p (op) > 0")))
+
+;; This can theoretically be any mode's CONST0_RTX.
+(define_constraint "C"
+  "Standard SSE floating point constant."
+  (match_test "standard_sse_constant_p (op)"))
+
+;; Constant-or-symbol-reference constraints.
+
+(define_constraint "e"
+  "32-bit signed integer constant, or a symbolic reference known
+   to fit that range (for immediate operands in sign-extending x86-64
+   instructions)."
+  (match_operand 0 "x86_64_immediate_operand"))
+
+;; We use W prefix to denote any number of
+;; constant-or-symbol-reference constraints
+
+(define_constraint "Wz"
+  "32-bit unsigned integer constant, or a symbolic reference known
+   to fit that range (for zero-extending conversion operations that
+   require non-VOIDmode immediate operands)."
+  (and (match_operand 0 "x86_64_zext_immediate_operand")
+       (match_test "GET_MODE (op) != VOIDmode")))
+
+(define_constraint "Z"
+  "32-bit unsigned integer constant, or a symbolic reference known
+   to fit that range (for immediate operands in zero-extending x86-64
+   instructions)."
+  (match_operand 0 "x86_64_zext_immediate_operand"))
+
+;; T prefix is used for different address constraints
+;;   v - VSIB address
+;;   s - address with no segment register
+
+(define_address_constraint "Tv"
+  "VSIB address operand"
+  (match_operand 0 "vsib_address_operand"))
+
+(define_address_constraint "Ts"
+  "Address operand without segment register"
+  (match_operand 0 "address_no_seg_operand"))
diff --git a/gcc-4.9/gcc/config/i386/core2.md b/gcc-4.9/gcc/config/i386/core2.md
new file mode 100644
index 000000000..53df9eede
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/core2.md
@@ -0,0 +1,691 @@
+;; Scheduling for Core 2 and derived processors.
+;; Copyright (C) 2004-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; The scheduling description in this file is based on the one in ppro.md,
+;; with additional information obtained from
+;;
+;;    "How to optimize for the Pentium family of microprocessors",
+;;    by Agner Fog, PhD.
+;;
+;; The major difference from the P6 pipeline is one extra decoder, and
+;; one extra execute unit.  Due to micro-op fusion, many insns no longer
+;; need to be decoded in decoder 0, but can be handled by all of them.
+
+;; The core2_idiv, core2_fdiv and core2_ssediv automata are used to
+;; model issue latencies of idiv, fdiv and ssediv type insns.
+(define_automaton "core2_decoder,core2_core,core2_idiv,core2_fdiv,core2_ssediv,core2_load,core2_store")
+
+;; The CPU domain, used for Core i7 bypass latencies
+(define_attr "i7_domain" "int,float,simd"
+  (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint")
+	   (const_string "float")
+	 (eq_attr "type" "sselog,sselog1,sseiadd,sseiadd1,sseishft,sseishft1,sseimul,
+			  sse,ssemov,sseadd,sseadd1,ssemul,ssecmp,ssecomi,ssecvt,
+			  ssecvt1,sseicvt,ssediv,sseins,ssemuladd,sse4arg")
+	   (cond [(eq_attr "mode" "V4DF,V8SF,V2DF,V4SF,SF,DF")
+		    (const_string "float")
+		  (eq_attr "mode" "SI")
+		    (const_string "int")]
+		  (const_string "simd"))
+	 (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
+	   (const_string "simd")]
+	(const_string "int")))
+
+;; As for the Pentium Pro,
+;;  - an instruction with 1 uop can be decoded by any of the three
+;;    decoders in one cycle.
+;;  - an instruction with 1 to 4 uops can be decoded only by decoder 0
+;;    but still in only one cycle.
+;;  - a complex (microcode) instruction can also only be decoded by
+;;    decoder 0, and this takes an unspecified number of cycles.
+;;
+;; The goal is to schedule such that we have a few-one-one uops sequence
+;; in each cycle, to decode as many instructions per cycle as possible.
+(define_cpu_unit "c2_decoder0" "core2_decoder")
+(define_cpu_unit "c2_decoder1" "core2_decoder")
+(define_cpu_unit "c2_decoder2" "core2_decoder")
+(define_cpu_unit "c2_decoder3" "core2_decoder")
+
+;; We first wish to find an instruction for c2_decoder0, so exclude
+;; c2_decoder1 and c2_decoder2 from being reserved until c2_decoder 0 is
+;; reserved.
+(presence_set "c2_decoder1" "c2_decoder0")
+(presence_set "c2_decoder2" "c2_decoder0")
+(presence_set "c2_decoder3" "c2_decoder0")
+
+;; Most instructions can be decoded on any of the three decoders.
+(define_reservation "c2_decodern" "(c2_decoder0|c2_decoder1|c2_decoder2|c2_decoder3)")
+
+;; The out-of-order core has six pipelines.  These are similar to the
+;; Pentium Pro's five pipelines.  Port 2 is responsible for memory loads,
+;; port 3 for store address calculations, port 4 for memory stores, and
+;; ports 0, 1 and 5 for everything else.
+
+(define_cpu_unit "c2_p0,c2_p1,c2_p5" "core2_core")
+(define_cpu_unit "c2_p2" "core2_load")
+(define_cpu_unit "c2_p3,c2_p4" "core2_store")
+(define_cpu_unit "c2_idiv" "core2_idiv")
+(define_cpu_unit "c2_fdiv" "core2_fdiv")
+(define_cpu_unit "c2_ssediv" "core2_ssediv")
+
+;; Only the irregular instructions have to be modeled here.  A load
+;; increases the latency by 2 or 3, or by nothing if the manual gives
+;; a latency already.  Store latencies are not accounted for.
+;;
+;; The simple instructions follow a very regular pattern of 1 uop per
+;; reg-reg operation, 1 uop per load on port 2. and 2 uops per store
+;; on port 4 and port 3.  These instructions are modelled at the bottom
+;; of this file.
+;;
+;; For microcoded instructions we don't know how many uops are produced.
+;; These instructions are the "complex" ones in the Intel manuals.  All
+;; we _do_ know is that they typically produce four or more uops, so
+;; they can only be decoded on c2_decoder0.  Modelling their latencies
+;; doesn't make sense because we don't know how these instructions are
+;; executed in the core.  So we just model that they can only be decoded
+;; on decoder 0, and say that it takes a little while before the result
+;; is available.
+(define_insn_reservation "c2_complex_insn" 6
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (eq_attr "type" "other,multi,str"))
+			 "c2_decoder0")
+
+(define_insn_reservation "c2_call" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (eq_attr "type" "call,callv"))
+			 "c2_decoder0")
+
+;; imov with memory operands does not use the integer units.
+;; imovx always decodes to one uop, and also doesn't use the integer
+;; units if it has memory operands.
+(define_insn_reservation "c2_imov" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "imov,imovx")))
+			 "c2_decodern,(c2_p0|c2_p1|c2_p5)")
+
+(define_insn_reservation "c2_imov_load" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "imov,imovx")))
+			 "c2_decodern,c2_p2")
+
+(define_insn_reservation "c2_imov_store" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "store")
+				   (eq_attr "type" "imov")))
+			 "c2_decodern,c2_p4+c2_p3")
+
+(define_insn_reservation "c2_icmov" 2
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "icmov")))
+			 "c2_decoder0,(c2_p0|c2_p1|c2_p5)*2")
+
+(define_insn_reservation "c2_icmov_load" 2
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "icmov")))
+			 "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5)*2")
+
+(define_insn_reservation "c2_push_reg" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "store")
+				   (eq_attr "type" "push")))
+			 "c2_decodern,c2_p4+c2_p3")
+
+(define_insn_reservation "c2_push_mem" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "both")
+				   (eq_attr "type" "push")))
+			 "c2_decoder0,c2_p2,c2_p4+c2_p3")
+
+;; lea executes on port 0 with latency one and throughput 1.
+(define_insn_reservation "c2_lea" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "lea")))
+			 "c2_decodern,c2_p0")
+
+;; Shift and rotate decode as two uops which can go to port 0 or 5.
+;; The load and store units need to be reserved when memory operands
+;; are involved.
+(define_insn_reservation "c2_shift_rotate" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
+			 "c2_decodern,(c2_p0|c2_p5)")
+
+(define_insn_reservation "c2_shift_rotate_mem" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "!none")
+				   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
+			 "c2_decoder0,c2_p2,(c2_p0|c2_p5),c2_p4+c2_p3")
+
+;; See comments in ppro.md for the corresponding reservation.
+(define_insn_reservation "c2_branch" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "ibr")))
+			 "c2_decodern,c2_p5")
+
+;; ??? Indirect branches probably have worse latency than this.
+(define_insn_reservation "c2_indirect_branch" 6
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "!none")
+				   (eq_attr "type" "ibr")))
+			 "c2_decoder0,c2_p2+c2_p5")
+
+(define_insn_reservation "c2_leave" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (eq_attr "type" "leave"))
+			 "c2_decoder0,c2_p2+(c2_p0|c2_p1),(c2_p0|c2_p1)")
+
+;; mul and imul with two/three operands only execute on port 1 for HImode
+;; and SImode, port 0 for DImode.
+(define_insn_reservation "c2_imul_hisi" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "HI,SI")
+					(eq_attr "type" "imul"))))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_imul_hisi_mem" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "!none")
+				   (and (eq_attr "mode" "HI,SI")
+					(eq_attr "type" "imul"))))
+			 "c2_decoder0,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_imul_di" 5
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "type" "imul"))))
+			 "c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_imul_di_mem" 5
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "!none")
+				   (and (eq_attr "mode" "DI")
+					(eq_attr "type" "imul"))))
+			 "c2_decoder0,c2_p2+c2_p0")
+
+;; div and idiv are very similar, so we model them the same.
+;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
+;; These issue latencies are modelled via the c2_div automaton.
+(define_insn_reservation "c2_idiv_QI" 19
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "QI")
+					(eq_attr "type" "idiv"))))
+			 "c2_decoder0,(c2_p0+c2_idiv)*2,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9")
+
+(define_insn_reservation "c2_idiv_QI_load" 19
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "QI")
+					(eq_attr "type" "idiv"))))
+			 "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9")
+
+(define_insn_reservation "c2_idiv_HI" 23
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "HI")
+					(eq_attr "type" "idiv"))))
+			 "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*17")
+
+(define_insn_reservation "c2_idiv_HI_load" 23
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "HI")
+					(eq_attr "type" "idiv"))))
+			 "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*18")
+
+(define_insn_reservation "c2_idiv_SI" 39
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SI")
+					(eq_attr "type" "idiv"))))
+			 "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*33")
+
+(define_insn_reservation "c2_idiv_SI_load" 39
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "SI")
+					(eq_attr "type" "idiv"))))
+			 "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*34")
+
+;; x87 floating point operations.
+
+(define_insn_reservation "c2_fxch" 0
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (eq_attr "type" "fxch"))
+			 "c2_decodern")
+
+(define_insn_reservation "c2_fop" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none,unknown")
+				   (eq_attr "type" "fop")))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_fop_load" 5
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "fop")))
+			 "c2_decoder0,c2_p2+c2_p1,c2_p1")
+
+(define_insn_reservation "c2_fop_store" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "store")
+				   (eq_attr "type" "fop")))
+			 "c2_decoder0,c2_p0,c2_p0,c2_p0+c2_p4+c2_p3")
+
+(define_insn_reservation "c2_fop_both" 5
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "both")
+				   (eq_attr "type" "fop")))
+			 "c2_decoder0,c2_p2+c2_p0,c2_p0+c2_p4+c2_p3")
+
+(define_insn_reservation "c2_fsgn" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (eq_attr "type" "fsgn"))
+			 "c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_fistp" 5
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (eq_attr "type" "fistp"))
+			 "c2_decoder0,c2_p0*2,c2_p4+c2_p3")
+
+(define_insn_reservation "c2_fcmov" 2
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (eq_attr "type" "fcmov"))
+			 "c2_decoder0,c2_p0*2")
+
+(define_insn_reservation "c2_fcmp" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "fcmp")))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_fcmp_load" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "fcmp")))
+			 "c2_decoder0,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_fmov" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "fmov")))
+			 "c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_fmov_load" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "!XF")
+					(eq_attr "type" "fmov"))))
+			 "c2_decodern,c2_p2")
+
+(define_insn_reservation "c2_fmov_XF_load" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "XF")
+					(eq_attr "type" "fmov"))))
+			 "c2_decoder0,(c2_p2+c2_p0)*2")
+
+(define_insn_reservation "c2_fmov_store" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "store")
+				   (and (eq_attr "mode" "!XF")
+					(eq_attr "type" "fmov"))))
+			 "c2_decodern,c2_p3+c2_p4")
+
+(define_insn_reservation "c2_fmov_XF_store" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "store")
+				   (and (eq_attr "mode" "XF")
+					(eq_attr "type" "fmov"))))
+			 "c2_decoder0,(c2_p3+c2_p4),(c2_p3+c2_p4)")
+
+;; fmul executes on port 0 with latency 5.  It has issue latency 2,
+;; but we don't model this.
+(define_insn_reservation "c2_fmul" 5
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "fmul")))
+			 "c2_decoder0,c2_p0*2")
+
+(define_insn_reservation "c2_fmul_load" 6
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "fmul")))
+			 "c2_decoder0,c2_p2+c2_p0,c2_p0")
+
+;; fdiv latencies depend on the mode of the operands.  XFmode gives
+;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
+;; Division by a power of 2 takes only 9 cycles, but we cannot model
+;; that.  Throughput is equal to latency - 1, which we model using the
+;; c2_div automaton.
+(define_insn_reservation "c2_fdiv_SF" 18
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*16")
+
+(define_insn_reservation "c2_fdiv_SF_load" 19
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*16")
+
+(define_insn_reservation "c2_fdiv_DF" 32
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "DF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*30")
+
+(define_insn_reservation "c2_fdiv_DF_load" 33
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "DF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*30")
+
+(define_insn_reservation "c2_fdiv_XF" 38
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "XF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*36")
+
+(define_insn_reservation "c2_fdiv_XF_load" 39
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "XF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*36")
+
+;; MMX instructions.
+
+(define_insn_reservation "c2_mmx_add" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "mmxadd,sseiadd")))
+			 "c2_decodern,c2_p0|c2_p5")
+
+(define_insn_reservation "c2_mmx_add_load" 2
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "mmxadd,sseiadd")))
+			 "c2_decodern,c2_p2+c2_p0|c2_p5")
+
+(define_insn_reservation "c2_mmx_shft" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "mmxshft")))
+			 "c2_decodern,c2_p0|c2_p5")
+
+(define_insn_reservation "c2_mmx_shft_load" 2
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "mmxshft")))
+			 "c2_decoder0,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_mmx_sse_shft" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "type" "sseishft")
+					(eq_attr "length_immediate" "!0"))))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_mmx_sse_shft_load" 2
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "type" "sseishft")
+					(eq_attr "length_immediate" "!0"))))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_mmx_sse_shft1" 2
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "type" "sseishft")
+					(eq_attr "length_immediate" "0"))))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_mmx_sse_shft1_load" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "type" "sseishft")
+					(eq_attr "length_immediate" "0"))))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_mmx_mul" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "mmxmul,sseimul")))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_mmx_mul_load" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "mmxmul,sseimul")))
+			 "c2_decoder0,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_sse_mmxcvt" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "mode" "DI")
+				   (eq_attr "type" "mmxcvt")))
+			 "c2_decodern,c2_p1")
+
+;; FIXME: These are Pentium III only, but we cannot tell here if
+;; we're generating code for PentiumPro/Pentium II or Pentium III
+;; (define_insn_reservation "c2_sse_mmxshft" 2
+;;			 (and (eq_attr "cpu" "core2,nehalem")
+;;			      (and (eq_attr "mode" "TI")
+;;				   (eq_attr "type" "mmxshft")))
+;;			 "c2_decodern,c2_p0")
+
+;; The sfence instruction.
+(define_insn_reservation "c2_sse_sfence" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "unknown")
+				   (eq_attr "type" "sse")))
+			 "c2_decoder0,c2_p4+c2_p3")
+
+;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
+(define_insn_reservation "c2_sse_SFDF" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "mode" "SF,DF")
+				   (eq_attr "type" "sse")))
+			 "c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_sse_V4SF" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "mode" "V4SF")
+				   (eq_attr "type" "sse")))
+			 "c2_decoder0,c2_p1*2")
+
+(define_insn_reservation "c2_sse_addcmp" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "sseadd,sseadd1,ssecmp,ssecomi")))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_sse_addcmp_load" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "sseadd,sseadd1,ssecmp,ssecomi")))
+			 "c2_decodern,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_sse_mul_SF" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF,V4SF")
+					(eq_attr "type" "ssemul"))))
+			"c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_sse_mul_SF_load" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "SF,V4SF")
+					(eq_attr "type" "ssemul"))))
+			"c2_decodern,c2_p2+c2_p0")
+
+(define_insn_reservation "c2_sse_mul_DF" 5
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "DF,V2DF")
+					(eq_attr "type" "ssemul"))))
+			"c2_decodern,c2_p0")
+
+(define_insn_reservation "c2_sse_mul_DF_load" 5
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "DF,V2DF")
+					(eq_attr "type" "ssemul"))))
+			"c2_decodern,c2_p2+c2_p0")
+
+(define_insn_reservation "c2_sse_div_SF" 18
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF,V4SF")
+					(eq_attr "type" "ssediv"))))
+			 "c2_decodern,c2_p0,c2_ssediv*17")
+
+(define_insn_reservation "c2_sse_div_SF_load" 18
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF,V4SF")
+					(eq_attr "type" "ssediv"))))
+			 "c2_decodern,(c2_p2+c2_p0),c2_ssediv*17")
+
+(define_insn_reservation "c2_sse_div_DF" 32
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "DF,V2DF")
+					(eq_attr "type" "ssediv"))))
+			 "c2_decodern,c2_p0,c2_ssediv*31")
+
+(define_insn_reservation "c2_sse_div_DF_load" 32
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "DF,V2DF")
+					(eq_attr "type" "ssediv"))))
+			 "c2_decodern,(c2_p2+c2_p0),c2_ssediv*31")
+
+;; FIXME: these have limited throughput
+(define_insn_reservation "c2_sse_icvt_SF" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "sseicvt"))))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_sse_icvt_SF_load" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "!none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "sseicvt"))))
+			 "c2_decodern,c2_p2+c2_p1")
+
+(define_insn_reservation "c2_sse_icvt_DF" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "DF")
+					(eq_attr "type" "sseicvt"))))
+			 "c2_decoder0,c2_p0+c2_p1")
+
+(define_insn_reservation "c2_sse_icvt_DF_load" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "!none")
+				   (and (eq_attr "mode" "DF")
+					(eq_attr "type" "sseicvt"))))
+			 "c2_decoder0,(c2_p2+c2_p1)")
+
+(define_insn_reservation "c2_sse_icvt_SI" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SI")
+					(eq_attr "type" "sseicvt"))))
+			 "c2_decodern,c2_p1")
+
+(define_insn_reservation "c2_sse_icvt_SI_load" 3
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "!none")
+				   (and (eq_attr "mode" "SI")
+					(eq_attr "type" "sseicvt"))))
+			 "c2_decodern,(c2_p2+c2_p1)")
+
+(define_insn_reservation "c2_sse_mov" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "ssemov")))
+			 "c2_decodern,(c2_p0|c2_p1|c2_p5)")
+
+(define_insn_reservation "c2_sse_mov_load" 2
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "ssemov")))
+			 "c2_decodern,c2_p2")
+
+(define_insn_reservation "c2_sse_mov_store" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "store")
+				   (eq_attr "type" "ssemov")))
+			 "c2_decodern,c2_p4+c2_p3")
+
+;; All other instructions are modelled as simple instructions.
+;; We have already modelled all i387 floating point instructions, so all
+;; other instructions execute on either port 0, 1 or 5.  This includes
+;; the ALU units, and the MMX units.
+;;
+;; reg-reg instructions produce 1 uop so they can be decoded on any of
+;; the three decoders.  Loads benefit from micro-op fusion and can be
+;; treated in the same way.
+(define_insn_reservation "c2_insn" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "none,unknown")
+				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp")))
+			 "c2_decodern,(c2_p0|c2_p1|c2_p5)")
+
+(define_insn_reservation "c2_insn_load" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp")))
+			 "c2_decodern,c2_p2,(c2_p0|c2_p1|c2_p5)")
+
+;; register-memory instructions have three uops,  so they have to be
+;; decoded on c2_decoder0.
+(define_insn_reservation "c2_insn_store" 1
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "store")
+				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp")))
+			 "c2_decoder0,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3")
+
+;; read-modify-store instructions produce 4 uops so they have to be
+;; decoded on c2_decoder0 as well.
+(define_insn_reservation "c2_insn_both" 4
+			 (and (eq_attr "cpu" "core2,nehalem")
+			      (and (eq_attr "memory" "both")
+				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp")))
+			 "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3")
diff --git a/gcc-4.9/gcc/config/i386/cpuid.h b/gcc-4.9/gcc/config/i386/cpuid.h
new file mode 100644
index 000000000..8c323ae3a
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/cpuid.h
@@ -0,0 +1,277 @@
+/*
+ * Copyright (C) 2007-2014 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ * 
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ * 
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+/* %ecx */
+#define bit_SSE3	(1 << 0)
+#define bit_PCLMUL	(1 << 1)
+#define bit_LZCNT	(1 << 5)
+#define bit_SSSE3	(1 << 9)
+#define bit_FMA		(1 << 12)
+#define bit_CMPXCHG16B	(1 << 13)
+#define bit_SSE4_1	(1 << 19)
+#define bit_SSE4_2	(1 << 20)
+#define bit_MOVBE	(1 << 22)
+#define bit_POPCNT	(1 << 23)
+#define bit_AES		(1 << 25)
+#define bit_XSAVE	(1 << 26)
+#define bit_OSXSAVE	(1 << 27)
+#define bit_AVX		(1 << 28)
+#define bit_F16C	(1 << 29)
+#define bit_RDRND	(1 << 30)
+
+/* %edx */
+#define bit_CMPXCHG8B	(1 << 8)
+#define bit_CMOV	(1 << 15)
+#define bit_MMX		(1 << 23)
+#define bit_FXSAVE	(1 << 24)
+#define bit_SSE		(1 << 25)
+#define bit_SSE2	(1 << 26)
+
+/* Extended Features */
+/* %ecx */
+#define bit_LAHF_LM	(1 << 0)
+#define bit_ABM		(1 << 5)
+#define bit_SSE4a	(1 << 6)
+#define bit_PRFCHW	(1 << 8)
+#define bit_XOP         (1 << 11)
+#define bit_LWP 	(1 << 15)
+#define bit_FMA4        (1 << 16)
+#define bit_TBM         (1 << 21)
+
+/* %edx */
+#define bit_MMXEXT	(1 << 22)
+#define bit_LM		(1 << 29)
+#define bit_3DNOWP	(1 << 30)
+#define bit_3DNOW	(1 << 31)
+
+/* Extended Features (%eax == 7) */
+/* %ebx */
+#define bit_FSGSBASE	(1 << 0)
+#define bit_BMI	(1 << 3)
+#define bit_HLE	(1 << 4)
+#define bit_AVX2	(1 << 5)
+#define bit_BMI2	(1 << 8)
+#define bit_RTM	(1 << 11)
+#define bit_AVX512F	(1 << 16)
+#define bit_RDSEED	(1 << 18)
+#define bit_ADX	(1 << 19)
+#define bit_AVX512PF	(1 << 26)
+#define bit_AVX512ER	(1 << 27)
+#define bit_AVX512CD	(1 << 28)
+#define bit_SHA		(1 << 29)
+
+/* %ecx */
+#define bit_PREFETCHWT1	  (1 << 0)
+
+/* Extended State Enumeration Sub-leaf (%eax == 13, %ecx == 1) */
+#define bit_XSAVEOPT	(1 << 0)
+
+/* Signatures for different CPU implementations as returned in uses
+   of cpuid with level 0.  */
+#define signature_AMD_ebx	0x68747541
+#define signature_AMD_ecx	0x444d4163
+#define signature_AMD_edx	0x69746e65
+
+#define signature_CENTAUR_ebx	0x746e6543
+#define signature_CENTAUR_ecx	0x736c7561
+#define signature_CENTAUR_edx	0x48727561
+
+#define signature_CYRIX_ebx	0x69727943
+#define signature_CYRIX_ecx	0x64616574
+#define signature_CYRIX_edx	0x736e4978
+
+#define signature_INTEL_ebx	0x756e6547
+#define signature_INTEL_ecx	0x6c65746e
+#define signature_INTEL_edx	0x49656e69
+
+#define signature_TM1_ebx	0x6e617254
+#define signature_TM1_ecx	0x55504361
+#define signature_TM1_edx	0x74656d73
+
+#define signature_TM2_ebx	0x756e6547
+#define signature_TM2_ecx	0x3638784d
+#define signature_TM2_edx	0x54656e69
+
+#define signature_NSC_ebx	0x646f6547
+#define signature_NSC_ecx	0x43534e20
+#define signature_NSC_edx	0x79622065
+
+#define signature_NEXGEN_ebx	0x4778654e
+#define signature_NEXGEN_ecx	0x6e657669
+#define signature_NEXGEN_edx	0x72446e65
+
+#define signature_RISE_ebx	0x65736952
+#define signature_RISE_ecx	0x65736952
+#define signature_RISE_edx	0x65736952
+
+#define signature_SIS_ebx	0x20536953
+#define signature_SIS_ecx	0x20536953
+#define signature_SIS_edx	0x20536953
+
+#define signature_UMC_ebx	0x20434d55
+#define signature_UMC_ecx	0x20434d55
+#define signature_UMC_edx	0x20434d55
+
+#define signature_VIA_ebx	0x20414956
+#define signature_VIA_ecx	0x20414956
+#define signature_VIA_edx	0x20414956
+
+#define signature_VORTEX_ebx	0x74726f56
+#define signature_VORTEX_ecx	0x436f5320
+#define signature_VORTEX_edx	0x36387865
+
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx may be the PIC register.  */
+#if __GNUC__ >= 3
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("xchg{l}\t{%%}ebx, %k1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchg{l}\t{%%}ebx, %k1\n\t"			\
+	   : "=a" (a), "=&r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("xchg{l}\t{%%}ebx, %k1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchg{l}\t{%%}ebx, %k1\n\t"			\
+	   : "=a" (a), "=&r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+   nor alternatives in i386 code.  */
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("xchgl\t%%ebx, %k1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchgl\t%%ebx, %k1\n\t"			\
+	   : "=a" (a), "=&r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("xchgl\t%%ebx, %k1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchgl\t%%ebx, %k1\n\t"			\
+	   : "=a" (a), "=&r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#endif
+#elif defined(__x86_64__) && (defined(__code_model_medium__) || defined(__code_model_large__)) && defined(__PIC__)
+/* %rbx may be the PIC register.  */
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("xchg{q}\t{%%}rbx, %q1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchg{q}\t{%%}rbx, %q1\n\t"			\
+	   : "=a" (a), "=&r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("xchg{q}\t{%%}rbx, %q1\n\t"			\
+	   "cpuid\n\t"					\
+	   "xchg{q}\t{%%}rbx, %q1\n\t"			\
+	   : "=a" (a), "=&r" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#else
+#define __cpuid(level, a, b, c, d)			\
+  __asm__ ("cpuid\n\t"					\
+	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d)		\
+  __asm__ ("cpuid\n\t"					\
+	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
+	   : "0" (level), "2" (count))
+#endif
+
+/* Return highest supported input value for cpuid instruction.  ext can
+   be either 0x0 or 0x8000000 to return highest supported value for
+   basic or extended cpuid information.  Function returns 0 if cpuid
+   is not supported or whatever cpuid returns in eax register.  If sig
+   pointer is non-null, then first four bytes of the signature
+   (as found in ebx register) are returned in location pointed by sig.  */
+
+static __inline unsigned int
+__get_cpuid_max (unsigned int __ext, unsigned int *__sig)
+{
+  unsigned int __eax, __ebx, __ecx, __edx;
+
+#ifndef __x86_64__
+  /* See if we can use cpuid.  On AMD64 we always can.  */
+#if __GNUC__ >= 3
+  __asm__ ("pushf{l|d}\n\t"
+	   "pushf{l|d}\n\t"
+	   "pop{l}\t%0\n\t"
+	   "mov{l}\t{%0, %1|%1, %0}\n\t"
+	   "xor{l}\t{%2, %0|%0, %2}\n\t"
+	   "push{l}\t%0\n\t"
+	   "popf{l|d}\n\t"
+	   "pushf{l|d}\n\t"
+	   "pop{l}\t%0\n\t"
+	   "popf{l|d}\n\t"
+	   : "=&r" (__eax), "=&r" (__ebx)
+	   : "i" (0x00200000));
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+   nor alternatives in i386 code.  */
+  __asm__ ("pushfl\n\t"
+	   "pushfl\n\t"
+	   "popl\t%0\n\t"
+	   "movl\t%0, %1\n\t"
+	   "xorl\t%2, %0\n\t"
+	   "pushl\t%0\n\t"
+	   "popfl\n\t"
+	   "pushfl\n\t"
+	   "popl\t%0\n\t"
+	   "popfl\n\t"
+	   : "=&r" (__eax), "=&r" (__ebx)
+	   : "i" (0x00200000));
+#endif
+
+  if (!((__eax ^ __ebx) & 0x00200000))
+    return 0;
+#endif
+
+  /* Host supports cpuid.  Return highest supported cpuid input value.  */
+  __cpuid (__ext, __eax, __ebx, __ecx, __edx);
+
+  if (__sig)
+    *__sig = __ebx;
+
+  return __eax;
+}
+
+/* Return cpuid data for requested cpuid level, as found in returned
+   eax, ebx, ecx and edx registers.  The function checks if cpuid is
+   supported and returns 1 for valid cpuid information or 0 for
+   unsupported cpuid level.  All pointers are required to be non-null.  */
+
+static __inline int
+__get_cpuid (unsigned int __level,
+	     unsigned int *__eax, unsigned int *__ebx,
+	     unsigned int *__ecx, unsigned int *__edx)
+{
+  unsigned int __ext = __level & 0x80000000;
+
+  if (__get_cpuid_max (__ext, 0) < __level)
+    return 0;
+
+  __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx);
+  return 1;
+}
diff --git a/gcc-4.9/gcc/config/i386/cross-stdarg.h b/gcc-4.9/gcc/config/i386/cross-stdarg.h
new file mode 100644
index 000000000..d16cef820
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/cross-stdarg.h
@@ -0,0 +1,72 @@
+/* Copyright (C) 2002-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef __CROSS_STDARG_H_INCLUDED
+#define __CROSS_STDARG_H_INCLUDED
+
+/* Make sure that for non x64 targets cross builtins are defined.  */
+#ifndef __x86_64__
+/* Call abi ms_abi.  */
+#define __builtin_ms_va_list __builtin_va_list
+#define __builtin_ms_va_copy __builtin_va_copy
+#define __builtin_ms_va_start __builtin_va_start
+#define __builtin_ms_va_end __builtin_va_end
+
+/* Call abi sysv_abi.  */
+#define __builtin_sysv_va_list __builtin_va_list
+#define __builtin_sysv_va_copy __builtin_va_copy
+#define __builtin_sysv_va_start __builtin_va_start
+#define __builtin_sysv_va_end __builtin_va_end
+#endif
+
+#define __ms_va_copy(__d,__s) __builtin_ms_va_copy(__d,__s)
+#define __ms_va_start(__v,__l) __builtin_ms_va_start(__v,__l)
+#define __ms_va_arg(__v,__l)	__builtin_va_arg(__v,__l)
+#define __ms_va_end(__v) __builtin_ms_va_end(__v)
+
+#define __sysv_va_copy(__d,__s) __builtin_sysv_va_copy(__d,__s)
+#define __sysv_va_start(__v,__l) __builtin_sysv_va_start(__v,__l)
+#define __sysv_va_arg(__v,__l)	__builtin_va_arg(__v,__l)
+#define __sysv_va_end(__v) __builtin_sysv_va_end(__v)
+
+#ifndef __GNUC_SYSV_VA_LIST
+#define __GNUC_SYSV_VA_LIST
+  typedef __builtin_sysv_va_list __gnuc_sysv_va_list;
+#endif
+
+#ifndef _SYSV_VA_LIST_DEFINED
+#define _SYSV_VA_LIST_DEFINED
+  typedef __gnuc_sysv_va_list sysv_va_list;
+#endif
+
+#ifndef __GNUC_MS_VA_LIST
+#define __GNUC_MS_VA_LIST
+  typedef __builtin_ms_va_list __gnuc_ms_va_list;
+#endif
+
+#ifndef _MS_VA_LIST_DEFINED
+#define _MS_VA_LIST_DEFINED
+  typedef __gnuc_ms_va_list ms_va_list;
+#endif
+
+#endif /* __CROSS_STDARG_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/crtdll.h b/gcc-4.9/gcc/config/i386/crtdll.h
new file mode 100644
index 000000000..b18168ee7
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/crtdll.h
@@ -0,0 +1,42 @@
+/* Operating system specific defines to be used when targeting GCC for
+   hosting on Windows32, using GNU tools and the Windows32 API Library.
+   This variant uses CRTDLL.DLL instead of MSVCRTDLL.DLL.
+   Copyright (C) 1998-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#undef EXTRA_OS_CPP_BUILTINS
+#define EXTRA_OS_CPP_BUILTINS()					\
+  do								\
+    {								\
+      builtin_define ("__CRTDLL__");				\
+      builtin_define ("__MINGW32__");			   	\
+      builtin_define ("_WIN32");				\
+      builtin_define_std ("WIN32");				\
+      builtin_define_std ("WINNT");				\
+    }								\
+  while (0)
+
+#undef LIBGCC_SPEC
+#define LIBGCC_SPEC \
+  "%{mthreads:-lmingwthrd} -lmingw32 -lgcc -lcoldname -libmingwex -lcrtdll"
+
+/* Specify a different entry point when linking a DLL */
+#undef STARTFILE_SPEC
+#define STARTFILE_SPEC "%{shared|mdll:dllcrt1%O%s} \
+  %{!shared:%{!mdll:crt1%O%s}} %{pg:gcrt1%O%s}"
+
diff --git a/gcc-4.9/gcc/config/i386/cygming.h b/gcc-4.9/gcc/config/i386/cygming.h
new file mode 100644
index 000000000..039edccb9
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/cygming.h
@@ -0,0 +1,487 @@
+/* Operating system specific defines to be used when targeting GCC for
+   hosting on Windows32, using a Unix style C library and tools.
+   Copyright (C) 1995-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define DBX_DEBUGGING_INFO 1
+#define SDB_DEBUGGING_INFO 1
+#if TARGET_64BIT_DEFAULT || defined (HAVE_GAS_PE_SECREL32_RELOC)
+#define DWARF2_DEBUGGING_INFO 1
+#endif
+
+#undef PREFERRED_DEBUGGING_TYPE
+#if (DWARF2_DEBUGGING_INFO)
+#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG
+#else
+#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG
+#endif
+
+#undef TARGET_SEH
+#define TARGET_SEH  (TARGET_64BIT_MS_ABI && flag_unwind_tables)
+
+/* Win64 with SEH cannot represent DRAP stack frames.  Disable its use.
+   Force the use of different mechanisms to allocate aligned local data.  */
+#undef MAX_STACK_ALIGNMENT
+#define MAX_STACK_ALIGNMENT  (TARGET_SEH ? 128 : MAX_OFILE_ALIGNMENT)
+
+/* Support hooks for SEH.  */
+#undef  TARGET_ASM_UNWIND_EMIT
+#define TARGET_ASM_UNWIND_EMIT  i386_pe_seh_unwind_emit
+#undef  TARGET_ASM_UNWIND_EMIT_BEFORE_INSN
+#define TARGET_ASM_UNWIND_EMIT_BEFORE_INSN  false
+#undef  TARGET_ASM_FUNCTION_END_PROLOGUE
+#define TARGET_ASM_FUNCTION_END_PROLOGUE  i386_pe_seh_end_prologue
+#undef  TARGET_ASM_EMIT_EXCEPT_PERSONALITY
+#define TARGET_ASM_EMIT_EXCEPT_PERSONALITY i386_pe_seh_emit_except_personality
+#undef  TARGET_ASM_INIT_SECTIONS
+#define TARGET_ASM_INIT_SECTIONS  i386_pe_seh_init_sections
+#define SUBTARGET_ASM_UNWIND_INIT  i386_pe_seh_init
+
+#undef DEFAULT_ABI
+#define DEFAULT_ABI (TARGET_64BIT ? MS_ABI : SYSV_ABI)
+
+#undef TARGET_PECOFF
+#define TARGET_PECOFF 1
+
+#if ! defined (USE_MINGW64_LEADING_UNDERSCORES)
+#undef USER_LABEL_PREFIX
+#define USER_LABEL_PREFIX (TARGET_64BIT ? "" : "_")
+
+#undef LOCAL_LABEL_PREFIX
+#define LOCAL_LABEL_PREFIX (TARGET_64BIT ? "." : "")
+
+#undef ASM_GENERATE_INTERNAL_LABEL
+#define ASM_GENERATE_INTERNAL_LABEL(BUF,PREFIX,NUMBER)  \
+  sprintf ((BUF), "*%s%s%ld", LOCAL_LABEL_PREFIX, \
+	   (PREFIX), (long)(NUMBER))
+
+#undef LPREFIX
+#define LPREFIX (TARGET_64BIT ? ".L" : "L")
+
+#endif
+
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n)				\
+  (TARGET_64BIT ? dbx64_register_map[n]			\
+   : (write_symbols == DWARF2_DEBUG			\
+      ? svr4_dbx_register_map[n] : dbx_register_map[n]))
+
+/* Map gcc register number to DWARF 2 CFA column number. For 32 bit
+   target, always use the svr4_dbx_register_map for DWARF .eh_frame
+   even if we don't use DWARF .debug_frame. */
+#undef DWARF_FRAME_REGNUM
+#define DWARF_FRAME_REGNUM(n)				\
+  (TARGET_64BIT ? dbx64_register_map[(n)]		\
+		: svr4_dbx_register_map[(n)])
+
+/* The 64-bit MS_ABI changes the set of call-used registers.  */
+#undef DWARF_FRAME_REGISTERS
+#define DWARF_FRAME_REGISTERS (TARGET_64BIT ? 33 : 17)
+
+#ifdef HAVE_GAS_PE_SECREL32_RELOC
+/* Use section relative relocations for debugging offsets.  Unlike
+   other targets that fake this by putting the section VMA at 0, PE
+   won't allow it.  */
+#define ASM_OUTPUT_DWARF_OFFSET(FILE, SIZE, LABEL, SECTION)	\
+  do {								\
+    switch (SIZE)						\
+      {								\
+      case 4:							\
+	fputs ("\t.secrel32\t", FILE);				\
+	assemble_name (FILE, LABEL);				\
+	break;							\
+      case 8:							\
+	/* This is a hack.  There is no 64-bit section relative	\
+	   relocation.  However, the COFF format also does not	\
+	   support 64-bit file offsets; 64-bit applications are	\
+	   limited to 32-bits of code+data in any one module.	\
+	   Fake the 64-bit offset by zero-extending it.  */	\
+	fputs ("\t.secrel32\t", FILE);				\
+	assemble_name (FILE, LABEL);				\
+	fputs ("\n\t.long\t0", FILE);				\
+	break;							\
+      default:							\
+	gcc_unreachable ();					\
+      }								\
+  } while (0)
+#endif
+
+#define TARGET_EXECUTABLE_SUFFIX ".exe"
+
+#define TARGET_OS_CPP_BUILTINS()					\
+  do									\
+    {									\
+	if (!TARGET_64BIT)						\
+	  builtin_define ("_X86_=1");					\
+	if (TARGET_SEH)							\
+	  builtin_define ("__SEH__");				\
+	builtin_assert ("system=winnt");				\
+	builtin_define ("__stdcall=__attribute__((__stdcall__))");	\
+	builtin_define ("__fastcall=__attribute__((__fastcall__))");	\
+	builtin_define ("__thiscall=__attribute__((__thiscall__))");	\
+	builtin_define ("__cdecl=__attribute__((__cdecl__))");		\
+	if (!flag_iso)							\
+	  {								\
+	    builtin_define ("_stdcall=__attribute__((__stdcall__))");	\
+	    builtin_define ("_fastcall=__attribute__((__fastcall__))");	\
+	    builtin_define ("_thiscall=__attribute__((__thiscall__))");	\
+	    builtin_define ("_cdecl=__attribute__((__cdecl__))");	\
+	  }								\
+	/* Even though linkonce works with static libs, this is needed 	\
+	    to compare typeinfo symbols across dll boundaries.  */	\
+	builtin_define ("__GXX_MERGED_TYPEINFO_NAMES=0");		\
+	builtin_define ("__GXX_TYPEINFO_EQUALITY_INLINE=0");		\
+	EXTRA_OS_CPP_BUILTINS ();					\
+  }									\
+  while (0)
+
+/* Get tree.c to declare a target-specific specialization of
+   merge_decl_attributes.  */
+#define TARGET_DLLIMPORT_DECL_ATTRIBUTES 1
+
+/* This macro defines names of additional specifications to put in the specs
+   that can be used in various specifications like CC1_SPEC.  Its definition
+   is an initializer with a subgrouping for each command option.
+
+   Each subgrouping contains a string constant, that defines the
+   specification name, and a string constant that used by the GCC driver
+   program.
+
+   Do not define this macro if it does not need to do anything.  */
+
+#undef  SUBTARGET_EXTRA_SPECS
+#define SUBTARGET_EXTRA_SPECS						\
+  { "mingw_include_path", DEFAULT_TARGET_MACHINE }
+
+#undef MATH_LIBRARY
+#define MATH_LIBRARY ""
+
+#undef TARGET_LIBC_HAS_FUNCTION
+#define TARGET_LIBC_HAS_FUNCTION no_c99_libc_has_function
+
+#define SIZE_TYPE (TARGET_64BIT ? "long long unsigned int" : "unsigned int")
+#define PTRDIFF_TYPE (TARGET_64BIT ? "long long int" : "int")
+
+#define WCHAR_TYPE_SIZE 16
+#define WCHAR_TYPE "short unsigned int"
+
+/* Windows64 continues to use a 32-bit long type.  */
+#undef LONG_TYPE_SIZE
+#define LONG_TYPE_SIZE 32
+
+#define drectve_section() \
+  (fprintf (asm_out_file, "\t.section .drectve\n"), \
+   in_section = NULL)
+
+/* Older versions of gas don't handle 'r' as data.
+   Explicitly set data flag with 'd'.  */  
+#define READONLY_DATA_SECTION_ASM_OP "\t.section .rdata,\"dr\""
+
+/* Don't allow flag_pic to propagate since gas may produce invalid code
+   otherwise.  */
+
+#undef  SUBTARGET_OVERRIDE_OPTIONS
+#define SUBTARGET_OVERRIDE_OPTIONS					\
+do {									\
+  if (TARGET_64BIT && flag_pic != 1)					\
+    {									\
+      if (flag_pic > 1)							\
+        warning (0,							\
+	         "-fPIC ignored for target (all code is position independent)"\
+                 );                         				\
+      flag_pic = 1;							\
+    }									\
+  else if (!TARGET_64BIT && flag_pic)					\
+    {									\
+      warning (0, "-f%s ignored for target (all code is position independent)",\
+	       (flag_pic > 1) ? "PIC" : "pic");				\
+      flag_pic = 0;							\
+    }									\
+} while (0)
+
+/* Define this macro if references to a symbol must be treated
+   differently depending on something about the variable or
+   function named by the symbol (such as what section it is in).
+
+   On i386 running Windows NT, modify the assembler name with a suffix
+   consisting of an atsign (@) followed by string of digits that represents
+   the number of bytes of arguments passed to the function, if it has the
+   attribute STDCALL.
+
+   In addition, we must mark dll symbols specially. Definitions of
+   dllexport'd objects install some info in the .drectve section.
+   References to dllimport'd objects are fetched indirectly via
+   _imp__.  If both are declared, dllexport overrides.  This is also
+   needed to implement one-only vtables: they go into their own
+   section and we need to set DECL_SECTION_NAME so we do that here.
+   Note that we can be called twice on the same decl.  */
+
+#define SUBTARGET_ENCODE_SECTION_INFO  i386_pe_encode_section_info
+
+/* Local and global relocs can be placed always into readonly memory
+   for PE-COFF targets.  */
+#undef TARGET_ASM_RELOC_RW_MASK
+#define TARGET_ASM_RELOC_RW_MASK i386_pe_reloc_rw_mask
+
+/* Output a common block.  */
+#undef ASM_OUTPUT_ALIGNED_DECL_COMMON
+#define ASM_OUTPUT_ALIGNED_DECL_COMMON \
+  i386_pe_asm_output_aligned_decl_common
+
+/* Output the label for an initialized variable.  */
+#undef ASM_DECLARE_OBJECT_NAME
+#define ASM_DECLARE_OBJECT_NAME(STREAM, NAME, DECL)	\
+do {							\
+  i386_pe_maybe_record_exported_symbol (DECL, NAME, 1);	\
+  ASM_OUTPUT_LABEL ((STREAM), (NAME));			\
+} while (0)
+
+/* Output a reference to a label. Fastcall function symbols
+   keep their '@' prefix, while other symbols are prefixed
+   with user_label_prefix.  */
+#undef ASM_OUTPUT_LABELREF
+#define  ASM_OUTPUT_LABELREF(STREAM, NAME)	\
+do {						\
+  if ((NAME)[0] != FASTCALL_PREFIX)		\
+    fputs (user_label_prefix, (STREAM));	\
+  fputs ((NAME), (STREAM));			\
+} while (0)
+
+/* This does much the same in memory rather than to a stream.  */
+#undef TARGET_MANGLE_ASSEMBLER_NAME
+#define TARGET_MANGLE_ASSEMBLER_NAME i386_pe_mangle_assembler_name
+
+
+/* Emit code to check the stack when allocating more than 4000
+   bytes in one go.  */
+#define CHECK_STACK_LIMIT 4000
+
+#undef STACK_BOUNDARY
+#define STACK_BOUNDARY	(TARGET_64BIT && ix86_abi == MS_ABI ? 128 : BITS_PER_WORD)
+
+/* By default, target has a 80387, uses IEEE compatible arithmetic,
+   returns float values in the 387 and needs stack probes.
+   We also align doubles to 64-bits for MSVC default compatibility.  */
+
+#undef TARGET_SUBTARGET_DEFAULT
+#define TARGET_SUBTARGET_DEFAULT \
+	(MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS \
+	 | MASK_STACK_PROBE | MASK_ALIGN_DOUBLE)
+
+#undef TARGET_SUBTARGET64_DEFAULT
+#define TARGET_SUBTARGET64_DEFAULT \
+	MASK_128BIT_LONG_DOUBLE
+
+/* This is how to output an assembler line
+   that says to advance the location counter
+   to a multiple of 2**LOG bytes.  */
+
+#undef ASM_OUTPUT_ALIGN
+#define ASM_OUTPUT_ALIGN(FILE,LOG)	\
+    if ((LOG)!=0) fprintf ((FILE), "\t.align %d\n", 1<<(LOG))
+
+/* Windows uses explicit import from shared libraries.  */
+#define MULTIPLE_SYMBOL_SPACES 1
+
+#define TARGET_ASM_UNIQUE_SECTION i386_pe_unique_section
+#define TARGET_ASM_FUNCTION_RODATA_SECTION default_no_function_rodata_section
+
+#define SUPPORTS_ONE_ONLY 1
+
+/* Switch into a generic section.  */
+#define TARGET_ASM_NAMED_SECTION  i386_pe_asm_named_section
+
+/* Select attributes for named sections.  */
+#define TARGET_SECTION_TYPE_FLAGS  i386_pe_section_type_flags
+
+/* Write the extra assembler code needed to declare a function
+   properly.  If we are generating SDB debugging information, this
+   will happen automatically, so we only need to handle other cases.  */
+#undef ASM_DECLARE_FUNCTION_NAME
+#define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL) \
+  i386_pe_start_function (FILE, NAME, DECL)
+
+#undef ASM_DECLARE_FUNCTION_SIZE
+#define ASM_DECLARE_FUNCTION_SIZE(FILE,NAME,DECL) \
+  i386_pe_end_function (FILE, NAME, DECL)
+
+/* Add an external function to the list of functions to be declared at
+   the end of the file.  */
+#define ASM_OUTPUT_EXTERNAL(FILE, DECL, NAME)				\
+  do									\
+    {									\
+      if (TREE_CODE (DECL) == FUNCTION_DECL)				\
+	i386_pe_record_external_function ((DECL), (NAME));		\
+    }									\
+  while (0)
+
+/* Declare the type properly for any external libcall.  */
+#define ASM_OUTPUT_EXTERNAL_LIBCALL(FILE, FUN) \
+  i386_pe_declare_function_type (FILE, XSTR (FUN, 0), 1)
+
+/* This says out to put a global symbol in the BSS section.  */
+#undef ASM_OUTPUT_ALIGNED_BSS
+#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \
+  asm_output_aligned_bss ((FILE), (DECL), (NAME), (SIZE), (ALIGN))
+
+/* Put all *tf routines in libgcc.  */
+#undef LIBGCC2_HAS_TF_MODE
+#define LIBGCC2_HAS_TF_MODE 1
+#define LIBGCC2_TF_CEXT q
+#define TF_SIZE 113
+
+/* Output function declarations at the end of the file.  */
+#undef TARGET_ASM_FILE_END
+#define TARGET_ASM_FILE_END i386_pe_file_end
+
+#undef ASM_COMMENT_START
+#define ASM_COMMENT_START " #"
+
+#ifndef DWARF2_UNWIND_INFO
+/* If configured with --disable-sjlj-exceptions, use DWARF2, else
+   default to SJLJ.  */
+#if  (defined (CONFIG_SJLJ_EXCEPTIONS) && !CONFIG_SJLJ_EXCEPTIONS)
+/* The logic of this #if must be kept synchronised with the logic
+   for selecting the tmake_eh_file fragment in config.gcc.  */
+#define DWARF2_UNWIND_INFO 1
+/* If multilib is selected break build as sjlj is required.  */
+#if defined (TARGET_BI_ARCH)
+#error For 64-bit windows and 32-bit based multilib version of gcc just SJLJ exceptions are supported.
+#endif
+#else
+#define DWARF2_UNWIND_INFO 0
+#endif
+#endif
+
+/* Don't assume anything about the header files.  */
+#define NO_IMPLICIT_EXTERN_C
+
+#undef PROFILE_HOOK
+#define PROFILE_HOOK(LABEL)						\
+  if (MAIN_NAME_P (DECL_NAME (current_function_decl)))			\
+    {									\
+      emit_call_insn (gen_rtx_CALL (VOIDmode,				\
+	gen_rtx_MEM (FUNCTION_MODE,					\
+		     gen_rtx_SYMBOL_REF (Pmode, "_monstartup")),	\
+	const0_rtx));							\
+    }
+
+/* Java Native Interface (JNI) methods on Win32 are invoked using the
+   stdcall calling convention.  */
+#undef MODIFY_JNI_METHOD_CALL
+#define MODIFY_JNI_METHOD_CALL(MDECL)					      \
+  build_type_attribute_variant ((MDECL),				      \
+			       build_tree_list (get_identifier ("stdcall"),   \
+						NULL))
+
+/* For Win32 ABI compatibility */
+#undef DEFAULT_PCC_STRUCT_RETURN
+#define DEFAULT_PCC_STRUCT_RETURN 0
+
+/* MSVC returns aggregate types of up to 8 bytes via registers.
+   See i386.c:ix86_return_in_memory.  */
+#undef MS_AGGREGATE_RETURN
+#define MS_AGGREGATE_RETURN 1
+
+/* Biggest alignment supported by the object file format of this
+   machine.  Use this macro to limit the alignment which can be
+   specified using the `__attribute__ ((aligned (N)))' construct.  If
+   not defined, the default value is `BIGGEST_ALIGNMENT'.  */
+/* IMAGE_SCN_ALIGN_8192BYTES is the largest section alignment flag
+   specified in the PECOFF60 spec.  Native MS compiler also limits
+   user-specified alignment to 8192 bytes.  */
+#undef MAX_OFILE_ALIGNMENT
+#define MAX_OFILE_ALIGNMENT (8192 * 8)
+
+/* BIGGEST_FIELD_ALIGNMENT macro is used directly by libobjc, There, we
+   align internal doubles in structures on dword boundaries. Otherwise,
+   support vector modes using ADJUST_FIELD_ALIGN, defined in i386.h.  */
+#ifdef IN_TARGET_LIBS
+#undef	BIGGEST_FIELD_ALIGNMENT
+#define BIGGEST_FIELD_ALIGNMENT 64
+#endif
+
+/* A bit-field declared as `int' forces `int' alignment for the struct.  */
+#undef PCC_BITFIELD_TYPE_MATTERS
+#define PCC_BITFIELD_TYPE_MATTERS 1
+#define GROUP_BITFIELDS_BY_ALIGN TYPE_NATIVE(rec)
+
+/* Enable alias attribute support.  */
+#ifndef SET_ASM_OP
+#define SET_ASM_OP "\t.set\t"
+#endif
+
+/* This implements the `alias' attribute, keeping any stdcall or
+   fastcall decoration.  */
+#undef	ASM_OUTPUT_DEF_FROM_DECLS
+#define	ASM_OUTPUT_DEF_FROM_DECLS(STREAM, DECL, TARGET)			\
+  do									\
+    {									\
+      const char *alias							\
+	= IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (DECL));		\
+      i386_pe_maybe_record_exported_symbol (DECL, alias, 0);		\
+      if (TREE_CODE (DECL) == FUNCTION_DECL)				\
+	i386_pe_declare_function_type (STREAM, alias,			\
+				       TREE_PUBLIC (DECL));		\
+      ASM_OUTPUT_DEF (STREAM, alias, IDENTIFIER_POINTER (TARGET));	\
+    } while (0)
+
+/* GNU as supports weak symbols on PECOFF. */
+#ifdef HAVE_GAS_WEAK
+#define ASM_WEAKEN_LABEL(FILE, NAME)  \
+  do                                  \
+    {                                 \
+      fputs ("\t.weak\t", (FILE));    \
+      assemble_name ((FILE), (NAME)); \
+      fputc ('\n', (FILE));           \
+    }                                 \
+  while (0)
+#endif /* HAVE_GAS_WEAK */
+
+/* FIXME: SUPPORTS_WEAK && TARGET_HAVE_NAMED_SECTIONS is true,
+   but for .jcr section to work we also need crtbegin and crtend
+   objects.  */
+#define TARGET_USE_JCR_SECTION 1
+
+/* Decide whether it is safe to use a local alias for a virtual function
+   when constructing thunks.  */
+#undef TARGET_USE_LOCAL_THUNK_ALIAS_P
+#define TARGET_USE_LOCAL_THUNK_ALIAS_P(DECL) (!DECL_ONE_ONLY (DECL))
+
+#define SUBTARGET_ATTRIBUTE_TABLE \
+  { "selectany", 0, 0, true, false, false, ix86_handle_selectany_attribute, \
+    false }
+  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
+       affects_type_identity } */
+
+/*  mcount() does not need a counter variable.  */
+#undef NO_PROFILE_COUNTERS
+#define NO_PROFILE_COUNTERS 1
+
+#define TARGET_VALID_DLLIMPORT_ATTRIBUTE_P i386_pe_valid_dllimport_attribute_p
+#define TARGET_CXX_ADJUST_CLASS_AT_DEFINITION i386_pe_adjust_class_at_definition
+#define SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME i386_pe_mangle_decl_assembler_name
+
+#undef TARGET_ASM_ASSEMBLE_VISIBILITY
+#define TARGET_ASM_ASSEMBLE_VISIBILITY i386_pe_assemble_visibility
+
+#undef SUB_TARGET_RECORD_STUB
+#define SUB_TARGET_RECORD_STUB i386_pe_record_stub
+
+/* Static stack checking is supported by means of probes.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
diff --git a/gcc-4.9/gcc/config/i386/cygming.opt b/gcc-4.9/gcc/config/i386/cygming.opt
new file mode 100644
index 000000000..3437123f4
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/cygming.opt
@@ -0,0 +1,60 @@
+; Cygwin- and MinGW-specific options.
+
+; Copyright (C) 2005-2014 Free Software Foundation, Inc.
+;
+; This file is part of GCC.
+;
+; GCC is free software; you can redistribute it and/or modify it under
+; the terms of the GNU General Public License as published by the Free
+; Software Foundation; either version 3, or (at your option) any later
+; version.
+;
+; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+; for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with GCC; see the file COPYING3.  If not see
+; <http://www.gnu.org/licenses/>.
+
+mconsole
+Target RejectNegative
+Create console application
+
+mdll
+Target RejectNegative
+Generate code for a DLL
+
+mnop-fun-dllimport
+Target Report Var(TARGET_NOP_FUN_DLLIMPORT)
+Ignore dllimport for functions
+
+mthreads
+Target RejectNegative
+Use Mingw-specific thread support
+
+mwin32
+Target
+Set Windows defines
+
+mwindows
+Target
+Create GUI application
+
+mpe-aligned-commons
+Target Var(use_pe_aligned_common) Init(HAVE_GAS_ALIGNED_COMM)
+Use the GNU extension to the PE format for aligned common data
+
+muse-libstdc-wrappers
+Target Condition({defined (USE_CYGWIN_LIBSTDCXX_WRAPPERS)})
+Compile code that relies on Cygwin DLL wrappers to support C++ operator new/delete replacement
+
+posix
+Driver
+
+fwritable-relocated-rdata
+Common Report Var(flag_writable_rel_rdata) Init(0)
+Put relocated read-only data into .data section.
+
+; Retain blank line above
diff --git a/gcc-4.9/gcc/config/i386/cygwin-stdint.h b/gcc-4.9/gcc/config/i386/cygwin-stdint.h
new file mode 100644
index 000000000..3c82cc6c3
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/cygwin-stdint.h
@@ -0,0 +1,94 @@
+/* Definitions for <stdint.h> types on systems using Cygwin.
+   Copyright (C) 2009-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define SIG_ATOMIC_TYPE "int"
+
+/* Exact-width integer types */
+
+#define INT8_TYPE "signed char"
+#define INT16_TYPE "short int"
+#define INT32_TYPE "int"
+#ifdef __x86_64__
+#define INT64_TYPE "long int"
+#else
+#define INT64_TYPE "long long int"
+#endif
+
+#define UINT8_TYPE "unsigned char"
+#define UINT16_TYPE "short unsigned int"
+#define UINT32_TYPE "unsigned int"
+#ifdef __x86_64__
+#define UINT64_TYPE "long unsigned int"
+#else
+#define UINT64_TYPE "long long unsigned int"
+#endif
+
+/* Minimum-width integer types */
+
+#define INT_LEAST8_TYPE "signed char"
+#define INT_LEAST16_TYPE "short int"
+#define INT_LEAST32_TYPE "int"
+#ifdef __x86_64__
+#define INT_LEAST64_TYPE "long int"
+#else
+#define INT_LEAST64_TYPE "long long int"
+#endif
+
+#define UINT_LEAST8_TYPE "unsigned char"
+#define UINT_LEAST16_TYPE "short unsigned int"
+#define UINT_LEAST32_TYPE "unsigned int"
+#ifdef __x86_64__
+#define UINT_LEAST64_TYPE "long unsigned int"
+#else
+#define UINT_LEAST64_TYPE "long long unsigned int"
+#endif
+
+/* Fastest minimum-width integer types */
+
+#define INT_FAST8_TYPE "signed char"
+#ifdef __x86_64__
+#define INT_FAST16_TYPE "long int"
+#define INT_FAST32_TYPE "long int"
+#define INT_FAST64_TYPE "long int"
+#else
+#define INT_FAST16_TYPE "int"
+#define INT_FAST32_TYPE "int"
+#define INT_FAST64_TYPE "long long int"
+#endif
+
+#define UINT_FAST8_TYPE "unsigned char"
+#ifdef __x86_64__
+#define UINT_FAST16_TYPE "long unsigned int"
+#define UINT_FAST32_TYPE "long unsigned int"
+#define UINT_FAST64_TYPE "long unsigned int"
+#else
+#define UINT_FAST16_TYPE "unsigned int"
+#define UINT_FAST32_TYPE "unsigned int"
+#define UINT_FAST64_TYPE "long long unsigned int"
+#endif
+
+/* Integer types capable of holding object pointers */
+
+#ifdef __x86_64__
+#define INTPTR_TYPE "long int"
+#define UINTPTR_TYPE "long unsigned int"
+#else
+#define INTPTR_TYPE "int"
+#define UINTPTR_TYPE "unsigned int"
+#endif
diff --git a/gcc-4.9/gcc/config/i386/cygwin-w64.h b/gcc-4.9/gcc/config/i386/cygwin-w64.h
new file mode 100644
index 000000000..06a6cd98c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/cygwin-w64.h
@@ -0,0 +1,83 @@
+/* Operating system specific defines to be used when targeting GCC for
+   hosting on Windows 32/64 via Cygwin runtime, using GNU tools and
+   the Windows API Library.
+   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Enable multilib.  */
+
+#undef ASM_SPEC
+#define ASM_SPEC "%{m32:--32} %{m64:--64}"
+
+/* To implement C++ function replacement we always wrap the cxx
+   malloc-like operators.  See N2800 #17.6.4.6 [replacement.functions] */
+#undef CXX_WRAP_SPEC_LIST
+#define CXX_WRAP_SPEC_LIST " \
+  --wrap _Znwm \
+  --wrap _Znam \
+  --wrap _ZdlPv \
+  --wrap _ZdaPv \
+  --wrap _ZnwmRKSt9nothrow_t \
+  --wrap _ZnamRKSt9nothrow_t \
+  --wrap _ZdlPvRKSt9nothrow_t \
+  --wrap _ZdaPvRKSt9nothrow_t \
+"
+
+#undef SPEC_32
+#undef SPEC_64
+#define SPEC_32 "m32"
+#define SPEC_64 "!m32"
+
+#undef SUB_LINK_ENTRY32
+#undef SUB_LINK_ENTRY64
+#define SUB_LINK_ENTRY32 "-e __cygwin_dll_entry@12"
+#define SUB_LINK_ENTRY64 "-e _cygwin_dll_entry"
+
+#undef SUB_LINK_SPEC
+#undef SUB_LINK_ENTRY
+#define SUB_LINK_SPEC "%{" SPEC_64 ":-m i386pep} %{" SPEC_32 ":-m i386pe}"
+#define SUB_LINK_ENTRY "%{" SPEC_64 ":" SUB_LINK_ENTRY64 "} %{" SPEC_32 ":" SUB_LINK_ENTRY32 "}"
+
+#undef MULTILIB_DEFAULTS
+#define MULTILIB_DEFAULTS { "m64" }
+
+#undef LINK_SPEC
+#define LINK_SPEC SUB_LINK_SPEC "\
+  %{mwindows:--subsystem windows} \
+  %{mconsole:--subsystem console} \
+  " CXX_WRAP_SPEC " \
+  %{shared: %{mdll: %eshared and mdll are not compatible}} \
+  %{shared: --shared} %{mdll:--dll} \
+  %{static:-Bstatic} %{!static:-Bdynamic} \
+  %{shared|mdll: " SUB_LINK_ENTRY " --enable-auto-image-base} \
+  %(shared_libgcc_undefs) \
+  --dll-search-prefix=cyg -tsaware"
+
+/* Cygwin64 will have a 64-bit long type. */
+#undef LONG_TYPE_SIZE
+#undef LONG_TYPE_SIZE
+#define LONG_TYPE_SIZE (TARGET_64BIT ? 64 : 32)
+
+/* Override default "long long unsigned int" from cygming.h. */
+#undef SIZE_TYPE
+#define SIZE_TYPE (TARGET_64BIT ? "long unsigned int" : "unsigned int")
+#undef PTRDIFF_TYPE
+#define PTRDIFF_TYPE (TARGET_64BIT ? "long int" : "int")
+
+#undef LIBGCC_SONAME
+#define LIBGCC_SONAME "cyggcc_s-seh-1.dll"
diff --git a/gcc-4.9/gcc/config/i386/cygwin.h b/gcc-4.9/gcc/config/i386/cygwin.h
new file mode 100644
index 000000000..f7b9a284c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/cygwin.h
@@ -0,0 +1,146 @@
+/* Operating system specific defines to be used when targeting GCC for
+   hosting on Windows32, using a Unix style C library and tools.
+   Copyright (C) 1995-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define EXTRA_OS_CPP_BUILTINS()					\
+  do								\
+    {								\
+      builtin_define ("__CYGWIN__");				\
+      if (!TARGET_64BIT)					\
+	builtin_define ("__CYGWIN32__");			\
+      builtin_define ("__unix__");				\
+      builtin_define ("__unix");				\
+    }								\
+  while (0)
+
+#undef CPP_SPEC
+#define CPP_SPEC "%(cpp_cpu) %{posix:-D_POSIX_SOURCE} \
+  %{!ansi:-Dunix} \
+  %{mwin32:-DWIN32 -D_WIN32 -D__WIN32 -D__WIN32__ %{!ansi:-DWINNT}} \
+  %{!nostdinc:%{!mno-win32:-idirafter ../include/w32api%s -idirafter ../../include/w32api%s}}\
+"
+
+#undef STARTFILE_SPEC
+#define STARTFILE_SPEC "\
+  %{!shared: %{!mdll: crt0%O%s \
+  %{pg:gcrt0%O%s}}}\
+  crtbegin.o%s"
+
+#undef ENDFILE_SPEC
+#define ENDFILE_SPEC \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\
+   crtend.o%s"
+
+/* Normally, -lgcc is not needed since everything in it is in the DLL, but we
+   want to allow things to be added to it when installing new versions of
+   GCC without making a new CYGWIN.DLL, so we leave it.  Profiling is handled
+   by calling the init function from main.  */
+
+#ifdef ENABLE_SHARED_LIBGCC
+#define SHARED_LIBGCC_SPEC " \
+ %{static|static-libgcc:-lgcc -lgcc_eh} \
+ %{!static: \
+   %{!static-libgcc: \
+     -lgcc_s -lgcc \
+    } \
+  } "
+#else
+#define SHARED_LIBGCC_SPEC " -lgcc "
+#endif
+
+#undef REAL_LIBGCC_SPEC
+#define REAL_LIBGCC_SPEC SHARED_LIBGCC_SPEC
+
+/* We have to dynamic link to get to the system DLLs.  All of libc, libm and
+   the Unix stuff is in cygwin.dll.  The import library is called
+   'libcygwin.a'.  For Windows applications, include more libraries, but
+   always include kernel32.  We'd like to specific subsystem windows to
+   ld, but that doesn't work just yet.  */
+
+#undef LIB_SPEC
+#define LIB_SPEC "\
+  %{pg:-lgmon} \
+  -lcygwin \
+  %{mwindows:-lgdi32 -lcomdlg32} \
+  -ladvapi32 -lshell32 -luser32 -lkernel32"
+
+/* To implement C++ function replacement we always wrap the cxx
+   malloc-like operators.  See N2800 #17.6.4.6 [replacement.functions] */
+#define CXX_WRAP_SPEC_LIST " \
+  --wrap _Znwj \
+  --wrap _Znaj \
+  --wrap _ZdlPv \
+  --wrap _ZdaPv \
+  --wrap _ZnwjRKSt9nothrow_t \
+  --wrap _ZnajRKSt9nothrow_t \
+  --wrap _ZdlPvRKSt9nothrow_t \
+  --wrap _ZdaPvRKSt9nothrow_t \
+"
+
+#if defined (USE_CYGWIN_LIBSTDCXX_WRAPPERS)
+
+#if USE_CYGWIN_LIBSTDCXX_WRAPPERS
+/* Default on, only explict -mno disables.  */
+#define CXX_WRAP_SPEC_OPT "!mno-use-libstdc-wrappers"
+#else
+/* Default off, only explict -m enables.  */
+#define CXX_WRAP_SPEC_OPT "muse-libstdc-wrappers"
+#endif
+
+#define CXX_WRAP_SPEC "%{" CXX_WRAP_SPEC_OPT ":" CXX_WRAP_SPEC_LIST "}"
+
+#else /* !defined (USE_CYGWIN_LIBSTDCXX_WRAPPERS)  */
+
+#define CXX_WRAP_SPEC ""
+
+#endif /* ?defined (USE_CYGWIN_LIBSTDCXX_WRAPPERS) */
+
+#define LINK_SPEC "\
+  %{mwindows:--subsystem windows} \
+  %{mconsole:--subsystem console} \
+  " CXX_WRAP_SPEC " \
+  %{shared: %{mdll: %eshared and mdll are not compatible}} \
+  %{shared: --shared} %{mdll:--dll} \
+  %{static:-Bstatic} %{!static:-Bdynamic} \
+  %{shared|mdll: --enable-auto-image-base -e __cygwin_dll_entry@12} \
+  --dll-search-prefix=cyg -tsaware"
+
+/* Binutils does not handle weak symbols from dlls correctly.  For now,
+   do not use them unnecessarily in gthr-posix.h.  */
+#define GTHREAD_USE_WEAK 0
+
+/* Every program on cygwin links against cygwin1.dll which contains 
+   the pthread routines.  There is no need to explicitly link them
+   and the -pthread flag is not recognized.  */
+#undef GOMP_SELF_SPECS
+#define GOMP_SELF_SPECS ""
+#undef GTM_SELF_SPECS
+#define GTM_SELF_SPECS ""
+
+/* This matches SHLIB_SONAME and SHLIB_SOVERSION in t-cygwin. */
+#if DWARF2_UNWIND_INFO
+#define LIBGCC_EH_EXTN ""
+#else
+#define LIBGCC_EH_EXTN "-sjlj"
+#endif
+#define LIBGCC_SONAME "cyggcc_s" LIBGCC_EH_EXTN "-1.dll"
+
+/* We should find a way to not have to update this manually.  */
+#define LIBGCJ_SONAME "cyggcj" /*LIBGCC_EH_EXTN*/ "-15.dll"
+
diff --git a/gcc-4.9/gcc/config/i386/darwin.h b/gcc-4.9/gcc/config/i386/darwin.h
new file mode 100644
index 000000000..a85aa42d5
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/darwin.h
@@ -0,0 +1,313 @@
+/* Target definitions for x86 running Darwin.
+   Copyright (C) 2001-2014 Free Software Foundation, Inc.
+   Contributed by Apple Computer Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Enable Mach-O bits in generic x86 code.  */
+#undef TARGET_MACHO
+#define TARGET_MACHO 1
+
+#undef DARWIN_X86
+#define DARWIN_X86 1
+
+#undef  TARGET_64BIT
+#undef	TARGET_64BIT_P
+#define TARGET_64BIT TARGET_ISA_64BIT
+#define	TARGET_64BIT_P(x) TARGET_ISA_64BIT_P(x)
+
+#ifdef IN_LIBGCC2
+#undef TARGET_64BIT
+#ifdef __x86_64__
+#define TARGET_64BIT 1
+#else
+#define TARGET_64BIT 0
+#endif
+#endif
+
+/* Size of the Obj-C jump buffer.  */
+#define OBJC_JBLEN ((TARGET_64BIT) ? ((9 * 2) + 3 + 16) : (18))
+
+#undef TARGET_FPMATH_DEFAULT
+#define TARGET_FPMATH_DEFAULT (TARGET_SSE ? FPMATH_SSE : FPMATH_387)
+
+#define TARGET_OS_CPP_BUILTINS()                \
+  do                                            \
+    {                                           \
+      builtin_define ("__LITTLE_ENDIAN__");     \
+      darwin_cpp_builtins (pfile);		\
+    }                                           \
+  while (0)
+
+#undef PTRDIFF_TYPE
+#define PTRDIFF_TYPE (TARGET_64BIT ? "long int" : "int")
+
+#undef WCHAR_TYPE
+#define WCHAR_TYPE "int"
+
+#undef WCHAR_TYPE_SIZE
+#define WCHAR_TYPE_SIZE 32
+
+/* Generate branch islands stubs if this is true.  */
+extern int darwin_emit_branch_islands;
+
+#undef TARGET_MACHO_BRANCH_ISLANDS
+#define TARGET_MACHO_BRANCH_ISLANDS darwin_emit_branch_islands
+
+/* For compatibility with OSX system tools, use the new style of pic stub
+   if this is set.  */
+#undef  MACHOPIC_ATT_STUB
+#define MACHOPIC_ATT_STUB (darwin_macho_att_stub)
+
+#undef MAX_BITS_PER_WORD
+#define MAX_BITS_PER_WORD 64
+
+#undef FORCE_PREFERRED_STACK_BOUNDARY_IN_MAIN
+#define FORCE_PREFERRED_STACK_BOUNDARY_IN_MAIN (0)
+
+#undef TARGET_KEEPS_VECTOR_ALIGNED_STACK
+#define TARGET_KEEPS_VECTOR_ALIGNED_STACK 1
+
+/* On Darwin, the stack is 128-bit aligned at the point of every call.
+   Failure to ensure this will lead to a crash in the system libraries
+   or dynamic loader.  */
+#undef STACK_BOUNDARY
+#define STACK_BOUNDARY \
+ ((profile_flag || (TARGET_64BIT && ix86_abi == MS_ABI)) \
+  ? 128 : BITS_PER_WORD)
+
+#undef MAIN_STACK_BOUNDARY
+#define MAIN_STACK_BOUNDARY 128
+
+/* Since we'll never want a stack boundary less aligned than 128 bits
+   we need the extra work here otherwise bits of gcc get very grumpy
+   when we ask for lower alignment.  We could just reject values less
+   than 128 bits for Darwin, but it's easier to up the alignment if
+   it's below the minimum.  */
+#undef PREFERRED_STACK_BOUNDARY
+#define PREFERRED_STACK_BOUNDARY			\
+  MAX (128, ix86_preferred_stack_boundary)
+
+/* We want -fPIC by default, unless we're using -static to compile for
+   the kernel or some such.  */
+
+#undef CC1_SPEC
+#define CC1_SPEC "%(cc1_cpu) \
+  %{!mkernel:%{!static:%{!mdynamic-no-pic:-fPIC}}} \
+  %{!mmacosx-version-min=*:-mmacosx-version-min=%(darwin_minversion)} \
+  %{g: %{!fno-eliminate-unused-debug-symbols: -feliminate-unused-debug-symbols }} " \
+  DARWIN_CC1_SPEC
+
+#undef ASM_SPEC
+#define ASM_SPEC "-arch %(darwin_arch) -force_cpusubtype_ALL \
+  %{static}"
+
+#define DARWIN_ARCH_SPEC "%{m64:x86_64;:i386}"
+#define DARWIN_SUBARCH_SPEC DARWIN_ARCH_SPEC
+
+/* Determine a minimum version based on compiler options.  */
+#define DARWIN_MINVERSION_SPEC				\
+ "%{!m64|fgnu-runtime:10.4;				\
+    ,objective-c|,objc-cpp-output:10.5;			\
+    ,objective-c-header:10.5;				\
+    ,objective-c++|,objective-c++-cpp-output:10.5;	\
+    ,objective-c++-header|,objc++-cpp-output:10.5;	\
+    :10.4}"
+
+#undef ENDFILE_SPEC
+#define ENDFILE_SPEC \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+   %{mpc32:crtprec32.o%s} \
+   %{mpc64:crtprec64.o%s} \
+   %{mpc80:crtprec80.o%s}" TM_DESTRUCTOR
+
+#undef SUBTARGET_EXTRA_SPECS
+#define SUBTARGET_EXTRA_SPECS                                   \
+  DARWIN_EXTRA_SPECS                                            \
+  { "darwin_arch", DARWIN_ARCH_SPEC },                          \
+  { "darwin_crt2", "" },                                        \
+  { "darwin_subarch", DARWIN_SUBARCH_SPEC },
+
+/* The Darwin assembler mostly follows AT&T syntax.  */
+#undef ASSEMBLER_DIALECT
+#define ASSEMBLER_DIALECT ASM_ATT
+
+/* Define macro used to output shift-double opcodes when the shift
+   count is in %cl.  Some assemblers require %cl as an argument;
+   some don't.  This macro controls what to do: by default, don't
+   print %cl.  */
+
+#define SHIFT_DOUBLE_OMITS_COUNT 0
+
+/* Put all *tf routines in libgcc.  */
+#undef LIBGCC2_HAS_TF_MODE
+#define LIBGCC2_HAS_TF_MODE 1
+#define LIBGCC2_TF_CEXT q
+#define TF_SIZE 113
+
+#undef TARGET_ASM_FILE_END
+#define TARGET_ASM_FILE_END darwin_file_end
+
+/* Define the syntax of pseudo-ops, labels and comments.  */
+
+/* String containing the assembler's comment-starter.  */
+
+#define ASM_COMMENT_START "#"
+
+/* By default, target has a 80387, uses IEEE compatible arithmetic,
+   and returns float values in the 387.  */
+
+#undef TARGET_SUBTARGET_DEFAULT
+#define TARGET_SUBTARGET_DEFAULT (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_128BIT_LONG_DOUBLE)
+
+/* For darwin we want to target specific processor features as a minimum,
+   but these unfortunately don't correspond to a specific processor.  */
+#undef TARGET_SUBTARGET32_ISA_DEFAULT
+#define TARGET_SUBTARGET32_ISA_DEFAULT (OPTION_MASK_ISA_MMX		\
+					| OPTION_MASK_ISA_SSE		\
+					| OPTION_MASK_ISA_SSE2		\
+					| OPTION_MASK_ISA_SSE3)
+
+#undef TARGET_SUBTARGET64_ISA_DEFAULT
+#define TARGET_SUBTARGET64_ISA_DEFAULT TARGET_SUBTARGET32_ISA_DEFAULT
+
+#undef GOT_SYMBOL_NAME
+#define GOT_SYMBOL_NAME MACHOPIC_FUNCTION_BASE_NAME
+
+/* Define the syntax of pseudo-ops, labels and comments.  */
+
+#define LPREFIX "L"
+
+/* Assembler pseudos to introduce constants of various size.  */
+
+#define ASM_BYTE "\t.byte\t"
+#define ASM_SHORT "\t.word\t"
+#define ASM_LONG "\t.long\t"
+#define ASM_QUAD "\t.quad\t"
+
+#define SUBTARGET_ENCODE_SECTION_INFO  darwin_encode_section_info
+
+#undef ASM_OUTPUT_ALIGN
+#define ASM_OUTPUT_ALIGN(FILE,LOG)	\
+ do { if ((LOG) != 0)			\
+        {				\
+          if (in_section == text_section) \
+            fprintf (FILE, "\t%s %d,0x90\n", ALIGN_ASM_OP, (LOG)); \
+          else				\
+            fprintf (FILE, "\t%s %d\n", ALIGN_ASM_OP, (LOG)); \
+        }				\
+    } while (0)
+
+/* Darwin profiling -- call mcount.  */
+#undef FUNCTION_PROFILER
+#define FUNCTION_PROFILER(FILE, LABELNO)				\
+    do {								\
+      if (TARGET_MACHO_BRANCH_ISLANDS 					\
+	   && MACHOPIC_INDIRECT && !TARGET_64BIT)			\
+	{								\
+	  const char *name = machopic_mcount_stub_name ();		\
+	  fprintf (FILE, "\tcall %s\n", name+1);  /*  skip '&'  */	\
+	  machopic_validate_stub_or_non_lazy_ptr (name);		\
+	}								\
+      else fprintf (FILE, "\tcall mcount\n");				\
+    } while (0)
+
+#define C_COMMON_OVERRIDE_OPTIONS					\
+  do {									\
+    SUBTARGET_C_COMMON_OVERRIDE_OPTIONS;				\
+  } while (0)
+
+#undef SUBTARGET_OVERRIDE_OPTIONS
+#define SUBTARGET_OVERRIDE_OPTIONS \
+do {									\
+  if (TARGET_64BIT && MACHO_DYNAMIC_NO_PIC_P)				\
+    target_flags &= ~MASK_MACHO_DYNAMIC_NO_PIC;				\
+} while (0)
+
+/* Darwin on x86_64 uses dwarf-2 by default.  Pre-darwin9 32-bit
+   compiles default to stabs+.  darwin9+ defaults to dwarf-2.  */
+#ifndef DARWIN_PREFER_DWARF
+#undef PREFERRED_DEBUGGING_TYPE
+#define PREFERRED_DEBUGGING_TYPE (TARGET_64BIT ? DWARF2_DEBUG : DBX_DEBUG)
+#endif
+
+/* Darwin uses the standard DWARF register numbers but the default
+   register numbers for STABS.  Fortunately for 64-bit code the
+   default and the standard are the same.  */
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n) 					\
+  (TARGET_64BIT ? dbx64_register_map[n]				\
+   : write_symbols == DWARF2_DEBUG ? svr4_dbx_register_map[n]	\
+   : dbx_register_map[n])
+
+/* Unfortunately, the 32-bit EH information also doesn't use the standard
+   DWARF register numbers.  */
+#define DWARF2_FRAME_REG_OUT(n, for_eh)					\
+  (! (for_eh) || write_symbols != DWARF2_DEBUG || TARGET_64BIT ? (n)	\
+   : (n) == 5 ? 4							\
+   : (n) == 4 ? 5							\
+   : (n) >= 11 && (n) <= 18 ? (n) + 1					\
+   : (n))
+
+#undef REGISTER_SUBTARGET_PRAGMAS
+#define REGISTER_SUBTARGET_PRAGMAS() DARWIN_REGISTER_TARGET_PRAGMAS()
+
+#undef TARGET_SET_DEFAULT_TYPE_ATTRIBUTES
+#define TARGET_SET_DEFAULT_TYPE_ATTRIBUTES darwin_set_default_type_attributes
+
+/* For 64-bit, we need to add 4 because @GOTPCREL is relative to the
+   end of the instruction, but without the 4 we'd only have the right
+   address for the start of the instruction.  */
+#undef ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX
+#define ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX(FILE, ENCODING, SIZE, ADDR, DONE)	\
+  if (TARGET_64BIT)				                                \
+    {                                                                           \
+      if ((SIZE) == 4 && ((ENCODING) & 0x70) == DW_EH_PE_pcrel)			\
+        {                                                                       \
+	   fputs (ASM_LONG, FILE);                                              \
+	   assemble_name (FILE, XSTR (ADDR, 0));				\
+	   fputs ("+4@GOTPCREL", FILE);                                         \
+	   goto DONE;                                                           \
+        }									\
+    }										\
+  else                                                                          \
+    {										\
+      if (ENCODING == ASM_PREFERRED_EH_DATA_FORMAT (2, 1))                      \
+        {                                                                       \
+          darwin_non_lazy_pcrel (FILE, ADDR);                                   \
+          goto DONE;								\
+        }                                                                       \
+    }
+
+/* This needs to move since i386 uses the first flag and other flags are
+   used in Mach-O.  */
+#undef MACHO_SYMBOL_FLAG_VARIABLE
+#define MACHO_SYMBOL_FLAG_VARIABLE ((SYMBOL_FLAG_MACH_DEP) << 3)
+
+#undef MACHOPIC_NL_SYMBOL_PTR_SECTION
+#define MACHOPIC_NL_SYMBOL_PTR_SECTION \
+		".section __IMPORT,__pointers,non_lazy_symbol_pointers"
+
+#define SUBTARGET32_DEFAULT_CPU "i686"
+
+#undef  SUBTARGET_INIT_BUILTINS
+#define SUBTARGET_INIT_BUILTINS					\
+do {								\
+  ix86_builtins[(int) IX86_BUILTIN_CFSTRING]			\
+    = darwin_init_cfstring_builtins ((unsigned) (IX86_BUILTIN_CFSTRING));	\
+  darwin_rename_builtins ();					\
+} while(0)
diff --git a/gcc-4.9/gcc/config/i386/darwin64.h b/gcc-4.9/gcc/config/i386/darwin64.h
new file mode 100644
index 000000000..143971136
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/darwin64.h
@@ -0,0 +1,32 @@
+/* Target definitions for x86_64 running Darwin.
+   Copyright (C) 2006-2014 Free Software Foundation, Inc.
+   Contributed by Apple Computer Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#undef  DARWIN_ARCH_SPEC
+#define DARWIN_ARCH_SPEC "%{m32:i386;:x86_64}"
+
+#undef  DARWIN_SUBARCH_SPEC
+#define DARWIN_SUBARCH_SPEC DARWIN_ARCH_SPEC
+
+#undef SUBTARGET_EXTRA_SPECS
+#define SUBTARGET_EXTRA_SPECS                                   \
+  DARWIN_EXTRA_SPECS                                            \
+  { "darwin_arch", DARWIN_ARCH_SPEC },                          \
+  { "darwin_crt2", "" },                                        \
+  { "darwin_subarch", DARWIN_SUBARCH_SPEC },
diff --git a/gcc-4.9/gcc/config/i386/djgpp-stdint.h b/gcc-4.9/gcc/config/i386/djgpp-stdint.h
new file mode 100644
index 000000000..5af1771b2
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/djgpp-stdint.h
@@ -0,0 +1,62 @@
+/* Definitions for <stdint.h> types on systems using DJGPP.
+   Copyright (C) 2009-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define SIG_ATOMIC_TYPE "int"
+
+/* Exact-width integer types */
+
+#define INT8_TYPE "signed char"
+#define INT16_TYPE "signed short int"
+#define INT32_TYPE "signed long int"
+#define INT64_TYPE "signed long long int"
+
+#define UINT8_TYPE "unsigned char"
+#define UINT16_TYPE "short unsigned int"
+#define UINT32_TYPE "long unsigned int"
+#define UINT64_TYPE "long long unsigned int"
+
+/* Minimum-width integer types */
+
+#define INT_LEAST8_TYPE "signed char"
+#define INT_LEAST16_TYPE "signed short int"
+#define INT_LEAST32_TYPE "signed int"
+#define INT_LEAST64_TYPE "signed long long int"
+
+#define UINT_LEAST8_TYPE "unsigned char"
+#define UINT_LEAST16_TYPE "short unsigned int"
+#define UINT_LEAST32_TYPE "unsigned int"
+#define UINT_LEAST64_TYPE "long long unsigned int"
+
+/* Fastest minimum-width integer types */
+
+#define INT_FAST8_TYPE "signed char"
+#define INT_FAST16_TYPE "signed int"
+#define INT_FAST32_TYPE "signed int"
+#define INT_FAST64_TYPE "long long signed int"
+
+#define UINT_FAST8_TYPE "unsigned char"
+#define UINT_FAST16_TYPE "unsigned int"
+#define UINT_FAST32_TYPE "unsigned int"
+#define UINT_FAST64_TYPE "long long unsigned int"
+
+/* Integer types capable of holding object pointers */
+
+#define INTPTR_TYPE "long int"
+#define UINTPTR_TYPE "long unsigned int"
+
diff --git a/gcc-4.9/gcc/config/i386/djgpp.h b/gcc-4.9/gcc/config/i386/djgpp.h
new file mode 100644
index 000000000..6ddd833ba
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/djgpp.h
@@ -0,0 +1,178 @@
+/* Configuration for an i386 running MS-DOS with DJGPP.
+   Copyright (C) 1997-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Support generation of DWARF2 debugging info.  */
+#define DWARF2_DEBUGGING_INFO 1
+
+/* Don't assume anything about the header files.  */
+#define NO_IMPLICIT_EXTERN_C
+
+#undef BSS_SECTION_ASM_OP
+#define BSS_SECTION_ASM_OP "\t.section\t.bss"
+
+/* Define the name of the .data section.  */
+#undef DATA_SECTION_ASM_OP
+#define DATA_SECTION_ASM_OP "\t.section .data"
+
+/* Enable alias attribute support.  */
+#ifndef SET_ASM_OP
+#define SET_ASM_OP "\t.set\t"
+#endif
+
+/* Define the name of the .text section.  */
+#undef TEXT_SECTION_ASM_OP
+#define TEXT_SECTION_ASM_OP "\t.section .text"
+
+/* Define standard DJGPP installation paths.  */
+/* We override default /usr or /usr/local part with /dev/env/DJDIR which */
+/* points to actual DJGPP installation directory.  */
+
+/* Search for as.exe and ld.exe in DJGPP's binary directory.  */ 
+#undef MD_EXEC_PREFIX
+#define MD_EXEC_PREFIX "/dev/env/DJDIR/bin/"
+
+/* Standard DJGPP library and startup files */
+#undef MD_STARTFILE_PREFIX
+#define MD_STARTFILE_PREFIX "/dev/env/DJDIR/lib/"
+
+/* Correctly handle absolute filename detection in cp/xref.c */
+#define FILE_NAME_ABSOLUTE_P(NAME) \
+        (((NAME)[0] == '/') || ((NAME)[0] == '\\') || \
+        (((NAME)[0] >= 'A') && ((NAME)[0] <= 'z') && ((NAME)[1] == ':')))
+
+#define TARGET_OS_CPP_BUILTINS()		\
+  do						\
+    {						\
+	builtin_define_std ("MSDOS");		\
+	builtin_define_std ("GO32");		\
+	builtin_assert ("system=msdos");	\
+    }						\
+  while (0)
+
+/* Include <sys/version.h> so __DJGPP__ and __DJGPP_MINOR__ are defined.  */
+#undef CPP_SPEC
+#define CPP_SPEC "-remap %{posix:-D_POSIX_SOURCE} \
+  -imacros %s../include/sys/version.h"
+
+/* We need to override link_command_spec in gcc.c so support -Tdjgpp.djl.
+   This cannot be done in LINK_SPECS as that LINK_SPECS is processed
+   before library search directories are known by the linker.
+   This avoids problems when specs file is not available. An alternate way,
+   suggested by Robert Hoehne, is to use SUBTARGET_EXTRA_SPECS instead.
+*/ 
+
+#undef LINK_COMMAND_SPEC
+#define LINK_COMMAND_SPEC \
+"%{!fsyntax-only: \
+%{!c:%{!M:%{!MM:%{!E:%{!S:%(linker) %l %X %{o*} %{e*} %{N} %{n} \
+\t%{r} %{s} %{t} %{u*} %{z} %{Z}\
+\t%{!nostdlib:%{!nostartfiles:%S}}\
+\t%{static:} %{L*} %D %o\
+\t%{!nostdlib:%{!nodefaultlibs:%G %L %G}}\
+\t%{!nostdlib:%{!nostartfiles:%E}}\
+\t-Tdjgpp.djl %{T*}}}}}}}\n\
+%{!c:%{!M:%{!MM:%{!E:%{!S:stubify %{v} %{o*:%*} %{!o*:a.out} }}}}}"
+
+/* Always just link in 'libc.a'.  */
+#undef LIB_SPEC
+#define LIB_SPEC "-lc"
+
+/* Pick the right startup code depending on the -pg flag.  */
+#undef STARTFILE_SPEC
+#define STARTFILE_SPEC "%{pg:gcrt0.o%s}%{!pg:crt0.o%s}"
+
+/* Make sure that gcc will not look for .h files in /usr/local/include 
+   unless user explicitly requests it.  */
+#undef LOCAL_INCLUDE_DIR
+
+/* Switch into a generic section.  */
+#define TARGET_ASM_NAMED_SECTION  default_coff_asm_named_section
+
+/* This is how to output an assembler line
+   that says to advance the location counter
+   to a multiple of 2**LOG bytes.  */
+
+#undef ASM_OUTPUT_ALIGN
+#define ASM_OUTPUT_ALIGN(FILE,LOG) \
+  if ((LOG) != 0) fprintf ((FILE), "\t.p2align %d\n", LOG)
+
+/* This is how to output a global symbol in the BSS section.  */
+#undef ASM_OUTPUT_ALIGNED_BSS
+#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \
+  asm_output_aligned_bss ((FILE), (DECL), (NAME), (SIZE), (ALIGN))
+
+/* Write the extra assembler code needed to declare a function properly.  */
+
+#ifndef ASM_DECLARE_FUNCTION_NAME
+#define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL)		\
+  do								\
+    {								\
+      ASM_OUTPUT_FUNCTION_LABEL (FILE, NAME, DECL);		\
+    }								\
+  while (0)
+#endif
+
+/* This is how to tell assembler that a symbol is weak  */ 
+#undef ASM_WEAKEN_LABEL
+#define ASM_WEAKEN_LABEL(FILE,NAME) \
+  do { fputs ("\t.weak\t", FILE); assemble_name (FILE, NAME); \
+       fputc ('\n', FILE); } while (0)
+
+/* djgpp automatically calls its own version of __main, so don't define one
+   in libgcc, nor call one in main().  */
+#define HAS_INIT_SECTION
+
+#undef TARGET_LIBC_HAS_FUNCTION
+#define TARGET_LIBC_HAS_FUNCTION no_c99_libc_has_function
+
+/* Definitions for types and sizes. Wide characters are 16-bits long so
+   Win32 compiler add-ons will be wide character compatible.  */
+#undef WCHAR_TYPE_SIZE
+#define WCHAR_TYPE_SIZE 16
+
+#undef WCHAR_TYPE
+#define WCHAR_TYPE "short unsigned int"
+
+#undef WINT_TYPE
+#define WINT_TYPE "int"
+
+#undef SIZE_TYPE
+#define SIZE_TYPE "long unsigned int"
+
+#undef PTRDIFF_TYPE
+#define PTRDIFF_TYPE "int"
+
+/* Used to be defined in xm-djgpp.h, but moved here for cross-compilers.  */
+#define LIBSTDCXX "stdcxx"
+
+/* Warn that -mbnu210 is now obsolete.  */
+#undef  SUBTARGET_OVERRIDE_OPTIONS
+#define SUBTARGET_OVERRIDE_OPTIONS \
+do \
+  { \
+    if (TARGET_BNU210) \
+      {	\
+        warning (0, "-mbnu210 is ignored (option is obsolete)"); \
+      }	\
+  } \
+while (0)
+
+/* Support for C++ templates.  */
+#undef MAKE_DECL_ONE_ONLY
+#define MAKE_DECL_ONE_ONLY(DECL) (DECL_WEAK (DECL) = 1)
diff --git a/gcc-4.9/gcc/config/i386/djgpp.opt b/gcc-4.9/gcc/config/i386/djgpp.opt
new file mode 100644
index 000000000..21fc54afb
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/djgpp.opt
@@ -0,0 +1,28 @@
+; DJGPP-specific options.
+
+; Copyright (C) 2005-2014 Free Software Foundation, Inc.
+;
+; This file is part of GCC.
+;
+; GCC is free software; you can redistribute it and/or modify it under
+; the terms of the GNU General Public License as published by the Free
+; Software Foundation; either version 3, or (at your option) any later
+; version.
+;
+; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+; for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with GCC; see the file COPYING3.  If not see
+; <http://www.gnu.org/licenses/>.
+
+;; -mbnu210 is now ignored and obsolete.  It was used to enable support for
+;; weak symbols, and .gnu.linkonce support.
+mbnu210
+Target Var(TARGET_BNU210)
+Ignored (obsolete)
+
+posix
+Driver
diff --git a/gcc-4.9/gcc/config/i386/driver-i386.c b/gcc-4.9/gcc/config/i386/driver-i386.c
new file mode 100644
index 000000000..1f5a11c9c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/driver-i386.c
@@ -0,0 +1,913 @@
+/* Subroutines for the gcc driver.
+   Copyright (C) 2006-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+
+const char *host_detect_local_cpu (int argc, const char **argv);
+
+#ifdef __GNUC__
+#include "cpuid.h"
+
+struct cache_desc
+{
+  unsigned sizekb;
+  unsigned assoc;
+  unsigned line;
+};
+
+/* Returns command line parameters that describe size and
+   cache line size of the processor caches.  */
+
+static char *
+describe_cache (struct cache_desc level1, struct cache_desc level2)
+{
+  char size[100], line[100], size2[100];
+
+  /* At the moment, gcc does not use the information
+     about the associativity of the cache.  */
+
+  snprintf (size, sizeof (size),
+	    "--param l1-cache-size=%u ", level1.sizekb);
+  snprintf (line, sizeof (line),
+	    "--param l1-cache-line-size=%u ", level1.line);
+
+  snprintf (size2, sizeof (size2),
+	    "--param l2-cache-size=%u ", level2.sizekb);
+
+  return concat (size, line, size2, NULL);
+}
+
+/* Detect L2 cache parameters using CPUID extended function 0x80000006.  */
+
+static void
+detect_l2_cache (struct cache_desc *level2)
+{
+  unsigned eax, ebx, ecx, edx;
+  unsigned assoc;
+
+  __cpuid (0x80000006, eax, ebx, ecx, edx);
+
+  level2->sizekb = (ecx >> 16) & 0xffff;
+  level2->line = ecx & 0xff;
+
+  assoc = (ecx >> 12) & 0xf;
+  if (assoc == 6)
+    assoc = 8;
+  else if (assoc == 8)
+    assoc = 16;
+  else if (assoc >= 0xa && assoc <= 0xc)
+    assoc = 32 + (assoc - 0xa) * 16;
+  else if (assoc >= 0xd && assoc <= 0xe)
+    assoc = 96 + (assoc - 0xd) * 32;
+
+  level2->assoc = assoc;
+}
+
+/* Returns the description of caches for an AMD processor.  */
+
+static const char *
+detect_caches_amd (unsigned max_ext_level)
+{
+  unsigned eax, ebx, ecx, edx;
+
+  struct cache_desc level1, level2 = {0, 0, 0};
+
+  if (max_ext_level < 0x80000005)
+    return "";
+
+  __cpuid (0x80000005, eax, ebx, ecx, edx);
+
+  level1.sizekb = (ecx >> 24) & 0xff;
+  level1.assoc = (ecx >> 16) & 0xff;
+  level1.line = ecx & 0xff;
+
+  if (max_ext_level >= 0x80000006)
+    detect_l2_cache (&level2);
+
+  return describe_cache (level1, level2);
+}
+
+/* Decodes the size, the associativity and the cache line size of
+   L1/L2 caches of an Intel processor.  Values are based on
+   "Intel Processor Identification and the CPUID Instruction"
+   [Application Note 485], revision -032, December 2007.  */
+
+static void
+decode_caches_intel (unsigned reg, bool xeon_mp,
+		     struct cache_desc *level1, struct cache_desc *level2)
+{
+  int i;
+
+  for (i = 24; i >= 0; i -= 8)
+    switch ((reg >> i) & 0xff)
+      {
+      case 0x0a:
+	level1->sizekb = 8; level1->assoc = 2; level1->line = 32;
+	break;
+      case 0x0c:
+	level1->sizekb = 16; level1->assoc = 4; level1->line = 32;
+	break;
+      case 0x0d:
+	level1->sizekb = 16; level1->assoc = 4; level1->line = 64;
+	break;
+      case 0x0e:
+	level1->sizekb = 24; level1->assoc = 6; level1->line = 64;
+	break;
+      case 0x21:
+	level2->sizekb = 256; level2->assoc = 8; level2->line = 64;
+	break;
+      case 0x24:
+	level2->sizekb = 1024; level2->assoc = 16; level2->line = 64;
+	break;
+      case 0x2c:
+	level1->sizekb = 32; level1->assoc = 8; level1->line = 64;
+	break;
+      case 0x39:
+	level2->sizekb = 128; level2->assoc = 4; level2->line = 64;
+	break;
+      case 0x3a:
+	level2->sizekb = 192; level2->assoc = 6; level2->line = 64;
+	break;
+      case 0x3b:
+	level2->sizekb = 128; level2->assoc = 2; level2->line = 64;
+	break;
+      case 0x3c:
+	level2->sizekb = 256; level2->assoc = 4; level2->line = 64;
+	break;
+      case 0x3d:
+	level2->sizekb = 384; level2->assoc = 6; level2->line = 64;
+	break;
+      case 0x3e:
+	level2->sizekb = 512; level2->assoc = 4; level2->line = 64;
+	break;
+      case 0x41:
+	level2->sizekb = 128; level2->assoc = 4; level2->line = 32;
+	break;
+      case 0x42:
+	level2->sizekb = 256; level2->assoc = 4; level2->line = 32;
+	break;
+      case 0x43:
+	level2->sizekb = 512; level2->assoc = 4; level2->line = 32;
+	break;
+      case 0x44:
+	level2->sizekb = 1024; level2->assoc = 4; level2->line = 32;
+	break;
+      case 0x45:
+	level2->sizekb = 2048; level2->assoc = 4; level2->line = 32;
+	break;
+      case 0x48:
+	level2->sizekb = 3072; level2->assoc = 12; level2->line = 64;
+	break;
+      case 0x49:
+	if (xeon_mp)
+	  break;
+	level2->sizekb = 4096; level2->assoc = 16; level2->line = 64;
+	break;
+      case 0x4e:
+	level2->sizekb = 6144; level2->assoc = 24; level2->line = 64;
+	break;
+      case 0x60:
+	level1->sizekb = 16; level1->assoc = 8; level1->line = 64;
+	break;
+      case 0x66:
+	level1->sizekb = 8; level1->assoc = 4; level1->line = 64;
+	break;
+      case 0x67:
+	level1->sizekb = 16; level1->assoc = 4; level1->line = 64;
+	break;
+      case 0x68:
+	level1->sizekb = 32; level1->assoc = 4; level1->line = 64;
+	break;
+      case 0x78:
+	level2->sizekb = 1024; level2->assoc = 4; level2->line = 64;
+	break;
+      case 0x79:
+	level2->sizekb = 128; level2->assoc = 8; level2->line = 64;
+	break;
+      case 0x7a:
+	level2->sizekb = 256; level2->assoc = 8; level2->line = 64;
+	break;
+      case 0x7b:
+	level2->sizekb = 512; level2->assoc = 8; level2->line = 64;
+	break;
+      case 0x7c:
+	level2->sizekb = 1024; level2->assoc = 8; level2->line = 64;
+	break;
+      case 0x7d:
+	level2->sizekb = 2048; level2->assoc = 8; level2->line = 64;
+	break;
+      case 0x7f:
+	level2->sizekb = 512; level2->assoc = 2; level2->line = 64;
+	break;
+      case 0x80:
+	level2->sizekb = 512; level2->assoc = 8; level2->line = 64;
+	break;
+      case 0x82:
+	level2->sizekb = 256; level2->assoc = 8; level2->line = 32;
+	break;
+      case 0x83:
+	level2->sizekb = 512; level2->assoc = 8; level2->line = 32;
+	break;
+      case 0x84:
+	level2->sizekb = 1024; level2->assoc = 8; level2->line = 32;
+	break;
+      case 0x85:
+	level2->sizekb = 2048; level2->assoc = 8; level2->line = 32;
+	break;
+      case 0x86:
+	level2->sizekb = 512; level2->assoc = 4; level2->line = 64;
+	break;
+      case 0x87:
+	level2->sizekb = 1024; level2->assoc = 8; level2->line = 64;
+
+      default:
+	break;
+      }
+}
+
+/* Detect cache parameters using CPUID function 2.  */
+
+static void
+detect_caches_cpuid2 (bool xeon_mp, 
+		      struct cache_desc *level1, struct cache_desc *level2)
+{
+  unsigned regs[4];
+  int nreps, i;
+
+  __cpuid (2, regs[0], regs[1], regs[2], regs[3]);
+
+  nreps = regs[0] & 0x0f;
+  regs[0] &= ~0x0f;
+
+  while (--nreps >= 0)
+    {
+      for (i = 0; i < 4; i++)
+	if (regs[i] && !((regs[i] >> 31) & 1))
+	  decode_caches_intel (regs[i], xeon_mp, level1, level2);
+
+      if (nreps)
+	__cpuid (2, regs[0], regs[1], regs[2], regs[3]);
+    }
+}
+
+/* Detect cache parameters using CPUID function 4. This
+   method doesn't require hardcoded tables.  */
+
+enum cache_type
+{
+  CACHE_END = 0,
+  CACHE_DATA = 1,
+  CACHE_INST = 2,
+  CACHE_UNIFIED = 3
+};
+
+static void
+detect_caches_cpuid4 (struct cache_desc *level1, struct cache_desc *level2,
+		      struct cache_desc *level3)
+{
+  struct cache_desc *cache;
+
+  unsigned eax, ebx, ecx, edx;
+  int count;
+
+  for (count = 0;; count++)
+    { 
+      __cpuid_count(4, count, eax, ebx, ecx, edx);
+      switch (eax & 0x1f)
+	{
+	case CACHE_END:
+	  return;
+	case CACHE_DATA:
+	case CACHE_UNIFIED:
+	  {
+	    switch ((eax >> 5) & 0x07)
+	      {
+	      case 1:
+		cache = level1;
+		break;
+	      case 2:
+		cache = level2;
+		break;
+	      case 3:
+		cache = level3;
+		break;
+	      default:
+		cache = NULL;
+	      }
+
+	    if (cache)
+	      {
+		unsigned sets = ecx + 1;
+		unsigned part = ((ebx >> 12) & 0x03ff) + 1;
+
+		cache->assoc = ((ebx >> 22) & 0x03ff) + 1;
+		cache->line = (ebx & 0x0fff) + 1;
+
+		cache->sizekb = (cache->assoc * part
+				 * cache->line * sets) / 1024;
+	      }
+	  }
+	default:
+	  break;
+	}
+    }
+}
+
+/* Returns the description of caches for an Intel processor.  */
+
+static const char *
+detect_caches_intel (bool xeon_mp, unsigned max_level,
+		     unsigned max_ext_level, unsigned *l2sizekb)
+{
+  struct cache_desc level1 = {0, 0, 0}, level2 = {0, 0, 0}, level3 = {0, 0, 0};
+
+  if (max_level >= 4)
+    detect_caches_cpuid4 (&level1, &level2, &level3);
+  else if (max_level >= 2)
+    detect_caches_cpuid2 (xeon_mp, &level1, &level2);
+  else
+    return "";
+
+  if (level1.sizekb == 0)
+    return "";
+
+  /* Let the L3 replace the L2. This assumes inclusive caches
+     and single threaded program for now. */
+  if (level3.sizekb)
+    level2 = level3;
+
+  /* Intel CPUs are equipped with AMD style L2 cache info.  Try this
+     method if other methods fail to provide L2 cache parameters.  */
+  if (level2.sizekb == 0 && max_ext_level >= 0x80000006)
+    detect_l2_cache (&level2);
+
+  *l2sizekb = level2.sizekb;
+
+  return describe_cache (level1, level2);
+}
+
+/* This will be called by the spec parser in gcc.c when it sees
+   a %:local_cpu_detect(args) construct.  Currently it will be called
+   with either "arch" or "tune" as argument depending on if -march=native
+   or -mtune=native is to be substituted.
+
+   It returns a string containing new command line parameters to be
+   put at the place of the above two options, depending on what CPU
+   this is executed.  E.g. "-march=k8" on an AMD64 machine
+   for -march=native.
+
+   ARGC and ARGV are set depending on the actual arguments given
+   in the spec.  */
+
+const char *host_detect_local_cpu (int argc, const char **argv)
+{
+  enum processor_type processor = PROCESSOR_I386;
+  const char *cpu = "i386";
+
+  const char *cache = "";
+  const char *options = "";
+
+  unsigned int eax, ebx, ecx, edx;
+
+  unsigned int max_level, ext_level;
+
+  unsigned int vendor;
+  unsigned int model, family;
+
+  unsigned int has_sse3, has_ssse3, has_cmpxchg16b;
+  unsigned int has_cmpxchg8b, has_cmov, has_mmx, has_sse, has_sse2;
+
+  /* Extended features */
+  unsigned int has_lahf_lm = 0, has_sse4a = 0;
+  unsigned int has_longmode = 0, has_3dnowp = 0, has_3dnow = 0;
+  unsigned int has_movbe = 0, has_sse4_1 = 0, has_sse4_2 = 0;
+  unsigned int has_popcnt = 0, has_aes = 0, has_avx = 0, has_avx2 = 0;
+  unsigned int has_pclmul = 0, has_abm = 0, has_lwp = 0;
+  unsigned int has_fma = 0, has_fma4 = 0, has_xop = 0;
+  unsigned int has_bmi = 0, has_bmi2 = 0, has_tbm = 0, has_lzcnt = 0;
+  unsigned int has_hle = 0, has_rtm = 0;
+  unsigned int has_rdrnd = 0, has_f16c = 0, has_fsgsbase = 0;
+  unsigned int has_rdseed = 0, has_prfchw = 0, has_adx = 0;
+  unsigned int has_osxsave = 0, has_fxsr = 0, has_xsave = 0, has_xsaveopt = 0;
+  unsigned int has_avx512er = 0, has_avx512pf = 0, has_avx512cd = 0;
+  unsigned int has_avx512f = 0, has_sha = 0, has_prefetchwt1 = 0;
+
+  bool arch;
+
+  unsigned int l2sizekb = 0;
+
+  if (argc < 1)
+    return NULL;
+
+  arch = !strcmp (argv[0], "arch");
+
+  if (!arch && strcmp (argv[0], "tune"))
+    return NULL;
+
+  max_level = __get_cpuid_max (0, &vendor);
+  if (max_level < 1)
+    goto done;
+
+  __cpuid (1, eax, ebx, ecx, edx);
+
+  model = (eax >> 4) & 0x0f;
+  family = (eax >> 8) & 0x0f;
+  if (vendor == signature_INTEL_ebx)
+    {
+      unsigned int extended_model, extended_family;
+
+      extended_model = (eax >> 12) & 0xf0;
+      extended_family = (eax >> 20) & 0xff;
+      if (family == 0x0f)
+	{
+	  family += extended_family;
+	  model += extended_model;
+	}
+      else if (family == 0x06)
+	model += extended_model;
+    }
+
+  has_sse3 = ecx & bit_SSE3;
+  has_ssse3 = ecx & bit_SSSE3;
+  has_sse4_1 = ecx & bit_SSE4_1;
+  has_sse4_2 = ecx & bit_SSE4_2;
+  has_avx = ecx & bit_AVX;
+  has_osxsave = ecx & bit_OSXSAVE;
+  has_cmpxchg16b = ecx & bit_CMPXCHG16B;
+  has_movbe = ecx & bit_MOVBE;
+  has_popcnt = ecx & bit_POPCNT;
+  has_aes = ecx & bit_AES;
+  has_pclmul = ecx & bit_PCLMUL;
+  has_fma = ecx & bit_FMA;
+  has_f16c = ecx & bit_F16C;
+  has_rdrnd = ecx & bit_RDRND;
+  has_xsave = ecx & bit_XSAVE;
+
+  has_cmpxchg8b = edx & bit_CMPXCHG8B;
+  has_cmov = edx & bit_CMOV;
+  has_mmx = edx & bit_MMX;
+  has_fxsr = edx & bit_FXSAVE;
+  has_sse = edx & bit_SSE;
+  has_sse2 = edx & bit_SSE2;
+
+  if (max_level >= 7)
+    {
+      __cpuid_count (7, 0, eax, ebx, ecx, edx);
+
+      has_bmi = ebx & bit_BMI;
+      has_hle = ebx & bit_HLE;
+      has_rtm = ebx & bit_RTM;
+      has_avx2 = ebx & bit_AVX2;
+      has_bmi2 = ebx & bit_BMI2;
+      has_fsgsbase = ebx & bit_FSGSBASE;
+      has_rdseed = ebx & bit_RDSEED;
+      has_adx = ebx & bit_ADX;
+      has_avx512f = ebx & bit_AVX512F;
+      has_avx512er = ebx & bit_AVX512ER;
+      has_avx512pf = ebx & bit_AVX512PF;
+      has_avx512cd = ebx & bit_AVX512CD;
+      has_sha = ebx & bit_SHA;
+
+      has_prefetchwt1 = ecx & bit_PREFETCHWT1;
+    }
+
+  if (max_level >= 13)
+    {
+      __cpuid_count (13, 1, eax, ebx, ecx, edx);
+
+      has_xsaveopt = eax & bit_XSAVEOPT;
+    }
+
+  /* Check cpuid level of extended features.  */
+  __cpuid (0x80000000, ext_level, ebx, ecx, edx);
+
+  if (ext_level > 0x80000000)
+    {
+      __cpuid (0x80000001, eax, ebx, ecx, edx);
+
+      has_lahf_lm = ecx & bit_LAHF_LM;
+      has_sse4a = ecx & bit_SSE4a;
+      has_abm = ecx & bit_ABM;
+      has_lwp = ecx & bit_LWP;
+      has_fma4 = ecx & bit_FMA4;
+      has_xop = ecx & bit_XOP;
+      has_tbm = ecx & bit_TBM;
+      has_lzcnt = ecx & bit_LZCNT;
+      has_prfchw = ecx & bit_PRFCHW;
+
+      has_longmode = edx & bit_LM;
+      has_3dnowp = edx & bit_3DNOWP;
+      has_3dnow = edx & bit_3DNOW;
+    }
+
+  /* Get XCR_XFEATURE_ENABLED_MASK register with xgetbv.  */
+#define XCR_XFEATURE_ENABLED_MASK	0x0
+#define XSTATE_FP			0x1
+#define XSTATE_SSE			0x2
+#define XSTATE_YMM			0x4
+  if (has_osxsave)
+    asm (".byte 0x0f; .byte 0x01; .byte 0xd0"
+	 : "=a" (eax), "=d" (edx)
+	 : "c" (XCR_XFEATURE_ENABLED_MASK));
+
+  /* Check if SSE and YMM states are supported.  */
+  if (!has_osxsave
+      || (eax & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM))
+    {
+      has_avx = 0;
+      has_avx2 = 0;
+      has_fma = 0;
+      has_fma4 = 0;
+      has_f16c = 0;
+      has_xop = 0;
+      has_xsave = 0;
+      has_xsaveopt = 0;
+    }
+
+  if (!arch)
+    {
+      if (vendor == signature_AMD_ebx
+	  || vendor == signature_CENTAUR_ebx
+	  || vendor == signature_CYRIX_ebx
+	  || vendor == signature_NSC_ebx)
+	cache = detect_caches_amd (ext_level);
+      else if (vendor == signature_INTEL_ebx)
+	{
+	  bool xeon_mp = (family == 15 && model == 6);
+	  cache = detect_caches_intel (xeon_mp, max_level,
+				       ext_level, &l2sizekb);
+	}
+    }
+
+  if (vendor == signature_AMD_ebx)
+    {
+      unsigned int name;
+
+      /* Detect geode processor by its processor signature.  */
+      if (ext_level > 0x80000001)
+	__cpuid (0x80000002, name, ebx, ecx, edx);
+      else
+	name = 0;
+
+      if (name == signature_NSC_ebx)
+	processor = PROCESSOR_GEODE;
+      else if (has_movbe)
+	processor = PROCESSOR_BTVER2;
+      else if (has_avx2)
+        processor = PROCESSOR_BDVER4;
+      else if (has_xsaveopt)
+        processor = PROCESSOR_BDVER3;
+      else if (has_bmi)
+        processor = PROCESSOR_BDVER2;
+      else if (has_xop)
+	processor = PROCESSOR_BDVER1;
+      else if (has_sse4a && has_ssse3)
+        processor = PROCESSOR_BTVER1;
+      else if (has_sse4a)
+	processor = PROCESSOR_AMDFAM10;
+      else if (has_sse2 || has_longmode)
+	processor = PROCESSOR_K8;
+      else if (has_3dnowp && family == 6)
+	processor = PROCESSOR_ATHLON;
+      else if (has_mmx)
+	processor = PROCESSOR_K6;
+      else
+	processor = PROCESSOR_PENTIUM;
+    }
+  else if (vendor == signature_CENTAUR_ebx)
+    {
+      if (arch)
+	{
+	  switch (family)
+	    {
+	    case 6:
+	      if (model > 9)
+		/* Use the default detection procedure.  */
+		processor = PROCESSOR_GENERIC;
+	      else if (model == 9)
+		cpu = "c3-2";
+	      else if (model >= 6)
+		cpu = "c3";
+	      else
+		processor = PROCESSOR_GENERIC;
+	      break;
+	    case 5:
+	      if (has_3dnow)
+		cpu = "winchip2";
+	      else if (has_mmx)
+		cpu = "winchip2-c6";
+	      else
+		processor = PROCESSOR_GENERIC;
+	      break;
+	    default:
+	      /* We have no idea.  */
+	      processor = PROCESSOR_GENERIC;
+	    }
+	}
+    }
+  else
+    {
+      switch (family)
+	{
+	case 4:
+	  processor = PROCESSOR_I486;
+	  break;
+	case 5:
+	  processor = PROCESSOR_PENTIUM;
+	  break;
+	case 6:
+	  processor = PROCESSOR_PENTIUMPRO;
+	  break;
+	case 15:
+	  processor = PROCESSOR_PENTIUM4;
+	  break;
+	default:
+	  /* We have no idea.  */
+	  processor = PROCESSOR_GENERIC;
+	}
+    }
+
+  switch (processor)
+    {
+    case PROCESSOR_I386:
+      /* Default.  */
+      break;
+    case PROCESSOR_I486:
+      cpu = "i486";
+      break;
+    case PROCESSOR_PENTIUM:
+      if (arch && has_mmx)
+	cpu = "pentium-mmx";
+      else
+	cpu = "pentium";
+      break;
+    case PROCESSOR_PENTIUMPRO:
+      switch (model)
+	{
+	case 0x1c:
+	case 0x26:
+	  /* Bonnell.  */
+	  cpu = "bonnell";
+	  break;
+	case 0x37:
+	case 0x4d:
+	  /* Silvermont.  */
+	  cpu = "silvermont";
+	  break;
+	case 0x0f:
+	  /* Merom.  */
+	case 0x17:
+	case 0x1d:
+	  /* Penryn.  */
+	  cpu = "core2";
+	  break;
+	case 0x1a:
+	case 0x1e:
+	case 0x1f:
+	case 0x2e:
+	  /* Nehalem.  */
+	  cpu = "nehalem";
+	  break;
+	case 0x25:
+	case 0x2c:
+	case 0x2f:
+	  /* Westmere.  */
+	  cpu = "westmere";
+	  break;
+	case 0x2a:
+	case 0x2d:
+	  /* Sandy Bridge.  */
+	  cpu = "sandybridge";
+	  break;
+	case 0x3a:
+	case 0x3e:
+	  /* Ivy Bridge.  */
+	  cpu = "ivybridge";
+	  break;
+	case 0x3c:
+	case 0x45:
+	case 0x46:
+	  /* Haswell.  */
+	  cpu = "haswell";
+	  break;
+	default:
+	  if (arch)
+	    {
+	      /* This is unknown family 0x6 CPU.  */
+	      if (has_adx)
+		cpu = "broadwell";
+	      else if (has_avx2)
+		/* Assume Haswell.  */
+		cpu = "haswell";
+	      else if (has_avx)
+		/* Assume Sandy Bridge.  */
+		cpu = "sandybridge";
+	      else if (has_sse4_2)
+		{
+		  if (has_movbe)
+		    /* Assume Silvermont.  */
+		    cpu = "silvermont";
+		  else
+		    /* Assume Nehalem.  */
+		    cpu = "nehalem";
+		}
+	      else if (has_ssse3)
+		{
+		  if (has_movbe)
+		    /* Assume Bonnell.  */
+		    cpu = "bonnell";
+		  else
+		    /* Assume Core 2.  */
+		    cpu = "core2";
+		}
+	      else if (has_sse3)
+		/* It is Core Duo.  */
+		cpu = "pentium-m";
+	      else if (has_sse2)
+		/* It is Pentium M.  */
+		cpu = "pentium-m";
+	      else if (has_sse)
+		/* It is Pentium III.  */
+		cpu = "pentium3";
+	      else if (has_mmx)
+		/* It is Pentium II.  */
+		cpu = "pentium2";
+	      else
+		/* Default to Pentium Pro.  */
+		cpu = "pentiumpro";
+	    }
+	  else
+	    /* For -mtune, we default to -mtune=generic.  */
+	    cpu = "generic";
+	  break;
+	}
+      break;
+    case PROCESSOR_PENTIUM4:
+      if (has_sse3)
+	{
+	  if (has_longmode)
+	    cpu = "nocona";
+	  else
+	    cpu = "prescott";
+	}
+      else
+	cpu = "pentium4";
+      break;
+    case PROCESSOR_GEODE:
+      cpu = "geode";
+      break;
+    case PROCESSOR_K6:
+      if (arch && has_3dnow)
+	cpu = "k6-3";
+      else
+	cpu = "k6";
+      break;
+    case PROCESSOR_ATHLON:
+      if (arch && has_sse)
+	cpu = "athlon-4";
+      else
+	cpu = "athlon";
+      break;
+    case PROCESSOR_K8:
+      if (arch && has_sse3)
+	cpu = "k8-sse3";
+      else
+	cpu = "k8";
+      break;
+    case PROCESSOR_AMDFAM10:
+      cpu = "amdfam10";
+      break;
+    case PROCESSOR_BDVER1:
+      cpu = "bdver1";
+      break;
+    case PROCESSOR_BDVER2:
+      cpu = "bdver2";
+      break;
+    case PROCESSOR_BDVER3:
+      cpu = "bdver3";
+      break;
+    case PROCESSOR_BDVER4:
+      cpu = "bdver4";
+      break;
+    case PROCESSOR_BTVER1:
+      cpu = "btver1";
+      break;
+    case PROCESSOR_BTVER2:
+      cpu = "btver2";
+      break;
+
+    default:
+      /* Use something reasonable.  */
+      if (arch)
+	{
+	  if (has_ssse3)
+	    cpu = "core2";
+	  else if (has_sse3)
+	    {
+	      if (has_longmode)
+		cpu = "nocona";
+	      else
+		cpu = "prescott";
+	    }
+	  else if (has_sse2)
+	    cpu = "pentium4";
+	  else if (has_cmov)
+	    cpu = "pentiumpro";
+	  else if (has_mmx)
+	    cpu = "pentium-mmx";
+	  else if (has_cmpxchg8b)
+	    cpu = "pentium";
+	}
+      else
+	cpu = "generic";
+    }
+
+  if (arch)
+    {
+      const char *mmx = has_mmx ? " -mmmx" : " -mno-mmx";
+      const char *mmx3dnow = has_3dnow ? " -m3dnow" : " -mno-3dnow";
+      const char *sse = has_sse ? " -msse" : " -mno-sse";
+      const char *sse2 = has_sse2 ? " -msse2" : " -mno-sse2";
+      const char *sse3 = has_sse3 ? " -msse3" : " -mno-sse3";
+      const char *ssse3 = has_ssse3 ? " -mssse3" : " -mno-ssse3";
+      const char *sse4a = has_sse4a ? " -msse4a" : " -mno-sse4a";
+      const char *cx16 = has_cmpxchg16b ? " -mcx16" : " -mno-cx16";
+      const char *sahf = has_lahf_lm ? " -msahf" : " -mno-sahf";
+      const char *movbe = has_movbe ? " -mmovbe" : " -mno-movbe";
+      const char *aes = has_aes ? " -maes" : " -mno-aes";
+      const char *sha = has_sha ? " -msha" : " -mno-sha";
+      const char *pclmul = has_pclmul ? " -mpclmul" : " -mno-pclmul";
+      const char *popcnt = has_popcnt ? " -mpopcnt" : " -mno-popcnt";
+      const char *abm = has_abm ? " -mabm" : " -mno-abm";
+      const char *lwp = has_lwp ? " -mlwp" : " -mno-lwp";
+      const char *fma = has_fma ? " -mfma" : " -mno-fma";
+      const char *fma4 = has_fma4 ? " -mfma4" : " -mno-fma4";
+      const char *xop = has_xop ? " -mxop" : " -mno-xop";
+      const char *bmi = has_bmi ? " -mbmi" : " -mno-bmi";
+      const char *bmi2 = has_bmi2 ? " -mbmi2" : " -mno-bmi2";
+      const char *tbm = has_tbm ? " -mtbm" : " -mno-tbm";
+      const char *avx = has_avx ? " -mavx" : " -mno-avx";
+      const char *avx2 = has_avx2 ? " -mavx2" : " -mno-avx2";
+      const char *sse4_2 = has_sse4_2 ? " -msse4.2" : " -mno-sse4.2";
+      const char *sse4_1 = has_sse4_1 ? " -msse4.1" : " -mno-sse4.1";
+      const char *lzcnt = has_lzcnt ? " -mlzcnt" : " -mno-lzcnt";
+      const char *hle = has_hle ? " -mhle" : " -mno-hle";
+      const char *rtm = has_rtm ? " -mrtm" : " -mno-rtm";
+      const char *rdrnd = has_rdrnd ? " -mrdrnd" : " -mno-rdrnd";
+      const char *f16c = has_f16c ? " -mf16c" : " -mno-f16c";
+      const char *fsgsbase = has_fsgsbase ? " -mfsgsbase" : " -mno-fsgsbase";
+      const char *rdseed = has_rdseed ? " -mrdseed" : " -mno-rdseed";
+      const char *prfchw = has_prfchw ? " -mprfchw" : " -mno-prfchw";
+      const char *adx = has_adx ? " -madx" : " -mno-adx";
+      const char *fxsr = has_fxsr ? " -mfxsr" : " -mno-fxsr";
+      const char *xsave = has_xsave ? " -mxsave" : " -mno-xsave";
+      const char *xsaveopt = has_xsaveopt ? " -mxsaveopt" : " -mno-xsaveopt";
+      const char *avx512f = has_avx512f ? " -mavx512f" : " -mno-avx512f";
+      const char *avx512er = has_avx512er ? " -mavx512er" : " -mno-avx512er";
+      const char *avx512cd = has_avx512cd ? " -mavx512cd" : " -mno-avx512cd";
+      const char *avx512pf = has_avx512pf ? " -mavx512pf" : " -mno-avx512pf";
+      const char *prefetchwt1 = has_prefetchwt1 ? " -mprefetchwt1" : " -mno-prefetchwt1";
+
+      options = concat (options, mmx, mmx3dnow, sse, sse2, sse3, ssse3,
+			sse4a, cx16, sahf, movbe, aes, sha, pclmul,
+			popcnt, abm, lwp, fma, fma4, xop, bmi, bmi2,
+			tbm, avx, avx2, sse4_2, sse4_1, lzcnt, rtm,
+			hle, rdrnd, f16c, fsgsbase, rdseed, prfchw, adx,
+			fxsr, xsave, xsaveopt, avx512f, avx512er,
+			avx512cd, avx512pf, prefetchwt1, NULL);
+    }
+
+done:
+  return concat (cache, "-m", argv[0], "=", cpu, options, NULL);
+}
+#else
+
+/* If we aren't compiling with GCC then the driver will just ignore
+   -march and -mtune "native" target and will leave to the newly
+   built compiler to generate code for its default target.  */
+
+const char *host_detect_local_cpu (int argc ATTRIBUTE_UNUSED,
+				   const char **argv ATTRIBUTE_UNUSED)
+{
+  return NULL;
+}
+#endif /* __GNUC__ */
diff --git a/gcc-4.9/gcc/config/i386/emmintrin.h b/gcc-4.9/gcc/config/i386/emmintrin.h
new file mode 100644
index 000000000..a2bdf0eda
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/emmintrin.h
@@ -0,0 +1,1541 @@
+/* Copyright (C) 2003-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef _EMMINTRIN_H_INCLUDED
+#define _EMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE header files*/
+#include <xmmintrin.h>
+
+#ifndef __SSE2__
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#define __DISABLE_SSE2__
+#endif /* __SSE2__ */
+
+/* SSE2 */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Create a selector for use with the SHUFPD instruction.  */
+#define _MM_SHUFFLE2(fp1,fp0) \
+ (((fp1) << 1) | (fp0))
+
+/* Create a vector with element 0 as F and the rest zero.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_sd (double __F)
+{
+  return __extension__ (__m128d){ __F, 0.0 };
+}
+
+/* Create a vector with both elements equal to F.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pd (double __F)
+{
+  return __extension__ (__m128d){ __F, __F };
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pd1 (double __F)
+{
+  return _mm_set1_pd (__F);
+}
+
+/* Create a vector with the lower value X and upper value W.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pd (double __W, double __X)
+{
+  return __extension__ (__m128d){ __X, __W };
+}
+
+/* Create a vector with the lower value W and upper value X.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pd (double __W, double __X)
+{
+  return __extension__ (__m128d){ __W, __X };
+}
+
+/* Create an undefined vector.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_pd (void)
+{
+  __m128d __Y = __Y;
+  return __Y;
+}
+
+/* Create a vector of zeros.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_pd (void)
+{
+  return __extension__ (__m128d){ 0.0, 0.0 };
+}
+
+/* Sets the low DPFP value of A from the low value of B.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+}
+
+/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_pd (double const *__P)
+{
+  return *(__m128d *)__P;
+}
+
+/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_pd (double const *__P)
+{
+  return __builtin_ia32_loadupd (__P);
+}
+
+/* Create a vector with all two elements equal to *P.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load1_pd (double const *__P)
+{
+  return _mm_set1_pd (*__P);
+}
+
+/* Create a vector with element 0 as *P and the rest zero.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_sd (double const *__P)
+{
+  return _mm_set_sd (*__P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_pd1 (double const *__P)
+{
+  return _mm_load1_pd (__P);
+}
+
+/* Load two DPFP values in reverse order.  The address must be aligned.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_pd (double const *__P)
+{
+  __m128d __tmp = _mm_load_pd (__P);
+  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
+}
+
+/* Store two DPFP values.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_pd (double *__P, __m128d __A)
+{
+  *(__m128d *)__P = __A;
+}
+
+/* Store two DPFP values.  The address need not be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_pd (double *__P, __m128d __A)
+{
+  __builtin_ia32_storeupd (__P, __A);
+}
+
+/* Stores the lower DPFP value.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_sd (double *__P, __m128d __A)
+{
+  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
+}
+
+extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_f64 (__m128d __A)
+{
+  return __builtin_ia32_vec_ext_v2df (__A, 0);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_pd (double *__P, __m128d __A)
+{
+  _mm_store_sd (__P, __A);
+}
+
+/* Stores the upper DPFP value.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeh_pd (double *__P, __m128d __A)
+{
+  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
+}
+
+/* Store the lower DPFP value across two words.
+   The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store1_pd (double *__P, __m128d __A)
+{
+  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_pd1 (double *__P, __m128d __A)
+{
+  _mm_store1_pd (__P, __A);
+}
+
+/* Store two DPFP values in reverse order.  The address must be aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storer_pd (double *__P, __m128d __A)
+{
+  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si32 (__m128i __A)
+{
+  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si64 (__m128i __A)
+{
+  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si64x (__m128i __A)
+{
+  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+}
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_pd (__m128d __A)
+{
+  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
+}
+
+/* Return pair {sqrt (A[0), B[1]}.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_sd (__m128d __A, __m128d __B)
+{
+  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+					 (__v2df)
+					 __builtin_ia32_cmpltsd ((__v2df) __B,
+								 (__v2df)
+								 __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+					 (__v2df)
+					 __builtin_ia32_cmplesd ((__v2df) __B,
+								 (__v2df)
+								 __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+					 (__v2df)
+					 __builtin_ia32_cmpnltsd ((__v2df) __B,
+								  (__v2df)
+								  __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+					 (__v2df)
+					 __builtin_ia32_cmpnlesd ((__v2df) __B,
+								  (__v2df)
+								  __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
+}
+
+/* Create a vector of Qi, where i is the element number.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi64x (long long __q1, long long __q0)
+{
+  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi64 (__m64 __q1,  __m64 __q0)
+{
+  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
+{
+  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
+	       short __q3, short __q2, short __q1, short __q0)
+{
+  return __extension__ (__m128i)(__v8hi){
+    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
+	      char __q11, char __q10, char __q09, char __q08,
+	      char __q07, char __q06, char __q05, char __q04,
+	      char __q03, char __q02, char __q01, char __q00)
+{
+  return __extension__ (__m128i)(__v16qi){
+    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
+  };
+}
+
+/* Set all of the elements of the vector to A.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi64x (long long __A)
+{
+  return _mm_set_epi64x (__A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi64 (__m64 __A)
+{
+  return _mm_set_epi64 (__A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi32 (int __A)
+{
+  return _mm_set_epi32 (__A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi16 (short __A)
+{
+  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi8 (char __A)
+{
+  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
+		       __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+/* Create a vector of Qi, where i is the element number.
+   The parameter order is reversed from the _mm_set_epi* functions.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi64 (__m64 __q0, __m64 __q1)
+{
+  return _mm_set_epi64 (__q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
+{
+  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
+	        short __q4, short __q5, short __q6, short __q7)
+{
+  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
+	       char __q04, char __q05, char __q06, char __q07,
+	       char __q08, char __q09, char __q10, char __q11,
+	       char __q12, char __q13, char __q14, char __q15)
+{
+  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
+		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
+}
+
+/* Create a vector with element 0 as *P and the rest zero.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_si128 (__m128i const *__P)
+{
+  return *__P;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_si128 (__m128i const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_epi64 (__m128i const *__P)
+{
+  return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_si128 (__m128i *__P, __m128i __B)
+{
+  *__P = __B;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_si128 (__m128i *__P, __m128i __B)
+{
+  __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_epi64 (__m128i *__P, __m128i __B)
+{
+  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi64_pi64 (__m128i __B)
+{
+  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movpi64_epi64 (__m64 __A)
+{
+  return _mm_set_epi64 ((__m64)0LL, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_epi64 (__m128i __A)
+{
+  return (__m128i)__builtin_ia32_movq128 ((__v2di) __A);
+}
+
+/* Create an undefined vector.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_si128 (void)
+{
+  __m128i __Y = __Y;
+  return __Y;
+}
+
+/* Create a vector of zeros.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si128 (void)
+{
+  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_pd (__m128i __A)
+{
+  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_ps (__m128i __A)
+{
+  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_epi32 (__m128d __A)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_pi32 (__m128d __A)
+{
+  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_ps (__m128d __A)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epi32 (__m128d __A)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_pi32 (__m128d __A)
+{
+  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_pd (__m64 __A)
+{
+  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epi32 (__m128 __A)
+{
+  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epi32 (__m128 __A)
+{
+  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pd (__m128 __A)
+{
+  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si32 (__m128d __A)
+{
+  return __builtin_ia32_cvtsd2si ((__v2df) __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si64 (__m128d __A)
+{
+  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si64x (__m128d __A)
+{
+  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
+}
+#endif
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si32 (__m128d __A)
+{
+  return __builtin_ia32_cvttsd2si ((__v2df) __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si64 (__m128d __A)
+{
+  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si64x (__m128d __A)
+{
+  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
+}
+#endif
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_ss (__m128 __A, __m128d __B)
+{
+  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_sd (__m128d __A, int __B)
+{
+  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_sd (__m128d __A, long long __B)
+{
+  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_sd (__m128d __A, long long __B)
+{
+  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
+}
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_sd (__m128d __A, __m128 __B)
+{
+  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
+{
+  return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask);
+}
+#else
+#define _mm_shuffle_pd(A, B, N)						\
+  ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A),		\
+				   (__v2df)(__m128d)(B), (int)(N)))
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadh_pd (__m128d __A, double const *__B)
+{
+  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_pd (__m128d __A, double const *__B)
+{
+  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pd (__m128d __A)
+{
+  return __builtin_ia32_movmskpd ((__v2df)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_su32 (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi16 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi32 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi64 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi16 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi32 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si128 (__m128i __A, const int __N)
+{
+  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si128 (__m128i __A, const int __N)
+{
+  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
+}
+#else
+#define _mm_srli_si128(A, N) \
+  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
+#define _mm_slli_si128(A, N) \
+  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi16 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi32 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi64 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi16 (__m128i const __A, int const __N)
+{
+  return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
+}
+#else
+#define _mm_extract_epi16(A, N) \
+  ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_insert_epi16(A, D, N)				\
+  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A),	\
+					  (int)(D), (int)(N)))
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_epi8 (__m128i __A)
+{
+  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflehi_epi16 (__m128i __A, const int __mask)
+{
+  return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflelo_epi16 (__m128i __A, const int __mask)
+{
+  return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi32 (__m128i __A, const int __mask)
+{
+  return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask);
+}
+#else
+#define _mm_shufflehi_epi16(A, N) \
+  ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_shufflelo_epi16(A, N) \
+  ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_shuffle_epi32(A, N) \
+  ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N)))
+#endif
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
+{
+  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si32 (int *__A, int __B)
+{
+  __builtin_ia32_movnti (__A, __B);
+}
+
+#ifdef __x86_64__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si64 (long long int *__A, long long int __B)
+{
+  __builtin_ia32_movnti64 (__A, __B);
+}
+#endif
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si128 (__m128i *__A, __m128i __B)
+{
+  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pd (double *__A, __m128d __B)
+{
+  __builtin_ia32_movntpd (__A, (__v2df)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clflush (void const *__A)
+{
+  __builtin_ia32_clflush (__A);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lfence (void)
+{
+  __builtin_ia32_lfence ();
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mfence (void)
+{
+  __builtin_ia32_mfence ();
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si128 (int __A)
+{
+  return _mm_set_epi32 (0, 0, 0, __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si128 (long long __A)
+{
+  return _mm_set_epi64x (0, __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_si128 (long long __A)
+{
+  return _mm_set_epi64x (0, __A);
+}
+#endif
+
+/* Casts between various SP, DP, INT vector types.  Note that these do no
+   conversion of values, they just change the type.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ps(__m128d __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_si128(__m128d __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_pd(__m128 __A)
+{
+  return (__m128d) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_si128(__m128 __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ps(__m128i __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_pd(__m128i __A)
+{
+  return (__m128d) __A;
+}
+
+#ifdef __DISABLE_SSE2__
+#undef __DISABLE_SSE2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE2__ */
+
+#endif /* _EMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/f16cintrin.h b/gcc-4.9/gcc/config/i386/f16cintrin.h
new file mode 100644
index 000000000..1181f8b94
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/f16cintrin.h
@@ -0,0 +1,98 @@
+/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <f16intrin.h> directly; include <x86intrin.h> or <immintrin.h> instead."
+#endif
+
+#ifndef _F16CINTRIN_H_INCLUDED
+#define _F16CINTRIN_H_INCLUDED
+
+#ifndef __F16C__
+#pragma GCC push_options
+#pragma GCC target("f16c")
+#define __DISABLE_F16C__
+#endif /* __F16C__ */
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_cvtsh_ss (unsigned short __S)
+{
+  __v8hi __H = __extension__ (__v8hi){ (short) __S, 0, 0, 0, 0, 0, 0, 0 };
+  __v4sf __A = __builtin_ia32_vcvtph2ps (__H);
+  return __builtin_ia32_vec_ext_v4sf (__A, 0);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_ps (__m128i __A)
+{
+  return (__m128) __builtin_ia32_vcvtph2ps ((__v8hi) __A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_ps (__m128i __A)
+{
+  return (__m256) __builtin_ia32_vcvtph2ps256 ((__v8hi) __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_cvtss_sh (float __F, const int __I)
+{
+  __v4sf __A =  __extension__ (__v4sf){ __F, 0, 0, 0 };
+  __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I);
+  return (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_ph (__m128 __A, const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph ((__v4sf) __A, __I);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_ph (__m256 __A, const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf) __A, __I);
+}
+#else
+#define _cvtss_sh(__F, __I)						\
+  (__extension__ 							\
+   ({									\
+      __v4sf __A =  __extension__ (__v4sf){ __F, 0, 0, 0 };		\
+      __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I);			\
+      (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0);		\
+    }))
+
+#define _mm_cvtps_ph(A, I) \
+  ((__m128i) __builtin_ia32_vcvtps2ph ((__v4sf)(__m128) A, (int) (I)))
+
+#define _mm256_cvtps_ph(A, I) \
+  ((__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf)(__m256) A, (int) (I)))
+#endif /* __OPTIMIZE */
+
+#ifdef __DISABLE_F16C__
+#undef __DISABLE_F16C__
+#pragma GCC pop_options
+#endif /* __DISABLE_F16C__ */
+
+#endif /* _F16CINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/fma4intrin.h b/gcc-4.9/gcc/config/i386/fma4intrin.h
new file mode 100644
index 000000000..e1bdef7b5
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/fma4intrin.h
@@ -0,0 +1,241 @@
+/* Copyright (C) 2007-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _FMA4INTRIN_H_INCLUDED
+#define _FMA4INTRIN_H_INCLUDED
+
+/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files.  */
+#include <ammintrin.h>
+
+#ifndef __FMA4__
+#pragma GCC push_options
+#pragma GCC target("fma4")
+#define __DISABLE_FMA4__
+#endif /* __FMA4__ */
+
+/* 128b Floating point multiply/add type instructions.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C)
+
+{
+  return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+/* 256b Floating point multiply/add type instructions.  */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C)
+
+{
+  return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+#ifdef __DISABLE_FMA4__
+#undef __DISABLE_FMA4__
+#pragma GCC pop_options
+#endif /* __DISABLE_FMA4__ */
+
+#endif
diff --git a/gcc-4.9/gcc/config/i386/fmaintrin.h b/gcc-4.9/gcc/config/i386/fmaintrin.h
new file mode 100644
index 000000000..bfbb75d59
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/fmaintrin.h
@@ -0,0 +1,302 @@
+/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _FMAINTRIN_H_INCLUDED
+#define _FMAINTRIN_H_INCLUDED
+
+#ifndef __FMA__
+#pragma GCC push_options
+#pragma GCC target("fma")
+#define __DISABLE_FMA__
+#endif /* __FMA__ */
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B,
+                                           (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B,
+                                              (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B,
+                                          (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B,
+                                             (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3 ((__v2df)__A, (__v2df)__B,
+                                             (__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss3 ((__v4sf)__A, (__v4sf)__B,
+                                            (__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B,
+                                           -(__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B,
+                                              -(__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B,
+                                          -(__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B,
+                                             -(__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3 ((__v2df)__A, (__v2df)__B,
+                                            -(__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3 ((__v4sf)__A, (__v4sf)__B,
+                                           -(__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B,
+                                           (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B,
+                                              (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B,
+                                          (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B,
+                                             (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3 ((__v2df)__A, -(__v2df)__B,
+                                            (__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3 ((__v4sf)__A, -(__v4sf)__B,
+                                           (__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B,
+                                           -(__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B,
+                                              -(__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B,
+                                          -(__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B,
+                                             -(__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3 ((__v2df)__A, -(__v2df)__B,
+                                            -(__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3 ((__v4sf)__A, -(__v4sf)__B,
+                                           -(__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B,
+                                              (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256 ((__v4df)__A,
+                                                 (__v4df)__B,
+                                                 (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B,
+                                             (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256 ((__v8sf)__A,
+                                                (__v8sf)__B,
+                                                (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsubadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B,
+                                              -(__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256 ((__v4df)__A,
+                                                 (__v4df)__B,
+                                                 -(__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B,
+                                             -(__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256 ((__v8sf)__A,
+                                                (__v8sf)__B,
+                                                -(__v8sf)__C);
+}
+
+#ifdef __DISABLE_FMA__
+#undef __DISABLE_FMA__
+#pragma GCC pop_options
+#endif /* __DISABLE_FMA__ */
+
+#endif
diff --git a/gcc-4.9/gcc/config/i386/freebsd.h b/gcc-4.9/gcc/config/i386/freebsd.h
new file mode 100644
index 000000000..bdca1b80b
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/freebsd.h
@@ -0,0 +1,149 @@
+/* Definitions for Intel 386 running FreeBSD with ELF format
+   Copyright (C) 1996-2014 Free Software Foundation, Inc.
+   Contributed by Eric Youngdale.
+   Modified for stabs-in-ELF by H.J. Lu.
+   Adapted from GNU/Linux version by John Polstra.
+   Continued development by David O'Brien <obrien@freebsd.org>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+
+/* Override the default comment-starter of "/".  */
+#undef  ASM_COMMENT_START
+#define ASM_COMMENT_START "#"
+
+#undef  ASM_APP_ON
+#define ASM_APP_ON "#APP\n"
+
+#undef  ASM_APP_OFF
+#define ASM_APP_OFF "#NO_APP\n"
+
+#undef  DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n) \
+  (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n])
+
+#undef  NO_PROFILE_COUNTERS
+#define NO_PROFILE_COUNTERS	1
+
+/* Tell final.c that we don't need a label passed to mcount.  */
+
+#undef  MCOUNT_NAME
+#define MCOUNT_NAME ".mcount"
+
+/* Make gcc agree with <machine/ansi.h>.  */
+
+#undef  SIZE_TYPE
+#define SIZE_TYPE	(TARGET_64BIT ? "long unsigned int" : "unsigned int")
+ 
+#undef  PTRDIFF_TYPE
+#define PTRDIFF_TYPE	(TARGET_64BIT ? "long int" : "int")
+  
+#undef  WCHAR_TYPE_SIZE
+#define WCHAR_TYPE_SIZE	(TARGET_64BIT ? 32 : BITS_PER_WORD)
+
+#undef  SUBTARGET_EXTRA_SPECS	/* i386.h bogusly defines it.  */
+#define SUBTARGET_EXTRA_SPECS \
+  { "fbsd_dynamic_linker", FBSD_DYNAMIC_LINKER }
+    
+/* Provide a STARTFILE_SPEC appropriate for FreeBSD.  Here we add
+   the magical crtbegin.o file (see crtstuff.c) which provides part 
+	of the support for getting C++ file-scope static object constructed 
+	before entering `main'.  */
+   
+#undef	STARTFILE_SPEC
+#define STARTFILE_SPEC \
+  "%{!shared: \
+     %{pg:gcrt1.o%s} %{!pg:%{p:gcrt1.o%s} \
+		       %{!p:%{profile:gcrt1.o%s} \
+			 %{!profile:crt1.o%s}}}} \
+   crti.o%s %{!shared:crtbegin.o%s} %{shared:crtbeginS.o%s}"
+
+/* Provide a ENDFILE_SPEC appropriate for FreeBSD.  Here we tack on
+   the magical crtend.o file (see crtstuff.c) which provides part of 
+	the support for getting C++ file-scope static object constructed 
+	before entering `main', followed by a normal "finalizer" file, 
+	`crtn.o'.  */
+
+#undef	ENDFILE_SPEC
+#define ENDFILE_SPEC \
+  "%{!shared:crtend.o%s} %{shared:crtendS.o%s} crtn.o%s"
+
+/* Provide a LINK_SPEC appropriate for FreeBSD.  Here we provide support
+   for the special GCC options -static and -shared, which allow us to
+   link things in one of these three modes by applying the appropriate
+   combinations of options at link-time.
+
+   When the -shared link option is used a final link is not being
+   done.  */
+
+#undef	LINK_SPEC
+#define LINK_SPEC "\
+  %{p:%nconsider using '-pg' instead of '-p' with gprof(1)} \
+  %{v:-V} \
+  %{assert*} %{R*} %{rpath*} %{defsym*} \
+  %{shared:-Bshareable %{h*} %{soname*}} \
+    %{!shared: \
+      %{!static: \
+        %{rdynamic:-export-dynamic} \
+        -dynamic-linker %(fbsd_dynamic_linker) } \
+    %{static:-Bstatic}} \
+  %{symbolic:-Bsymbolic}"
+
+/* A C statement to output to the stdio stream FILE an assembler
+   command to advance the location counter to a multiple of 1<<LOG
+   bytes if it is within MAX_SKIP bytes.
+
+   This is used to align code labels according to Intel recommendations.  */
+
+#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
+#undef  ASM_OUTPUT_MAX_SKIP_ALIGN
+#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE, LOG, MAX_SKIP)					\
+  if ((LOG) != 0) {														\
+    if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG));	\
+    else fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP));	\
+  }
+#endif
+
+/* Don't default to pcc-struct-return, we want to retain compatibility with
+   older gcc versions AND pcc-struct-return is nonreentrant.
+   (even though the SVR4 ABI for the i386 says that records and unions are
+   returned in memory).  */
+
+#undef  DEFAULT_PCC_STRUCT_RETURN
+#define DEFAULT_PCC_STRUCT_RETURN 0
+
+/* FreeBSD sets the rounding precision of the FPU to 53 bits.  Let the
+   compiler get the contents of <float.h> and std::numeric_limits correct.  */
+#undef TARGET_96_ROUND_53_LONG_DOUBLE
+#define TARGET_96_ROUND_53_LONG_DOUBLE (!TARGET_64BIT)
+
+/* Put all *tf routines in libgcc.  */
+#undef LIBGCC2_HAS_TF_MODE
+#define LIBGCC2_HAS_TF_MODE 1
+#define LIBGCC2_TF_CEXT q
+#define TF_SIZE 113
+
+/* Static stack checking is supported by means of probes.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
+
+/* Support for i386 has been removed from FreeBSD 6.0 onward.  */
+#if FBSD_MAJOR >= 6
+#define SUBTARGET32_DEFAULT_CPU "i486"
+#endif
+
+#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
+
diff --git a/gcc-4.9/gcc/config/i386/freebsd64.h b/gcc-4.9/gcc/config/i386/freebsd64.h
new file mode 100644
index 000000000..89430c432
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/freebsd64.h
@@ -0,0 +1,44 @@
+/* Definitions for AMD x86-64 running FreeBSD with ELF format
+   Copyright (C) 2002-2014 Free Software Foundation, Inc.
+   Contributed by David O'Brien <obrien@FreeBSD.org>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+
+#define SUBTARGET_EXTRA_SPECS \
+  { "fbsd_dynamic_linker", FBSD_DYNAMIC_LINKER }
+
+#undef CC1_SPEC
+#define CC1_SPEC "%(cc1_cpu) %{profile:-p}"
+
+/* Provide a LINK_SPEC appropriate for the FreeBSD/x86-64 ELF target.
+   This is a copy of LINK_SPEC from <i386/freebsd.h> tweaked for
+   the x86-64 target.  */
+
+#undef	LINK_SPEC
+#define LINK_SPEC "\
+  %{m32:-m elf_i386_fbsd} \
+  %{p:%nconsider using '-pg' instead of '-p' with gprof(1)} \
+  %{v:-V} \
+  %{assert*} %{R*} %{rpath*} %{defsym*} \
+  %{shared:-Bshareable %{h*} %{soname*}} \
+    %{!shared: \
+      %{!static: \
+        %{rdynamic:-export-dynamic} \
+	-dynamic-linker %(fbsd_dynamic_linker) } \
+    %{static:-Bstatic}} \
+  %{symbolic:-Bsymbolic}"
diff --git a/gcc-4.9/gcc/config/i386/fxsrintrin.h b/gcc-4.9/gcc/config/i386/fxsrintrin.h
new file mode 100644
index 000000000..98e73ee27
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/fxsrintrin.h
@@ -0,0 +1,73 @@
+/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* #if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED */
+/* # error "Never use <fxsrintrin.h> directly; include <x86intrin.h> instead." */
+/* #endif */
+
+#ifndef _FXSRINTRIN_H_INCLUDED
+#define _FXSRINTRIN_H_INCLUDED
+
+#ifndef __FXSR__
+#pragma GCC push_options
+#pragma GCC target("fxsr")
+#define __DISABLE_FXSR__
+#endif /* __FXSR__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxsave (void *__P)
+{
+  return __builtin_ia32_fxsave (__P);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxrstor (void *__P)
+{
+  return __builtin_ia32_fxrstor (__P);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxsave64 (void *__P)
+{
+    return __builtin_ia32_fxsave64 (__P);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxrstor64 (void *__P)
+{
+    return __builtin_ia32_fxrstor64 (__P);
+}
+#endif
+
+#ifdef __DISABLE_FXSR__
+#undef __DISABLE_FXSR__
+#pragma GCC pop_options
+#endif /* __DISABLE_FXSR__ */
+
+
+#endif /* _FXSRINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/gas.h b/gcc-4.9/gcc/config/i386/gas.h
new file mode 100644
index 000000000..edefc9aad
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/gas.h
@@ -0,0 +1,124 @@
+/* Definitions for Intel 386 using GAS.
+   Copyright (C) 1988-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Note that i386/seq-gas.h is a GAS configuration that does not use this
+   file.  */
+
+/* Use the bsd assembler syntax.  */
+/* we need to do this because gas is really a bsd style assembler,
+ * and so doesn't work well this these att-isms:
+ *
+ *  ASM_OUTPUT_SKIP is .set .,.+N, which isn't implemented in gas
+ *  ASM_OUTPUT_LOCAL is done with .set .,.+N, but that can't be
+ *   used to define bss static space
+ *
+ * Next is the question of whether to uses underscores.  RMS didn't
+ * like this idea at first, but since it is now obvious that we
+ * need this separate tm file for use with gas, at least to get
+ * dbx debugging info, I think we should also switch to underscores.
+ * We can keep i386v for real att style output, and the few
+ * people who want both form will have to compile twice.
+ */
+
+/* these come from i386/bsd.h, but are specific to sequent */
+#undef DBX_NO_XREFS
+#undef DBX_CONTIN_LENGTH
+
+/* Ask for COFF symbols.  */
+
+#define SDB_DEBUGGING_INFO 1
+
+/* Output #ident as a .ident.  */
+
+#undef TARGET_ASM_OUTPUT_IDENT
+#define TARGET_ASM_OUTPUT_IDENT default_asm_output_ident_directive
+
+/* In the past there was confusion as to what the argument to .align was
+   in GAS.  For the last several years the rule has been this: for a.out
+   file formats that argument is LOG, and for all other file formats the
+   argument is 1<<LOG.
+
+   However, GAS now has .p2align and .balign pseudo-ops so to remove any
+   doubt or guess work, and since this file is used for both a.out and other
+   file formats, we use one of them.  */
+
+#ifdef HAVE_GAS_BALIGN_AND_P2ALIGN 
+#undef ASM_OUTPUT_ALIGN
+#define ASM_OUTPUT_ALIGN(FILE,LOG) \
+  if ((LOG)!=0) fprintf ((FILE), "\t.balign %d\n", 1<<(LOG))
+#endif
+
+/* A C statement to output to the stdio stream FILE an assembler
+   command to advance the location counter to a multiple of 1<<LOG
+   bytes if it is within MAX_SKIP bytes.
+
+   This is used to align code labels according to Intel recommendations.  */
+
+#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
+#  define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP) \
+     if ((LOG) != 0) {\
+       if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG)); \
+       else fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP)); \
+     }
+#endif
+
+/* A C statement or statements which output an assembler instruction
+   opcode to the stdio stream STREAM.  The macro-operand PTR is a
+   variable of type `char *' which points to the opcode name in its
+   "internal" form--the form that is written in the machine description.
+
+   GAS version 1.38.1 doesn't understand the `repz' opcode mnemonic.
+   So use `repe' instead.  */
+
+#undef ASM_OUTPUT_OPCODE
+#define ASM_OUTPUT_OPCODE(STREAM, PTR)	\
+{									\
+  if ((PTR)[0] == 'r'							\
+      && (PTR)[1] == 'e'						\
+      && (PTR)[2] == 'p')						\
+    {									\
+      if ((PTR)[3] == 'z')						\
+	{								\
+	  fputs ("repe", (STREAM));					\
+	  (PTR) += 4;							\
+	}								\
+      else if ((PTR)[3] == 'n' && (PTR)[4] == 'z')			\
+	{								\
+	  fputs ("repne", (STREAM));					\
+	  (PTR) += 5;							\
+	}								\
+    }									\
+  else									\
+    ASM_OUTPUT_AVX_PREFIX ((STREAM), (PTR));				\
+}
+
+/* Define macro used to output shift-double opcodes when the shift
+   count is in %cl.  Some assemblers require %cl as an argument;
+   some don't.
+
+   GAS requires the %cl argument, so override i386/unix.h.  */
+
+#undef SHIFT_DOUBLE_OMITS_COUNT
+#define SHIFT_DOUBLE_OMITS_COUNT 0
+
+/* The comment-starter string as GAS expects it. */
+#undef ASM_COMMENT_START
+#define ASM_COMMENT_START "#"
+
+#define TARGET_ASM_FILE_START_FILE_DIRECTIVE true
diff --git a/gcc-4.9/gcc/config/i386/geode.md b/gcc-4.9/gcc/config/i386/geode.md
new file mode 100644
index 000000000..7ca39ae35
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/geode.md
@@ -0,0 +1,151 @@
+;; Geode Scheduling
+;; Copyright (C) 2006-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; The Geode architecture is one insn issue processor.
+;;
+;; This description is based on data from the following documents:
+;;
+;;    "AMD Geode GX Processor Data Book"
+;;    Advanced Micro Devices, Inc., Aug 2005.
+;;
+;;    "AMD Geode LX Processor Data Book"
+;;    Advanced Micro Devices, Inc., Jan 2006.
+;;
+;;
+;; CPU execution units of the Geode:
+;;
+;; issue	describes the issue pipeline.
+;; alu		describes the Integer unit
+;; fpu		describes the FP unit
+;;
+;; The fp unit is out of order execution unit with register renaming.
+;; There is also memory management unit and execution pipeline for
+;; load/store operations.  We ignore it and difference between insns
+;; using memory and registers.
+
+(define_automaton "geode")
+
+(define_cpu_unit "geode_issue,geode_alu,geode_fpu" "geode")
+
+(define_insn_reservation "alu" 1
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "alu,alu1,negnot,icmp,lea,test,imov,imovx,icmov,incdec,setcc"))
+			 "geode_issue,geode_alu")
+
+(define_insn_reservation "shift" 2
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "ishift,ishift1,rotate,rotate1"))
+			 "geode_issue,geode_alu*2")
+
+(define_insn_reservation "imul" 7
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "imul"))
+			 "geode_issue,geode_alu*7")
+
+(define_insn_reservation "idiv" 40
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "idiv"))
+			 "geode_issue,geode_alu*40")
+
+;; The branch unit.
+(define_insn_reservation "call" 2
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "call,callv"))
+			 "geode_issue,geode_alu*2")
+
+(define_insn_reservation "geode_branch" 1
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "ibr"))
+			 "geode_issue,geode_alu")
+
+(define_insn_reservation "geode_pop_push" 1
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "pop,push"))
+			 "geode_issue,geode_alu")
+
+(define_insn_reservation "geode_leave" 2
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "leave"))
+			 "geode_issue,geode_alu*2")
+
+(define_insn_reservation "geode_load_str" 4
+			 (and (eq_attr "cpu" "geode")
+			      (and (eq_attr "type" "str")
+				   (eq_attr "memory" "load,both")))
+			 "geode_issue,geode_alu*4")
+
+(define_insn_reservation "geode_store_str" 2
+			 (and (eq_attr "cpu" "geode")
+			      (and (eq_attr "type" "str")
+				   (eq_attr "memory" "store")))
+			 "geode_issue,geode_alu*2")
+
+;; Be optimistic
+(define_insn_reservation "geode_unknown" 1
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "multi,other"))
+			 "geode_issue,geode_alu")
+
+;; FPU
+
+(define_insn_reservation "geode_fop" 6
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "fop,fcmp"))
+			 "geode_issue,geode_fpu*6")
+
+(define_insn_reservation "geode_fsimple" 1
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "fmov,fcmov,fsgn,fxch"))
+			 "geode_issue,geode_fpu")
+
+(define_insn_reservation "geode_fist" 4
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "fistp,fisttp"))
+			 "geode_issue,geode_fpu*4")
+
+(define_insn_reservation "geode_fmul" 10
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "fmul"))
+			 "geode_issue,geode_fpu*10")
+
+(define_insn_reservation "geode_fdiv" 47
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "fdiv"))
+			 "geode_issue,geode_fpu*47")
+
+;; We use minimal latency (fsin) here
+(define_insn_reservation "geode_fpspc" 54
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "fpspc"))
+			 "geode_issue,geode_fpu*54")
+
+(define_insn_reservation "geode_frndint" 12
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "frndint"))
+			 "geode_issue,geode_fpu*12")
+
+(define_insn_reservation "geode_mmxmov" 1
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "mmxmov"))
+			 "geode_issue,geode_fpu")
+
+(define_insn_reservation "geode_mmx" 2
+			 (and (eq_attr "cpu" "geode")
+			      (eq_attr "type" "mmx,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft"))
+			 "geode_issue,geode_fpu*2")
diff --git a/gcc-4.9/gcc/config/i386/gmm_malloc.h b/gcc-4.9/gcc/config/i386/gmm_malloc.h
new file mode 100644
index 000000000..516b13b9c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/gmm_malloc.h
@@ -0,0 +1,74 @@
+/* Copyright (C) 2004-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _MM_MALLOC_H_INCLUDED
+#define _MM_MALLOC_H_INCLUDED
+
+#include <stdlib.h>
+#include <errno.h>
+
+static __inline__ void* 
+_mm_malloc (size_t size, size_t align)
+{
+  void * malloc_ptr;
+  void * aligned_ptr;
+
+  /* Error if align is not a power of two.  */
+  if (align & (align - 1))
+    {
+      errno = EINVAL;
+      return ((void*) 0);
+    }
+
+  if (size == 0)
+    return ((void *) 0);
+
+ /* Assume malloc'd pointer is aligned at least to sizeof (void*).
+    If necessary, add another sizeof (void*) to store the value
+    returned by malloc. Effectively this enforces a minimum alignment
+    of sizeof double. */     
+    if (align < 2 * sizeof (void *))
+      align = 2 * sizeof (void *);
+
+  malloc_ptr = malloc (size + align);
+  if (!malloc_ptr)
+    return ((void *) 0);
+
+  /* Align  We have at least sizeof (void *) space below malloc'd ptr. */
+  aligned_ptr = (void *) (((size_t) malloc_ptr + align)
+			  & ~((size_t) (align) - 1));
+
+  /* Store the original pointer just before p.  */	
+  ((void **) aligned_ptr) [-1] = malloc_ptr;
+
+  return aligned_ptr;
+}
+
+static __inline__ void
+_mm_free (void * aligned_ptr)
+{
+  if (aligned_ptr)
+    free (((void **) aligned_ptr) [-1]);
+}
+
+#endif /* _MM_MALLOC_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/gnu-user-common.h b/gcc-4.9/gcc/config/i386/gnu-user-common.h
new file mode 100644
index 000000000..b34528217
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/gnu-user-common.h
@@ -0,0 +1,72 @@
+/* Common definitions for Intel 386 and AMD x86-64 systems using
+   GNU userspace.  Copyright (C) 2012-2014 Free Software Foundation, Inc.
+   Contributed by Ilya Enkovich.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* The svr4 ABI for the i386 says that records and unions are returned
+   in memory.  In the 64bit compilation we will turn this flag off in
+   ix86_option_override_internal, as we never do pcc_struct_return
+   scheme on this target.  */
+#undef DEFAULT_PCC_STRUCT_RETURN
+#define DEFAULT_PCC_STRUCT_RETURN 1
+
+/* We arrange for the whole %fs segment to map the tls area.  */
+#undef TARGET_TLS_DIRECT_SEG_REFS_DEFAULT
+#define TARGET_TLS_DIRECT_SEG_REFS_DEFAULT MASK_TLS_DIRECT_SEG_REFS
+
+#define TARGET_OS_CPP_BUILTINS()				\
+  do								\
+    {								\
+	GNU_USER_TARGET_OS_CPP_BUILTINS();			\
+    }								\
+  while (0)
+
+#undef CPP_SPEC
+#define CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{pthread:-D_REENTRANT}"
+
+#undef GNU_USER_TARGET_CC1_SPEC
+#define GNU_USER_TARGET_CC1_SPEC "%(cc1_cpu) %{profile:-p}"
+
+#undef CC1_SPEC
+#define CC1_SPEC GNU_USER_TARGET_CC1_SPEC
+
+/* Similar to standard GNU userspace, but adding -ffast-math support.  */
+#define GNU_USER_TARGET_MATHFILE_SPEC \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+   %{mpc32:crtprec32.o%s} \
+   %{mpc64:crtprec64.o%s} \
+   %{mpc80:crtprec80.o%s}"
+
+#undef  ENDFILE_SPEC
+#define ENDFILE_SPEC \
+  GNU_USER_TARGET_MATHFILE_SPEC " " \
+  GNU_USER_TARGET_ENDFILE_SPEC
+
+/* Put all *tf routines in libgcc.  */
+#undef LIBGCC2_HAS_TF_MODE
+#define LIBGCC2_HAS_TF_MODE 1
+#define LIBGCC2_TF_CEXT q
+#define TF_SIZE 113
+
+#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
+
+/* The stack pointer needs to be moved while checking the stack.  */
+#define STACK_CHECK_MOVING_SP 1
+
+/* Static stack checking is supported by means of probes.  */
+#define STACK_CHECK_STATIC_BUILTIN 1
diff --git a/gcc-4.9/gcc/config/i386/gnu-user.h b/gcc-4.9/gcc/config/i386/gnu-user.h
new file mode 100644
index 000000000..e1163c9da
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/gnu-user.h
@@ -0,0 +1,164 @@
+/* Definitions for Intel 386 systems using GNU userspace.
+   Copyright (C) 1994-2014 Free Software Foundation, Inc.
+   Contributed by Eric Youngdale.
+   Modified for stabs-in-ELF by H.J. Lu.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Output at beginning of assembler file.  */
+/* The .file command should always begin the output.  */
+#define TARGET_ASM_FILE_START_FILE_DIRECTIVE true
+
+#undef ASM_COMMENT_START
+#define ASM_COMMENT_START "#"
+
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n) \
+  (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n])
+
+/* Output assembler code to FILE to call the profiler.
+   To the best of my knowledge, no GNU userspace libc has required the label
+   argument to mcount.  */
+
+#define NO_PROFILE_COUNTERS	1
+
+#undef MCOUNT_NAME
+#define MCOUNT_NAME "mcount"
+
+/* The GLIBC version of mcount for the x86 assumes that there is a
+   frame, so we cannot allow profiling without a frame pointer.  */
+
+#undef SUBTARGET_FRAME_POINTER_REQUIRED
+#define SUBTARGET_FRAME_POINTER_REQUIRED crtl->profile
+
+#undef SIZE_TYPE
+#define SIZE_TYPE "unsigned int"
+ 
+#undef PTRDIFF_TYPE
+#define PTRDIFF_TYPE "int"
+  
+#undef WCHAR_TYPE
+#define WCHAR_TYPE "long int"
+   
+#undef WCHAR_TYPE_SIZE
+#define WCHAR_TYPE_SIZE BITS_PER_WORD
+    
+/* Provide a LINK_SPEC appropriate for GNU userspace.  Here we provide support
+   for the special GCC options -static and -shared, which allow us to
+   link things in one of these three modes by applying the appropriate
+   combinations of options at link-time.
+
+   When the -shared link option is used a final link is not being
+   done.  */
+
+#undef  ASM_SPEC
+#define ASM_SPEC \
+  "--32 %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}"
+
+#undef  SUBTARGET_EXTRA_SPECS
+#define SUBTARGET_EXTRA_SPECS \
+  { "link_emulation", GNU_USER_LINK_EMULATION },\
+  { "dynamic_linker", GNU_USER_DYNAMIC_LINKER }
+
+#define GNU_USER_TARGET_LINK_SPEC "-m %(link_emulation) %{shared:-shared} \
+  %{!shared: \
+    %{!static: \
+      %{rdynamic:-export-dynamic} \
+      -dynamic-linker %(dynamic_linker)} \
+      %{static:-static}}"
+
+#undef	LINK_SPEC
+#define LINK_SPEC GNU_USER_TARGET_LINK_SPEC
+
+/* A C statement (sans semicolon) to output to the stdio stream
+   FILE the assembler definition of uninitialized global DECL named
+   NAME whose size is SIZE bytes and alignment is ALIGN bytes.
+   Try to use asm_output_aligned_bss to implement this macro.  */
+
+#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \
+  asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN)
+
+/* A C statement to output to the stdio stream FILE an assembler
+   command to advance the location counter to a multiple of 1<<LOG
+   bytes if it is within MAX_SKIP bytes.
+
+   This is used to align code labels according to Intel recommendations.  */
+
+#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
+#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP)			\
+  do {									\
+    if ((LOG) != 0) {							\
+      if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG));	\
+      else {								\
+	fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP));	\
+	/* Make sure that we have at least 8 byte alignment if > 8 byte \
+	   alignment is preferred.  */					\
+	if ((LOG) > 3							\
+	    && (1 << (LOG)) > ((MAX_SKIP) + 1)				\
+	    && (MAX_SKIP) >= 7)						\
+	  fputs ("\t.p2align 3\n", (FILE));				\
+      }									\
+    }									\
+  } while (0)
+#endif
+
+/* Handle special EH pointer encodings.  Absolute, pc-relative, and
+   indirect are handled automatically.  */
+#define ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX(FILE, ENCODING, SIZE, ADDR, DONE) \
+  do {									\
+    if ((SIZE) == 4 && ((ENCODING) & 0x70) == DW_EH_PE_datarel)		\
+      {									\
+        fputs (ASM_LONG, FILE);			\
+        assemble_name (FILE, XSTR (ADDR, 0));				\
+	fputs (((ENCODING) & DW_EH_PE_indirect ? "@GOT" : "@GOTOFF"), FILE); \
+        goto DONE;							\
+      }									\
+  } while (0)
+
+/* Used by crtstuff.c to initialize the base of data-relative relocations.
+   These are GOT relative on x86, so return the pic register.  */
+#ifdef __PIC__
+#define CRT_GET_RFIB_DATA(BASE)			\
+  {						\
+    register void *ebx_ __asm__("ebx");		\
+    BASE = ebx_;				\
+  }
+#else
+#define CRT_GET_RFIB_DATA(BASE)						\
+  __asm__ ("call\t.LPR%=\n"						\
+	   ".LPR%=:\n\t"						\
+	   "pop{l}\t%0\n\t"						\
+	   /* Due to a GAS bug, this cannot use EAX.  That encodes	\
+	      smaller than the traditional EBX, which results in the	\
+	      offset being off by one.  */				\
+	   "add{l}\t{$_GLOBAL_OFFSET_TABLE_+[.-.LPR%=],%0"		\
+		   "|%0,_GLOBAL_OFFSET_TABLE_+(.-.LPR%=)}"		\
+	   : "=d"(BASE))
+#endif
+
+#ifdef TARGET_LIBC_PROVIDES_SSP
+/* i386 glibc provides __stack_chk_guard in %gs:0x14.  */
+#define TARGET_THREAD_SSP_OFFSET	0x14
+
+/* We only build the -fsplit-stack support in libgcc if the
+   assembler has full support for the CFI directives.  */
+#if HAVE_GAS_CFI_PERSONALITY_DIRECTIVE
+#define TARGET_CAN_SPLIT_STACK
+#endif
+/* We steal the last transactional memory word.  */
+#define TARGET_THREAD_SPLIT_STACK_OFFSET 0x30
+#endif
diff --git a/gcc-4.9/gcc/config/i386/gnu-user64.h b/gcc-4.9/gcc/config/i386/gnu-user64.h
new file mode 100644
index 000000000..1c72b41e4
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/gnu-user64.h
@@ -0,0 +1,99 @@
+/* Definitions for AMD x86-64 using GNU userspace.
+   Copyright (C) 2001-2014 Free Software Foundation, Inc.
+   Contributed by Jan Hubicka <jh@suse.cz>, based on linux.h.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Provide a LINK_SPEC.  Here we provide support for the special GCC
+   options -static and -shared, which allow us to link things in one
+   of these three modes by applying the appropriate combinations of
+   options at link-time.
+
+   When the -shared link option is used a final link is not being
+   done.  */
+
+#if TARGET_64BIT_DEFAULT
+#define SPEC_32 "m16|m32"
+#if TARGET_BI_ARCH == 2
+#define SPEC_64 "m64"
+#define SPEC_X32 "m16|m32|m64:;"
+#else
+#define SPEC_64 "m16|m32|mx32:;"
+#define SPEC_X32 "mx32"
+#endif
+#else
+#define SPEC_32 "m64|mx32:;"
+#define SPEC_64 "m64"
+#define SPEC_X32 "mx32"
+#endif
+
+#undef ASM_SPEC
+#define ASM_SPEC "%{" SPEC_32 ":--32} \
+ %{" SPEC_64 ":--64} \
+ %{" SPEC_X32 ":--x32} \
+ %{!mno-sse2avx:%{mavx:-msse2avx}} %{msse2avx:%{!mavx:-msse2avx}}"
+
+#define GNU_USER_TARGET_LINK_SPEC				   \
+                  "%{" SPEC_64 ":-m " GNU_USER_LINK_EMULATION64 "} \
+                   %{" SPEC_32 ":-m " GNU_USER_LINK_EMULATION32 "} \
+                   %{" SPEC_X32 ":-m " GNU_USER_LINK_EMULATIONX32 "} \
+  %{shared:-shared} \
+  %{!shared: \
+    %{!static: \
+      %{rdynamic:-export-dynamic} \
+      %{" SPEC_32 ":-dynamic-linker " GNU_USER_DYNAMIC_LINKER32 "} \
+      %{" SPEC_64 ":-dynamic-linker " GNU_USER_DYNAMIC_LINKER64 "} \
+      %{" SPEC_X32 ":-dynamic-linker " GNU_USER_DYNAMIC_LINKERX32 "}} \
+    %{static:-static}}"
+
+#undef	LINK_SPEC
+#define LINK_SPEC GNU_USER_TARGET_LINK_SPEC
+
+#if TARGET_64BIT_DEFAULT
+#if TARGET_BI_ARCH == 2
+#define MULTILIB_DEFAULTS { "mx32" }
+#else
+#define MULTILIB_DEFAULTS { "m64" }
+#endif
+#else
+#define MULTILIB_DEFAULTS { "m32" }
+#endif
+
+#ifdef TARGET_LIBC_PROVIDES_SSP
+/* i386 glibc provides __stack_chk_guard in %gs:0x14,
+   x32 glibc provides it in %fs:0x18.
+   x86_64 glibc provides it in %fs:0x28.  */
+#define TARGET_THREAD_SSP_OFFSET \
+  (TARGET_64BIT ? (TARGET_X32 ? 0x18 : 0x28) : 0x14)
+
+/* We only build the -fsplit-stack support in libgcc if the
+   assembler has full support for the CFI directives.  */
+#if HAVE_GAS_CFI_PERSONALITY_DIRECTIVE
+#define TARGET_CAN_SPLIT_STACK
+#endif
+/* We steal the last transactional memory word.  */
+#define TARGET_THREAD_SPLIT_STACK_OFFSET \
+  (TARGET_64BIT ? (TARGET_X32 ? 0x40 : 0x70) : 0x30)
+#endif
+
+#undef WCHAR_TYPE
+#define WCHAR_TYPE (TARGET_LP64 ? "int" : "long int")
diff --git a/gcc-4.9/gcc/config/i386/gnu.h b/gcc-4.9/gcc/config/i386/gnu.h
new file mode 100644
index 000000000..29896e9af
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/gnu.h
@@ -0,0 +1,47 @@
+/* Configuration for an i386 running GNU with ELF as the target machine.  */
+
+/*
+Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define GNU_USER_LINK_EMULATION "elf_i386"
+
+#undef GNU_USER_DYNAMIC_LINKER
+#define GNU_USER_DYNAMIC_LINKER "/lib/ld.so"
+
+#undef	STARTFILE_SPEC
+#if defined HAVE_LD_PIE
+#define STARTFILE_SPEC \
+  "%{!shared: %{pg|p|profile:gcrt0.o%s;pie:Scrt1.o%s;static:crt0.o%s;:crt1.o%s}} \
+   crti.o%s %{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}"
+#else
+#define STARTFILE_SPEC \
+  "%{!shared: %{pg|p|profile:gcrt0.o%s;static:crt0.o%s;:crt1.o%s}} \
+   crti.o%s %{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}"
+#endif
+
+#ifdef TARGET_LIBC_PROVIDES_SSP
+
+/* Not supported yet.  */
+# undef TARGET_THREAD_SSP_OFFSET
+
+/* Not supported yet.  */
+# undef TARGET_CAN_SPLIT_STACK
+# undef TARGET_THREAD_SPLIT_STACK_OFFSET
+
+#endif
diff --git a/gcc-4.9/gcc/config/i386/gstabs.h b/gcc-4.9/gcc/config/i386/gstabs.h
new file mode 100644
index 000000000..e9a621871
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/gstabs.h
@@ -0,0 +1,7 @@
+/* We do not want to output SDB debugging information.  */
+
+#undef SDB_DEBUGGING_INFO
+
+/* We want to output DBX debugging information.  */
+
+#define DBX_DEBUGGING_INFO 1
diff --git a/gcc-4.9/gcc/config/i386/host-cygwin.c b/gcc-4.9/gcc/config/i386/host-cygwin.c
new file mode 100644
index 000000000..0bc04bb5e
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/host-cygwin.c
@@ -0,0 +1,78 @@
+/* Cygwin host-specific hook definitions.
+ Copyright (C) 2004-2014 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3.  If not see
+ <http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "hosthooks.h"
+#include "hosthooks-def.h"
+#include "diagnostic.h"
+
+static void * cygwin_gt_pch_get_address (size_t, int fd);
+static size_t cygwin_gt_pch_alloc_granularity (void);
+
+#undef HOST_HOOKS_GT_PCH_GET_ADDRESS
+#define HOST_HOOKS_GT_PCH_GET_ADDRESS cygwin_gt_pch_get_address
+#undef HOST_HOOKS_GT_PCH_ALLOC_GRANULARITY
+#define HOST_HOOKS_GT_PCH_ALLOC_GRANULARITY cygwin_gt_pch_alloc_granularity
+
+/* Granularity for reserving address space.  */
+static const size_t va_granularity = 0x10000;
+
+/*  Return the alignment required for allocating virtual memory. */
+static size_t
+cygwin_gt_pch_alloc_granularity (void)
+{
+  return va_granularity;
+}
+
+/* Identify an address that's likely to be free in a subsequent invocation
+   of the compiler.  The area should be able to hold SIZE bytes.  FD is an
+   open file descriptor if the host would like to probe with mmap.  */
+static void *
+cygwin_gt_pch_get_address (size_t sz, int fd)
+{
+  void *base;
+  off_t p = lseek(fd, 0, SEEK_CUR);
+
+  if (p == (off_t) -1)
+    fatal_error ("can%'t get position in PCH file: %m");
+
+   /* Cygwin requires that the underlying file be at least
+      as large as the requested mapping.  */
+  if ((size_t) p < sz)
+  { 
+    if ( ftruncate (fd, sz) == -1 )
+      fatal_error ("can%'t extend PCH file: %m");
+  }
+
+  base = mmap (NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+
+  if (base == MAP_FAILED)
+    base = NULL;
+  else
+    munmap (base, sz);
+
+  if (lseek (fd, p, SEEK_SET) == (off_t) -1 )
+    fatal_error ("can%'t set position in PCH file: %m");
+
+  return base;
+}
+
+const struct host_hooks host_hooks = HOST_HOOKS_INITIALIZER;
diff --git a/gcc-4.9/gcc/config/i386/host-i386-darwin.c b/gcc-4.9/gcc/config/i386/host-i386-darwin.c
new file mode 100644
index 000000000..9325e8dd3
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/host-i386-darwin.c
@@ -0,0 +1,30 @@
+/* i386-darwin host-specific hook definitions.
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "hosthooks.h"
+#include "hosthooks-def.h"
+#include "config/host-darwin.h"
+
+/* Darwin doesn't do anything special for x86 hosts; this file exists just
+   to include config/host-darwin.h.  */
+
+const struct host_hooks host_hooks = HOST_HOOKS_INITIALIZER;
diff --git a/gcc-4.9/gcc/config/i386/host-mingw32.c b/gcc-4.9/gcc/config/i386/host-mingw32.c
new file mode 100644
index 000000000..fc01ceb24
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/host-mingw32.c
@@ -0,0 +1,198 @@
+/* mingw32 host-specific hook definitions.
+   Copyright (C) 2004-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "hosthooks.h"
+#include "hosthooks-def.h"
+#include "diagnostic.h"
+
+
+#define WIN32_LEAN_AND_MEAN  /* Not so important if we have windows.h.gch.  */
+#include <windows.h>
+#include <stdlib.h>
+
+static void * mingw32_gt_pch_get_address (size_t, int);
+static int mingw32_gt_pch_use_address (void *, size_t, int, size_t);
+static size_t mingw32_gt_pch_alloc_granularity (void);
+
+#undef HOST_HOOKS_GT_PCH_GET_ADDRESS
+#define HOST_HOOKS_GT_PCH_GET_ADDRESS mingw32_gt_pch_get_address
+#undef HOST_HOOKS_GT_PCH_USE_ADDRESS
+#define HOST_HOOKS_GT_PCH_USE_ADDRESS mingw32_gt_pch_use_address
+#undef HOST_HOOKS_GT_PCH_ALLOC_GRANULARITY
+#define HOST_HOOKS_GT_PCH_ALLOC_GRANULARITY mingw32_gt_pch_alloc_granularity
+
+static inline void w32_error(const char*, const char*, int, const char*);
+
+/* FIXME: Is this big enough?  */
+static const size_t pch_VA_max_size  = 128 * 1024 * 1024;
+
+/* Granularity for reserving address space.  */
+static size_t va_granularity = 0x10000;
+
+/* Print out the GetLastError() translation.  */ 
+static inline void
+w32_error (const char* function, const char* file, int line,
+	   const char* my_msg)
+{
+  LPSTR w32_msgbuf;
+  FormatMessageA (FORMAT_MESSAGE_ALLOCATE_BUFFER
+		  | FORMAT_MESSAGE_FROM_SYSTEM
+		  | FORMAT_MESSAGE_IGNORE_INSERTS
+		  | FORMAT_MESSAGE_MAX_WIDTH_MASK,
+    		  NULL, GetLastError(),
+		  MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+		  (LPSTR) &w32_msgbuf, 0, NULL);
+  fprintf(stderr, "internal error in %s, at %s:%d: %s: %s\n",
+	  function, trim_filename (file), line, my_msg, w32_msgbuf);
+  LocalFree ((HLOCAL)w32_msgbuf);
+}
+
+/* Granularity for reserving address space.  */
+static size_t
+mingw32_gt_pch_alloc_granularity (void)
+{
+  SYSTEM_INFO si;
+
+  GetSystemInfo (&si);
+  va_granularity = (size_t) si.dwAllocationGranularity;
+
+  return va_granularity;
+}
+
+/* Identify an address that's likely to be free in a subsequent invocation
+   of the compiler.  The area should be able to hold SIZE bytes.  FD is an
+   open file descriptor if the host would like to probe with mmap.  */
+
+static void *
+mingw32_gt_pch_get_address (size_t size, int fd  ATTRIBUTE_UNUSED)
+{
+  void* res;
+  size = (size + va_granularity - 1) & ~(va_granularity - 1);
+  if (size > pch_VA_max_size)
+    return NULL;
+
+  /* FIXME: We let system determine base by setting first arg to NULL.
+     Allocating at top of available address space avoids unnecessary
+     fragmentation of "ordinary" (malloc's)  address space but may not
+     be safe  with delayed load of system dll's. Preferred addresses
+     for NT system dlls is in 0x70000000 to 0x78000000 range.
+     If we allocate at bottom we need to reserve the address as early
+     as possible and at the same point in each invocation. */
+ 
+  res = VirtualAlloc (NULL, pch_VA_max_size,
+		      MEM_RESERVE | MEM_TOP_DOWN,
+		      PAGE_NOACCESS);
+  if (!res)
+    w32_error (__FUNCTION__, __FILE__, __LINE__, "VirtualAlloc");
+  else
+    /* We do not need the address space for now, so free it.  */
+    VirtualFree (res, 0, MEM_RELEASE);
+
+  return res; 
+}
+
+/* ADDR is an address returned by gt_pch_get_address.  Attempt to allocate
+   SIZE bytes at the same address and load it with the data from FD at 
+   OFFSET.  Return -1 if we couldn't allocate memory at ADDR, return 0
+   if the memory is allocated but the data not loaded, return 1 if done.  */
+
+static int
+mingw32_gt_pch_use_address (void *addr, size_t size, int fd,
+			    size_t offset)
+{
+  void * mmap_addr;
+  HANDLE mmap_handle;
+ 
+  /* Apparently, MS Vista puts unnamed file mapping objects into Global
+     namespace when running an application in a Terminal Server
+     session.  This causes failure since, by default, applications 
+     don't get SeCreateGlobalPrivilege. We don't need global
+     memory sharing so explicitly put object into Local namespace.
+
+     If multiple concurrent GCC processes are using PCH functionality,
+     MapViewOfFileEx returns "Access Denied" error.  So we ensure the
+     session-wide mapping name is unique by appending process ID.  */
+
+#define OBJECT_NAME_FMT "Local\\MinGWGCCPCH-"
+
+  char* object_name = NULL;
+  /* However, the documentation for CreateFileMapping says that on NT4
+     and earlier, backslashes are invalid in object name.  So, we need
+     to check if we are on Windows2000 or higher.  */
+  OSVERSIONINFO version_info;
+  int r;
+
+  version_info.dwOSVersionInfoSize = sizeof (version_info);
+
+  if (size == 0)
+    return 0; 
+
+  /* Offset must be also be a multiple of allocation granularity for
+     this to work.  We can't change the offset. */ 
+  if ((offset & (va_granularity - 1)) != 0 || size > pch_VA_max_size)
+    return -1;
+
+
+  /* Determine the version of Windows we are running on and use a
+     uniquely-named local object if running > 4.  */
+  GetVersionEx (&version_info);
+  if (version_info.dwMajorVersion > 4)
+    {
+      char local_object_name [sizeof (OBJECT_NAME_FMT)
+			      + sizeof (DWORD) * 2];
+      snprintf (local_object_name, sizeof (local_object_name),
+		OBJECT_NAME_FMT "%lx", GetCurrentProcessId());
+      object_name = local_object_name;
+    }
+  mmap_handle = CreateFileMappingA ((HANDLE) _get_osfhandle (fd), NULL,
+				    PAGE_WRITECOPY | SEC_COMMIT, 0, 0,
+				    object_name);
+
+  if (mmap_handle == NULL)
+    {
+      w32_error (__FUNCTION__,  __FILE__, __LINE__, "CreateFileMapping");
+      return -1; 
+    }
+
+  /* Retry five times, as here might occure a race with multiple gcc's
+     instances at same time.  */
+  for (r = 0; r < 5; r++)
+   {
+      mmap_addr = MapViewOfFileEx (mmap_handle, FILE_MAP_COPY, 0, offset,
+				   size, addr);
+      if (mmap_addr == addr)
+	break;
+      if (r != 4)
+        Sleep (500);
+   }
+      
+  if (mmap_addr != addr)
+    {
+      w32_error (__FUNCTION__, __FILE__, __LINE__, "MapViewOfFileEx");
+      CloseHandle(mmap_handle);
+      return  -1;
+    }
+
+  return 1;
+}
+
+const struct host_hooks host_hooks = HOST_HOOKS_INITIALIZER;
diff --git a/gcc-4.9/gcc/config/i386/i386-builtin-types.awk b/gcc-4.9/gcc/config/i386/i386-builtin-types.awk
new file mode 100644
index 000000000..3fc1455ec
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386-builtin-types.awk
@@ -0,0 +1,280 @@
+#  Copyright (C) 2009-2014 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3, or (at your option) any
+# later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+# Generates compressed tables for types for i386 builtin functions.
+
+function do_error(string) {
+    print FILENAME ":" FNR ": " string > "/dev/stderr"
+    errors = 1
+}
+
+function check_type(string) {
+    if (!(string in type_hash))
+	do_error("undefined type code " string)
+}
+
+# We can significantly reduce the size of the read-only tables
+# by forcing the compiler to use a smaller implementation type
+# for the enumerations.
+function attribute_mode(count) {
+    # ??? Except that we get strange "comparison always false" warnings
+    # for comparisons between different elements of the enumeration.
+    #    print "#ifdef __GNUC__"
+    #    if (count < 256)
+    #	print "  __attribute__((__mode__(__QI__)))"
+    #    else
+    #	print "  __attribute__((__mode__(__HI__)))"
+    #    print "#endif"
+}
+
+BEGIN {
+    FS = "[() \t,]+"
+   
+    prim_defs = 0
+    vect_defs = 0
+    ptr_defs = 0
+    cptr_defs = 0
+    func_defs = 0
+    func_args = 0
+    alias_defs = 0
+}
+
+# Skip blank lines or comments.
+/^[ \t]*(#|$)/ {
+    next
+}
+
+$1 == "DEF_PRIMITIVE_TYPE" {
+    if (NF == 4) {
+	type_hash[$2] = 1
+	prim_name[prim_defs] = $2
+	prim_base[prim_defs] = $3
+	prim_defs++
+    } else
+	do_error("DEF_PRIMITIVE_TYPE expected 2 arguments")
+    next
+}
+
+$1 == "DEF_VECTOR_TYPE" {
+    if (NF == 4 || NF == 5) {
+	check_type($3)
+	type_hash[$2] = 1
+	vect_name[vect_defs] = $2
+	vect_base[vect_defs] = $3
+	vect_mode[vect_defs] = (NF == 5 ? $4 : $2)
+	vect_defs++
+    } else
+	do_error("DEF_VECTOR_TYPE expected 2 arguments")
+    next
+}
+
+$1 == "DEF_POINTER_TYPE" {
+    if (NF == 4) {
+	check_type($3)
+	type_hash[$2] = 1
+	ptr_name[ptr_defs] = $2
+	ptr_base[ptr_defs] = $3
+	ptr_defs++
+    } else if (NF == 5) {
+	check_type($3)
+	if ($4 == "CONST") {
+	    type_hash[$2] = 1
+	    cptr_name[cptr_defs] = $2
+	    cptr_base[cptr_defs] = $3
+	    cptr_defs++
+	} else
+	    do_error("invalid qualifier \"" $4 "\"")
+    }
+    else
+	do_error("DEF_POINTER_TYPE expected 2 or 3 arguments")
+    next
+}
+
+$1 == "DEF_FUNCTION_TYPE" {
+    func_start[func_defs] = func_args
+    for (i = 2; i < NF; ++i) {
+	check_type($i)
+	func_types[func_args++] = $i
+    }
+
+    if (NF < 3)
+	do_error("DEF_FUNCTION_TYPE expected at least 1 argument")
+    else if (NF == 3)
+	name = $2 "_FTYPE_VOID"
+    else {
+	name = $2 "_FTYPE"
+	for (i = 3; i < NF; ++i)
+	    name = name "_" $i
+    }
+    func_hash[name] = 1
+    func_name[func_defs++] = name
+    next
+}
+
+$1 == "DEF_FUNCTION_TYPE_ALIAS" {
+    if (NF == 4) {
+	if ($2 in func_hash) {
+	    alias_base[alias_defs] = $2
+	    alias_name[alias_defs] = $2 "_" $3
+	    alias_defs++
+	} else
+	    do_error("undefined function code " $2)
+    } else
+	do_error("DEF_FUNCTION_TYPE_ALIAS expected 2 arguments")
+    next
+}
+
+{
+    do_error("unknown directive \"" $1 "\"");
+}
+
+END {
+    if (errors)
+	exit 1
+
+    print "/* This file is auto-generated by i386-builtin-types.awk.  */\n"
+
+    # This first enumeration contains all of the non-function types.
+    print "enum ix86_builtin_type {"
+    for (i = 0; i < prim_defs; ++i)
+	print "  IX86_BT_" prim_name[i] ","
+    print "  IX86_BT_LAST_PRIM = IX86_BT_" prim_name[i-1] ","
+    for (i = 0; i < vect_defs; ++i)
+	print "  IX86_BT_" vect_name[i] ","
+    print "  IX86_BT_LAST_VECT = IX86_BT_" vect_name[i-1] ","
+    for (i = 0; i < ptr_defs; ++i)
+	print "  IX86_BT_" ptr_name[i] ","
+    print "  IX86_BT_LAST_PTR = IX86_BT_" ptr_name[i-1] ","
+    for (i = 0; i < cptr_defs; ++i)
+	print "  IX86_BT_" cptr_name[i] ","
+    print "  IX86_BT_LAST_CPTR = IX86_BT_" cptr_name[i-1] "\n}"
+    attribute_mode(prim_defs + vect_defs + ptr_defs + cptr_defs)
+    print ";\n\n"
+
+    # We can't tabularize the initialization of the primitives, since
+    # at least one of them is created via a local variable.  That's ok,
+    # just create a nice big macro to do all the work.
+    print "#define DEFINE_BUILTIN_PRIMITIVE_TYPES \\"
+    for (i = 0; i < prim_defs; ++i) {
+	printf "  ix86_builtin_type_tab[(int)IX86_BT_" prim_name[i] \
+	    "] = " prim_base[i]
+	if (i < prim_defs - 1)
+	    print ", \\"
+    }
+    print "\n\n"
+
+    # The vector types are defined via two tables defining the real
+    # machine mode and the builtin primitive type.  We use two tables
+    # rather than a structure to avoid structure padding and save space.
+    print "static const enum machine_mode ix86_builtin_type_vect_mode[] = {"
+    for (i = 0; i < vect_defs; ++i) {
+	if (i == 0)
+	    printf "  "
+	else if (i % 6 == 0)
+	    printf ",\n  "
+	else
+	    printf ", "
+	printf vect_mode[i] "mode"
+    }
+    print "\n};\n\n"
+
+    print "static const enum ix86_builtin_type " \
+	"ix86_builtin_type_vect_base[] = {"
+    for (i = 0; i < vect_defs; ++i) {
+	if (i == 0)
+	    printf "  "
+	else if (i % 4 == 0)
+	    printf ",\n  "
+	else
+	    printf ", "
+	printf "IX86_BT_" vect_base[i]
+    }
+    print "\n};\n\n"
+
+    # The pointer types are defined via a single table defining the
+    # builtin primitive type.  The const-ness of the pointer is taken
+    # from the enumeration value > IX86_BT_LAST_PTR.
+    print "static const enum ix86_builtin_type " \
+	"ix86_builtin_type_ptr_base[] = {"
+    for (i = 0; i < ptr_defs; ++i) {
+	if (i == 0)
+	    printf " "
+	else if (i % 4 == 0)
+	    printf "\n "
+	printf " IX86_BT_" ptr_base[i] ","
+    }
+    print "\n  /* pointer-to-constant defs start here */"
+    for (i = 0; i < cptr_defs; ++i) {
+	if (i == 0)
+	    printf "  "
+	else if (i % 4 == 0)
+	    printf ",\n  "
+	else
+	    printf ", "
+	printf "IX86_BT_" cptr_base[i]
+    }
+    print "\n};\n\n"
+
+    # This second enumeration contains all of the function types.
+    print "enum ix86_builtin_func_type {"
+    for (i = 0; i < func_defs; ++i)
+	print "  " func_name[i] ","
+    print "  IX86_BT_LAST_FUNC = " func_name[i-1] ","
+    for (i = 0; i < alias_defs; ++i)
+	print "  " alias_name[i] ","
+    print "  IX86_BT_LAST_ALIAS = " alias_name[i-1] "\n}"
+    attribute_mode(func_defs + alias_defs)
+    print ";\n\n"
+
+    # The function types are defined via two tables.  The first contains
+    # ranges consiting of the function's return type, followed by all of
+    # the function argument types.  The ranges for all of the builtin
+    # functions are smooshed together in the same array.  The second array
+    # contains, for each builtin, the index of the function's return type
+    # within the first array.
+    print "static const enum ix86_builtin_type ix86_builtin_func_args[] = {"
+    for (i = 0; i < func_args; ++i) {
+	if (i == 0)
+	    printf "  "
+	else if (i % 4 == 0)
+	    printf ",\n  "
+	else
+	    printf ", "
+	printf "IX86_BT_" func_types[i]
+    }
+    print "\n};\n\n"
+
+    print "static const unsigned short ix86_builtin_func_start[] = {"
+    for (i = 0; i < func_defs; ++i) {
+	if (i == 0)
+	    printf " "
+	else if (i % 10 == 0)
+	    printf "\n "
+	printf " " func_start[i] ","
+    }
+    print " " func_args "\n};\n\n"
+
+    print "static const enum ix86_builtin_func_type " \
+	"ix86_builtin_func_alias_base[] = {"
+    for (i = 0; i < alias_defs; ++i) {
+	if (i == 0)
+	    printf "  "
+	else
+	    printf ",\n  "
+	printf alias_base[i]
+    }
+    print "\n};"
+}
diff --git a/gcc-4.9/gcc/config/i386/i386-builtin-types.def b/gcc-4.9/gcc/config/i386/i386-builtin-types.def
new file mode 100644
index 000000000..822c5e504
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386-builtin-types.def
@@ -0,0 +1,808 @@
+# This file provides a declarative way of describing the types that
+# are used when declaring ix86 builtin functions.  It is processed
+# with i386-builtin-type.awk to produce C code.
+#
+# DEF_PRIMITIVE_TYPE (ENUM, TYPE)
+#
+#   The ENUM is an identifier indicating which type is being defined.
+#   TYPE is a variable that represents the type.
+#   ??? Note that the awk program expects a single token for TYPE.
+#   At present, that's all that's required; revisit if it turns out
+#   that we need more than that.
+#
+# DEF_VECTOR_TYPE (ENUM, TYPE [, MODE])
+#
+#  This describes a vector type.  ENUM is an identifier as above.
+#  TYPE is the enumeral for the inner type which should of course
+#  name a type of the proper inner mode.  If present, MODE is the
+#  machine mode, else the machine mode should be the same as ENUM.
+#
+# DEF_POINTER_TYPE (ENUM, TYPE [, CONST])
+#
+#  This describes a pointer type.  ENUM is an identifier as above;
+#  TYPE is the enumeral for the type pointed to.  An optional third 
+#  argument is the keyword CONST, which defines this to be a pointer to
+#  a constant type.
+#
+# DEF_FUNCTION_TYPE (RETURN, ARGN*)
+#
+#   This describes a function type.  The return type and the arguments
+#   are the enumerals defined above.  The enumeration name for the 
+#   function is formed by RETURN ## _FTYPE_ ## ARG1 ## _ ## ARG2 ...
+#
+# DEF_FUNCTION_TYPE_ALIAS (ENUM, SUFFIX)
+#
+#   This defines an enumeration ENUM ## _ ## SUFFIX and arranges for
+#   the function type to be copied from ENUM.  This is used to control
+#   how the expanders treat the function.
+#
+
+DEF_PRIMITIVE_TYPE (VOID, void_type_node)
+DEF_PRIMITIVE_TYPE (CHAR, char_type_node)
+DEF_PRIMITIVE_TYPE (UCHAR, unsigned_char_type_node)
+# ??? Logically this should be intQI_type_node, but that maps to "signed char"
+# which is a different type than "char" even if "char" is signed.  This must
+# match the usage in emmintrin.h and changing this would change name mangling
+# and so is not advisable.
+DEF_PRIMITIVE_TYPE (QI, char_type_node)
+DEF_PRIMITIVE_TYPE (HI, intHI_type_node)
+DEF_PRIMITIVE_TYPE (SI, intSI_type_node)
+# ??? Logically this should be intDI_type_node, but that maps to "long"
+# with 64-bit, and that's not how the emmintrin.h is written.  Again, 
+# changing this would change name mangling.
+DEF_PRIMITIVE_TYPE (DI, long_long_integer_type_node)
+DEF_PRIMITIVE_TYPE (UQI, unsigned_intQI_type_node)
+DEF_PRIMITIVE_TYPE (UHI, unsigned_intHI_type_node)
+DEF_PRIMITIVE_TYPE (USI, unsigned_intSI_type_node)
+DEF_PRIMITIVE_TYPE (UDI, long_long_unsigned_type_node)
+# ??? Some of the types below should use the mode types above.
+DEF_PRIMITIVE_TYPE (USHORT, short_unsigned_type_node)
+DEF_PRIMITIVE_TYPE (INT, integer_type_node)
+DEF_PRIMITIVE_TYPE (UINT, unsigned_type_node)
+DEF_PRIMITIVE_TYPE (UNSIGNED, unsigned_type_node)
+DEF_PRIMITIVE_TYPE (LONGLONG, long_long_integer_type_node)
+DEF_PRIMITIVE_TYPE (ULONGLONG, long_long_unsigned_type_node)
+DEF_PRIMITIVE_TYPE (UINT8, unsigned_char_type_node)
+DEF_PRIMITIVE_TYPE (UINT16, short_unsigned_type_node)
+DEF_PRIMITIVE_TYPE (INT64, long_long_integer_type_node)
+DEF_PRIMITIVE_TYPE (UINT64, long_long_unsigned_type_node)
+DEF_PRIMITIVE_TYPE (FLOAT, float_type_node)
+DEF_PRIMITIVE_TYPE (DOUBLE, double_type_node)
+DEF_PRIMITIVE_TYPE (FLOAT80, float80_type_node)
+DEF_PRIMITIVE_TYPE (FLOAT128, float128_type_node)
+
+# MMX vectors
+DEF_VECTOR_TYPE (V2SF, FLOAT)
+DEF_VECTOR_TYPE (V1DI, DI)
+DEF_VECTOR_TYPE (V2SI, SI)
+DEF_VECTOR_TYPE (V4HI, HI)
+DEF_VECTOR_TYPE (V8QI, QI)
+
+# SSE vectors
+DEF_VECTOR_TYPE (V2DF, DOUBLE)
+DEF_VECTOR_TYPE (V4SF, FLOAT)
+DEF_VECTOR_TYPE (V2DI, DI)
+DEF_VECTOR_TYPE (V4SI, SI)
+DEF_VECTOR_TYPE (V8HI, HI)
+DEF_VECTOR_TYPE (V16QI, QI)
+DEF_VECTOR_TYPE (V2UDI, UDI, V2DI)
+DEF_VECTOR_TYPE (V4USI, USI, V4SI)
+DEF_VECTOR_TYPE (V8UHI, UHI, V8HI)
+DEF_VECTOR_TYPE (V16UQI, UQI, V16QI)
+
+# AVX vectors
+DEF_VECTOR_TYPE (V4DF, DOUBLE)
+DEF_VECTOR_TYPE (V8SF, FLOAT)
+DEF_VECTOR_TYPE (V4DI, DI)
+DEF_VECTOR_TYPE (V8SI, SI)
+DEF_VECTOR_TYPE (V16HI, HI)
+DEF_VECTOR_TYPE (V32QI, QI)
+DEF_VECTOR_TYPE (V4UDI, UDI, V4DI)
+DEF_VECTOR_TYPE (V8USI, USI, V8SI)
+DEF_VECTOR_TYPE (V16UHI, UHI, V16HI)
+
+# AVX512F vectors
+DEF_VECTOR_TYPE (V32SF, FLOAT)
+DEF_VECTOR_TYPE (V16SF, FLOAT)
+DEF_VECTOR_TYPE (V8DF, DOUBLE)
+DEF_VECTOR_TYPE (V8DI, DI)
+DEF_VECTOR_TYPE (V16SI, SI)
+DEF_VECTOR_TYPE (V64QI, QI)
+
+DEF_POINTER_TYPE (PCCHAR, CHAR, CONST)
+DEF_POINTER_TYPE (PCDOUBLE, DOUBLE, CONST)
+DEF_POINTER_TYPE (PCFLOAT, FLOAT, CONST)
+DEF_POINTER_TYPE (PCINT, INT, CONST)
+DEF_POINTER_TYPE (PCINT64, INT64, CONST)
+DEF_POINTER_TYPE (PCHAR, CHAR)
+DEF_POINTER_TYPE (PCVOID, VOID, CONST)
+DEF_POINTER_TYPE (PVOID, VOID)
+DEF_POINTER_TYPE (PDOUBLE, DOUBLE)
+DEF_POINTER_TYPE (PFLOAT, FLOAT)
+DEF_POINTER_TYPE (PUSHORT, USHORT)
+DEF_POINTER_TYPE (PINT, INT)
+DEF_POINTER_TYPE (PLONGLONG, LONGLONG)
+DEF_POINTER_TYPE (PULONGLONG, ULONGLONG)
+DEF_POINTER_TYPE (PUNSIGNED, UNSIGNED)
+
+DEF_POINTER_TYPE (PV2SI, V2SI)
+DEF_POINTER_TYPE (PV2DF, V2DF)
+DEF_POINTER_TYPE (PV2DI, V2DI)
+DEF_POINTER_TYPE (PV2SF, V2SF)
+DEF_POINTER_TYPE (PV4DF, V4DF)
+DEF_POINTER_TYPE (PV4DI, V4DI)
+DEF_POINTER_TYPE (PV4SF, V4SF)
+DEF_POINTER_TYPE (PV8DF, V8DF)
+DEF_POINTER_TYPE (PV8SF, V8SF)
+DEF_POINTER_TYPE (PV4SI, V4SI)
+DEF_POINTER_TYPE (PV8HI, V8HI)
+DEF_POINTER_TYPE (PV8SI, V8SI)
+DEF_POINTER_TYPE (PV8DI, V8DI)
+DEF_POINTER_TYPE (PV16QI, V16QI)
+DEF_POINTER_TYPE (PV16HI, V16HI)
+DEF_POINTER_TYPE (PV16SI, V16SI)
+DEF_POINTER_TYPE (PV16SF, V16SF)
+
+DEF_POINTER_TYPE (PCV2SI, V2SI, CONST)
+DEF_POINTER_TYPE (PCV2DF, V2DF, CONST)
+DEF_POINTER_TYPE (PCV2SF, V2SF, CONST)
+DEF_POINTER_TYPE (PCV4DF, V4DF, CONST)
+DEF_POINTER_TYPE (PCV4SF, V4SF, CONST)
+DEF_POINTER_TYPE (PCV8DF, V8DF, CONST)
+DEF_POINTER_TYPE (PCV8SF, V8SF, CONST)
+DEF_POINTER_TYPE (PCV16SF, V16SF, CONST)
+
+DEF_POINTER_TYPE (PCV2DI, V2DI, CONST)
+DEF_POINTER_TYPE (PCV4SI, V4SI, CONST)
+DEF_POINTER_TYPE (PCV4DI, V4DI, CONST)
+DEF_POINTER_TYPE (PCV8SI, V8SI, CONST)
+DEF_POINTER_TYPE (PCV8DI, V8DI, CONST)
+DEF_POINTER_TYPE (PCV16SI, V16SI, CONST)
+
+DEF_FUNCTION_TYPE (FLOAT128)
+DEF_FUNCTION_TYPE (UINT64)
+DEF_FUNCTION_TYPE (UNSIGNED)
+DEF_FUNCTION_TYPE (INT)
+DEF_FUNCTION_TYPE (VOID)
+DEF_FUNCTION_TYPE (PVOID)
+
+DEF_FUNCTION_TYPE (FLOAT, FLOAT)
+DEF_FUNCTION_TYPE (FLOAT128, FLOAT128)
+DEF_FUNCTION_TYPE (INT, INT)
+DEF_FUNCTION_TYPE (INT, V16QI)
+DEF_FUNCTION_TYPE (INT, V2DF)
+DEF_FUNCTION_TYPE (INT, V4DF)
+DEF_FUNCTION_TYPE (INT, V4SF)
+DEF_FUNCTION_TYPE (INT, V8QI)
+DEF_FUNCTION_TYPE (INT, V8SF)
+DEF_FUNCTION_TYPE (INT, V32QI)
+DEF_FUNCTION_TYPE (INT, PCCHAR)
+DEF_FUNCTION_TYPE (INT64, INT64)
+DEF_FUNCTION_TYPE (INT64, V2DF)
+DEF_FUNCTION_TYPE (INT64, V4SF)
+DEF_FUNCTION_TYPE (UINT64, INT)
+DEF_FUNCTION_TYPE (UINT16, UINT16)
+DEF_FUNCTION_TYPE (UINT64, PUNSIGNED)
+DEF_FUNCTION_TYPE (V16QI, PCCHAR)
+DEF_FUNCTION_TYPE (V16QI, V16QI)
+DEF_FUNCTION_TYPE (V16QI, V16SI)
+DEF_FUNCTION_TYPE (V2DF, PCDOUBLE)
+DEF_FUNCTION_TYPE (V2DF, V2DF)
+DEF_FUNCTION_TYPE (V2DF, V2SI)
+DEF_FUNCTION_TYPE (V2DF, V4DF)
+DEF_FUNCTION_TYPE (V2DF, V4SF)
+DEF_FUNCTION_TYPE (V2DF, V4SI)
+DEF_FUNCTION_TYPE (V2DI, PV2DI)
+DEF_FUNCTION_TYPE (V2DI, V16QI)
+DEF_FUNCTION_TYPE (V2DI, V2DI)
+DEF_FUNCTION_TYPE (V2DI, V4SI)
+DEF_FUNCTION_TYPE (V2DI, V8HI)
+DEF_FUNCTION_TYPE (V2SF, V2SF)
+DEF_FUNCTION_TYPE (V2SF, V2SI)
+DEF_FUNCTION_TYPE (V2SI, V2DF)
+DEF_FUNCTION_TYPE (V2SI, V2SF)
+DEF_FUNCTION_TYPE (V2SI, V2SI)
+DEF_FUNCTION_TYPE (V2SI, PCV2SI)
+DEF_FUNCTION_TYPE (V2SI, V4SF)
+DEF_FUNCTION_TYPE (V32QI, PCCHAR)
+DEF_FUNCTION_TYPE (V4DF, PCDOUBLE)
+DEF_FUNCTION_TYPE (V4DF, PCV2DF)
+DEF_FUNCTION_TYPE (V4DF, V2DF)
+DEF_FUNCTION_TYPE (V4DF, V4DF)
+DEF_FUNCTION_TYPE (V4DF, V4SF)
+DEF_FUNCTION_TYPE (V4DF, V4SI)
+DEF_FUNCTION_TYPE (V8DF, V8SI)
+DEF_FUNCTION_TYPE (V8DF, V8DF)
+DEF_FUNCTION_TYPE (V4HI, V4HI)
+DEF_FUNCTION_TYPE (V4SF, PCFLOAT)
+DEF_FUNCTION_TYPE (V4SF, V2DF)
+DEF_FUNCTION_TYPE (V4SF, V4DF)
+DEF_FUNCTION_TYPE (V4SF, V4SF)
+DEF_FUNCTION_TYPE (V4SF, PCV4SF)
+DEF_FUNCTION_TYPE (V4SF, V4SI)
+DEF_FUNCTION_TYPE (V4SF, V8SF)
+DEF_FUNCTION_TYPE (V4SF, V8HI)
+DEF_FUNCTION_TYPE (V4SI, V16QI)
+DEF_FUNCTION_TYPE (V4SI, V2DF)
+DEF_FUNCTION_TYPE (V4SI, V4DF)
+DEF_FUNCTION_TYPE (V4SI, V4SF)
+DEF_FUNCTION_TYPE (V4SI, V4SI)
+DEF_FUNCTION_TYPE (V4SI, V8HI)
+DEF_FUNCTION_TYPE (V4SI, V8SI)
+DEF_FUNCTION_TYPE (V8HI, V16QI)
+DEF_FUNCTION_TYPE (V8HI, V8DI)
+DEF_FUNCTION_TYPE (V8HI, V8HI)
+DEF_FUNCTION_TYPE (V8QI, V8QI)
+DEF_FUNCTION_TYPE (V8SF, PCFLOAT)
+DEF_FUNCTION_TYPE (V8SF, PCV4SF)
+DEF_FUNCTION_TYPE (V8SF, PCV8SF)
+DEF_FUNCTION_TYPE (V8SF, V4SF)
+DEF_FUNCTION_TYPE (V8SF, V8SF)
+DEF_FUNCTION_TYPE (V8SF, V8SI)
+DEF_FUNCTION_TYPE (V8SF, V8HI)
+DEF_FUNCTION_TYPE (V16SF, V16SF)
+DEF_FUNCTION_TYPE (V8SI, V8DI)
+DEF_FUNCTION_TYPE (V8SI, V4SI)
+DEF_FUNCTION_TYPE (V8SF, V8DF)
+DEF_FUNCTION_TYPE (V8SF, V8DF, V8SF, QI)
+DEF_FUNCTION_TYPE (V8SI, V8SF)
+DEF_FUNCTION_TYPE (V32QI, V32QI)
+DEF_FUNCTION_TYPE (V32QI, V16QI)
+DEF_FUNCTION_TYPE (V16HI, V16SI)
+DEF_FUNCTION_TYPE (V16HI, V16HI)
+DEF_FUNCTION_TYPE (V16HI, V8HI)
+DEF_FUNCTION_TYPE (V8SI, V8SI)
+DEF_FUNCTION_TYPE (VOID, PCVOID)
+DEF_FUNCTION_TYPE (VOID, PVOID)
+DEF_FUNCTION_TYPE (VOID, UINT64)
+DEF_FUNCTION_TYPE (VOID, UNSIGNED)
+DEF_FUNCTION_TYPE (VOID, PUSHORT)
+DEF_FUNCTION_TYPE (INT, PUSHORT)
+DEF_FUNCTION_TYPE (INT, PUNSIGNED)
+DEF_FUNCTION_TYPE (INT, PULONGLONG)
+DEF_FUNCTION_TYPE (V16HI, V16QI)
+DEF_FUNCTION_TYPE (V8SI, V16QI)
+DEF_FUNCTION_TYPE (V4DI, V16QI)
+DEF_FUNCTION_TYPE (V8SI, V8HI)
+DEF_FUNCTION_TYPE (V4DI, V8HI)
+DEF_FUNCTION_TYPE (V4DI, V4SI)
+DEF_FUNCTION_TYPE (V4DI, PV4DI)
+DEF_FUNCTION_TYPE (V4DI, V2DI)
+DEF_FUNCTION_TYPE (V16SF, FLOAT)
+DEF_FUNCTION_TYPE (V16SI, INT)
+DEF_FUNCTION_TYPE (V8DF, DOUBLE)
+DEF_FUNCTION_TYPE (V8DI, INT64)
+DEF_FUNCTION_TYPE (V16SF, V4SF)
+DEF_FUNCTION_TYPE (V8DF, V4DF)
+DEF_FUNCTION_TYPE (V8DI, V4DI)
+DEF_FUNCTION_TYPE (V16QI, V8DI)
+DEF_FUNCTION_TYPE (UINT, V4SF)
+DEF_FUNCTION_TYPE (UINT64, V4SF)
+DEF_FUNCTION_TYPE (UINT, V2DF)
+DEF_FUNCTION_TYPE (UINT64, V2DF)
+DEF_FUNCTION_TYPE (V16SI, V16SI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, HI)
+DEF_FUNCTION_TYPE (V8DI, V8DI)
+DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, QI)
+DEF_FUNCTION_TYPE (V16SI, PV4SI)
+DEF_FUNCTION_TYPE (V16SF, PV4SF)
+DEF_FUNCTION_TYPE (V8DI, PV4DI)
+DEF_FUNCTION_TYPE (V8DF, PV4DF)
+DEF_FUNCTION_TYPE (V8UHI, V8UHI)
+DEF_FUNCTION_TYPE (V8USI, V8USI)
+DEF_FUNCTION_TYPE (V8DI, PV8DI)
+
+DEF_FUNCTION_TYPE (DI, V2DI, INT)
+DEF_FUNCTION_TYPE (DOUBLE, V2DF, INT)
+DEF_FUNCTION_TYPE (FLOAT, V4SF, INT)
+DEF_FUNCTION_TYPE (FLOAT128, FLOAT128, FLOAT128)
+DEF_FUNCTION_TYPE (HI, V4HI, INT)
+DEF_FUNCTION_TYPE (HI, V8HI, INT)
+DEF_FUNCTION_TYPE (INT, V2DF, V2DF)
+DEF_FUNCTION_TYPE (INT, V2DI, V2DI)
+DEF_FUNCTION_TYPE (INT, V4DF, V4DF)
+DEF_FUNCTION_TYPE (INT, V4DI, V4DI)
+DEF_FUNCTION_TYPE (INT, V4SF, V4SF)
+DEF_FUNCTION_TYPE (INT, V8SF, V8SF)
+DEF_FUNCTION_TYPE (QI, V16QI, INT)
+DEF_FUNCTION_TYPE (QI, V8QI, INT)
+DEF_FUNCTION_TYPE (SI, V2SI, INT)
+DEF_FUNCTION_TYPE (SI, V4SI, INT)
+DEF_FUNCTION_TYPE (UINT, UINT, UCHAR)
+DEF_FUNCTION_TYPE (UINT, UINT, UINT)
+DEF_FUNCTION_TYPE (UINT, UINT, USHORT)
+DEF_FUNCTION_TYPE (UINT16, UINT16, INT)
+DEF_FUNCTION_TYPE (UINT64, UINT64, UINT64)
+DEF_FUNCTION_TYPE (UINT8, UINT8, INT)
+DEF_FUNCTION_TYPE (V16QI, V16QI, SI)
+DEF_FUNCTION_TYPE (V16QI, V16QI, V16QI)
+DEF_FUNCTION_TYPE (V16QI, V8HI, V8HI)
+DEF_FUNCTION_TYPE (V1DI, V1DI, SI)
+DEF_FUNCTION_TYPE (V1DI, V1DI, V1DI)
+DEF_FUNCTION_TYPE (V1DI, V2SI, V2SI)
+DEF_FUNCTION_TYPE (V1DI, V8QI, V8QI)
+DEF_FUNCTION_TYPE (V2DF, PCV2DF, V2DI)
+DEF_FUNCTION_TYPE (V2DF, V2DF, UINT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, UINT64)
+DEF_FUNCTION_TYPE (V2DF, V2DF, DI)
+DEF_FUNCTION_TYPE (V2DF, V2DF, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE)
+DEF_FUNCTION_TYPE (V2DF, V2DF, SI)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DI)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF)
+DEF_FUNCTION_TYPE (V2DF, V4DF, INT)
+DEF_FUNCTION_TYPE (V2DI, V16QI, V16QI)
+DEF_FUNCTION_TYPE (V2DI, V2DF, V2DF)
+DEF_FUNCTION_TYPE (V2DI, V2DI, INT)
+DEF_FUNCTION_TYPE (V2DI, V2DI, SI)
+DEF_FUNCTION_TYPE (V2DI, V2DI, V16QI)
+DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI)
+DEF_FUNCTION_TYPE (V2DI, V4SI, V4SI)
+DEF_FUNCTION_TYPE (V2UDI, V4USI, V4USI)
+DEF_FUNCTION_TYPE (V2DI, PCV2DI, V2DI)
+DEF_FUNCTION_TYPE (V2SF, V2SF, V2SF)
+DEF_FUNCTION_TYPE (V2SI, INT, INT)
+DEF_FUNCTION_TYPE (V2SI, V2SF, V2SF)
+DEF_FUNCTION_TYPE (V2SI, V2SI, SI)
+DEF_FUNCTION_TYPE (V2SI, V2SI, V2SI)
+DEF_FUNCTION_TYPE (V2SI, V4HI, V4HI)
+DEF_FUNCTION_TYPE (V4DF, PCV4DF, V4DI)
+DEF_FUNCTION_TYPE (V4DF, V4DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V8DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V8DF, INT, V4DF, QI)
+DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF)
+DEF_FUNCTION_TYPE (V4DF, V4DF, V4DI)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DI)
+DEF_FUNCTION_TYPE (V4HI, V2SI, V2SI)
+DEF_FUNCTION_TYPE (V4HI, V4HI, INT)
+DEF_FUNCTION_TYPE (V4HI, V4HI, SI)
+DEF_FUNCTION_TYPE (V4HI, V4HI, V4HI)
+DEF_FUNCTION_TYPE (V4HI, V8QI, V8QI)
+DEF_FUNCTION_TYPE (V4SF, PCV4SF, V4SI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, UINT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, UINT64)
+DEF_FUNCTION_TYPE (V4SF, V4SF, DI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, INT)
+DEF_FUNCTION_TYPE (INT, V4SF, V4SF, INT, INT)
+DEF_FUNCTION_TYPE (INT, V2DF, V2DF, INT, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, PCV2SF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, SI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V2SI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SI)
+DEF_FUNCTION_TYPE (V4SF, V8SF, INT)
+DEF_FUNCTION_TYPE (V4SI, V2DF, V2DF)
+DEF_FUNCTION_TYPE (V4SI, V4SF, V4SF)
+DEF_FUNCTION_TYPE (V4SI, V4SI, INT)
+DEF_FUNCTION_TYPE (V4SI, V4SI, SI)
+DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI)
+DEF_FUNCTION_TYPE (V4SI, V8HI, V8HI)
+DEF_FUNCTION_TYPE (V4SI, V8SI, INT)
+DEF_FUNCTION_TYPE (V4SI, PCV4SI, V4SI)
+DEF_FUNCTION_TYPE (V8HI, V16QI, V16QI)
+DEF_FUNCTION_TYPE (V8HI, V4SI, V4SI)
+DEF_FUNCTION_TYPE (V8HI, V8HI, INT)
+DEF_FUNCTION_TYPE (V8HI, V8HI, SI)
+DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI)
+DEF_FUNCTION_TYPE (V8HI, V8SF, INT)
+DEF_FUNCTION_TYPE (V8HI, V4SF, INT)
+DEF_FUNCTION_TYPE (V8QI, V4HI, V4HI)
+DEF_FUNCTION_TYPE (V8QI, V8QI, V8QI)
+DEF_FUNCTION_TYPE (V8SF, PCV8SF, V8SI)
+DEF_FUNCTION_TYPE (V8SF, V8SF, INT)
+DEF_FUNCTION_TYPE (V16SF, V16SF, INT)
+DEF_FUNCTION_TYPE (V4SF, V16SF, INT)
+DEF_FUNCTION_TYPE (V4SF, V16SF, INT, V4SF, QI)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V8SI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SI)
+DEF_FUNCTION_TYPE (V32QI, V16HI, V16HI)
+DEF_FUNCTION_TYPE (V16HI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V4DF, INT, V8DF, QI)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, INT, V8DF, QI)
+DEF_FUNCTION_TYPE (V8DF, V8DF, INT, V8DF, QI)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, V8DI, INT, QI, INT)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, INT)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, INT, V16SF, HI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, INT, V16SF, HI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V4SI, INT, V16SI, HI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SI, INT)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SI, INT, HI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SI, INT, HI, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI, INT, QI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI, INT, QI, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT, QI)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT, QI, INT)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V4SF, INT)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V4SF, INT, V16SF, HI)
+DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI)
+DEF_FUNCTION_TYPE (V16HI, V32QI, V32QI)
+DEF_FUNCTION_TYPE (V16HI, V16HI, V8HI)
+DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI)
+DEF_FUNCTION_TYPE (V16HI, V16HI, INT)
+DEF_FUNCTION_TYPE (V16HI, V16SF, INT)
+DEF_FUNCTION_TYPE (V16HI, V16SF, INT, V16HI, HI)
+DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI, INT, V16HI, HI)
+DEF_FUNCTION_TYPE (V16HI, V16HI, SI)
+DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI, INT)
+DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI, INT)
+DEF_FUNCTION_TYPE (V8SI, V4DF, V4DF)
+DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V4SI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V4SI, INT)
+DEF_FUNCTION_TYPE (V4SI, V16SI, INT)
+DEF_FUNCTION_TYPE (V4SI, V16SI, INT, V4SI, QI)
+DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, INT, V16SI, HI)
+DEF_FUNCTION_TYPE (V8SI, V16HI, V16HI)
+DEF_FUNCTION_TYPE (V8SI, V8SI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, SI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, SI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, INT)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V4SI, V16SI, HI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, INT, V16SI, HI)
+DEF_FUNCTION_TYPE (V8SI, PCV8SI, V8SI)
+DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI)
+DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI)
+DEF_FUNCTION_TYPE (V16SI, V8DF, V8DF)
+DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, INT, V8DI, QI)
+DEF_FUNCTION_TYPE (V8DI, V8DI, V4DI, INT, V8DI, QI)
+DEF_FUNCTION_TYPE (V4DI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (V4UDI, V8USI, V8USI)
+DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI)
+DEF_FUNCTION_TYPE (V8DI, V8DI, V2DI)
+DEF_FUNCTION_TYPE (V4DI, PCV4DI, V4DI)
+DEF_FUNCTION_TYPE (V4DI, V8DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V8DI, INT, V4DI, QI)
+DEF_FUNCTION_TYPE (V8DI, V8DI, V2DI, V8DI, QI)
+DEF_FUNCTION_TYPE (V8DI, V8DI, INT, V8DI, QI)
+DEF_FUNCTION_TYPE (V4DI, V4DI, INT)
+DEF_FUNCTION_TYPE (V2DI, V4DI, INT)
+DEF_FUNCTION_TYPE (VOID, PVOID, INT64)
+DEF_FUNCTION_TYPE (VOID, PCHAR, V16QI)
+DEF_FUNCTION_TYPE (VOID, PCHAR, V32QI)
+DEF_FUNCTION_TYPE (VOID, PDOUBLE, V2DF)
+DEF_FUNCTION_TYPE (VOID, PDOUBLE, V4DF)
+DEF_FUNCTION_TYPE (VOID, PDOUBLE, V8DF)
+DEF_FUNCTION_TYPE (VOID, PFLOAT, V4SF)
+DEF_FUNCTION_TYPE (VOID, PFLOAT, V8SF)
+DEF_FUNCTION_TYPE (VOID, PFLOAT, V16SF)
+DEF_FUNCTION_TYPE (VOID, PINT, INT)
+DEF_FUNCTION_TYPE (VOID, PLONGLONG, LONGLONG)
+DEF_FUNCTION_TYPE (VOID, PULONGLONG, ULONGLONG)
+DEF_FUNCTION_TYPE (VOID, PV2SI, V2SI)
+DEF_FUNCTION_TYPE (VOID, PV2DI, V2DI)
+DEF_FUNCTION_TYPE (VOID, PV2SF, V4SF)
+DEF_FUNCTION_TYPE (VOID, PV4DI, V4DI)
+DEF_FUNCTION_TYPE (VOID, PV4SF, V4SF)
+DEF_FUNCTION_TYPE (VOID, PV8SF, V8SF)
+DEF_FUNCTION_TYPE (VOID, UNSIGNED, UNSIGNED)
+DEF_FUNCTION_TYPE (VOID, PV8DI, V8DI)
+
+# Instructions returning mask
+DEF_FUNCTION_TYPE (HI, HI)
+DEF_FUNCTION_TYPE (HI, HI, HI)
+DEF_FUNCTION_TYPE (HI, HI, INT)
+DEF_FUNCTION_TYPE (QI, V8DI, V8DI)
+DEF_FUNCTION_TYPE (QI, V8DI, V8DI, QI)
+DEF_FUNCTION_TYPE (HI, V16SI, V16SI)
+DEF_FUNCTION_TYPE (HI, V16SI, V16SI, HI)
+DEF_FUNCTION_TYPE (QI, V8DI, V8DI, INT)
+DEF_FUNCTION_TYPE (QI, V8DI, V8DI, INT, QI)
+DEF_FUNCTION_TYPE (HI, V16SI, V16SI, INT)
+DEF_FUNCTION_TYPE (HI, V16SI, V16SI, INT ,HI)
+DEF_FUNCTION_TYPE (QI, V8DF, V8DF, INT)
+DEF_FUNCTION_TYPE (QI, V8DF, V8DF, INT, QI)
+DEF_FUNCTION_TYPE (QI, V8DF, V8DF, INT, QI, INT)
+DEF_FUNCTION_TYPE (HI, V16SF, V16SF, INT)
+DEF_FUNCTION_TYPE (HI, V16SF, V16SF, INT, HI)
+DEF_FUNCTION_TYPE (HI, V16SF, V16SF, INT, HI, INT)
+DEF_FUNCTION_TYPE (QI, V2DF, V2DF, INT)
+DEF_FUNCTION_TYPE (QI, V2DF, V2DF, INT, QI)
+DEF_FUNCTION_TYPE (QI, V2DF, V2DF, INT, QI, INT)
+DEF_FUNCTION_TYPE (QI, V4SF, V4SF, INT)
+DEF_FUNCTION_TYPE (QI, V4SF, V4SF, INT, QI)
+DEF_FUNCTION_TYPE (QI, V4SF, V4SF, INT, QI, INT)
+DEF_FUNCTION_TYPE (V16SI, HI)
+DEF_FUNCTION_TYPE (V8DI, QI)
+
+DEF_FUNCTION_TYPE (INT, V16QI, V16QI, INT)
+DEF_FUNCTION_TYPE (UCHAR, UINT, UINT, UINT)
+DEF_FUNCTION_TYPE (UCHAR, UINT64, UINT, UINT)
+DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI, V16HI)
+DEF_FUNCTION_TYPE (V16QI, V16QI, QI, INT)
+DEF_FUNCTION_TYPE (V16QI, V16QI, V16QI, INT)
+DEF_FUNCTION_TYPE (V16QI, V16QI, V16QI, V16QI)
+DEF_FUNCTION_TYPE (V1DI, V1DI, V1DI, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT)
+DEF_FUNCTION_TYPE (V2DI, V2DI, DI, INT)
+DEF_FUNCTION_TYPE (V2DI, V2DI, UINT, UINT)
+DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, INT)
+DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, V2DI)
+DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI, V32QI)
+DEF_FUNCTION_TYPE (V4DF, V4DF, V2DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DF)
+DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, V4DI)
+DEF_FUNCTION_TYPE (V4HI, V4HI, HI, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, FLOAT, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI, INT)
+DEF_FUNCTION_TYPE (V4SI, V4SI, SI, INT)
+DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, INT)
+DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, V2DI)
+DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, V4SI)
+DEF_FUNCTION_TYPE (V8HI, V8HI, HI, INT)
+DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI, INT)
+DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI, V4SI)
+DEF_FUNCTION_TYPE (V8HI, V8HI, V8HI, V8HI)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V4SF, INT)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, INT)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SF)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI, INT)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, V8DF)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SF)
+DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI, INT)
+
+# Instructions with masking
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, QI)
+DEF_FUNCTION_TYPE (V8DF, V8SF, V8DF, QI)
+DEF_FUNCTION_TYPE (V8DF, V8SI, V8DF, QI)
+DEF_FUNCTION_TYPE (V8DI, V8SI, V8DI, QI)
+DEF_FUNCTION_TYPE (V8DI, V8HI, V8DI, QI)
+DEF_FUNCTION_TYPE (V8DI, V16QI, V8DI, QI)
+DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, V8DI, QI)
+DEF_FUNCTION_TYPE (V8DF, V8DI, V8DF, V8DF)
+DEF_FUNCTION_TYPE (V8DF, V8DI, V8DF, V8DF, QI)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DI, V8DF, QI)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, V8DF, QI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI, HI)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, QI)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF, V2DF, QI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, HI)
+DEF_FUNCTION_TYPE (V16SF, V16SI, V16SF, HI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SF, HI)
+DEF_FUNCTION_TYPE (V16SF, V16SI, V16SF, V16SF)
+DEF_FUNCTION_TYPE (V16SF, V16SI, V16SF, V16SF, HI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SI, V16SF, HI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF, V4SF, QI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, QI)
+DEF_FUNCTION_TYPE (V16SF, V4SF, V16SF, HI)
+DEF_FUNCTION_TYPE (V8DF, V4DF, V8DF, QI)
+DEF_FUNCTION_TYPE (V8DF, V2DF, V8DF, QI)
+DEF_FUNCTION_TYPE (V16SI, V4SI, V16SI, HI)
+DEF_FUNCTION_TYPE (V16SI, SI, V16SI, HI)
+DEF_FUNCTION_TYPE (V16SI, V16HI, V16SI, HI)
+DEF_FUNCTION_TYPE (V16SI, V16QI, V16SI, HI)
+DEF_FUNCTION_TYPE (V8SI, V8DF, V8SI, QI)
+DEF_FUNCTION_TYPE (V8DI, V4DI, V8DI, QI)
+DEF_FUNCTION_TYPE (V8DI, V2DI, V8DI, QI)
+DEF_FUNCTION_TYPE (V8DI, DI, V8DI, QI)
+DEF_FUNCTION_TYPE (V16SF, PCV16SF, V16SF, HI)
+DEF_FUNCTION_TYPE (V8DF, PCV8DF, V8DF, QI)
+DEF_FUNCTION_TYPE (V16SI, PCV16SI, V16SI, HI)
+DEF_FUNCTION_TYPE (V8DI, PCV8DI, V8DI, QI)
+DEF_FUNCTION_TYPE (V2DF, PCDOUBLE, V2DF, QI)
+DEF_FUNCTION_TYPE (V4SF, PCFLOAT, V4SF, QI)
+DEF_FUNCTION_TYPE (V16QI, V16SI, V16QI, HI)
+DEF_FUNCTION_TYPE (V16HI, V16SI, V16HI, HI)
+DEF_FUNCTION_TYPE (V8SI, V8DI, V8SI, QI)
+DEF_FUNCTION_TYPE (V8HI, V8DI, V8HI, QI)
+DEF_FUNCTION_TYPE (V16QI, V8DI, V16QI, QI)
+DEF_FUNCTION_TYPE (VOID, PV8DF, V8DF, QI)
+DEF_FUNCTION_TYPE (VOID, PV8SI, V8DI, QI)
+DEF_FUNCTION_TYPE (VOID, PV8HI, V8DI, QI)
+DEF_FUNCTION_TYPE (VOID, PV16SF, V16SF, HI)
+DEF_FUNCTION_TYPE (VOID, PV8DI, V8DI, QI)
+DEF_FUNCTION_TYPE (VOID, PV16SI, V16SI, HI)
+DEF_FUNCTION_TYPE (VOID, PV16HI, V16SI, HI)
+DEF_FUNCTION_TYPE (VOID, PV16QI, V16SI, HI)
+DEF_FUNCTION_TYPE (VOID, PV16QI, V8DI, QI)
+DEF_FUNCTION_TYPE (VOID, PDOUBLE, V2DF, QI)
+DEF_FUNCTION_TYPE (VOID, PFLOAT, V4SF, QI)
+DEF_FUNCTION_TYPE (V16SI, V16SF, V16SI, HI)
+DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, V8DI, INT, QI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI, INT, HI)
+
+DEF_FUNCTION_TYPE (VOID, PCVOID, UNSIGNED, UNSIGNED)
+DEF_FUNCTION_TYPE (VOID, PV2DF, V2DI, V2DF)
+DEF_FUNCTION_TYPE (VOID, PV4DF, V4DI, V4DF)
+DEF_FUNCTION_TYPE (VOID, PV4SF, V4SI, V4SF)
+DEF_FUNCTION_TYPE (VOID, PV8SF, V8SI, V8SF)
+DEF_FUNCTION_TYPE (VOID, PV2DI, V2DI, V2DI)
+DEF_FUNCTION_TYPE (VOID, PV4DI, V4DI, V4DI)
+DEF_FUNCTION_TYPE (VOID, PV4SI, V4SI, V4SI)
+DEF_FUNCTION_TYPE (VOID, PV8SI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (VOID, UINT, UINT, UINT)
+DEF_FUNCTION_TYPE (VOID, UINT64, UINT, UINT)
+DEF_FUNCTION_TYPE (VOID, V16QI, V16QI, PCHAR)
+DEF_FUNCTION_TYPE (VOID, V8QI, V8QI, PCHAR)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SI)
+DEF_FUNCTION_TYPE (V2UDI, V2UDI, V2UDI, V2UDI)
+DEF_FUNCTION_TYPE (V4USI, V4USI, V4USI, V4USI)
+DEF_FUNCTION_TYPE (V8UHI, V8UHI, V8UHI, V8UHI)
+DEF_FUNCTION_TYPE (V16UQI, V16UQI, V16UQI, V16UQI)
+DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DI)
+DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI)
+DEF_FUNCTION_TYPE (V8DI, V8DI, V8DI, V8DI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DI, V8DF)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SI, V16SF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT, V4SF, QI)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT, V2DF, QI)
+DEF_FUNCTION_TYPE (V8DI, V16SI, V16SI, V8DI, QI)
+
+DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, UINT, UINT)
+DEF_FUNCTION_TYPE (V4HI, HI, HI, HI, HI)
+
+DEF_FUNCTION_TYPE (INT, V16QI, INT, V16QI, INT, INT)
+DEF_FUNCTION_TYPE (V16QI, V16QI, INT, V16QI, INT, INT)
+
+DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, QI, QI, QI, QI)
+
+DEF_FUNCTION_TYPE (UCHAR, UCHAR, UINT, UINT, PUNSIGNED)
+DEF_FUNCTION_TYPE (UCHAR, UCHAR, ULONGLONG, ULONGLONG, PULONGLONG)
+
+# Instructions with rounding
+DEF_FUNCTION_TYPE (UINT64, V2DF, INT)
+DEF_FUNCTION_TYPE (UINT64, V4SF, INT)
+DEF_FUNCTION_TYPE (UINT, V2DF, INT)
+DEF_FUNCTION_TYPE (UINT, V4SF, INT)
+DEF_FUNCTION_TYPE (INT64, V2DF, INT)
+DEF_FUNCTION_TYPE (INT64, V4SF, INT)
+DEF_FUNCTION_TYPE (INT, V2DF, INT)
+DEF_FUNCTION_TYPE (INT, V4SF, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, UINT64, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, UINT64, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, UINT, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, INT64, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, INT64, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, INT, INT)
+DEF_FUNCTION_TYPE (V16SI, V16SF, V16SI, HI, INT)
+DEF_FUNCTION_TYPE (V16SF, V16SI, V16SF, HI, INT)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, HI, INT)
+DEF_FUNCTION_TYPE (V16SF, V16HI, V16SF, HI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8DF, V8SI, QI, INT)
+DEF_FUNCTION_TYPE (V8SF, V8DF, V8SF, QI, INT)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, QI, INT)
+DEF_FUNCTION_TYPE (V8DF, V8SF, V8DF, QI, INT)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SF, HI, INT)
+DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, V8DF, QI, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, QI, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF, V4SF, QI, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, QI, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF, V2DF, QI, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, INT)
+
+DEF_FUNCTION_TYPE (V16SF, V16SF, INT, V16SF, HI, INT)
+DEF_FUNCTION_TYPE (V8DF, V8DF, INT, V8DF, QI, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, INT, V4SF, QI, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, INT, V2DF, QI, INT)
+DEF_FUNCTION_TYPE (V8DI, V8DI, SI, V8DI, V8DI)
+
+DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V4SI, V2DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4SI, V4DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V8SI, V4DF, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V2DI, V2DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4DI, V4DF, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4SI, V4SF, INT)
+DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V8SI, V8SF, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V2DI, V4SF, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4DI, V4SF, INT)
+DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V4DI, V8SF, INT)
+DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V4SI, V2DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4SI, V4DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V8SI, V4DI, INT)
+DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V2DI, V2DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4DI, V4DI, INT)
+DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4SI, V4SI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V8SI, V8SI, INT)
+DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V2DI, V4SI, INT)
+DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4DI, V4SI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V4DI, V8SI, INT)
+
+DEF_FUNCTION_TYPE (V16SF, V16SF, PCFLOAT, V16SI, HI, INT)
+DEF_FUNCTION_TYPE (V16SF, V16SF, PCFLOAT, V8DI, HI, INT)
+DEF_FUNCTION_TYPE (V8DF, V8DF, PCDOUBLE, V8SI, QI, INT)
+DEF_FUNCTION_TYPE (V8DF, V8DF, PCDOUBLE, V16SI, QI, INT)
+DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V8DI, QI, INT)
+DEF_FUNCTION_TYPE (V8DF, V8DF, PCDOUBLE, V8DI, QI, INT)
+DEF_FUNCTION_TYPE (V16SI, V16SI, PCINT, V16SI, HI, INT)
+DEF_FUNCTION_TYPE (V16SI, V16SI, PCINT, V8DI, HI, INT)
+DEF_FUNCTION_TYPE (V8DI, V8DI, PCINT64, V8SI, QI, INT)
+DEF_FUNCTION_TYPE (V8DI, V8DI, PCINT64, V16SI, QI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V8DI, QI, INT)
+DEF_FUNCTION_TYPE (V8DI, V8DI, PCINT64, V8DI, QI, INT)
+DEF_FUNCTION_TYPE (VOID, PFLOAT, HI, V16SI, V16SF, INT)
+DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V8SI, V8DF, INT)
+DEF_FUNCTION_TYPE (VOID, PFLOAT, QI, V8DI, V8SF, INT)
+DEF_FUNCTION_TYPE (VOID, PDOUBLE, QI, V8DI, V8DF, INT)
+DEF_FUNCTION_TYPE (VOID, PINT, HI, V16SI, V16SI, INT)
+DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V8SI, V8DI, INT)
+DEF_FUNCTION_TYPE (VOID, PINT, QI, V8DI, V8SI, INT)
+DEF_FUNCTION_TYPE (VOID, PLONGLONG, QI, V8DI, V8DI, INT)
+
+DEF_FUNCTION_TYPE (VOID, QI, V8SI, PCINT64, INT, INT)
+DEF_FUNCTION_TYPE (VOID, HI, V16SI, PCINT, INT, INT)
+DEF_FUNCTION_TYPE (VOID, QI, V8DI, PCINT64, INT, INT)
+DEF_FUNCTION_TYPE (VOID, QI, V8DI, PCINT, INT, INT)
+
+DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V8SF_FTYPE_V8SF, ROUND)
+
+DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V2DF_V2DF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V4DF_V4DF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V16SI_FTYPE_V8DF_V8DF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SF, ROUND)
+DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V8SF, ROUND)
+
+DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DF_V2DF, PTEST)
+DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V2DI_V2DI, PTEST)
+DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V4DF_V4DF, PTEST)
+DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V4DI_V4DI, PTEST)
+DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V4SF_V4SF, PTEST)
+DEF_FUNCTION_TYPE_ALIAS (INT_FTYPE_V8SF_V8SF, PTEST)
+
+DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, VEC_MERGE)
+DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF, VEC_MERGE)
+
+DEF_FUNCTION_TYPE_ALIAS (V1DI_FTYPE_V1DI_SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V2SI_FTYPE_V2SI_SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V4HI_FTYPE_V4HI_SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SI_SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V8HI_FTYPE_V8HI_SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V1DI_FTYPE_V1DI_V1DI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_V2DI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V2SI_FTYPE_V2SI_V2SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V4HI_FTYPE_V4HI_V4HI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SI_V4SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V8HI_FTYPE_V8HI_V8HI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V16HI_FTYPE_V16HI_SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V16HI_FTYPE_V16HI_V8HI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V8SI_SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V8SI_V4SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_INT, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_V2DI, COUNT)
+
+DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF_V2DF, SWAP)
+DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF_V4SF, SWAP)
+
+DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_INT, CONVERT)
+DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_INT, CONVERT)
+DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_V4DI_INT, CONVERT)
+DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_V2DI_INT, CONVERT)
+DEF_FUNCTION_TYPE_ALIAS (V1DI_FTYPE_V1DI_V1DI_INT, CONVERT)
+
+DEF_FUNCTION_TYPE_ALIAS (V16QI_FTYPE_V16QI_V16QI, CMP)
+DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_V2DI, CMP)
+DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SI_V4SI, CMP)
+DEF_FUNCTION_TYPE_ALIAS (V8HI_FTYPE_V8HI_V8HI, CMP)
+
+DEF_FUNCTION_TYPE_ALIAS (V16QI_FTYPE_V16QI_V16QI, TF)
+DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF_V2DF, TF)
+DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_V2DI, TF)
+DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF_V4SF, TF)
+DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SI_V4SI, TF)
+DEF_FUNCTION_TYPE_ALIAS (V8HI_FTYPE_V8HI_V8HI, TF)
diff --git a/gcc-4.9/gcc/config/i386/i386-c.c b/gcc-4.9/gcc/config/i386/i386-c.c
new file mode 100644
index 000000000..c9977bf2b
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386-c.c
@@ -0,0 +1,546 @@
+/* Subroutines used for macro/preprocessor support on the ia-32.
+   Copyright (C) 2008-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "tm_p.h"
+#include "flags.h"
+#include "c-family/c-common.h"
+#include "ggc.h"
+#include "target.h"
+#include "target-def.h"
+#include "cpplib.h"
+#include "c-family/c-pragma.h"
+
+static bool ix86_pragma_target_parse (tree, tree);
+static void ix86_target_macros_internal
+  (HOST_WIDE_INT, enum processor_type, enum processor_type, enum fpmath_unit,
+   void (*def_or_undef) (cpp_reader *, const char *));
+
+
+/* Internal function to either define or undef the appropriate system
+   macros.  */
+static void
+ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
+			     enum processor_type arch,
+			     enum processor_type tune,
+			     enum fpmath_unit fpmath,
+			     void (*def_or_undef) (cpp_reader *,
+						   const char *))
+{
+  /* For some of the k6/pentium varients there weren't separate ISA bits to
+     identify which tune/arch flag was passed, so figure it out here.  */
+  size_t arch_len = strlen (ix86_arch_string);
+  size_t tune_len = strlen (ix86_tune_string);
+  int last_arch_char = ix86_arch_string[arch_len - 1];
+  int last_tune_char = ix86_tune_string[tune_len - 1];
+
+  /* Built-ins based on -march=.  */
+  switch (arch)
+    {
+    case PROCESSOR_I386:
+      break;
+    case PROCESSOR_I486:
+      def_or_undef (parse_in, "__i486");
+      def_or_undef (parse_in, "__i486__");
+      break;
+    case PROCESSOR_PENTIUM:
+      def_or_undef (parse_in, "__i586");
+      def_or_undef (parse_in, "__i586__");
+      def_or_undef (parse_in, "__pentium");
+      def_or_undef (parse_in, "__pentium__");
+      if (isa_flag & OPTION_MASK_ISA_MMX)
+	def_or_undef (parse_in, "__pentium_mmx__");
+      break;
+    case PROCESSOR_PENTIUMPRO:
+      def_or_undef (parse_in, "__i686");
+      def_or_undef (parse_in, "__i686__");
+      def_or_undef (parse_in, "__pentiumpro");
+      def_or_undef (parse_in, "__pentiumpro__");
+      break;
+    case PROCESSOR_GEODE:
+      def_or_undef (parse_in, "__geode");
+      def_or_undef (parse_in, "__geode__");
+      break;
+    case PROCESSOR_K6:
+      def_or_undef (parse_in, "__k6");
+      def_or_undef (parse_in, "__k6__");
+      if (last_arch_char == '2')
+	def_or_undef (parse_in, "__k6_2__");
+      else if (last_arch_char == '3')
+	def_or_undef (parse_in, "__k6_3__");
+      else if (isa_flag & OPTION_MASK_ISA_3DNOW)
+	def_or_undef (parse_in, "__k6_3__");
+      break;
+    case PROCESSOR_ATHLON:
+      def_or_undef (parse_in, "__athlon");
+      def_or_undef (parse_in, "__athlon__");
+      if (isa_flag & OPTION_MASK_ISA_SSE)
+	def_or_undef (parse_in, "__athlon_sse__");
+      break;
+    case PROCESSOR_K8:
+      def_or_undef (parse_in, "__k8");
+      def_or_undef (parse_in, "__k8__");
+      break;
+    case PROCESSOR_AMDFAM10:
+      def_or_undef (parse_in, "__amdfam10");
+      def_or_undef (parse_in, "__amdfam10__");
+      break;
+    case PROCESSOR_BDVER1:
+      def_or_undef (parse_in, "__bdver1");
+      def_or_undef (parse_in, "__bdver1__");
+      break;
+    case PROCESSOR_BDVER2:
+      def_or_undef (parse_in, "__bdver2");
+      def_or_undef (parse_in, "__bdver2__");
+      break;
+    case PROCESSOR_BDVER3:
+      def_or_undef (parse_in, "__bdver3");
+      def_or_undef (parse_in, "__bdver3__");
+      break;
+    case PROCESSOR_BDVER4:
+      def_or_undef (parse_in, "__bdver4");
+      def_or_undef (parse_in, "__bdver4__");
+      break;
+    case PROCESSOR_BTVER1:
+      def_or_undef (parse_in, "__btver1");
+      def_or_undef (parse_in, "__btver1__");
+      break;
+    case PROCESSOR_BTVER2:
+      def_or_undef (parse_in, "__btver2");
+      def_or_undef (parse_in, "__btver2__");
+      break;
+    case PROCESSOR_PENTIUM4:
+      def_or_undef (parse_in, "__pentium4");
+      def_or_undef (parse_in, "__pentium4__");
+      break;
+    case PROCESSOR_NOCONA:
+      def_or_undef (parse_in, "__nocona");
+      def_or_undef (parse_in, "__nocona__");
+      break;
+    case PROCESSOR_CORE2:
+      def_or_undef (parse_in, "__core2");
+      def_or_undef (parse_in, "__core2__");
+      break;
+    case PROCESSOR_NEHALEM:
+      def_or_undef (parse_in, "__corei7");
+      def_or_undef (parse_in, "__corei7__");
+      def_or_undef (parse_in, "__nehalem");
+      def_or_undef (parse_in, "__nehalem__");
+      break;
+    case PROCESSOR_SANDYBRIDGE:
+      def_or_undef (parse_in, "__corei7_avx");
+      def_or_undef (parse_in, "__corei7_avx__");
+      def_or_undef (parse_in, "__sandybridge");
+      def_or_undef (parse_in, "__sandybridge__");
+      break;
+    case PROCESSOR_HASWELL:
+      def_or_undef (parse_in, "__core_avx2");
+      def_or_undef (parse_in, "__core_avx2__");
+      def_or_undef (parse_in, "__haswell");
+      def_or_undef (parse_in, "__haswell__");
+      break;
+    case PROCESSOR_BONNELL:
+      def_or_undef (parse_in, "__atom");
+      def_or_undef (parse_in, "__atom__");
+      def_or_undef (parse_in, "__bonnell");
+      def_or_undef (parse_in, "__bonnell__");
+      break;
+    case PROCESSOR_SILVERMONT:
+      def_or_undef (parse_in, "__slm");
+      def_or_undef (parse_in, "__slm__");
+      def_or_undef (parse_in, "__silvermont");
+      def_or_undef (parse_in, "__silvermont__");
+      break;
+    /* use PROCESSOR_max to not set/unset the arch macro.  */
+    case PROCESSOR_max:
+      break;
+    case PROCESSOR_INTEL:
+    case PROCESSOR_GENERIC:
+      gcc_unreachable ();
+    }
+
+  /* Built-ins based on -mtune=.  */
+  switch (tune)
+    {
+    case PROCESSOR_I386:
+      def_or_undef (parse_in, "__tune_i386__");
+      break;
+    case PROCESSOR_I486:
+      def_or_undef (parse_in, "__tune_i486__");
+      break;
+    case PROCESSOR_PENTIUM:
+      def_or_undef (parse_in, "__tune_i586__");
+      def_or_undef (parse_in, "__tune_pentium__");
+      if (last_tune_char == 'x')
+	def_or_undef (parse_in, "__tune_pentium_mmx__");
+      break;
+    case PROCESSOR_PENTIUMPRO:
+      def_or_undef (parse_in, "__tune_i686__");
+      def_or_undef (parse_in, "__tune_pentiumpro__");
+      switch (last_tune_char)
+	{
+	case '3':
+	  def_or_undef (parse_in, "__tune_pentium3__");
+	  /* FALLTHRU */
+	case '2':
+	  def_or_undef (parse_in, "__tune_pentium2__");
+	  break;
+	}
+      break;
+    case PROCESSOR_GEODE:
+      def_or_undef (parse_in, "__tune_geode__");
+      break;
+    case PROCESSOR_K6:
+      def_or_undef (parse_in, "__tune_k6__");
+      if (last_tune_char == '2')
+	def_or_undef (parse_in, "__tune_k6_2__");
+      else if (last_tune_char == '3')
+	def_or_undef (parse_in, "__tune_k6_3__");
+      else if (isa_flag & OPTION_MASK_ISA_3DNOW)
+	def_or_undef (parse_in, "__tune_k6_3__");
+      break;
+    case PROCESSOR_ATHLON:
+      def_or_undef (parse_in, "__tune_athlon__");
+      if (isa_flag & OPTION_MASK_ISA_SSE)
+	def_or_undef (parse_in, "__tune_athlon_sse__");
+      break;
+    case PROCESSOR_K8:
+      def_or_undef (parse_in, "__tune_k8__");
+      break;
+    case PROCESSOR_AMDFAM10:
+      def_or_undef (parse_in, "__tune_amdfam10__");
+      break;
+    case PROCESSOR_BDVER1:
+      def_or_undef (parse_in, "__tune_bdver1__");
+      break;
+    case PROCESSOR_BDVER2:
+      def_or_undef (parse_in, "__tune_bdver2__");
+      break;
+    case PROCESSOR_BDVER3:
+      def_or_undef (parse_in, "__tune_bdver3__");
+      break;
+    case PROCESSOR_BDVER4:
+      def_or_undef (parse_in, "__tune_bdver4__");
+      break;
+    case PROCESSOR_BTVER1:
+      def_or_undef (parse_in, "__tune_btver1__");
+      break;
+    case PROCESSOR_BTVER2:
+      def_or_undef (parse_in, "__tune_btver2__");
+       break;
+    case PROCESSOR_PENTIUM4:
+      def_or_undef (parse_in, "__tune_pentium4__");
+      break;
+    case PROCESSOR_NOCONA:
+      def_or_undef (parse_in, "__tune_nocona__");
+      break;
+    case PROCESSOR_CORE2:
+      def_or_undef (parse_in, "__tune_core2__");
+      break;
+    case PROCESSOR_NEHALEM:
+      def_or_undef (parse_in, "__tune_corei7__");
+      def_or_undef (parse_in, "__tune_nehalem__");
+      break;
+    case PROCESSOR_SANDYBRIDGE:
+      def_or_undef (parse_in, "__tune_corei7_avx__");
+      def_or_undef (parse_in, "__tune_sandybridge__");
+      break;
+    case PROCESSOR_HASWELL:
+      def_or_undef (parse_in, "__tune_core_avx2__");
+      def_or_undef (parse_in, "__tune_haswell__");
+      break;
+    case PROCESSOR_BONNELL:
+      def_or_undef (parse_in, "__tune_atom__");
+      def_or_undef (parse_in, "__tune_bonnell__");
+      break;
+    case PROCESSOR_SILVERMONT:
+      def_or_undef (parse_in, "__tune_slm__");
+      def_or_undef (parse_in, "__tune_silvermont__");
+      break;
+    case PROCESSOR_INTEL:
+    case PROCESSOR_GENERIC:
+      break;
+    /* use PROCESSOR_max to not set/unset the tune macro.  */
+    case PROCESSOR_max:
+      break;
+    }
+
+  switch (ix86_cmodel)
+    {
+    case CM_SMALL:
+    case CM_SMALL_PIC:
+      def_or_undef (parse_in, "__code_model_small__");
+      break;
+    case CM_MEDIUM:
+    case CM_MEDIUM_PIC:
+      def_or_undef (parse_in, "__code_model_medium__");
+      break;
+    case CM_LARGE:
+    case CM_LARGE_PIC:
+      def_or_undef (parse_in, "__code_model_large__");
+      break;
+    case CM_32:
+      def_or_undef (parse_in, "__code_model_32__");
+      break;
+    case CM_KERNEL:
+      def_or_undef (parse_in, "__code_model_kernel__");
+      break;
+    default:
+      ;
+    }
+
+  if (isa_flag & OPTION_MASK_ISA_MMX)
+    def_or_undef (parse_in, "__MMX__");
+  if (isa_flag & OPTION_MASK_ISA_3DNOW)
+    def_or_undef (parse_in, "__3dNOW__");
+  if (isa_flag & OPTION_MASK_ISA_3DNOW_A)
+    def_or_undef (parse_in, "__3dNOW_A__");
+  if (isa_flag & OPTION_MASK_ISA_SSE)
+    def_or_undef (parse_in, "__SSE__");
+  if (isa_flag & OPTION_MASK_ISA_SSE2)
+    def_or_undef (parse_in, "__SSE2__");
+  if (isa_flag & OPTION_MASK_ISA_SSE3)
+    def_or_undef (parse_in, "__SSE3__");
+  if (isa_flag & OPTION_MASK_ISA_SSSE3)
+    def_or_undef (parse_in, "__SSSE3__");
+  if (isa_flag & OPTION_MASK_ISA_SSE4_1)
+    def_or_undef (parse_in, "__SSE4_1__");
+  if (isa_flag & OPTION_MASK_ISA_SSE4_2)
+    def_or_undef (parse_in, "__SSE4_2__");
+  if (isa_flag & OPTION_MASK_ISA_AES)
+    def_or_undef (parse_in, "__AES__");
+  if (isa_flag & OPTION_MASK_ISA_SHA)
+    def_or_undef (parse_in, "__SHA__");
+  if (isa_flag & OPTION_MASK_ISA_PCLMUL)
+    def_or_undef (parse_in, "__PCLMUL__");
+  if (isa_flag & OPTION_MASK_ISA_AVX)
+    def_or_undef (parse_in, "__AVX__");
+  if (isa_flag & OPTION_MASK_ISA_AVX2)
+    def_or_undef (parse_in, "__AVX2__");
+  if (isa_flag & OPTION_MASK_ISA_AVX512F)
+    def_or_undef (parse_in, "__AVX512F__");
+  if (isa_flag & OPTION_MASK_ISA_AVX512ER)
+    def_or_undef (parse_in, "__AVX512ER__");
+  if (isa_flag & OPTION_MASK_ISA_AVX512CD)
+    def_or_undef (parse_in, "__AVX512CD__");
+  if (isa_flag & OPTION_MASK_ISA_AVX512PF)
+    def_or_undef (parse_in, "__AVX512PF__");
+  if (isa_flag & OPTION_MASK_ISA_FMA)
+    def_or_undef (parse_in, "__FMA__");
+  if (isa_flag & OPTION_MASK_ISA_RTM)
+    def_or_undef (parse_in, "__RTM__");
+  if (isa_flag & OPTION_MASK_ISA_SSE4A)
+    def_or_undef (parse_in, "__SSE4A__");
+  if (isa_flag & OPTION_MASK_ISA_FMA4)
+    def_or_undef (parse_in, "__FMA4__");
+  if (isa_flag & OPTION_MASK_ISA_XOP)
+    def_or_undef (parse_in, "__XOP__");
+  if (isa_flag & OPTION_MASK_ISA_LWP)
+    def_or_undef (parse_in, "__LWP__");
+  if (isa_flag & OPTION_MASK_ISA_ABM)
+    def_or_undef (parse_in, "__ABM__");
+  if (isa_flag & OPTION_MASK_ISA_BMI)
+    def_or_undef (parse_in, "__BMI__");
+  if (isa_flag & OPTION_MASK_ISA_BMI2)
+    def_or_undef (parse_in, "__BMI2__");
+  if (isa_flag & OPTION_MASK_ISA_LZCNT)
+    def_or_undef (parse_in, "__LZCNT__");
+  if (isa_flag & OPTION_MASK_ISA_TBM)
+    def_or_undef (parse_in, "__TBM__");
+  if (isa_flag & OPTION_MASK_ISA_POPCNT)
+    def_or_undef (parse_in, "__POPCNT__");
+  if (isa_flag & OPTION_MASK_ISA_FSGSBASE)
+    def_or_undef (parse_in, "__FSGSBASE__");
+  if (isa_flag & OPTION_MASK_ISA_RDRND)
+    def_or_undef (parse_in, "__RDRND__");
+  if (isa_flag & OPTION_MASK_ISA_F16C)
+    def_or_undef (parse_in, "__F16C__");
+  if (isa_flag & OPTION_MASK_ISA_RDSEED)
+    def_or_undef (parse_in, "__RDSEED__");
+  if (isa_flag & OPTION_MASK_ISA_PRFCHW)
+    def_or_undef (parse_in, "__PRFCHW__");
+  if (isa_flag & OPTION_MASK_ISA_ADX)
+    def_or_undef (parse_in, "__ADX__");
+  if (isa_flag & OPTION_MASK_ISA_FXSR)
+    def_or_undef (parse_in, "__FXSR__");
+  if (isa_flag & OPTION_MASK_ISA_XSAVE)
+    def_or_undef (parse_in, "__XSAVE__");
+  if (isa_flag & OPTION_MASK_ISA_XSAVEOPT)
+    def_or_undef (parse_in, "__XSAVEOPT__");
+  if (isa_flag & OPTION_MASK_ISA_PREFETCHWT1)
+    def_or_undef (parse_in, "__PREFETCHWT1__");
+  if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE))
+    def_or_undef (parse_in, "__SSE_MATH__");
+  if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE2))
+    def_or_undef (parse_in, "__SSE2_MATH__");
+}
+
+
+/* Hook to validate the current #pragma GCC target and set the state, and
+   update the macros based on what was changed.  If ARGS is NULL, then
+   POP_TARGET is used to reset the options.  */
+
+static bool
+ix86_pragma_target_parse (tree args, tree pop_target)
+{
+  tree prev_tree = build_target_option_node (&global_options);
+  tree cur_tree;
+  struct cl_target_option *prev_opt;
+  struct cl_target_option *cur_opt;
+  HOST_WIDE_INT prev_isa;
+  HOST_WIDE_INT cur_isa;
+  HOST_WIDE_INT diff_isa;
+  enum processor_type prev_arch;
+  enum processor_type prev_tune;
+  enum processor_type cur_arch;
+  enum processor_type cur_tune;
+
+  if (! args)
+    {
+      cur_tree = (pop_target ? pop_target : target_option_default_node);
+      cl_target_option_restore (&global_options,
+				TREE_TARGET_OPTION (cur_tree));
+    }
+  else
+    {
+      cur_tree = ix86_valid_target_attribute_tree (args, &global_options,
+						   &global_options_set);
+      if (!cur_tree || cur_tree == error_mark_node)
+       {
+         cl_target_option_restore (&global_options,
+                                   TREE_TARGET_OPTION (prev_tree));
+         return false;
+       }
+    }
+
+  target_option_current_node = cur_tree;
+  ix86_reset_previous_fndecl ();
+
+  /* Figure out the previous/current isa, arch, tune and the differences.  */
+  prev_opt  = TREE_TARGET_OPTION (prev_tree);
+  cur_opt   = TREE_TARGET_OPTION (cur_tree);
+  prev_isa  = prev_opt->x_ix86_isa_flags;
+  cur_isa   = cur_opt->x_ix86_isa_flags;
+  diff_isa  = (prev_isa ^ cur_isa);
+  prev_arch = (enum processor_type) prev_opt->arch;
+  prev_tune = (enum processor_type) prev_opt->tune;
+  cur_arch  = (enum processor_type) cur_opt->arch;
+  cur_tune  = (enum processor_type) cur_opt->tune;
+
+  /* If the same processor is used for both previous and current options, don't
+     change the macros.  */
+  if (cur_arch == prev_arch)
+    cur_arch = prev_arch = PROCESSOR_max;
+
+  if (cur_tune == prev_tune)
+    cur_tune = prev_tune = PROCESSOR_max;
+
+  /* Undef all of the macros for that are no longer current.  */
+  ix86_target_macros_internal (prev_isa & diff_isa,
+			       prev_arch,
+			       prev_tune,
+			       (enum fpmath_unit) prev_opt->x_ix86_fpmath,
+			       cpp_undef);
+
+  /* For the definitions, ensure all newly defined macros are considered
+     as used for -Wunused-macros.  There is no point warning about the
+     compiler predefined macros.  */
+  cpp_options *cpp_opts = cpp_get_options (parse_in);
+  unsigned char saved_warn_unused_macros = cpp_opts->warn_unused_macros;
+  cpp_opts->warn_unused_macros = 0;
+
+  /* Define all of the macros for new options that were just turned on.  */
+  ix86_target_macros_internal (cur_isa & diff_isa,
+			       cur_arch,
+			       cur_tune,
+			       (enum fpmath_unit) cur_opt->x_ix86_fpmath,
+			       cpp_define);
+
+  cpp_opts->warn_unused_macros = saved_warn_unused_macros;
+
+  return true;
+}
+
+/* Function to tell the preprocessor about the defines for the current target.  */
+
+void
+ix86_target_macros (void)
+{
+  /* 32/64-bit won't change with target specific options, so do the assert and
+     builtin_define_std calls here.  */
+  if (TARGET_64BIT)
+    {
+      cpp_assert (parse_in, "cpu=x86_64");
+      cpp_assert (parse_in, "machine=x86_64");
+      cpp_define (parse_in, "__amd64");
+      cpp_define (parse_in, "__amd64__");
+      cpp_define (parse_in, "__x86_64");
+      cpp_define (parse_in, "__x86_64__");
+      if (TARGET_X32)
+	{
+	  cpp_define (parse_in, "_ILP32");
+	  cpp_define (parse_in, "__ILP32__");
+	}
+    }
+  else
+    {
+      cpp_assert (parse_in, "cpu=i386");
+      cpp_assert (parse_in, "machine=i386");
+      builtin_define_std ("i386");
+    }
+
+  if (!TARGET_80387)
+    cpp_define (parse_in, "_SOFT_FLOAT");
+
+  if (TARGET_LONG_DOUBLE_64)
+    cpp_define (parse_in, "__LONG_DOUBLE_64__");
+
+  if (TARGET_LONG_DOUBLE_128)
+    cpp_define (parse_in, "__LONG_DOUBLE_128__");
+
+  cpp_define_formatted (parse_in, "__ATOMIC_HLE_ACQUIRE=%d", IX86_HLE_ACQUIRE);
+  cpp_define_formatted (parse_in, "__ATOMIC_HLE_RELEASE=%d", IX86_HLE_RELEASE);
+
+  ix86_target_macros_internal (ix86_isa_flags,
+			       ix86_arch,
+			       ix86_tune,
+			       ix86_fpmath,
+			       cpp_define);
+}
+
+
+/* Register target pragmas.  We need to add the hook for parsing #pragma GCC
+   option here rather than in i386.c since it will pull in various preprocessor
+   functions, and those are not present in languages like fortran without a
+   preprocessor.  */
+
+void
+ix86_register_pragmas (void)
+{
+  /* Update pragma hook to allow parsing #pragma GCC target.  */
+  targetm.target_option.pragma_parse = ix86_pragma_target_parse;
+
+#ifdef REGISTER_SUBTARGET_PRAGMAS
+  REGISTER_SUBTARGET_PRAGMAS ();
+#endif
+}
diff --git a/gcc-4.9/gcc/config/i386/i386-interix.h b/gcc-4.9/gcc/config/i386/i386-interix.h
new file mode 100644
index 000000000..59bbb508d
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386-interix.h
@@ -0,0 +1,346 @@
+/* Target definitions for GCC for Intel 80386 running Interix
+   Parts Copyright (C) 1991-2014 Free Software Foundation, Inc.
+
+   Parts:
+     by Douglas B. Rupp (drupp@cs.washington.edu).
+     by Ron Guilmette (rfg@netcom.com).
+     by Donn Terry (donn@softway.com).
+     by Mumit Khan (khan@xraylith.wisc.edu).
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Note: Interix doesn't support user-written DLLs (use conventional
+   shared libs (.so) instead).  Thus a lot of the stuff that might apply
+   about dllimport/dllexport and the like does not apply here. */
+
+#include <stdio.h>
+
+/* Names to predefine in the preprocessor for this target machine.  */
+
+#define DBX_DEBUGGING_INFO 1
+#define SDB_DEBUGGING_INFO 1
+#define PREFERRED_DEBUGGING_TYPE DBX_DEBUG
+
+/* Our strategy for finding global constructors is a bit different, although
+   not a lot.  */
+#define DO_GLOBAL_CTORS_BODY						\
+do {									\
+  int i;								\
+  unsigned long nptrs;							\
+  func_ptr *p;								\
+  asm(									\
+       "     .section .ctor_head, \"rw\"\n"				\
+       "1:\n"								\
+       "     .text \n"							\
+       ASM_LOAD_ADDR(1b,%0)						\
+       : "=r" (p) : : "cc");						\
+  for (nptrs = 0; p[nptrs] != 0; nptrs++);				\
+  for (i = nptrs-1; i >= 0; i--)					\
+    p[i] ();								\
+} while (0)
+
+#define DO_GLOBAL_DTORS_BODY						\
+do {									\
+  func_ptr *p;								\
+  asm(									\
+       "     .section .dtor_head, \"rw\"\n"				\
+       "1:\n"								\
+       "     .text \n"							\
+       ASM_LOAD_ADDR(1b,%0)						\
+       : "=r" (p) : : "cc");						\
+  while (*p)								\
+    {									\
+      p++;								\
+      (*(p-1)) ();							\
+    }									\
+} while (0)
+
+/* We don't use the "usual" push-an-address solution. */
+#undef TARGET_ASM_CONSTRUCTOR
+
+#undef  SUBTARGET_SWITCHES
+#define SUBTARGET_SWITCHES \
+{ "ms-bitfields", MASK_MS_BITFIELD_LAYOUT, N_("Use native (MS) bitfield layout") }, \
+{ "no-ms-bitfields", -MASK_MS_BITFIELD_LAYOUT, N_("Use gcc default bitfield layout") },
+
+#undef LIB_SPEC
+#define LIB_SPEC "\
+ %{!shared:%{!dynamic:-lc -lpsxdll \
+ }} \
+ %{!G:%{!dynamic:-lc -lpsxdll \
+ }} \
+ %{dynamic:-lc -lpsxdll \
+ } \
+ %{v}"
+
+#undef LINK_SPEC
+#define LINK_SPEC "%{!shared:-stack 0x4000000,0x10000} \
+		   %{g} \
+		   %{dynamic:-Bdynamic} \
+		   %{static:-Bstatic} \
+		   %{shared:--shared -Bdynamic} \
+		   %{G:--shared -Bdynamic} \
+		   %{symbolic:--shared -Bsymbolic -Bdynamic} \
+   		   %{rpath*:--rpath %*} \
+		   "
+
+#undef STARTFILE_SPEC
+#define STARTFILE_SPEC  \
+  "%{!shared:%{pg:gcrt0%O%s}%{!pg:%{p:mcrt0%O%s}%{!p:crt0%O%s}}} %{shared:crti%O%s}"
+
+#define TARGET_DECLSPEC 1
+
+/* cpp handles __STDC__ */
+#define TARGET_OS_CPP_BUILTINS()					\
+  do									\
+    {									\
+	builtin_define ("__INTERIX");					\
+	builtin_define ("_M_IX86=300");					\
+	builtin_define ("_X86_=1");					\
+	builtin_define ("__stdcall=__attribute__((__stdcall__))");	\
+	builtin_define ("__cdecl=__attribute__((__cdecl__))");		\
+	builtin_define ("__declspec(x)=__attribute__((x))");		\
+	builtin_assert ("system=unix");					\
+	builtin_assert ("system=interix");				\
+	if (preprocessing_asm_p ())					\
+	  builtin_define_std ("LANGUAGE_ASSEMBLY");			\
+	else								\
+	  {								\
+	     builtin_define_std ("LANGUAGE_C");				\
+	     if (c_dialect_cxx ())					\
+	       builtin_define_std ("LANGUAGE_C_PLUS_PLUS");		\
+	     if (c_dialect_objc ())					\
+	       builtin_define_std ("LANGUAGE_OBJECTIVE_C");		\
+	  } 								\
+    }									\
+  while (0)
+
+#undef CPP_SPEC
+#define CPP_SPEC "%{posix:-D_POSIX_SOURCE}"
+
+#define SIZE_TYPE "unsigned int"
+#define PTRDIFF_TYPE "int"
+#define WCHAR_TYPE "short unsigned int"
+#define WCHAR_TYPE_SIZE 16
+
+/* Turn off long double being 96 bits.  */
+#undef LONG_DOUBLE_TYPE_SIZE
+#define LONG_DOUBLE_TYPE_SIZE 64
+#undef LIBGCC2_LONG_DOUBLE_TYPE_SIZE
+#define LIBGCC2_LONG_DOUBLE_TYPE_SIZE 64
+
+#undef TARGET_LIBC_HAS_FUNCTION
+#define TARGET_LIBC_HAS_FUNCTION no_c99_libc_has_function
+
+/* The following are needed for us to be able to use winnt.c, but are not
+   otherwise meaningful to Interix.  (The functions that use these are
+   never called because we don't do DLLs.) */
+#define TARGET_NOP_FUN_DLLIMPORT 1
+#define drectve_section()  /* nothing */
+
+
+#define EH_FRAME_IN_DATA_SECTION
+
+#define READONLY_DATA_SECTION_ASM_OP	"\t.section\t.rdata,\"r\""
+
+/* Define this macro if references to a symbol must be treated
+   differently depending on something about the variale or
+   function named by the symbol (such as what section it is in).  */
+
+#undef TARGET_ENCODE_SECTION_INFO
+#define TARGET_ENCODE_SECTION_INFO i386_pe_encode_section_info
+#undef  TARGET_STRIP_NAME_ENCODING
+#define TARGET_STRIP_NAME_ENCODING  i386_pe_strip_name_encoding_full
+
+/* Emit code to check the stack when allocating more that 4000
+   bytes in one go.  */
+
+#define CHECK_STACK_LIMIT 4000
+
+/* By default, target has a 80387, uses IEEE compatible arithmetic,
+   and returns float values in the 387 and needs stack probes
+   We also align doubles to 64-bits forMSVC default compatibility
+   Ditto for bitfields. */
+#undef TARGET_SUBTARGET_DEFAULT
+#define TARGET_SUBTARGET_DEFAULT \
+   (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_STACK_PROBE | \
+    MASK_ALIGN_DOUBLE | MASK_MS_BITFIELD_LAYOUT)
+
+/* The MS compilers take alignment as a number of bytes, so we do as well */
+#undef ASM_OUTPUT_ALIGN
+#define ASM_OUTPUT_ALIGN(FILE,LOG) \
+  if ((LOG)!=0) fprintf ((FILE), "\t.balign %d\n", 1<<(LOG))
+
+
+/* Define this macro if in some cases global symbols from one translation
+   unit may not be bound to undefined symbols in another translation unit
+   without user intervention.  For instance, under Microsoft Windows
+   symbols must be explicitly imported from shared libraries (DLLs).  */
+/*
+ * Old gcc(3.3) did not have 1 here
+ */
+#define MULTIPLE_SYMBOL_SPACES	1
+
+extern void i386_pe_unique_section (tree, int);
+#define TARGET_ASM_UNIQUE_SECTION i386_pe_unique_section
+
+/* Switch into a generic section.  */
+#define TARGET_ASM_NAMED_SECTION  default_pe_asm_named_section
+
+/* Select attributes for named sections.  */
+#define TARGET_SECTION_TYPE_FLAGS  i386_pe_section_type_flags
+
+/* Write the extra assembler code needed to declare a function
+   properly.  If we are generating SDB debugging information, this
+   will happen automatically, so we only need to handle other cases.  */
+#undef ASM_DECLARE_FUNCTION_NAME
+#define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL)			\
+  do									\
+    {									\
+      if (write_symbols != SDB_DEBUG)					\
+       i386_pe_declare_function_type (FILE, NAME, TREE_PUBLIC (DECL));	\
+      ASM_OUTPUT_LABEL (FILE, NAME);       				\
+    }									\
+  while (0)
+
+/* Add an external function to the list of functions to be declared at
+   the end of the file.  */
+#define ASM_OUTPUT_EXTERNAL(FILE, DECL, NAME)				\
+  do									\
+    {									\
+      if (TREE_CODE (DECL) == FUNCTION_DECL)				\
+        i386_pe_record_external_function (DECL, NAME);			\
+    }									\
+  while (0)
+
+/* Declare the type properly for any external libcall.  */
+#define ASM_OUTPUT_EXTERNAL_LIBCALL(FILE, FUN) \
+  i386_pe_declare_function_type (FILE, XSTR (FUN, 0), 1)
+
+/* This says out to put a global symbol in the BSS section.  */
+#undef ASM_OUTPUT_ALIGNED_BSS
+#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \
+  asm_output_aligned_bss ((FILE), (DECL), (NAME), (SIZE), (ALIGN))
+
+/* Don't assume anything about the header files.  */
+#define NO_IMPLICIT_EXTERN_C
+
+/* External function declarations.  */
+extern void i386_pe_record_external_function (tree, const char *);
+extern void i386_pe_declare_function_type (FILE *, const char *, int);
+extern void i386_pe_record_exported_symbol (const char *, int);
+extern void i386_pe_asm_file_end (FILE *);
+
+/* For Win32 ABI compatibility */
+#undef DEFAULT_PCC_STRUCT_RETURN
+#define DEFAULT_PCC_STRUCT_RETURN 0
+
+/* A bitfield declared as `int' forces `int' alignment for the struct.  */
+#undef PCC_BITFIELD_TYPE_MATTERS
+#define PCC_BITFIELD_TYPE_MATTERS 1
+
+/* Enable alias attribute support.  */
+#ifndef SET_ASM_OP
+#define SET_ASM_OP "\t.set\t"
+#endif
+
+/* Note that there appears to be two different ways to support const
+   sections at the moment.  You can either #define the symbol
+   READONLY_DATA_SECTION (giving it some code which switches to the
+   readonly data section) or else you can #define the symbols
+   EXTRA_SECTIONS, EXTRA_SECTION_FUNCTIONS, SELECT_SECTION, and
+   SELECT_RTX_SECTION.  We do both here just to be on the safe side.  */
+
+#define USE_CONST_SECTION 1
+
+/* The linker will take care of this, and having them causes problems with
+   ld -r (specifically -rU).  */
+#define CTOR_LISTS_DEFINED_EXTERNALLY 1
+
+/* Output a definition (implements alias) */
+#define ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2)				\
+do									\
+{									\
+    fputs (SET_ASM_OP, (FILE));						\
+    assemble_name (FILE, LABEL1);					\
+    fputc (',', (FILE));						\
+    assemble_name (FILE, LABEL2);					\
+    fputc ('\n', (FILE));						\
+    }									\
+while (0)
+
+#define HOST_PTR_AS_INT unsigned long
+
+/* The following two flags are usually "off" for i386, because some non-gnu
+   tools (for the i386) don't handle them.  However, we don't have that
+   problem, so....  */
+
+/* Forward references to tags are allowed.  */
+#define SDB_ALLOW_FORWARD_REFERENCES
+/* Unknown tags are also allowed.  */
+#define SDB_ALLOW_UNKNOWN_REFERENCES
+/* DWARF2 Unwinding doesn't work with exception handling yet.  */
+#define DWARF2_UNWIND_INFO 0
+/* MSVC returns structs of up to 8 bytes via registers. */
+
+#undef SUBTARGET_RETURN_IN_MEMORY
+#define SUBTARGET_RETURN_IN_MEMORY(TYPE, FNTYPE) \
+  (TYPE_MODE (TYPE) == BLKmode || \
+     (AGGREGATE_TYPE_P (TYPE) && int_size_in_bytes(TYPE) > 8 ))
+
+#define ASM_LOAD_ADDR(loc, reg)   "     leal " #loc "," #reg "\n"
+
+/* The integer half of this list needs to be constant.  However, there's
+   a lot of disagreement about what the floating point adjustments should
+   be.  We pick one that works with gdb.  (The underlying problem is
+   what to do about the segment registers.  Since we have access to them
+   from /proc, we'll allow them to be accessed in gdb, even tho the
+   gcc compiler can't generate them.  (There's some evidence that
+   MSVC does, but possibly only for certain special "canned" sequences.)  */
+
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n) \
+(TARGET_64BIT ? dbx64_register_map[n] \
+ : (n) == 0 ? (int) 0 \
+ : (n) == 1 ? (int) 2 \
+ : (n) == 2 ? (int) 1 \
+ : (n) == 3 ? (int) 3 \
+ : (n) == 4 ? (int) 6 \
+ : (n) == 5 ? (int) 7 \
+ : (n) == 6 ? (int) 5 \
+ : (n) == 7 ? (int) 4 \
+ : ((n) >= FIRST_STACK_REG && (n) <= LAST_STACK_REG) ? (int) (n)+8 \
+ : (int) (-1))
+
+#define EH_FRAME_IN_DATA_SECTION
+
+/* the following are OSF linker (not gld) specific... we don't want them */
+#undef HAS_INIT_SECTION
+#undef LD_INIT_SWITCH
+#undef LD_FINI_SWITCH
+
+/* The following are needed for us to be able to use winnt.c, but are not
+   otherwise meaningful to Interix.  (The functions that use these are
+   never called because we don't do DLLs.) */
+#define TARGET_NOP_FUN_DLLIMPORT 1
+#define I386_PE_STRIP_ENCODING(SYM_NAME) \
+  ((SYM_NAME) + ((SYM_NAME)[0] == '@' \
+                 ? ((SYM_NAME)[3] == '*' ? 4 : 3) : 0) \
+             + ((SYM_NAME)[0] == '*' ? 1 : 0))
+
+#define drectve_section()  /* nothing */
+
diff --git a/gcc-4.9/gcc/config/i386/i386-modes.def b/gcc-4.9/gcc/config/i386/i386-modes.def
new file mode 100644
index 000000000..07e572058
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386-modes.def
@@ -0,0 +1,99 @@
+/* Definitions of target machine for GCC for IA-32.
+   Copyright (C) 2002-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* The x86_64 ABI specifies both XF and TF modes.
+   XFmode is __float80 is IEEE extended; TFmode is __float128
+   is IEEE quad.  */
+
+FRACTIONAL_FLOAT_MODE (XF, 80, 12, ieee_extended_intel_96_format);
+FLOAT_MODE (TF, 16, ieee_quad_format);
+
+/* In ILP32 mode, XFmode has size 12 and alignment 4.
+   In LP64 mode, XFmode has size and alignment 16.  */
+ADJUST_FLOAT_FORMAT (XF, (TARGET_128BIT_LONG_DOUBLE
+			  ? &ieee_extended_intel_128_format
+			  : TARGET_96_ROUND_53_LONG_DOUBLE
+			  ? &ieee_extended_intel_96_round_53_format
+			  : &ieee_extended_intel_96_format));
+ADJUST_BYTESIZE  (XF, TARGET_128BIT_LONG_DOUBLE ? 16 : 12);
+ADJUST_ALIGNMENT (XF, TARGET_128BIT_LONG_DOUBLE ? 16 : 4);
+
+/* Add any extra modes needed to represent the condition code.
+
+   For the i386, we need separate modes when floating-point
+   equality comparisons are being done.
+
+   Add CCNO to indicate comparisons against zero that requires
+   Overflow flag to be unset.  Sign bit test is used instead and
+   thus can be used to form "a&b>0" type of tests.
+
+   Add CCGC to indicate comparisons against zero that allows
+   unspecified garbage in the Carry flag.  This mode is used
+   by inc/dec instructions.
+
+   Add CCGOC to indicate comparisons against zero that allows
+   unspecified garbage in the Carry and Overflow flag. This
+   mode is used to simulate comparisons of (a-b) and (a+b)
+   against zero using sub/cmp/add operations.
+
+   Add CCA to indicate that only the Above flag is valid.
+   Add CCC to indicate that only the Carry flag is valid.
+   Add CCO to indicate that only the Overflow flag is valid.
+   Add CCS to indicate that only the Sign flag is valid.
+   Add CCZ to indicate that only the Zero flag is valid.  */
+
+CC_MODE (CCGC);
+CC_MODE (CCGOC);
+CC_MODE (CCNO);
+CC_MODE (CCA);
+CC_MODE (CCC);
+CC_MODE (CCO);
+CC_MODE (CCS);
+CC_MODE (CCZ);
+CC_MODE (CCFP);
+CC_MODE (CCFPU);
+
+/* Vector modes.  Note that VEC_CONCAT patterns require vector
+   sizes twice as big as implemented in hardware.  */
+VECTOR_MODES (INT, 4);        /*              V4QI V2HI */
+VECTOR_MODES (INT, 8);        /*         V8QI V4HI V2SI */
+VECTOR_MODES (INT, 16);       /*   V16QI V8HI V4SI V2DI */
+VECTOR_MODES (INT, 32);       /*  V32QI V16HI V8SI V4DI */
+VECTOR_MODES (INT, 64);       /* V64QI V32HI V16SI V8DI */
+VECTOR_MODES (INT, 128);      /* V128QI V64HI V32SI V16DI */
+VECTOR_MODES (FLOAT, 8);      /*              V4HF V2SF */
+VECTOR_MODES (FLOAT, 16);     /*         V8HF V4SF V2DF */
+VECTOR_MODES (FLOAT, 32);     /*        V16HF V8SF V4DF */
+VECTOR_MODES (FLOAT, 64);     /*       V32HF V16SF V8DF */
+VECTOR_MODES (FLOAT, 128);    /*      V64HF V32SF V16DF */
+VECTOR_MODE (INT, TI, 1);     /*                   V1TI */
+VECTOR_MODE (INT, DI, 1);     /*                   V1DI */
+VECTOR_MODE (INT, SI, 1);     /*                   V1SI */
+VECTOR_MODE (INT, QI, 2);     /*                   V2QI */
+
+INT_MODE (OI, 32);
+INT_MODE (XI, 64);
+
+/* Keep the OI and XI modes from confusing the compiler into thinking
+   that these modes could actually be used for computation.  They are
+   only holders for vectors during data movement.  */
+#define MAX_BITSIZE_MODE_ANY_INT (128)
+
+/* The symbol Pmode stands for one of the above machine modes (usually SImode).
+   The tm.h file specifies which one.  It is not a distinct mode.  */
diff --git a/gcc-4.9/gcc/config/i386/i386-opts.h b/gcc-4.9/gcc/config/i386/i386-opts.h
new file mode 100644
index 000000000..47a34dbf7
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386-opts.h
@@ -0,0 +1,96 @@
+/* Definitions for option handling for IA-32.
+   Copyright (C) 1988-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef I386_OPTS_H
+#define I386_OPTS_H
+
+/* Algorithm to expand string function with.  */
+enum stringop_alg
+{
+#undef DEF_ENUM
+#define DEF_ENUM
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) alg, 
+
+#include "stringop.def"
+last_alg
+
+#undef DEF_ENUM
+#undef DEF_ALG
+};
+
+/* Available call abi.  */
+enum calling_abi
+{
+  SYSV_ABI = 0,
+  MS_ABI = 1
+};
+
+enum fpmath_unit
+{
+  FPMATH_387 = 1,
+  FPMATH_SSE = 2
+};
+
+enum tls_dialect
+{
+  TLS_DIALECT_GNU,
+  TLS_DIALECT_GNU2,
+  TLS_DIALECT_SUN
+};
+
+enum cmodel {
+  CM_32,	/* The traditional 32-bit ABI.  */
+  CM_SMALL,	/* Assumes all code and data fits in the low 31 bits.  */
+  CM_KERNEL,	/* Assumes all code and data fits in the high 31 bits.  */
+  CM_MEDIUM,	/* Assumes code fits in the low 31 bits; data unlimited.  */
+  CM_LARGE,	/* No assumptions.  */
+  CM_SMALL_PIC,	/* Assumes code+data+got/plt fits in a 31 bit region.  */
+  CM_MEDIUM_PIC,/* Assumes code+got/plt fits in a 31 bit region.  */
+  CM_LARGE_PIC	/* No assumptions.  */
+};
+
+enum pmode {
+  PMODE_SI,	/* Pmode == SImode. */
+  PMODE_DI 	/* Pmode == DImode. */
+};
+
+enum asm_dialect {
+  ASM_ATT,
+  ASM_INTEL
+};
+
+enum ix86_veclibabi {
+  ix86_veclibabi_type_none,
+  ix86_veclibabi_type_svml,
+  ix86_veclibabi_type_acml
+};
+
+enum stack_protector_guard {
+  SSP_TLS,      /* per-thread canary in TLS block */
+  SSP_GLOBAL    /* global canary */
+};
+
+#endif
diff --git a/gcc-4.9/gcc/config/i386/i386-protos.h b/gcc-4.9/gcc/config/i386/i386-protos.h
new file mode 100644
index 000000000..6e3297880
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386-protos.h
@@ -0,0 +1,332 @@
+/* Definitions of target machine for GCC for IA-32.
+   Copyright (C) 1988-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* In i386-common.c.  */
+extern bool ix86_handle_option (struct gcc_options *opts,
+				struct gcc_options *opts_set ATTRIBUTE_UNUSED,
+				const struct cl_decoded_option *decoded,
+				location_t loc);
+
+/* Functions in i386.c */
+extern bool ix86_target_stack_probe (void);
+extern bool ix86_can_use_return_insn_p (void);
+extern void ix86_setup_frame_addresses (void);
+
+extern HOST_WIDE_INT ix86_initial_elimination_offset (int, int);
+extern void ix86_expand_prologue (void);
+extern void ix86_maybe_emit_epilogue_vzeroupper (void);
+extern void ix86_expand_epilogue (int);
+extern void ix86_expand_split_stack_prologue (void);
+
+extern void ix86_output_addr_vec_elt (FILE *, int);
+extern void ix86_output_addr_diff_elt (FILE *, int, int);
+
+extern enum calling_abi ix86_cfun_abi (void);
+extern enum calling_abi ix86_function_type_abi (const_tree);
+
+extern void ix86_reset_previous_fndecl (void);
+
+#ifdef RTX_CODE
+extern int standard_80387_constant_p (rtx);
+extern const char *standard_80387_constant_opcode (rtx);
+extern rtx standard_80387_constant_rtx (int);
+extern int standard_sse_constant_p (rtx);
+extern const char *standard_sse_constant_opcode (rtx, rtx);
+extern bool symbolic_reference_mentioned_p (rtx);
+extern bool extended_reg_mentioned_p (rtx);
+extern bool x86_extended_QIreg_mentioned_p (rtx);
+extern bool x86_extended_reg_mentioned_p (rtx);
+extern bool x86_maybe_negate_const_int (rtx *, enum machine_mode);
+extern enum machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
+
+extern int avx_vpermilp_parallel (rtx par, enum machine_mode mode);
+extern int avx_vperm2f128_parallel (rtx par, enum machine_mode mode);
+
+extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
+extern bool ix86_expand_set_or_movmem (rtx, rtx, rtx, rtx, rtx, rtx,
+				       rtx, rtx, rtx, rtx, bool);
+
+extern bool constant_address_p (rtx);
+extern bool legitimate_pic_operand_p (rtx);
+extern bool legitimate_pic_address_disp_p (rtx);
+extern bool ix86_legitimize_reload_address (rtx, enum machine_mode,
+					    int, int, int);
+extern void print_reg (rtx, int, FILE*);
+extern void ix86_print_operand (FILE *, rtx, int);
+
+extern void split_double_mode (enum machine_mode, rtx[], int, rtx[], rtx[]);
+
+extern const char *output_set_got (rtx, rtx);
+extern const char *output_387_binary_op (rtx, rtx*);
+extern const char *output_387_reg_move (rtx, rtx*);
+extern const char *output_fix_trunc (rtx, rtx*, bool);
+extern const char *output_fp_compare (rtx, rtx*, bool, bool);
+extern const char *output_adjust_stack_and_probe (rtx);
+extern const char *output_probe_stack_range (rtx, rtx);
+
+extern void ix86_expand_clear (rtx);
+extern void ix86_expand_move (enum machine_mode, rtx[]);
+extern void ix86_expand_vector_move (enum machine_mode, rtx[]);
+extern void ix86_expand_vector_move_misalign (enum machine_mode, rtx[]);
+extern rtx ix86_fixup_binary_operands (enum rtx_code,
+				       enum machine_mode, rtx[]);
+extern void ix86_fixup_binary_operands_no_copy (enum rtx_code,
+						enum machine_mode, rtx[]);
+extern void ix86_expand_binary_operator (enum rtx_code,
+					 enum machine_mode, rtx[]);
+extern void ix86_expand_vector_logical_operator (enum rtx_code,
+						 enum machine_mode, rtx[]);
+extern bool ix86_binary_operator_ok (enum rtx_code, enum machine_mode, rtx[]);
+extern bool ix86_avoid_lea_for_add (rtx, rtx[]);
+extern bool ix86_use_lea_for_mov (rtx, rtx[]);
+extern bool ix86_avoid_lea_for_addr (rtx, rtx[]);
+extern void ix86_split_lea_for_addr (rtx, rtx[], enum machine_mode);
+extern bool ix86_lea_for_add_ok (rtx, rtx[]);
+extern bool ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high);
+extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn);
+extern bool ix86_agi_dependent (rtx set_insn, rtx use_insn);
+extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode,
+					rtx[]);
+extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx);
+extern rtx ix86_build_signbit_mask (enum machine_mode, bool, bool);
+extern void ix86_split_convert_uns_si_sse (rtx[]);
+extern void ix86_expand_convert_uns_didf_sse (rtx, rtx);
+extern void ix86_expand_convert_uns_sixf_sse (rtx, rtx);
+extern void ix86_expand_convert_uns_sidf_sse (rtx, rtx);
+extern void ix86_expand_convert_uns_sisf_sse (rtx, rtx);
+extern void ix86_expand_convert_sign_didf_sse (rtx, rtx);
+extern void ix86_expand_vector_convert_uns_vsivsf (rtx, rtx);
+extern rtx ix86_expand_adjust_ufix_to_sfix_si (rtx, rtx *);
+extern enum ix86_fpcmp_strategy ix86_fp_comparison_strategy (enum rtx_code);
+extern void ix86_expand_fp_absneg_operator (enum rtx_code, enum machine_mode,
+					    rtx[]);
+extern void ix86_expand_copysign (rtx []);
+extern void ix86_split_copysign_const (rtx []);
+extern void ix86_split_copysign_var (rtx []);
+extern bool ix86_unary_operator_ok (enum rtx_code, enum machine_mode, rtx[]);
+extern bool ix86_match_ccmode (rtx, enum machine_mode);
+extern void ix86_expand_branch (enum rtx_code, rtx, rtx, rtx);
+extern void ix86_expand_setcc (rtx, enum rtx_code, rtx, rtx);
+extern bool ix86_expand_int_movcc (rtx[]);
+extern bool ix86_expand_fp_movcc (rtx[]);
+extern bool ix86_expand_fp_vcond (rtx[]);
+extern bool ix86_expand_int_vcond (rtx[]);
+extern void ix86_expand_vec_perm (rtx[]);
+extern bool ix86_expand_vec_perm_const (rtx[]);
+extern void ix86_expand_sse_unpack (rtx, rtx, bool, bool);
+extern bool ix86_expand_int_addcc (rtx[]);
+extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool);
+extern void ix86_split_call_vzeroupper (rtx, rtx);
+extern void x86_initialize_trampoline (rtx, rtx, rtx);
+extern rtx ix86_zero_extend_to_Pmode (rtx);
+extern void ix86_split_long_move (rtx[]);
+extern void ix86_split_ashl (rtx *, rtx, enum machine_mode);
+extern void ix86_split_ashr (rtx *, rtx, enum machine_mode);
+extern void ix86_split_lshr (rtx *, rtx, enum machine_mode);
+extern rtx ix86_find_base_term (rtx);
+extern bool ix86_check_movabs (rtx, int);
+extern void ix86_split_idivmod (enum machine_mode, rtx[], bool);
+extern bool ix86_emit_cfi ();
+
+extern rtx assign_386_stack_local (enum machine_mode, enum ix86_stack_slot);
+extern int ix86_attr_length_immediate_default (rtx, bool);
+extern int ix86_attr_length_address_default (rtx);
+extern int ix86_attr_length_vex_default (rtx, bool, bool);
+
+extern enum machine_mode ix86_fp_compare_mode (enum rtx_code);
+
+extern rtx ix86_libcall_value (enum machine_mode);
+extern bool ix86_function_arg_regno_p (int);
+extern void ix86_asm_output_function_label (FILE *, const char *, tree);
+extern void ix86_call_abi_override (const_tree);
+extern int ix86_reg_parm_stack_space (const_tree);
+
+extern void ix86_split_fp_branch (enum rtx_code code, rtx, rtx,
+				  rtx, rtx, rtx);
+extern bool ix86_hard_regno_mode_ok (int, enum machine_mode);
+extern bool ix86_modes_tieable_p (enum machine_mode, enum machine_mode);
+extern bool ix86_secondary_memory_needed (enum reg_class, enum reg_class,
+					  enum machine_mode, int);
+extern bool ix86_cannot_change_mode_class (enum machine_mode,
+					   enum machine_mode, enum reg_class);
+
+extern int ix86_mode_needed (int, rtx);
+extern int ix86_mode_after (int, int, rtx);
+extern int ix86_mode_entry (int);
+extern int ix86_mode_exit (int);
+
+extern bool ix86_libc_has_function (enum function_class fn_class);
+
+#ifdef HARD_CONST
+extern void ix86_emit_mode_set (int, int, HARD_REG_SET);
+#endif
+
+extern void x86_order_regs_for_local_alloc (void);
+extern void x86_function_profiler (FILE *, int);
+extern void x86_emit_floatuns (rtx [2]);
+extern void ix86_emit_fp_unordered_jump (rtx);
+
+extern void ix86_emit_i387_log1p (rtx, rtx);
+extern void ix86_emit_i387_round (rtx, rtx);
+extern void ix86_emit_swdivsf (rtx, rtx, rtx, enum machine_mode);
+extern void ix86_emit_swsqrtsf (rtx, rtx, enum machine_mode, bool);
+
+extern enum rtx_code ix86_reverse_condition (enum rtx_code, enum machine_mode);
+
+extern void ix86_expand_lround (rtx, rtx);
+extern void ix86_expand_lfloorceil (rtx, rtx, bool);
+extern void ix86_expand_rint (rtx, rtx);
+extern void ix86_expand_floorceil (rtx, rtx, bool);
+extern void ix86_expand_floorceildf_32 (rtx, rtx, bool);
+extern void ix86_expand_round_sse4 (rtx, rtx);
+extern void ix86_expand_round (rtx, rtx);
+extern void ix86_expand_rounddf_32 (rtx, rtx);
+extern void ix86_expand_trunc (rtx, rtx);
+extern void ix86_expand_truncdf_32 (rtx, rtx);
+
+extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
+
+#ifdef TREE_CODE
+extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
+#endif	/* TREE_CODE  */
+
+#endif	/* RTX_CODE  */
+
+#ifdef TREE_CODE
+extern int ix86_data_alignment (tree, int, bool);
+extern unsigned int ix86_local_alignment (tree, enum machine_mode,
+					  unsigned int);
+extern unsigned int ix86_minimum_alignment (tree, enum machine_mode,
+					    unsigned int);
+extern int ix86_constant_alignment (tree, int);
+extern tree ix86_handle_shared_attribute (tree *, tree, tree, int, bool *);
+extern tree ix86_handle_selectany_attribute (tree *, tree, tree, int, bool *);
+extern int x86_field_alignment (tree, int);
+extern tree ix86_valid_target_attribute_tree (tree,
+					      struct gcc_options *,
+					      struct gcc_options *);
+extern unsigned int ix86_get_callcvt (const_tree);
+
+#endif
+
+extern rtx ix86_tls_module_base (void);
+
+extern void ix86_expand_vector_init (bool, rtx, rtx);
+extern void ix86_expand_vector_set (bool, rtx, rtx, int);
+extern void ix86_expand_vector_extract (bool, rtx, rtx, int);
+extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
+
+extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
+extern bool ix86_expand_pinsr (rtx *);
+extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool);
+extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
+extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
+extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
+extern void ix86_expand_sse2_abs (rtx, rtx);
+
+/* In i386-c.c  */
+extern void ix86_target_macros (void);
+extern void ix86_register_pragmas (void);
+
+/* In winnt.c  */
+extern void i386_pe_unique_section (tree, int);
+extern void i386_pe_declare_function_type (FILE *, const char *, int);
+extern void i386_pe_record_external_function (tree, const char *);
+extern void i386_pe_maybe_record_exported_symbol (tree, const char *, int);
+extern void i386_pe_encode_section_info (tree, rtx, int);
+extern bool i386_pe_binds_local_p (const_tree);
+extern const char *i386_pe_strip_name_encoding_full (const char *);
+extern bool i386_pe_valid_dllimport_attribute_p (const_tree);
+extern unsigned int i386_pe_section_type_flags (tree, const char *, int);
+extern void i386_pe_asm_named_section (const char *, unsigned int, tree);
+extern void i386_pe_asm_output_aligned_decl_common (FILE *, tree,
+						    const char *,
+						    HOST_WIDE_INT,
+						    HOST_WIDE_INT);
+extern void i386_pe_file_end (void);
+extern void i386_pe_start_function (FILE *, const char *, tree);
+extern void i386_pe_end_function (FILE *, const char *, tree);
+extern void i386_pe_assemble_visibility (tree, int);
+extern tree i386_pe_mangle_decl_assembler_name (tree, tree);
+extern tree i386_pe_mangle_assembler_name (const char *);
+extern void i386_pe_record_stub (const char *);
+
+extern void i386_pe_seh_init (FILE *);
+extern void i386_pe_seh_end_prologue (FILE *);
+extern void i386_pe_seh_unwind_emit (FILE *, rtx);
+extern void i386_pe_seh_emit_except_personality (rtx);
+extern void i386_pe_seh_init_sections (void);
+
+/* In winnt-cxx.c and winnt-stubs.c  */
+extern void i386_pe_adjust_class_at_definition (tree);
+extern bool i386_pe_type_dllimport_p (tree);
+extern bool i386_pe_type_dllexport_p (tree);
+
+extern int i386_pe_reloc_rw_mask (void);
+
+extern rtx maybe_get_pool_constant (rtx);
+
+extern char internal_label_prefix[16];
+extern int internal_label_prefix_len;
+
+enum ix86_address_seg { SEG_DEFAULT, SEG_FS, SEG_GS };
+struct ix86_address
+{
+  rtx base, index, disp;
+  HOST_WIDE_INT scale;
+  enum ix86_address_seg seg;
+};
+
+extern int ix86_decompose_address (rtx, struct ix86_address *);
+extern int memory_address_length (rtx, bool);
+extern void x86_output_aligned_bss (FILE *, tree, const char *,
+				    unsigned HOST_WIDE_INT, int);
+extern void x86_elf_aligned_common (FILE *, const char *,
+				    unsigned HOST_WIDE_INT, int);
+
+#ifdef RTX_CODE
+extern void ix86_fp_comparison_codes (enum rtx_code code, enum rtx_code *,
+				      enum rtx_code *, enum rtx_code *);
+extern enum rtx_code ix86_fp_compare_code_to_integer (enum rtx_code);
+#endif
+extern int asm_preferred_eh_data_format (int, int);
+
+#ifdef HAVE_ATTR_cpu
+extern enum attr_cpu ix86_schedule;
+#endif
+
+extern const char * ix86_output_call_insn (rtx insn, rtx call_op);
+
+#ifdef RTX_CODE
+/* Target data for multipass lookahead scheduling.
+   Currently used for Core 2/i7 tuning.  */
+struct ix86_first_cycle_multipass_data_
+{
+  /* The length (in bytes) of ifetch block in this solution.  */
+  int ifetch_block_len;
+  /* Number of instructions in ifetch block in this solution.  */
+  int ifetch_block_n_insns;
+  /* Bitmap to remember changes to ready_try for backtracking.  */
+  sbitmap ready_try_change;
+  /* Size of the bitmap.  */
+  int ready_try_change_size;
+};
+# define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T	\
+  struct ix86_first_cycle_multipass_data_
+#endif /* RTX_CODE */
diff --git a/gcc-4.9/gcc/config/i386/i386.c b/gcc-4.9/gcc/config/i386/i386.c
new file mode 100644
index 000000000..842be686d
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386.c
@@ -0,0 +1,47138 @@
+/* Subroutines used for code generation on IA-32.
+   Copyright (C) 1988-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "rtl.h"
+#include "tree.h"
+#include "stringpool.h"
+#include "attribs.h"
+#include "calls.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "tm_p.h"
+#include "regs.h"
+#include "hard-reg-set.h"
+#include "insn-config.h"
+#include "conditions.h"
+#include "output.h"
+#include "insn-codes.h"
+#include "insn-attr.h"
+#include "flags.h"
+#include "except.h"
+#include "function.h"
+#include "recog.h"
+#include "expr.h"
+#include "optabs.h"
+#include "diagnostic-core.h"
+#include "toplev.h"
+#include "basic-block.h"
+#include "ggc.h"
+#include "target.h"
+#include "target-def.h"
+#include "common/common-target.h"
+#include "langhooks.h"
+#include "reload.h"
+#include "cgraph.h"
+#include "pointer-set.h"
+#include "hash-table.h"
+#include "vec.h"
+#include "basic-block.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-fold.h"
+#include "tree-eh.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimplify.h"
+#include "cfgloop.h"
+#include "dwarf2.h"
+#include "df.h"
+#include "tm-constrs.h"
+#include "params.h"
+#include "cselib.h"
+#include "debug.h"
+#include "sched-int.h"
+#include "sbitmap.h"
+#include "fibheap.h"
+#include "opts.h"
+#include "diagnostic.h"
+#include "dumpfile.h"
+#include "tree-pass.h"
+#include "context.h"
+#include "pass_manager.h"
+#include "target-globals.h"
+
+static rtx legitimize_dllimport_symbol (rtx, bool);
+static rtx legitimize_pe_coff_extern_decl (rtx, bool);
+static rtx legitimize_pe_coff_symbol (rtx, bool);
+
+#ifndef CHECK_STACK_LIMIT
+#define CHECK_STACK_LIMIT (-1)
+#endif
+
+/* Return index of given mode in mult and division cost tables.  */
+#define MODE_INDEX(mode)					\
+  ((mode) == QImode ? 0						\
+   : (mode) == HImode ? 1					\
+   : (mode) == SImode ? 2					\
+   : (mode) == DImode ? 3					\
+   : 4)
+
+/* Processor costs (relative to an add) */
+/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
+#define COSTS_N_BYTES(N) ((N) * 2)
+
+#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
+
+static stringop_algs ix86_size_memcpy[2] = {
+  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
+static stringop_algs ix86_size_memset[2] = {
+  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
+
+const
+struct processor_costs ix86_size_cost = {/* costs for tuning for size */
+  COSTS_N_BYTES (2),			/* cost of an add instruction */
+  COSTS_N_BYTES (3),			/* cost of a lea instruction */
+  COSTS_N_BYTES (2),			/* variable shift costs */
+  COSTS_N_BYTES (3),			/* constant shift costs */
+  {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
+   COSTS_N_BYTES (3),			/*				 HI */
+   COSTS_N_BYTES (3),			/*				 SI */
+   COSTS_N_BYTES (3),			/*				 DI */
+   COSTS_N_BYTES (5)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
+   COSTS_N_BYTES (3),			/*			    HI */
+   COSTS_N_BYTES (3),			/*			    SI */
+   COSTS_N_BYTES (3),			/*			    DI */
+   COSTS_N_BYTES (5)},			/*			    other */
+  COSTS_N_BYTES (3),			/* cost of movsx */
+  COSTS_N_BYTES (3),			/* cost of movzx */
+  0,					/* "large" insn */
+  2,					/* MOVE_RATIO */
+  2,				     /* cost for loading QImode using movzbl */
+  {2, 2, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 2, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 2},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {2, 2, 2},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  3,					/* cost of moving MMX register */
+  {3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {3, 3},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  3,					/* cost of moving SSE register */
+  {3, 3, 3},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {3, 3, 3},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  3,					/* MMX or SSE register to integer */
+  0,					/* size of l1 cache  */
+  0,					/* size of l2 cache  */
+  0,					/* size of prefetch block */
+  0,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_BYTES (2),			/* cost of FMUL instruction.  */
+  COSTS_N_BYTES (2),			/* cost of FDIV instruction.  */
+  COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
+  COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
+  COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
+  ix86_size_memcpy,
+  ix86_size_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  1,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  1,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/* Processor costs (relative to an add) */
+static stringop_algs i386_memcpy[2] = {
+  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+  DUMMY_STRINGOP_ALGS};
+static stringop_algs i386_memset[2] = {
+  {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+  DUMMY_STRINGOP_ALGS};
+
+static const
+struct processor_costs i386_cost = {	/* 386 specific costs */
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (3),			/* variable shift costs */
+  COSTS_N_INSNS (2),			/* constant shift costs */
+  {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (6),			/*				 HI */
+   COSTS_N_INSNS (6),			/*				 SI */
+   COSTS_N_INSNS (6),			/*				 DI */
+   COSTS_N_INSNS (6)},			/*			      other */
+  COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (23),			/*			    HI */
+   COSTS_N_INSNS (23),			/*			    SI */
+   COSTS_N_INSNS (23),			/*			    DI */
+   COSTS_N_INSNS (23)},			/*			    other */
+  COSTS_N_INSNS (3),			/* cost of movsx */
+  COSTS_N_INSNS (2),			/* cost of movzx */
+  15,					/* "large" insn */
+  3,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {8, 8, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {8, 8, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 8, 16},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 8, 16},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  3,					/* MMX or SSE register to integer */
+  0,					/* size of l1 cache  */
+  0,					/* size of l2 cache  */
+  0,					/* size of prefetch block */
+  0,					/* number of parallel prefetches */
+  1,					/* Branch cost */
+  COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (27),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (88),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
+  i386_memcpy,
+  i386_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+static stringop_algs i486_memcpy[2] = {
+  {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
+  DUMMY_STRINGOP_ALGS};
+static stringop_algs i486_memset[2] = {
+  {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
+  DUMMY_STRINGOP_ALGS};
+
+static const
+struct processor_costs i486_cost = {	/* 486 specific costs */
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (3),			/* variable shift costs */
+  COSTS_N_INSNS (2),			/* constant shift costs */
+  {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (12),			/*				 HI */
+   COSTS_N_INSNS (12),			/*				 SI */
+   COSTS_N_INSNS (12),			/*				 DI */
+   COSTS_N_INSNS (12)},			/*			      other */
+  1,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (40),			/*			    HI */
+   COSTS_N_INSNS (40),			/*			    SI */
+   COSTS_N_INSNS (40),			/*			    DI */
+   COSTS_N_INSNS (40)},			/*			    other */
+  COSTS_N_INSNS (3),			/* cost of movsx */
+  COSTS_N_INSNS (2),			/* cost of movzx */
+  15,					/* "large" insn */
+  3,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {8, 8, 8},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {8, 8, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 8, 16},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 8, 16},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  3,					/* MMX or SSE register to integer */
+  4,					/* size of l1 cache.  486 has 8kB cache
+					   shared for code and data, so 4kB is
+					   not really precise.  */
+  4,					/* size of l2 cache  */
+  0,					/* size of prefetch block */
+  0,					/* number of parallel prefetches */
+  1,					/* Branch cost */
+  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (16),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (73),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
+  i486_memcpy,
+  i486_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+static stringop_algs pentium_memcpy[2] = {
+  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  DUMMY_STRINGOP_ALGS};
+static stringop_algs pentium_memset[2] = {
+  {libcall, {{-1, rep_prefix_4_byte, false}}},
+  DUMMY_STRINGOP_ALGS};
+
+static const
+struct processor_costs pentium_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (4),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (11),			/*				 HI */
+   COSTS_N_INSNS (11),			/*				 SI */
+   COSTS_N_INSNS (11),			/*				 DI */
+   COSTS_N_INSNS (11)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (25),			/*			    HI */
+   COSTS_N_INSNS (25),			/*			    SI */
+   COSTS_N_INSNS (25),			/*			    DI */
+   COSTS_N_INSNS (25)},			/*			    other */
+  COSTS_N_INSNS (3),			/* cost of movsx */
+  COSTS_N_INSNS (2),			/* cost of movzx */
+  8,					/* "large" insn */
+  6,					/* MOVE_RATIO */
+  6,				     /* cost for loading QImode using movzbl */
+  {2, 4, 2},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 4, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  8,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 8, 16},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 8, 16},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  3,					/* MMX or SSE register to integer */
+  8,					/* size of l1 cache.  */
+  8,					/* size of l2 cache  */
+  0,					/* size of prefetch block */
+  0,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (3),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (39),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
+  pentium_memcpy,
+  pentium_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
+   (we ensure the alignment).  For small blocks inline loop is still a
+   noticeable win, for bigger blocks either rep movsl or rep movsb is
+   way to go.  Rep movsb has apparently more expensive startup time in CPU,
+   but after 4K the difference is down in the noise.  */
+static stringop_algs pentiumpro_memcpy[2] = {
+  {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
+                       {8192, rep_prefix_4_byte, false},
+                       {-1, rep_prefix_1_byte, false}}},
+  DUMMY_STRINGOP_ALGS};
+static stringop_algs pentiumpro_memset[2] = {
+  {rep_prefix_4_byte, {{1024, unrolled_loop, false},
+                       {8192, rep_prefix_4_byte, false},
+                       {-1, libcall, false}}},
+  DUMMY_STRINGOP_ALGS};
+static const
+struct processor_costs pentiumpro_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (4),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (4)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (17),			/*			    HI */
+   COSTS_N_INSNS (17),			/*			    SI */
+   COSTS_N_INSNS (17),			/*			    DI */
+   COSTS_N_INSNS (17)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  6,					/* MOVE_RATIO */
+  2,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 2, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {2, 2, 8},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {2, 2, 8},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  3,					/* MMX or SSE register to integer */
+  8,					/* size of l1 cache.  */
+  256,					/* size of l2 cache  */
+  32,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
+  pentiumpro_memcpy,
+  pentiumpro_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+static stringop_algs geode_memcpy[2] = {
+  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  DUMMY_STRINGOP_ALGS};
+static stringop_algs geode_memset[2] = {
+  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  DUMMY_STRINGOP_ALGS};
+static const
+struct processor_costs geode_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (2),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (7),			/*				 SI */
+   COSTS_N_INSNS (7),			/*				 DI */
+   COSTS_N_INSNS (7)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (23),			/*			    HI */
+   COSTS_N_INSNS (39),			/*			    SI */
+   COSTS_N_INSNS (39),			/*			    DI */
+   COSTS_N_INSNS (39)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  4,					/* MOVE_RATIO */
+  1,				     /* cost for loading QImode using movzbl */
+  {1, 1, 1},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {1, 1, 1},				/* cost of storing integer registers */
+  1,					/* cost of reg,reg fld/fst */
+  {1, 1, 1},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 6, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+
+  1,					/* cost of moving MMX register */
+  {1, 1},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {1, 1},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  1,					/* cost of moving SSE register */
+  {1, 1, 1},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {1, 1, 1},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  1,					/* MMX or SSE register to integer */
+  64,					/* size of l1 cache.  */
+  128,					/* size of l2 cache.  */
+  32,					/* size of prefetch block */
+  1,					/* number of parallel prefetches */
+  1,					/* Branch cost */
+  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
+  geode_memcpy,
+  geode_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+static stringop_algs k6_memcpy[2] = {
+  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  DUMMY_STRINGOP_ALGS};
+static stringop_algs k6_memset[2] = {
+  {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  DUMMY_STRINGOP_ALGS};
+static const
+struct processor_costs k6_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (2),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (3),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (3),			/*				 DI */
+   COSTS_N_INSNS (3)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (18),			/*			    HI */
+   COSTS_N_INSNS (18),			/*			    SI */
+   COSTS_N_INSNS (18),			/*			    DI */
+   COSTS_N_INSNS (18)},			/*			    other */
+  COSTS_N_INSNS (2),			/* cost of movsx */
+  COSTS_N_INSNS (2),			/* cost of movzx */
+  8,					/* "large" insn */
+  4,					/* MOVE_RATIO */
+  3,				     /* cost for loading QImode using movzbl */
+  {4, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 3, 2},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {6, 6, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 4},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {2, 2, 8},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {2, 2, 8},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  6,					/* MMX or SSE register to integer */
+  32,					/* size of l1 cache.  */
+  32,					/* size of l2 cache.  Some models
+					   have integrated l2 cache, but
+					   optimizing for k6 is not important
+					   enough to worry about that.  */
+  32,					/* size of prefetch block */
+  1,					/* number of parallel prefetches */
+  1,					/* Branch cost */
+  COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (2),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (56),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
+  k6_memcpy,
+  k6_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/* For some reason, Athlon deals better with REP prefix (relative to loops)
+   compared to K8. Alignment becomes important after 8 bytes for memcpy and
+   128 bytes for memset.  */
+static stringop_algs athlon_memcpy[2] = {
+  {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  DUMMY_STRINGOP_ALGS};
+static stringop_algs athlon_memset[2] = {
+  {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  DUMMY_STRINGOP_ALGS};
+static const
+struct processor_costs athlon_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (2),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (5),			/*				 HI */
+   COSTS_N_INSNS (5),			/*				 SI */
+   COSTS_N_INSNS (5),			/*				 DI */
+   COSTS_N_INSNS (5)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (26),			/*			    HI */
+   COSTS_N_INSNS (42),			/*			    SI */
+   COSTS_N_INSNS (74),			/*			    DI */
+   COSTS_N_INSNS (74)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {4, 4, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 4},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 4, 6},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 4, 5},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  5,					/* MMX or SSE register to integer */
+  64,					/* size of l1 cache.  */
+  256,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  5,					/* Branch cost */
+  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (24),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+  athlon_memcpy,
+  athlon_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/* K8 has optimized REP instruction for medium sized blocks, but for very
+   small blocks it is better to use loop. For large blocks, libcall can
+   do nontemporary accesses and beat inline considerably.  */
+static stringop_algs k8_memcpy[2] = {
+  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+             {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static stringop_algs k8_memset[2] = {
+  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{48, unrolled_loop, false},
+             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs k8_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (2),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (5)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (26),			/*			    HI */
+   COSTS_N_INSNS (42),			/*			    SI */
+   COSTS_N_INSNS (74),			/*			    DI */
+   COSTS_N_INSNS (74)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {4, 4, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 3, 6},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 4, 5},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  5,					/* MMX or SSE register to integer */
+  64,					/* size of l1 cache.  */
+  512,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  /* New AMD processors never drop prefetches; if they cannot be performed
+     immediately, they are queued.  We set number of simultaneous prefetches
+     to a large constant to reflect this (it probably is not a good idea not
+     to limit number of prefetches at all, as their execution also takes some
+     time).  */
+  100,					/* number of parallel prefetches */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+
+  k8_memcpy,
+  k8_memset,
+  4,					/* scalar_stmt_cost.  */
+  2,					/* scalar load_cost.  */
+  2,					/* scalar_store_cost.  */
+  5,					/* vec_stmt_cost.  */
+  0,					/* vec_to_scalar_cost.  */
+  2,					/* scalar_to_vec_cost.  */
+  2,					/* vec_align_load_cost.  */
+  3,					/* vec_unalign_load_cost.  */
+  3,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  2,					/* cond_not_taken_branch_cost.  */
+};
+
+/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
+   very small blocks it is better to use loop. For large blocks, libcall can
+   do nontemporary accesses and beat inline considerably.  */
+static stringop_algs amdfam10_memcpy[2] = {
+  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+             {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static stringop_algs amdfam10_memset[2] = {
+  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+struct processor_costs amdfam10_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (2),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (5)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),			/*			    HI */
+   COSTS_N_INSNS (51),			/*			    SI */
+   COSTS_N_INSNS (83),			/*			    DI */
+   COSTS_N_INSNS (83)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {4, 4, 12},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+ 		   			   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 4, 3},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 4, 5},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  3,					/* MMX or SSE register to integer */
+  					/* On K8:
+  					    MOVD reg64, xmmreg Double FSTORE 4
+					    MOVD reg32, xmmreg Double FSTORE 4
+					   On AMDFAM10:
+					    MOVD reg64, xmmreg Double FADD 3
+							       1/1  1/1
+					    MOVD reg32, xmmreg Double FADD 3
+							       1/1  1/1 */
+  64,					/* size of l1 cache.  */
+  512,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  /* New AMD processors never drop prefetches; if they cannot be performed
+     immediately, they are queued.  We set number of simultaneous prefetches
+     to a large constant to reflect this (it probably is not a good idea not
+     to limit number of prefetches at all, as their execution also takes some
+     time).  */
+  100,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+
+  amdfam10_memcpy,
+  amdfam10_memset,
+  4,					/* scalar_stmt_cost.  */
+  2,					/* scalar load_cost.  */
+  2,					/* scalar_store_cost.  */
+  6,					/* vec_stmt_cost.  */
+  0,					/* vec_to_scalar_cost.  */
+  2,					/* scalar_to_vec_cost.  */
+  2,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  2,					/* vec_store_cost.  */
+  2,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/*  BDVER1 has optimized REP instruction for medium sized blocks, but for
+    very small blocks it is better to use loop. For large blocks, libcall
+    can do nontemporary accesses and beat inline considerably.  */
+static stringop_algs bdver1_memcpy[2] = {
+  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+             {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static stringop_algs bdver1_memset[2] = {
+  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+
+const struct processor_costs bdver1_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (4),			/*				 SI */
+   COSTS_N_INSNS (6),			/*				 DI */
+   COSTS_N_INSNS (6)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),			/*			    HI */
+   COSTS_N_INSNS (51),			/*			    SI */
+   COSTS_N_INSNS (83),			/*			    DI */
+   COSTS_N_INSNS (83)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {5, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {5, 5, 12},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode */
+  {4, 4, 8},				/* cost of storing fp registers
+ 		   			   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 4},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 4, 4},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 4, 4},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  2,					/* MMX or SSE register to integer */
+  					/* On K8:
+					    MOVD reg64, xmmreg Double FSTORE 4
+					    MOVD reg32, xmmreg Double FSTORE 4
+					   On AMDFAM10:
+					    MOVD reg64, xmmreg Double FADD 3
+							       1/1  1/1
+					    MOVD reg32, xmmreg Double FADD 3
+							       1/1  1/1 */
+  16,					/* size of l1 cache.  */
+  2048,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  /* New AMD processors never drop prefetches; if they cannot be performed
+     immediately, they are queued.  We set number of simultaneous prefetches
+     to a large constant to reflect this (it probably is not a good idea not
+     to limit number of prefetches at all, as their execution also takes some
+     time).  */
+  100,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
+
+  bdver1_memcpy,
+  bdver1_memset,
+  6,					/* scalar_stmt_cost.  */
+  4,					/* scalar load_cost.  */
+  4,					/* scalar_store_cost.  */
+  6,					/* vec_stmt_cost.  */
+  0,					/* vec_to_scalar_cost.  */
+  2,					/* scalar_to_vec_cost.  */
+  4,					/* vec_align_load_cost.  */
+  4,					/* vec_unalign_load_cost.  */
+  4,					/* vec_store_cost.  */
+  2,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/*  BDVER2 has optimized REP instruction for medium sized blocks, but for
+    very small blocks it is better to use loop. For large blocks, libcall
+    can do nontemporary accesses and beat inline considerably.  */
+
+static stringop_algs bdver2_memcpy[2] = {
+  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+             {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static stringop_algs bdver2_memset[2] = {
+  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+
+const struct processor_costs bdver2_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (4),			/*				 SI */
+   COSTS_N_INSNS (6),			/*				 DI */
+   COSTS_N_INSNS (6)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),			/*			    HI */
+   COSTS_N_INSNS (51),			/*			    SI */
+   COSTS_N_INSNS (83),			/*			    DI */
+   COSTS_N_INSNS (83)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {5, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {5, 5, 12},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode */
+  {4, 4, 8},				/* cost of storing fp registers
+ 		   			   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 4},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 4, 4},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 4, 4},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  2,					/* MMX or SSE register to integer */
+  					/* On K8:
+					    MOVD reg64, xmmreg Double FSTORE 4
+					    MOVD reg32, xmmreg Double FSTORE 4
+					   On AMDFAM10:
+					    MOVD reg64, xmmreg Double FADD 3
+							       1/1  1/1
+					    MOVD reg32, xmmreg Double FADD 3
+							       1/1  1/1 */
+  16,					/* size of l1 cache.  */
+  2048,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  /* New AMD processors never drop prefetches; if they cannot be performed
+     immediately, they are queued.  We set number of simultaneous prefetches
+     to a large constant to reflect this (it probably is not a good idea not
+     to limit number of prefetches at all, as their execution also takes some
+     time).  */
+  100,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
+
+  bdver2_memcpy,
+  bdver2_memset,
+  6,					/* scalar_stmt_cost.  */
+  4,					/* scalar load_cost.  */
+  4,					/* scalar_store_cost.  */
+  6,					/* vec_stmt_cost.  */
+  0,					/* vec_to_scalar_cost.  */
+  2,					/* scalar_to_vec_cost.  */
+  4,					/* vec_align_load_cost.  */
+  4,					/* vec_unalign_load_cost.  */
+  4,					/* vec_store_cost.  */
+  2,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+
+  /*  BDVER3 has optimized REP instruction for medium sized blocks, but for
+      very small blocks it is better to use loop. For large blocks, libcall
+      can do nontemporary accesses and beat inline considerably.  */
+static stringop_algs bdver3_memcpy[2] = {
+  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+             {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static stringop_algs bdver3_memset[2] = {
+  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+struct processor_costs bdver3_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (4),			/*				 SI */
+   COSTS_N_INSNS (6),			/*				 DI */
+   COSTS_N_INSNS (6)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),			/*			    HI */
+   COSTS_N_INSNS (51),			/*			    SI */
+   COSTS_N_INSNS (83),			/*			    DI */
+   COSTS_N_INSNS (83)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {5, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {5, 5, 12},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode */
+  {4, 4, 8},				/* cost of storing fp registers
+ 		   			   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 4},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 4, 4},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 4, 4},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  2,					/* MMX or SSE register to integer */
+  16,					/* size of l1 cache.  */
+  2048,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  /* New AMD processors never drop prefetches; if they cannot be performed
+     immediately, they are queued.  We set number of simultaneous prefetches
+     to a large constant to reflect this (it probably is not a good idea not
+     to limit number of prefetches at all, as their execution also takes some
+     time).  */
+  100,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
+
+  bdver3_memcpy,
+  bdver3_memset,
+  6,					/* scalar_stmt_cost.  */
+  4,					/* scalar load_cost.  */
+  4,					/* scalar_store_cost.  */
+  6,					/* vec_stmt_cost.  */
+  0,					/* vec_to_scalar_cost.  */
+  2,					/* scalar_to_vec_cost.  */
+  4,					/* vec_align_load_cost.  */
+  4,					/* vec_unalign_load_cost.  */
+  4,					/* vec_store_cost.  */
+  2,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/*  BDVER4 has optimized REP instruction for medium sized blocks, but for
+    very small blocks it is better to use loop. For large blocks, libcall
+    can do nontemporary accesses and beat inline considerably.  */
+static stringop_algs bdver4_memcpy[2] = {
+  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+             {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static stringop_algs bdver4_memset[2] = {
+  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+struct processor_costs bdver4_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (4),			/*				 SI */
+   COSTS_N_INSNS (6),			/*				 DI */
+   COSTS_N_INSNS (6)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),			/*			    HI */
+   COSTS_N_INSNS (51),			/*			    SI */
+   COSTS_N_INSNS (83),			/*			    DI */
+   COSTS_N_INSNS (83)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {5, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {5, 5, 12},				/* cost of loading fp registers
+		   			   in SFmode, DFmode and XFmode */
+  {4, 4, 8},				/* cost of storing fp registers
+ 		   			   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {4, 4},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 4, 4},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 4, 4},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  2,					/* MMX or SSE register to integer */
+  16,					/* size of l1 cache.  */
+  2048,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  /* New AMD processors never drop prefetches; if they cannot be performed
+     immediately, they are queued.  We set number of simultaneous prefetches
+     to a large constant to reflect this (it probably is not a good idea not
+     to limit number of prefetches at all, as their execution also takes some
+     time).  */
+  100,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (6),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (42),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (52),			/* cost of FSQRT instruction.  */
+
+  bdver4_memcpy,
+  bdver4_memset,
+  6,					/* scalar_stmt_cost.  */
+  4,					/* scalar load_cost.  */
+  4,					/* scalar_store_cost.  */
+  6,					/* vec_stmt_cost.  */
+  0,					/* vec_to_scalar_cost.  */
+  2,					/* scalar_to_vec_cost.  */
+  4,					/* vec_align_load_cost.  */
+  4,					/* vec_unalign_load_cost.  */
+  4,					/* vec_store_cost.  */
+  2,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+  /* BTVER1 has optimized REP instruction for medium sized blocks, but for
+     very small blocks it is better to use loop. For large blocks, libcall can
+     do nontemporary accesses and beat inline considerably.  */
+static stringop_algs btver1_memcpy[2] = {
+  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+             {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static stringop_algs btver1_memset[2] = {
+  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+const struct processor_costs btver1_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (2),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (5)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),			/*			    HI */
+   COSTS_N_INSNS (51),			/*			    SI */
+   COSTS_N_INSNS (83),			/*			    DI */
+   COSTS_N_INSNS (83)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {4, 4, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 4, 3},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 4, 5},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  3,					/* MMX or SSE register to integer */
+					/* On K8:
+					   MOVD reg64, xmmreg Double FSTORE 4
+					   MOVD reg32, xmmreg Double FSTORE 4
+					   On AMDFAM10:
+					   MOVD reg64, xmmreg Double FADD 3
+							       1/1  1/1
+					    MOVD reg32, xmmreg Double FADD 3
+							       1/1  1/1 */
+  32,					/* size of l1 cache.  */
+  512,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  100,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+
+  btver1_memcpy,
+  btver1_memset,
+  4,					/* scalar_stmt_cost.  */
+  2,					/* scalar load_cost.  */
+  2,					/* scalar_store_cost.  */
+  6,					/* vec_stmt_cost.  */
+  0,					/* vec_to_scalar_cost.  */
+  2,					/* scalar_to_vec_cost.  */
+  2,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  2,					/* vec_store_cost.  */
+  2,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+static stringop_algs btver2_memcpy[2] = {
+  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
+             {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static stringop_algs btver2_memset[2] = {
+  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+const struct processor_costs btver2_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (2),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (5)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (35),			/*			    HI */
+   COSTS_N_INSNS (51),			/*			    SI */
+   COSTS_N_INSNS (83),			/*			    DI */
+   COSTS_N_INSNS (83)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  9,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {3, 4, 3},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {3, 4, 3},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {4, 4, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {3, 3},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {4, 4, 3},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 4, 5},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  3,					/* MMX or SSE register to integer */
+					/* On K8:
+					   MOVD reg64, xmmreg Double FSTORE 4
+					   MOVD reg32, xmmreg Double FSTORE 4
+					   On AMDFAM10:
+					   MOVD reg64, xmmreg Double FADD 3
+							       1/1  1/1
+					    MOVD reg32, xmmreg Double FADD 3
+							       1/1  1/1 */
+  32,					/* size of l1 cache.  */
+  2048,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  100,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (4),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (19),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+  btver2_memcpy,
+  btver2_memset,
+  4,					/* scalar_stmt_cost.  */
+  2,					/* scalar load_cost.  */
+  2,					/* scalar_store_cost.  */
+  6,					/* vec_stmt_cost.  */
+  0,					/* vec_to_scalar_cost.  */
+  2,					/* scalar_to_vec_cost.  */
+  2,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  2,					/* vec_store_cost.  */
+  2,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+static stringop_algs pentium4_memcpy[2] = {
+  {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
+  DUMMY_STRINGOP_ALGS};
+static stringop_algs pentium4_memset[2] = {
+  {libcall, {{6, loop_1_byte, false}, {48, loop, false},
+             {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  DUMMY_STRINGOP_ALGS};
+
+static const
+struct processor_costs pentium4_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (3),			/* cost of a lea instruction */
+  COSTS_N_INSNS (4),			/* variable shift costs */
+  COSTS_N_INSNS (4),			/* constant shift costs */
+  {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (15),			/*				 HI */
+   COSTS_N_INSNS (15),			/*				 SI */
+   COSTS_N_INSNS (15),			/*				 DI */
+   COSTS_N_INSNS (15)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (56),			/*			    HI */
+   COSTS_N_INSNS (56),			/*			    SI */
+   COSTS_N_INSNS (56),			/*			    DI */
+   COSTS_N_INSNS (56)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  16,					/* "large" insn */
+  6,					/* MOVE_RATIO */
+  2,				     /* cost for loading QImode using movzbl */
+  {4, 5, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {2, 3, 2},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {2, 2, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {2, 2},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {2, 2},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  12,					/* cost of moving SSE register */
+  {12, 12, 12},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {2, 2, 8},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  10,					/* MMX or SSE register to integer */
+  8,					/* size of l1 cache.  */
+  256,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  2,					/* Branch cost */
+  COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (43),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
+  pentium4_memcpy,
+  pentium4_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+static stringop_algs nocona_memcpy[2] = {
+  {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
+             {100000, unrolled_loop, false}, {-1, libcall, false}}}};
+
+static stringop_algs nocona_memset[2] = {
+  {libcall, {{6, loop_1_byte, false}, {48, loop, false},
+             {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{24, loop, false}, {64, unrolled_loop, false},
+             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+
+static const
+struct processor_costs nocona_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (10),			/*				 HI */
+   COSTS_N_INSNS (10),			/*				 SI */
+   COSTS_N_INSNS (10),			/*				 DI */
+   COSTS_N_INSNS (10)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (66),			/*			    HI */
+   COSTS_N_INSNS (66),			/*			    SI */
+   COSTS_N_INSNS (66),			/*			    DI */
+   COSTS_N_INSNS (66)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  16,					/* "large" insn */
+  17,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  3,					/* cost of reg,reg fld/fst */
+  {12, 12, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 4},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  6,					/* cost of moving MMX register */
+  {12, 12},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {12, 12},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  6,					/* cost of moving SSE register */
+  {12, 12, 12},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {12, 12, 12},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  8,					/* MMX or SSE register to integer */
+  8,					/* size of l1 cache.  */
+  1024,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  8,					/* number of parallel prefetches */
+  1,					/* Branch cost */
+  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (40),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
+  nocona_memcpy,
+  nocona_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+static stringop_algs atom_memcpy[2] = {
+  {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static stringop_algs atom_memset[2] = {
+  {libcall, {{8, loop, false}, {15, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs atom_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (2)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (26),			/*			    HI */
+   COSTS_N_INSNS (42),			/*			    SI */
+   COSTS_N_INSNS (74),			/*			    DI */
+   COSTS_N_INSNS (74)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  17,					/* MOVE_RATIO */
+  4,					/* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {8, 8, 8},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {8, 8, 8},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  5,					/* MMX or SSE register to integer */
+  32,					/* size of l1 cache.  */
+  256,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  atom_memcpy,
+  atom_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+static stringop_algs slm_memcpy[2] = {
+  {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static stringop_algs slm_memset[2] = {
+  {libcall, {{8, loop, false}, {15, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs slm_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (3),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (2)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (26),			/*			    HI */
+   COSTS_N_INSNS (42),			/*			    SI */
+   COSTS_N_INSNS (74),			/*			    DI */
+   COSTS_N_INSNS (74)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  17,					/* MOVE_RATIO */
+  4,					/* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {8, 8, 8},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {8, 8, 8},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  5,					/* MMX or SSE register to integer */
+  32,					/* size of l1 cache.  */
+  256,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  slm_memcpy,
+  slm_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+static stringop_algs intel_memcpy[2] = {
+  {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+  {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static stringop_algs intel_memset[2] = {
+  {libcall, {{8, loop, false}, {15, unrolled_loop, false},
+             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+  {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
+static const
+struct processor_costs intel_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (3),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (2)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (26),			/*			    HI */
+   COSTS_N_INSNS (42),			/*			    SI */
+   COSTS_N_INSNS (74),			/*			    DI */
+   COSTS_N_INSNS (74)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  17,					/* MOVE_RATIO */
+  4,					/* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {8, 8, 8},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {8, 8, 8},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  5,					/* MMX or SSE register to integer */
+  32,					/* size of l1 cache.  */
+  256,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  intel_memcpy,
+  intel_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/* Generic should produce code tuned for Core-i7 (and newer chips)
+   and btver1 (and newer chips).  */
+
+static stringop_algs generic_memcpy[2] = {
+  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+             {-1, libcall, false}}},
+  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static stringop_algs generic_memset[2] = {
+  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+             {-1, libcall, false}}},
+  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+             {-1, libcall, false}}}};
+static const
+struct processor_costs generic_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  /* On all chips taken into consideration lea is 2 cycles and more.  With
+     this cost however our current implementation of synth_mult results in
+     use of unnecessary temporary registers causing regression on several
+     SPECfp benchmarks.  */
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (2)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (26),			/*			    HI */
+   COSTS_N_INSNS (42),			/*			    SI */
+   COSTS_N_INSNS (74),			/*			    DI */
+   COSTS_N_INSNS (74)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  17,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {8, 8, 8},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {8, 8, 8},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  5,					/* MMX or SSE register to integer */
+  32,					/* size of l1 cache.  */
+  512,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
+     value is increased to perhaps more appropriate value of 5.  */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  generic_memcpy,
+  generic_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/* core_cost should produce code tuned for Core familly of CPUs.  */
+static stringop_algs core_memcpy[2] = {
+  {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
+  {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
+             {-1, libcall, false}}}};
+static stringop_algs core_memset[2] = {
+  {libcall, {{6, loop_1_byte, true},
+             {24, loop, true},
+             {8192, rep_prefix_4_byte, true},
+             {-1, libcall, false}}},
+  {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
+             {-1, libcall, false}}}};
+
+static const
+struct processor_costs core_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  /* On all chips taken into consideration lea is 2 cycles and more.  With
+     this cost however our current implementation of synth_mult results in
+     use of unnecessary temporary registers causing regression on several
+     SPECfp benchmarks.  */
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (2)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (26),			/*			    HI */
+   COSTS_N_INSNS (42),			/*			    SI */
+   COSTS_N_INSNS (74),			/*			    DI */
+   COSTS_N_INSNS (74)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  17,					/* MOVE_RATIO */
+  4,				     /* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {8, 8, 8},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {8, 8, 8},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  5,					/* MMX or SSE register to integer */
+  64,					/* size of l1 cache.  */
+  512,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  /* FIXME perhaps more appropriate value is 5.  */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  core_memcpy,
+  core_memset,
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+
+/* Set by -mtune.  */
+const struct processor_costs *ix86_tune_cost = &pentium_cost;
+
+/* Set by -mtune or -Os.  */
+const struct processor_costs *ix86_cost = &pentium_cost;
+
+/* Processor feature/optimization bitmasks.  */
+#define m_386 (1<<PROCESSOR_I386)
+#define m_486 (1<<PROCESSOR_I486)
+#define m_PENT (1<<PROCESSOR_PENTIUM)
+#define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
+#define m_PENT4 (1<<PROCESSOR_PENTIUM4)
+#define m_NOCONA (1<<PROCESSOR_NOCONA)
+#define m_P4_NOCONA (m_PENT4 | m_NOCONA)
+#define m_CORE2 (1<<PROCESSOR_CORE2)
+#define m_NEHALEM (1<<PROCESSOR_NEHALEM)
+#define m_SANDYBRIDGE (1<<PROCESSOR_SANDYBRIDGE)
+#define m_HASWELL (1<<PROCESSOR_HASWELL)
+#define m_CORE_ALL (m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE | m_HASWELL)
+#define m_BONNELL (1<<PROCESSOR_BONNELL)
+#define m_SILVERMONT (1<<PROCESSOR_SILVERMONT)
+#define m_INTEL (1<<PROCESSOR_INTEL)
+
+#define m_GEODE (1<<PROCESSOR_GEODE)
+#define m_K6 (1<<PROCESSOR_K6)
+#define m_K6_GEODE (m_K6 | m_GEODE)
+#define m_K8 (1<<PROCESSOR_K8)
+#define m_ATHLON (1<<PROCESSOR_ATHLON)
+#define m_ATHLON_K8 (m_K8 | m_ATHLON)
+#define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
+#define m_BDVER1 (1<<PROCESSOR_BDVER1)
+#define m_BDVER2 (1<<PROCESSOR_BDVER2)
+#define m_BDVER3 (1<<PROCESSOR_BDVER3)
+#define m_BDVER4 (1<<PROCESSOR_BDVER4)
+#define m_BTVER1 (1<<PROCESSOR_BTVER1)
+#define m_BTVER2 (1<<PROCESSOR_BTVER2)
+#define m_BDVER	(m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
+#define m_BTVER (m_BTVER1 | m_BTVER2)
+#define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
+
+#define m_GENERIC (1<<PROCESSOR_GENERIC)
+
+const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
+#undef DEF_TUNE
+#define DEF_TUNE(tune, name, selector) name,
+#include "x86-tune.def"
+#undef DEF_TUNE
+};
+
+/* Feature tests against the various tunings.  */
+unsigned char ix86_tune_features[X86_TUNE_LAST];
+
+/* Feature tests against the various tunings used to create ix86_tune_features
+   based on the processor mask.  */
+static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
+#undef DEF_TUNE
+#define DEF_TUNE(tune, name, selector) selector,
+#include "x86-tune.def"
+#undef DEF_TUNE
+};
+
+/* Feature tests against the various architecture variations.  */
+unsigned char ix86_arch_features[X86_ARCH_LAST];
+
+/* Feature tests against the various architecture variations, used to create
+   ix86_arch_features based on the processor mask.  */
+static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
+  /* X86_ARCH_CMOV: Conditional move was added for pentiumpro.  */
+  ~(m_386 | m_486 | m_PENT | m_K6),
+
+  /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
+  ~m_386,
+
+  /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
+  ~(m_386 | m_486),
+
+  /* X86_ARCH_XADD: Exchange and add was added for 80486.  */
+  ~m_386,
+
+  /* X86_ARCH_BSWAP: Byteswap was added for 80486.  */
+  ~m_386,
+};
+
+/* In case the average insn count for single function invocation is
+   lower than this constant, emit fast (but longer) prologue and
+   epilogue code.  */
+#define FAST_PROLOGUE_INSN_COUNT 20
+
+/* Names for 8 (low), 8 (high), and 16-bit registers, respectively.  */
+static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
+static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
+static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
+
+/* Array of the smallest class containing reg number REGNO, indexed by
+   REGNO.  Used by REGNO_REG_CLASS in i386.h.  */
+
+enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
+{
+  /* ax, dx, cx, bx */
+  AREG, DREG, CREG, BREG,
+  /* si, di, bp, sp */
+  SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
+  /* FP registers */
+  FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
+  FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
+  /* arg pointer */
+  NON_Q_REGS,
+  /* flags, fpsr, fpcr, frame */
+  NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
+  /* SSE registers */
+  SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
+  SSE_REGS, SSE_REGS,
+  /* MMX registers */
+  MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
+  MMX_REGS, MMX_REGS,
+  /* REX registers */
+  NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
+  NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
+  /* SSE REX registers */
+  SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
+  SSE_REGS, SSE_REGS,
+  /* AVX-512 SSE registers */
+  EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
+  EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
+  EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
+  EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
+  /* Mask registers.  */
+  MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
+  MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
+};
+
+/* The "default" register map used in 32bit mode.  */
+
+int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
+{
+  0, 2, 1, 3, 6, 7, 4, 5,		/* general regs */
+  12, 13, 14, 15, 16, 17, 18, 19,	/* fp regs */
+  -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
+  21, 22, 23, 24, 25, 26, 27, 28,	/* SSE */
+  29, 30, 31, 32, 33, 34, 35, 36,       /* MMX */
+  -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
+  -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
+  -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 16-23*/
+  -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 24-31*/
+  93, 94, 95, 96, 97, 98, 99, 100,      /* Mask registers */
+};
+
+/* The "default" register map used in 64bit mode.  */
+
+int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
+{
+  0, 1, 2, 3, 4, 5, 6, 7,		/* general regs */
+  33, 34, 35, 36, 37, 38, 39, 40,	/* fp regs */
+  -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
+  17, 18, 19, 20, 21, 22, 23, 24,	/* SSE */
+  41, 42, 43, 44, 45, 46, 47, 48,       /* MMX */
+  8,9,10,11,12,13,14,15,		/* extended integer registers */
+  25, 26, 27, 28, 29, 30, 31, 32,	/* extended SSE registers */
+  67, 68, 69, 70, 71, 72, 73, 74,       /* AVX-512 registers 16-23 */
+  75, 76, 77, 78, 79, 80, 81, 82,       /* AVX-512 registers 24-31 */
+  118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
+};
+
+/* Define the register numbers to be used in Dwarf debugging information.
+   The SVR4 reference port C compiler uses the following register numbers
+   in its Dwarf output code:
+	0 for %eax (gcc regno = 0)
+	1 for %ecx (gcc regno = 2)
+	2 for %edx (gcc regno = 1)
+	3 for %ebx (gcc regno = 3)
+	4 for %esp (gcc regno = 7)
+	5 for %ebp (gcc regno = 6)
+	6 for %esi (gcc regno = 4)
+	7 for %edi (gcc regno = 5)
+   The following three DWARF register numbers are never generated by
+   the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
+   believes these numbers have these meanings.
+	8  for %eip    (no gcc equivalent)
+	9  for %eflags (gcc regno = 17)
+	10 for %trapno (no gcc equivalent)
+   It is not at all clear how we should number the FP stack registers
+   for the x86 architecture.  If the version of SDB on x86/svr4 were
+   a bit less brain dead with respect to floating-point then we would
+   have a precedent to follow with respect to DWARF register numbers
+   for x86 FP registers, but the SDB on x86/svr4 is so completely
+   broken with respect to FP registers that it is hardly worth thinking
+   of it as something to strive for compatibility with.
+   The version of x86/svr4 SDB I have at the moment does (partially)
+   seem to believe that DWARF register number 11 is associated with
+   the x86 register %st(0), but that's about all.  Higher DWARF
+   register numbers don't seem to be associated with anything in
+   particular, and even for DWARF regno 11, SDB only seems to under-
+   stand that it should say that a variable lives in %st(0) (when
+   asked via an `=' command) if we said it was in DWARF regno 11,
+   but SDB still prints garbage when asked for the value of the
+   variable in question (via a `/' command).
+   (Also note that the labels SDB prints for various FP stack regs
+   when doing an `x' command are all wrong.)
+   Note that these problems generally don't affect the native SVR4
+   C compiler because it doesn't allow the use of -O with -g and
+   because when it is *not* optimizing, it allocates a memory
+   location for each floating-point variable, and the memory
+   location is what gets described in the DWARF AT_location
+   attribute for the variable in question.
+   Regardless of the severe mental illness of the x86/svr4 SDB, we
+   do something sensible here and we use the following DWARF
+   register numbers.  Note that these are all stack-top-relative
+   numbers.
+	11 for %st(0) (gcc regno = 8)
+	12 for %st(1) (gcc regno = 9)
+	13 for %st(2) (gcc regno = 10)
+	14 for %st(3) (gcc regno = 11)
+	15 for %st(4) (gcc regno = 12)
+	16 for %st(5) (gcc regno = 13)
+	17 for %st(6) (gcc regno = 14)
+	18 for %st(7) (gcc regno = 15)
+*/
+int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
+{
+  0, 2, 1, 3, 6, 7, 5, 4,		/* general regs */
+  11, 12, 13, 14, 15, 16, 17, 18,	/* fp regs */
+  -1, 9, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
+  21, 22, 23, 24, 25, 26, 27, 28,	/* SSE registers */
+  29, 30, 31, 32, 33, 34, 35, 36,	/* MMX registers */
+  -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
+  -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
+  -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 16-23*/
+  -1, -1, -1, -1, -1, -1, -1, -1,       /* AVX-512 registers 24-31*/
+  93, 94, 95, 96, 97, 98, 99, 100,      /* Mask registers */
+};
+
+/* Define parameter passing and return registers.  */
+
+static int const x86_64_int_parameter_registers[6] =
+{
+  DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
+};
+
+static int const x86_64_ms_abi_int_parameter_registers[4] =
+{
+  CX_REG, DX_REG, R8_REG, R9_REG
+};
+
+static int const x86_64_int_return_registers[4] =
+{
+  AX_REG, DX_REG, DI_REG, SI_REG
+};
+
+/* Additional registers that are clobbered by SYSV calls.  */
+
+int const x86_64_ms_sysv_extra_clobbered_registers[12] =
+{
+  SI_REG, DI_REG,
+  XMM6_REG, XMM7_REG,
+  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
+  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
+};
+
+/* Define the structure for the machine field in struct function.  */
+
+struct GTY(()) stack_local_entry {
+  unsigned short mode;
+  unsigned short n;
+  rtx rtl;
+  struct stack_local_entry *next;
+};
+
+/* Structure describing stack frame layout.
+   Stack grows downward:
+
+   [arguments]
+					<- ARG_POINTER
+   saved pc
+
+   saved static chain			if ix86_static_chain_on_stack
+
+   saved frame pointer			if frame_pointer_needed
+					<- HARD_FRAME_POINTER
+   [saved regs]
+					<- regs_save_offset
+   [padding0]
+
+   [saved SSE regs]
+					<- sse_regs_save_offset
+   [padding1]          |
+		       |		<- FRAME_POINTER
+   [va_arg registers]  |
+		       |
+   [frame]	       |
+		       |
+   [padding2]	       | = to_allocate
+					<- STACK_POINTER
+  */
+struct ix86_frame
+{
+  int nsseregs;
+  int nregs;
+  int va_arg_size;
+  int red_zone_size;
+  int outgoing_arguments_size;
+
+  /* The offsets relative to ARG_POINTER.  */
+  HOST_WIDE_INT frame_pointer_offset;
+  HOST_WIDE_INT hard_frame_pointer_offset;
+  HOST_WIDE_INT stack_pointer_offset;
+  HOST_WIDE_INT hfp_save_offset;
+  HOST_WIDE_INT reg_save_offset;
+  HOST_WIDE_INT sse_reg_save_offset;
+
+  /* When save_regs_using_mov is set, emit prologue using
+     move instead of push instructions.  */
+  bool save_regs_using_mov;
+};
+
+/* Which cpu are we scheduling for.  */
+enum attr_cpu ix86_schedule;
+
+/* Which cpu are we optimizing for.  */
+enum processor_type ix86_tune;
+
+/* Which instruction set architecture to use.  */
+enum processor_type ix86_arch;
+
+/* True if processor has SSE prefetch instruction.  */
+unsigned char x86_prefetch_sse;
+
+/* -mstackrealign option */
+static const char ix86_force_align_arg_pointer_string[]
+  = "force_align_arg_pointer";
+
+static rtx (*ix86_gen_leave) (void);
+static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
+static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
+static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
+static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
+static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
+static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
+static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
+static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
+static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
+static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
+static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
+
+/* Preferred alignment for stack boundary in bits.  */
+unsigned int ix86_preferred_stack_boundary;
+
+/* Alignment for incoming stack boundary in bits specified at
+   command line.  */
+static unsigned int ix86_user_incoming_stack_boundary;
+
+/* Default alignment for incoming stack boundary in bits.  */
+static unsigned int ix86_default_incoming_stack_boundary;
+
+/* Alignment for incoming stack boundary in bits.  */
+unsigned int ix86_incoming_stack_boundary;
+
+/* Calling abi specific va_list type nodes.  */
+static GTY(()) tree sysv_va_list_type_node;
+static GTY(()) tree ms_va_list_type_node;
+
+/* Prefix built by ASM_GENERATE_INTERNAL_LABEL.  */
+char internal_label_prefix[16];
+int internal_label_prefix_len;
+
+/* Fence to use after loop using movnt.  */
+tree x86_mfence;
+
+/* Register class used for passing given 64bit part of the argument.
+   These represent classes as documented by the PS ABI, with the exception
+   of SSESF, SSEDF classes, that are basically SSE class, just gcc will
+   use SF or DFmode move instead of DImode to avoid reformatting penalties.
+
+   Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
+   whenever possible (upper half does contain padding).  */
+enum x86_64_reg_class
+  {
+    X86_64_NO_CLASS,
+    X86_64_INTEGER_CLASS,
+    X86_64_INTEGERSI_CLASS,
+    X86_64_SSE_CLASS,
+    X86_64_SSESF_CLASS,
+    X86_64_SSEDF_CLASS,
+    X86_64_SSEUP_CLASS,
+    X86_64_X87_CLASS,
+    X86_64_X87UP_CLASS,
+    X86_64_COMPLEX_X87_CLASS,
+    X86_64_MEMORY_CLASS
+  };
+
+#define MAX_CLASSES 8
+
+/* Table of constants used by fldpi, fldln2, etc....  */
+static REAL_VALUE_TYPE ext_80387_constants_table [5];
+static bool ext_80387_constants_init = 0;
+
+
+static struct machine_function * ix86_init_machine_status (void);
+static rtx ix86_function_value (const_tree, const_tree, bool);
+static bool ix86_function_value_regno_p (const unsigned int);
+static unsigned int ix86_function_arg_boundary (enum machine_mode,
+						const_tree);
+static rtx ix86_static_chain (const_tree, bool);
+static int ix86_function_regparm (const_tree, const_tree);
+static void ix86_compute_frame_layout (struct ix86_frame *);
+static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
+						 rtx, rtx, int);
+static void ix86_add_new_builtins (HOST_WIDE_INT);
+static tree ix86_canonical_va_list_type (tree);
+static void predict_jump (int);
+static unsigned int split_stack_prologue_scratch_regno (void);
+static bool i386_asm_output_addr_const_extra (FILE *, rtx);
+
+enum ix86_function_specific_strings
+{
+  IX86_FUNCTION_SPECIFIC_ARCH,
+  IX86_FUNCTION_SPECIFIC_TUNE,
+  IX86_FUNCTION_SPECIFIC_MAX
+};
+
+static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
+				 const char *, enum fpmath_unit, bool);
+static void ix86_function_specific_save (struct cl_target_option *,
+					 struct gcc_options *opts);
+static void ix86_function_specific_restore (struct gcc_options *opts,
+					    struct cl_target_option *);
+static void ix86_function_specific_print (FILE *, int,
+					  struct cl_target_option *);
+static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
+static bool ix86_valid_target_attribute_inner_p (tree, char *[],
+						 struct gcc_options *,
+						 struct gcc_options *,
+						 struct gcc_options *);
+static bool ix86_can_inline_p (tree, tree);
+static void ix86_set_current_function (tree);
+static unsigned int ix86_minimum_incoming_stack_boundary (bool);
+
+static enum calling_abi ix86_function_abi (const_tree);
+
+
+#ifndef SUBTARGET32_DEFAULT_CPU
+#define SUBTARGET32_DEFAULT_CPU "i386"
+#endif
+
+/* Whether -mtune= or -march= were specified */
+static int ix86_tune_defaulted;
+static int ix86_arch_specified;
+
+/* Vectorization library interface and handlers.  */
+static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
+
+static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
+static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
+
+/* Processor target table, indexed by processor number */
+struct ptt
+{
+  const char *const name;			/* processor name  */
+  const struct processor_costs *cost;		/* Processor costs */
+  const int align_loop;				/* Default alignments.  */
+  const int align_loop_max_skip;
+  const int align_jump;
+  const int align_jump_max_skip;
+  const int align_func;
+};
+
+/* This table must be in sync with enum processor_type in i386.h.  */ 
+static const struct ptt processor_target_table[PROCESSOR_max] =
+{
+  {"generic", &generic_cost, 16, 10, 16, 10, 16},
+  {"i386", &i386_cost, 4, 3, 4, 3, 4},
+  {"i486", &i486_cost, 16, 15, 16, 15, 16},
+  {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
+  {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
+  {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
+  {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
+  {"core2", &core_cost, 16, 10, 16, 10, 16},
+  {"nehalem", &core_cost, 16, 10, 16, 10, 16},
+  {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
+  {"haswell", &core_cost, 16, 10, 16, 10, 16},
+  {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
+  {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
+  {"intel", &intel_cost, 16, 15, 16, 7, 16},
+  {"geode", &geode_cost, 0, 0, 0, 0, 0},
+  {"k6", &k6_cost, 32, 7, 32, 7, 32},
+  {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
+  {"k8", &k8_cost, 16, 7, 16, 7, 16},
+  {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
+  {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
+  {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
+  {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
+  {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
+  {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
+  {"btver2", &btver2_cost, 16, 10, 16, 7, 11}
+};
+
+static bool
+gate_insert_vzeroupper (void)
+{
+  return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
+}
+
+static unsigned int
+rest_of_handle_insert_vzeroupper (void)
+{
+  int i;
+
+  /* vzeroupper instructions are inserted immediately after reload to
+     account for possible spills from 256bit registers.  The pass
+     reuses mode switching infrastructure by re-running mode insertion
+     pass, so disable entities that have already been processed.  */
+  for (i = 0; i < MAX_386_ENTITIES; i++)
+    ix86_optimize_mode_switching[i] = 0;
+
+  ix86_optimize_mode_switching[AVX_U128] = 1;
+
+  /* Call optimize_mode_switching.  */
+  g->get_passes ()->execute_pass_mode_switching ();
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_insert_vzeroupper =
+{
+  RTL_PASS, /* type */
+  "vzeroupper", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  true, /* has_gate */
+  true, /* has_execute */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
+};
+
+class pass_insert_vzeroupper : public rtl_opt_pass
+{
+public:
+  pass_insert_vzeroupper(gcc::context *ctxt)
+    : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate () { return gate_insert_vzeroupper (); }
+  unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
+
+}; // class pass_insert_vzeroupper
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_insert_vzeroupper (gcc::context *ctxt)
+{
+  return new pass_insert_vzeroupper (ctxt);
+}
+
+/* Return true if a red-zone is in use.  */
+
+static inline bool
+ix86_using_red_zone (void)
+{
+  return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
+}
+
+/* Return a string that documents the current -m options.  The caller is
+   responsible for freeing the string.  */
+
+static char *
+ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
+		    const char *tune, enum fpmath_unit fpmath,
+		    bool add_nl_p)
+{
+  struct ix86_target_opts
+  {
+    const char *option;		/* option string */
+    HOST_WIDE_INT mask;		/* isa mask options */
+  };
+
+  /* This table is ordered so that options like -msse4.2 that imply
+     preceding options while match those first.  */
+  static struct ix86_target_opts isa_opts[] =
+  {
+    { "-mfma4",		OPTION_MASK_ISA_FMA4 },
+    { "-mfma",		OPTION_MASK_ISA_FMA },
+    { "-mxop",		OPTION_MASK_ISA_XOP },
+    { "-mlwp",		OPTION_MASK_ISA_LWP },
+    { "-mavx512f",	OPTION_MASK_ISA_AVX512F },
+    { "-mavx512er",	OPTION_MASK_ISA_AVX512ER },
+    { "-mavx512cd",	OPTION_MASK_ISA_AVX512CD },
+    { "-mavx512pf",	OPTION_MASK_ISA_AVX512PF },
+    { "-msse4a",	OPTION_MASK_ISA_SSE4A },
+    { "-msse4.2",	OPTION_MASK_ISA_SSE4_2 },
+    { "-msse4.1",	OPTION_MASK_ISA_SSE4_1 },
+    { "-mssse3",	OPTION_MASK_ISA_SSSE3 },
+    { "-msse3",		OPTION_MASK_ISA_SSE3 },
+    { "-msse2",		OPTION_MASK_ISA_SSE2 },
+    { "-msse",		OPTION_MASK_ISA_SSE },
+    { "-m3dnow",	OPTION_MASK_ISA_3DNOW },
+    { "-m3dnowa",	OPTION_MASK_ISA_3DNOW_A },
+    { "-mmmx",		OPTION_MASK_ISA_MMX },
+    { "-mabm",		OPTION_MASK_ISA_ABM },
+    { "-mbmi",		OPTION_MASK_ISA_BMI },
+    { "-mbmi2",		OPTION_MASK_ISA_BMI2 },
+    { "-mlzcnt",	OPTION_MASK_ISA_LZCNT },
+    { "-mhle",		OPTION_MASK_ISA_HLE },
+    { "-mfxsr",		OPTION_MASK_ISA_FXSR },
+    { "-mrdseed",	OPTION_MASK_ISA_RDSEED },
+    { "-mprfchw",	OPTION_MASK_ISA_PRFCHW },
+    { "-madx",		OPTION_MASK_ISA_ADX },
+    { "-mtbm",		OPTION_MASK_ISA_TBM },
+    { "-mpopcnt",	OPTION_MASK_ISA_POPCNT },
+    { "-mmovbe",	OPTION_MASK_ISA_MOVBE },
+    { "-mcrc32",	OPTION_MASK_ISA_CRC32 },
+    { "-maes",		OPTION_MASK_ISA_AES },
+    { "-msha",		OPTION_MASK_ISA_SHA },
+    { "-mpclmul",	OPTION_MASK_ISA_PCLMUL },
+    { "-mfsgsbase",	OPTION_MASK_ISA_FSGSBASE },
+    { "-mrdrnd",	OPTION_MASK_ISA_RDRND },
+    { "-mf16c",		OPTION_MASK_ISA_F16C },
+    { "-mrtm",		OPTION_MASK_ISA_RTM },
+    { "-mxsave",	OPTION_MASK_ISA_XSAVE },
+    { "-mxsaveopt",	OPTION_MASK_ISA_XSAVEOPT },
+    { "-mprefetchwt1",	OPTION_MASK_ISA_PREFETCHWT1 },
+  };
+
+  /* Flag options.  */
+  static struct ix86_target_opts flag_opts[] =
+  {
+    { "-m128bit-long-double",		MASK_128BIT_LONG_DOUBLE },
+    { "-mlong-double-128",		MASK_LONG_DOUBLE_128 },
+    { "-mlong-double-64",		MASK_LONG_DOUBLE_64 },
+    { "-m80387",			MASK_80387 },
+    { "-maccumulate-outgoing-args",	MASK_ACCUMULATE_OUTGOING_ARGS },
+    { "-malign-double",			MASK_ALIGN_DOUBLE },
+    { "-mcld",				MASK_CLD },
+    { "-mfp-ret-in-387",		MASK_FLOAT_RETURNS },
+    { "-mieee-fp",			MASK_IEEE_FP },
+    { "-minline-all-stringops",		MASK_INLINE_ALL_STRINGOPS },
+    { "-minline-stringops-dynamically",	MASK_INLINE_STRINGOPS_DYNAMICALLY },
+    { "-mms-bitfields",			MASK_MS_BITFIELD_LAYOUT },
+    { "-mno-align-stringops",		MASK_NO_ALIGN_STRINGOPS },
+    { "-mno-fancy-math-387",		MASK_NO_FANCY_MATH_387 },
+    { "-mno-push-args",			MASK_NO_PUSH_ARGS },
+    { "-mno-red-zone",			MASK_NO_RED_ZONE },
+    { "-momit-leaf-frame-pointer",	MASK_OMIT_LEAF_FRAME_POINTER },
+    { "-mrecip",			MASK_RECIP },
+    { "-mrtd",				MASK_RTD },
+    { "-msseregparm",			MASK_SSEREGPARM },
+    { "-mstack-arg-probe",		MASK_STACK_PROBE },
+    { "-mtls-direct-seg-refs",		MASK_TLS_DIRECT_SEG_REFS },
+    { "-mvect8-ret-in-mem",		MASK_VECT8_RETURNS },
+    { "-m8bit-idiv",			MASK_USE_8BIT_IDIV },
+    { "-mvzeroupper",			MASK_VZEROUPPER },
+    { "-mavx256-split-unaligned-load",	MASK_AVX256_SPLIT_UNALIGNED_LOAD},
+    { "-mavx256-split-unaligned-store",	MASK_AVX256_SPLIT_UNALIGNED_STORE},
+    { "-mprefer-avx128",		MASK_PREFER_AVX128},
+  };
+
+  const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
+
+  char isa_other[40];
+  char target_other[40];
+  unsigned num = 0;
+  unsigned i, j;
+  char *ret;
+  char *ptr;
+  size_t len;
+  size_t line_len;
+  size_t sep_len;
+  const char *abi;
+
+  memset (opts, '\0', sizeof (opts));
+
+  /* Add -march= option.  */
+  if (arch)
+    {
+      opts[num][0] = "-march=";
+      opts[num++][1] = arch;
+    }
+
+  /* Add -mtune= option.  */
+  if (tune)
+    {
+      opts[num][0] = "-mtune=";
+      opts[num++][1] = tune;
+    }
+
+  /* Add -m32/-m64/-mx32.  */
+  if ((isa & OPTION_MASK_ISA_64BIT) != 0)
+    {
+      if ((isa & OPTION_MASK_ABI_64) != 0)
+	abi = "-m64";
+      else
+	abi = "-mx32";
+      isa &= ~ (OPTION_MASK_ISA_64BIT
+		| OPTION_MASK_ABI_64
+		| OPTION_MASK_ABI_X32);
+    }
+  else
+    abi = "-m32";
+  opts[num++][0] = abi;
+
+  /* Pick out the options in isa options.  */
+  for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
+    {
+      if ((isa & isa_opts[i].mask) != 0)
+	{
+	  opts[num++][0] = isa_opts[i].option;
+	  isa &= ~ isa_opts[i].mask;
+	}
+    }
+
+  if (isa && add_nl_p)
+    {
+      opts[num++][0] = isa_other;
+      sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
+	       isa);
+    }
+
+  /* Add flag options.  */
+  for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
+    {
+      if ((flags & flag_opts[i].mask) != 0)
+	{
+	  opts[num++][0] = flag_opts[i].option;
+	  flags &= ~ flag_opts[i].mask;
+	}
+    }
+
+  if (flags && add_nl_p)
+    {
+      opts[num++][0] = target_other;
+      sprintf (target_other, "(other flags: %#x)", flags);
+    }
+
+  /* Add -fpmath= option.  */
+  if (fpmath)
+    {
+      opts[num][0] = "-mfpmath=";
+      switch ((int) fpmath)
+	{
+	case FPMATH_387:
+	  opts[num++][1] = "387";
+	  break;
+
+	case FPMATH_SSE:
+	  opts[num++][1] = "sse";
+	  break;
+
+	case FPMATH_387 | FPMATH_SSE:
+	  opts[num++][1] = "sse+387";
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+
+  /* Any options?  */
+  if (num == 0)
+    return NULL;
+
+  gcc_assert (num < ARRAY_SIZE (opts));
+
+  /* Size the string.  */
+  len = 0;
+  sep_len = (add_nl_p) ? 3 : 1;
+  for (i = 0; i < num; i++)
+    {
+      len += sep_len;
+      for (j = 0; j < 2; j++)
+	if (opts[i][j])
+	  len += strlen (opts[i][j]);
+    }
+
+  /* Build the string.  */
+  ret = ptr = (char *) xmalloc (len);
+  line_len = 0;
+
+  for (i = 0; i < num; i++)
+    {
+      size_t len2[2];
+
+      for (j = 0; j < 2; j++)
+	len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
+
+      if (i != 0)
+	{
+	  *ptr++ = ' ';
+	  line_len++;
+
+	  if (add_nl_p && line_len + len2[0] + len2[1] > 70)
+	    {
+	      *ptr++ = '\\';
+	      *ptr++ = '\n';
+	      line_len = 0;
+	    }
+	}
+
+      for (j = 0; j < 2; j++)
+	if (opts[i][j])
+	  {
+	    memcpy (ptr, opts[i][j], len2[j]);
+	    ptr += len2[j];
+	    line_len += len2[j];
+	  }
+    }
+
+  *ptr = '\0';
+  gcc_assert (ret + len >= ptr);
+
+  return ret;
+}
+
+/* Return true, if profiling code should be emitted before
+   prologue. Otherwise it returns false.
+   Note: For x86 with "hotfix" it is sorried.  */
+static bool
+ix86_profile_before_prologue (void)
+{
+  return flag_fentry != 0;
+}
+
+/* Function that is callable from the debugger to print the current
+   options.  */
+void ATTRIBUTE_UNUSED
+ix86_debug_options (void)
+{
+  char *opts = ix86_target_string (ix86_isa_flags, target_flags,
+				   ix86_arch_string, ix86_tune_string,
+				   ix86_fpmath, true);
+
+  if (opts)
+    {
+      fprintf (stderr, "%s\n\n", opts);
+      free (opts);
+    }
+  else
+    fputs ("<no options>\n\n", stderr);
+
+  return;
+}
+
+static const char *stringop_alg_names[] = {
+#define DEF_ENUM
+#define DEF_ALG(alg, name) #name,
+#include "stringop.def"
+#undef DEF_ENUM
+#undef DEF_ALG
+};
+
+/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
+   The string is of the following form (or comma separated list of it):
+
+     strategy_alg:max_size:[align|noalign]
+
+   where the full size range for the strategy is either [0, max_size] or
+   [min_size, max_size], in which min_size is the max_size + 1 of the
+   preceding range.  The last size range must have max_size == -1.
+
+   Examples:
+
+    1.
+       -mmemcpy-strategy=libcall:-1:noalign
+
+      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
+
+
+   2.
+      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
+
+      This is to tell the compiler to use the following strategy for memset
+      1) when the expected size is between [1, 16], use rep_8byte strategy;
+      2) when the size is between [17, 2048], use vector_loop;
+      3) when the size is > 2048, use libcall.  */
+
+struct stringop_size_range
+{
+  int max;
+  stringop_alg alg;
+  bool noalign;
+};
+
+static void
+ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
+{
+  const struct stringop_algs *default_algs;
+  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
+  char *curr_range_str, *next_range_str;
+  int i = 0, n = 0;
+
+  if (is_memset)
+    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
+  else
+    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+
+  curr_range_str = strategy_str;
+
+  do
+    {
+      int maxs;
+      char alg_name[128];
+      char align[16];
+      next_range_str = strchr (curr_range_str, ',');
+      if (next_range_str)
+        *next_range_str++ = '\0';
+
+      if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
+                       alg_name, &maxs, align))
+        {
+          error ("wrong arg %s to option %s", curr_range_str,
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+
+      if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
+        {
+          error ("size ranges of option %s should be increasing",
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+
+      for (i = 0; i < last_alg; i++)
+	if (!strcmp (alg_name, stringop_alg_names[i]))
+	  break;
+
+      if (i == last_alg)
+        {
+          error ("wrong stringop strategy name %s specified for option %s",
+                 alg_name,
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+
+      input_ranges[n].max = maxs;
+      input_ranges[n].alg = (stringop_alg) i;
+      if (!strcmp (align, "align"))
+        input_ranges[n].noalign = false;
+      else if (!strcmp (align, "noalign"))
+        input_ranges[n].noalign = true;
+      else
+        {
+          error ("unknown alignment %s specified for option %s",
+                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+      n++;
+      curr_range_str = next_range_str;
+    }
+  while (curr_range_str);
+
+  if (input_ranges[n - 1].max != -1)
+    {
+      error ("the max value for the last size range should be -1"
+             " for option %s",
+             is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+      return;
+    }
+
+  if (n > MAX_STRINGOP_ALGS)
+    {
+      error ("too many size ranges specified in option %s",
+             is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+      return;
+    }
+
+  /* Now override the default algs array.  */
+  for (i = 0; i < n; i++)
+    {
+      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
+      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
+          = input_ranges[i].alg;
+      *const_cast<int *>(&default_algs->size[i].noalign)
+          = input_ranges[i].noalign;
+    }
+}
+
+
+/* parse -mtune-ctrl= option. When DUMP is true,
+   print the features that are explicitly set.  */
+
+static void
+parse_mtune_ctrl_str (bool dump)
+{
+  if (!ix86_tune_ctrl_string)
+    return;
+
+  char *next_feature_string = NULL;
+  char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
+  char *orig = curr_feature_string;
+  int i;
+  do
+    {
+      bool clear = false;
+
+      next_feature_string = strchr (curr_feature_string, ',');
+      if (next_feature_string)
+        *next_feature_string++ = '\0';
+      if (*curr_feature_string == '^')
+        {
+          curr_feature_string++;
+          clear = true;
+        }
+      for (i = 0; i < X86_TUNE_LAST; i++)
+        {
+          if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
+            {
+              ix86_tune_features[i] = !clear;
+              if (dump)
+                fprintf (stderr, "Explicitly %s feature %s\n",
+                         clear ? "clear" : "set", ix86_tune_feature_names[i]);
+              break;
+            }
+        }
+      if (i == X86_TUNE_LAST)
+        error ("Unknown parameter to option -mtune-ctrl: %s",
+               clear ? curr_feature_string - 1 : curr_feature_string);
+      curr_feature_string = next_feature_string;
+    }
+  while (curr_feature_string);
+  free (orig);
+}
+
+/* Helper function to set ix86_tune_features. IX86_TUNE is the
+   processor type.  */
+
+static void
+set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
+{
+  unsigned int ix86_tune_mask = 1u << ix86_tune;
+  int i;
+
+  for (i = 0; i < X86_TUNE_LAST; ++i)
+    {
+      if (ix86_tune_no_default)
+        ix86_tune_features[i] = 0;
+      else
+        ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
+    }
+
+  if (dump)
+    {
+      fprintf (stderr, "List of x86 specific tuning parameter names:\n");
+      for (i = 0; i < X86_TUNE_LAST; i++)
+        fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
+                 ix86_tune_features[i] ? "on" : "off");
+    }
+
+  parse_mtune_ctrl_str (dump);
+}
+
+
+/* Override various settings based on options.  If MAIN_ARGS_P, the
+   options are from the command line, otherwise they are from
+   attributes.  */
+
+static void
+ix86_option_override_internal (bool main_args_p,
+			       struct gcc_options *opts,
+			       struct gcc_options *opts_set)
+{
+  int i;
+  unsigned int ix86_arch_mask;
+  const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
+  const char *prefix;
+  const char *suffix;
+  const char *sw;
+
+#define PTA_3DNOW	 	(HOST_WIDE_INT_1 << 0)
+#define PTA_3DNOW_A	 	(HOST_WIDE_INT_1 << 1)
+#define PTA_64BIT		(HOST_WIDE_INT_1 << 2)
+#define PTA_ABM			(HOST_WIDE_INT_1 << 3)
+#define PTA_AES		 	(HOST_WIDE_INT_1 << 4)
+#define PTA_AVX			(HOST_WIDE_INT_1 << 5)
+#define PTA_BMI		 	(HOST_WIDE_INT_1 << 6)
+#define PTA_CX16		(HOST_WIDE_INT_1 << 7)
+#define PTA_F16C		(HOST_WIDE_INT_1 << 8)
+#define PTA_FMA			(HOST_WIDE_INT_1 << 9)
+#define PTA_FMA4	 	(HOST_WIDE_INT_1 << 10)
+#define PTA_FSGSBASE		(HOST_WIDE_INT_1 << 11)
+#define PTA_LWP		 	(HOST_WIDE_INT_1 << 12)
+#define PTA_LZCNT	 	(HOST_WIDE_INT_1 << 13)
+#define PTA_MMX			(HOST_WIDE_INT_1 << 14)
+#define PTA_MOVBE		(HOST_WIDE_INT_1 << 15)
+#define PTA_NO_SAHF		(HOST_WIDE_INT_1 << 16)
+#define PTA_PCLMUL		(HOST_WIDE_INT_1 << 17)
+#define PTA_POPCNT		(HOST_WIDE_INT_1 << 18)
+#define PTA_PREFETCH_SSE	(HOST_WIDE_INT_1 << 19)
+#define PTA_RDRND	 	(HOST_WIDE_INT_1 << 20)
+#define PTA_SSE			(HOST_WIDE_INT_1 << 21)
+#define PTA_SSE2		(HOST_WIDE_INT_1 << 22)
+#define PTA_SSE3		(HOST_WIDE_INT_1 << 23)
+#define PTA_SSE4_1	 	(HOST_WIDE_INT_1 << 24)
+#define PTA_SSE4_2	 	(HOST_WIDE_INT_1 << 25)
+#define PTA_SSE4A		(HOST_WIDE_INT_1 << 26)
+#define PTA_SSSE3		(HOST_WIDE_INT_1 << 27)
+#define PTA_TBM		 	(HOST_WIDE_INT_1 << 28)
+#define PTA_XOP		 	(HOST_WIDE_INT_1 << 29)
+#define PTA_AVX2		(HOST_WIDE_INT_1 << 30)
+#define PTA_BMI2	 	(HOST_WIDE_INT_1 << 31)
+#define PTA_RTM		 	(HOST_WIDE_INT_1 << 32)
+#define PTA_HLE			(HOST_WIDE_INT_1 << 33)
+#define PTA_PRFCHW		(HOST_WIDE_INT_1 << 34)
+#define PTA_RDSEED		(HOST_WIDE_INT_1 << 35)
+#define PTA_ADX			(HOST_WIDE_INT_1 << 36)
+#define PTA_FXSR		(HOST_WIDE_INT_1 << 37)
+#define PTA_XSAVE		(HOST_WIDE_INT_1 << 38)
+#define PTA_XSAVEOPT		(HOST_WIDE_INT_1 << 39)
+#define PTA_AVX512F		(HOST_WIDE_INT_1 << 40)
+#define PTA_AVX512ER		(HOST_WIDE_INT_1 << 41)
+#define PTA_AVX512PF		(HOST_WIDE_INT_1 << 42)
+#define PTA_AVX512CD		(HOST_WIDE_INT_1 << 43)
+#define PTA_SHA			(HOST_WIDE_INT_1 << 45)
+#define PTA_PREFETCHWT1		(HOST_WIDE_INT_1 << 46)
+
+#define PTA_CORE2 \
+  (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
+   | PTA_CX16 | PTA_FXSR)
+#define PTA_NEHALEM \
+  (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
+#define PTA_WESTMERE \
+  (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
+#define PTA_SANDYBRIDGE \
+  (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
+#define PTA_IVYBRIDGE \
+  (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
+#define PTA_HASWELL \
+  (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
+   | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE)
+#define PTA_BROADWELL \
+  (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
+#define PTA_BONNELL \
+  (PTA_CORE2 | PTA_MOVBE)
+#define PTA_SILVERMONT \
+  (PTA_WESTMERE | PTA_MOVBE)
+
+/* if this reaches 64, need to widen struct pta flags below */
+
+  static struct pta
+    {
+      const char *const name;		/* processor name or nickname.  */
+      const enum processor_type processor;
+      const enum attr_cpu schedule;
+      const unsigned HOST_WIDE_INT flags;
+    }
+  const processor_alias_table[] =
+    {
+      {"i386", PROCESSOR_I386, CPU_NONE, 0},
+      {"i486", PROCESSOR_I486, CPU_NONE, 0},
+      {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
+      {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
+      {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
+      {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
+      {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
+      {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
+      {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
+	PTA_MMX | PTA_SSE | PTA_FXSR},
+      {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
+      {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
+      {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
+      {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
+	PTA_MMX | PTA_SSE | PTA_FXSR},
+      {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
+	PTA_MMX | PTA_SSE | PTA_FXSR},
+      {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
+	PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
+      {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
+	PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
+      {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
+	PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
+      {"prescott", PROCESSOR_NOCONA, CPU_NONE,
+	PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
+      {"nocona", PROCESSOR_NOCONA, CPU_NONE,
+	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
+	| PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
+      {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
+      {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
+      {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
+      {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
+      {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
+	PTA_SANDYBRIDGE},
+      {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
+	PTA_SANDYBRIDGE},
+      {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
+	PTA_IVYBRIDGE},
+      {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
+	PTA_IVYBRIDGE},
+      {"haswell", PROCESSOR_HASWELL, CPU_NEHALEM, PTA_HASWELL},
+      {"core-avx2", PROCESSOR_HASWELL, CPU_NEHALEM, PTA_HASWELL},
+      {"broadwell", PROCESSOR_HASWELL, CPU_NEHALEM, PTA_BROADWELL},
+      {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
+      {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
+      {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
+      {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
+      {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
+      {"geode", PROCESSOR_GEODE, CPU_GEODE,
+	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
+      {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
+      {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
+      {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
+      {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
+	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
+      {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
+	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
+      {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
+	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
+      {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
+	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
+      {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
+	PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
+      {"x86-64", PROCESSOR_K8, CPU_K8,
+	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
+      {"k8", PROCESSOR_K8, CPU_K8,
+	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
+	| PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
+      {"k8-sse3", PROCESSOR_K8, CPU_K8,
+	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
+	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
+      {"opteron", PROCESSOR_K8, CPU_K8,
+	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
+	| PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
+      {"opteron-sse3", PROCESSOR_K8, CPU_K8,
+	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
+	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
+      {"athlon64", PROCESSOR_K8, CPU_K8,
+	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
+	| PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
+      {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
+	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
+	| PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
+      {"athlon-fx", PROCESSOR_K8, CPU_K8,
+	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
+	| PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
+      {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
+	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
+	| PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
+      {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
+	PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
+	| PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
+      {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
+	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
+	| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
+	| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
+	| PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
+      {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
+	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
+	| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
+	| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
+	| PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
+	| PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
+      {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
+	PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
+	| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
+	| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
+	| PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
+	| PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE 
+	| PTA_XSAVEOPT | PTA_FSGSBASE},
+     {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
+        PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
+        | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
+        | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2 
+	| PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2 
+	| PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR 
+	| PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE},
+      {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
+	PTA_64BIT | PTA_MMX |  PTA_SSE  | PTA_SSE2 | PTA_SSE3
+	| PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
+	| PTA_FXSR | PTA_XSAVE},
+      {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
+	PTA_64BIT | PTA_MMX |  PTA_SSE  | PTA_SSE2 | PTA_SSE3
+	| PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
+	| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
+	| PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
+	| PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
+
+      {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
+	PTA_64BIT
+	| PTA_HLE /* flags are only used for -march switch.  */ },
+    };
+
+  /* -mrecip options.  */
+  static struct
+    {
+      const char *string;           /* option name */
+      unsigned int mask;            /* mask bits to set */
+    }
+  const recip_options[] =
+    {
+      { "all",       RECIP_MASK_ALL },
+      { "none",      RECIP_MASK_NONE },
+      { "div",       RECIP_MASK_DIV },
+      { "sqrt",      RECIP_MASK_SQRT },
+      { "vec-div",   RECIP_MASK_VEC_DIV },
+      { "vec-sqrt",  RECIP_MASK_VEC_SQRT },
+    };
+
+  int const pta_size = ARRAY_SIZE (processor_alias_table);
+
+  /* Set up prefix/suffix so the error messages refer to either the command
+     line argument, or the attribute(target).  */
+  if (main_args_p)
+    {
+      prefix = "-m";
+      suffix = "";
+      sw = "switch";
+    }
+  else
+    {
+      prefix = "option(\"";
+      suffix = "\")";
+      sw = "attribute";
+    }
+
+  /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
+     TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false.  */
+  if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
+    opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
+#ifdef TARGET_BI_ARCH
+  else
+    {
+#if TARGET_BI_ARCH == 1
+      /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
+	 is on and OPTION_MASK_ABI_X32 is off.  We turn off
+	 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
+	 -mx32.  */
+      if (TARGET_X32_P (opts->x_ix86_isa_flags))
+	opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
+#else
+      /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
+	 on and OPTION_MASK_ABI_64 is off.  We turn off
+	 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
+	 -m64.  */
+      if (TARGET_LP64_P (opts->x_ix86_isa_flags))
+	opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
+#endif
+    }
+#endif
+
+  if (TARGET_X32_P (opts->x_ix86_isa_flags))
+    {
+      /* Always turn on OPTION_MASK_ISA_64BIT and turn off
+	 OPTION_MASK_ABI_64 for TARGET_X32.  */
+      opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
+      opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
+    }
+  else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
+    opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
+				| OPTION_MASK_ABI_X32
+				| OPTION_MASK_ABI_64);
+  else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
+    {
+      /* Always turn on OPTION_MASK_ISA_64BIT and turn off
+	 OPTION_MASK_ABI_X32 for TARGET_LP64.  */
+      opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
+      opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
+    }
+
+#ifdef SUBTARGET_OVERRIDE_OPTIONS
+  SUBTARGET_OVERRIDE_OPTIONS;
+#endif
+
+#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
+  SUBSUBTARGET_OVERRIDE_OPTIONS;
+#endif
+
+  /* -fPIC is the default for x86_64.  */
+  if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
+    opts->x_flag_pic = 2;
+
+  /* Need to check -mtune=generic first.  */
+  if (opts->x_ix86_tune_string)
+    {
+      /* As special support for cross compilers we read -mtune=native
+	     as -mtune=generic.  With native compilers we won't see the
+	     -mtune=native, as it was changed by the driver.  */
+      if (!strcmp (opts->x_ix86_tune_string, "native"))
+	{
+	  opts->x_ix86_tune_string = "generic";
+	}
+      else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
+        warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
+                 "%stune=k8%s or %stune=generic%s instead as appropriate",
+                 prefix, suffix, prefix, suffix, prefix, suffix);
+    }
+  else
+    {
+      if (opts->x_ix86_arch_string)
+	opts->x_ix86_tune_string = opts->x_ix86_arch_string;
+      if (!opts->x_ix86_tune_string)
+	{
+	  opts->x_ix86_tune_string
+	    = processor_target_table[TARGET_CPU_DEFAULT].name;
+	  ix86_tune_defaulted = 1;
+	}
+
+      /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
+	 or defaulted.  We need to use a sensible tune option.  */
+      if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
+	{
+	  opts->x_ix86_tune_string = "generic";
+	}
+    }
+
+  if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
+      && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
+    {
+      /* rep; movq isn't available in 32-bit code.  */
+      error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
+      opts->x_ix86_stringop_alg = no_stringop;
+    }
+
+  if (!opts->x_ix86_arch_string)
+    opts->x_ix86_arch_string
+      = TARGET_64BIT_P (opts->x_ix86_isa_flags)
+	? "x86-64" : SUBTARGET32_DEFAULT_CPU;
+  else
+    ix86_arch_specified = 1;
+
+  if (opts_set->x_ix86_pmode)
+    {
+      if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
+	   && opts->x_ix86_pmode == PMODE_SI)
+	  || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
+	       && opts->x_ix86_pmode == PMODE_DI))
+	error ("address mode %qs not supported in the %s bit mode",
+	       TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
+	       TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
+    }
+  else
+    opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
+			 ? PMODE_DI : PMODE_SI;
+
+  if (!opts_set->x_ix86_abi)
+    opts->x_ix86_abi = DEFAULT_ABI;
+
+  /* For targets using ms ABI enable ms-extensions, if not
+     explicit turned off.  For non-ms ABI we turn off this
+     option.  */
+  if (!opts_set->x_flag_ms_extensions)
+    opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
+
+  if (opts_set->x_ix86_cmodel)
+    {
+      switch (opts->x_ix86_cmodel)
+	{
+	case CM_SMALL:
+	case CM_SMALL_PIC:
+	  if (opts->x_flag_pic)
+	    opts->x_ix86_cmodel = CM_SMALL_PIC;
+	  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
+	    error ("code model %qs not supported in the %s bit mode",
+		   "small", "32");
+	  break;
+
+	case CM_MEDIUM:
+	case CM_MEDIUM_PIC:
+	  if (opts->x_flag_pic)
+	    opts->x_ix86_cmodel = CM_MEDIUM_PIC;
+	  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
+	    error ("code model %qs not supported in the %s bit mode",
+		   "medium", "32");
+	  else if (TARGET_X32_P (opts->x_ix86_isa_flags))
+	    error ("code model %qs not supported in x32 mode",
+		   "medium");
+	  break;
+
+	case CM_LARGE:
+	case CM_LARGE_PIC:
+	  if (opts->x_flag_pic)
+	    opts->x_ix86_cmodel = CM_LARGE_PIC;
+	  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
+	    error ("code model %qs not supported in the %s bit mode",
+		   "large", "32");
+	  else if (TARGET_X32_P (opts->x_ix86_isa_flags))
+	    error ("code model %qs not supported in x32 mode",
+		   "large");
+	  break;
+
+	case CM_32:
+	  if (opts->x_flag_pic)
+	    error ("code model %s does not support PIC mode", "32");
+	  if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
+	    error ("code model %qs not supported in the %s bit mode",
+		   "32", "64");
+	  break;
+
+	case CM_KERNEL:
+	  if (opts->x_flag_pic)
+	    {
+	      error ("code model %s does not support PIC mode", "kernel");
+	      opts->x_ix86_cmodel = CM_32;
+	    }
+	  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
+	    error ("code model %qs not supported in the %s bit mode",
+		   "kernel", "32");
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  else
+    {
+      /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
+	 use of rip-relative addressing.  This eliminates fixups that
+	 would otherwise be needed if this object is to be placed in a
+	 DLL, and is essentially just as efficient as direct addressing.  */
+      if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
+	  && (TARGET_RDOS || TARGET_PECOFF))
+	opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
+      else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
+	opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
+      else
+	opts->x_ix86_cmodel = CM_32;
+    }
+  if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
+    {
+      error ("-masm=intel not supported in this configuration");
+      opts->x_ix86_asm_dialect = ASM_ATT;
+    }
+  if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
+      != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
+    sorry ("%i-bit mode not compiled in",
+	   (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
+
+  for (i = 0; i < pta_size; i++)
+    if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
+      {
+	ix86_schedule = processor_alias_table[i].schedule;
+	ix86_arch = processor_alias_table[i].processor;
+	/* Default cpu tuning to the architecture.  */
+	ix86_tune = ix86_arch;
+
+	if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
+	    && !(processor_alias_table[i].flags & PTA_64BIT))
+	  error ("CPU you selected does not support x86-64 "
+		 "instruction set");
+
+	if (processor_alias_table[i].flags & PTA_MMX
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
+	if (processor_alias_table[i].flags & PTA_3DNOW
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
+	if (processor_alias_table[i].flags & PTA_3DNOW_A
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
+	if (processor_alias_table[i].flags & PTA_SSE
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
+	if (processor_alias_table[i].flags & PTA_SSE2
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
+	if (processor_alias_table[i].flags & PTA_SSE3
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
+	if (processor_alias_table[i].flags & PTA_SSSE3
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
+	if (processor_alias_table[i].flags & PTA_SSE4_1
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
+	if (processor_alias_table[i].flags & PTA_SSE4_2
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
+	if (processor_alias_table[i].flags & PTA_AVX
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
+	if (processor_alias_table[i].flags & PTA_AVX2
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
+	if (processor_alias_table[i].flags & PTA_FMA
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
+	if (processor_alias_table[i].flags & PTA_SSE4A
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
+	if (processor_alias_table[i].flags & PTA_FMA4
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
+	if (processor_alias_table[i].flags & PTA_XOP
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
+	if (processor_alias_table[i].flags & PTA_LWP
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
+	if (processor_alias_table[i].flags & PTA_ABM
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
+	if (processor_alias_table[i].flags & PTA_BMI
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
+	if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
+	if (processor_alias_table[i].flags & PTA_TBM
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
+	if (processor_alias_table[i].flags & PTA_BMI2
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
+	if (processor_alias_table[i].flags & PTA_CX16
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
+	if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
+	if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
+	    && (processor_alias_table[i].flags & PTA_NO_SAHF))
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
+	if (processor_alias_table[i].flags & PTA_MOVBE
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
+	if (processor_alias_table[i].flags & PTA_AES
+	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
+	  ix86_isa_flags |= OPTION_MASK_ISA_AES;
+	if (processor_alias_table[i].flags & PTA_SHA
+	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
+	  ix86_isa_flags |= OPTION_MASK_ISA_SHA;
+	if (processor_alias_table[i].flags & PTA_PCLMUL
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
+	if (processor_alias_table[i].flags & PTA_FSGSBASE
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
+	if (processor_alias_table[i].flags & PTA_RDRND
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
+	if (processor_alias_table[i].flags & PTA_F16C
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
+	if (processor_alias_table[i].flags & PTA_RTM
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
+	if (processor_alias_table[i].flags & PTA_HLE
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
+	if (processor_alias_table[i].flags & PTA_PRFCHW
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
+	if (processor_alias_table[i].flags & PTA_RDSEED
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
+	if (processor_alias_table[i].flags & PTA_ADX
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
+	if (processor_alias_table[i].flags & PTA_FXSR
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
+	if (processor_alias_table[i].flags & PTA_XSAVE
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
+	if (processor_alias_table[i].flags & PTA_XSAVEOPT
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
+	if (processor_alias_table[i].flags & PTA_AVX512F
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
+	if (processor_alias_table[i].flags & PTA_AVX512ER
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
+	if (processor_alias_table[i].flags & PTA_AVX512PF
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
+	if (processor_alias_table[i].flags & PTA_AVX512CD
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
+	if (processor_alias_table[i].flags & PTA_PREFETCHWT1
+	    && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
+	  opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
+	if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
+	  x86_prefetch_sse = true;
+
+	break;
+      }
+
+  if (!strcmp (opts->x_ix86_arch_string, "generic"))
+    error ("generic CPU can be used only for %stune=%s %s",
+	   prefix, suffix, sw);
+  else if (!strcmp (opts->x_ix86_arch_string, "intel"))
+    error ("intel CPU can be used only for %stune=%s %s",
+	   prefix, suffix, sw);
+  else if (i == pta_size)
+    error ("bad value (%s) for %sarch=%s %s",
+	   opts->x_ix86_arch_string, prefix, suffix, sw);
+
+  ix86_arch_mask = 1u << ix86_arch;
+  for (i = 0; i < X86_ARCH_LAST; ++i)
+    ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
+
+  for (i = 0; i < pta_size; i++)
+    if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
+      {
+	ix86_schedule = processor_alias_table[i].schedule;
+	ix86_tune = processor_alias_table[i].processor;
+	if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
+	  {
+	    if (!(processor_alias_table[i].flags & PTA_64BIT))
+	      {
+		if (ix86_tune_defaulted)
+		  {
+		    opts->x_ix86_tune_string = "x86-64";
+		    for (i = 0; i < pta_size; i++)
+		      if (! strcmp (opts->x_ix86_tune_string,
+				    processor_alias_table[i].name))
+			break;
+		    ix86_schedule = processor_alias_table[i].schedule;
+		    ix86_tune = processor_alias_table[i].processor;
+		  }
+		else
+		  error ("CPU you selected does not support x86-64 "
+			 "instruction set");
+	      }
+	  }
+	/* Intel CPUs have always interpreted SSE prefetch instructions as
+	   NOPs; so, we can enable SSE prefetch instructions even when
+	   -mtune (rather than -march) points us to a processor that has them.
+	   However, the VIA C3 gives a SIGILL, so we only do that for i686 and
+	   higher processors.  */
+	if (TARGET_CMOV
+	    && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
+	  x86_prefetch_sse = true;
+	break;
+      }
+
+  if (ix86_tune_specified && i == pta_size)
+    error ("bad value (%s) for %stune=%s %s",
+	   opts->x_ix86_tune_string, prefix, suffix, sw);
+
+  set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
+
+#ifndef USE_IX86_FRAME_POINTER
+#define USE_IX86_FRAME_POINTER 0
+#endif
+
+#ifndef USE_X86_64_FRAME_POINTER
+#define USE_X86_64_FRAME_POINTER 0
+#endif
+
+  /* Set the default values for switches whose default depends on TARGET_64BIT
+     in case they weren't overwritten by command line options.  */
+  if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
+    {
+      if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
+	opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
+      if (opts->x_flag_asynchronous_unwind_tables
+	  && !opts_set->x_flag_unwind_tables
+	  && TARGET_64BIT_MS_ABI)
+	opts->x_flag_unwind_tables = 1;
+      if (opts->x_flag_asynchronous_unwind_tables == 2)
+	opts->x_flag_unwind_tables
+	  = opts->x_flag_asynchronous_unwind_tables = 1;
+      if (opts->x_flag_pcc_struct_return == 2)
+	opts->x_flag_pcc_struct_return = 0;
+    }
+  else
+    {
+      if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
+	opts->x_flag_omit_frame_pointer
+	  = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
+      if (opts->x_flag_asynchronous_unwind_tables == 2)
+	opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
+      if (opts->x_flag_pcc_struct_return == 2)
+	opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
+    }
+
+  ix86_tune_cost = processor_target_table[ix86_tune].cost;
+  if (opts->x_optimize_size)
+    ix86_cost = &ix86_size_cost;
+  else
+    ix86_cost = ix86_tune_cost;
+
+  /* Arrange to set up i386_stack_locals for all functions.  */
+  init_machine_status = ix86_init_machine_status;
+
+  /* Validate -mregparm= value.  */
+  if (opts_set->x_ix86_regparm)
+    {
+      if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
+	warning (0, "-mregparm is ignored in 64-bit mode");
+      if (opts->x_ix86_regparm > REGPARM_MAX)
+	{
+	  error ("-mregparm=%d is not between 0 and %d",
+		 opts->x_ix86_regparm, REGPARM_MAX);
+	  opts->x_ix86_regparm = 0;
+	}
+    }
+  if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
+    opts->x_ix86_regparm = REGPARM_MAX;
+
+  /* Default align_* from the processor table.  */
+  if (opts->x_align_loops == 0)
+    {
+      opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
+      align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
+    }
+  if (opts->x_align_jumps == 0)
+    {
+      opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
+      align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
+    }
+  if (opts->x_align_functions == 0)
+    {
+      opts->x_align_functions = processor_target_table[ix86_tune].align_func;
+    }
+
+  /* Provide default for -mbranch-cost= value.  */
+  if (!opts_set->x_ix86_branch_cost)
+    opts->x_ix86_branch_cost = ix86_cost->branch_cost;
+
+  if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
+    {
+      opts->x_target_flags
+	|= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
+
+      /* Enable by default the SSE and MMX builtins.  Do allow the user to
+	 explicitly disable any of these.  In particular, disabling SSE and
+	 MMX for kernel code is extremely useful.  */
+      if (!ix86_arch_specified)
+      opts->x_ix86_isa_flags
+	|= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
+	     | TARGET_SUBTARGET64_ISA_DEFAULT)
+            & ~opts->x_ix86_isa_flags_explicit);
+
+      if (TARGET_RTD_P (opts->x_target_flags))
+	warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
+    }
+  else
+    {
+      opts->x_target_flags
+	|= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
+
+      if (!ix86_arch_specified)
+        opts->x_ix86_isa_flags
+	  |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
+
+      /* i386 ABI does not specify red zone.  It still makes sense to use it
+         when programmer takes care to stack from being destroyed.  */
+      if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
+        opts->x_target_flags |= MASK_NO_RED_ZONE;
+    }
+
+  /* Keep nonleaf frame pointers.  */
+  if (opts->x_flag_omit_frame_pointer)
+    opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
+  else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
+    opts->x_flag_omit_frame_pointer = 1;
+
+  /* If we're doing fast math, we don't care about comparison order
+     wrt NaNs.  This lets us use a shorter comparison sequence.  */
+  if (opts->x_flag_finite_math_only)
+    opts->x_target_flags &= ~MASK_IEEE_FP;
+
+  /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
+     since the insns won't need emulation.  */
+  if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
+    opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
+
+  /* Likewise, if the target doesn't have a 387, or we've specified
+     software floating point, don't use 387 inline intrinsics.  */
+  if (!TARGET_80387_P (opts->x_target_flags))
+    opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
+
+  /* Turn on MMX builtins for -msse.  */
+  if (TARGET_SSE_P (opts->x_ix86_isa_flags))
+    opts->x_ix86_isa_flags
+      |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
+
+  /* Enable SSE prefetch.  */
+  if (TARGET_SSE_P (opts->x_ix86_isa_flags)
+      || (TARGET_PRFCHW && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)))
+    x86_prefetch_sse = true;
+
+  /* Enable prefetch{,w} instructions for -m3dnow and -mprefetchwt1.  */
+  if (TARGET_3DNOW_P (opts->x_ix86_isa_flags)
+      || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
+    opts->x_ix86_isa_flags
+      |= OPTION_MASK_ISA_PRFCHW & ~opts->x_ix86_isa_flags_explicit;
+
+  /* Enable popcnt instruction for -msse4.2 or -mabm.  */
+  if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
+      || TARGET_ABM_P (opts->x_ix86_isa_flags))
+    opts->x_ix86_isa_flags
+      |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
+
+  /* Enable lzcnt instruction for -mabm.  */
+  if (TARGET_ABM_P(opts->x_ix86_isa_flags))
+    opts->x_ix86_isa_flags
+      |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
+
+  /* Validate -mpreferred-stack-boundary= value or default it to
+     PREFERRED_STACK_BOUNDARY_DEFAULT.  */
+  ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
+  if (opts_set->x_ix86_preferred_stack_boundary_arg)
+    {
+      int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
+		 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
+      int max = (TARGET_SEH ? 4 : 12);
+
+      if (opts->x_ix86_preferred_stack_boundary_arg < min
+	  || opts->x_ix86_preferred_stack_boundary_arg > max)
+	{
+	  if (min == max)
+	    error ("-mpreferred-stack-boundary is not supported "
+		   "for this target");
+	  else
+	    error ("-mpreferred-stack-boundary=%d is not between %d and %d",
+		   opts->x_ix86_preferred_stack_boundary_arg, min, max);
+	}
+      else
+	ix86_preferred_stack_boundary
+	  = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
+    }
+
+  /* Set the default value for -mstackrealign.  */
+  if (opts->x_ix86_force_align_arg_pointer == -1)
+    opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
+
+  ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
+
+  /* Validate -mincoming-stack-boundary= value or default it to
+     MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
+  ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
+  if (opts_set->x_ix86_incoming_stack_boundary_arg)
+    {
+      if (opts->x_ix86_incoming_stack_boundary_arg
+	  < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2)
+	  || opts->x_ix86_incoming_stack_boundary_arg > 12)
+	error ("-mincoming-stack-boundary=%d is not between %d and 12",
+	       opts->x_ix86_incoming_stack_boundary_arg,
+	       TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2);
+      else
+	{
+	  ix86_user_incoming_stack_boundary
+	    = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
+	  ix86_incoming_stack_boundary
+	    = ix86_user_incoming_stack_boundary;
+	}
+    }
+
+  /* Accept -msseregparm only if at least SSE support is enabled.  */
+  if (TARGET_SSEREGPARM_P (opts->x_target_flags)
+      && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
+    error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
+
+  if (opts_set->x_ix86_fpmath)
+    {
+      if (opts->x_ix86_fpmath & FPMATH_SSE)
+	{
+	  if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
+	    {
+	      warning (0, "SSE instruction set disabled, using 387 arithmetics");
+	      opts->x_ix86_fpmath = FPMATH_387;
+	    }
+	  else if ((opts->x_ix86_fpmath & FPMATH_387)
+		   && !TARGET_80387_P (opts->x_target_flags))
+	    {
+	      warning (0, "387 instruction set disabled, using SSE arithmetics");
+	      opts->x_ix86_fpmath = FPMATH_SSE;
+	    }
+	}
+    }
+  /* For all chips supporting SSE2, -mfpmath=sse performs better than
+     fpmath=387.  The second is however default at many targets since the
+     extra 80bit precision of temporaries is considered to be part of ABI.
+     Overwrite the default at least for -ffast-math. 
+     TODO: -mfpmath=both seems to produce same performing code with bit
+     smaller binaries.  It is however not clear if register allocation is
+     ready for this setting.
+     Also -mfpmath=387 is overall a lot more compact (bout 4-5%) than SSE
+     codegen.  We may switch to 387 with -ffast-math for size optimized
+     functions. */
+  else if (fast_math_flags_set_p (&global_options)
+	   && TARGET_SSE2_P (opts->x_ix86_isa_flags))
+    opts->x_ix86_fpmath = FPMATH_SSE;
+  else
+    opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
+
+  /* If the i387 is disabled, then do not return values in it. */
+  if (!TARGET_80387_P (opts->x_target_flags))
+    opts->x_target_flags &= ~MASK_FLOAT_RETURNS;
+
+  /* Use external vectorized library in vectorizing intrinsics.  */
+  if (opts_set->x_ix86_veclibabi_type)
+    switch (opts->x_ix86_veclibabi_type)
+      {
+      case ix86_veclibabi_type_svml:
+	ix86_veclib_handler = ix86_veclibabi_svml;
+	break;
+
+      case ix86_veclibabi_type_acml:
+	ix86_veclib_handler = ix86_veclibabi_acml;
+	break;
+
+      default:
+	gcc_unreachable ();
+      }
+
+  if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
+      && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
+      && !opts->x_optimize_size)
+    opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
+
+  /* If stack probes are required, the space used for large function
+     arguments on the stack must also be probed, so enable
+     -maccumulate-outgoing-args so this happens in the prologue.  */
+  if (TARGET_STACK_PROBE_P (opts->x_target_flags)
+      && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
+    {
+      if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
+	warning (0, "stack probing requires %saccumulate-outgoing-args%s "
+		 "for correctness", prefix, suffix);
+      opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
+    }
+
+  /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix.  */
+  {
+    char *p;
+    ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
+    p = strchr (internal_label_prefix, 'X');
+    internal_label_prefix_len = p - internal_label_prefix;
+    *p = '\0';
+  }
+
+  /* When scheduling description is not available, disable scheduler pass
+     so it won't slow down the compilation and make x87 code slower.  */
+  if (!TARGET_SCHEDULE)
+    opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
+
+  maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
+			 ix86_tune_cost->simultaneous_prefetches,
+			 opts->x_param_values,
+			 opts_set->x_param_values);
+  maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+			 ix86_tune_cost->prefetch_block,
+			 opts->x_param_values,
+			 opts_set->x_param_values);
+  maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+			 ix86_tune_cost->l1_cache_size,
+			 opts->x_param_values,
+			 opts_set->x_param_values);
+  maybe_set_param_value (PARAM_L2_CACHE_SIZE,
+			 ix86_tune_cost->l2_cache_size,
+			 opts->x_param_values,
+			 opts_set->x_param_values);
+
+  /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful.  */
+  if (opts->x_flag_prefetch_loop_arrays < 0
+      && HAVE_prefetch
+      && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
+      && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
+    opts->x_flag_prefetch_loop_arrays = 1;
+
+  /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
+     can be opts->x_optimized to ap = __builtin_next_arg (0).  */
+  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
+    targetm.expand_builtin_va_start = NULL;
+
+  if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
+    {
+      ix86_gen_leave = gen_leave_rex64;
+      if (Pmode == DImode)
+	{
+	  ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
+	  ix86_gen_tls_local_dynamic_base_64
+	    = gen_tls_local_dynamic_base_64_di;
+	}
+      else
+	{
+	  ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
+	  ix86_gen_tls_local_dynamic_base_64
+	    = gen_tls_local_dynamic_base_64_si;
+	}
+    }
+  else
+    ix86_gen_leave = gen_leave;
+
+  if (Pmode == DImode)
+    {
+      ix86_gen_add3 = gen_adddi3;
+      ix86_gen_sub3 = gen_subdi3;
+      ix86_gen_sub3_carry = gen_subdi3_carry;
+      ix86_gen_one_cmpl2 = gen_one_cmpldi2;
+      ix86_gen_andsp = gen_anddi3;
+      ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
+      ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
+      ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
+      ix86_gen_monitor = gen_sse3_monitor_di;
+    }
+  else
+    {
+      ix86_gen_add3 = gen_addsi3;
+      ix86_gen_sub3 = gen_subsi3;
+      ix86_gen_sub3_carry = gen_subsi3_carry;
+      ix86_gen_one_cmpl2 = gen_one_cmplsi2;
+      ix86_gen_andsp = gen_andsi3;
+      ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
+      ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
+      ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
+      ix86_gen_monitor = gen_sse3_monitor_si;
+    }
+
+#ifdef USE_IX86_CLD
+  /* Use -mcld by default for 32-bit code if configured with --enable-cld.  */
+  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
+    opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
+#endif
+
+  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
+    {
+      if (opts->x_flag_fentry > 0)
+        sorry ("-mfentry isn%'t supported for 32-bit in combination "
+	       "with -fpic");
+      opts->x_flag_fentry = 0;
+    }
+  else if (TARGET_SEH)
+    {
+      if (opts->x_flag_fentry == 0)
+	sorry ("-mno-fentry isn%'t compatible with SEH");
+      opts->x_flag_fentry = 1;
+    }
+  else if (opts->x_flag_fentry < 0)
+   {
+#if defined(PROFILE_BEFORE_PROLOGUE)
+     opts->x_flag_fentry = 1;
+#else
+     opts->x_flag_fentry = 0;
+#endif
+   }
+
+  /* When not opts->x_optimize for size, enable vzeroupper optimization for
+     TARGET_AVX with -fexpensive-optimizations and split 32-byte
+     AVX unaligned load/store.  */
+  if (!opts->x_optimize_size)
+    {
+      if (flag_expensive_optimizations
+	  && !(opts_set->x_target_flags & MASK_VZEROUPPER))
+	opts->x_target_flags |= MASK_VZEROUPPER;
+      if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
+	  && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
+	opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
+      if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
+	  && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
+	opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
+      /* Enable 128-bit AVX instruction generation
+         for the auto-vectorizer.  */
+      if (TARGET_AVX128_OPTIMAL
+	  && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
+	opts->x_target_flags |= MASK_PREFER_AVX128;
+    }
+
+  if (opts->x_ix86_recip_name)
+    {
+      char *p = ASTRDUP (opts->x_ix86_recip_name);
+      char *q;
+      unsigned int mask, i;
+      bool invert;
+
+      while ((q = strtok (p, ",")) != NULL)
+	{
+	  p = NULL;
+	  if (*q == '!')
+	    {
+	      invert = true;
+	      q++;
+	    }
+	  else
+	    invert = false;
+
+	  if (!strcmp (q, "default"))
+	    mask = RECIP_MASK_ALL;
+	  else
+	    {
+	      for (i = 0; i < ARRAY_SIZE (recip_options); i++)
+		if (!strcmp (q, recip_options[i].string))
+		  {
+		    mask = recip_options[i].mask;
+		    break;
+		  }
+
+	      if (i == ARRAY_SIZE (recip_options))
+		{
+		  error ("unknown option for -mrecip=%s", q);
+		  invert = false;
+		  mask = RECIP_MASK_NONE;
+		}
+	    }
+
+	  opts->x_recip_mask_explicit |= mask;
+	  if (invert)
+	    opts->x_recip_mask &= ~mask;
+	  else
+	    opts->x_recip_mask |= mask;
+	}
+    }
+
+  if (TARGET_RECIP_P (opts->x_target_flags))
+    opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
+  else if (opts_set->x_target_flags & MASK_RECIP)
+    opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
+
+  /* Default long double to 64-bit for 32-bit Bionic and to __float128
+     for 64-bit Bionic.  */
+  if (TARGET_HAS_BIONIC
+      && !(opts_set->x_target_flags
+	   & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
+    opts->x_target_flags |= (TARGET_64BIT
+			     ? MASK_LONG_DOUBLE_128
+			     : MASK_LONG_DOUBLE_64);
+
+  /* Only one of them can be active.  */
+  gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
+	      || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
+
+  /* Save the initial options in case the user does function specific
+     options.  */
+  if (main_args_p)
+    target_option_default_node = target_option_current_node
+      = build_target_option_node (opts);
+
+  /* Handle stack protector */
+  if (!opts_set->x_ix86_stack_protector_guard)
+    opts->x_ix86_stack_protector_guard
+      = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
+
+  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
+  if (opts->x_ix86_tune_memcpy_strategy)
+    {
+      char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
+      ix86_parse_stringop_strategy_string (str, false);
+      free (str);
+    }
+
+  if (opts->x_ix86_tune_memset_strategy)
+    {
+      char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
+      ix86_parse_stringop_strategy_string (str, true);
+      free (str);
+    }
+}
+
+/* Implement the TARGET_OPTION_OVERRIDE hook.  */
+
+static void
+ix86_option_override (void)
+{
+  opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
+  static struct register_pass_info insert_vzeroupper_info
+    = { pass_insert_vzeroupper, "reload",
+	1, PASS_POS_INSERT_AFTER
+      };
+
+  ix86_option_override_internal (true, &global_options, &global_options_set);
+
+
+  /* This needs to be done at start up.  It's convenient to do it here.  */
+  register_pass (&insert_vzeroupper_info);
+}
+
+/* Update register usage after having seen the compiler flags.  */
+
+static void
+ix86_conditional_register_usage (void)
+{
+  int i, c_mask;
+  unsigned int j;
+
+  /* The PIC register, if it exists, is fixed.  */
+  j = PIC_OFFSET_TABLE_REGNUM;
+  if (j != INVALID_REGNUM)
+    fixed_regs[j] = call_used_regs[j] = 1;
+
+  /* For 32-bit targets, squash the REX registers.  */
+  if (! TARGET_64BIT)
+    {
+      for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
+	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+      for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
+	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+      for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
+	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+    }
+
+  /*  See the definition of CALL_USED_REGISTERS in i386.h.  */
+  c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
+	    : TARGET_64BIT ? (1 << 2)
+	    : (1 << 1));
+  
+  CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
+
+  for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+    {
+      /* Set/reset conditionally defined registers from
+	 CALL_USED_REGISTERS initializer.  */
+      if (call_used_regs[i] > 1)
+	call_used_regs[i] = !!(call_used_regs[i] & c_mask);
+
+      /* Calculate registers of CLOBBERED_REGS register set
+	 as call used registers from GENERAL_REGS register set.  */
+      if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
+	  && call_used_regs[i])
+	SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
+    }
+
+  /* If MMX is disabled, squash the registers.  */
+  if (! TARGET_MMX)
+    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+      if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
+	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+
+  /* If SSE is disabled, squash the registers.  */
+  if (! TARGET_SSE)
+    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+      if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
+	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+
+  /* If the FPU is disabled, squash the registers.  */
+  if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
+    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+      if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
+	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+
+  /* If AVX512F is disabled, squash the registers.  */
+  if (! TARGET_AVX512F)
+    {
+      for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
+	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+
+      for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
+	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+    }
+}
+
+
+/* Save the current options */
+
+static void
+ix86_function_specific_save (struct cl_target_option *ptr,
+			     struct gcc_options *opts)
+{
+  ptr->arch = ix86_arch;
+  ptr->schedule = ix86_schedule;
+  ptr->tune = ix86_tune;
+  ptr->branch_cost = ix86_branch_cost;
+  ptr->tune_defaulted = ix86_tune_defaulted;
+  ptr->arch_specified = ix86_arch_specified;
+  ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
+  ptr->x_ix86_target_flags_explicit = opts->x_ix86_target_flags_explicit;
+  ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
+  ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
+  ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
+  ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
+  ptr->x_ix86_abi = opts->x_ix86_abi;
+  ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
+  ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
+  ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
+  ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
+  ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
+  ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
+  ptr->x_ix86_pmode = opts->x_ix86_pmode;
+  ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
+  ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
+  ptr->x_ix86_regparm = opts->x_ix86_regparm;
+  ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
+  ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
+  ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
+  ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
+  ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
+  ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
+  ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
+  ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
+  ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
+  ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
+
+  /* The fields are char but the variables are not; make sure the
+     values fit in the fields.  */
+  gcc_assert (ptr->arch == ix86_arch);
+  gcc_assert (ptr->schedule == ix86_schedule);
+  gcc_assert (ptr->tune == ix86_tune);
+  gcc_assert (ptr->branch_cost == ix86_branch_cost);
+}
+
+/* Restore the current options */
+
+static void
+ix86_function_specific_restore (struct gcc_options *opts,
+				struct cl_target_option *ptr)
+{
+  enum processor_type old_tune = ix86_tune;
+  enum processor_type old_arch = ix86_arch;
+  unsigned int ix86_arch_mask;
+  int i;
+
+  /* We don't change -fPIC.  */
+  opts->x_flag_pic = flag_pic;
+
+  ix86_arch = (enum processor_type) ptr->arch;
+  ix86_schedule = (enum attr_cpu) ptr->schedule;
+  ix86_tune = (enum processor_type) ptr->tune;
+  opts->x_ix86_branch_cost = ptr->branch_cost;
+  ix86_tune_defaulted = ptr->tune_defaulted;
+  ix86_arch_specified = ptr->arch_specified;
+  opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
+  opts->x_ix86_target_flags_explicit = ptr->x_ix86_target_flags_explicit;
+  opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
+  opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
+  opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
+  opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
+  opts->x_ix86_abi = ptr->x_ix86_abi;
+  opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
+  opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
+  opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
+  opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
+  opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
+  opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
+  opts->x_ix86_pmode = ptr->x_ix86_pmode;
+  opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
+  opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
+  opts->x_ix86_regparm = ptr->x_ix86_regparm;
+  opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
+  opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
+  opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
+  opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
+  opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
+  opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
+  opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
+  opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
+  opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
+  opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
+
+  /* Recreate the arch feature tests if the arch changed */
+  if (old_arch != ix86_arch)
+    {
+      ix86_arch_mask = 1u << ix86_arch;
+      for (i = 0; i < X86_ARCH_LAST; ++i)
+	ix86_arch_features[i]
+	  = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
+    }
+
+  /* Recreate the tune optimization tests */
+  if (old_tune != ix86_tune)
+    set_ix86_tune_features (ix86_tune, false);
+}
+
+/* Print the current options */
+
+static void
+ix86_function_specific_print (FILE *file, int indent,
+			      struct cl_target_option *ptr)
+{
+  char *target_string
+    = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
+			  NULL, NULL, ptr->x_ix86_fpmath, false);
+
+  gcc_assert (ptr->arch < PROCESSOR_max);
+  fprintf (file, "%*sarch = %d (%s)\n",
+	   indent, "",
+	   ptr->arch, processor_target_table[ptr->arch].name);
+
+  gcc_assert (ptr->tune < PROCESSOR_max);
+  fprintf (file, "%*stune = %d (%s)\n",
+	   indent, "",
+	   ptr->tune, processor_target_table[ptr->tune].name);
+
+  fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
+
+  if (target_string)
+    {
+      fprintf (file, "%*s%s\n", indent, "", target_string);
+      free (target_string);
+    }
+}
+
+
+/* Inner function to process the attribute((target(...))), take an argument and
+   set the current options from the argument. If we have a list, recursively go
+   over the list.  */
+
+static bool
+ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
+				     struct gcc_options *opts,
+				     struct gcc_options *opts_set,
+				     struct gcc_options *enum_opts_set)
+{
+  char *next_optstr;
+  bool ret = true;
+
+#define IX86_ATTR_ISA(S,O)   { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
+#define IX86_ATTR_STR(S,O)   { S, sizeof (S)-1, ix86_opt_str, O, 0 }
+#define IX86_ATTR_ENUM(S,O)  { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
+#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
+#define IX86_ATTR_NO(S,O,M)  { S, sizeof (S)-1, ix86_opt_no,  O, M }
+
+  enum ix86_opt_type
+  {
+    ix86_opt_unknown,
+    ix86_opt_yes,
+    ix86_opt_no,
+    ix86_opt_str,
+    ix86_opt_enum,
+    ix86_opt_isa
+  };
+
+  static const struct
+  {
+    const char *string;
+    size_t len;
+    enum ix86_opt_type type;
+    int opt;
+    int mask;
+  } attrs[] = {
+    /* isa options */
+    IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
+    IX86_ATTR_ISA ("abm",	OPT_mabm),
+    IX86_ATTR_ISA ("bmi",	OPT_mbmi),
+    IX86_ATTR_ISA ("bmi2",	OPT_mbmi2),
+    IX86_ATTR_ISA ("lzcnt",	OPT_mlzcnt),
+    IX86_ATTR_ISA ("tbm",	OPT_mtbm),
+    IX86_ATTR_ISA ("aes",	OPT_maes),
+    IX86_ATTR_ISA ("sha",	OPT_msha),
+    IX86_ATTR_ISA ("avx",	OPT_mavx),
+    IX86_ATTR_ISA ("avx2",	OPT_mavx2),
+    IX86_ATTR_ISA ("avx512f",	OPT_mavx512f),
+    IX86_ATTR_ISA ("avx512pf",	OPT_mavx512pf),
+    IX86_ATTR_ISA ("avx512er",	OPT_mavx512er),
+    IX86_ATTR_ISA ("avx512cd",	OPT_mavx512cd),
+    IX86_ATTR_ISA ("mmx",	OPT_mmmx),
+    IX86_ATTR_ISA ("pclmul",	OPT_mpclmul),
+    IX86_ATTR_ISA ("popcnt",	OPT_mpopcnt),
+    IX86_ATTR_ISA ("sse",	OPT_msse),
+    IX86_ATTR_ISA ("sse2",	OPT_msse2),
+    IX86_ATTR_ISA ("sse3",	OPT_msse3),
+    IX86_ATTR_ISA ("sse4",	OPT_msse4),
+    IX86_ATTR_ISA ("sse4.1",	OPT_msse4_1),
+    IX86_ATTR_ISA ("sse4.2",	OPT_msse4_2),
+    IX86_ATTR_ISA ("sse4a",	OPT_msse4a),
+    IX86_ATTR_ISA ("ssse3",	OPT_mssse3),
+    IX86_ATTR_ISA ("fma4",	OPT_mfma4),
+    IX86_ATTR_ISA ("fma",	OPT_mfma),
+    IX86_ATTR_ISA ("xop",	OPT_mxop),
+    IX86_ATTR_ISA ("lwp",	OPT_mlwp),
+    IX86_ATTR_ISA ("fsgsbase",	OPT_mfsgsbase),
+    IX86_ATTR_ISA ("rdrnd",	OPT_mrdrnd),
+    IX86_ATTR_ISA ("f16c",	OPT_mf16c),
+    IX86_ATTR_ISA ("rtm",	OPT_mrtm),
+    IX86_ATTR_ISA ("hle",	OPT_mhle),
+    IX86_ATTR_ISA ("prfchw",	OPT_mprfchw),
+    IX86_ATTR_ISA ("rdseed",	OPT_mrdseed),
+    IX86_ATTR_ISA ("adx",	OPT_madx),
+    IX86_ATTR_ISA ("fxsr",	OPT_mfxsr),
+    IX86_ATTR_ISA ("xsave",	OPT_mxsave),
+    IX86_ATTR_ISA ("xsaveopt",	OPT_mxsaveopt),
+    IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
+
+    /* enum options */
+    IX86_ATTR_ENUM ("fpmath=",	OPT_mfpmath_),
+
+    /* string options */
+    IX86_ATTR_STR ("arch=",	IX86_FUNCTION_SPECIFIC_ARCH),
+    IX86_ATTR_STR ("tune=",	IX86_FUNCTION_SPECIFIC_TUNE),
+
+    /* flag options */
+    IX86_ATTR_YES ("cld",
+		   OPT_mcld,
+		   MASK_CLD),
+
+    IX86_ATTR_NO ("fancy-math-387",
+		  OPT_mfancy_math_387,
+		  MASK_NO_FANCY_MATH_387),
+
+    IX86_ATTR_YES ("ieee-fp",
+		   OPT_mieee_fp,
+		   MASK_IEEE_FP),
+
+    IX86_ATTR_YES ("inline-all-stringops",
+		   OPT_minline_all_stringops,
+		   MASK_INLINE_ALL_STRINGOPS),
+
+    IX86_ATTR_YES ("inline-stringops-dynamically",
+		   OPT_minline_stringops_dynamically,
+		   MASK_INLINE_STRINGOPS_DYNAMICALLY),
+
+    IX86_ATTR_NO ("align-stringops",
+		  OPT_mno_align_stringops,
+		  MASK_NO_ALIGN_STRINGOPS),
+
+    IX86_ATTR_YES ("recip",
+		   OPT_mrecip,
+		   MASK_RECIP),
+
+  };
+
+  /* If this is a list, recurse to get the options.  */
+  if (TREE_CODE (args) == TREE_LIST)
+    {
+      bool ret = true;
+
+      for (; args; args = TREE_CHAIN (args))
+	if (TREE_VALUE (args)
+	    && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
+						     p_strings, opts, opts_set,
+						     enum_opts_set))
+	  ret = false;
+
+      return ret;
+    }
+
+  else if (TREE_CODE (args) != STRING_CST)
+    {
+      error ("attribute %<target%> argument not a string");
+      return false;
+    }
+
+  /* Handle multiple arguments separated by commas.  */
+  next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
+
+  while (next_optstr && *next_optstr != '\0')
+    {
+      char *p = next_optstr;
+      char *orig_p = p;
+      char *comma = strchr (next_optstr, ',');
+      const char *opt_string;
+      size_t len, opt_len;
+      int opt;
+      bool opt_set_p;
+      char ch;
+      unsigned i;
+      enum ix86_opt_type type = ix86_opt_unknown;
+      int mask = 0;
+
+      if (comma)
+	{
+	  *comma = '\0';
+	  len = comma - next_optstr;
+	  next_optstr = comma + 1;
+	}
+      else
+	{
+	  len = strlen (p);
+	  next_optstr = NULL;
+	}
+
+      /* Recognize no-xxx.  */
+      if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
+	{
+	  opt_set_p = false;
+	  p += 3;
+	  len -= 3;
+	}
+      else
+	opt_set_p = true;
+
+      /* Find the option.  */
+      ch = *p;
+      opt = N_OPTS;
+      for (i = 0; i < ARRAY_SIZE (attrs); i++)
+	{
+	  type = attrs[i].type;
+	  opt_len = attrs[i].len;
+	  if (ch == attrs[i].string[0]
+	      && ((type != ix86_opt_str && type != ix86_opt_enum)
+		  ? len == opt_len
+		  : len > opt_len)
+	      && memcmp (p, attrs[i].string, opt_len) == 0)
+	    {
+	      opt = attrs[i].opt;
+	      mask = attrs[i].mask;
+	      opt_string = attrs[i].string;
+	      break;
+	    }
+	}
+
+      /* Process the option.  */
+      if (opt == N_OPTS)
+	{
+	  error ("attribute(target(\"%s\")) is unknown", orig_p);
+	  ret = false;
+	}
+
+      else if (type == ix86_opt_isa)
+	{
+	  struct cl_decoded_option decoded;
+
+	  generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
+	  ix86_handle_option (opts, opts_set,
+			      &decoded, input_location);
+	}
+
+      else if (type == ix86_opt_yes || type == ix86_opt_no)
+	{
+	  if (type == ix86_opt_no)
+	    opt_set_p = !opt_set_p;
+
+	  if (opt_set_p)
+	    opts->x_target_flags |= mask;
+	  else
+	    opts->x_target_flags &= ~mask;
+	}
+
+      else if (type == ix86_opt_str)
+	{
+	  if (p_strings[opt])
+	    {
+	      error ("option(\"%s\") was already specified", opt_string);
+	      ret = false;
+	    }
+	  else
+	    p_strings[opt] = xstrdup (p + opt_len);
+	}
+
+      else if (type == ix86_opt_enum)
+	{
+	  bool arg_ok;
+	  int value;
+
+	  arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
+	  if (arg_ok)
+	    set_option (opts, enum_opts_set, opt, value,
+			p + opt_len, DK_UNSPECIFIED, input_location,
+			global_dc);
+	  else
+	    {
+	      error ("attribute(target(\"%s\")) is unknown", orig_p);
+	      ret = false;
+	    }
+	}
+
+      else
+	gcc_unreachable ();
+    }
+
+  return ret;
+}
+
+/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL.  */
+
+tree
+ix86_valid_target_attribute_tree (tree args,
+				  struct gcc_options *opts,
+				  struct gcc_options *opts_set)
+{
+  const char *orig_arch_string = opts->x_ix86_arch_string;
+  const char *orig_tune_string = opts->x_ix86_tune_string;
+  enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
+  int orig_tune_defaulted = ix86_tune_defaulted;
+  int orig_arch_specified = ix86_arch_specified;
+  char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
+  tree t = NULL_TREE;
+  int i;
+  struct cl_target_option *def
+    = TREE_TARGET_OPTION (target_option_default_node);
+  struct gcc_options enum_opts_set;
+
+  memset (&enum_opts_set, 0, sizeof (enum_opts_set));
+
+  /* Process each of the options on the chain.  */
+  if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
+					     opts_set, &enum_opts_set))
+    return error_mark_node;
+
+  /* If the changed options are different from the default, rerun
+     ix86_option_override_internal, and then save the options away.
+     The string options are are attribute options, and will be undone
+     when we copy the save structure.  */
+  if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
+      || opts->x_target_flags != def->x_target_flags
+      || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
+      || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
+      || enum_opts_set.x_ix86_fpmath)
+    {
+      /* If we are using the default tune= or arch=, undo the string assigned,
+	 and use the default.  */
+      if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
+	opts->x_ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
+      else if (!orig_arch_specified)
+	opts->x_ix86_arch_string = NULL;
+
+      if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
+	opts->x_ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
+      else if (orig_tune_defaulted)
+	opts->x_ix86_tune_string = NULL;
+
+      /* If fpmath= is not set, and we now have sse2 on 32-bit, use it.  */
+      if (enum_opts_set.x_ix86_fpmath)
+	opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
+      else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
+	       && TARGET_SSE_P (opts->x_ix86_isa_flags))
+	{
+	  opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
+	  opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
+	}
+
+      /* Do any overrides, such as arch=xxx, or tune=xxx support.  */
+      ix86_option_override_internal (false, opts, opts_set);
+
+      /* Add any builtin functions with the new isa if any.  */
+      ix86_add_new_builtins (opts->x_ix86_isa_flags);
+
+      /* Save the current options unless we are validating options for
+	 #pragma.  */
+      t = build_target_option_node (opts);
+
+      opts->x_ix86_arch_string = orig_arch_string;
+      opts->x_ix86_tune_string = orig_tune_string;
+      opts_set->x_ix86_fpmath = orig_fpmath_set;
+
+      /* Free up memory allocated to hold the strings */
+      for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
+	free (option_strings[i]);
+    }
+
+  return t;
+}
+
+/* Hook to validate attribute((target("string"))).  */
+
+static bool
+ix86_valid_target_attribute_p (tree fndecl,
+			       tree ARG_UNUSED (name),
+			       tree args,
+			       int ARG_UNUSED (flags))
+{
+  struct gcc_options func_options;
+  tree new_target, new_optimize;
+  bool ret = true;
+
+  /* attribute((target("default"))) does nothing, beyond
+     affecting multi-versioning.  */
+  if (TREE_VALUE (args)
+      && TREE_CODE (TREE_VALUE (args)) == STRING_CST
+      && TREE_CHAIN (args) == NULL_TREE
+      && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
+    return true;
+
+  tree old_optimize = build_optimization_node (&global_options);
+
+  /* Get the optimization options of the current function.  */  
+  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
+ 
+  if (!func_optimize)
+    func_optimize = old_optimize;
+
+  /* Init func_options.  */
+  memset (&func_options, 0, sizeof (func_options));
+  init_options_struct (&func_options, NULL);
+  lang_hooks.init_options_struct (&func_options);
+ 
+  cl_optimization_restore (&func_options,
+			   TREE_OPTIMIZATION (func_optimize));
+
+  /* Initialize func_options to the default before its target options can
+     be set.  */
+  cl_target_option_restore (&func_options,
+			    TREE_TARGET_OPTION (target_option_default_node));
+
+  new_target = ix86_valid_target_attribute_tree (args, &func_options,
+						 &global_options_set);
+
+  new_optimize = build_optimization_node (&func_options);
+
+  if (new_target == error_mark_node)
+    ret = false;
+
+  else if (fndecl && new_target)
+    {
+      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
+
+      if (old_optimize != new_optimize)
+	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
+    }
+
+  return ret;
+}
+
+
+/* Hook to determine if one function can safely inline another.  */
+
+static bool
+ix86_can_inline_p (tree caller, tree callee)
+{
+  bool ret = false;
+  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
+  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
+
+  /* If callee has no option attributes, then it is ok to inline.  */
+  if (!callee_tree)
+    ret = true;
+
+  /* If caller has no option attributes, but callee does then it is not ok to
+     inline.  */
+  else if (!caller_tree)
+    ret = false;
+
+  else
+    {
+      struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
+      struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
+
+      /* Callee's isa options should a subset of the caller's, i.e. a SSE4 function
+	 can inline a SSE2 function but a SSE2 function can't inline a SSE4
+	 function.  */
+      if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
+	  != callee_opts->x_ix86_isa_flags)
+	ret = false;
+
+      /* See if we have the same non-isa options.  */
+      else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
+	ret = false;
+
+      /* See if arch, tune, etc. are the same.  */
+      else if (caller_opts->arch != callee_opts->arch)
+	ret = false;
+
+      else if (caller_opts->tune != callee_opts->tune)
+	ret = false;
+
+      else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
+	ret = false;
+
+      else if (caller_opts->branch_cost != callee_opts->branch_cost)
+	ret = false;
+
+      else
+	ret = true;
+    }
+
+  return ret;
+}
+
+
+/* Remember the last target of ix86_set_current_function.  */
+static GTY(()) tree ix86_previous_fndecl;
+
+/* Invalidate ix86_previous_fndecl cache.  */
+void
+ix86_reset_previous_fndecl (void)
+{
+  ix86_previous_fndecl = NULL_TREE;
+}
+
+/* Establish appropriate back-end context for processing the function
+   FNDECL.  The argument might be NULL to indicate processing at top
+   level, outside of any function scope.  */
+static void
+ix86_set_current_function (tree fndecl)
+{
+  /* Only change the context if the function changes.  This hook is called
+     several times in the course of compiling a function, and we don't want to
+     slow things down too much or call target_reinit when it isn't safe.  */
+  if (fndecl && fndecl != ix86_previous_fndecl)
+    {
+      tree old_tree = (ix86_previous_fndecl
+		       ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
+		       : NULL_TREE);
+
+      tree new_tree = (fndecl
+		       ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
+		       : NULL_TREE);
+
+      ix86_previous_fndecl = fndecl;
+      if (old_tree == new_tree)
+	;
+
+      else if (new_tree)
+	{
+	  cl_target_option_restore (&global_options,
+				    TREE_TARGET_OPTION (new_tree));
+	  if (TREE_TARGET_GLOBALS (new_tree))
+	    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
+	  else
+	    TREE_TARGET_GLOBALS (new_tree)
+	      = save_target_globals_default_opts ();
+	}
+
+      else if (old_tree)
+	{
+	  new_tree = target_option_current_node;
+	  cl_target_option_restore (&global_options,
+				    TREE_TARGET_OPTION (new_tree));
+	  if (TREE_TARGET_GLOBALS (new_tree))
+	    restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
+	  else if (new_tree == target_option_default_node)
+	    restore_target_globals (&default_target_globals);
+	  else
+	    TREE_TARGET_GLOBALS (new_tree)
+	      = save_target_globals_default_opts ();
+	}
+    }
+}
+
+
+/* Return true if this goes in large data/bss.  */
+
+static bool
+ix86_in_large_data_p (tree exp)
+{
+  if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
+    return false;
+
+  /* Functions are never large data.  */
+  if (TREE_CODE (exp) == FUNCTION_DECL)
+    return false;
+
+  if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
+    {
+      const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
+      if (strcmp (section, ".ldata") == 0
+	  || strcmp (section, ".lbss") == 0)
+	return true;
+      return false;
+    }
+  else
+    {
+      HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
+
+      /* If this is an incomplete type with size 0, then we can't put it
+	 in data because it might be too big when completed.  */
+      if (!size || size > ix86_section_threshold)
+	return true;
+    }
+
+  return false;
+}
+
+/* Switch to the appropriate section for output of DECL.
+   DECL is either a `VAR_DECL' node or a constant of some sort.
+   RELOC indicates whether forming the initial value of DECL requires
+   link-time relocations.  */
+
+ATTRIBUTE_UNUSED static section *
+x86_64_elf_select_section (tree decl, int reloc,
+			   unsigned HOST_WIDE_INT align)
+{
+  if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
+      && ix86_in_large_data_p (decl))
+    {
+      const char *sname = NULL;
+      unsigned int flags = SECTION_WRITE;
+      switch (categorize_decl_for_section (decl, reloc))
+	{
+	case SECCAT_DATA:
+	  sname = ".ldata";
+	  break;
+	case SECCAT_DATA_REL:
+	  sname = ".ldata.rel";
+	  break;
+	case SECCAT_DATA_REL_LOCAL:
+	  sname = ".ldata.rel.local";
+	  break;
+	case SECCAT_DATA_REL_RO:
+	  sname = ".ldata.rel.ro";
+	  break;
+	case SECCAT_DATA_REL_RO_LOCAL:
+	  sname = ".ldata.rel.ro.local";
+	  break;
+	case SECCAT_BSS:
+	  sname = ".lbss";
+	  flags |= SECTION_BSS;
+	  break;
+	case SECCAT_RODATA:
+	case SECCAT_RODATA_MERGE_STR:
+	case SECCAT_RODATA_MERGE_STR_INIT:
+	case SECCAT_RODATA_MERGE_CONST:
+	  sname = ".lrodata";
+	  flags = 0;
+	  break;
+	case SECCAT_SRODATA:
+	case SECCAT_SDATA:
+	case SECCAT_SBSS:
+	  gcc_unreachable ();
+	case SECCAT_TEXT:
+	case SECCAT_TDATA:
+	case SECCAT_TBSS:
+	  /* We don't split these for medium model.  Place them into
+	     default sections and hope for best.  */
+	  break;
+	}
+      if (sname)
+	{
+	  /* We might get called with string constants, but get_named_section
+	     doesn't like them as they are not DECLs.  Also, we need to set
+	     flags in that case.  */
+	  if (!DECL_P (decl))
+	    return get_section (sname, flags, NULL);
+	  return get_named_section (decl, sname, reloc);
+	}
+    }
+  return default_elf_select_section (decl, reloc, align);
+}
+
+/* Select a set of attributes for section NAME based on the properties
+   of DECL and whether or not RELOC indicates that DECL's initializer
+   might contain runtime relocations.  */
+
+static unsigned int ATTRIBUTE_UNUSED
+x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
+{
+  unsigned int flags = default_section_type_flags (decl, name, reloc);
+
+  if (decl == NULL_TREE
+      && (strcmp (name, ".ldata.rel.ro") == 0
+	  || strcmp (name, ".ldata.rel.ro.local") == 0))
+    flags |= SECTION_RELRO;
+
+  if (strcmp (name, ".lbss") == 0
+      || strncmp (name, ".lbss.", 5) == 0
+      || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
+    flags |= SECTION_BSS;
+
+  return flags;
+}
+
+/* Build up a unique section name, expressed as a
+   STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
+   RELOC indicates whether the initial value of EXP requires
+   link-time relocations.  */
+
+static void ATTRIBUTE_UNUSED
+x86_64_elf_unique_section (tree decl, int reloc)
+{
+  if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
+      && ix86_in_large_data_p (decl))
+    {
+      const char *prefix = NULL;
+      /* We only need to use .gnu.linkonce if we don't have COMDAT groups.  */
+      bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
+
+      switch (categorize_decl_for_section (decl, reloc))
+	{
+	case SECCAT_DATA:
+	case SECCAT_DATA_REL:
+	case SECCAT_DATA_REL_LOCAL:
+	case SECCAT_DATA_REL_RO:
+	case SECCAT_DATA_REL_RO_LOCAL:
+          prefix = one_only ? ".ld" : ".ldata";
+	  break;
+	case SECCAT_BSS:
+          prefix = one_only ? ".lb" : ".lbss";
+	  break;
+	case SECCAT_RODATA:
+	case SECCAT_RODATA_MERGE_STR:
+	case SECCAT_RODATA_MERGE_STR_INIT:
+	case SECCAT_RODATA_MERGE_CONST:
+          prefix = one_only ? ".lr" : ".lrodata";
+	  break;
+	case SECCAT_SRODATA:
+	case SECCAT_SDATA:
+	case SECCAT_SBSS:
+	  gcc_unreachable ();
+	case SECCAT_TEXT:
+	case SECCAT_TDATA:
+	case SECCAT_TBSS:
+	  /* We don't split these for medium model.  Place them into
+	     default sections and hope for best.  */
+	  break;
+	}
+      if (prefix)
+	{
+	  const char *name, *linkonce;
+	  char *string;
+
+	  name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
+	  name = targetm.strip_name_encoding (name);
+
+	  /* If we're using one_only, then there needs to be a .gnu.linkonce
+     	     prefix to the section name.  */
+	  linkonce = one_only ? ".gnu.linkonce" : "";
+
+	  string = ACONCAT ((linkonce, prefix, ".", name, NULL));
+
+	  DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
+	  return;
+	}
+    }
+  default_unique_section (decl, reloc);
+}
+
+#ifdef COMMON_ASM_OP
+/* This says how to output assembler code to declare an
+   uninitialized external linkage data object.
+
+   For medium model x86-64 we need to use .largecomm opcode for
+   large objects.  */
+void
+x86_elf_aligned_common (FILE *file,
+			const char *name, unsigned HOST_WIDE_INT size,
+			int align)
+{
+  if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
+      && size > (unsigned int)ix86_section_threshold)
+    fputs (".largecomm\t", file);
+  else
+    fputs (COMMON_ASM_OP, file);
+  assemble_name (file, name);
+  fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
+	   size, align / BITS_PER_UNIT);
+}
+#endif
+
+/* Utility function for targets to use in implementing
+   ASM_OUTPUT_ALIGNED_BSS.  */
+
+void
+x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
+			const char *name, unsigned HOST_WIDE_INT size,
+			int align)
+{
+  if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
+      && size > (unsigned int)ix86_section_threshold)
+    switch_to_section (get_named_section (decl, ".lbss", 0));
+  else
+    switch_to_section (bss_section);
+  ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
+#ifdef ASM_DECLARE_OBJECT_NAME
+  last_assemble_variable_decl = decl;
+  ASM_DECLARE_OBJECT_NAME (file, name, decl);
+#else
+  /* Standard thing is just output label for the object.  */
+  ASM_OUTPUT_LABEL (file, name);
+#endif /* ASM_DECLARE_OBJECT_NAME */
+  ASM_OUTPUT_SKIP (file, size ? size : 1);
+}
+
+/* Decide whether we must probe the stack before any space allocation
+   on this target.  It's essentially TARGET_STACK_PROBE except when
+   -fstack-check causes the stack to be already probed differently.  */
+
+bool
+ix86_target_stack_probe (void)
+{
+  /* Do not probe the stack twice if static stack checking is enabled.  */
+  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
+    return false;
+
+  return TARGET_STACK_PROBE;
+}
+
+/* Decide whether we can make a sibling call to a function.  DECL is the
+   declaration of the function being targeted by the call and EXP is the
+   CALL_EXPR representing the call.  */
+
+static bool
+ix86_function_ok_for_sibcall (tree decl, tree exp)
+{
+  tree type, decl_or_type;
+  rtx a, b;
+
+  /* If we are generating position-independent code, we cannot sibcall
+     optimize any indirect call, or a direct call to a global function,
+     as the PLT requires %ebx be live. (Darwin does not have a PLT.)  */
+  if (!TARGET_MACHO
+      && !TARGET_64BIT
+      && flag_pic
+      && (!decl || !targetm.binds_local_p (decl)))
+    return false;
+
+  /* If we need to align the outgoing stack, then sibcalling would
+     unalign the stack, which may break the called function.  */
+  if (ix86_minimum_incoming_stack_boundary (true)
+      < PREFERRED_STACK_BOUNDARY)
+    return false;
+
+  if (decl)
+    {
+      decl_or_type = decl;
+      type = TREE_TYPE (decl);
+    }
+  else
+    {
+      /* We're looking at the CALL_EXPR, we need the type of the function.  */
+      type = CALL_EXPR_FN (exp);		/* pointer expression */
+      type = TREE_TYPE (type);			/* pointer type */
+      type = TREE_TYPE (type);			/* function type */
+      decl_or_type = type;
+    }
+
+  /* Check that the return value locations are the same.  Like
+     if we are returning floats on the 80387 register stack, we cannot
+     make a sibcall from a function that doesn't return a float to a
+     function that does or, conversely, from a function that does return
+     a float to a function that doesn't; the necessary stack adjustment
+     would not be executed.  This is also the place we notice
+     differences in the return value ABI.  Note that it is ok for one
+     of the functions to have void return type as long as the return
+     value of the other is passed in a register.  */
+  a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
+  b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
+			   cfun->decl, false);
+  if (STACK_REG_P (a) || STACK_REG_P (b))
+    {
+      if (!rtx_equal_p (a, b))
+	return false;
+    }
+  else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
+    ;
+  else if (!rtx_equal_p (a, b))
+    return false;
+
+  if (TARGET_64BIT)
+    {
+      /* The SYSV ABI has more call-clobbered registers;
+	 disallow sibcalls from MS to SYSV.  */
+      if (cfun->machine->call_abi == MS_ABI
+	  && ix86_function_type_abi (type) == SYSV_ABI)
+	return false;
+    }
+  else
+    {
+      /* If this call is indirect, we'll need to be able to use a
+	 call-clobbered register for the address of the target function.
+	 Make sure that all such registers are not used for passing
+	 parameters.  Note that DLLIMPORT functions are indirect.  */
+      if (!decl
+	  || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
+	{
+	  if (ix86_function_regparm (type, NULL) >= 3)
+	    {
+	      /* ??? Need to count the actual number of registers to be used,
+		 not the possible number of registers.  Fix later.  */
+	      return false;
+	    }
+	}
+    }
+
+  /* Otherwise okay.  That also includes certain types of indirect calls.  */
+  return true;
+}
+
+/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
+   and "sseregparm" calling convention attributes;
+   arguments as in struct attribute_spec.handler.  */
+
+static tree
+ix86_handle_cconv_attribute (tree *node, tree name,
+				   tree args,
+				   int flags ATTRIBUTE_UNUSED,
+				   bool *no_add_attrs)
+{
+  if (TREE_CODE (*node) != FUNCTION_TYPE
+      && TREE_CODE (*node) != METHOD_TYPE
+      && TREE_CODE (*node) != FIELD_DECL
+      && TREE_CODE (*node) != TYPE_DECL)
+    {
+      warning (OPT_Wattributes, "%qE attribute only applies to functions",
+	       name);
+      *no_add_attrs = true;
+      return NULL_TREE;
+    }
+
+  /* Can combine regparm with all attributes but fastcall, and thiscall.  */
+  if (is_attribute_p ("regparm", name))
+    {
+      tree cst;
+
+      if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
+        {
+	  error ("fastcall and regparm attributes are not compatible");
+	}
+
+      if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
+	{
+	  error ("regparam and thiscall attributes are not compatible");
+	}
+
+      cst = TREE_VALUE (args);
+      if (TREE_CODE (cst) != INTEGER_CST)
+	{
+	  warning (OPT_Wattributes,
+		   "%qE attribute requires an integer constant argument",
+		   name);
+	  *no_add_attrs = true;
+	}
+      else if (compare_tree_int (cst, REGPARM_MAX) > 0)
+	{
+	  warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
+		   name, REGPARM_MAX);
+	  *no_add_attrs = true;
+	}
+
+      return NULL_TREE;
+    }
+
+  if (TARGET_64BIT)
+    {
+      /* Do not warn when emulating the MS ABI.  */
+      if ((TREE_CODE (*node) != FUNCTION_TYPE
+	   && TREE_CODE (*node) != METHOD_TYPE)
+	  || ix86_function_type_abi (*node) != MS_ABI)
+	warning (OPT_Wattributes, "%qE attribute ignored",
+	         name);
+      *no_add_attrs = true;
+      return NULL_TREE;
+    }
+
+  /* Can combine fastcall with stdcall (redundant) and sseregparm.  */
+  if (is_attribute_p ("fastcall", name))
+    {
+      if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
+        {
+	  error ("fastcall and cdecl attributes are not compatible");
+	}
+      if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
+        {
+	  error ("fastcall and stdcall attributes are not compatible");
+	}
+      if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
+        {
+	  error ("fastcall and regparm attributes are not compatible");
+	}
+      if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
+	{
+	  error ("fastcall and thiscall attributes are not compatible");
+	}
+    }
+
+  /* Can combine stdcall with fastcall (redundant), regparm and
+     sseregparm.  */
+  else if (is_attribute_p ("stdcall", name))
+    {
+      if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
+        {
+	  error ("stdcall and cdecl attributes are not compatible");
+	}
+      if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
+        {
+	  error ("stdcall and fastcall attributes are not compatible");
+	}
+      if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
+	{
+	  error ("stdcall and thiscall attributes are not compatible");
+	}
+    }
+
+  /* Can combine cdecl with regparm and sseregparm.  */
+  else if (is_attribute_p ("cdecl", name))
+    {
+      if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
+        {
+	  error ("stdcall and cdecl attributes are not compatible");
+	}
+      if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
+        {
+	  error ("fastcall and cdecl attributes are not compatible");
+	}
+      if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
+	{
+	  error ("cdecl and thiscall attributes are not compatible");
+	}
+    }
+  else if (is_attribute_p ("thiscall", name))
+    {
+      if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
+	warning (OPT_Wattributes, "%qE attribute is used for none class-method",
+	         name);
+      if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
+	{
+	  error ("stdcall and thiscall attributes are not compatible");
+	}
+      if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
+	{
+	  error ("fastcall and thiscall attributes are not compatible");
+	}
+      if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
+	{
+	  error ("cdecl and thiscall attributes are not compatible");
+	}
+    }
+
+  /* Can combine sseregparm with all attributes.  */
+
+  return NULL_TREE;
+}
+
+/* The transactional memory builtins are implicitly regparm or fastcall
+   depending on the ABI.  Override the generic do-nothing attribute that
+   these builtins were declared with, and replace it with one of the two
+   attributes that we expect elsewhere.  */
+
+static tree
+ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
+    				  tree args ATTRIBUTE_UNUSED,
+				  int flags, bool *no_add_attrs)
+{
+  tree alt;
+
+  /* In no case do we want to add the placeholder attribute.  */
+  *no_add_attrs = true;
+
+  /* The 64-bit ABI is unchanged for transactional memory.  */
+  if (TARGET_64BIT)
+    return NULL_TREE;
+
+  /* ??? Is there a better way to validate 32-bit windows?  We have
+     cfun->machine->call_abi, but that seems to be set only for 64-bit.  */
+  if (CHECK_STACK_LIMIT > 0)
+    alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
+  else
+    {
+      alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
+      alt = tree_cons (get_identifier ("regparm"), alt, NULL);
+    }
+  decl_attributes (node, alt, flags);
+
+  return NULL_TREE;
+}
+
+/* This function determines from TYPE the calling-convention.  */
+
+unsigned int
+ix86_get_callcvt (const_tree type)
+{
+  unsigned int ret = 0;
+  bool is_stdarg;
+  tree attrs;
+
+  if (TARGET_64BIT)
+    return IX86_CALLCVT_CDECL;
+
+  attrs = TYPE_ATTRIBUTES (type);
+  if (attrs != NULL_TREE)
+    {
+      if (lookup_attribute ("cdecl", attrs))
+	ret |= IX86_CALLCVT_CDECL;
+      else if (lookup_attribute ("stdcall", attrs))
+	ret |= IX86_CALLCVT_STDCALL;
+      else if (lookup_attribute ("fastcall", attrs))
+	ret |= IX86_CALLCVT_FASTCALL;
+      else if (lookup_attribute ("thiscall", attrs))
+	ret |= IX86_CALLCVT_THISCALL;
+
+      /* Regparam isn't allowed for thiscall and fastcall.  */
+      if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
+	{
+	  if (lookup_attribute ("regparm", attrs))
+	    ret |= IX86_CALLCVT_REGPARM;
+	  if (lookup_attribute ("sseregparm", attrs))
+	    ret |= IX86_CALLCVT_SSEREGPARM;
+	}
+
+      if (IX86_BASE_CALLCVT(ret) != 0)
+	return ret;
+    }
+
+  is_stdarg = stdarg_p (type);
+  if (TARGET_RTD && !is_stdarg)
+    return IX86_CALLCVT_STDCALL | ret;
+
+  if (ret != 0
+      || is_stdarg
+      || TREE_CODE (type) != METHOD_TYPE
+      || ix86_function_type_abi (type) != MS_ABI)
+    return IX86_CALLCVT_CDECL | ret;
+
+  return IX86_CALLCVT_THISCALL;
+}
+
+/* Return 0 if the attributes for two types are incompatible, 1 if they
+   are compatible, and 2 if they are nearly compatible (which causes a
+   warning to be generated).  */
+
+static int
+ix86_comp_type_attributes (const_tree type1, const_tree type2)
+{
+  unsigned int ccvt1, ccvt2;
+
+  if (TREE_CODE (type1) != FUNCTION_TYPE
+      && TREE_CODE (type1) != METHOD_TYPE)
+    return 1;
+
+  ccvt1 = ix86_get_callcvt (type1);
+  ccvt2 = ix86_get_callcvt (type2);
+  if (ccvt1 != ccvt2)
+    return 0;
+  if (ix86_function_regparm (type1, NULL)
+      != ix86_function_regparm (type2, NULL))
+    return 0;
+
+  return 1;
+}
+
+/* Return the regparm value for a function with the indicated TYPE and DECL.
+   DECL may be NULL when calling function indirectly
+   or considering a libcall.  */
+
+static int
+ix86_function_regparm (const_tree type, const_tree decl)
+{
+  tree attr;
+  int regparm;
+  unsigned int ccvt;
+
+  if (TARGET_64BIT)
+    return (ix86_function_type_abi (type) == SYSV_ABI
+	    ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
+  ccvt = ix86_get_callcvt (type);
+  regparm = ix86_regparm;
+
+  if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
+    {
+      attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
+      if (attr)
+	{
+	  regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
+	  return regparm;
+	}
+    }
+  else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
+    return 2;
+  else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
+    return 1;
+
+  /* Use register calling convention for local functions when possible.  */
+  if (decl
+      && TREE_CODE (decl) == FUNCTION_DECL
+      /* Caller and callee must agree on the calling convention, so
+	 checking here just optimize means that with
+	 __attribute__((optimize (...))) caller could use regparm convention
+	 and callee not, or vice versa.  Instead look at whether the callee
+	 is optimized or not.  */
+      && opt_for_fn (decl, optimize)
+      && !(profile_flag && !flag_fentry))
+    {
+      /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified.  */
+      struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
+      if (i && i->local && i->can_change_signature)
+	{
+	  int local_regparm, globals = 0, regno;
+
+	  /* Make sure no regparm register is taken by a
+	     fixed register variable.  */
+	  for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
+	    if (fixed_regs[local_regparm])
+	      break;
+
+	  /* We don't want to use regparm(3) for nested functions as
+	     these use a static chain pointer in the third argument.  */
+	  if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
+	    local_regparm = 2;
+
+	  /* In 32-bit mode save a register for the split stack.  */
+	  if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
+	    local_regparm = 2;
+
+	  /* Each fixed register usage increases register pressure,
+	     so less registers should be used for argument passing.
+	     This functionality can be overriden by an explicit
+	     regparm value.  */
+	  for (regno = AX_REG; regno <= DI_REG; regno++)
+	    if (fixed_regs[regno])
+	      globals++;
+
+	  local_regparm
+	    = globals < local_regparm ? local_regparm - globals : 0;
+
+	  if (local_regparm > regparm)
+	    regparm = local_regparm;
+	}
+    }
+
+  return regparm;
+}
+
+/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
+   DFmode (2) arguments in SSE registers for a function with the
+   indicated TYPE and DECL.  DECL may be NULL when calling function
+   indirectly or considering a libcall.  Otherwise return 0.  */
+
+static int
+ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
+{
+  gcc_assert (!TARGET_64BIT);
+
+  /* Use SSE registers to pass SFmode and DFmode arguments if requested
+     by the sseregparm attribute.  */
+  if (TARGET_SSEREGPARM
+      || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
+    {
+      if (!TARGET_SSE)
+	{
+	  if (warn)
+	    {
+	      if (decl)
+		error ("calling %qD with attribute sseregparm without "
+		       "SSE/SSE2 enabled", decl);
+	      else
+		error ("calling %qT with attribute sseregparm without "
+		       "SSE/SSE2 enabled", type);
+	    }
+	  return 0;
+	}
+
+      return 2;
+    }
+
+  /* For local functions, pass up to SSE_REGPARM_MAX SFmode
+     (and DFmode for SSE2) arguments in SSE registers.  */
+  if (decl && TARGET_SSE_MATH && optimize
+      && !(profile_flag && !flag_fentry))
+    {
+      /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified.  */
+      struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
+      if (i && i->local && i->can_change_signature)
+	return TARGET_SSE2 ? 2 : 1;
+    }
+
+  return 0;
+}
+
+/* Return true if EAX is live at the start of the function.  Used by
+   ix86_expand_prologue to determine if we need special help before
+   calling allocate_stack_worker.  */
+
+static bool
+ix86_eax_live_at_start_p (void)
+{
+  /* Cheat.  Don't bother working forward from ix86_function_regparm
+     to the function type to whether an actual argument is located in
+     eax.  Instead just look at cfg info, which is still close enough
+     to correct at this point.  This gives false positives for broken
+     functions that might use uninitialized data that happens to be
+     allocated in eax, but who cares?  */
+  return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
+}
+
+static bool
+ix86_keep_aggregate_return_pointer (tree fntype)
+{
+  tree attr;
+
+  if (!TARGET_64BIT)
+    {
+      attr = lookup_attribute ("callee_pop_aggregate_return",
+			       TYPE_ATTRIBUTES (fntype));
+      if (attr)
+	return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
+
+      /* For 32-bit MS-ABI the default is to keep aggregate
+         return pointer.  */
+      if (ix86_function_type_abi (fntype) == MS_ABI)
+	return true;
+    }
+  return KEEP_AGGREGATE_RETURN_POINTER != 0;
+}
+
+/* Value is the number of bytes of arguments automatically
+   popped when returning from a subroutine call.
+   FUNDECL is the declaration node of the function (as a tree),
+   FUNTYPE is the data type of the function (as a tree),
+   or for a library call it is an identifier node for the subroutine name.
+   SIZE is the number of bytes of arguments passed on the stack.
+
+   On the 80386, the RTD insn may be used to pop them if the number
+     of args is fixed, but if the number is variable then the caller
+     must pop them all.  RTD can't be used for library calls now
+     because the library is compiled with the Unix compiler.
+   Use of RTD is a selectable option, since it is incompatible with
+   standard Unix calling sequences.  If the option is not selected,
+   the caller must always pop the args.
+
+   The attribute stdcall is equivalent to RTD on a per module basis.  */
+
+static int
+ix86_return_pops_args (tree fundecl, tree funtype, int size)
+{
+  unsigned int ccvt;
+
+  /* None of the 64-bit ABIs pop arguments.  */
+  if (TARGET_64BIT)
+    return 0;
+
+  ccvt = ix86_get_callcvt (funtype);
+
+  if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
+	       | IX86_CALLCVT_THISCALL)) != 0
+      && ! stdarg_p (funtype))
+    return size;
+
+  /* Lose any fake structure return argument if it is passed on the stack.  */
+  if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
+      && !ix86_keep_aggregate_return_pointer (funtype))
+    {
+      int nregs = ix86_function_regparm (funtype, fundecl);
+      if (nregs == 0)
+	return GET_MODE_SIZE (Pmode);
+    }
+
+  return 0;
+}
+
+/* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook.  */
+
+static bool
+ix86_legitimate_combined_insn (rtx insn)
+{
+  /* Check operand constraints in case hard registers were propagated
+     into insn pattern.  This check prevents combine pass from
+     generating insn patterns with invalid hard register operands.
+     These invalid insns can eventually confuse reload to error out
+     with a spill failure.  See also PRs 46829 and 46843.  */
+  if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
+    {
+      int i;
+
+      extract_insn (insn);
+      preprocess_constraints ();
+
+      for (i = 0; i < recog_data.n_operands; i++)
+	{
+	  rtx op = recog_data.operand[i];
+	  enum machine_mode mode = GET_MODE (op);
+	  struct operand_alternative *op_alt;
+	  int offset = 0;
+	  bool win;
+	  int j;
+
+	  /* For pre-AVX disallow unaligned loads/stores where the
+	     instructions don't support it.  */
+	  if (!TARGET_AVX
+	      && VECTOR_MODE_P (GET_MODE (op))
+	      && misaligned_operand (op, GET_MODE (op)))
+	    {
+	      int min_align = get_attr_ssememalign (insn);
+	      if (min_align == 0)
+		return false;
+	    }
+
+	  /* A unary operator may be accepted by the predicate, but it
+	     is irrelevant for matching constraints.  */
+	  if (UNARY_P (op))
+	    op = XEXP (op, 0);
+
+	  if (GET_CODE (op) == SUBREG)
+	    {
+	      if (REG_P (SUBREG_REG (op))
+		  && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
+		offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
+					      GET_MODE (SUBREG_REG (op)),
+					      SUBREG_BYTE (op),
+					      GET_MODE (op));
+	      op = SUBREG_REG (op);
+	    }
+
+	  if (!(REG_P (op) && HARD_REGISTER_P (op)))
+	    continue;
+
+	  op_alt = recog_op_alt[i];
+
+	  /* Operand has no constraints, anything is OK.  */
+ 	  win = !recog_data.n_alternatives;
+
+	  for (j = 0; j < recog_data.n_alternatives; j++)
+	    {
+	      if (op_alt[j].anything_ok
+		  || (op_alt[j].matches != -1
+		      && operands_match_p
+			  (recog_data.operand[i],
+			   recog_data.operand[op_alt[j].matches]))
+		  || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
+		{
+		  win = true;
+		  break;
+		}
+	    }
+
+	  if (!win)
+	    return false;
+	}
+    }
+
+  return true;
+}
+
+/* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
+
+static unsigned HOST_WIDE_INT
+ix86_asan_shadow_offset (void)
+{
+  return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
+				     : HOST_WIDE_INT_C (0x7fff8000))
+		     : (HOST_WIDE_INT_1 << 29);
+}
+
+/* Argument support functions.  */
+
+/* Return true when register may be used to pass function parameters.  */
+bool
+ix86_function_arg_regno_p (int regno)
+{
+  int i;
+  const int *parm_regs;
+
+  if (!TARGET_64BIT)
+    {
+      if (TARGET_MACHO)
+        return (regno < REGPARM_MAX
+                || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
+      else
+        return (regno < REGPARM_MAX
+	        || (TARGET_MMX && MMX_REGNO_P (regno)
+	  	    && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
+	        || (TARGET_SSE && SSE_REGNO_P (regno)
+		    && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
+    }
+
+  if (TARGET_SSE && SSE_REGNO_P (regno)
+      && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
+    return true;
+
+  /* TODO: The function should depend on current function ABI but
+     builtins.c would need updating then. Therefore we use the
+     default ABI.  */
+
+  /* RAX is used as hidden argument to va_arg functions.  */
+  if (ix86_abi == SYSV_ABI && regno == AX_REG)
+    return true;
+
+  if (ix86_abi == MS_ABI)
+    parm_regs = x86_64_ms_abi_int_parameter_registers;
+  else
+    parm_regs = x86_64_int_parameter_registers;
+  for (i = 0; i < (ix86_abi == MS_ABI
+		   ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
+    if (regno == parm_regs[i])
+      return true;
+  return false;
+}
+
+/* Return if we do not know how to pass TYPE solely in registers.  */
+
+static bool
+ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
+{
+  if (must_pass_in_stack_var_size_or_pad (mode, type))
+    return true;
+
+  /* For 32-bit, we want TImode aggregates to go on the stack.  But watch out!
+     The layout_type routine is crafty and tries to trick us into passing
+     currently unsupported vector types on the stack by using TImode.  */
+  return (!TARGET_64BIT && mode == TImode
+	  && type && TREE_CODE (type) != VECTOR_TYPE);
+}
+
+/* It returns the size, in bytes, of the area reserved for arguments passed
+   in registers for the function represented by fndecl dependent to the used
+   abi format.  */
+int
+ix86_reg_parm_stack_space (const_tree fndecl)
+{
+  enum calling_abi call_abi = SYSV_ABI;
+  if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
+    call_abi = ix86_function_abi (fndecl);
+  else
+    call_abi = ix86_function_type_abi (fndecl);
+  if (TARGET_64BIT && call_abi == MS_ABI)
+    return 32;
+  return 0;
+}
+
+/* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
+   call abi used.  */
+enum calling_abi
+ix86_function_type_abi (const_tree fntype)
+{
+  if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
+    {
+      enum calling_abi abi = ix86_abi;
+      if (abi == SYSV_ABI)
+	{
+	  if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
+	    abi = MS_ABI;
+	}
+      else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
+	abi = SYSV_ABI;
+      return abi;
+    }
+  return ix86_abi;
+}
+
+/* We add this as a workaround in order to use libc_has_function
+   hook in i386.md.  */
+bool
+ix86_libc_has_function (enum function_class fn_class)
+{
+  return targetm.libc_has_function (fn_class);
+}
+
+static bool
+ix86_function_ms_hook_prologue (const_tree fn)
+{
+  if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
+    {
+      if (decl_function_context (fn) != NULL_TREE)
+	error_at (DECL_SOURCE_LOCATION (fn),
+		  "ms_hook_prologue is not compatible with nested function");
+      else
+        return true;
+    }
+  return false;
+}
+
+static enum calling_abi
+ix86_function_abi (const_tree fndecl)
+{
+  if (! fndecl)
+    return ix86_abi;
+  return ix86_function_type_abi (TREE_TYPE (fndecl));
+}
+
+/* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
+   call abi used.  */
+enum calling_abi
+ix86_cfun_abi (void)
+{
+  if (! cfun)
+    return ix86_abi;
+  return cfun->machine->call_abi;
+}
+
+/* Write the extra assembler code needed to declare a function properly.  */
+
+void
+ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
+				tree decl)
+{
+  bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
+
+  if (is_ms_hook)
+    {
+      int i, filler_count = (TARGET_64BIT ? 32 : 16);
+      unsigned int filler_cc = 0xcccccccc;
+
+      for (i = 0; i < filler_count; i += 4)
+        fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
+    }
+
+#ifdef SUBTARGET_ASM_UNWIND_INIT
+  SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
+#endif
+
+  ASM_OUTPUT_LABEL (asm_out_file, fname);
+
+  /* Output magic byte marker, if hot-patch attribute is set.  */
+  if (is_ms_hook)
+    {
+      if (TARGET_64BIT)
+	{
+	  /* leaq [%rsp + 0], %rsp  */
+	  asm_fprintf (asm_out_file, ASM_BYTE
+		       "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
+	}
+      else
+	{
+          /* movl.s %edi, %edi
+	     push   %ebp
+	     movl.s %esp, %ebp */
+	  asm_fprintf (asm_out_file, ASM_BYTE
+		       "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
+	}
+    }
+}
+
+/* regclass.c  */
+extern void init_regs (void);
+
+/* Implementation of call abi switching target hook. Specific to FNDECL
+   the specific call register sets are set.  See also
+   ix86_conditional_register_usage for more details.  */
+void
+ix86_call_abi_override (const_tree fndecl)
+{
+  if (fndecl == NULL_TREE)
+    cfun->machine->call_abi = ix86_abi;
+  else
+    cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
+}
+
+/* 64-bit MS and SYSV ABI have different set of call used registers.  Avoid
+   expensive re-initialization of init_regs each time we switch function context
+   since this is needed only during RTL expansion.  */
+static void
+ix86_maybe_switch_abi (void)
+{
+  if (TARGET_64BIT &&
+      call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
+    reinit_regs ();
+}
+
+/* Initialize a variable CUM of type CUMULATIVE_ARGS
+   for a call to a function whose data type is FNTYPE.
+   For a library call, FNTYPE is 0.  */
+
+void
+init_cumulative_args (CUMULATIVE_ARGS *cum,  /* Argument info to initialize */
+		      tree fntype,	/* tree ptr for function decl */
+		      rtx libname,	/* SYMBOL_REF of library name or 0 */
+		      tree fndecl,
+		      int caller)
+{
+  struct cgraph_local_info *i;
+
+  memset (cum, 0, sizeof (*cum));
+
+  if (fndecl)
+    {
+      i = cgraph_local_info (fndecl);
+      cum->call_abi = ix86_function_abi (fndecl);
+    }
+  else
+    {
+      i = NULL;
+      cum->call_abi = ix86_function_type_abi (fntype);
+    }
+
+  cum->caller = caller;
+
+  /* Set up the number of registers to use for passing arguments.  */
+  cum->nregs = ix86_regparm;
+  if (TARGET_64BIT)
+    {
+      cum->nregs = (cum->call_abi == SYSV_ABI
+                   ? X86_64_REGPARM_MAX
+                   : X86_64_MS_REGPARM_MAX);
+    }
+  if (TARGET_SSE)
+    {
+      cum->sse_nregs = SSE_REGPARM_MAX;
+      if (TARGET_64BIT)
+        {
+          cum->sse_nregs = (cum->call_abi == SYSV_ABI
+                           ? X86_64_SSE_REGPARM_MAX
+                           : X86_64_MS_SSE_REGPARM_MAX);
+        }
+    }
+  if (TARGET_MMX)
+    cum->mmx_nregs = MMX_REGPARM_MAX;
+  cum->warn_avx512f = true;
+  cum->warn_avx = true;
+  cum->warn_sse = true;
+  cum->warn_mmx = true;
+
+  /* Because type might mismatch in between caller and callee, we need to
+     use actual type of function for local calls.
+     FIXME: cgraph_analyze can be told to actually record if function uses
+     va_start so for local functions maybe_vaarg can be made aggressive
+     helping K&R code.
+     FIXME: once typesytem is fixed, we won't need this code anymore.  */
+  if (i && i->local && i->can_change_signature)
+    fntype = TREE_TYPE (fndecl);
+  cum->maybe_vaarg = (fntype
+		      ? (!prototype_p (fntype) || stdarg_p (fntype))
+		      : !libname);
+
+  if (!TARGET_64BIT)
+    {
+      /* If there are variable arguments, then we won't pass anything
+         in registers in 32-bit mode. */
+      if (stdarg_p (fntype))
+	{
+	  cum->nregs = 0;
+	  cum->sse_nregs = 0;
+	  cum->mmx_nregs = 0;
+	  cum->warn_avx512f = false;
+	  cum->warn_avx = false;
+	  cum->warn_sse = false;
+	  cum->warn_mmx = false;
+	  return;
+	}
+
+      /* Use ecx and edx registers if function has fastcall attribute,
+	 else look for regparm information.  */
+      if (fntype)
+	{
+	  unsigned int ccvt = ix86_get_callcvt (fntype);
+	  if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
+	    {
+	      cum->nregs = 1;
+	      cum->fastcall = 1; /* Same first register as in fastcall.  */
+	    }
+	  else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
+	    {
+	      cum->nregs = 2;
+	      cum->fastcall = 1;
+	    }
+	  else
+	    cum->nregs = ix86_function_regparm (fntype, fndecl);
+	}
+
+      /* Set up the number of SSE registers used for passing SFmode
+	 and DFmode arguments.  Warn for mismatching ABI.  */
+      cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
+    }
+}
+
+/* Return the "natural" mode for TYPE.  In most cases, this is just TYPE_MODE.
+   But in the case of vector types, it is some vector mode.
+
+   When we have only some of our vector isa extensions enabled, then there
+   are some modes for which vector_mode_supported_p is false.  For these
+   modes, the generic vector support in gcc will choose some non-vector mode
+   in order to implement the type.  By computing the natural mode, we'll
+   select the proper ABI location for the operand and not depend on whatever
+   the middle-end decides to do with these vector types.
+
+   The midde-end can't deal with the vector types > 16 bytes.  In this
+   case, we return the original mode and warn ABI change if CUM isn't
+   NULL. 
+
+   If INT_RETURN is true, warn ABI change if the vector mode isn't
+   available for function return value.  */
+
+static enum machine_mode
+type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
+		   bool in_return)
+{
+  enum machine_mode mode = TYPE_MODE (type);
+
+  if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
+    {
+      HOST_WIDE_INT size = int_size_in_bytes (type);
+      if ((size == 8 || size == 16 || size == 32 || size == 64)
+	  /* ??? Generic code allows us to create width 1 vectors.  Ignore.  */
+	  && TYPE_VECTOR_SUBPARTS (type) > 1)
+	{
+	  enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
+
+	  if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
+	    mode = MIN_MODE_VECTOR_FLOAT;
+	  else
+	    mode = MIN_MODE_VECTOR_INT;
+
+	  /* Get the mode which has this inner mode and number of units.  */
+	  for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
+	    if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
+		&& GET_MODE_INNER (mode) == innermode)
+	      {
+		if (size == 64 && !TARGET_AVX512F)
+		  {
+		    static bool warnedavx512f;
+		    static bool warnedavx512f_ret;
+
+		    if (cum && cum->warn_avx512f && !warnedavx512f)
+		      {
+			if (warning (OPT_Wpsabi, "AVX512F vector argument "
+				     "without AVX512F enabled changes the ABI"))
+			  warnedavx512f = true;
+		      }
+		    else if (in_return && !warnedavx512f_ret)
+		      {
+			if (warning (OPT_Wpsabi, "AVX512F vector return "
+				     "without AVX512F enabled changes the ABI"))
+			  warnedavx512f_ret = true;
+		      }
+
+		    return TYPE_MODE (type);
+		  }
+		else if (size == 32 && !TARGET_AVX)
+		  {
+		    static bool warnedavx;
+		    static bool warnedavx_ret;
+
+		    if (cum && cum->warn_avx && !warnedavx)
+		      {
+			if (warning (OPT_Wpsabi, "AVX vector argument "
+				     "without AVX enabled changes the ABI"))
+			  warnedavx = true;
+		      }
+		    else if (in_return && !warnedavx_ret)
+		      {
+			if (warning (OPT_Wpsabi, "AVX vector return "
+				     "without AVX enabled changes the ABI"))
+			  warnedavx_ret = true;
+		      }
+
+		    return TYPE_MODE (type);
+		  }
+		else if (((size == 8 && TARGET_64BIT) || size == 16)
+			 && !TARGET_SSE)
+		  {
+		    static bool warnedsse;
+		    static bool warnedsse_ret;
+
+		    if (cum && cum->warn_sse && !warnedsse)
+		      {
+			if (warning (OPT_Wpsabi, "SSE vector argument "
+				     "without SSE enabled changes the ABI"))
+			  warnedsse = true;
+		      }
+		    else if (!TARGET_64BIT && in_return && !warnedsse_ret)
+		      {
+			if (warning (OPT_Wpsabi, "SSE vector return "
+				     "without SSE enabled changes the ABI"))
+			  warnedsse_ret = true;
+		      }
+		  }
+		else if ((size == 8 && !TARGET_64BIT) && !TARGET_MMX)
+		  {
+		    static bool warnedmmx;
+		    static bool warnedmmx_ret;
+
+		    if (cum && cum->warn_mmx && !warnedmmx)
+		      {
+			if (warning (OPT_Wpsabi, "MMX vector argument "
+				     "without MMX enabled changes the ABI"))
+			  warnedmmx = true;
+		      }
+		    else if (in_return && !warnedmmx_ret)
+		      {
+			if (warning (OPT_Wpsabi, "MMX vector return "
+				     "without MMX enabled changes the ABI"))
+			  warnedmmx_ret = true;
+		      }
+		  }
+		return mode;
+	      }
+
+	  gcc_unreachable ();
+	}
+    }
+
+  return mode;
+}
+
+/* We want to pass a value in REGNO whose "natural" mode is MODE.  However,
+   this may not agree with the mode that the type system has chosen for the
+   register, which is ORIG_MODE.  If ORIG_MODE is not BLKmode, then we can
+   go ahead and use it.  Otherwise we have to build a PARALLEL instead.  */
+
+static rtx
+gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
+		     unsigned int regno)
+{
+  rtx tmp;
+
+  if (orig_mode != BLKmode)
+    tmp = gen_rtx_REG (orig_mode, regno);
+  else
+    {
+      tmp = gen_rtx_REG (mode, regno);
+      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
+      tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
+    }
+
+  return tmp;
+}
+
+/* x86-64 register passing implementation.  See x86-64 ABI for details.  Goal
+   of this code is to classify each 8bytes of incoming argument by the register
+   class and assign registers accordingly.  */
+
+/* Return the union class of CLASS1 and CLASS2.
+   See the x86-64 PS ABI for details.  */
+
+static enum x86_64_reg_class
+merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
+{
+  /* Rule #1: If both classes are equal, this is the resulting class.  */
+  if (class1 == class2)
+    return class1;
+
+  /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
+     the other class.  */
+  if (class1 == X86_64_NO_CLASS)
+    return class2;
+  if (class2 == X86_64_NO_CLASS)
+    return class1;
+
+  /* Rule #3: If one of the classes is MEMORY, the result is MEMORY.  */
+  if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
+    return X86_64_MEMORY_CLASS;
+
+  /* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */
+  if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
+      || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
+    return X86_64_INTEGERSI_CLASS;
+  if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
+      || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
+    return X86_64_INTEGER_CLASS;
+
+  /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
+     MEMORY is used.  */
+  if (class1 == X86_64_X87_CLASS
+      || class1 == X86_64_X87UP_CLASS
+      || class1 == X86_64_COMPLEX_X87_CLASS
+      || class2 == X86_64_X87_CLASS
+      || class2 == X86_64_X87UP_CLASS
+      || class2 == X86_64_COMPLEX_X87_CLASS)
+    return X86_64_MEMORY_CLASS;
+
+  /* Rule #6: Otherwise class SSE is used.  */
+  return X86_64_SSE_CLASS;
+}
+
+/* Classify the argument of type TYPE and mode MODE.
+   CLASSES will be filled by the register class used to pass each word
+   of the operand.  The number of words is returned.  In case the parameter
+   should be passed in memory, 0 is returned. As a special case for zero
+   sized containers, classes[0] will be NO_CLASS and 1 is returned.
+
+   BIT_OFFSET is used internally for handling records and specifies offset
+   of the offset in bits modulo 512 to avoid overflow cases.
+
+   See the x86-64 PS ABI for details.
+*/
+
+static int
+classify_argument (enum machine_mode mode, const_tree type,
+		   enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
+{
+  HOST_WIDE_INT bytes =
+    (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
+  int words
+    = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
+
+  /* Variable sized entities are always passed/returned in memory.  */
+  if (bytes < 0)
+    return 0;
+
+  if (mode != VOIDmode
+      && targetm.calls.must_pass_in_stack (mode, type))
+    return 0;
+
+  if (type && AGGREGATE_TYPE_P (type))
+    {
+      int i;
+      tree field;
+      enum x86_64_reg_class subclasses[MAX_CLASSES];
+
+      /* On x86-64 we pass structures larger than 64 bytes on the stack.  */
+      if (bytes > 64)
+	return 0;
+
+      for (i = 0; i < words; i++)
+	classes[i] = X86_64_NO_CLASS;
+
+      /* Zero sized arrays or structures are NO_CLASS.  We return 0 to
+	 signalize memory class, so handle it as special case.  */
+      if (!words)
+	{
+	  classes[0] = X86_64_NO_CLASS;
+	  return 1;
+	}
+
+      /* Classify each field of record and merge classes.  */
+      switch (TREE_CODE (type))
+	{
+	case RECORD_TYPE:
+	  /* And now merge the fields of structure.  */
+	  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
+	    {
+	      if (TREE_CODE (field) == FIELD_DECL)
+		{
+		  int num;
+
+		  if (TREE_TYPE (field) == error_mark_node)
+		    continue;
+
+		  /* Bitfields are always classified as integer.  Handle them
+		     early, since later code would consider them to be
+		     misaligned integers.  */
+		  if (DECL_BIT_FIELD (field))
+		    {
+		      for (i = (int_bit_position (field)
+				+ (bit_offset % 64)) / 8 / 8;
+			   i < ((int_bit_position (field) + (bit_offset % 64))
+			        + tree_to_shwi (DECL_SIZE (field))
+				+ 63) / 8 / 8; i++)
+			classes[i] =
+			  merge_classes (X86_64_INTEGER_CLASS,
+					 classes[i]);
+		    }
+		  else
+		    {
+		      int pos;
+
+		      type = TREE_TYPE (field);
+
+		      /* Flexible array member is ignored.  */
+		      if (TYPE_MODE (type) == BLKmode
+			  && TREE_CODE (type) == ARRAY_TYPE
+			  && TYPE_SIZE (type) == NULL_TREE
+			  && TYPE_DOMAIN (type) != NULL_TREE
+			  && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
+			      == NULL_TREE))
+			{
+			  static bool warned;
+
+			  if (!warned && warn_psabi)
+			    {
+			      warned = true;
+			      inform (input_location,
+				      "the ABI of passing struct with"
+				      " a flexible array member has"
+				      " changed in GCC 4.4");
+			    }
+			  continue;
+			}
+		      num = classify_argument (TYPE_MODE (type), type,
+					       subclasses,
+					       (int_bit_position (field)
+						+ bit_offset) % 512);
+		      if (!num)
+			return 0;
+		      pos = (int_bit_position (field)
+			     + (bit_offset % 64)) / 8 / 8;
+		      for (i = 0; i < num && (i + pos) < words; i++)
+			classes[i + pos] =
+			  merge_classes (subclasses[i], classes[i + pos]);
+		    }
+		}
+	    }
+	  break;
+
+	case ARRAY_TYPE:
+	  /* Arrays are handled as small records.  */
+	  {
+	    int num;
+	    num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
+				     TREE_TYPE (type), subclasses, bit_offset);
+	    if (!num)
+	      return 0;
+
+	    /* The partial classes are now full classes.  */
+	    if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
+	      subclasses[0] = X86_64_SSE_CLASS;
+	    if (subclasses[0] == X86_64_INTEGERSI_CLASS
+		&& !((bit_offset % 64) == 0 && bytes == 4))
+	      subclasses[0] = X86_64_INTEGER_CLASS;
+
+	    for (i = 0; i < words; i++)
+	      classes[i] = subclasses[i % num];
+
+	    break;
+	  }
+	case UNION_TYPE:
+	case QUAL_UNION_TYPE:
+	  /* Unions are similar to RECORD_TYPE but offset is always 0.
+	     */
+	  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
+	    {
+	      if (TREE_CODE (field) == FIELD_DECL)
+		{
+		  int num;
+
+		  if (TREE_TYPE (field) == error_mark_node)
+		    continue;
+
+		  num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
+					   TREE_TYPE (field), subclasses,
+					   bit_offset);
+		  if (!num)
+		    return 0;
+		  for (i = 0; i < num; i++)
+		    classes[i] = merge_classes (subclasses[i], classes[i]);
+		}
+	    }
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+
+      if (words > 2)
+	{
+	  /* When size > 16 bytes, if the first one isn't
+	     X86_64_SSE_CLASS or any other ones aren't
+	     X86_64_SSEUP_CLASS, everything should be passed in
+	     memory.  */
+	  if (classes[0] != X86_64_SSE_CLASS)
+	      return 0;
+
+	  for (i = 1; i < words; i++)
+	    if (classes[i] != X86_64_SSEUP_CLASS)
+	      return 0;
+	}
+
+      /* Final merger cleanup.  */
+      for (i = 0; i < words; i++)
+	{
+	  /* If one class is MEMORY, everything should be passed in
+	     memory.  */
+	  if (classes[i] == X86_64_MEMORY_CLASS)
+	    return 0;
+
+	  /* The X86_64_SSEUP_CLASS should be always preceded by
+	     X86_64_SSE_CLASS or X86_64_SSEUP_CLASS.  */
+	  if (classes[i] == X86_64_SSEUP_CLASS
+	      && classes[i - 1] != X86_64_SSE_CLASS
+	      && classes[i - 1] != X86_64_SSEUP_CLASS)
+	    {
+	      /* The first one should never be X86_64_SSEUP_CLASS.  */
+	      gcc_assert (i != 0);
+	      classes[i] = X86_64_SSE_CLASS;
+	    }
+
+	  /*  If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
+	       everything should be passed in memory.  */
+	  if (classes[i] == X86_64_X87UP_CLASS
+	      && (classes[i - 1] != X86_64_X87_CLASS))
+	    {
+	      static bool warned;
+
+	      /* The first one should never be X86_64_X87UP_CLASS.  */
+	      gcc_assert (i != 0);
+	      if (!warned && warn_psabi)
+		{
+		  warned = true;
+		  inform (input_location,
+			  "the ABI of passing union with long double"
+			  " has changed in GCC 4.4");
+		}
+	      return 0;
+	    }
+	}
+      return words;
+    }
+
+  /* Compute alignment needed.  We align all types to natural boundaries with
+     exception of XFmode that is aligned to 64bits.  */
+  if (mode != VOIDmode && mode != BLKmode)
+    {
+      int mode_alignment = GET_MODE_BITSIZE (mode);
+
+      if (mode == XFmode)
+	mode_alignment = 128;
+      else if (mode == XCmode)
+	mode_alignment = 256;
+      if (COMPLEX_MODE_P (mode))
+	mode_alignment /= 2;
+      /* Misaligned fields are always returned in memory.  */
+      if (bit_offset % mode_alignment)
+	return 0;
+    }
+
+  /* for V1xx modes, just use the base mode */
+  if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
+      && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
+    mode = GET_MODE_INNER (mode);
+
+  /* Classification of atomic types.  */
+  switch (mode)
+    {
+    case SDmode:
+    case DDmode:
+      classes[0] = X86_64_SSE_CLASS;
+      return 1;
+    case TDmode:
+      classes[0] = X86_64_SSE_CLASS;
+      classes[1] = X86_64_SSEUP_CLASS;
+      return 2;
+    case DImode:
+    case SImode:
+    case HImode:
+    case QImode:
+    case CSImode:
+    case CHImode:
+    case CQImode:
+      {
+	int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
+
+	/* Analyze last 128 bits only.  */
+	size = (size - 1) & 0x7f;
+
+	if (size < 32)
+	  {
+	    classes[0] = X86_64_INTEGERSI_CLASS;
+	    return 1;
+	  }
+	else if (size < 64)
+	  {
+	    classes[0] = X86_64_INTEGER_CLASS;
+	    return 1;
+	  }
+	else if (size < 64+32)
+	  {
+	    classes[0] = X86_64_INTEGER_CLASS;
+	    classes[1] = X86_64_INTEGERSI_CLASS;
+	    return 2;
+	  }
+	else if (size < 64+64)
+	  {
+	    classes[0] = classes[1] = X86_64_INTEGER_CLASS;
+	    return 2;
+	  }
+	else
+	  gcc_unreachable ();
+      }
+    case CDImode:
+    case TImode:
+      classes[0] = classes[1] = X86_64_INTEGER_CLASS;
+      return 2;
+    case COImode:
+    case OImode:
+      /* OImode shouldn't be used directly.  */
+      gcc_unreachable ();
+    case CTImode:
+      return 0;
+    case SFmode:
+      if (!(bit_offset % 64))
+	classes[0] = X86_64_SSESF_CLASS;
+      else
+	classes[0] = X86_64_SSE_CLASS;
+      return 1;
+    case DFmode:
+      classes[0] = X86_64_SSEDF_CLASS;
+      return 1;
+    case XFmode:
+      classes[0] = X86_64_X87_CLASS;
+      classes[1] = X86_64_X87UP_CLASS;
+      return 2;
+    case TFmode:
+      classes[0] = X86_64_SSE_CLASS;
+      classes[1] = X86_64_SSEUP_CLASS;
+      return 2;
+    case SCmode:
+      classes[0] = X86_64_SSE_CLASS;
+      if (!(bit_offset % 64))
+	return 1;
+      else
+	{
+	  static bool warned;
+
+	  if (!warned && warn_psabi)
+	    {
+	      warned = true;
+	      inform (input_location,
+		      "the ABI of passing structure with complex float"
+		      " member has changed in GCC 4.4");
+	    }
+	  classes[1] = X86_64_SSESF_CLASS;
+	  return 2;
+	}
+    case DCmode:
+      classes[0] = X86_64_SSEDF_CLASS;
+      classes[1] = X86_64_SSEDF_CLASS;
+      return 2;
+    case XCmode:
+      classes[0] = X86_64_COMPLEX_X87_CLASS;
+      return 1;
+    case TCmode:
+      /* This modes is larger than 16 bytes.  */
+      return 0;
+    case V8SFmode:
+    case V8SImode:
+    case V32QImode:
+    case V16HImode:
+    case V4DFmode:
+    case V4DImode:
+      classes[0] = X86_64_SSE_CLASS;
+      classes[1] = X86_64_SSEUP_CLASS;
+      classes[2] = X86_64_SSEUP_CLASS;
+      classes[3] = X86_64_SSEUP_CLASS;
+      return 4;
+    case V8DFmode:
+    case V16SFmode:
+    case V8DImode:
+    case V16SImode:
+    case V32HImode:
+    case V64QImode:
+      classes[0] = X86_64_SSE_CLASS;
+      classes[1] = X86_64_SSEUP_CLASS;
+      classes[2] = X86_64_SSEUP_CLASS;
+      classes[3] = X86_64_SSEUP_CLASS;
+      classes[4] = X86_64_SSEUP_CLASS;
+      classes[5] = X86_64_SSEUP_CLASS;
+      classes[6] = X86_64_SSEUP_CLASS;
+      classes[7] = X86_64_SSEUP_CLASS;
+      return 8;
+    case V4SFmode:
+    case V4SImode:
+    case V16QImode:
+    case V8HImode:
+    case V2DFmode:
+    case V2DImode:
+      classes[0] = X86_64_SSE_CLASS;
+      classes[1] = X86_64_SSEUP_CLASS;
+      return 2;
+    case V1TImode:
+    case V1DImode:
+    case V2SFmode:
+    case V2SImode:
+    case V4HImode:
+    case V8QImode:
+      classes[0] = X86_64_SSE_CLASS;
+      return 1;
+    case BLKmode:
+    case VOIDmode:
+      return 0;
+    default:
+      gcc_assert (VECTOR_MODE_P (mode));
+
+      if (bytes > 16)
+	return 0;
+
+      gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
+
+      if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
+	classes[0] = X86_64_INTEGERSI_CLASS;
+      else
+	classes[0] = X86_64_INTEGER_CLASS;
+      classes[1] = X86_64_INTEGER_CLASS;
+      return 1 + (bytes > 8);
+    }
+}
+
+/* Examine the argument and return set number of register required in each
+   class.  Return 0 iff parameter should be passed in memory.  */
+static int
+examine_argument (enum machine_mode mode, const_tree type, int in_return,
+		  int *int_nregs, int *sse_nregs)
+{
+  enum x86_64_reg_class regclass[MAX_CLASSES];
+  int n = classify_argument (mode, type, regclass, 0);
+
+  *int_nregs = 0;
+  *sse_nregs = 0;
+  if (!n)
+    return 0;
+  for (n--; n >= 0; n--)
+    switch (regclass[n])
+      {
+      case X86_64_INTEGER_CLASS:
+      case X86_64_INTEGERSI_CLASS:
+	(*int_nregs)++;
+	break;
+      case X86_64_SSE_CLASS:
+      case X86_64_SSESF_CLASS:
+      case X86_64_SSEDF_CLASS:
+	(*sse_nregs)++;
+	break;
+      case X86_64_NO_CLASS:
+      case X86_64_SSEUP_CLASS:
+	break;
+      case X86_64_X87_CLASS:
+      case X86_64_X87UP_CLASS:
+	if (!in_return)
+	  return 0;
+	break;
+      case X86_64_COMPLEX_X87_CLASS:
+	return in_return ? 2 : 0;
+      case X86_64_MEMORY_CLASS:
+	gcc_unreachable ();
+      }
+  return 1;
+}
+
+/* Construct container for the argument used by GCC interface.  See
+   FUNCTION_ARG for the detailed description.  */
+
+static rtx
+construct_container (enum machine_mode mode, enum machine_mode orig_mode,
+		     const_tree type, int in_return, int nintregs, int nsseregs,
+		     const int *intreg, int sse_regno)
+{
+  /* The following variables hold the static issued_error state.  */
+  static bool issued_sse_arg_error;
+  static bool issued_sse_ret_error;
+  static bool issued_x87_ret_error;
+
+  enum machine_mode tmpmode;
+  int bytes =
+    (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
+  enum x86_64_reg_class regclass[MAX_CLASSES];
+  int n;
+  int i;
+  int nexps = 0;
+  int needed_sseregs, needed_intregs;
+  rtx exp[MAX_CLASSES];
+  rtx ret;
+
+  n = classify_argument (mode, type, regclass, 0);
+  if (!n)
+    return NULL;
+  if (!examine_argument (mode, type, in_return, &needed_intregs,
+			 &needed_sseregs))
+    return NULL;
+  if (needed_intregs > nintregs || needed_sseregs > nsseregs)
+    return NULL;
+
+  /* We allowed the user to turn off SSE for kernel mode.  Don't crash if
+     some less clueful developer tries to use floating-point anyway.  */
+  if (needed_sseregs && !TARGET_SSE)
+    {
+      if (in_return)
+	{
+	  if (!issued_sse_ret_error)
+	    {
+	      error ("SSE register return with SSE disabled");
+	      issued_sse_ret_error = true;
+	    }
+	}
+      else if (!issued_sse_arg_error)
+	{
+	  error ("SSE register argument with SSE disabled");
+	  issued_sse_arg_error = true;
+	}
+      return NULL;
+    }
+
+  /* Likewise, error if the ABI requires us to return values in the
+     x87 registers and the user specified -mno-80387.  */
+  if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
+    for (i = 0; i < n; i++)
+      if (regclass[i] == X86_64_X87_CLASS
+	  || regclass[i] == X86_64_X87UP_CLASS
+	  || regclass[i] == X86_64_COMPLEX_X87_CLASS)
+	{
+	  if (!issued_x87_ret_error)
+	    {
+	      error ("x87 register return with x87 disabled");
+	      issued_x87_ret_error = true;
+	    }
+	  return NULL;
+	}
+
+  /* First construct simple cases.  Avoid SCmode, since we want to use
+     single register to pass this type.  */
+  if (n == 1 && mode != SCmode)
+    switch (regclass[0])
+      {
+      case X86_64_INTEGER_CLASS:
+      case X86_64_INTEGERSI_CLASS:
+	return gen_rtx_REG (mode, intreg[0]);
+      case X86_64_SSE_CLASS:
+      case X86_64_SSESF_CLASS:
+      case X86_64_SSEDF_CLASS:
+	if (mode != BLKmode)
+	  return gen_reg_or_parallel (mode, orig_mode,
+				      SSE_REGNO (sse_regno));
+	break;
+      case X86_64_X87_CLASS:
+      case X86_64_COMPLEX_X87_CLASS:
+	return gen_rtx_REG (mode, FIRST_STACK_REG);
+      case X86_64_NO_CLASS:
+	/* Zero sized array, struct or class.  */
+	return NULL;
+      default:
+	gcc_unreachable ();
+      }
+  if (n == 2
+      && regclass[0] == X86_64_SSE_CLASS
+      && regclass[1] == X86_64_SSEUP_CLASS
+      && mode != BLKmode)
+    return gen_reg_or_parallel (mode, orig_mode,
+				SSE_REGNO (sse_regno));
+  if (n == 4
+      && regclass[0] == X86_64_SSE_CLASS
+      && regclass[1] == X86_64_SSEUP_CLASS
+      && regclass[2] == X86_64_SSEUP_CLASS
+      && regclass[3] == X86_64_SSEUP_CLASS
+      && mode != BLKmode)
+    return gen_reg_or_parallel (mode, orig_mode,
+				SSE_REGNO (sse_regno));
+  if (n == 8
+      && regclass[0] == X86_64_SSE_CLASS
+      && regclass[1] == X86_64_SSEUP_CLASS
+      && regclass[2] == X86_64_SSEUP_CLASS
+      && regclass[3] == X86_64_SSEUP_CLASS
+      && regclass[4] == X86_64_SSEUP_CLASS
+      && regclass[5] == X86_64_SSEUP_CLASS
+      && regclass[6] == X86_64_SSEUP_CLASS
+      && regclass[7] == X86_64_SSEUP_CLASS
+      && mode != BLKmode)
+    return gen_reg_or_parallel (mode, orig_mode,
+				SSE_REGNO (sse_regno));
+  if (n == 2
+      && regclass[0] == X86_64_X87_CLASS
+      && regclass[1] == X86_64_X87UP_CLASS)
+    return gen_rtx_REG (XFmode, FIRST_STACK_REG);
+
+  if (n == 2
+      && regclass[0] == X86_64_INTEGER_CLASS
+      && regclass[1] == X86_64_INTEGER_CLASS
+      && (mode == CDImode || mode == TImode)
+      && intreg[0] + 1 == intreg[1])
+    return gen_rtx_REG (mode, intreg[0]);
+
+  /* Otherwise figure out the entries of the PARALLEL.  */
+  for (i = 0; i < n; i++)
+    {
+      int pos;
+
+      switch (regclass[i])
+        {
+	  case X86_64_NO_CLASS:
+	    break;
+	  case X86_64_INTEGER_CLASS:
+	  case X86_64_INTEGERSI_CLASS:
+	    /* Merge TImodes on aligned occasions here too.  */
+	    if (i * 8 + 8 > bytes)
+	      tmpmode
+		= mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
+	    else if (regclass[i] == X86_64_INTEGERSI_CLASS)
+	      tmpmode = SImode;
+	    else
+	      tmpmode = DImode;
+	    /* We've requested 24 bytes we
+	       don't have mode for.  Use DImode.  */
+	    if (tmpmode == BLKmode)
+	      tmpmode = DImode;
+	    exp [nexps++]
+	      = gen_rtx_EXPR_LIST (VOIDmode,
+				   gen_rtx_REG (tmpmode, *intreg),
+				   GEN_INT (i*8));
+	    intreg++;
+	    break;
+	  case X86_64_SSESF_CLASS:
+	    exp [nexps++]
+	      = gen_rtx_EXPR_LIST (VOIDmode,
+				   gen_rtx_REG (SFmode,
+						SSE_REGNO (sse_regno)),
+				   GEN_INT (i*8));
+	    sse_regno++;
+	    break;
+	  case X86_64_SSEDF_CLASS:
+	    exp [nexps++]
+	      = gen_rtx_EXPR_LIST (VOIDmode,
+				   gen_rtx_REG (DFmode,
+						SSE_REGNO (sse_regno)),
+				   GEN_INT (i*8));
+	    sse_regno++;
+	    break;
+	  case X86_64_SSE_CLASS:
+	    pos = i;
+	    switch (n)
+	      {
+	      case 1:
+		tmpmode = DImode;
+		break;
+	      case 2:
+		if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
+		  {
+		    tmpmode = TImode;
+		    i++;
+		  }
+		else
+		  tmpmode = DImode;
+		break;
+	      case 4:
+		gcc_assert (i == 0
+			    && regclass[1] == X86_64_SSEUP_CLASS
+			    && regclass[2] == X86_64_SSEUP_CLASS
+			    && regclass[3] == X86_64_SSEUP_CLASS);
+		tmpmode = OImode;
+		i += 3;
+		break;
+	      case 8:
+		gcc_assert (i == 0
+			    && regclass[1] == X86_64_SSEUP_CLASS
+			    && regclass[2] == X86_64_SSEUP_CLASS
+			    && regclass[3] == X86_64_SSEUP_CLASS
+			    && regclass[4] == X86_64_SSEUP_CLASS
+			    && regclass[5] == X86_64_SSEUP_CLASS
+			    && regclass[6] == X86_64_SSEUP_CLASS
+			    && regclass[7] == X86_64_SSEUP_CLASS);
+		tmpmode = XImode;
+		i += 7;
+		break;
+	      default:
+		gcc_unreachable ();
+	      }
+	    exp [nexps++]
+	      = gen_rtx_EXPR_LIST (VOIDmode,
+				   gen_rtx_REG (tmpmode,
+						SSE_REGNO (sse_regno)),
+				   GEN_INT (pos*8));
+	    sse_regno++;
+	    break;
+	  default:
+	    gcc_unreachable ();
+	}
+    }
+
+  /* Empty aligned struct, union or class.  */
+  if (nexps == 0)
+    return NULL;
+
+  ret =  gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
+  for (i = 0; i < nexps; i++)
+    XVECEXP (ret, 0, i) = exp [i];
+  return ret;
+}
+
+/* Update the data in CUM to advance over an argument of mode MODE
+   and data type TYPE.  (TYPE is null for libcalls where that information
+   may not be available.)  */
+
+static void
+function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
+			 const_tree type, HOST_WIDE_INT bytes,
+			 HOST_WIDE_INT words)
+{
+  switch (mode)
+    {
+    default:
+      break;
+
+    case BLKmode:
+      if (bytes < 0)
+	break;
+      /* FALLTHRU */
+
+    case DImode:
+    case SImode:
+    case HImode:
+    case QImode:
+      cum->words += words;
+      cum->nregs -= words;
+      cum->regno += words;
+
+      if (cum->nregs <= 0)
+	{
+	  cum->nregs = 0;
+	  cum->regno = 0;
+	}
+      break;
+
+    case OImode:
+      /* OImode shouldn't be used directly.  */
+      gcc_unreachable ();
+
+    case DFmode:
+      if (cum->float_in_sse < 2)
+	break;
+    case SFmode:
+      if (cum->float_in_sse < 1)
+	break;
+      /* FALLTHRU */
+
+    case V8SFmode:
+    case V8SImode:
+    case V64QImode:
+    case V32HImode:
+    case V16SImode:
+    case V8DImode:
+    case V16SFmode:
+    case V8DFmode:
+    case V32QImode:
+    case V16HImode:
+    case V4DFmode:
+    case V4DImode:
+    case TImode:
+    case V16QImode:
+    case V8HImode:
+    case V4SImode:
+    case V2DImode:
+    case V4SFmode:
+    case V2DFmode:
+      if (!type || !AGGREGATE_TYPE_P (type))
+	{
+	  cum->sse_words += words;
+	  cum->sse_nregs -= 1;
+	  cum->sse_regno += 1;
+	  if (cum->sse_nregs <= 0)
+	    {
+	      cum->sse_nregs = 0;
+	      cum->sse_regno = 0;
+	    }
+	}
+      break;
+
+    case V8QImode:
+    case V4HImode:
+    case V2SImode:
+    case V2SFmode:
+    case V1TImode:
+    case V1DImode:
+      if (!type || !AGGREGATE_TYPE_P (type))
+	{
+	  cum->mmx_words += words;
+	  cum->mmx_nregs -= 1;
+	  cum->mmx_regno += 1;
+	  if (cum->mmx_nregs <= 0)
+	    {
+	      cum->mmx_nregs = 0;
+	      cum->mmx_regno = 0;
+	    }
+	}
+      break;
+    }
+}
+
+static void
+function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
+			 const_tree type, HOST_WIDE_INT words, bool named)
+{
+  int int_nregs, sse_nregs;
+
+  /* Unnamed 512 and 256bit vector mode parameters are passed on stack.  */
+  if (!named && (VALID_AVX512F_REG_MODE (mode)
+		 || VALID_AVX256_REG_MODE (mode)))
+    return;
+
+  if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
+      && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
+    {
+      cum->nregs -= int_nregs;
+      cum->sse_nregs -= sse_nregs;
+      cum->regno += int_nregs;
+      cum->sse_regno += sse_nregs;
+    }
+  else
+    {
+      int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
+      cum->words = (cum->words + align - 1) & ~(align - 1);
+      cum->words += words;
+    }
+}
+
+static void
+function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
+			    HOST_WIDE_INT words)
+{
+  /* Otherwise, this should be passed indirect.  */
+  gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
+
+  cum->words += words;
+  if (cum->nregs > 0)
+    {
+      cum->nregs -= 1;
+      cum->regno += 1;
+    }
+}
+
+/* Update the data in CUM to advance over an argument of mode MODE and
+   data type TYPE.  (TYPE is null for libcalls where that information
+   may not be available.)  */
+
+static void
+ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
+			   const_tree type, bool named)
+{
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+  HOST_WIDE_INT bytes, words;
+
+  if (mode == BLKmode)
+    bytes = int_size_in_bytes (type);
+  else
+    bytes = GET_MODE_SIZE (mode);
+  words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
+
+  if (type)
+    mode = type_natural_mode (type, NULL, false);
+
+  if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
+    function_arg_advance_ms_64 (cum, bytes, words);
+  else if (TARGET_64BIT)
+    function_arg_advance_64 (cum, mode, type, words, named);
+  else
+    function_arg_advance_32 (cum, mode, type, bytes, words);
+}
+
+/* Define where to put the arguments to a function.
+   Value is zero to push the argument on the stack,
+   or a hard register in which to store the argument.
+
+   MODE is the argument's machine mode.
+   TYPE is the data type of the argument (as a tree).
+    This is null for libcalls where that information may
+    not be available.
+   CUM is a variable of type CUMULATIVE_ARGS which gives info about
+    the preceding args and about the function being called.
+   NAMED is nonzero if this argument is a named parameter
+    (otherwise it is an extra parameter matching an ellipsis).  */
+
+static rtx
+function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
+		 enum machine_mode orig_mode, const_tree type,
+		 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
+{
+  /* Avoid the AL settings for the Unix64 ABI.  */
+  if (mode == VOIDmode)
+    return constm1_rtx;
+
+  switch (mode)
+    {
+    default:
+      break;
+
+    case BLKmode:
+      if (bytes < 0)
+	break;
+      /* FALLTHRU */
+    case DImode:
+    case SImode:
+    case HImode:
+    case QImode:
+      if (words <= cum->nregs)
+	{
+	  int regno = cum->regno;
+
+	  /* Fastcall allocates the first two DWORD (SImode) or
+            smaller arguments to ECX and EDX if it isn't an
+            aggregate type .  */
+	  if (cum->fastcall)
+	    {
+	      if (mode == BLKmode
+		  || mode == DImode
+		  || (type && AGGREGATE_TYPE_P (type)))
+	        break;
+
+	      /* ECX not EAX is the first allocated register.  */
+	      if (regno == AX_REG)
+		regno = CX_REG;
+	    }
+	  return gen_rtx_REG (mode, regno);
+	}
+      break;
+
+    case DFmode:
+      if (cum->float_in_sse < 2)
+	break;
+    case SFmode:
+      if (cum->float_in_sse < 1)
+	break;
+      /* FALLTHRU */
+    case TImode:
+      /* In 32bit, we pass TImode in xmm registers.  */
+    case V16QImode:
+    case V8HImode:
+    case V4SImode:
+    case V2DImode:
+    case V4SFmode:
+    case V2DFmode:
+      if (!type || !AGGREGATE_TYPE_P (type))
+	{
+	  if (cum->sse_nregs)
+	    return gen_reg_or_parallel (mode, orig_mode,
+				        cum->sse_regno + FIRST_SSE_REG);
+	}
+      break;
+
+    case OImode:
+    case XImode:
+      /* OImode and XImode shouldn't be used directly.  */
+      gcc_unreachable ();
+
+    case V64QImode:
+    case V32HImode:
+    case V16SImode:
+    case V8DImode:
+    case V16SFmode:
+    case V8DFmode:
+    case V8SFmode:
+    case V8SImode:
+    case V32QImode:
+    case V16HImode:
+    case V4DFmode:
+    case V4DImode:
+      if (!type || !AGGREGATE_TYPE_P (type))
+	{
+	  if (cum->sse_nregs)
+	    return gen_reg_or_parallel (mode, orig_mode,
+				        cum->sse_regno + FIRST_SSE_REG);
+	}
+      break;
+
+    case V8QImode:
+    case V4HImode:
+    case V2SImode:
+    case V2SFmode:
+    case V1TImode:
+    case V1DImode:
+      if (!type || !AGGREGATE_TYPE_P (type))
+	{
+	  if (cum->mmx_nregs)
+	    return gen_reg_or_parallel (mode, orig_mode,
+				        cum->mmx_regno + FIRST_MMX_REG);
+	}
+      break;
+    }
+
+  return NULL_RTX;
+}
+
+static rtx
+function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
+		 enum machine_mode orig_mode, const_tree type, bool named)
+{
+  /* Handle a hidden AL argument containing number of registers
+     for varargs x86-64 functions.  */
+  if (mode == VOIDmode)
+    return GEN_INT (cum->maybe_vaarg
+		    ? (cum->sse_nregs < 0
+		       ? X86_64_SSE_REGPARM_MAX
+		       : cum->sse_regno)
+		    : -1);
+
+  switch (mode)
+    {
+    default:
+      break;
+
+    case V8SFmode:
+    case V8SImode:
+    case V32QImode:
+    case V16HImode:
+    case V4DFmode:
+    case V4DImode:
+    case V16SFmode:
+    case V16SImode:
+    case V64QImode:
+    case V32HImode:
+    case V8DFmode:
+    case V8DImode:
+      /* Unnamed 256 and 512bit vector mode parameters are passed on stack.  */
+      if (!named)
+	return NULL;
+      break;
+    }
+
+  return construct_container (mode, orig_mode, type, 0, cum->nregs,
+			      cum->sse_nregs,
+			      &x86_64_int_parameter_registers [cum->regno],
+			      cum->sse_regno);
+}
+
+static rtx
+function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
+		    enum machine_mode orig_mode, bool named,
+		    HOST_WIDE_INT bytes)
+{
+  unsigned int regno;
+
+  /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
+     We use value of -2 to specify that current function call is MSABI.  */
+  if (mode == VOIDmode)
+    return GEN_INT (-2);
+
+  /* If we've run out of registers, it goes on the stack.  */
+  if (cum->nregs == 0)
+    return NULL_RTX;
+
+  regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
+
+  /* Only floating point modes are passed in anything but integer regs.  */
+  if (TARGET_SSE && (mode == SFmode || mode == DFmode))
+    {
+      if (named)
+	regno = cum->regno + FIRST_SSE_REG;
+      else
+	{
+	  rtx t1, t2;
+
+	  /* Unnamed floating parameters are passed in both the
+	     SSE and integer registers.  */
+	  t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
+	  t2 = gen_rtx_REG (mode, regno);
+	  t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
+	  t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
+	  return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
+	}
+    }
+  /* Handle aggregated types passed in register.  */
+  if (orig_mode == BLKmode)
+    {
+      if (bytes > 0 && bytes <= 8)
+        mode = (bytes > 4 ? DImode : SImode);
+      if (mode == BLKmode)
+        mode = DImode;
+    }
+
+  return gen_reg_or_parallel (mode, orig_mode, regno);
+}
+
+/* Return where to put the arguments to a function.
+   Return zero to push the argument on the stack, or a hard register in which to store the argument.
+
+   MODE is the argument's machine mode.  TYPE is the data type of the
+   argument.  It is null for libcalls where that information may not be
+   available.  CUM gives information about the preceding args and about
+   the function being called.  NAMED is nonzero if this argument is a
+   named parameter (otherwise it is an extra parameter matching an
+   ellipsis).  */
+
+static rtx
+ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
+		   const_tree type, bool named)
+{
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+  enum machine_mode mode = omode;
+  HOST_WIDE_INT bytes, words;
+  rtx arg;
+
+  if (mode == BLKmode)
+    bytes = int_size_in_bytes (type);
+  else
+    bytes = GET_MODE_SIZE (mode);
+  words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
+
+  /* To simplify the code below, represent vector types with a vector mode
+     even if MMX/SSE are not active.  */
+  if (type && TREE_CODE (type) == VECTOR_TYPE)
+    mode = type_natural_mode (type, cum, false);
+
+  if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
+    arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
+  else if (TARGET_64BIT)
+    arg = function_arg_64 (cum, mode, omode, type, named);
+  else
+    arg = function_arg_32 (cum, mode, omode, type, bytes, words);
+
+  return arg;
+}
+
+/* A C expression that indicates when an argument must be passed by
+   reference.  If nonzero for an argument, a copy of that argument is
+   made in memory and a pointer to the argument is passed instead of
+   the argument itself.  The pointer is passed in whatever way is
+   appropriate for passing a pointer to that type.  */
+
+static bool
+ix86_pass_by_reference (cumulative_args_t cum_v, enum machine_mode mode,
+			const_tree type, bool named ATTRIBUTE_UNUSED)
+{
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+
+  /* See Windows x64 Software Convention.  */
+  if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
+    {
+      int msize = (int) GET_MODE_SIZE (mode);
+      if (type)
+	{
+	  /* Arrays are passed by reference.  */
+	  if (TREE_CODE (type) == ARRAY_TYPE)
+	    return true;
+
+	  if (AGGREGATE_TYPE_P (type))
+	    {
+	      /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
+	         are passed by reference.  */
+	      msize = int_size_in_bytes (type);
+	    }
+	}
+
+      /* __m128 is passed by reference.  */
+      switch (msize) {
+      case 1: case 2: case 4: case 8:
+        break;
+      default:
+        return true;
+      }
+    }
+  else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
+    return 1;
+
+  return 0;
+}
+
+/* Return true when TYPE should be 128bit aligned for 32bit argument
+   passing ABI.  XXX: This function is obsolete and is only used for
+   checking psABI compatibility with previous versions of GCC.  */
+
+static bool
+ix86_compat_aligned_value_p (const_tree type)
+{
+  enum machine_mode mode = TYPE_MODE (type);
+  if (((TARGET_SSE && SSE_REG_MODE_P (mode))
+       || mode == TDmode
+       || mode == TFmode
+       || mode == TCmode)
+      && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
+    return true;
+  if (TYPE_ALIGN (type) < 128)
+    return false;
+
+  if (AGGREGATE_TYPE_P (type))
+    {
+      /* Walk the aggregates recursively.  */
+      switch (TREE_CODE (type))
+	{
+	case RECORD_TYPE:
+	case UNION_TYPE:
+	case QUAL_UNION_TYPE:
+	  {
+	    tree field;
+
+	    /* Walk all the structure fields.  */
+	    for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
+	      {
+		if (TREE_CODE (field) == FIELD_DECL
+		    && ix86_compat_aligned_value_p (TREE_TYPE (field)))
+		  return true;
+	      }
+	    break;
+	  }
+
+	case ARRAY_TYPE:
+	  /* Just for use if some languages passes arrays by value.  */
+	  if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
+	    return true;
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  return false;
+}
+
+/* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
+   XXX: This function is obsolete and is only used for checking psABI
+   compatibility with previous versions of GCC.  */
+
+static unsigned int
+ix86_compat_function_arg_boundary (enum machine_mode mode,
+				   const_tree type, unsigned int align)
+{
+  /* In 32bit, only _Decimal128 and __float128 are aligned to their
+     natural boundaries.  */
+  if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
+    {
+      /* i386 ABI defines all arguments to be 4 byte aligned.  We have to
+	 make an exception for SSE modes since these require 128bit
+	 alignment.
+
+	 The handling here differs from field_alignment.  ICC aligns MMX
+	 arguments to 4 byte boundaries, while structure fields are aligned
+	 to 8 byte boundaries.  */
+      if (!type)
+	{
+	  if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
+	    align = PARM_BOUNDARY;
+	}
+      else
+	{
+	  if (!ix86_compat_aligned_value_p (type))
+	    align = PARM_BOUNDARY;
+	}
+    }
+  if (align > BIGGEST_ALIGNMENT)
+    align = BIGGEST_ALIGNMENT;
+  return align;
+}
+
+/* Return true when TYPE should be 128bit aligned for 32bit argument
+   passing ABI.  */
+
+static bool
+ix86_contains_aligned_value_p (const_tree type)
+{
+  enum machine_mode mode = TYPE_MODE (type);
+
+  if (mode == XFmode || mode == XCmode)
+    return false;
+
+  if (TYPE_ALIGN (type) < 128)
+    return false;
+
+  if (AGGREGATE_TYPE_P (type))
+    {
+      /* Walk the aggregates recursively.  */
+      switch (TREE_CODE (type))
+	{
+	case RECORD_TYPE:
+	case UNION_TYPE:
+	case QUAL_UNION_TYPE:
+	  {
+	    tree field;
+
+	    /* Walk all the structure fields.  */
+	    for (field = TYPE_FIELDS (type);
+		 field;
+		 field = DECL_CHAIN (field))
+	      {
+		if (TREE_CODE (field) == FIELD_DECL
+		    && ix86_contains_aligned_value_p (TREE_TYPE (field)))
+		  return true;
+	      }
+	    break;
+	  }
+
+	case ARRAY_TYPE:
+	  /* Just for use if some languages passes arrays by value.  */
+	  if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
+	    return true;
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  else
+    return TYPE_ALIGN (type) >= 128;
+
+  return false;
+}
+
+/* Gives the alignment boundary, in bits, of an argument with the
+   specified mode and type.  */
+
+static unsigned int
+ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
+{
+  unsigned int align;
+  if (type)
+    {
+      /* Since the main variant type is used for call, we convert it to
+	 the main variant type.  */
+      type = TYPE_MAIN_VARIANT (type);
+      align = TYPE_ALIGN (type);
+    }
+  else
+    align = GET_MODE_ALIGNMENT (mode);
+  if (align < PARM_BOUNDARY)
+    align = PARM_BOUNDARY;
+  else
+    {
+      static bool warned;
+      unsigned int saved_align = align;
+
+      if (!TARGET_64BIT)
+	{
+	  /* i386 ABI defines XFmode arguments to be 4 byte aligned.  */
+	  if (!type)
+	    {
+	      if (mode == XFmode || mode == XCmode)
+		align = PARM_BOUNDARY;
+	    }
+	  else if (!ix86_contains_aligned_value_p (type))
+	    align = PARM_BOUNDARY;
+
+	  if (align < 128)
+	    align = PARM_BOUNDARY;
+	}
+
+      if (warn_psabi
+	  && !warned
+	  && align != ix86_compat_function_arg_boundary (mode, type,
+							 saved_align))
+	{
+	  warned = true;
+	  inform (input_location,
+		  "The ABI for passing parameters with %d-byte"
+		  " alignment has changed in GCC 4.6",
+		  align / BITS_PER_UNIT);
+	}
+    }
+
+  return align;
+}
+
+/* Return true if N is a possible register number of function value.  */
+
+static bool
+ix86_function_value_regno_p (const unsigned int regno)
+{
+  switch (regno)
+    {
+    case AX_REG:
+    case DX_REG:
+      return true;
+    case DI_REG:
+    case SI_REG:
+      return TARGET_64BIT && ix86_abi != MS_ABI;
+
+      /* Complex values are returned in %st(0)/%st(1) pair.  */
+    case ST0_REG:
+    case ST1_REG:
+      /* TODO: The function should depend on current function ABI but
+       builtins.c would need updating then. Therefore we use the
+       default ABI.  */
+      if (TARGET_64BIT && ix86_abi == MS_ABI)
+	return false;
+      return TARGET_FLOAT_RETURNS_IN_80387;
+
+      /* Complex values are returned in %xmm0/%xmm1 pair.  */
+    case XMM0_REG:
+    case XMM1_REG:
+      return TARGET_SSE;
+
+    case MM0_REG:
+      if (TARGET_MACHO || TARGET_64BIT)
+	return false;
+      return TARGET_MMX;
+    }
+
+  return false;
+}
+
+/* Define how to find the value returned by a function.
+   VALTYPE is the data type of the value (as a tree).
+   If the precise function being called is known, FUNC is its FUNCTION_DECL;
+   otherwise, FUNC is 0.  */
+
+static rtx
+function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
+		   const_tree fntype, const_tree fn)
+{
+  unsigned int regno;
+
+  /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
+     we normally prevent this case when mmx is not available.  However
+     some ABIs may require the result to be returned like DImode.  */
+  if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
+    regno = FIRST_MMX_REG;
+
+  /* 16-byte vector modes in %xmm0.  See ix86_return_in_memory for where
+     we prevent this case when sse is not available.  However some ABIs
+     may require the result to be returned like integer TImode.  */
+  else if (mode == TImode
+	   || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
+    regno = FIRST_SSE_REG;
+
+  /* 32-byte vector modes in %ymm0.   */
+  else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
+    regno = FIRST_SSE_REG;
+
+  /* 64-byte vector modes in %zmm0.   */
+  else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
+    regno = FIRST_SSE_REG;
+
+  /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387).  */
+  else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
+    regno = FIRST_FLOAT_REG;
+  else
+    /* Most things go in %eax.  */
+    regno = AX_REG;
+
+  /* Override FP return register with %xmm0 for local functions when
+     SSE math is enabled or for functions with sseregparm attribute.  */
+  if ((fn || fntype) && (mode == SFmode || mode == DFmode))
+    {
+      int sse_level = ix86_function_sseregparm (fntype, fn, false);
+      if ((sse_level >= 1 && mode == SFmode)
+	  || (sse_level == 2 && mode == DFmode))
+	regno = FIRST_SSE_REG;
+    }
+
+  /* OImode shouldn't be used directly.  */
+  gcc_assert (mode != OImode);
+
+  return gen_rtx_REG (orig_mode, regno);
+}
+
+static rtx
+function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
+		   const_tree valtype)
+{
+  rtx ret;
+
+  /* Handle libcalls, which don't provide a type node.  */
+  if (valtype == NULL)
+    {
+      unsigned int regno;
+
+      switch (mode)
+	{
+	case SFmode:
+	case SCmode:
+	case DFmode:
+	case DCmode:
+	case TFmode:
+	case SDmode:
+	case DDmode:
+	case TDmode:
+	  regno = FIRST_SSE_REG;
+	  break;
+	case XFmode:
+	case XCmode:
+	  regno = FIRST_FLOAT_REG;
+	  break;
+	case TCmode:
+	  return NULL;
+	default:
+	  regno = AX_REG;
+	}
+
+      return gen_rtx_REG (mode, regno);
+    }
+  else if (POINTER_TYPE_P (valtype))
+    {
+      /* Pointers are always returned in word_mode.  */
+      mode = word_mode;
+    }
+
+  ret = construct_container (mode, orig_mode, valtype, 1,
+			     X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
+			     x86_64_int_return_registers, 0);
+
+  /* For zero sized structures, construct_container returns NULL, but we
+     need to keep rest of compiler happy by returning meaningful value.  */
+  if (!ret)
+    ret = gen_rtx_REG (orig_mode, AX_REG);
+
+  return ret;
+}
+
+static rtx
+function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
+		      const_tree valtype)
+{
+  unsigned int regno = AX_REG;
+
+  if (TARGET_SSE)
+    {
+      switch (GET_MODE_SIZE (mode))
+	{
+	case 16:
+	  if (valtype != NULL_TREE
+	      && !VECTOR_INTEGER_TYPE_P (valtype)
+	      && !VECTOR_INTEGER_TYPE_P (valtype)
+	      && !INTEGRAL_TYPE_P (valtype)
+	      && !VECTOR_FLOAT_TYPE_P (valtype))
+	    break;
+	  if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
+	      && !COMPLEX_MODE_P (mode))
+	    regno = FIRST_SSE_REG;
+	  break;
+	case 8:
+	case 4:
+	  if (mode == SFmode || mode == DFmode)
+	    regno = FIRST_SSE_REG;
+	  break;
+	default:
+	  break;
+        }
+    }
+  return gen_rtx_REG (orig_mode, regno);
+}
+
+static rtx
+ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
+		       enum machine_mode orig_mode, enum machine_mode mode)
+{
+  const_tree fn, fntype;
+
+  fn = NULL_TREE;
+  if (fntype_or_decl && DECL_P (fntype_or_decl))
+    fn = fntype_or_decl;
+  fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
+
+  if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
+    return function_value_ms_64 (orig_mode, mode, valtype);
+  else if (TARGET_64BIT)
+    return function_value_64 (orig_mode, mode, valtype);
+  else
+    return function_value_32 (orig_mode, mode, fntype, fn);
+}
+
+static rtx
+ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
+		     bool outgoing ATTRIBUTE_UNUSED)
+{
+  enum machine_mode mode, orig_mode;
+
+  orig_mode = TYPE_MODE (valtype);
+  mode = type_natural_mode (valtype, NULL, true);
+  return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
+}
+
+/* Pointer function arguments and return values are promoted to
+   word_mode.  */
+
+static enum machine_mode
+ix86_promote_function_mode (const_tree type, enum machine_mode mode,
+			    int *punsignedp, const_tree fntype,
+			    int for_return)
+{
+  if (type != NULL_TREE && POINTER_TYPE_P (type))
+    {
+      *punsignedp = POINTERS_EXTEND_UNSIGNED;
+      return word_mode;
+    }
+  return default_promote_function_mode (type, mode, punsignedp, fntype,
+					for_return);
+}
+
+/* Return true if a structure, union or array with MODE containing FIELD
+   should be accessed using BLKmode.  */
+
+static bool
+ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
+{
+  /* Union with XFmode must be in BLKmode.  */
+  return (mode == XFmode
+	  && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
+	      || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
+}
+
+rtx
+ix86_libcall_value (enum machine_mode mode)
+{
+  return ix86_function_value_1 (NULL, NULL, mode, mode);
+}
+
+/* Return true iff type is returned in memory.  */
+
+static bool ATTRIBUTE_UNUSED
+return_in_memory_32 (const_tree type, enum machine_mode mode)
+{
+  HOST_WIDE_INT size;
+
+  if (mode == BLKmode)
+    return true;
+
+  size = int_size_in_bytes (type);
+
+  if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
+    return false;
+
+  if (VECTOR_MODE_P (mode) || mode == TImode)
+    {
+      /* User-created vectors small enough to fit in EAX.  */
+      if (size < 8)
+	return false;
+
+      /* MMX/3dNow values are returned in MM0,
+	 except when it doesn't exits or the ABI prescribes otherwise.  */
+      if (size == 8)
+	return !TARGET_MMX || TARGET_VECT8_RETURNS;
+
+      /* SSE values are returned in XMM0, except when it doesn't exist.  */
+      if (size == 16)
+	return !TARGET_SSE;
+
+      /* AVX values are returned in YMM0, except when it doesn't exist.  */
+      if (size == 32)
+	return !TARGET_AVX;
+
+      /* AVX512F values are returned in ZMM0, except when it doesn't exist.  */
+      if (size == 64)
+	return !TARGET_AVX512F;
+    }
+
+  if (mode == XFmode)
+    return false;
+
+  if (size > 12)
+    return true;
+
+  /* OImode shouldn't be used directly.  */
+  gcc_assert (mode != OImode);
+
+  return false;
+}
+
+static bool ATTRIBUTE_UNUSED
+return_in_memory_64 (const_tree type, enum machine_mode mode)
+{
+  int needed_intregs, needed_sseregs;
+  return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
+}
+
+static bool ATTRIBUTE_UNUSED
+return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
+{
+  HOST_WIDE_INT size = int_size_in_bytes (type);
+
+  /* __m128 is returned in xmm0.  */
+  if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
+       || VECTOR_FLOAT_TYPE_P (type))
+      && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
+      && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
+    return false;
+
+  /* Otherwise, the size must be exactly in [1248]. */
+  return size != 1 && size != 2 && size != 4 && size != 8;
+}
+
+static bool
+ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
+{
+#ifdef SUBTARGET_RETURN_IN_MEMORY
+  return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
+#else
+  const enum machine_mode mode = type_natural_mode (type, NULL, true);
+
+  if (TARGET_64BIT)
+    {
+      if (ix86_function_type_abi (fntype) == MS_ABI)
+	return return_in_memory_ms_64 (type, mode);
+      else
+	return return_in_memory_64 (type, mode);
+    }
+  else
+    return return_in_memory_32 (type, mode);
+#endif
+}
+
+
+/* Create the va_list data type.  */
+
+/* Returns the calling convention specific va_list date type.
+   The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI.  */
+
+static tree
+ix86_build_builtin_va_list_abi (enum calling_abi abi)
+{
+  tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
+
+  /* For i386 we use plain pointer to argument area.  */
+  if (!TARGET_64BIT || abi == MS_ABI)
+    return build_pointer_type (char_type_node);
+
+  record = lang_hooks.types.make_type (RECORD_TYPE);
+  type_decl = build_decl (BUILTINS_LOCATION,
+			  TYPE_DECL, get_identifier ("__va_list_tag"), record);
+
+  f_gpr = build_decl (BUILTINS_LOCATION,
+		      FIELD_DECL, get_identifier ("gp_offset"),
+		      unsigned_type_node);
+  f_fpr = build_decl (BUILTINS_LOCATION,
+		      FIELD_DECL, get_identifier ("fp_offset"),
+		      unsigned_type_node);
+  f_ovf = build_decl (BUILTINS_LOCATION,
+		      FIELD_DECL, get_identifier ("overflow_arg_area"),
+		      ptr_type_node);
+  f_sav = build_decl (BUILTINS_LOCATION,
+		      FIELD_DECL, get_identifier ("reg_save_area"),
+		      ptr_type_node);
+
+  va_list_gpr_counter_field = f_gpr;
+  va_list_fpr_counter_field = f_fpr;
+
+  DECL_FIELD_CONTEXT (f_gpr) = record;
+  DECL_FIELD_CONTEXT (f_fpr) = record;
+  DECL_FIELD_CONTEXT (f_ovf) = record;
+  DECL_FIELD_CONTEXT (f_sav) = record;
+
+  TYPE_STUB_DECL (record) = type_decl;
+  TYPE_NAME (record) = type_decl;
+  TYPE_FIELDS (record) = f_gpr;
+  DECL_CHAIN (f_gpr) = f_fpr;
+  DECL_CHAIN (f_fpr) = f_ovf;
+  DECL_CHAIN (f_ovf) = f_sav;
+
+  layout_type (record);
+
+  /* The correct type is an array type of one element.  */
+  return build_array_type (record, build_index_type (size_zero_node));
+}
+
+/* Setup the builtin va_list data type and for 64-bit the additional
+   calling convention specific va_list data types.  */
+
+static tree
+ix86_build_builtin_va_list (void)
+{
+  tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
+
+  /* Initialize abi specific va_list builtin types.  */
+  if (TARGET_64BIT)
+    {
+      tree t;
+      if (ix86_abi == MS_ABI)
+        {
+          t = ix86_build_builtin_va_list_abi (SYSV_ABI);
+          if (TREE_CODE (t) != RECORD_TYPE)
+            t = build_variant_type_copy (t);
+          sysv_va_list_type_node = t;
+        }
+      else
+        {
+          t = ret;
+          if (TREE_CODE (t) != RECORD_TYPE)
+            t = build_variant_type_copy (t);
+          sysv_va_list_type_node = t;
+        }
+      if (ix86_abi != MS_ABI)
+        {
+          t = ix86_build_builtin_va_list_abi (MS_ABI);
+          if (TREE_CODE (t) != RECORD_TYPE)
+            t = build_variant_type_copy (t);
+          ms_va_list_type_node = t;
+        }
+      else
+        {
+          t = ret;
+          if (TREE_CODE (t) != RECORD_TYPE)
+            t = build_variant_type_copy (t);
+          ms_va_list_type_node = t;
+        }
+    }
+
+  return ret;
+}
+
+/* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
+
+static void
+setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
+{
+  rtx save_area, mem;
+  alias_set_type set;
+  int i, max;
+
+  /* GPR size of varargs save area.  */
+  if (cfun->va_list_gpr_size)
+    ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
+  else
+    ix86_varargs_gpr_size = 0;
+
+  /* FPR size of varargs save area.  We don't need it if we don't pass
+     anything in SSE registers.  */
+  if (TARGET_SSE && cfun->va_list_fpr_size)
+    ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
+  else
+    ix86_varargs_fpr_size = 0;
+
+  if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
+    return;
+
+  save_area = frame_pointer_rtx;
+  set = get_varargs_alias_set ();
+
+  max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
+  if (max > X86_64_REGPARM_MAX)
+    max = X86_64_REGPARM_MAX;
+
+  for (i = cum->regno; i < max; i++)
+    {
+      mem = gen_rtx_MEM (word_mode,
+			 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
+      MEM_NOTRAP_P (mem) = 1;
+      set_mem_alias_set (mem, set);
+      emit_move_insn (mem,
+		      gen_rtx_REG (word_mode,
+				   x86_64_int_parameter_registers[i]));
+    }
+
+  if (ix86_varargs_fpr_size)
+    {
+      enum machine_mode smode;
+      rtx label, test;
+
+      /* Now emit code to save SSE registers.  The AX parameter contains number
+	 of SSE parameter registers used to call this function, though all we
+	 actually check here is the zero/non-zero status.  */
+
+      label = gen_label_rtx ();
+      test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
+      emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
+				      label));
+
+      /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
+	 we used movdqa (i.e. TImode) instead?  Perhaps even better would
+	 be if we could determine the real mode of the data, via a hook
+	 into pass_stdarg.  Ignore all that for now.  */
+      smode = V4SFmode;
+      if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
+	crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
+
+      max = cum->sse_regno + cfun->va_list_fpr_size / 16;
+      if (max > X86_64_SSE_REGPARM_MAX)
+	max = X86_64_SSE_REGPARM_MAX;
+
+      for (i = cum->sse_regno; i < max; ++i)
+	{
+	  mem = plus_constant (Pmode, save_area,
+			       i * 16 + ix86_varargs_gpr_size);
+	  mem = gen_rtx_MEM (smode, mem);
+	  MEM_NOTRAP_P (mem) = 1;
+	  set_mem_alias_set (mem, set);
+	  set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
+
+	  emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
+	}
+
+      emit_label (label);
+    }
+}
+
+static void
+setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
+{
+  alias_set_type set = get_varargs_alias_set ();
+  int i;
+
+  /* Reset to zero, as there might be a sysv vaarg used
+     before.  */
+  ix86_varargs_gpr_size = 0;
+  ix86_varargs_fpr_size = 0;
+
+  for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
+    {
+      rtx reg, mem;
+
+      mem = gen_rtx_MEM (Pmode,
+			 plus_constant (Pmode, virtual_incoming_args_rtx,
+					i * UNITS_PER_WORD));
+      MEM_NOTRAP_P (mem) = 1;
+      set_mem_alias_set (mem, set);
+
+      reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
+      emit_move_insn (mem, reg);
+    }
+}
+
+static void
+ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
+			     tree type, int *pretend_size ATTRIBUTE_UNUSED,
+			     int no_rtl)
+{
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+  CUMULATIVE_ARGS next_cum;
+  tree fntype;
+
+  /* This argument doesn't appear to be used anymore.  Which is good,
+     because the old code here didn't suppress rtl generation.  */
+  gcc_assert (!no_rtl);
+
+  if (!TARGET_64BIT)
+    return;
+
+  fntype = TREE_TYPE (current_function_decl);
+
+  /* For varargs, we do not want to skip the dummy va_dcl argument.
+     For stdargs, we do want to skip the last named argument.  */
+  next_cum = *cum;
+  if (stdarg_p (fntype))
+    ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
+			       true);
+
+  if (cum->call_abi == MS_ABI)
+    setup_incoming_varargs_ms_64 (&next_cum);
+  else
+    setup_incoming_varargs_64 (&next_cum);
+}
+
+/* Checks if TYPE is of kind va_list char *.  */
+
+static bool
+is_va_list_char_pointer (tree type)
+{
+  tree canonic;
+
+  /* For 32-bit it is always true.  */
+  if (!TARGET_64BIT)
+    return true;
+  canonic = ix86_canonical_va_list_type (type);
+  return (canonic == ms_va_list_type_node
+          || (ix86_abi == MS_ABI && canonic == va_list_type_node));
+}
+
+/* Implement va_start.  */
+
+static void
+ix86_va_start (tree valist, rtx nextarg)
+{
+  HOST_WIDE_INT words, n_gpr, n_fpr;
+  tree f_gpr, f_fpr, f_ovf, f_sav;
+  tree gpr, fpr, ovf, sav, t;
+  tree type;
+  rtx ovf_rtx;
+
+  if (flag_split_stack
+      && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+    {
+      unsigned int scratch_regno;
+
+      /* When we are splitting the stack, we can't refer to the stack
+	 arguments using internal_arg_pointer, because they may be on
+	 the old stack.  The split stack prologue will arrange to
+	 leave a pointer to the old stack arguments in a scratch
+	 register, which we here copy to a pseudo-register.  The split
+	 stack prologue can't set the pseudo-register directly because
+	 it (the prologue) runs before any registers have been saved.  */
+
+      scratch_regno = split_stack_prologue_scratch_regno ();
+      if (scratch_regno != INVALID_REGNUM)
+	{
+	  rtx reg, seq;
+
+	  reg = gen_reg_rtx (Pmode);
+	  cfun->machine->split_stack_varargs_pointer = reg;
+
+	  start_sequence ();
+	  emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
+	  seq = get_insns ();
+	  end_sequence ();
+
+	  push_topmost_sequence ();
+	  emit_insn_after (seq, entry_of_function ());
+	  pop_topmost_sequence ();
+	}
+    }
+
+  /* Only 64bit target needs something special.  */
+  if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
+    {
+      if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+	std_expand_builtin_va_start (valist, nextarg);
+      else
+	{
+	  rtx va_r, next;
+
+	  va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
+	  next = expand_binop (ptr_mode, add_optab,
+			       cfun->machine->split_stack_varargs_pointer,
+			       crtl->args.arg_offset_rtx,
+			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
+	  convert_move (va_r, next, 0);
+	}
+      return;
+    }
+
+  f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
+  f_fpr = DECL_CHAIN (f_gpr);
+  f_ovf = DECL_CHAIN (f_fpr);
+  f_sav = DECL_CHAIN (f_ovf);
+
+  valist = build_simple_mem_ref (valist);
+  TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
+  /* The following should be folded into the MEM_REF offset.  */
+  gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
+		f_gpr, NULL_TREE);
+  fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
+		f_fpr, NULL_TREE);
+  ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
+		f_ovf, NULL_TREE);
+  sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
+		f_sav, NULL_TREE);
+
+  /* Count number of gp and fp argument registers used.  */
+  words = crtl->args.info.words;
+  n_gpr = crtl->args.info.regno;
+  n_fpr = crtl->args.info.sse_regno;
+
+  if (cfun->va_list_gpr_size)
+    {
+      type = TREE_TYPE (gpr);
+      t = build2 (MODIFY_EXPR, type,
+		  gpr, build_int_cst (type, n_gpr * 8));
+      TREE_SIDE_EFFECTS (t) = 1;
+      expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
+    }
+
+  if (TARGET_SSE && cfun->va_list_fpr_size)
+    {
+      type = TREE_TYPE (fpr);
+      t = build2 (MODIFY_EXPR, type, fpr,
+		  build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
+      TREE_SIDE_EFFECTS (t) = 1;
+      expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
+    }
+
+  /* Find the overflow area.  */
+  type = TREE_TYPE (ovf);
+  if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+    ovf_rtx = crtl->args.internal_arg_pointer;
+  else
+    ovf_rtx = cfun->machine->split_stack_varargs_pointer;
+  t = make_tree (type, ovf_rtx);
+  if (words != 0)
+    t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
+  t = build2 (MODIFY_EXPR, type, ovf, t);
+  TREE_SIDE_EFFECTS (t) = 1;
+  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
+
+  if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
+    {
+      /* Find the register save area.
+	 Prologue of the function save it right above stack frame.  */
+      type = TREE_TYPE (sav);
+      t = make_tree (type, frame_pointer_rtx);
+      if (!ix86_varargs_gpr_size)
+	t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
+      t = build2 (MODIFY_EXPR, type, sav, t);
+      TREE_SIDE_EFFECTS (t) = 1;
+      expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
+    }
+}
+
+/* Implement va_arg.  */
+
+static tree
+ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
+		      gimple_seq *post_p)
+{
+  static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
+  tree f_gpr, f_fpr, f_ovf, f_sav;
+  tree gpr, fpr, ovf, sav, t;
+  int size, rsize;
+  tree lab_false, lab_over = NULL_TREE;
+  tree addr, t2;
+  rtx container;
+  int indirect_p = 0;
+  tree ptrtype;
+  enum machine_mode nat_mode;
+  unsigned int arg_boundary;
+
+  /* Only 64bit target needs something special.  */
+  if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
+    return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
+
+  f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
+  f_fpr = DECL_CHAIN (f_gpr);
+  f_ovf = DECL_CHAIN (f_fpr);
+  f_sav = DECL_CHAIN (f_ovf);
+
+  gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
+		build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
+  valist = build_va_arg_indirect_ref (valist);
+  fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
+  ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
+  sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
+
+  indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
+  if (indirect_p)
+    type = build_pointer_type (type);
+  size = int_size_in_bytes (type);
+  rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
+
+  nat_mode = type_natural_mode (type, NULL, false);
+  switch (nat_mode)
+    {
+    case V8SFmode:
+    case V8SImode:
+    case V32QImode:
+    case V16HImode:
+    case V4DFmode:
+    case V4DImode:
+    case V16SFmode:
+    case V16SImode:
+    case V64QImode:
+    case V32HImode:
+    case V8DFmode:
+    case V8DImode:
+      /* Unnamed 256 and 512bit vector mode parameters are passed on stack.  */
+      if (!TARGET_64BIT_MS_ABI)
+	{
+	  container = NULL;
+	  break;
+	}
+
+    default:
+      container = construct_container (nat_mode, TYPE_MODE (type),
+				       type, 0, X86_64_REGPARM_MAX,
+				       X86_64_SSE_REGPARM_MAX, intreg,
+				       0);
+      break;
+    }
+
+  /* Pull the value out of the saved registers.  */
+
+  addr = create_tmp_var (ptr_type_node, "addr");
+
+  if (container)
+    {
+      int needed_intregs, needed_sseregs;
+      bool need_temp;
+      tree int_addr, sse_addr;
+
+      lab_false = create_artificial_label (UNKNOWN_LOCATION);
+      lab_over = create_artificial_label (UNKNOWN_LOCATION);
+
+      examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
+
+      need_temp = (!REG_P (container)
+		   && ((needed_intregs && TYPE_ALIGN (type) > 64)
+		       || TYPE_ALIGN (type) > 128));
+
+      /* In case we are passing structure, verify that it is consecutive block
+         on the register save area.  If not we need to do moves.  */
+      if (!need_temp && !REG_P (container))
+	{
+	  /* Verify that all registers are strictly consecutive  */
+	  if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
+	    {
+	      int i;
+
+	      for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
+		{
+		  rtx slot = XVECEXP (container, 0, i);
+		  if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
+		      || INTVAL (XEXP (slot, 1)) != i * 16)
+		    need_temp = 1;
+		}
+	    }
+	  else
+	    {
+	      int i;
+
+	      for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
+		{
+		  rtx slot = XVECEXP (container, 0, i);
+		  if (REGNO (XEXP (slot, 0)) != (unsigned int) i
+		      || INTVAL (XEXP (slot, 1)) != i * 8)
+		    need_temp = 1;
+		}
+	    }
+	}
+      if (!need_temp)
+	{
+	  int_addr = addr;
+	  sse_addr = addr;
+	}
+      else
+	{
+	  int_addr = create_tmp_var (ptr_type_node, "int_addr");
+	  sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
+	}
+
+      /* First ensure that we fit completely in registers.  */
+      if (needed_intregs)
+	{
+	  t = build_int_cst (TREE_TYPE (gpr),
+			     (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
+	  t = build2 (GE_EXPR, boolean_type_node, gpr, t);
+	  t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
+	  t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
+	  gimplify_and_add (t, pre_p);
+	}
+      if (needed_sseregs)
+	{
+	  t = build_int_cst (TREE_TYPE (fpr),
+			     (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
+			     + X86_64_REGPARM_MAX * 8);
+	  t = build2 (GE_EXPR, boolean_type_node, fpr, t);
+	  t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
+	  t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
+	  gimplify_and_add (t, pre_p);
+	}
+
+      /* Compute index to start of area used for integer regs.  */
+      if (needed_intregs)
+	{
+	  /* int_addr = gpr + sav; */
+	  t = fold_build_pointer_plus (sav, gpr);
+	  gimplify_assign (int_addr, t, pre_p);
+	}
+      if (needed_sseregs)
+	{
+	  /* sse_addr = fpr + sav; */
+	  t = fold_build_pointer_plus (sav, fpr);
+	  gimplify_assign (sse_addr, t, pre_p);
+	}
+      if (need_temp)
+	{
+	  int i, prev_size = 0;
+	  tree temp = create_tmp_var (type, "va_arg_tmp");
+
+	  /* addr = &temp; */
+	  t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
+	  gimplify_assign (addr, t, pre_p);
+
+	  for (i = 0; i < XVECLEN (container, 0); i++)
+	    {
+	      rtx slot = XVECEXP (container, 0, i);
+	      rtx reg = XEXP (slot, 0);
+	      enum machine_mode mode = GET_MODE (reg);
+	      tree piece_type;
+	      tree addr_type;
+	      tree daddr_type;
+	      tree src_addr, src;
+	      int src_offset;
+	      tree dest_addr, dest;
+	      int cur_size = GET_MODE_SIZE (mode);
+
+	      gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
+	      prev_size = INTVAL (XEXP (slot, 1));
+	      if (prev_size + cur_size > size)
+		{
+		  cur_size = size - prev_size;
+		  mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
+		  if (mode == BLKmode)
+		    mode = QImode;
+		}
+	      piece_type = lang_hooks.types.type_for_mode (mode, 1);
+	      if (mode == GET_MODE (reg))
+		addr_type = build_pointer_type (piece_type);
+	      else
+		addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
+							 true);
+	      daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
+							true);
+
+	      if (SSE_REGNO_P (REGNO (reg)))
+		{
+		  src_addr = sse_addr;
+		  src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
+		}
+	      else
+		{
+		  src_addr = int_addr;
+		  src_offset = REGNO (reg) * 8;
+		}
+	      src_addr = fold_convert (addr_type, src_addr);
+	      src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
+
+	      dest_addr = fold_convert (daddr_type, addr);
+	      dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
+	      if (cur_size == GET_MODE_SIZE (mode))
+		{
+		  src = build_va_arg_indirect_ref (src_addr);
+		  dest = build_va_arg_indirect_ref (dest_addr);
+
+		  gimplify_assign (dest, src, pre_p);
+		}
+	      else
+		{
+		  tree copy
+		    = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
+				       3, dest_addr, src_addr,
+				       size_int (cur_size));
+		  gimplify_and_add (copy, pre_p);
+		}
+	      prev_size += cur_size;
+	    }
+	}
+
+      if (needed_intregs)
+	{
+	  t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
+		      build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
+	  gimplify_assign (gpr, t, pre_p);
+	}
+
+      if (needed_sseregs)
+	{
+	  t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
+		      build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
+	  gimplify_assign (fpr, t, pre_p);
+	}
+
+      gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
+
+      gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
+    }
+
+  /* ... otherwise out of the overflow area.  */
+
+  /* When we align parameter on stack for caller, if the parameter
+     alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
+     aligned at MAX_SUPPORTED_STACK_ALIGNMENT.  We will match callee
+     here with caller.  */
+  arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
+  if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
+    arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
+
+  /* Care for on-stack alignment if needed.  */
+  if (arg_boundary <= 64 || size == 0)
+    t = ovf;
+ else
+    {
+      HOST_WIDE_INT align = arg_boundary / 8;
+      t = fold_build_pointer_plus_hwi (ovf, align - 1);
+      t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
+		  build_int_cst (TREE_TYPE (t), -align));
+    }
+
+  gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
+  gimplify_assign (addr, t, pre_p);
+
+  t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
+  gimplify_assign (unshare_expr (ovf), t, pre_p);
+
+  if (container)
+    gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
+
+  ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
+  addr = fold_convert (ptrtype, addr);
+
+  if (indirect_p)
+    addr = build_va_arg_indirect_ref (addr);
+  return build_va_arg_indirect_ref (addr);
+}
+
+/* Return true if OPNUM's MEM should be matched
+   in movabs* patterns.  */
+
+bool
+ix86_check_movabs (rtx insn, int opnum)
+{
+  rtx set, mem;
+
+  set = PATTERN (insn);
+  if (GET_CODE (set) == PARALLEL)
+    set = XVECEXP (set, 0, 0);
+  gcc_assert (GET_CODE (set) == SET);
+  mem = XEXP (set, opnum);
+  while (GET_CODE (mem) == SUBREG)
+    mem = SUBREG_REG (mem);
+  gcc_assert (MEM_P (mem));
+  return volatile_ok || !MEM_VOLATILE_P (mem);
+}
+
+/* Initialize the table of extra 80387 mathematical constants.  */
+
+static void
+init_ext_80387_constants (void)
+{
+  static const char * cst[5] =
+  {
+    "0.3010299956639811952256464283594894482",  /* 0: fldlg2  */
+    "0.6931471805599453094286904741849753009",  /* 1: fldln2  */
+    "1.4426950408889634073876517827983434472",  /* 2: fldl2e  */
+    "3.3219280948873623478083405569094566090",  /* 3: fldl2t  */
+    "3.1415926535897932385128089594061862044",  /* 4: fldpi   */
+  };
+  int i;
+
+  for (i = 0; i < 5; i++)
+    {
+      real_from_string (&ext_80387_constants_table[i], cst[i]);
+      /* Ensure each constant is rounded to XFmode precision.  */
+      real_convert (&ext_80387_constants_table[i],
+		    XFmode, &ext_80387_constants_table[i]);
+    }
+
+  ext_80387_constants_init = 1;
+}
+
+/* Return non-zero if the constant is something that
+   can be loaded with a special instruction.  */
+
+int
+standard_80387_constant_p (rtx x)
+{
+  enum machine_mode mode = GET_MODE (x);
+
+  REAL_VALUE_TYPE r;
+
+  if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
+    return -1;
+
+  if (x == CONST0_RTX (mode))
+    return 1;
+  if (x == CONST1_RTX (mode))
+    return 2;
+
+  REAL_VALUE_FROM_CONST_DOUBLE (r, x);
+
+  /* For XFmode constants, try to find a special 80387 instruction when
+     optimizing for size or on those CPUs that benefit from them.  */
+  if (mode == XFmode
+      && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
+    {
+      int i;
+
+      if (! ext_80387_constants_init)
+	init_ext_80387_constants ();
+
+      for (i = 0; i < 5; i++)
+        if (real_identical (&r, &ext_80387_constants_table[i]))
+	  return i + 3;
+    }
+
+  /* Load of the constant -0.0 or -1.0 will be split as
+     fldz;fchs or fld1;fchs sequence.  */
+  if (real_isnegzero (&r))
+    return 8;
+  if (real_identical (&r, &dconstm1))
+    return 9;
+
+  return 0;
+}
+
+/* Return the opcode of the special instruction to be used to load
+   the constant X.  */
+
+const char *
+standard_80387_constant_opcode (rtx x)
+{
+  switch (standard_80387_constant_p (x))
+    {
+    case 1:
+      return "fldz";
+    case 2:
+      return "fld1";
+    case 3:
+      return "fldlg2";
+    case 4:
+      return "fldln2";
+    case 5:
+      return "fldl2e";
+    case 6:
+      return "fldl2t";
+    case 7:
+      return "fldpi";
+    case 8:
+    case 9:
+      return "#";
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Return the CONST_DOUBLE representing the 80387 constant that is
+   loaded by the specified special instruction.  The argument IDX
+   matches the return value from standard_80387_constant_p.  */
+
+rtx
+standard_80387_constant_rtx (int idx)
+{
+  int i;
+
+  if (! ext_80387_constants_init)
+    init_ext_80387_constants ();
+
+  switch (idx)
+    {
+    case 3:
+    case 4:
+    case 5:
+    case 6:
+    case 7:
+      i = idx - 3;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
+				       XFmode);
+}
+
+/* Return 1 if X is all 0s and 2 if x is all 1s
+   in supported SSE/AVX vector mode.  */
+
+int
+standard_sse_constant_p (rtx x)
+{
+  enum machine_mode mode = GET_MODE (x);
+
+  if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
+    return 1;
+  if (vector_all_ones_operand (x, mode))
+    switch (mode)
+      {
+      case V16QImode:
+      case V8HImode:
+      case V4SImode:
+      case V2DImode:
+	if (TARGET_SSE2)
+	  return 2;
+      case V32QImode:
+      case V16HImode:
+      case V8SImode:
+      case V4DImode:
+	if (TARGET_AVX2)
+	  return 2;
+      case V64QImode:
+      case V32HImode:
+      case V16SImode:
+      case V8DImode:
+	if (TARGET_AVX512F)
+	  return 2;
+      default:
+	break;
+      }
+
+  return 0;
+}
+
+/* Return the opcode of the special instruction to be used to load
+   the constant X.  */
+
+const char *
+standard_sse_constant_opcode (rtx insn, rtx x)
+{
+  switch (standard_sse_constant_p (x))
+    {
+    case 1:
+      switch (get_attr_mode (insn))
+	{
+	case MODE_XI:
+	case MODE_V16SF:
+	  return "vpxord\t%g0, %g0, %g0";
+	case MODE_V8DF:
+	  return "vpxorq\t%g0, %g0, %g0";
+	case MODE_TI:
+	  return "%vpxor\t%0, %d0";
+	case MODE_V2DF:
+	  return "%vxorpd\t%0, %d0";
+	case MODE_V4SF:
+	  return "%vxorps\t%0, %d0";
+
+	case MODE_OI:
+	  return "vpxor\t%x0, %x0, %x0";
+	case MODE_V4DF:
+	  return "vxorpd\t%x0, %x0, %x0";
+	case MODE_V8SF:
+	  return "vxorps\t%x0, %x0, %x0";
+
+	default:
+	  break;
+	}
+
+    case 2:
+      if (get_attr_mode (insn) == MODE_XI
+	  || get_attr_mode (insn) == MODE_V8DF
+	  || get_attr_mode (insn) == MODE_V16SF)
+	return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
+      if (TARGET_AVX)
+	return "vpcmpeqd\t%0, %0, %0";
+      else
+	return "pcmpeqd\t%0, %0";
+
+    default:
+      break;
+    }
+  gcc_unreachable ();
+}
+
+/* Returns true if OP contains a symbol reference */
+
+bool
+symbolic_reference_mentioned_p (rtx op)
+{
+  const char *fmt;
+  int i;
+
+  if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
+    return true;
+
+  fmt = GET_RTX_FORMAT (GET_CODE (op));
+  for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
+    {
+      if (fmt[i] == 'E')
+	{
+	  int j;
+
+	  for (j = XVECLEN (op, i) - 1; j >= 0; j--)
+	    if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
+	      return true;
+	}
+
+      else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
+	return true;
+    }
+
+  return false;
+}
+
+/* Return true if it is appropriate to emit `ret' instructions in the
+   body of a function.  Do this only if the epilogue is simple, needing a
+   couple of insns.  Prior to reloading, we can't tell how many registers
+   must be saved, so return false then.  Return false if there is no frame
+   marker to de-allocate.  */
+
+bool
+ix86_can_use_return_insn_p (void)
+{
+  struct ix86_frame frame;
+
+  if (! reload_completed || frame_pointer_needed)
+    return 0;
+
+  /* Don't allow more than 32k pop, since that's all we can do
+     with one instruction.  */
+  if (crtl->args.pops_args && crtl->args.size >= 32768)
+    return 0;
+
+  ix86_compute_frame_layout (&frame);
+  return (frame.stack_pointer_offset == UNITS_PER_WORD
+	  && (frame.nregs + frame.nsseregs) == 0);
+}
+
+/* Value should be nonzero if functions must have frame pointers.
+   Zero means the frame pointer need not be set up (and parms may
+   be accessed via the stack pointer) in functions that seem suitable.  */
+
+static bool
+ix86_frame_pointer_required (void)
+{
+  /* If we accessed previous frames, then the generated code expects
+     to be able to access the saved ebp value in our frame.  */
+  if (cfun->machine->accesses_prev_frame)
+    return true;
+
+  /* Several x86 os'es need a frame pointer for other reasons,
+     usually pertaining to setjmp.  */
+  if (SUBTARGET_FRAME_POINTER_REQUIRED)
+    return true;
+
+  /* For older 32-bit runtimes setjmp requires valid frame-pointer.  */
+  if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
+    return true;
+
+  /* Win64 SEH, very large frames need a frame-pointer as maximum stack
+     allocation is 4GB.  */
+  if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
+    return true;
+
+  /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
+     turns off the frame pointer by default.  Turn it back on now if
+     we've not got a leaf function.  */
+  if (TARGET_OMIT_LEAF_FRAME_POINTER
+      && (!crtl->is_leaf
+	  || ix86_current_function_calls_tls_descriptor))
+    return true;
+
+  if (crtl->profile && !flag_fentry)
+    return true;
+
+  return false;
+}
+
+/* Record that the current function accesses previous call frames.  */
+
+void
+ix86_setup_frame_addresses (void)
+{
+  cfun->machine->accesses_prev_frame = 1;
+}
+
+#ifndef USE_HIDDEN_LINKONCE
+# if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
+#  define USE_HIDDEN_LINKONCE 1
+# else
+#  define USE_HIDDEN_LINKONCE 0
+# endif
+#endif
+
+static int pic_labels_used;
+
+/* Fills in the label name that should be used for a pc thunk for
+   the given register.  */
+
+static void
+get_pc_thunk_name (char name[32], unsigned int regno)
+{
+  gcc_assert (!TARGET_64BIT);
+
+  if (USE_HIDDEN_LINKONCE)
+    sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
+  else
+    ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
+}
+
+
+/* This function generates code for -fpic that loads %ebx with
+   the return address of the caller and then returns.  */
+
+static void
+ix86_code_end (void)
+{
+  rtx xops[2];
+  int regno;
+
+  for (regno = AX_REG; regno <= SP_REG; regno++)
+    {
+      char name[32];
+      tree decl;
+
+      if (!(pic_labels_used & (1 << regno)))
+	continue;
+
+      get_pc_thunk_name (name, regno);
+
+      decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
+			 get_identifier (name),
+			 build_function_type_list (void_type_node, NULL_TREE));
+      DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
+				       NULL_TREE, void_type_node);
+      TREE_PUBLIC (decl) = 1;
+      TREE_STATIC (decl) = 1;
+      DECL_IGNORED_P (decl) = 1;
+
+#if TARGET_MACHO
+      if (TARGET_MACHO)
+	{
+	  switch_to_section (darwin_sections[text_coal_section]);
+	  fputs ("\t.weak_definition\t", asm_out_file);
+	  assemble_name (asm_out_file, name);
+	  fputs ("\n\t.private_extern\t", asm_out_file);
+	  assemble_name (asm_out_file, name);
+	  putc ('\n', asm_out_file);
+	  ASM_OUTPUT_LABEL (asm_out_file, name);
+	  DECL_WEAK (decl) = 1;
+	}
+      else
+#endif
+      if (USE_HIDDEN_LINKONCE)
+	{
+	  DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
+
+	  targetm.asm_out.unique_section (decl, 0);
+	  switch_to_section (get_named_section (decl, NULL, 0));
+
+	  targetm.asm_out.globalize_label (asm_out_file, name);
+	  fputs ("\t.hidden\t", asm_out_file);
+	  assemble_name (asm_out_file, name);
+	  putc ('\n', asm_out_file);
+	  ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
+	}
+      else
+	{
+	  switch_to_section (text_section);
+	  ASM_OUTPUT_LABEL (asm_out_file, name);
+	}
+
+      DECL_INITIAL (decl) = make_node (BLOCK);
+      current_function_decl = decl;
+      init_function_start (decl);
+      first_function_block_is_cold = false;
+      /* Make sure unwind info is emitted for the thunk if needed.  */
+      final_start_function (emit_barrier (), asm_out_file, 1);
+
+      /* Pad stack IP move with 4 instructions (two NOPs count
+	 as one instruction).  */
+      if (TARGET_PAD_SHORT_FUNCTION)
+	{
+	  int i = 8;
+
+	  while (i--)
+	    fputs ("\tnop\n", asm_out_file);
+	}
+
+      xops[0] = gen_rtx_REG (Pmode, regno);
+      xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
+      output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
+      fputs ("\tret\n", asm_out_file);
+      final_end_function ();
+      init_insn_lengths ();
+      free_after_compilation (cfun);
+      set_cfun (NULL);
+      current_function_decl = NULL;
+    }
+
+  if (flag_split_stack)
+    file_end_indicate_split_stack ();
+}
+
+/* Emit code for the SET_GOT patterns.  */
+
+const char *
+output_set_got (rtx dest, rtx label)
+{
+  rtx xops[3];
+
+  xops[0] = dest;
+
+  if (TARGET_VXWORKS_RTP && flag_pic)
+    {
+      /* Load (*VXWORKS_GOTT_BASE) into the PIC register.  */
+      xops[2] = gen_rtx_MEM (Pmode,
+			     gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
+      output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
+
+      /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
+	 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
+	 an unadorned address.  */
+      xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
+      SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
+      output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
+      return "";
+    }
+
+  xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
+
+  if (!flag_pic)
+    {
+      if (TARGET_MACHO)
+	/* We don't need a pic base, we're not producing pic.  */
+	gcc_unreachable ();
+
+      xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
+      output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
+      targetm.asm_out.internal_label (asm_out_file, "L",
+				      CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
+    }
+  else
+    {
+      char name[32];
+      get_pc_thunk_name (name, REGNO (dest));
+      pic_labels_used |= 1 << REGNO (dest);
+
+      xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
+      xops[2] = gen_rtx_MEM (QImode, xops[2]);
+      output_asm_insn ("call\t%X2", xops);
+
+#if TARGET_MACHO
+      /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
+         This is what will be referenced by the Mach-O PIC subsystem.  */
+      if (machopic_should_output_picbase_label () || !label)
+	ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
+
+      /* When we are restoring the pic base at the site of a nonlocal label,
+         and we decided to emit the pic base above, we will still output a
+         local label used for calculating the correction offset (even though
+         the offset will be 0 in that case).  */
+      if (label)
+        targetm.asm_out.internal_label (asm_out_file, "L",
+					   CODE_LABEL_NUMBER (label));
+#endif
+    }
+
+  if (!TARGET_MACHO)
+    output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
+
+  return "";
+}
+
+/* Generate an "push" pattern for input ARG.  */
+
+static rtx
+gen_push (rtx arg)
+{
+  struct machine_function *m = cfun->machine;
+
+  if (m->fs.cfa_reg == stack_pointer_rtx)
+    m->fs.cfa_offset += UNITS_PER_WORD;
+  m->fs.sp_offset += UNITS_PER_WORD;
+
+  if (REG_P (arg) && GET_MODE (arg) != word_mode)
+    arg = gen_rtx_REG (word_mode, REGNO (arg));
+
+  return gen_rtx_SET (VOIDmode,
+		      gen_rtx_MEM (word_mode,
+				   gen_rtx_PRE_DEC (Pmode,
+						    stack_pointer_rtx)),
+		      arg);
+}
+
+/* Generate an "pop" pattern for input ARG.  */
+
+static rtx
+gen_pop (rtx arg)
+{
+  if (REG_P (arg) && GET_MODE (arg) != word_mode)
+    arg = gen_rtx_REG (word_mode, REGNO (arg));
+
+  return gen_rtx_SET (VOIDmode,
+		      arg,
+		      gen_rtx_MEM (word_mode,
+				   gen_rtx_POST_INC (Pmode,
+						     stack_pointer_rtx)));
+}
+
+/* Return >= 0 if there is an unused call-clobbered register available
+   for the entire function.  */
+
+static unsigned int
+ix86_select_alt_pic_regnum (void)
+{
+  if (crtl->is_leaf
+      && !crtl->profile
+      && !ix86_current_function_calls_tls_descriptor)
+    {
+      int i, drap;
+      /* Can't use the same register for both PIC and DRAP.  */
+      if (crtl->drap_reg)
+	drap = REGNO (crtl->drap_reg);
+      else
+	drap = -1;
+      for (i = 2; i >= 0; --i)
+        if (i != drap && !df_regs_ever_live_p (i))
+	  return i;
+    }
+
+  return INVALID_REGNUM;
+}
+
+/* Return TRUE if we need to save REGNO.  */
+
+static bool
+ix86_save_reg (unsigned int regno, bool maybe_eh_return)
+{
+  if (pic_offset_table_rtx
+      && regno == REAL_PIC_OFFSET_TABLE_REGNUM
+      && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
+	  || crtl->profile
+	  || crtl->calls_eh_return
+	  || crtl->uses_const_pool
+	  || cfun->has_nonlocal_label))
+    return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
+
+  if (crtl->calls_eh_return && maybe_eh_return)
+    {
+      unsigned i;
+      for (i = 0; ; i++)
+	{
+	  unsigned test = EH_RETURN_DATA_REGNO (i);
+	  if (test == INVALID_REGNUM)
+	    break;
+	  if (test == regno)
+	    return true;
+	}
+    }
+
+  if (crtl->drap_reg
+      && regno == REGNO (crtl->drap_reg)
+      && !cfun->machine->no_drap_save_restore)
+    return true;
+
+  return (df_regs_ever_live_p (regno)
+	  && !call_used_regs[regno]
+	  && !fixed_regs[regno]
+	  && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
+}
+
+/* Return number of saved general prupose registers.  */
+
+static int
+ix86_nsaved_regs (void)
+{
+  int nregs = 0;
+  int regno;
+
+  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
+      nregs ++;
+  return nregs;
+}
+
+/* Return number of saved SSE registrers.  */
+
+static int
+ix86_nsaved_sseregs (void)
+{
+  int nregs = 0;
+  int regno;
+
+  if (!TARGET_64BIT_MS_ABI)
+    return 0;
+  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
+      nregs ++;
+  return nregs;
+}
+
+/* Given FROM and TO register numbers, say whether this elimination is
+   allowed.  If stack alignment is needed, we can only replace argument
+   pointer with hard frame pointer, or replace frame pointer with stack
+   pointer.  Otherwise, frame pointer elimination is automatically
+   handled and all other eliminations are valid.  */
+
+static bool
+ix86_can_eliminate (const int from, const int to)
+{
+  if (stack_realign_fp)
+    return ((from == ARG_POINTER_REGNUM
+	     && to == HARD_FRAME_POINTER_REGNUM)
+	    || (from == FRAME_POINTER_REGNUM
+		&& to == STACK_POINTER_REGNUM));
+  else
+    return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
+}
+
+/* Return the offset between two registers, one to be eliminated, and the other
+   its replacement, at the start of a routine.  */
+
+HOST_WIDE_INT
+ix86_initial_elimination_offset (int from, int to)
+{
+  struct ix86_frame frame;
+  ix86_compute_frame_layout (&frame);
+
+  if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
+    return frame.hard_frame_pointer_offset;
+  else if (from == FRAME_POINTER_REGNUM
+	   && to == HARD_FRAME_POINTER_REGNUM)
+    return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
+  else
+    {
+      gcc_assert (to == STACK_POINTER_REGNUM);
+
+      if (from == ARG_POINTER_REGNUM)
+	return frame.stack_pointer_offset;
+
+      gcc_assert (from == FRAME_POINTER_REGNUM);
+      return frame.stack_pointer_offset - frame.frame_pointer_offset;
+    }
+}
+
+/* In a dynamically-aligned function, we can't know the offset from
+   stack pointer to frame pointer, so we must ensure that setjmp
+   eliminates fp against the hard fp (%ebp) rather than trying to
+   index from %esp up to the top of the frame across a gap that is
+   of unknown (at compile-time) size.  */
+static rtx
+ix86_builtin_setjmp_frame_value (void)
+{
+  return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
+}
+
+/* When using -fsplit-stack, the allocation routines set a field in
+   the TCB to the bottom of the stack plus this much space, measured
+   in bytes.  */
+
+#define SPLIT_STACK_AVAILABLE 256
+
+/* Fill structure ix86_frame about frame of currently computed function.  */
+
+static void
+ix86_compute_frame_layout (struct ix86_frame *frame)
+{
+  unsigned HOST_WIDE_INT stack_alignment_needed;
+  HOST_WIDE_INT offset;
+  unsigned HOST_WIDE_INT preferred_alignment;
+  HOST_WIDE_INT size = get_frame_size ();
+  HOST_WIDE_INT to_allocate;
+
+  frame->nregs = ix86_nsaved_regs ();
+  frame->nsseregs = ix86_nsaved_sseregs ();
+
+  stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
+  preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
+
+  /* 64-bit MS ABI seem to require stack alignment to be always 16 except for
+     function prologues and leaf.  */
+  if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
+      && (!crtl->is_leaf || cfun->calls_alloca != 0
+          || ix86_current_function_calls_tls_descriptor))
+    {
+      preferred_alignment = 16;
+      stack_alignment_needed = 16;
+      crtl->preferred_stack_boundary = 128;
+      crtl->stack_alignment_needed = 128;
+    }
+
+  gcc_assert (!size || stack_alignment_needed);
+  gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
+  gcc_assert (preferred_alignment <= stack_alignment_needed);
+
+  /* For SEH we have to limit the amount of code movement into the prologue.
+     At present we do this via a BLOCKAGE, at which point there's very little
+     scheduling that can be done, which means that there's very little point
+     in doing anything except PUSHs.  */
+  if (TARGET_SEH)
+    cfun->machine->use_fast_prologue_epilogue = false;
+
+  /* During reload iteration the amount of registers saved can change.
+     Recompute the value as needed.  Do not recompute when amount of registers
+     didn't change as reload does multiple calls to the function and does not
+     expect the decision to change within single iteration.  */
+  else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))
+           && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
+    {
+      int count = frame->nregs;
+      struct cgraph_node *node = cgraph_get_node (current_function_decl);
+
+      cfun->machine->use_fast_prologue_epilogue_nregs = count;
+
+      /* The fast prologue uses move instead of push to save registers.  This
+         is significantly longer, but also executes faster as modern hardware
+         can execute the moves in parallel, but can't do that for push/pop.
+
+	 Be careful about choosing what prologue to emit:  When function takes
+	 many instructions to execute we may use slow version as well as in
+	 case function is known to be outside hot spot (this is known with
+	 feedback only).  Weight the size of function by number of registers
+	 to save as it is cheap to use one or two push instructions but very
+	 slow to use many of them.  */
+      if (count)
+	count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
+      if (node->frequency < NODE_FREQUENCY_NORMAL
+	  || (flag_branch_probabilities
+	      && node->frequency < NODE_FREQUENCY_HOT))
+        cfun->machine->use_fast_prologue_epilogue = false;
+      else
+        cfun->machine->use_fast_prologue_epilogue
+	   = !expensive_function_p (count);
+    }
+
+  frame->save_regs_using_mov
+    = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
+       /* If static stack checking is enabled and done with probes,
+	  the registers need to be saved before allocating the frame.  */
+       && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
+
+  /* Skip return address.  */
+  offset = UNITS_PER_WORD;
+
+  /* Skip pushed static chain.  */
+  if (ix86_static_chain_on_stack)
+    offset += UNITS_PER_WORD;
+
+  /* Skip saved base pointer.  */
+  if (frame_pointer_needed)
+    offset += UNITS_PER_WORD;
+  frame->hfp_save_offset = offset;
+
+  /* The traditional frame pointer location is at the top of the frame.  */
+  frame->hard_frame_pointer_offset = offset;
+
+  /* Register save area */
+  offset += frame->nregs * UNITS_PER_WORD;
+  frame->reg_save_offset = offset;
+
+  /* On SEH target, registers are pushed just before the frame pointer
+     location.  */
+  if (TARGET_SEH)
+    frame->hard_frame_pointer_offset = offset;
+
+  /* Align and set SSE register save area.  */
+  if (frame->nsseregs)
+    {
+      /* The only ABI that has saved SSE registers (Win64) also has a
+         16-byte aligned default stack, and thus we don't need to be
+	 within the re-aligned local stack frame to save them.  */
+      gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
+      offset = (offset + 16 - 1) & -16;
+      offset += frame->nsseregs * 16;
+    }
+  frame->sse_reg_save_offset = offset;
+
+  /* The re-aligned stack starts here.  Values before this point are not
+     directly comparable with values below this point.  In order to make
+     sure that no value happens to be the same before and after, force
+     the alignment computation below to add a non-zero value.  */
+  if (stack_realign_fp)
+    offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
+
+  /* Va-arg area */
+  frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
+  offset += frame->va_arg_size;
+
+  /* Align start of frame for local function.  */
+  if (stack_realign_fp
+      || offset != frame->sse_reg_save_offset
+      || size != 0
+      || !crtl->is_leaf
+      || cfun->calls_alloca
+      || ix86_current_function_calls_tls_descriptor)
+    offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
+
+  /* Frame pointer points here.  */
+  frame->frame_pointer_offset = offset;
+
+  offset += size;
+
+  /* Add outgoing arguments area.  Can be skipped if we eliminated
+     all the function calls as dead code.
+     Skipping is however impossible when function calls alloca.  Alloca
+     expander assumes that last crtl->outgoing_args_size
+     of stack frame are unused.  */
+  if (ACCUMULATE_OUTGOING_ARGS
+      && (!crtl->is_leaf || cfun->calls_alloca
+	  || ix86_current_function_calls_tls_descriptor))
+    {
+      offset += crtl->outgoing_args_size;
+      frame->outgoing_arguments_size = crtl->outgoing_args_size;
+    }
+  else
+    frame->outgoing_arguments_size = 0;
+
+  /* Align stack boundary.  Only needed if we're calling another function
+     or using alloca.  */
+  if (!crtl->is_leaf || cfun->calls_alloca
+      || ix86_current_function_calls_tls_descriptor)
+    offset = (offset + preferred_alignment - 1) & -preferred_alignment;
+
+  /* We've reached end of stack frame.  */
+  frame->stack_pointer_offset = offset;
+
+  /* Size prologue needs to allocate.  */
+  to_allocate = offset - frame->sse_reg_save_offset;
+
+  if ((!to_allocate && frame->nregs <= 1)
+      || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
+    frame->save_regs_using_mov = false;
+
+  if (ix86_using_red_zone ()
+      && crtl->sp_is_unchanging
+      && crtl->is_leaf
+      && !ix86_current_function_calls_tls_descriptor)
+    {
+      frame->red_zone_size = to_allocate;
+      if (frame->save_regs_using_mov)
+	frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
+      if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
+	frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
+    }
+  else
+    frame->red_zone_size = 0;
+  frame->stack_pointer_offset -= frame->red_zone_size;
+
+  /* The SEH frame pointer location is near the bottom of the frame.
+     This is enforced by the fact that the difference between the
+     stack pointer and the frame pointer is limited to 240 bytes in
+     the unwind data structure.  */
+  if (TARGET_SEH)
+    {
+      HOST_WIDE_INT diff;
+
+      /* If we can leave the frame pointer where it is, do so.  Also, returns
+	 the establisher frame for __builtin_frame_address (0).  */
+      diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
+      if (diff <= SEH_MAX_FRAME_SIZE
+	  && (diff > 240 || (diff & 15) != 0)
+	  && !crtl->accesses_prior_frames)
+	{
+	  /* Ideally we'd determine what portion of the local stack frame
+	     (within the constraint of the lowest 240) is most heavily used.
+	     But without that complication, simply bias the frame pointer
+	     by 128 bytes so as to maximize the amount of the local stack
+	     frame that is addressable with 8-bit offsets.  */
+	  frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
+	}
+    }
+}
+
+/* This is semi-inlined memory_address_length, but simplified
+   since we know that we're always dealing with reg+offset, and
+   to avoid having to create and discard all that rtl.  */
+
+static inline int
+choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
+{
+  int len = 4;
+
+  if (offset == 0)
+    {
+      /* EBP and R13 cannot be encoded without an offset.  */
+      len = (regno == BP_REG || regno == R13_REG);
+    }
+  else if (IN_RANGE (offset, -128, 127))
+    len = 1;
+
+  /* ESP and R12 must be encoded with a SIB byte.  */
+  if (regno == SP_REG || regno == R12_REG)
+    len++;
+
+  return len;
+}
+
+/* Return an RTX that points to CFA_OFFSET within the stack frame.
+   The valid base registers are taken from CFUN->MACHINE->FS.  */
+
+static rtx
+choose_baseaddr (HOST_WIDE_INT cfa_offset)
+{
+  const struct machine_function *m = cfun->machine;
+  rtx base_reg = NULL;
+  HOST_WIDE_INT base_offset = 0;
+
+  if (m->use_fast_prologue_epilogue)
+    {
+      /* Choose the base register most likely to allow the most scheduling
+         opportunities.  Generally FP is valid throughout the function,
+         while DRAP must be reloaded within the epilogue.  But choose either
+         over the SP due to increased encoding size.  */
+
+      if (m->fs.fp_valid)
+	{
+	  base_reg = hard_frame_pointer_rtx;
+	  base_offset = m->fs.fp_offset - cfa_offset;
+	}
+      else if (m->fs.drap_valid)
+	{
+	  base_reg = crtl->drap_reg;
+	  base_offset = 0 - cfa_offset;
+	}
+      else if (m->fs.sp_valid)
+	{
+	  base_reg = stack_pointer_rtx;
+	  base_offset = m->fs.sp_offset - cfa_offset;
+	}
+    }
+  else
+    {
+      HOST_WIDE_INT toffset;
+      int len = 16, tlen;
+
+      /* Choose the base register with the smallest address encoding.
+         With a tie, choose FP > DRAP > SP.  */
+      if (m->fs.sp_valid)
+	{
+	  base_reg = stack_pointer_rtx;
+	  base_offset = m->fs.sp_offset - cfa_offset;
+          len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
+	}
+      if (m->fs.drap_valid)
+	{
+	  toffset = 0 - cfa_offset;
+	  tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
+	  if (tlen <= len)
+	    {
+	      base_reg = crtl->drap_reg;
+	      base_offset = toffset;
+	      len = tlen;
+	    }
+	}
+      if (m->fs.fp_valid)
+	{
+	  toffset = m->fs.fp_offset - cfa_offset;
+	  tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
+	  if (tlen <= len)
+	    {
+	      base_reg = hard_frame_pointer_rtx;
+	      base_offset = toffset;
+	      len = tlen;
+	    }
+	}
+    }
+  gcc_assert (base_reg != NULL);
+
+  return plus_constant (Pmode, base_reg, base_offset);
+}
+
+/* Emit code to save registers in the prologue.  */
+
+static void
+ix86_emit_save_regs (void)
+{
+  unsigned int regno;
+  rtx insn;
+
+  for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
+    if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
+      {
+	insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
+	RTX_FRAME_RELATED_P (insn) = 1;
+      }
+}
+
+/* Emit a single register save at CFA - CFA_OFFSET.  */
+
+static void
+ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
+			      HOST_WIDE_INT cfa_offset)
+{
+  struct machine_function *m = cfun->machine;
+  rtx reg = gen_rtx_REG (mode, regno);
+  rtx mem, addr, base, insn;
+
+  addr = choose_baseaddr (cfa_offset);
+  mem = gen_frame_mem (mode, addr);
+
+  /* For SSE saves, we need to indicate the 128-bit alignment.  */
+  set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
+
+  insn = emit_move_insn (mem, reg);
+  RTX_FRAME_RELATED_P (insn) = 1;
+
+  base = addr;
+  if (GET_CODE (base) == PLUS)
+    base = XEXP (base, 0);
+  gcc_checking_assert (REG_P (base));
+
+  /* When saving registers into a re-aligned local stack frame, avoid
+     any tricky guessing by dwarf2out.  */
+  if (m->fs.realigned)
+    {
+      gcc_checking_assert (stack_realign_drap);
+
+      if (regno == REGNO (crtl->drap_reg))
+	{
+	  /* A bit of a hack.  We force the DRAP register to be saved in
+	     the re-aligned stack frame, which provides us with a copy
+	     of the CFA that will last past the prologue.  Install it.  */
+	  gcc_checking_assert (cfun->machine->fs.fp_valid);
+	  addr = plus_constant (Pmode, hard_frame_pointer_rtx,
+				cfun->machine->fs.fp_offset - cfa_offset);
+	  mem = gen_rtx_MEM (mode, addr);
+	  add_reg_note (insn, REG_CFA_DEF_CFA, mem);
+	}
+      else
+	{
+	  /* The frame pointer is a stable reference within the
+	     aligned frame.  Use it.  */
+	  gcc_checking_assert (cfun->machine->fs.fp_valid);
+	  addr = plus_constant (Pmode, hard_frame_pointer_rtx,
+				cfun->machine->fs.fp_offset - cfa_offset);
+	  mem = gen_rtx_MEM (mode, addr);
+	  add_reg_note (insn, REG_CFA_EXPRESSION,
+			gen_rtx_SET (VOIDmode, mem, reg));
+	}
+    }
+
+  /* The memory may not be relative to the current CFA register,
+     which means that we may need to generate a new pattern for
+     use by the unwind info.  */
+  else if (base != m->fs.cfa_reg)
+    {
+      addr = plus_constant (Pmode, m->fs.cfa_reg,
+			    m->fs.cfa_offset - cfa_offset);
+      mem = gen_rtx_MEM (mode, addr);
+      add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
+    }
+}
+
+/* Emit code to save registers using MOV insns.
+   First register is stored at CFA - CFA_OFFSET.  */
+static void
+ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
+{
+  unsigned int regno;
+
+  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
+      {
+        ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
+	cfa_offset -= UNITS_PER_WORD;
+      }
+}
+
+/* Emit code to save SSE registers using MOV insns.
+   First register is stored at CFA - CFA_OFFSET.  */
+static void
+ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
+{
+  unsigned int regno;
+
+  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
+      {
+	ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
+	cfa_offset -= 16;
+      }
+}
+
+static GTY(()) rtx queued_cfa_restores;
+
+/* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
+   manipulation insn.  The value is on the stack at CFA - CFA_OFFSET.
+   Don't add the note if the previously saved value will be left untouched
+   within stack red-zone till return, as unwinders can find the same value
+   in the register and on the stack.  */
+
+static void
+ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
+{
+  if (!crtl->shrink_wrapped
+      && cfa_offset <= cfun->machine->fs.red_zone_offset)
+    return;
+
+  if (insn)
+    {
+      add_reg_note (insn, REG_CFA_RESTORE, reg);
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
+  else
+    queued_cfa_restores
+      = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
+}
+
+/* Add queued REG_CFA_RESTORE notes if any to INSN.  */
+
+static void
+ix86_add_queued_cfa_restore_notes (rtx insn)
+{
+  rtx last;
+  if (!queued_cfa_restores)
+    return;
+  for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
+    ;
+  XEXP (last, 1) = REG_NOTES (insn);
+  REG_NOTES (insn) = queued_cfa_restores;
+  queued_cfa_restores = NULL_RTX;
+  RTX_FRAME_RELATED_P (insn) = 1;
+}
+
+/* Expand prologue or epilogue stack adjustment.
+   The pattern exist to put a dependency on all ebp-based memory accesses.
+   STYLE should be negative if instructions should be marked as frame related,
+   zero if %r11 register is live and cannot be freely used and positive
+   otherwise.  */
+
+static void
+pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
+			   int style, bool set_cfa)
+{
+  struct machine_function *m = cfun->machine;
+  rtx insn;
+  bool add_frame_related_expr = false;
+
+  if (Pmode == SImode)
+    insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
+  else if (x86_64_immediate_operand (offset, DImode))
+    insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
+  else
+    {
+      rtx tmp;
+      /* r11 is used by indirect sibcall return as well, set before the
+	 epilogue and used after the epilogue.  */
+      if (style)
+        tmp = gen_rtx_REG (DImode, R11_REG);
+      else
+	{
+	  gcc_assert (src != hard_frame_pointer_rtx
+		      && dest != hard_frame_pointer_rtx);
+	  tmp = hard_frame_pointer_rtx;
+	}
+      insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
+      if (style < 0)
+	add_frame_related_expr = true;
+
+      insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
+    }
+
+  insn = emit_insn (insn);
+  if (style >= 0)
+    ix86_add_queued_cfa_restore_notes (insn);
+
+  if (set_cfa)
+    {
+      rtx r;
+
+      gcc_assert (m->fs.cfa_reg == src);
+      m->fs.cfa_offset += INTVAL (offset);
+      m->fs.cfa_reg = dest;
+
+      r = gen_rtx_PLUS (Pmode, src, offset);
+      r = gen_rtx_SET (VOIDmode, dest, r);
+      add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
+  else if (style < 0)
+    {
+      RTX_FRAME_RELATED_P (insn) = 1;
+      if (add_frame_related_expr)
+	{
+	  rtx r = gen_rtx_PLUS (Pmode, src, offset);
+	  r = gen_rtx_SET (VOIDmode, dest, r);
+	  add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
+	}
+    }
+
+  if (dest == stack_pointer_rtx)
+    {
+      HOST_WIDE_INT ooffset = m->fs.sp_offset;
+      bool valid = m->fs.sp_valid;
+
+      if (src == hard_frame_pointer_rtx)
+	{
+	  valid = m->fs.fp_valid;
+	  ooffset = m->fs.fp_offset;
+	}
+      else if (src == crtl->drap_reg)
+	{
+	  valid = m->fs.drap_valid;
+	  ooffset = 0;
+	}
+      else
+	{
+	  /* Else there are two possibilities: SP itself, which we set
+	     up as the default above.  Or EH_RETURN_STACKADJ_RTX, which is
+	     taken care of this by hand along the eh_return path.  */
+	  gcc_checking_assert (src == stack_pointer_rtx
+			       || offset == const0_rtx);
+	}
+
+      m->fs.sp_offset = ooffset - INTVAL (offset);
+      m->fs.sp_valid = valid;
+    }
+}
+
+/* Find an available register to be used as dynamic realign argument
+   pointer regsiter.  Such a register will be written in prologue and
+   used in begin of body, so it must not be
+	1. parameter passing register.
+	2. GOT pointer.
+   We reuse static-chain register if it is available.  Otherwise, we
+   use DI for i386 and R13 for x86-64.  We chose R13 since it has
+   shorter encoding.
+
+   Return: the regno of chosen register.  */
+
+static unsigned int
+find_drap_reg (void)
+{
+  tree decl = cfun->decl;
+
+  if (TARGET_64BIT)
+    {
+      /* Use R13 for nested function or function need static chain.
+	 Since function with tail call may use any caller-saved
+	 registers in epilogue, DRAP must not use caller-saved
+	 register in such case.  */
+      if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
+	return R13_REG;
+
+      return R10_REG;
+    }
+  else
+    {
+      /* Use DI for nested function or function need static chain.
+	 Since function with tail call may use any caller-saved
+	 registers in epilogue, DRAP must not use caller-saved
+	 register in such case.  */
+      if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
+	return DI_REG;
+
+      /* Reuse static chain register if it isn't used for parameter
+         passing.  */
+      if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
+	{
+	  unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
+	  if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
+	    return CX_REG;
+	}
+      return DI_REG;
+    }
+}
+
+/* Return minimum incoming stack alignment.  */
+
+static unsigned int
+ix86_minimum_incoming_stack_boundary (bool sibcall)
+{
+  unsigned int incoming_stack_boundary;
+
+  /* Prefer the one specified at command line. */
+  if (ix86_user_incoming_stack_boundary)
+    incoming_stack_boundary = ix86_user_incoming_stack_boundary;
+  /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary
+     if -mstackrealign is used, it isn't used for sibcall check and
+     estimated stack alignment is 128bit.  */
+  else if (!sibcall
+	   && !TARGET_64BIT
+	   && ix86_force_align_arg_pointer
+	   && crtl->stack_alignment_estimated == 128)
+    incoming_stack_boundary = MIN_STACK_BOUNDARY;
+  else
+    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
+
+  /* Incoming stack alignment can be changed on individual functions
+     via force_align_arg_pointer attribute.  We use the smallest
+     incoming stack boundary.  */
+  if (incoming_stack_boundary > MIN_STACK_BOUNDARY
+      && lookup_attribute (ix86_force_align_arg_pointer_string,
+			   TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
+    incoming_stack_boundary = MIN_STACK_BOUNDARY;
+
+  /* The incoming stack frame has to be aligned at least at
+     parm_stack_boundary.  */
+  if (incoming_stack_boundary < crtl->parm_stack_boundary)
+    incoming_stack_boundary = crtl->parm_stack_boundary;
+
+  /* Stack at entrance of main is aligned by runtime.  We use the
+     smallest incoming stack boundary. */
+  if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
+      && DECL_NAME (current_function_decl)
+      && MAIN_NAME_P (DECL_NAME (current_function_decl))
+      && DECL_FILE_SCOPE_P (current_function_decl))
+    incoming_stack_boundary = MAIN_STACK_BOUNDARY;
+
+  return incoming_stack_boundary;
+}
+
+/* Update incoming stack boundary and estimated stack alignment.  */
+
+static void
+ix86_update_stack_boundary (void)
+{
+  ix86_incoming_stack_boundary
+    = ix86_minimum_incoming_stack_boundary (false);
+
+  /* x86_64 vararg needs 16byte stack alignment for register save
+     area.  */
+  if (TARGET_64BIT
+      && cfun->stdarg
+      && crtl->stack_alignment_estimated < 128)
+    crtl->stack_alignment_estimated = 128;
+}
+
+/* Handle the TARGET_GET_DRAP_RTX hook.  Return NULL if no DRAP is
+   needed or an rtx for DRAP otherwise.  */
+
+static rtx
+ix86_get_drap_rtx (void)
+{
+  if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
+    crtl->need_drap = true;
+
+  if (stack_realign_drap)
+    {
+      /* Assign DRAP to vDRAP and returns vDRAP */
+      unsigned int regno = find_drap_reg ();
+      rtx drap_vreg;
+      rtx arg_ptr;
+      rtx seq, insn;
+
+      arg_ptr = gen_rtx_REG (Pmode, regno);
+      crtl->drap_reg = arg_ptr;
+
+      start_sequence ();
+      drap_vreg = copy_to_reg (arg_ptr);
+      seq = get_insns ();
+      end_sequence ();
+
+      insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
+      if (!optimize)
+	{
+	  add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+      return drap_vreg;
+    }
+  else
+    return NULL;
+}
+
+/* Handle the TARGET_INTERNAL_ARG_POINTER hook.  */
+
+static rtx
+ix86_internal_arg_pointer (void)
+{
+  return virtual_incoming_args_rtx;
+}
+
+struct scratch_reg {
+  rtx reg;
+  bool saved;
+};
+
+/* Return a short-lived scratch register for use on function entry.
+   In 32-bit mode, it is valid only after the registers are saved
+   in the prologue.  This register must be released by means of
+   release_scratch_register_on_entry once it is dead.  */
+
+static void
+get_scratch_register_on_entry (struct scratch_reg *sr)
+{
+  int regno;
+
+  sr->saved = false;
+
+  if (TARGET_64BIT)
+    {
+      /* We always use R11 in 64-bit mode.  */
+      regno = R11_REG;
+    }
+  else
+    {
+      tree decl = current_function_decl, fntype = TREE_TYPE (decl);
+      bool fastcall_p
+	= lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
+      bool thiscall_p
+	= lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
+      bool static_chain_p = DECL_STATIC_CHAIN (decl);
+      int regparm = ix86_function_regparm (fntype, decl);
+      int drap_regno
+	= crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
+
+      /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
+	  for the static chain register.  */
+      if ((regparm < 1 || (fastcall_p && !static_chain_p))
+	  && drap_regno != AX_REG)
+	regno = AX_REG;
+      /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
+	  for the static chain register.  */
+      else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
+        regno = AX_REG;
+      else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
+	regno = DX_REG;
+      /* ecx is the static chain register.  */
+      else if (regparm < 3 && !fastcall_p && !thiscall_p
+	       && !static_chain_p
+	       && drap_regno != CX_REG)
+	regno = CX_REG;
+      else if (ix86_save_reg (BX_REG, true))
+	regno = BX_REG;
+      /* esi is the static chain register.  */
+      else if (!(regparm == 3 && static_chain_p)
+	       && ix86_save_reg (SI_REG, true))
+	regno = SI_REG;
+      else if (ix86_save_reg (DI_REG, true))
+	regno = DI_REG;
+      else
+	{
+	  regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
+	  sr->saved = true;
+	}
+    }
+
+  sr->reg = gen_rtx_REG (Pmode, regno);
+  if (sr->saved)
+    {
+      rtx insn = emit_insn (gen_push (sr->reg));
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
+}
+
+/* Release a scratch register obtained from the preceding function.  */
+
+static void
+release_scratch_register_on_entry (struct scratch_reg *sr)
+{
+  if (sr->saved)
+    {
+      struct machine_function *m = cfun->machine;
+      rtx x, insn = emit_insn (gen_pop (sr->reg));
+
+      /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop.  */
+      RTX_FRAME_RELATED_P (insn) = 1;
+      x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
+      x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
+      add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
+      m->fs.sp_offset -= UNITS_PER_WORD;
+    }
+}
+
+#define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
+
+/* Emit code to adjust the stack pointer by SIZE bytes while probing it.  */
+
+static void
+ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
+{
+  /* We skip the probe for the first interval + a small dope of 4 words and
+     probe that many bytes past the specified size to maintain a protection
+     area at the botton of the stack.  */
+  const int dope = 4 * UNITS_PER_WORD;
+  rtx size_rtx = GEN_INT (size), last;
+
+  /* See if we have a constant small number of probes to generate.  If so,
+     that's the easy case.  The run-time loop is made up of 11 insns in the
+     generic case while the compile-time loop is made up of 3+2*(n-1) insns
+     for n # of intervals.  */
+  if (size <= 5 * PROBE_INTERVAL)
+    {
+      HOST_WIDE_INT i, adjust;
+      bool first_probe = true;
+
+      /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
+	 values of N from 1 until it exceeds SIZE.  If only one probe is
+	 needed, this will not generate any code.  Then adjust and probe
+	 to PROBE_INTERVAL + SIZE.  */
+      for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
+	{
+	  if (first_probe)
+	    {
+	      adjust = 2 * PROBE_INTERVAL + dope;
+	      first_probe = false;
+	    }
+	  else
+	    adjust = PROBE_INTERVAL;
+
+	  emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+				  plus_constant (Pmode, stack_pointer_rtx,
+						 -adjust)));
+	  emit_stack_probe (stack_pointer_rtx);
+	}
+
+      if (first_probe)
+	adjust = size + PROBE_INTERVAL + dope;
+      else
+        adjust = size + PROBE_INTERVAL - i;
+
+      emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+			      plus_constant (Pmode, stack_pointer_rtx,
+					     -adjust)));
+      emit_stack_probe (stack_pointer_rtx);
+
+      /* Adjust back to account for the additional first interval.  */
+      last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+				     plus_constant (Pmode, stack_pointer_rtx,
+						    PROBE_INTERVAL + dope)));
+    }
+
+  /* Otherwise, do the same as above, but in a loop.  Note that we must be
+     extra careful with variables wrapping around because we might be at
+     the very top (or the very bottom) of the address space and we have
+     to be able to handle this case properly; in particular, we use an
+     equality test for the loop condition.  */
+  else
+    {
+      HOST_WIDE_INT rounded_size;
+      struct scratch_reg sr;
+
+      get_scratch_register_on_entry (&sr);
+
+
+      /* Step 1: round SIZE to the previous multiple of the interval.  */
+
+      rounded_size = size & -PROBE_INTERVAL;
+
+
+      /* Step 2: compute initial and final value of the loop counter.  */
+
+      /* SP = SP_0 + PROBE_INTERVAL.  */
+      emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+			      plus_constant (Pmode, stack_pointer_rtx,
+					     - (PROBE_INTERVAL + dope))));
+
+      /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE.  */
+      emit_move_insn (sr.reg, GEN_INT (-rounded_size));
+      emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
+			      gen_rtx_PLUS (Pmode, sr.reg,
+					    stack_pointer_rtx)));
+
+
+      /* Step 3: the loop
+
+	 while (SP != LAST_ADDR)
+	   {
+	     SP = SP + PROBE_INTERVAL
+	     probe at SP
+	   }
+
+	 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
+	 values of N from 1 until it is equal to ROUNDED_SIZE.  */
+
+      emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
+
+
+      /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
+	 assert at compile-time that SIZE is equal to ROUNDED_SIZE.  */
+
+      if (size != rounded_size)
+	{
+	  emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+			          plus_constant (Pmode, stack_pointer_rtx,
+						 rounded_size - size)));
+	  emit_stack_probe (stack_pointer_rtx);
+	}
+
+      /* Adjust back to account for the additional first interval.  */
+      last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+				     plus_constant (Pmode, stack_pointer_rtx,
+						    PROBE_INTERVAL + dope)));
+
+      release_scratch_register_on_entry (&sr);
+    }
+
+  gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
+
+  /* Even if the stack pointer isn't the CFA register, we need to correctly
+     describe the adjustments made to it, in particular differentiate the
+     frame-related ones from the frame-unrelated ones.  */
+  if (size > 0)
+    {
+      rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
+      XVECEXP (expr, 0, 0)
+	= gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+		       plus_constant (Pmode, stack_pointer_rtx, -size));
+      XVECEXP (expr, 0, 1)
+	= gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+		       plus_constant (Pmode, stack_pointer_rtx,
+				      PROBE_INTERVAL + dope + size));
+      add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
+      RTX_FRAME_RELATED_P (last) = 1;
+
+      cfun->machine->fs.sp_offset += size;
+    }
+
+  /* Make sure nothing is scheduled before we are done.  */
+  emit_insn (gen_blockage ());
+}
+
+/* Adjust the stack pointer up to REG while probing it.  */
+
+const char *
+output_adjust_stack_and_probe (rtx reg)
+{
+  static int labelno = 0;
+  char loop_lab[32], end_lab[32];
+  rtx xops[2];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
+
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+  /* Jump to END_LAB if SP == LAST_ADDR.  */
+  xops[0] = stack_pointer_rtx;
+  xops[1] = reg;
+  output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
+  fputs ("\tje\t", asm_out_file);
+  assemble_name_raw (asm_out_file, end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* SP = SP + PROBE_INTERVAL.  */
+  xops[1] = GEN_INT (PROBE_INTERVAL);
+  output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
+
+  /* Probe at SP.  */
+  xops[1] = const0_rtx;
+  output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
+
+  fprintf (asm_out_file, "\tjmp\t");
+  assemble_name_raw (asm_out_file, loop_lab);
+  fputc ('\n', asm_out_file);
+
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
+
+  return "";
+}
+
+/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
+   inclusive.  These are offsets from the current stack pointer.  */
+
+static void
+ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
+{
+  /* See if we have a constant small number of probes to generate.  If so,
+     that's the easy case.  The run-time loop is made up of 7 insns in the
+     generic case while the compile-time loop is made up of n insns for n #
+     of intervals.  */
+  if (size <= 7 * PROBE_INTERVAL)
+    {
+      HOST_WIDE_INT i;
+
+      /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
+	 it exceeds SIZE.  If only one probe is needed, this will not
+	 generate any code.  Then probe at FIRST + SIZE.  */
+      for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
+	emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+					 -(first + i)));
+
+      emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+				       -(first + size)));
+    }
+
+  /* Otherwise, do the same as above, but in a loop.  Note that we must be
+     extra careful with variables wrapping around because we might be at
+     the very top (or the very bottom) of the address space and we have
+     to be able to handle this case properly; in particular, we use an
+     equality test for the loop condition.  */
+  else
+    {
+      HOST_WIDE_INT rounded_size, last;
+      struct scratch_reg sr;
+
+      get_scratch_register_on_entry (&sr);
+
+
+      /* Step 1: round SIZE to the previous multiple of the interval.  */
+
+      rounded_size = size & -PROBE_INTERVAL;
+
+
+      /* Step 2: compute initial and final value of the loop counter.  */
+
+      /* TEST_OFFSET = FIRST.  */
+      emit_move_insn (sr.reg, GEN_INT (-first));
+
+      /* LAST_OFFSET = FIRST + ROUNDED_SIZE.  */
+      last = first + rounded_size;
+
+
+      /* Step 3: the loop
+
+	 while (TEST_ADDR != LAST_ADDR)
+	   {
+	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
+	     probe at TEST_ADDR
+	   }
+
+         probes at FIRST + N * PROBE_INTERVAL for values of N from 1
+         until it is equal to ROUNDED_SIZE.  */
+
+      emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
+
+
+      /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
+	 that SIZE is equal to ROUNDED_SIZE.  */
+
+      if (size != rounded_size)
+	emit_stack_probe (plus_constant (Pmode,
+					 gen_rtx_PLUS (Pmode,
+						       stack_pointer_rtx,
+						       sr.reg),
+					 rounded_size - size));
+
+      release_scratch_register_on_entry (&sr);
+    }
+
+  /* Make sure nothing is scheduled before we are done.  */
+  emit_insn (gen_blockage ());
+}
+
+/* Probe a range of stack addresses from REG to END, inclusive.  These are
+   offsets from the current stack pointer.  */
+
+const char *
+output_probe_stack_range (rtx reg, rtx end)
+{
+  static int labelno = 0;
+  char loop_lab[32], end_lab[32];
+  rtx xops[3];
+
+  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
+  ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
+
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+  /* Jump to END_LAB if TEST_ADDR == LAST_ADDR.  */
+  xops[0] = reg;
+  xops[1] = end;
+  output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
+  fputs ("\tje\t", asm_out_file);
+  assemble_name_raw (asm_out_file, end_lab);
+  fputc ('\n', asm_out_file);
+
+  /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
+  xops[1] = GEN_INT (PROBE_INTERVAL);
+  output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
+
+  /* Probe at TEST_ADDR.  */
+  xops[0] = stack_pointer_rtx;
+  xops[1] = reg;
+  xops[2] = const0_rtx;
+  output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
+
+  fprintf (asm_out_file, "\tjmp\t");
+  assemble_name_raw (asm_out_file, loop_lab);
+  fputc ('\n', asm_out_file);
+
+  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
+
+  return "";
+}
+
+/* Finalize stack_realign_needed flag, which will guide prologue/epilogue
+   to be generated in correct form.  */
+static void
+ix86_finalize_stack_realign_flags (void)
+{
+  /* Check if stack realign is really needed after reload, and
+     stores result in cfun */
+  unsigned int incoming_stack_boundary
+    = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
+       ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
+  unsigned int stack_realign = (incoming_stack_boundary
+				< (crtl->is_leaf
+				   ? crtl->max_used_stack_slot_alignment
+				   : crtl->stack_alignment_needed));
+
+  if (crtl->stack_realign_finalized)
+    {
+      /* After stack_realign_needed is finalized, we can't no longer
+	 change it.  */
+      gcc_assert (crtl->stack_realign_needed == stack_realign);
+      return;
+    }
+
+  /* If the only reason for frame_pointer_needed is that we conservatively
+     assumed stack realignment might be needed, but in the end nothing that
+     needed the stack alignment had been spilled, clear frame_pointer_needed
+     and say we don't need stack realignment.  */
+  if (stack_realign
+      && frame_pointer_needed
+      && crtl->is_leaf
+      && flag_omit_frame_pointer
+      && crtl->sp_is_unchanging
+      && !ix86_current_function_calls_tls_descriptor
+      && !crtl->accesses_prior_frames
+      && !cfun->calls_alloca
+      && !crtl->calls_eh_return
+      && !(flag_stack_check && STACK_CHECK_MOVING_SP)
+      && !ix86_frame_pointer_required ()
+      && get_frame_size () == 0
+      && ix86_nsaved_sseregs () == 0
+      && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
+    {
+      HARD_REG_SET set_up_by_prologue, prologue_used;
+      basic_block bb;
+
+      CLEAR_HARD_REG_SET (prologue_used);
+      CLEAR_HARD_REG_SET (set_up_by_prologue);
+      add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
+      add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
+      add_to_hard_reg_set (&set_up_by_prologue, Pmode,
+			   HARD_FRAME_POINTER_REGNUM);
+      FOR_EACH_BB_FN (bb, cfun)
+        {
+          rtx insn;
+	  FOR_BB_INSNS (bb, insn)
+	    if (NONDEBUG_INSN_P (insn)
+		&& requires_stack_frame_p (insn, prologue_used,
+					   set_up_by_prologue))
+	      {
+		crtl->stack_realign_needed = stack_realign;
+		crtl->stack_realign_finalized = true;
+		return;
+	      }
+	}
+
+      /* If drap has been set, but it actually isn't live at the start
+	 of the function, there is no reason to set it up.  */
+      if (crtl->drap_reg)
+	{
+	  basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
+	  if (! REGNO_REG_SET_P (DF_LR_IN (bb), REGNO (crtl->drap_reg)))
+	    {
+	      crtl->drap_reg = NULL_RTX;
+	      crtl->need_drap = false;
+	    }
+	}
+      else
+	cfun->machine->no_drap_save_restore = true;
+
+      frame_pointer_needed = false;
+      stack_realign = false;
+      crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
+      crtl->stack_alignment_needed = incoming_stack_boundary;
+      crtl->stack_alignment_estimated = incoming_stack_boundary;
+      if (crtl->preferred_stack_boundary > incoming_stack_boundary)
+	crtl->preferred_stack_boundary = incoming_stack_boundary;
+      df_finish_pass (true);
+      df_scan_alloc (NULL);
+      df_scan_blocks ();
+      df_compute_regs_ever_live (true);
+      df_analyze ();
+    }
+
+  crtl->stack_realign_needed = stack_realign;
+  crtl->stack_realign_finalized = true;
+}
+
+/* Expand the prologue into a bunch of separate insns.  */
+
+void
+ix86_expand_prologue (void)
+{
+  struct machine_function *m = cfun->machine;
+  rtx insn, t;
+  bool pic_reg_used;
+  struct ix86_frame frame;
+  HOST_WIDE_INT allocate;
+  bool int_registers_saved;
+  bool sse_registers_saved;
+
+  ix86_finalize_stack_realign_flags ();
+
+  /* DRAP should not coexist with stack_realign_fp */
+  gcc_assert (!(crtl->drap_reg && stack_realign_fp));
+
+  memset (&m->fs, 0, sizeof (m->fs));
+
+  /* Initialize CFA state for before the prologue.  */
+  m->fs.cfa_reg = stack_pointer_rtx;
+  m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
+
+  /* Track SP offset to the CFA.  We continue tracking this after we've
+     swapped the CFA register away from SP.  In the case of re-alignment
+     this is fudged; we're interested to offsets within the local frame.  */
+  m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
+  m->fs.sp_valid = true;
+
+  ix86_compute_frame_layout (&frame);
+
+  if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
+    {
+      /* We should have already generated an error for any use of
+         ms_hook on a nested function.  */
+      gcc_checking_assert (!ix86_static_chain_on_stack);
+
+      /* Check if profiling is active and we shall use profiling before
+         prologue variant. If so sorry.  */
+      if (crtl->profile && flag_fentry != 0)
+        sorry ("ms_hook_prologue attribute isn%'t compatible "
+	       "with -mfentry for 32-bit");
+
+      /* In ix86_asm_output_function_label we emitted:
+	 8b ff     movl.s %edi,%edi
+	 55        push   %ebp
+	 8b ec     movl.s %esp,%ebp
+
+	 This matches the hookable function prologue in Win32 API
+	 functions in Microsoft Windows XP Service Pack 2 and newer.
+	 Wine uses this to enable Windows apps to hook the Win32 API
+	 functions provided by Wine.
+
+	 What that means is that we've already set up the frame pointer.  */
+
+      if (frame_pointer_needed
+	  && !(crtl->drap_reg && crtl->stack_realign_needed))
+	{
+	  rtx push, mov;
+
+	  /* We've decided to use the frame pointer already set up.
+	     Describe this to the unwinder by pretending that both
+	     push and mov insns happen right here.
+
+	     Putting the unwind info here at the end of the ms_hook
+	     is done so that we can make absolutely certain we get
+	     the required byte sequence at the start of the function,
+	     rather than relying on an assembler that can produce
+	     the exact encoding required.
+
+	     However it does mean (in the unpatched case) that we have
+	     a 1 insn window where the asynchronous unwind info is
+	     incorrect.  However, if we placed the unwind info at
+	     its correct location we would have incorrect unwind info
+	     in the patched case.  Which is probably all moot since
+	     I don't expect Wine generates dwarf2 unwind info for the
+	     system libraries that use this feature.  */
+
+	  insn = emit_insn (gen_blockage ());
+
+	  push = gen_push (hard_frame_pointer_rtx);
+	  mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
+			     stack_pointer_rtx);
+	  RTX_FRAME_RELATED_P (push) = 1;
+	  RTX_FRAME_RELATED_P (mov) = 1;
+
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+			gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
+
+	  /* Note that gen_push incremented m->fs.cfa_offset, even
+	     though we didn't emit the push insn here.  */
+	  m->fs.cfa_reg = hard_frame_pointer_rtx;
+	  m->fs.fp_offset = m->fs.cfa_offset;
+	  m->fs.fp_valid = true;
+	}
+      else
+	{
+	  /* The frame pointer is not needed so pop %ebp again.
+	     This leaves us with a pristine state.  */
+	  emit_insn (gen_pop (hard_frame_pointer_rtx));
+	}
+    }
+
+  /* The first insn of a function that accepts its static chain on the
+     stack is to push the register that would be filled in by a direct
+     call.  This insn will be skipped by the trampoline.  */
+  else if (ix86_static_chain_on_stack)
+    {
+      insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
+      emit_insn (gen_blockage ());
+
+      /* We don't want to interpret this push insn as a register save,
+	 only as a stack adjustment.  The real copy of the register as
+	 a save will be done later, if needed.  */
+      t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
+      t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
+      add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
+
+  /* Emit prologue code to adjust stack alignment and setup DRAP, in case
+     of DRAP is needed and stack realignment is really needed after reload */
+  if (stack_realign_drap)
+    {
+      int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
+
+      /* Only need to push parameter pointer reg if it is caller saved.  */
+      if (!call_used_regs[REGNO (crtl->drap_reg)])
+	{
+	  /* Push arg pointer reg */
+	  insn = emit_insn (gen_push (crtl->drap_reg));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+
+      /* Grab the argument pointer.  */
+      t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
+      insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
+      RTX_FRAME_RELATED_P (insn) = 1;
+      m->fs.cfa_reg = crtl->drap_reg;
+      m->fs.cfa_offset = 0;
+
+      /* Align the stack.  */
+      insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
+					stack_pointer_rtx,
+					GEN_INT (-align_bytes)));
+      RTX_FRAME_RELATED_P (insn) = 1;
+
+      /* Replicate the return address on the stack so that return
+	 address can be reached via (argp - 1) slot.  This is needed
+	 to implement macro RETURN_ADDR_RTX and intrinsic function
+	 expand_builtin_return_addr etc.  */
+      t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
+      t = gen_frame_mem (word_mode, t);
+      insn = emit_insn (gen_push (t));
+      RTX_FRAME_RELATED_P (insn) = 1;
+
+      /* For the purposes of frame and register save area addressing,
+	 we've started over with a new frame.  */
+      m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
+      m->fs.realigned = true;
+    }
+
+  int_registers_saved = (frame.nregs == 0);
+  sse_registers_saved = (frame.nsseregs == 0);
+
+  if (frame_pointer_needed && !m->fs.fp_valid)
+    {
+      /* Note: AT&T enter does NOT have reversed args.  Enter is probably
+         slower on all targets.  Also sdb doesn't like it.  */
+      insn = emit_insn (gen_push (hard_frame_pointer_rtx));
+      RTX_FRAME_RELATED_P (insn) = 1;
+
+      /* Push registers now, before setting the frame pointer
+	 on SEH target.  */
+      if (!int_registers_saved
+	  && TARGET_SEH
+	  && !frame.save_regs_using_mov)
+	{
+	  ix86_emit_save_regs ();
+	  int_registers_saved = true;
+	  gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
+	}
+
+      if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
+	{
+	  insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
+	  RTX_FRAME_RELATED_P (insn) = 1;
+
+	  if (m->fs.cfa_reg == stack_pointer_rtx)
+	    m->fs.cfa_reg = hard_frame_pointer_rtx;
+	  m->fs.fp_offset = m->fs.sp_offset;
+	  m->fs.fp_valid = true;
+	}
+    }
+
+  if (!int_registers_saved)
+    {
+      /* If saving registers via PUSH, do so now.  */
+      if (!frame.save_regs_using_mov)
+	{
+	  ix86_emit_save_regs ();
+	  int_registers_saved = true;
+	  gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
+	}
+
+      /* When using red zone we may start register saving before allocating
+	 the stack frame saving one cycle of the prologue.  However, avoid
+	 doing this if we have to probe the stack; at least on x86_64 the
+	 stack probe can turn into a call that clobbers a red zone location. */
+      else if (ix86_using_red_zone ()
+	       && (! TARGET_STACK_PROBE
+		   || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
+	{
+	  ix86_emit_save_regs_using_mov (frame.reg_save_offset);
+	  int_registers_saved = true;
+	}
+    }
+
+  if (stack_realign_fp)
+    {
+      int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
+      gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
+
+      /* The computation of the size of the re-aligned stack frame means
+	 that we must allocate the size of the register save area before
+	 performing the actual alignment.  Otherwise we cannot guarantee
+	 that there's enough storage above the realignment point.  */
+      if (m->fs.sp_offset != frame.sse_reg_save_offset)
+        pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+				   GEN_INT (m->fs.sp_offset
+					    - frame.sse_reg_save_offset),
+				   -1, false);
+
+      /* Align the stack.  */
+      insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
+					stack_pointer_rtx,
+					GEN_INT (-align_bytes)));
+
+      /* For the purposes of register save area addressing, the stack
+         pointer is no longer valid.  As for the value of sp_offset,
+	 see ix86_compute_frame_layout, which we need to match in order
+	 to pass verification of stack_pointer_offset at the end.  */
+      m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
+      m->fs.sp_valid = false;
+    }
+
+  allocate = frame.stack_pointer_offset - m->fs.sp_offset;
+
+  if (flag_stack_usage_info)
+    {
+      /* We start to count from ARG_POINTER.  */
+      HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
+
+      /* If it was realigned, take into account the fake frame.  */
+      if (stack_realign_drap)
+	{
+	  if (ix86_static_chain_on_stack)
+	    stack_size += UNITS_PER_WORD;
+
+	  if (!call_used_regs[REGNO (crtl->drap_reg)])
+	    stack_size += UNITS_PER_WORD;
+
+	  /* This over-estimates by 1 minimal-stack-alignment-unit but
+	     mitigates that by counting in the new return address slot.  */
+	  current_function_dynamic_stack_size
+	    += crtl->stack_alignment_needed / BITS_PER_UNIT;
+	}
+
+      current_function_static_stack_size = stack_size;
+    }
+
+  /* On SEH target with very large frame size, allocate an area to save
+     SSE registers (as the very large allocation won't be described).  */
+  if (TARGET_SEH
+      && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
+      && !sse_registers_saved)
+    {
+      HOST_WIDE_INT sse_size =
+	frame.sse_reg_save_offset - frame.reg_save_offset;
+
+      gcc_assert (int_registers_saved);
+
+      /* No need to do stack checking as the area will be immediately
+	 written.  */
+      pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+			         GEN_INT (-sse_size), -1,
+				 m->fs.cfa_reg == stack_pointer_rtx);
+      allocate -= sse_size;
+      ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
+      sse_registers_saved = true;
+    }
+
+  /* The stack has already been decremented by the instruction calling us
+     so probe if the size is non-negative to preserve the protection area.  */
+  if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
+    {
+      /* We expect the registers to be saved when probes are used.  */
+      gcc_assert (int_registers_saved);
+
+      if (STACK_CHECK_MOVING_SP)
+	{
+	  if (!(crtl->is_leaf && !cfun->calls_alloca
+		&& allocate <= PROBE_INTERVAL))
+	    {
+	      ix86_adjust_stack_and_probe (allocate);
+	      allocate = 0;
+	    }
+	}
+      else
+	{
+	  HOST_WIDE_INT size = allocate;
+
+	  if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
+	    size = 0x80000000 - STACK_CHECK_PROTECT - 1;
+
+	  if (TARGET_STACK_PROBE)
+	    {
+	      if (crtl->is_leaf && !cfun->calls_alloca)
+		{
+		  if (size > PROBE_INTERVAL)
+		    ix86_emit_probe_stack_range (0, size);
+		}
+	      else
+		ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
+	    }
+	  else
+	    {
+	      if (crtl->is_leaf && !cfun->calls_alloca)
+		{
+		  if (size > PROBE_INTERVAL && size > STACK_CHECK_PROTECT)
+		    ix86_emit_probe_stack_range (STACK_CHECK_PROTECT,
+						 size - STACK_CHECK_PROTECT);
+		}
+	      else
+		ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
+	    }
+	}
+    }
+
+  if (allocate == 0)
+    ;
+  else if (!ix86_target_stack_probe ()
+	   || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
+    {
+      pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+			         GEN_INT (-allocate), -1,
+			         m->fs.cfa_reg == stack_pointer_rtx);
+    }
+  else
+    {
+      rtx eax = gen_rtx_REG (Pmode, AX_REG);
+      rtx r10 = NULL;
+      rtx (*adjust_stack_insn)(rtx, rtx, rtx);
+      const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
+      bool eax_live = ix86_eax_live_at_start_p ();
+      bool r10_live = false;
+
+      if (TARGET_64BIT)
+        r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
+
+      if (eax_live)
+	{
+	  insn = emit_insn (gen_push (eax));
+	  allocate -= UNITS_PER_WORD;
+	  /* Note that SEH directives need to continue tracking the stack
+	     pointer even after the frame pointer has been set up.  */
+	  if (sp_is_cfa_reg || TARGET_SEH)
+	    {
+	      if (sp_is_cfa_reg)
+		m->fs.cfa_offset += UNITS_PER_WORD;
+	      RTX_FRAME_RELATED_P (insn) = 1;
+	    }
+	}
+
+      if (r10_live)
+	{
+	  r10 = gen_rtx_REG (Pmode, R10_REG);
+	  insn = emit_insn (gen_push (r10));
+	  allocate -= UNITS_PER_WORD;
+	  if (sp_is_cfa_reg || TARGET_SEH)
+	    {
+	      if (sp_is_cfa_reg)
+		m->fs.cfa_offset += UNITS_PER_WORD;
+	      RTX_FRAME_RELATED_P (insn) = 1;
+	    }
+	}
+
+      emit_move_insn (eax, GEN_INT (allocate));
+      emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
+
+      /* Use the fact that AX still contains ALLOCATE.  */
+      adjust_stack_insn = (Pmode == DImode
+			   ? gen_pro_epilogue_adjust_stack_di_sub
+			   : gen_pro_epilogue_adjust_stack_si_sub);
+
+      insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
+					   stack_pointer_rtx, eax));
+
+      if (sp_is_cfa_reg || TARGET_SEH)
+	{
+	  if (sp_is_cfa_reg)
+	    m->fs.cfa_offset += allocate;
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	  add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+			gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+				     plus_constant (Pmode, stack_pointer_rtx,
+						    -allocate)));
+	}
+      m->fs.sp_offset += allocate;
+
+      /* Use stack_pointer_rtx for relative addressing so that code
+	 works for realigned stack, too.  */
+      if (r10_live && eax_live)
+        {
+	  t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
+	  emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
+			  gen_frame_mem (word_mode, t));
+	  t = plus_constant (Pmode, t, UNITS_PER_WORD);
+	  emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
+			  gen_frame_mem (word_mode, t));
+	}
+      else if (eax_live || r10_live)
+	{
+	  t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
+	  emit_move_insn (gen_rtx_REG (word_mode,
+				       (eax_live ? AX_REG : R10_REG)),
+			  gen_frame_mem (word_mode, t));
+	}
+    }
+  gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
+
+  /* If we havn't already set up the frame pointer, do so now.  */
+  if (frame_pointer_needed && !m->fs.fp_valid)
+    {
+      insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
+			    GEN_INT (frame.stack_pointer_offset
+				     - frame.hard_frame_pointer_offset));
+      insn = emit_insn (insn);
+      RTX_FRAME_RELATED_P (insn) = 1;
+      add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
+
+      if (m->fs.cfa_reg == stack_pointer_rtx)
+	m->fs.cfa_reg = hard_frame_pointer_rtx;
+      m->fs.fp_offset = frame.hard_frame_pointer_offset;
+      m->fs.fp_valid = true;
+    }
+
+  if (!int_registers_saved)
+    ix86_emit_save_regs_using_mov (frame.reg_save_offset);
+  if (!sse_registers_saved)
+    ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
+
+  pic_reg_used = false;
+  /* We don't use pic-register for pe-coff target.  */
+  if (pic_offset_table_rtx
+      && !TARGET_PECOFF
+      && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
+	  || crtl->profile))
+    {
+      unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
+
+      if (alt_pic_reg_used != INVALID_REGNUM)
+	SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
+
+      pic_reg_used = true;
+    }
+
+  if (pic_reg_used)
+    {
+      if (TARGET_64BIT)
+	{
+	  if (ix86_cmodel == CM_LARGE_PIC)
+	    {
+	      rtx label, tmp_reg;
+
+	      gcc_assert (Pmode == DImode);
+	      label = gen_label_rtx ();
+	      emit_label (label);
+	      LABEL_PRESERVE_P (label) = 1;
+	      tmp_reg = gen_rtx_REG (Pmode, R11_REG);
+	      gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
+	      insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
+						   label));
+	      insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
+	      insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
+					       pic_offset_table_rtx, tmp_reg));
+	    }
+	  else
+            insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
+	}
+      else
+	{
+          insn = emit_insn (gen_set_got (pic_offset_table_rtx));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	  add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
+	}
+    }
+
+  /* In the pic_reg_used case, make sure that the got load isn't deleted
+     when mcount needs it.  Blockage to avoid call movement across mcount
+     call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
+     note.  */
+  if (crtl->profile && !flag_fentry && pic_reg_used)
+    emit_insn (gen_prologue_use (pic_offset_table_rtx));
+
+  if (crtl->drap_reg && !crtl->stack_realign_needed)
+    {
+      /* vDRAP is setup but after reload it turns out stack realign
+         isn't necessary, here we will emit prologue to setup DRAP
+         without stack realign adjustment */
+      t = choose_baseaddr (0);
+      emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
+    }
+
+  /* Prevent instructions from being scheduled into register save push
+     sequence when access to the redzone area is done through frame pointer.
+     The offset between the frame pointer and the stack pointer is calculated
+     relative to the value of the stack pointer at the end of the function
+     prologue, and moving instructions that access redzone area via frame
+     pointer inside push sequence violates this assumption.  */
+  if (frame_pointer_needed && frame.red_zone_size)
+    emit_insn (gen_memory_blockage ());
+
+  /* Emit cld instruction if stringops are used in the function.  */
+  if (TARGET_CLD && ix86_current_function_needs_cld)
+    emit_insn (gen_cld ());
+
+  /* SEH requires that the prologue end within 256 bytes of the start of
+     the function.  Prevent instruction schedules that would extend that.
+     Further, prevent alloca modifications to the stack pointer from being
+     combined with prologue modifications.  */
+  if (TARGET_SEH)
+    emit_insn (gen_prologue_use (stack_pointer_rtx));
+}
+
+/* Emit code to restore REG using a POP insn.  */
+
+static void
+ix86_emit_restore_reg_using_pop (rtx reg)
+{
+  struct machine_function *m = cfun->machine;
+  rtx insn = emit_insn (gen_pop (reg));
+
+  ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
+  m->fs.sp_offset -= UNITS_PER_WORD;
+
+  if (m->fs.cfa_reg == crtl->drap_reg
+      && REGNO (reg) == REGNO (crtl->drap_reg))
+    {
+      /* Previously we'd represented the CFA as an expression
+	 like *(%ebp - 8).  We've just popped that value from
+	 the stack, which means we need to reset the CFA to
+	 the drap register.  This will remain until we restore
+	 the stack pointer.  */
+      add_reg_note (insn, REG_CFA_DEF_CFA, reg);
+      RTX_FRAME_RELATED_P (insn) = 1;
+
+      /* This means that the DRAP register is valid for addressing too.  */
+      m->fs.drap_valid = true;
+      return;
+    }
+
+  if (m->fs.cfa_reg == stack_pointer_rtx)
+    {
+      rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
+      x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
+      add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
+      RTX_FRAME_RELATED_P (insn) = 1;
+
+      m->fs.cfa_offset -= UNITS_PER_WORD;
+    }
+
+  /* When the frame pointer is the CFA, and we pop it, we are
+     swapping back to the stack pointer as the CFA.  This happens
+     for stack frames that don't allocate other data, so we assume
+     the stack pointer is now pointing at the return address, i.e.
+     the function entry state, which makes the offset be 1 word.  */
+  if (reg == hard_frame_pointer_rtx)
+    {
+      m->fs.fp_valid = false;
+      if (m->fs.cfa_reg == hard_frame_pointer_rtx)
+	{
+	  m->fs.cfa_reg = stack_pointer_rtx;
+	  m->fs.cfa_offset -= UNITS_PER_WORD;
+
+	  add_reg_note (insn, REG_CFA_DEF_CFA,
+			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+				      GEN_INT (m->fs.cfa_offset)));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	}
+    }
+}
+
+/* Emit code to restore saved registers using POP insns.  */
+
+static void
+ix86_emit_restore_regs_using_pop (void)
+{
+  unsigned int regno;
+
+  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
+      ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
+}
+
+/* Emit code and notes for the LEAVE instruction.  */
+
+static void
+ix86_emit_leave (void)
+{
+  struct machine_function *m = cfun->machine;
+  rtx insn = emit_insn (ix86_gen_leave ());
+
+  ix86_add_queued_cfa_restore_notes (insn);
+
+  gcc_assert (m->fs.fp_valid);
+  m->fs.sp_valid = true;
+  m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
+  m->fs.fp_valid = false;
+
+  if (m->fs.cfa_reg == hard_frame_pointer_rtx)
+    {
+      m->fs.cfa_reg = stack_pointer_rtx;
+      m->fs.cfa_offset = m->fs.sp_offset;
+
+      add_reg_note (insn, REG_CFA_DEF_CFA,
+		    plus_constant (Pmode, stack_pointer_rtx,
+				   m->fs.sp_offset));
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
+  ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
+			     m->fs.fp_offset);
+}
+
+/* Emit code to restore saved registers using MOV insns.
+   First register is restored from CFA - CFA_OFFSET.  */
+static void
+ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
+				  bool maybe_eh_return)
+{
+  struct machine_function *m = cfun->machine;
+  unsigned int regno;
+
+  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
+      {
+	rtx reg = gen_rtx_REG (word_mode, regno);
+	rtx insn, mem;
+
+	mem = choose_baseaddr (cfa_offset);
+	mem = gen_frame_mem (word_mode, mem);
+	insn = emit_move_insn (reg, mem);
+
+        if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
+	  {
+	    /* Previously we'd represented the CFA as an expression
+	       like *(%ebp - 8).  We've just popped that value from
+	       the stack, which means we need to reset the CFA to
+	       the drap register.  This will remain until we restore
+	       the stack pointer.  */
+	    add_reg_note (insn, REG_CFA_DEF_CFA, reg);
+	    RTX_FRAME_RELATED_P (insn) = 1;
+
+	    /* This means that the DRAP register is valid for addressing.  */
+	    m->fs.drap_valid = true;
+	  }
+	else
+	  ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
+
+	cfa_offset -= UNITS_PER_WORD;
+      }
+}
+
+/* Emit code to restore saved registers using MOV insns.
+   First register is restored from CFA - CFA_OFFSET.  */
+static void
+ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
+				      bool maybe_eh_return)
+{
+  unsigned int regno;
+
+  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
+      {
+	rtx reg = gen_rtx_REG (V4SFmode, regno);
+	rtx mem;
+
+	mem = choose_baseaddr (cfa_offset);
+	mem = gen_rtx_MEM (V4SFmode, mem);
+	set_mem_align (mem, 128);
+	emit_move_insn (reg, mem);
+
+	ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
+
+	cfa_offset -= 16;
+      }
+}
+
+/* Restore function stack, frame, and registers.  */
+
+void
+ix86_expand_epilogue (int style)
+{
+  struct machine_function *m = cfun->machine;
+  struct machine_frame_state frame_state_save = m->fs;
+  struct ix86_frame frame;
+  bool restore_regs_via_mov;
+  bool using_drap;
+
+  ix86_finalize_stack_realign_flags ();
+  ix86_compute_frame_layout (&frame);
+
+  m->fs.sp_valid = (!frame_pointer_needed
+		    || (crtl->sp_is_unchanging
+			&& !stack_realign_fp));
+  gcc_assert (!m->fs.sp_valid
+	      || m->fs.sp_offset == frame.stack_pointer_offset);
+
+  /* The FP must be valid if the frame pointer is present.  */
+  gcc_assert (frame_pointer_needed == m->fs.fp_valid);
+  gcc_assert (!m->fs.fp_valid
+	      || m->fs.fp_offset == frame.hard_frame_pointer_offset);
+
+  /* We must have *some* valid pointer to the stack frame.  */
+  gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
+
+  /* The DRAP is never valid at this point.  */
+  gcc_assert (!m->fs.drap_valid);
+
+  /* See the comment about red zone and frame
+     pointer usage in ix86_expand_prologue.  */
+  if (frame_pointer_needed && frame.red_zone_size)
+    emit_insn (gen_memory_blockage ());
+
+  using_drap = crtl->drap_reg && crtl->stack_realign_needed;
+  gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
+
+  /* Determine the CFA offset of the end of the red-zone.  */
+  m->fs.red_zone_offset = 0;
+  if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
+    {
+      /* The red-zone begins below the return address.  */
+      m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
+
+      /* When the register save area is in the aligned portion of
+         the stack, determine the maximum runtime displacement that
+	 matches up with the aligned frame.  */
+      if (stack_realign_drap)
+	m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
+				  + UNITS_PER_WORD);
+    }
+
+  /* Special care must be taken for the normal return case of a function
+     using eh_return: the eax and edx registers are marked as saved, but
+     not restored along this path.  Adjust the save location to match.  */
+  if (crtl->calls_eh_return && style != 2)
+    frame.reg_save_offset -= 2 * UNITS_PER_WORD;
+
+  /* EH_RETURN requires the use of moves to function properly.  */
+  if (crtl->calls_eh_return)
+    restore_regs_via_mov = true;
+  /* SEH requires the use of pops to identify the epilogue.  */
+  else if (TARGET_SEH)
+    restore_regs_via_mov = false;
+  /* If we're only restoring one register and sp is not valid then
+     using a move instruction to restore the register since it's
+     less work than reloading sp and popping the register.  */
+  else if (!m->fs.sp_valid && frame.nregs <= 1)
+    restore_regs_via_mov = true;
+  else if (TARGET_EPILOGUE_USING_MOVE
+	   && cfun->machine->use_fast_prologue_epilogue
+	   && (frame.nregs > 1
+	       || m->fs.sp_offset != frame.reg_save_offset))
+    restore_regs_via_mov = true;
+  else if (frame_pointer_needed
+	   && !frame.nregs
+	   && m->fs.sp_offset != frame.reg_save_offset)
+    restore_regs_via_mov = true;
+  else if (frame_pointer_needed
+	   && TARGET_USE_LEAVE
+	   && cfun->machine->use_fast_prologue_epilogue
+	   && frame.nregs == 1)
+    restore_regs_via_mov = true;
+  else
+    restore_regs_via_mov = false;
+
+  if (restore_regs_via_mov || frame.nsseregs)
+    {
+      /* Ensure that the entire register save area is addressable via
+	 the stack pointer, if we will restore via sp.  */
+      if (TARGET_64BIT
+	  && m->fs.sp_offset > 0x7fffffff
+	  && !(m->fs.fp_valid || m->fs.drap_valid)
+	  && (frame.nsseregs + frame.nregs) != 0)
+	{
+	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+				     GEN_INT (m->fs.sp_offset
+					      - frame.sse_reg_save_offset),
+				     style,
+				     m->fs.cfa_reg == stack_pointer_rtx);
+	}
+    }
+
+  /* If there are any SSE registers to restore, then we have to do it
+     via moves, since there's obviously no pop for SSE regs.  */
+  if (frame.nsseregs)
+    ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
+					  style == 2);
+
+  if (restore_regs_via_mov)
+    {
+      rtx t;
+
+      if (frame.nregs)
+	ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
+
+      /* eh_return epilogues need %ecx added to the stack pointer.  */
+      if (style == 2)
+	{
+	  rtx insn, sa = EH_RETURN_STACKADJ_RTX;
+
+	  /* Stack align doesn't work with eh_return.  */
+	  gcc_assert (!stack_realign_drap);
+	  /* Neither does regparm nested functions.  */
+	  gcc_assert (!ix86_static_chain_on_stack);
+
+	  if (frame_pointer_needed)
+	    {
+	      t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
+	      t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
+	      emit_insn (gen_rtx_SET (VOIDmode, sa, t));
+
+	      t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
+	      insn = emit_move_insn (hard_frame_pointer_rtx, t);
+
+	      /* Note that we use SA as a temporary CFA, as the return
+		 address is at the proper place relative to it.  We
+		 pretend this happens at the FP restore insn because
+		 prior to this insn the FP would be stored at the wrong
+		 offset relative to SA, and after this insn we have no
+		 other reasonable register to use for the CFA.  We don't
+		 bother resetting the CFA to the SP for the duration of
+		 the return insn.  */
+	      add_reg_note (insn, REG_CFA_DEF_CFA,
+			    plus_constant (Pmode, sa, UNITS_PER_WORD));
+	      ix86_add_queued_cfa_restore_notes (insn);
+	      add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
+	      RTX_FRAME_RELATED_P (insn) = 1;
+
+	      m->fs.cfa_reg = sa;
+	      m->fs.cfa_offset = UNITS_PER_WORD;
+	      m->fs.fp_valid = false;
+
+	      pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
+					 const0_rtx, style, false);
+	    }
+	  else
+	    {
+	      t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
+	      t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
+	      insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
+	      ix86_add_queued_cfa_restore_notes (insn);
+
+	      gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
+	      if (m->fs.cfa_offset != UNITS_PER_WORD)
+		{
+		  m->fs.cfa_offset = UNITS_PER_WORD;
+		  add_reg_note (insn, REG_CFA_DEF_CFA,
+				plus_constant (Pmode, stack_pointer_rtx,
+					       UNITS_PER_WORD));
+		  RTX_FRAME_RELATED_P (insn) = 1;
+		}
+	    }
+	  m->fs.sp_offset = UNITS_PER_WORD;
+	  m->fs.sp_valid = true;
+	}
+    }
+  else
+    {
+      /* SEH requires that the function end with (1) a stack adjustment
+	 if necessary, (2) a sequence of pops, and (3) a return or
+	 jump instruction.  Prevent insns from the function body from
+	 being scheduled into this sequence.  */
+      if (TARGET_SEH)
+	{
+	  /* Prevent a catch region from being adjacent to the standard
+	     epilogue sequence.  Unfortuantely crtl->uses_eh_lsda nor
+	     several other flags that would be interesting to test are
+	     not yet set up.  */
+	  if (flag_non_call_exceptions)
+	    emit_insn (gen_nops (const1_rtx));
+	  else
+	    emit_insn (gen_blockage ());
+	}
+
+      /* First step is to deallocate the stack frame so that we can
+	 pop the registers.  Also do it on SEH target for very large
+	 frame as the emitted instructions aren't allowed by the ABI in
+	 epilogues.  */
+      if (!m->fs.sp_valid
+ 	  || (TARGET_SEH
+	      && (m->fs.sp_offset - frame.reg_save_offset
+		  >= SEH_MAX_FRAME_SIZE)))
+	{
+	  pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
+				     GEN_INT (m->fs.fp_offset
+					      - frame.reg_save_offset),
+				     style, false);
+	}
+      else if (m->fs.sp_offset != frame.reg_save_offset)
+	{
+	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+				     GEN_INT (m->fs.sp_offset
+					      - frame.reg_save_offset),
+				     style,
+				     m->fs.cfa_reg == stack_pointer_rtx);
+	}
+
+      ix86_emit_restore_regs_using_pop ();
+    }
+
+  /* If we used a stack pointer and haven't already got rid of it,
+     then do so now.  */
+  if (m->fs.fp_valid)
+    {
+      /* If the stack pointer is valid and pointing at the frame
+	 pointer store address, then we only need a pop.  */
+      if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
+	ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
+      /* Leave results in shorter dependency chains on CPUs that are
+	 able to grok it fast.  */
+      else if (TARGET_USE_LEAVE
+	       || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
+	       || !cfun->machine->use_fast_prologue_epilogue)
+	ix86_emit_leave ();
+      else
+        {
+	  pro_epilogue_adjust_stack (stack_pointer_rtx,
+				     hard_frame_pointer_rtx,
+				     const0_rtx, style, !using_drap);
+	  ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
+        }
+    }
+
+  if (using_drap)
+    {
+      int param_ptr_offset = UNITS_PER_WORD;
+      rtx insn;
+
+      gcc_assert (stack_realign_drap);
+
+      if (ix86_static_chain_on_stack)
+	param_ptr_offset += UNITS_PER_WORD;
+      if (!call_used_regs[REGNO (crtl->drap_reg)])
+	param_ptr_offset += UNITS_PER_WORD;
+
+      insn = emit_insn (gen_rtx_SET
+			(VOIDmode, stack_pointer_rtx,
+			 gen_rtx_PLUS (Pmode,
+				       crtl->drap_reg,
+				       GEN_INT (-param_ptr_offset))));
+      m->fs.cfa_reg = stack_pointer_rtx;
+      m->fs.cfa_offset = param_ptr_offset;
+      m->fs.sp_offset = param_ptr_offset;
+      m->fs.realigned = false;
+
+      add_reg_note (insn, REG_CFA_DEF_CFA,
+		    gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+				  GEN_INT (param_ptr_offset)));
+      RTX_FRAME_RELATED_P (insn) = 1;
+
+      if (!call_used_regs[REGNO (crtl->drap_reg)])
+	ix86_emit_restore_reg_using_pop (crtl->drap_reg);
+    }
+
+  /* At this point the stack pointer must be valid, and we must have
+     restored all of the registers.  We may not have deallocated the
+     entire stack frame.  We've delayed this until now because it may
+     be possible to merge the local stack deallocation with the
+     deallocation forced by ix86_static_chain_on_stack.   */
+  gcc_assert (m->fs.sp_valid);
+  gcc_assert (!m->fs.fp_valid);
+  gcc_assert (!m->fs.realigned);
+  if (m->fs.sp_offset != UNITS_PER_WORD)
+    {
+      pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+				 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
+				 style, true);
+    }
+  else
+    ix86_add_queued_cfa_restore_notes (get_last_insn ());
+
+  /* Sibcall epilogues don't want a return instruction.  */
+  if (style == 0)
+    {
+      m->fs = frame_state_save;
+      return;
+    }
+
+  if (crtl->args.pops_args && crtl->args.size)
+    {
+      rtx popc = GEN_INT (crtl->args.pops_args);
+
+      /* i386 can only pop 64K bytes.  If asked to pop more, pop return
+	 address, do explicit add, and jump indirectly to the caller.  */
+
+      if (crtl->args.pops_args >= 65536)
+	{
+	  rtx ecx = gen_rtx_REG (SImode, CX_REG);
+	  rtx insn;
+
+	  /* There is no "pascal" calling convention in any 64bit ABI.  */
+	  gcc_assert (!TARGET_64BIT);
+
+	  insn = emit_insn (gen_pop (ecx));
+	  m->fs.cfa_offset -= UNITS_PER_WORD;
+	  m->fs.sp_offset -= UNITS_PER_WORD;
+
+	  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
+	  x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
+	  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
+	  add_reg_note (insn, REG_CFA_REGISTER,
+			gen_rtx_SET (VOIDmode, ecx, pc_rtx));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+
+	  pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+				     popc, -1, true);
+	  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
+	}
+      else
+	emit_jump_insn (gen_simple_return_pop_internal (popc));
+    }
+  else
+    emit_jump_insn (gen_simple_return_internal ());
+
+  /* Restore the state back to the state from the prologue,
+     so that it's correct for the next epilogue.  */
+  m->fs = frame_state_save;
+}
+
+/* Reset from the function's potential modifications.  */
+
+static void
+ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
+			       HOST_WIDE_INT size ATTRIBUTE_UNUSED)
+{
+  if (pic_offset_table_rtx)
+    SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
+#if TARGET_MACHO
+  /* Mach-O doesn't support labels at the end of objects, so if
+     it looks like we might want one, insert a NOP.  */
+  {
+    rtx insn = get_last_insn ();
+    rtx deleted_debug_label = NULL_RTX;
+    while (insn
+	   && NOTE_P (insn)
+	   && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
+      {
+	/* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
+	   notes only, instead set their CODE_LABEL_NUMBER to -1,
+	   otherwise there would be code generation differences
+	   in between -g and -g0.  */
+	if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
+	  deleted_debug_label = insn;
+	insn = PREV_INSN (insn);
+      }
+    if (insn
+	&& (LABEL_P (insn)
+	    || (NOTE_P (insn)
+		&& NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
+      fputs ("\tnop\n", file);
+    else if (deleted_debug_label)
+      for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
+	if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
+	  CODE_LABEL_NUMBER (insn) = -1;
+  }
+#endif
+
+}
+
+/* Return a scratch register to use in the split stack prologue.  The
+   split stack prologue is used for -fsplit-stack.  It is the first
+   instructions in the function, even before the regular prologue.
+   The scratch register can be any caller-saved register which is not
+   used for parameters or for the static chain.  */
+
+static unsigned int
+split_stack_prologue_scratch_regno (void)
+{
+  if (TARGET_64BIT)
+    return R11_REG;
+  else
+    {
+      bool is_fastcall, is_thiscall;
+      int regparm;
+
+      is_fastcall = (lookup_attribute ("fastcall",
+				       TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
+		     != NULL);
+      is_thiscall = (lookup_attribute ("thiscall",
+				       TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
+		     != NULL);
+      regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
+
+      if (is_fastcall)
+	{
+	  if (DECL_STATIC_CHAIN (cfun->decl))
+	    {
+	      sorry ("-fsplit-stack does not support fastcall with "
+		     "nested function");
+	      return INVALID_REGNUM;
+	    }
+	  return AX_REG;
+	}
+      else if (is_thiscall)
+        {
+	  if (!DECL_STATIC_CHAIN (cfun->decl))
+	    return DX_REG;
+	  return AX_REG;
+	}
+      else if (regparm < 3)
+	{
+	  if (!DECL_STATIC_CHAIN (cfun->decl))
+	    return CX_REG;
+	  else
+	    {
+	      if (regparm >= 2)
+		{
+		  sorry ("-fsplit-stack does not support 2 register "
+			 " parameters for a nested function");
+		  return INVALID_REGNUM;
+		}
+	      return DX_REG;
+	    }
+	}
+      else
+	{
+	  /* FIXME: We could make this work by pushing a register
+	     around the addition and comparison.  */
+	  sorry ("-fsplit-stack does not support 3 register parameters");
+	  return INVALID_REGNUM;
+	}
+    }
+}
+
+/* A SYMBOL_REF for the function which allocates new stackspace for
+   -fsplit-stack.  */
+
+static GTY(()) rtx split_stack_fn;
+
+/* A SYMBOL_REF for the more stack function when using the large
+   model.  */
+
+static GTY(()) rtx split_stack_fn_large;
+
+/* Handle -fsplit-stack.  These are the first instructions in the
+   function, even before the regular prologue.  */
+
+void
+ix86_expand_split_stack_prologue (void)
+{
+  struct ix86_frame frame;
+  HOST_WIDE_INT allocate;
+  unsigned HOST_WIDE_INT args_size;
+  rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
+  rtx scratch_reg = NULL_RTX;
+  rtx varargs_label = NULL_RTX;
+  rtx fn;
+
+  gcc_assert (flag_split_stack && reload_completed);
+
+  ix86_finalize_stack_realign_flags ();
+  ix86_compute_frame_layout (&frame);
+  allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
+
+  /* This is the label we will branch to if we have enough stack
+     space.  We expect the basic block reordering pass to reverse this
+     branch if optimizing, so that we branch in the unlikely case.  */
+  label = gen_label_rtx ();
+
+  /* We need to compare the stack pointer minus the frame size with
+     the stack boundary in the TCB.  The stack boundary always gives
+     us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
+     can compare directly.  Otherwise we need to do an addition.  */
+
+  limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
+			  UNSPEC_STACK_CHECK);
+  limit = gen_rtx_CONST (Pmode, limit);
+  limit = gen_rtx_MEM (Pmode, limit);
+  if (allocate < SPLIT_STACK_AVAILABLE)
+    current = stack_pointer_rtx;
+  else
+    {
+      unsigned int scratch_regno;
+      rtx offset;
+
+      /* We need a scratch register to hold the stack pointer minus
+	 the required frame size.  Since this is the very start of the
+	 function, the scratch register can be any caller-saved
+	 register which is not used for parameters.  */
+      offset = GEN_INT (- allocate);
+      scratch_regno = split_stack_prologue_scratch_regno ();
+      if (scratch_regno == INVALID_REGNUM)
+	return;
+      scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
+      if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
+	{
+	  /* We don't use ix86_gen_add3 in this case because it will
+	     want to split to lea, but when not optimizing the insn
+	     will not be split after this point.  */
+	  emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
+				  gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+						offset)));
+	}
+      else
+	{
+	  emit_move_insn (scratch_reg, offset);
+	  emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
+				    stack_pointer_rtx));
+	}
+      current = scratch_reg;
+    }
+
+  ix86_expand_branch (GEU, current, limit, label);
+  jump_insn = get_last_insn ();
+  JUMP_LABEL (jump_insn) = label;
+
+  /* Mark the jump as very likely to be taken.  */
+  add_int_reg_note (jump_insn, REG_BR_PROB,
+		    REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
+
+  if (split_stack_fn == NULL_RTX)
+    split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
+  fn = split_stack_fn;
+
+  /* Get more stack space.  We pass in the desired stack space and the
+     size of the arguments to copy to the new stack.  In 32-bit mode
+     we push the parameters; __morestack will return on a new stack
+     anyhow.  In 64-bit mode we pass the parameters in r10 and
+     r11.  */
+  allocate_rtx = GEN_INT (allocate);
+  args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
+  call_fusage = NULL_RTX;
+  if (TARGET_64BIT)
+    {
+      rtx reg10, reg11;
+
+      reg10 = gen_rtx_REG (Pmode, R10_REG);
+      reg11 = gen_rtx_REG (Pmode, R11_REG);
+
+      /* If this function uses a static chain, it will be in %r10.
+	 Preserve it across the call to __morestack.  */
+      if (DECL_STATIC_CHAIN (cfun->decl))
+	{
+	  rtx rax;
+
+	  rax = gen_rtx_REG (word_mode, AX_REG);
+	  emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
+	  use_reg (&call_fusage, rax);
+	}
+
+      if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
+          && !TARGET_PECOFF)
+	{
+	  HOST_WIDE_INT argval;
+
+	  gcc_assert (Pmode == DImode);
+	  /* When using the large model we need to load the address
+	     into a register, and we've run out of registers.  So we
+	     switch to a different calling convention, and we call a
+	     different function: __morestack_large.  We pass the
+	     argument size in the upper 32 bits of r10 and pass the
+	     frame size in the lower 32 bits.  */
+	  gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
+	  gcc_assert ((args_size & 0xffffffff) == args_size);
+
+	  if (split_stack_fn_large == NULL_RTX)
+	    split_stack_fn_large =
+	      gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
+
+	  if (ix86_cmodel == CM_LARGE_PIC)
+	    {
+	      rtx label, x;
+
+	      label = gen_label_rtx ();
+	      emit_label (label);
+	      LABEL_PRESERVE_P (label) = 1;
+	      emit_insn (gen_set_rip_rex64 (reg10, label));
+	      emit_insn (gen_set_got_offset_rex64 (reg11, label));
+	      emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
+	      x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
+				  UNSPEC_GOT);
+	      x = gen_rtx_CONST (Pmode, x);
+	      emit_move_insn (reg11, x);
+	      x = gen_rtx_PLUS (Pmode, reg10, reg11);
+	      x = gen_const_mem (Pmode, x);
+	      emit_move_insn (reg11, x);
+	    }
+	  else
+	    emit_move_insn (reg11, split_stack_fn_large);
+
+	  fn = reg11;
+
+	  argval = ((args_size << 16) << 16) + allocate;
+	  emit_move_insn (reg10, GEN_INT (argval));
+	}
+      else
+	{
+	  emit_move_insn (reg10, allocate_rtx);
+	  emit_move_insn (reg11, GEN_INT (args_size));
+	  use_reg (&call_fusage, reg11);
+	}
+
+      use_reg (&call_fusage, reg10);
+    }
+  else
+    {
+      emit_insn (gen_push (GEN_INT (args_size)));
+      emit_insn (gen_push (allocate_rtx));
+    }
+  call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
+				GEN_INT (UNITS_PER_WORD), constm1_rtx,
+				NULL_RTX, false);
+  add_function_usage_to (call_insn, call_fusage);
+
+  /* In order to make call/return prediction work right, we now need
+     to execute a return instruction.  See
+     libgcc/config/i386/morestack.S for the details on how this works.
+
+     For flow purposes gcc must not see this as a return
+     instruction--we need control flow to continue at the subsequent
+     label.  Therefore, we use an unspec.  */
+  gcc_assert (crtl->args.pops_args < 65536);
+  emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
+
+  /* If we are in 64-bit mode and this function uses a static chain,
+     we saved %r10 in %rax before calling _morestack.  */
+  if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
+    emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
+		    gen_rtx_REG (word_mode, AX_REG));
+
+  /* If this function calls va_start, we need to store a pointer to
+     the arguments on the old stack, because they may not have been
+     all copied to the new stack.  At this point the old stack can be
+     found at the frame pointer value used by __morestack, because
+     __morestack has set that up before calling back to us.  Here we
+     store that pointer in a scratch register, and in
+     ix86_expand_prologue we store the scratch register in a stack
+     slot.  */
+  if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+    {
+      unsigned int scratch_regno;
+      rtx frame_reg;
+      int words;
+
+      scratch_regno = split_stack_prologue_scratch_regno ();
+      scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
+      frame_reg = gen_rtx_REG (Pmode, BP_REG);
+
+      /* 64-bit:
+	 fp -> old fp value
+	       return address within this function
+	       return address of caller of this function
+	       stack arguments
+	 So we add three words to get to the stack arguments.
+
+	 32-bit:
+	 fp -> old fp value
+	       return address within this function
+               first argument to __morestack
+               second argument to __morestack
+               return address of caller of this function
+               stack arguments
+         So we add five words to get to the stack arguments.
+      */
+      words = TARGET_64BIT ? 3 : 5;
+      emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
+			      gen_rtx_PLUS (Pmode, frame_reg,
+					    GEN_INT (words * UNITS_PER_WORD))));
+
+      varargs_label = gen_label_rtx ();
+      emit_jump_insn (gen_jump (varargs_label));
+      JUMP_LABEL (get_last_insn ()) = varargs_label;
+
+      emit_barrier ();
+    }
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  /* If this function calls va_start, we now have to set the scratch
+     register for the case where we do not call __morestack.  In this
+     case we need to set it based on the stack pointer.  */
+  if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+    {
+      emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
+			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+					    GEN_INT (UNITS_PER_WORD))));
+
+      emit_label (varargs_label);
+      LABEL_NUSES (varargs_label) = 1;
+    }
+}
+
+/* We may have to tell the dataflow pass that the split stack prologue
+   is initializing a scratch register.  */
+
+static void
+ix86_live_on_entry (bitmap regs)
+{
+  if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+    {
+      gcc_assert (flag_split_stack);
+      bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
+    }
+}
+
+/* Extract the parts of an RTL expression that is a valid memory address
+   for an instruction.  Return 0 if the structure of the address is
+   grossly off.  Return -1 if the address contains ASHIFT, so it is not
+   strictly valid, but still used for computing length of lea instruction.  */
+
+int
+ix86_decompose_address (rtx addr, struct ix86_address *out)
+{
+  rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
+  rtx base_reg, index_reg;
+  HOST_WIDE_INT scale = 1;
+  rtx scale_rtx = NULL_RTX;
+  rtx tmp;
+  int retval = 1;
+  enum ix86_address_seg seg = SEG_DEFAULT;
+
+  /* Allow zero-extended SImode addresses,
+     they will be emitted with addr32 prefix.  */
+  if (TARGET_64BIT && GET_MODE (addr) == DImode)
+    {
+      if (GET_CODE (addr) == ZERO_EXTEND
+	  && GET_MODE (XEXP (addr, 0)) == SImode)
+	{
+	  addr = XEXP (addr, 0);
+	  if (CONST_INT_P (addr))
+	    return 0;
+	}	      
+      else if (GET_CODE (addr) == AND
+	       && const_32bit_mask (XEXP (addr, 1), DImode))
+	{
+	  addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
+	  if (addr == NULL_RTX)
+	    return 0;
+
+	  if (CONST_INT_P (addr))
+	    return 0;
+	}
+    }
+
+  /* Allow SImode subregs of DImode addresses,
+     they will be emitted with addr32 prefix.  */
+  if (TARGET_64BIT && GET_MODE (addr) == SImode)
+    {
+      if (GET_CODE (addr) == SUBREG
+	  && GET_MODE (SUBREG_REG (addr)) == DImode)
+	{
+	  addr = SUBREG_REG (addr);
+	  if (CONST_INT_P (addr))
+	    return 0;
+	}
+    }
+
+  if (REG_P (addr))
+    base = addr;
+  else if (GET_CODE (addr) == SUBREG)
+    {
+      if (REG_P (SUBREG_REG (addr)))
+	base = addr;
+      else
+	return 0;
+    }
+  else if (GET_CODE (addr) == PLUS)
+    {
+      rtx addends[4], op;
+      int n = 0, i;
+
+      op = addr;
+      do
+	{
+	  if (n >= 4)
+	    return 0;
+	  addends[n++] = XEXP (op, 1);
+	  op = XEXP (op, 0);
+	}
+      while (GET_CODE (op) == PLUS);
+      if (n >= 4)
+	return 0;
+      addends[n] = op;
+
+      for (i = n; i >= 0; --i)
+	{
+	  op = addends[i];
+	  switch (GET_CODE (op))
+	    {
+	    case MULT:
+	      if (index)
+		return 0;
+	      index = XEXP (op, 0);
+	      scale_rtx = XEXP (op, 1);
+	      break;
+
+	    case ASHIFT:
+	      if (index)
+		return 0;
+	      index = XEXP (op, 0);
+	      tmp = XEXP (op, 1);
+	      if (!CONST_INT_P (tmp))
+		return 0;
+	      scale = INTVAL (tmp);
+	      if ((unsigned HOST_WIDE_INT) scale > 3)
+		return 0;
+	      scale = 1 << scale;
+	      break;
+
+	    case ZERO_EXTEND:
+	      op = XEXP (op, 0);
+	      if (GET_CODE (op) != UNSPEC)
+		return 0;
+	      /* FALLTHRU */
+
+	    case UNSPEC:
+	      if (XINT (op, 1) == UNSPEC_TP
+	          && TARGET_TLS_DIRECT_SEG_REFS
+	          && seg == SEG_DEFAULT)
+		seg = DEFAULT_TLS_SEG_REG;
+	      else
+		return 0;
+	      break;
+
+	    case SUBREG:
+	      if (!REG_P (SUBREG_REG (op)))
+		return 0;
+	      /* FALLTHRU */
+
+	    case REG:
+	      if (!base)
+		base = op;
+	      else if (!index)
+		index = op;
+	      else
+		return 0;
+	      break;
+
+	    case CONST:
+	    case CONST_INT:
+	    case SYMBOL_REF:
+	    case LABEL_REF:
+	      if (disp)
+		return 0;
+	      disp = op;
+	      break;
+
+	    default:
+	      return 0;
+	    }
+	}
+    }
+  else if (GET_CODE (addr) == MULT)
+    {
+      index = XEXP (addr, 0);		/* index*scale */
+      scale_rtx = XEXP (addr, 1);
+    }
+  else if (GET_CODE (addr) == ASHIFT)
+    {
+      /* We're called for lea too, which implements ashift on occasion.  */
+      index = XEXP (addr, 0);
+      tmp = XEXP (addr, 1);
+      if (!CONST_INT_P (tmp))
+	return 0;
+      scale = INTVAL (tmp);
+      if ((unsigned HOST_WIDE_INT) scale > 3)
+	return 0;
+      scale = 1 << scale;
+      retval = -1;
+    }
+  else
+    disp = addr;			/* displacement */
+
+  if (index)
+    {
+      if (REG_P (index))
+	;
+      else if (GET_CODE (index) == SUBREG
+	       && REG_P (SUBREG_REG (index)))
+	;
+      else
+	return 0;
+    }
+
+  /* Extract the integral value of scale.  */
+  if (scale_rtx)
+    {
+      if (!CONST_INT_P (scale_rtx))
+	return 0;
+      scale = INTVAL (scale_rtx);
+    }
+
+  base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
+  index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
+
+  /* Avoid useless 0 displacement.  */
+  if (disp == const0_rtx && (base || index))
+    disp = NULL_RTX;
+
+  /* Allow arg pointer and stack pointer as index if there is not scaling.  */
+  if (base_reg && index_reg && scale == 1
+      && (index_reg == arg_pointer_rtx
+	  || index_reg == frame_pointer_rtx
+	  || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
+    {
+      rtx tmp;
+      tmp = base, base = index, index = tmp;
+      tmp = base_reg, base_reg = index_reg, index_reg = tmp;
+    }
+
+  /* Special case: %ebp cannot be encoded as a base without a displacement.
+     Similarly %r13.  */
+  if (!disp
+      && base_reg
+      && (base_reg == hard_frame_pointer_rtx
+	  || base_reg == frame_pointer_rtx
+	  || base_reg == arg_pointer_rtx
+	  || (REG_P (base_reg)
+	      && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
+		  || REGNO (base_reg) == R13_REG))))
+    disp = const0_rtx;
+
+  /* Special case: on K6, [%esi] makes the instruction vector decoded.
+     Avoid this by transforming to [%esi+0].
+     Reload calls address legitimization without cfun defined, so we need
+     to test cfun for being non-NULL. */
+  if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
+      && base_reg && !index_reg && !disp
+      && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
+    disp = const0_rtx;
+
+  /* Special case: encode reg+reg instead of reg*2.  */
+  if (!base && index && scale == 2)
+    base = index, base_reg = index_reg, scale = 1;
+
+  /* Special case: scaling cannot be encoded without base or displacement.  */
+  if (!base && !disp && index && scale != 1)
+    disp = const0_rtx;
+
+  out->base = base;
+  out->index = index;
+  out->disp = disp;
+  out->scale = scale;
+  out->seg = seg;
+
+  return retval;
+}
+
+/* Return cost of the memory address x.
+   For i386, it is better to use a complex address than let gcc copy
+   the address into a reg and make a new pseudo.  But not if the address
+   requires to two regs - that would mean more pseudos with longer
+   lifetimes.  */
+static int
+ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
+		   addr_space_t as ATTRIBUTE_UNUSED,
+		   bool speed ATTRIBUTE_UNUSED)
+{
+  struct ix86_address parts;
+  int cost = 1;
+  int ok = ix86_decompose_address (x, &parts);
+
+  gcc_assert (ok);
+
+  if (parts.base && GET_CODE (parts.base) == SUBREG)
+    parts.base = SUBREG_REG (parts.base);
+  if (parts.index && GET_CODE (parts.index) == SUBREG)
+    parts.index = SUBREG_REG (parts.index);
+
+  /* Attempt to minimize number of registers in the address.  */
+  if ((parts.base
+       && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
+      || (parts.index
+	  && (!REG_P (parts.index)
+	      || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
+    cost++;
+
+  if (parts.base
+      && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
+      && parts.index
+      && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
+      && parts.base != parts.index)
+    cost++;
+
+  /* AMD-K6 don't like addresses with ModR/M set to 00_xxx_100b,
+     since it's predecode logic can't detect the length of instructions
+     and it degenerates to vector decoded.  Increase cost of such
+     addresses here.  The penalty is minimally 2 cycles.  It may be worthwhile
+     to split such addresses or even refuse such addresses at all.
+
+     Following addressing modes are affected:
+      [base+scale*index]
+      [scale*index+disp]
+      [base+index]
+
+     The first and last case  may be avoidable by explicitly coding the zero in
+     memory address, but I don't have AMD-K6 machine handy to check this
+     theory.  */
+
+  if (TARGET_K6
+      && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
+	  || (parts.disp && !parts.base && parts.index && parts.scale != 1)
+	  || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
+    cost += 10;
+
+  return cost;
+}
+
+/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
+   this is used for to form addresses to local data when -fPIC is in
+   use.  */
+
+static bool
+darwin_local_data_pic (rtx disp)
+{
+  return (GET_CODE (disp) == UNSPEC
+	  && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
+}
+
+/* Determine if a given RTX is a valid constant.  We already know this
+   satisfies CONSTANT_P.  */
+
+static bool
+ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
+{
+  switch (GET_CODE (x))
+    {
+    case CONST:
+      x = XEXP (x, 0);
+
+      if (GET_CODE (x) == PLUS)
+	{
+	  if (!CONST_INT_P (XEXP (x, 1)))
+	    return false;
+	  x = XEXP (x, 0);
+	}
+
+      if (TARGET_MACHO && darwin_local_data_pic (x))
+	return true;
+
+      /* Only some unspecs are valid as "constants".  */
+      if (GET_CODE (x) == UNSPEC)
+	switch (XINT (x, 1))
+	  {
+	  case UNSPEC_GOT:
+	  case UNSPEC_GOTOFF:
+	  case UNSPEC_PLTOFF:
+	    return TARGET_64BIT;
+	  case UNSPEC_TPOFF:
+	  case UNSPEC_NTPOFF:
+	    x = XVECEXP (x, 0, 0);
+	    return (GET_CODE (x) == SYMBOL_REF
+		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
+	  case UNSPEC_DTPOFF:
+	    x = XVECEXP (x, 0, 0);
+	    return (GET_CODE (x) == SYMBOL_REF
+		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
+	  default:
+	    return false;
+	  }
+
+      /* We must have drilled down to a symbol.  */
+      if (GET_CODE (x) == LABEL_REF)
+	return true;
+      if (GET_CODE (x) != SYMBOL_REF)
+	return false;
+      /* FALLTHRU */
+
+    case SYMBOL_REF:
+      /* TLS symbols are never valid.  */
+      if (SYMBOL_REF_TLS_MODEL (x))
+	return false;
+
+      /* DLLIMPORT symbols are never valid.  */
+      if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
+	  && SYMBOL_REF_DLLIMPORT_P (x))
+	return false;
+
+#if TARGET_MACHO
+      /* mdynamic-no-pic */
+      if (MACHO_DYNAMIC_NO_PIC_P)
+	return machopic_symbol_defined_p (x);
+#endif
+      break;
+
+    case CONST_DOUBLE:
+      if (GET_MODE (x) == TImode
+	  && x != CONST0_RTX (TImode)
+          && !TARGET_64BIT)
+	return false;
+      break;
+
+    case CONST_VECTOR:
+      if (!standard_sse_constant_p (x))
+	return false;
+
+    default:
+      break;
+    }
+
+  /* Otherwise we handle everything else in the move patterns.  */
+  return true;
+}
+
+/* Determine if it's legal to put X into the constant pool.  This
+   is not possible for the address of thread-local symbols, which
+   is checked above.  */
+
+static bool
+ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
+{
+  /* We can always put integral constants and vectors in memory.  */
+  switch (GET_CODE (x))
+    {
+    case CONST_INT:
+    case CONST_DOUBLE:
+    case CONST_VECTOR:
+      return false;
+
+    default:
+      break;
+    }
+  return !ix86_legitimate_constant_p (mode, x);
+}
+
+/*  Nonzero if the symbol is marked as dllimport, or as stub-variable,
+    otherwise zero.  */
+
+static bool
+is_imported_p (rtx x)
+{
+  if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
+      || GET_CODE (x) != SYMBOL_REF)
+    return false;
+
+  return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
+}
+
+
+/* Nonzero if the constant value X is a legitimate general operand
+   when generating PIC code.  It is given that flag_pic is on and
+   that X satisfies CONSTANT_P or is a CONST_DOUBLE.  */
+
+bool
+legitimate_pic_operand_p (rtx x)
+{
+  rtx inner;
+
+  switch (GET_CODE (x))
+    {
+    case CONST:
+      inner = XEXP (x, 0);
+      if (GET_CODE (inner) == PLUS
+	  && CONST_INT_P (XEXP (inner, 1)))
+	inner = XEXP (inner, 0);
+
+      /* Only some unspecs are valid as "constants".  */
+      if (GET_CODE (inner) == UNSPEC)
+	switch (XINT (inner, 1))
+	  {
+	  case UNSPEC_GOT:
+	  case UNSPEC_GOTOFF:
+	  case UNSPEC_PLTOFF:
+	    return TARGET_64BIT;
+	  case UNSPEC_TPOFF:
+	    x = XVECEXP (inner, 0, 0);
+	    return (GET_CODE (x) == SYMBOL_REF
+		    && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
+	  case UNSPEC_MACHOPIC_OFFSET:
+	    return legitimate_pic_address_disp_p (x);
+	  default:
+	    return false;
+	  }
+      /* FALLTHRU */
+
+    case SYMBOL_REF:
+    case LABEL_REF:
+      return legitimate_pic_address_disp_p (x);
+
+    default:
+      return true;
+    }
+}
+
+/* Determine if a given CONST RTX is a valid memory displacement
+   in PIC mode.  */
+
+bool
+legitimate_pic_address_disp_p (rtx disp)
+{
+  bool saw_plus;
+
+  /* In 64bit mode we can allow direct addresses of symbols and labels
+     when they are not dynamic symbols.  */
+  if (TARGET_64BIT)
+    {
+      rtx op0 = disp, op1;
+
+      switch (GET_CODE (disp))
+	{
+	case LABEL_REF:
+	  return true;
+
+	case CONST:
+	  if (GET_CODE (XEXP (disp, 0)) != PLUS)
+	    break;
+	  op0 = XEXP (XEXP (disp, 0), 0);
+	  op1 = XEXP (XEXP (disp, 0), 1);
+	  if (!CONST_INT_P (op1)
+	      || INTVAL (op1) >= 16*1024*1024
+	      || INTVAL (op1) < -16*1024*1024)
+            break;
+	  if (GET_CODE (op0) == LABEL_REF)
+	    return true;
+	  if (GET_CODE (op0) == CONST
+	      && GET_CODE (XEXP (op0, 0)) == UNSPEC
+	      && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
+	    return true;
+	  if (GET_CODE (op0) == UNSPEC
+	      && XINT (op0, 1) == UNSPEC_PCREL)
+	    return true;
+	  if (GET_CODE (op0) != SYMBOL_REF)
+	    break;
+	  /* FALLTHRU */
+
+	case SYMBOL_REF:
+	  /* TLS references should always be enclosed in UNSPEC.
+	     The dllimported symbol needs always to be resolved.  */
+	  if (SYMBOL_REF_TLS_MODEL (op0)
+	      || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
+	    return false;
+
+	  if (TARGET_PECOFF)
+	    {
+	      if (is_imported_p (op0))
+		return true;
+
+	      if (SYMBOL_REF_FAR_ADDR_P (op0)
+		  || !SYMBOL_REF_LOCAL_P (op0))
+		break;
+
+	      /* Function-symbols need to be resolved only for
+	         large-model.
+	         For the small-model we don't need to resolve anything
+	         here.  */
+	      if ((ix86_cmodel != CM_LARGE_PIC
+	           && SYMBOL_REF_FUNCTION_P (op0))
+		  || ix86_cmodel == CM_SMALL_PIC)
+		return true;
+	      /* Non-external symbols don't need to be resolved for
+	         large, and medium-model.  */
+	      if ((ix86_cmodel == CM_LARGE_PIC
+		   || ix86_cmodel == CM_MEDIUM_PIC)
+		  && !SYMBOL_REF_EXTERNAL_P (op0))
+		return true;
+	    }
+	  else if (!SYMBOL_REF_FAR_ADDR_P (op0)
+		   && SYMBOL_REF_LOCAL_P (op0)
+		   && ix86_cmodel != CM_LARGE_PIC)
+	    return true;
+	  break;
+
+	default:
+	  break;
+	}
+    }
+  if (GET_CODE (disp) != CONST)
+    return false;
+  disp = XEXP (disp, 0);
+
+  if (TARGET_64BIT)
+    {
+      /* We are unsafe to allow PLUS expressions.  This limit allowed distance
+         of GOT tables.  We should not need these anyway.  */
+      if (GET_CODE (disp) != UNSPEC
+	  || (XINT (disp, 1) != UNSPEC_GOTPCREL
+	      && XINT (disp, 1) != UNSPEC_GOTOFF
+	      && XINT (disp, 1) != UNSPEC_PCREL
+	      && XINT (disp, 1) != UNSPEC_PLTOFF))
+	return false;
+
+      if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
+	  && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
+	return false;
+      return true;
+    }
+
+  saw_plus = false;
+  if (GET_CODE (disp) == PLUS)
+    {
+      if (!CONST_INT_P (XEXP (disp, 1)))
+	return false;
+      disp = XEXP (disp, 0);
+      saw_plus = true;
+    }
+
+  if (TARGET_MACHO && darwin_local_data_pic (disp))
+    return true;
+
+  if (GET_CODE (disp) != UNSPEC)
+    return false;
+
+  switch (XINT (disp, 1))
+    {
+    case UNSPEC_GOT:
+      if (saw_plus)
+	return false;
+      /* We need to check for both symbols and labels because VxWorks loads
+	 text labels with @GOT rather than @GOTOFF.  See gotoff_operand for
+	 details.  */
+      return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
+	      || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
+    case UNSPEC_GOTOFF:
+      /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
+	 While ABI specify also 32bit relocation but we don't produce it in
+	 small PIC model at all.  */
+      if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
+	   || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
+	  && !TARGET_64BIT)
+        return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
+      return false;
+    case UNSPEC_GOTTPOFF:
+    case UNSPEC_GOTNTPOFF:
+    case UNSPEC_INDNTPOFF:
+      if (saw_plus)
+	return false;
+      disp = XVECEXP (disp, 0, 0);
+      return (GET_CODE (disp) == SYMBOL_REF
+	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
+    case UNSPEC_NTPOFF:
+      disp = XVECEXP (disp, 0, 0);
+      return (GET_CODE (disp) == SYMBOL_REF
+	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
+    case UNSPEC_DTPOFF:
+      disp = XVECEXP (disp, 0, 0);
+      return (GET_CODE (disp) == SYMBOL_REF
+	      && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
+    }
+
+  return false;
+}
+
+/* Our implementation of LEGITIMIZE_RELOAD_ADDRESS.  Returns a value to
+   replace the input X, or the original X if no replacement is called for.
+   The output parameter *WIN is 1 if the calling macro should goto WIN,
+   0 if it should not.  */
+
+bool
+ix86_legitimize_reload_address (rtx x,
+				enum machine_mode mode ATTRIBUTE_UNUSED,
+				int opnum, int type,
+				int ind_levels ATTRIBUTE_UNUSED)
+{
+  /* Reload can generate:
+
+     (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
+		       (reg:DI 97))
+	      (reg:DI 2 cx))
+
+     This RTX is rejected from ix86_legitimate_address_p due to
+     non-strictness of base register 97.  Following this rejection, 
+     reload pushes all three components into separate registers,
+     creating invalid memory address RTX.
+
+     Following code reloads only the invalid part of the
+     memory address RTX.  */
+
+  if (GET_CODE (x) == PLUS
+      && REG_P (XEXP (x, 1))
+      && GET_CODE (XEXP (x, 0)) == PLUS
+      && REG_P (XEXP (XEXP (x, 0), 1)))
+    {
+      rtx base, index;
+      bool something_reloaded = false;
+
+      base = XEXP (XEXP (x, 0), 1);      
+      if (!REG_OK_FOR_BASE_STRICT_P (base))
+	{
+	  push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
+		       BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
+		       opnum, (enum reload_type) type);
+	  something_reloaded = true;
+	}
+
+      index = XEXP (x, 1);
+      if (!REG_OK_FOR_INDEX_STRICT_P (index))
+	{
+	  push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
+		       INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
+		       opnum, (enum reload_type) type);
+	  something_reloaded = true;
+	}
+
+      gcc_assert (something_reloaded);
+      return true;
+    }
+
+  return false;
+}
+
+/* Determine if op is suitable RTX for an address register.
+   Return naked register if a register or a register subreg is
+   found, otherwise return NULL_RTX.  */
+
+static rtx
+ix86_validate_address_register (rtx op)
+{
+  enum machine_mode mode = GET_MODE (op);
+
+  /* Only SImode or DImode registers can form the address.  */
+  if (mode != SImode && mode != DImode)
+    return NULL_RTX;
+
+  if (REG_P (op))
+    return op;
+  else if (GET_CODE (op) == SUBREG)
+    {
+      rtx reg = SUBREG_REG (op);
+
+      if (!REG_P (reg))
+	return NULL_RTX;
+
+      mode = GET_MODE (reg);
+
+      /* Don't allow SUBREGs that span more than a word.  It can
+	 lead to spill failures when the register is one word out
+	 of a two word structure.  */
+      if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+	return NULL_RTX;
+
+      /* Allow only SUBREGs of non-eliminable hard registers.  */
+      if (register_no_elim_operand (reg, mode))
+	return reg;
+    }
+
+  /* Op is not a register.  */
+  return NULL_RTX;
+}
+
+/* Recognizes RTL expressions that are valid memory addresses for an
+   instruction.  The MODE argument is the machine mode for the MEM
+   expression that wants to use this address.
+
+   It only recognizes address in canonical form.  LEGITIMIZE_ADDRESS should
+   convert common non-canonical forms to canonical form so that they will
+   be recognized.  */
+
+static bool
+ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
+		           rtx addr, bool strict)
+{
+  struct ix86_address parts;
+  rtx base, index, disp;
+  HOST_WIDE_INT scale;
+  enum ix86_address_seg seg;
+
+  if (ix86_decompose_address (addr, &parts) <= 0)
+    /* Decomposition failed.  */
+    return false;
+
+  base = parts.base;
+  index = parts.index;
+  disp = parts.disp;
+  scale = parts.scale;
+  seg = parts.seg;
+
+  /* Validate base register.  */
+  if (base)
+    {
+      rtx reg = ix86_validate_address_register (base);
+
+      if (reg == NULL_RTX)
+	return false;
+
+      if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
+	  || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
+	/* Base is not valid.  */
+	return false;
+    }
+
+  /* Validate index register.  */
+  if (index)
+    {
+      rtx reg = ix86_validate_address_register (index);
+
+      if (reg == NULL_RTX)
+	return false;
+
+      if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
+	  || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
+	/* Index is not valid.  */
+	return false;
+    }
+
+  /* Index and base should have the same mode.  */
+  if (base && index
+      && GET_MODE (base) != GET_MODE (index))
+    return false;
+
+  /* Address override works only on the (%reg) part of %fs:(%reg).  */
+  if (seg != SEG_DEFAULT
+      && ((base && GET_MODE (base) != word_mode)
+	  || (index && GET_MODE (index) != word_mode)))
+    return false;
+
+  /* Validate scale factor.  */
+  if (scale != 1)
+    {
+      if (!index)
+	/* Scale without index.  */
+	return false;
+
+      if (scale != 2 && scale != 4 && scale != 8)
+	/* Scale is not a valid multiplier.  */
+	return false;
+    }
+
+  /* Validate displacement.  */
+  if (disp)
+    {
+      if (GET_CODE (disp) == CONST
+	  && GET_CODE (XEXP (disp, 0)) == UNSPEC
+	  && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
+	switch (XINT (XEXP (disp, 0), 1))
+	  {
+	  /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
+	     used.  While ABI specify also 32bit relocations, we don't produce
+	     them at all and use IP relative instead.  */
+	  case UNSPEC_GOT:
+	  case UNSPEC_GOTOFF:
+	    gcc_assert (flag_pic);
+	    if (!TARGET_64BIT)
+	      goto is_legitimate_pic;
+
+	    /* 64bit address unspec.  */
+	    return false;
+
+	  case UNSPEC_GOTPCREL:
+	  case UNSPEC_PCREL:
+	    gcc_assert (flag_pic);
+	    goto is_legitimate_pic;
+
+	  case UNSPEC_GOTTPOFF:
+	  case UNSPEC_GOTNTPOFF:
+	  case UNSPEC_INDNTPOFF:
+	  case UNSPEC_NTPOFF:
+	  case UNSPEC_DTPOFF:
+	    break;
+
+	  case UNSPEC_STACK_CHECK:
+	    gcc_assert (flag_split_stack);
+	    break;
+
+	  default:
+	    /* Invalid address unspec.  */
+	    return false;
+	  }
+
+      else if (SYMBOLIC_CONST (disp)
+	       && (flag_pic
+		   || (TARGET_MACHO
+#if TARGET_MACHO
+		       && MACHOPIC_INDIRECT
+		       && !machopic_operand_p (disp)
+#endif
+	       )))
+	{
+
+	is_legitimate_pic:
+	  if (TARGET_64BIT && (index || base))
+	    {
+	      /* foo@dtpoff(%rX) is ok.  */
+	      if (GET_CODE (disp) != CONST
+		  || GET_CODE (XEXP (disp, 0)) != PLUS
+		  || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
+		  || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
+		  || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
+		      && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
+		/* Non-constant pic memory reference.  */
+		return false;
+	    }
+	  else if ((!TARGET_MACHO || flag_pic)
+		    && ! legitimate_pic_address_disp_p (disp))
+	    /* Displacement is an invalid pic construct.  */
+	    return false;
+#if TARGET_MACHO
+	  else if (MACHO_DYNAMIC_NO_PIC_P
+		   && !ix86_legitimate_constant_p (Pmode, disp))
+	    /* displacment must be referenced via non_lazy_pointer */
+	    return false;
+#endif
+
+          /* This code used to verify that a symbolic pic displacement
+	     includes the pic_offset_table_rtx register.
+
+	     While this is good idea, unfortunately these constructs may
+	     be created by "adds using lea" optimization for incorrect
+	     code like:
+
+	     int a;
+	     int foo(int i)
+	       {
+	         return *(&a+i);
+	       }
+
+	     This code is nonsensical, but results in addressing
+	     GOT table with pic_offset_table_rtx base.  We can't
+	     just refuse it easily, since it gets matched by
+	     "addsi3" pattern, that later gets split to lea in the
+	     case output register differs from input.  While this
+	     can be handled by separate addsi pattern for this case
+	     that never results in lea, this seems to be easier and
+	     correct fix for crash to disable this test.  */
+	}
+      else if (GET_CODE (disp) != LABEL_REF
+	       && !CONST_INT_P (disp)
+	       && (GET_CODE (disp) != CONST
+		   || !ix86_legitimate_constant_p (Pmode, disp))
+	       && (GET_CODE (disp) != SYMBOL_REF
+		   || !ix86_legitimate_constant_p (Pmode, disp)))
+	/* Displacement is not constant.  */
+	return false;
+      else if (TARGET_64BIT
+	       && !x86_64_immediate_operand (disp, VOIDmode))
+	/* Displacement is out of range.  */
+	return false;
+      /* In x32 mode, constant addresses are sign extended to 64bit, so
+	 we have to prevent addresses from 0x80000000 to 0xffffffff.  */
+      else if (TARGET_X32 && !(index || base)
+	       && CONST_INT_P (disp)
+	       && val_signbit_known_set_p (SImode, INTVAL (disp)))
+	return false;
+    }
+
+  /* Everything looks valid.  */
+  return true;
+}
+
+/* Determine if a given RTX is a valid constant address.  */
+
+bool
+constant_address_p (rtx x)
+{
+  return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
+}
+
+/* Return a unique alias set for the GOT.  */
+
+static alias_set_type
+ix86_GOT_alias_set (void)
+{
+  static alias_set_type set = -1;
+  if (set == -1)
+    set = new_alias_set ();
+  return set;
+}
+
+/* Return a legitimate reference for ORIG (an address) using the
+   register REG.  If REG is 0, a new pseudo is generated.
+
+   There are two types of references that must be handled:
+
+   1. Global data references must load the address from the GOT, via
+      the PIC reg.  An insn is emitted to do this load, and the reg is
+      returned.
+
+   2. Static data references, constant pool addresses, and code labels
+      compute the address as an offset from the GOT, whose base is in
+      the PIC reg.  Static data objects have SYMBOL_FLAG_LOCAL set to
+      differentiate them from global data objects.  The returned
+      address is the PIC reg + an unspec constant.
+
+   TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
+   reg also appears in the address.  */
+
+static rtx
+legitimize_pic_address (rtx orig, rtx reg)
+{
+  rtx addr = orig;
+  rtx new_rtx = orig;
+
+#if TARGET_MACHO
+  if (TARGET_MACHO && !TARGET_64BIT)
+    {
+      if (reg == 0)
+	reg = gen_reg_rtx (Pmode);
+      /* Use the generic Mach-O PIC machinery.  */
+      return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
+    }
+#endif
+
+  if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
+    {
+      rtx tmp = legitimize_pe_coff_symbol (addr, true);
+      if (tmp)
+        return tmp;
+    }
+
+  if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
+    new_rtx = addr;
+  else if (TARGET_64BIT && !TARGET_PECOFF
+	   && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
+    {
+      rtx tmpreg;
+      /* This symbol may be referenced via a displacement from the PIC
+	 base address (@GOTOFF).  */
+
+      if (reload_in_progress)
+	df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
+      if (GET_CODE (addr) == CONST)
+	addr = XEXP (addr, 0);
+      if (GET_CODE (addr) == PLUS)
+	  {
+            new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
+				      UNSPEC_GOTOFF);
+	    new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
+	  }
+	else
+          new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
+      new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+      if (!reg)
+        tmpreg = gen_reg_rtx (Pmode);
+      else
+	tmpreg = reg;
+      emit_move_insn (tmpreg, new_rtx);
+
+      if (reg != 0)
+	{
+	  new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
+					 tmpreg, 1, OPTAB_DIRECT);
+	  new_rtx = reg;
+	}
+      else
+        new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
+    }
+  else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
+    {
+      /* This symbol may be referenced via a displacement from the PIC
+	 base address (@GOTOFF).  */
+
+      if (reload_in_progress)
+	df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
+      if (GET_CODE (addr) == CONST)
+	addr = XEXP (addr, 0);
+      if (GET_CODE (addr) == PLUS)
+	  {
+            new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
+				      UNSPEC_GOTOFF);
+	    new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
+	  }
+	else
+          new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
+      new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+      new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
+
+      if (reg != 0)
+	{
+	  emit_move_insn (reg, new_rtx);
+	  new_rtx = reg;
+	}
+    }
+  else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
+	   /* We can't use @GOTOFF for text labels on VxWorks;
+	      see gotoff_operand.  */
+	   || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
+    {
+      rtx tmp = legitimize_pe_coff_symbol (addr, true);
+      if (tmp)
+        return tmp;
+
+      /* For x64 PE-COFF there is no GOT table.  So we use address
+         directly.  */
+      if (TARGET_64BIT && TARGET_PECOFF)
+	{
+	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
+	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+
+	  if (reg == 0)
+	    reg = gen_reg_rtx (Pmode);
+	  emit_move_insn (reg, new_rtx);
+	  new_rtx = reg;
+	}
+      else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
+	{
+	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
+	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+	  new_rtx = gen_const_mem (Pmode, new_rtx);
+	  set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
+
+	  if (reg == 0)
+	    reg = gen_reg_rtx (Pmode);
+	  /* Use directly gen_movsi, otherwise the address is loaded
+	     into register for CSE.  We don't want to CSE this addresses,
+	     instead we CSE addresses from the GOT table, so skip this.  */
+	  emit_insn (gen_movsi (reg, new_rtx));
+	  new_rtx = reg;
+	}
+      else
+	{
+	  /* This symbol must be referenced via a load from the
+	     Global Offset Table (@GOT).  */
+
+	  if (reload_in_progress)
+	    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
+	  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
+	  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+	  if (TARGET_64BIT)
+	    new_rtx = force_reg (Pmode, new_rtx);
+	  new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
+	  new_rtx = gen_const_mem (Pmode, new_rtx);
+	  set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
+
+	  if (reg == 0)
+	    reg = gen_reg_rtx (Pmode);
+	  emit_move_insn (reg, new_rtx);
+	  new_rtx = reg;
+	}
+    }
+  else
+    {
+      if (CONST_INT_P (addr)
+	  && !x86_64_immediate_operand (addr, VOIDmode))
+	{
+	  if (reg)
+	    {
+	      emit_move_insn (reg, addr);
+	      new_rtx = reg;
+	    }
+	  else
+	    new_rtx = force_reg (Pmode, addr);
+	}
+      else if (GET_CODE (addr) == CONST)
+	{
+	  addr = XEXP (addr, 0);
+
+	  /* We must match stuff we generate before.  Assume the only
+	     unspecs that can get here are ours.  Not that we could do
+	     anything with them anyway....  */
+	  if (GET_CODE (addr) == UNSPEC
+	      || (GET_CODE (addr) == PLUS
+		  && GET_CODE (XEXP (addr, 0)) == UNSPEC))
+	    return orig;
+	  gcc_assert (GET_CODE (addr) == PLUS);
+	}
+      if (GET_CODE (addr) == PLUS)
+	{
+	  rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
+
+	  /* Check first to see if this is a constant offset from a @GOTOFF
+	     symbol reference.  */
+	  if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
+	      && CONST_INT_P (op1))
+	    {
+	      if (!TARGET_64BIT)
+		{
+		  if (reload_in_progress)
+		    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
+		  new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
+					    UNSPEC_GOTOFF);
+		  new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
+		  new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+		  new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
+
+		  if (reg != 0)
+		    {
+		      emit_move_insn (reg, new_rtx);
+		      new_rtx = reg;
+		    }
+		}
+	      else
+		{
+		  if (INTVAL (op1) < -16*1024*1024
+		      || INTVAL (op1) >= 16*1024*1024)
+		    {
+		      if (!x86_64_immediate_operand (op1, Pmode))
+			op1 = force_reg (Pmode, op1);
+		      new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
+		    }
+		}
+	    }
+	  else
+	    {
+	      rtx base = legitimize_pic_address (op0, reg);
+	      enum machine_mode mode = GET_MODE (base);
+	      new_rtx
+	        = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
+
+	      if (CONST_INT_P (new_rtx))
+		{
+		  if (INTVAL (new_rtx) < -16*1024*1024
+		      || INTVAL (new_rtx) >= 16*1024*1024)
+		    {
+		      if (!x86_64_immediate_operand (new_rtx, mode))
+			new_rtx = force_reg (mode, new_rtx);
+		      new_rtx
+		        = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
+		    }
+		  else
+		    new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
+		}
+	      else
+		{
+		  if (GET_CODE (new_rtx) == PLUS
+		      && CONSTANT_P (XEXP (new_rtx, 1)))
+		    {
+		      base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
+		      new_rtx = XEXP (new_rtx, 1);
+		    }
+		  new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
+		}
+	    }
+	}
+    }
+  return new_rtx;
+}
+
+/* Load the thread pointer.  If TO_REG is true, force it into a register.  */
+
+static rtx
+get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
+{
+  rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
+
+  if (GET_MODE (tp) != tp_mode)
+    {
+      gcc_assert (GET_MODE (tp) == SImode);
+      gcc_assert (tp_mode == DImode);
+
+      tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
+    }
+
+  if (to_reg)
+    tp = copy_to_mode_reg (tp_mode, tp);
+
+  return tp;
+}
+
+/* Construct the SYMBOL_REF for the tls_get_addr function.  */
+
+static GTY(()) rtx ix86_tls_symbol;
+
+static rtx
+ix86_tls_get_addr (void)
+{
+  if (!ix86_tls_symbol)
+    {
+      const char *sym
+	= ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
+	   ? "___tls_get_addr" : "__tls_get_addr");
+
+      ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
+    }
+
+  if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
+    {
+      rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
+				   UNSPEC_PLTOFF);
+      return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
+			   gen_rtx_CONST (Pmode, unspec));
+    }
+
+  return ix86_tls_symbol;
+}
+
+/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol.  */
+
+static GTY(()) rtx ix86_tls_module_base_symbol;
+
+rtx
+ix86_tls_module_base (void)
+{
+  if (!ix86_tls_module_base_symbol)
+    {
+      ix86_tls_module_base_symbol
+	= gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
+
+      SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
+	|= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
+    }
+
+  return ix86_tls_module_base_symbol;
+}
+
+/* A subroutine of ix86_legitimize_address and ix86_expand_move.  FOR_MOV is
+   false if we expect this to be used for a memory address and true if
+   we expect to load the address into a register.  */
+
+static rtx
+legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
+{
+  rtx dest, base, off;
+  rtx pic = NULL_RTX, tp = NULL_RTX;
+  enum machine_mode tp_mode = Pmode;
+  int type;
+
+  /* Fall back to global dynamic model if tool chain cannot support local
+     dynamic.  */
+  if (TARGET_SUN_TLS && !TARGET_64BIT
+      && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
+      && model == TLS_MODEL_LOCAL_DYNAMIC)
+    model = TLS_MODEL_GLOBAL_DYNAMIC;
+
+  switch (model)
+    {
+    case TLS_MODEL_GLOBAL_DYNAMIC:
+      dest = gen_reg_rtx (Pmode);
+
+      if (!TARGET_64BIT)
+	{
+	  if (flag_pic && !TARGET_PECOFF)
+	    pic = pic_offset_table_rtx;
+	  else
+	    {
+	      pic = gen_reg_rtx (Pmode);
+	      emit_insn (gen_set_got (pic));
+	    }
+	}
+
+      if (TARGET_GNU2_TLS)
+	{
+	  if (TARGET_64BIT)
+	    emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
+	  else
+	    emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
+
+	  tp = get_thread_pointer (Pmode, true);
+	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
+
+	  if (GET_MODE (x) != Pmode)
+	    x = gen_rtx_ZERO_EXTEND (Pmode, x);
+
+	  set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
+	}
+      else
+	{
+	  rtx caddr = ix86_tls_get_addr ();
+
+	  if (TARGET_64BIT)
+	    {
+	      rtx rax = gen_rtx_REG (Pmode, AX_REG);
+	      rtx insns;
+
+	      start_sequence ();
+	      emit_call_insn
+		(ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
+	      insns = get_insns ();
+	      end_sequence ();
+
+	      if (GET_MODE (x) != Pmode)
+		x = gen_rtx_ZERO_EXTEND (Pmode, x);
+
+	      RTL_CONST_CALL_P (insns) = 1;
+	      emit_libcall_block (insns, dest, rax, x);
+	    }
+	  else
+	    emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
+	}
+      break;
+
+    case TLS_MODEL_LOCAL_DYNAMIC:
+      base = gen_reg_rtx (Pmode);
+
+      if (!TARGET_64BIT)
+	{
+	  if (flag_pic)
+	    pic = pic_offset_table_rtx;
+	  else
+	    {
+	      pic = gen_reg_rtx (Pmode);
+	      emit_insn (gen_set_got (pic));
+	    }
+	}
+
+      if (TARGET_GNU2_TLS)
+	{
+	  rtx tmp = ix86_tls_module_base ();
+
+	  if (TARGET_64BIT)
+	    emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
+	  else
+	    emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
+
+	  tp = get_thread_pointer (Pmode, true);
+	  set_unique_reg_note (get_last_insn (), REG_EQUAL,
+			       gen_rtx_MINUS (Pmode, tmp, tp));
+	}
+      else
+	{
+	  rtx caddr = ix86_tls_get_addr ();
+
+	  if (TARGET_64BIT)
+	    {
+	      rtx rax = gen_rtx_REG (Pmode, AX_REG);
+	      rtx insns, eqv;
+
+	      start_sequence ();
+	      emit_call_insn
+		(ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
+	      insns = get_insns ();
+	      end_sequence ();
+
+	      /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
+		 share the LD_BASE result with other LD model accesses.  */
+	      eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
+				    UNSPEC_TLS_LD_BASE);
+
+	      RTL_CONST_CALL_P (insns) = 1;
+	      emit_libcall_block (insns, base, rax, eqv);
+	    }
+	  else
+	    emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
+	}
+
+      off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
+      off = gen_rtx_CONST (Pmode, off);
+
+      dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
+
+      if (TARGET_GNU2_TLS)
+	{
+	  dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
+
+	  if (GET_MODE (x) != Pmode)
+	    x = gen_rtx_ZERO_EXTEND (Pmode, x);
+
+	  set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
+	}
+      break;
+
+    case TLS_MODEL_INITIAL_EXEC:
+      if (TARGET_64BIT)
+	{
+	  if (TARGET_SUN_TLS && !TARGET_X32)
+	    {
+	      /* The Sun linker took the AMD64 TLS spec literally
+		 and can only handle %rax as destination of the
+		 initial executable code sequence.  */
+
+	      dest = gen_reg_rtx (DImode);
+	      emit_insn (gen_tls_initial_exec_64_sun (dest, x));
+	      return dest;
+	    }
+
+	  /* Generate DImode references to avoid %fs:(%reg32)
+	     problems and linker IE->LE relaxation bug.  */
+	  tp_mode = DImode;
+	  pic = NULL;
+	  type = UNSPEC_GOTNTPOFF;
+	}
+      else if (flag_pic)
+	{
+	  if (reload_in_progress)
+	    df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
+	  pic = pic_offset_table_rtx;
+	  type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
+	}
+      else if (!TARGET_ANY_GNU_TLS)
+	{
+	  pic = gen_reg_rtx (Pmode);
+	  emit_insn (gen_set_got (pic));
+	  type = UNSPEC_GOTTPOFF;
+	}
+      else
+	{
+	  pic = NULL;
+	  type = UNSPEC_INDNTPOFF;
+	}
+
+      off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
+      off = gen_rtx_CONST (tp_mode, off);
+      if (pic)
+	off = gen_rtx_PLUS (tp_mode, pic, off);
+      off = gen_const_mem (tp_mode, off);
+      set_mem_alias_set (off, ix86_GOT_alias_set ());
+
+      if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
+	{
+	  base = get_thread_pointer (tp_mode,
+				     for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
+	  off = force_reg (tp_mode, off);
+	  return gen_rtx_PLUS (tp_mode, base, off);
+	}
+      else
+	{
+	  base = get_thread_pointer (Pmode, true);
+	  dest = gen_reg_rtx (Pmode);
+	  emit_insn (ix86_gen_sub3 (dest, base, off));
+	}
+      break;
+
+    case TLS_MODEL_LOCAL_EXEC:
+      off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
+			    (TARGET_64BIT || TARGET_ANY_GNU_TLS)
+			    ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
+      off = gen_rtx_CONST (Pmode, off);
+
+      if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
+	{
+	  base = get_thread_pointer (Pmode,
+				     for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
+	  return gen_rtx_PLUS (Pmode, base, off);
+	}
+      else
+	{
+	  base = get_thread_pointer (Pmode, true);
+	  dest = gen_reg_rtx (Pmode);
+	  emit_insn (ix86_gen_sub3 (dest, base, off));
+	}
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return dest;
+}
+
+/* Create or return the unique __imp_DECL dllimport symbol corresponding
+   to symbol DECL if BEIMPORT is true.  Otherwise create or return the
+   unique refptr-DECL symbol corresponding to symbol DECL.  */
+
+static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
+  htab_t dllimport_map;
+
+static tree
+get_dllimport_decl (tree decl, bool beimport)
+{
+  struct tree_map *h, in;
+  void **loc;
+  const char *name;
+  const char *prefix;
+  size_t namelen, prefixlen;
+  char *imp_name;
+  tree to;
+  rtx rtl;
+
+  if (!dllimport_map)
+    dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
+
+  in.hash = htab_hash_pointer (decl);
+  in.base.from = decl;
+  loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
+  h = (struct tree_map *) *loc;
+  if (h)
+    return h->to;
+
+  *loc = h = ggc_alloc_tree_map ();
+  h->hash = in.hash;
+  h->base.from = decl;
+  h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
+			   VAR_DECL, NULL, ptr_type_node);
+  DECL_ARTIFICIAL (to) = 1;
+  DECL_IGNORED_P (to) = 1;
+  DECL_EXTERNAL (to) = 1;
+  TREE_READONLY (to) = 1;
+
+  name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
+  name = targetm.strip_name_encoding (name);
+  if (beimport)
+    prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
+      ? "*__imp_" : "*__imp__";
+  else
+    prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
+  namelen = strlen (name);
+  prefixlen = strlen (prefix);
+  imp_name = (char *) alloca (namelen + prefixlen + 1);
+  memcpy (imp_name, prefix, prefixlen);
+  memcpy (imp_name + prefixlen, name, namelen + 1);
+
+  name = ggc_alloc_string (imp_name, namelen + prefixlen);
+  rtl = gen_rtx_SYMBOL_REF (Pmode, name);
+  SET_SYMBOL_REF_DECL (rtl, to);
+  SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
+  if (!beimport)
+    {
+      SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
+#ifdef SUB_TARGET_RECORD_STUB
+      SUB_TARGET_RECORD_STUB (name);
+#endif
+    }      
+
+  rtl = gen_const_mem (Pmode, rtl);
+  set_mem_alias_set (rtl, ix86_GOT_alias_set ());
+
+  SET_DECL_RTL (to, rtl);
+  SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
+
+  return to;
+}
+
+/* Expand SYMBOL into its corresponding far-addresse symbol.
+   WANT_REG is true if we require the result be a register.  */
+
+static rtx
+legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
+{
+  tree imp_decl;
+  rtx x;
+
+  gcc_assert (SYMBOL_REF_DECL (symbol));
+  imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
+
+  x = DECL_RTL (imp_decl);
+  if (want_reg)
+    x = force_reg (Pmode, x);
+  return x;
+}
+
+/* Expand SYMBOL into its corresponding dllimport symbol.  WANT_REG is
+   true if we require the result be a register.  */
+
+static rtx
+legitimize_dllimport_symbol (rtx symbol, bool want_reg)
+{
+  tree imp_decl;
+  rtx x;
+
+  gcc_assert (SYMBOL_REF_DECL (symbol));
+  imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
+
+  x = DECL_RTL (imp_decl);
+  if (want_reg)
+    x = force_reg (Pmode, x);
+  return x;
+}
+
+/* Expand SYMBOL into its corresponding dllimport or refptr symbol.  WANT_REG 
+   is true if we require the result be a register.  */
+
+static rtx
+legitimize_pe_coff_symbol (rtx addr, bool inreg)
+{
+  if (!TARGET_PECOFF)
+    return NULL_RTX;
+
+  if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
+    {
+      if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
+	return legitimize_dllimport_symbol (addr, inreg);
+      if (GET_CODE (addr) == CONST
+	  && GET_CODE (XEXP (addr, 0)) == PLUS
+	  && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
+	  && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
+	{
+	  rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
+	  return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
+	}
+    }
+
+  if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
+    return NULL_RTX;
+  if (GET_CODE (addr) == SYMBOL_REF
+      && !is_imported_p (addr)
+      && SYMBOL_REF_EXTERNAL_P (addr)
+      && SYMBOL_REF_DECL (addr))
+    return legitimize_pe_coff_extern_decl (addr, inreg);
+
+  if (GET_CODE (addr) == CONST
+      && GET_CODE (XEXP (addr, 0)) == PLUS
+      && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
+      && !is_imported_p (XEXP (XEXP (addr, 0), 0))
+      && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
+      && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
+    {
+      rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
+      return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
+    }
+  return NULL_RTX;
+}
+
+/* Try machine-dependent ways of modifying an illegitimate address
+   to be legitimate.  If we find one, return the new, valid address.
+   This macro is used in only one place: `memory_address' in explow.c.
+
+   OLDX is the address as it was before break_out_memory_refs was called.
+   In some cases it is useful to look at this to decide what needs to be done.
+
+   It is always safe for this macro to do nothing.  It exists to recognize
+   opportunities to optimize the output.
+
+   For the 80386, we handle X+REG by loading X into a register R and
+   using R+REG.  R will go in a general reg and indexing will be used.
+   However, if REG is a broken-out memory address or multiplication,
+   nothing needs to be done because REG can certainly go in a general reg.
+
+   When -fpic is used, special handling is needed for symbolic references.
+   See comments by legitimize_pic_address in i386.c for details.  */
+
+static rtx
+ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
+			 enum machine_mode mode)
+{
+  int changed = 0;
+  unsigned log;
+
+  log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
+  if (log)
+    return legitimize_tls_address (x, (enum tls_model) log, false);
+  if (GET_CODE (x) == CONST
+      && GET_CODE (XEXP (x, 0)) == PLUS
+      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
+      && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
+    {
+      rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
+				      (enum tls_model) log, false);
+      return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
+    }
+
+  if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
+    {
+      rtx tmp = legitimize_pe_coff_symbol (x, true);
+      if (tmp)
+        return tmp;
+    }
+
+  if (flag_pic && SYMBOLIC_CONST (x))
+    return legitimize_pic_address (x, 0);
+
+#if TARGET_MACHO
+  if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
+    return machopic_indirect_data_reference (x, 0);
+#endif
+
+  /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
+  if (GET_CODE (x) == ASHIFT
+      && CONST_INT_P (XEXP (x, 1))
+      && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
+    {
+      changed = 1;
+      log = INTVAL (XEXP (x, 1));
+      x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
+			GEN_INT (1 << log));
+    }
+
+  if (GET_CODE (x) == PLUS)
+    {
+      /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
+
+      if (GET_CODE (XEXP (x, 0)) == ASHIFT
+	  && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+	  && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
+	{
+	  changed = 1;
+	  log = INTVAL (XEXP (XEXP (x, 0), 1));
+	  XEXP (x, 0) = gen_rtx_MULT (Pmode,
+				      force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
+				      GEN_INT (1 << log));
+	}
+
+      if (GET_CODE (XEXP (x, 1)) == ASHIFT
+	  && CONST_INT_P (XEXP (XEXP (x, 1), 1))
+	  && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
+	{
+	  changed = 1;
+	  log = INTVAL (XEXP (XEXP (x, 1), 1));
+	  XEXP (x, 1) = gen_rtx_MULT (Pmode,
+				      force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
+				      GEN_INT (1 << log));
+	}
+
+      /* Put multiply first if it isn't already.  */
+      if (GET_CODE (XEXP (x, 1)) == MULT)
+	{
+	  rtx tmp = XEXP (x, 0);
+	  XEXP (x, 0) = XEXP (x, 1);
+	  XEXP (x, 1) = tmp;
+	  changed = 1;
+	}
+
+      /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
+	 into (plus (plus (mult (reg) (const)) (reg)) (const)).  This can be
+	 created by virtual register instantiation, register elimination, and
+	 similar optimizations.  */
+      if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
+	{
+	  changed = 1;
+	  x = gen_rtx_PLUS (Pmode,
+			    gen_rtx_PLUS (Pmode, XEXP (x, 0),
+					  XEXP (XEXP (x, 1), 0)),
+			    XEXP (XEXP (x, 1), 1));
+	}
+
+      /* Canonicalize
+	 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
+	 into (plus (plus (mult (reg) (const)) (reg)) (const)).  */
+      else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
+	       && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
+	       && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
+	       && CONSTANT_P (XEXP (x, 1)))
+	{
+	  rtx constant;
+	  rtx other = NULL_RTX;
+
+	  if (CONST_INT_P (XEXP (x, 1)))
+	    {
+	      constant = XEXP (x, 1);
+	      other = XEXP (XEXP (XEXP (x, 0), 1), 1);
+	    }
+	  else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
+	    {
+	      constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
+	      other = XEXP (x, 1);
+	    }
+	  else
+	    constant = 0;
+
+	  if (constant)
+	    {
+	      changed = 1;
+	      x = gen_rtx_PLUS (Pmode,
+				gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
+					      XEXP (XEXP (XEXP (x, 0), 1), 0)),
+				plus_constant (Pmode, other,
+					       INTVAL (constant)));
+	    }
+	}
+
+      if (changed && ix86_legitimate_address_p (mode, x, false))
+	return x;
+
+      if (GET_CODE (XEXP (x, 0)) == MULT)
+	{
+	  changed = 1;
+	  XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
+	}
+
+      if (GET_CODE (XEXP (x, 1)) == MULT)
+	{
+	  changed = 1;
+	  XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
+	}
+
+      if (changed
+	  && REG_P (XEXP (x, 1))
+	  && REG_P (XEXP (x, 0)))
+	return x;
+
+      if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
+	{
+	  changed = 1;
+	  x = legitimize_pic_address (x, 0);
+	}
+
+      if (changed && ix86_legitimate_address_p (mode, x, false))
+	return x;
+
+      if (REG_P (XEXP (x, 0)))
+	{
+	  rtx temp = gen_reg_rtx (Pmode);
+	  rtx val  = force_operand (XEXP (x, 1), temp);
+	  if (val != temp)
+	    {
+	      val = convert_to_mode (Pmode, val, 1);
+	      emit_move_insn (temp, val);
+	    }
+
+	  XEXP (x, 1) = temp;
+	  return x;
+	}
+
+      else if (REG_P (XEXP (x, 1)))
+	{
+	  rtx temp = gen_reg_rtx (Pmode);
+	  rtx val  = force_operand (XEXP (x, 0), temp);
+	  if (val != temp)
+	    {
+	      val = convert_to_mode (Pmode, val, 1);
+	      emit_move_insn (temp, val);
+	    }
+
+	  XEXP (x, 0) = temp;
+	  return x;
+	}
+    }
+
+  return x;
+}
+
+/* Print an integer constant expression in assembler syntax.  Addition
+   and subtraction are the only arithmetic that may appear in these
+   expressions.  FILE is the stdio stream to write to, X is the rtx, and
+   CODE is the operand print code from the output string.  */
+
+static void
+output_pic_addr_const (FILE *file, rtx x, int code)
+{
+  char buf[256];
+
+  switch (GET_CODE (x))
+    {
+    case PC:
+      gcc_assert (flag_pic);
+      putc ('.', file);
+      break;
+
+    case SYMBOL_REF:
+      if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
+	output_addr_const (file, x);
+      else
+	{
+	  const char *name = XSTR (x, 0);
+
+	  /* Mark the decl as referenced so that cgraph will
+	     output the function.  */
+	  if (SYMBOL_REF_DECL (x))
+	    mark_decl_referenced (SYMBOL_REF_DECL (x));
+
+#if TARGET_MACHO
+	  if (MACHOPIC_INDIRECT
+	      && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
+	    name = machopic_indirection_name (x, /*stub_p=*/true);
+#endif
+	  assemble_name (file, name);
+	}
+      if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
+	  && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
+	fputs ("@PLT", file);
+      break;
+
+    case LABEL_REF:
+      x = XEXP (x, 0);
+      /* FALLTHRU */
+    case CODE_LABEL:
+      ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
+      assemble_name (asm_out_file, buf);
+      break;
+
+    case CONST_INT:
+      fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
+      break;
+
+    case CONST:
+      /* This used to output parentheses around the expression,
+	 but that does not work on the 386 (either ATT or BSD assembler).  */
+      output_pic_addr_const (file, XEXP (x, 0), code);
+      break;
+
+    case CONST_DOUBLE:
+      if (GET_MODE (x) == VOIDmode)
+	{
+	  /* We can use %d if the number is <32 bits and positive.  */
+	  if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
+	    fprintf (file, "0x%lx%08lx",
+		     (unsigned long) CONST_DOUBLE_HIGH (x),
+		     (unsigned long) CONST_DOUBLE_LOW (x));
+	  else
+	    fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
+	}
+      else
+	/* We can't handle floating point constants;
+	   TARGET_PRINT_OPERAND must handle them.  */
+	output_operand_lossage ("floating constant misused");
+      break;
+
+    case PLUS:
+      /* Some assemblers need integer constants to appear first.  */
+      if (CONST_INT_P (XEXP (x, 0)))
+	{
+	  output_pic_addr_const (file, XEXP (x, 0), code);
+	  putc ('+', file);
+	  output_pic_addr_const (file, XEXP (x, 1), code);
+	}
+      else
+	{
+	  gcc_assert (CONST_INT_P (XEXP (x, 1)));
+	  output_pic_addr_const (file, XEXP (x, 1), code);
+	  putc ('+', file);
+	  output_pic_addr_const (file, XEXP (x, 0), code);
+	}
+      break;
+
+    case MINUS:
+      if (!TARGET_MACHO)
+	putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
+      output_pic_addr_const (file, XEXP (x, 0), code);
+      putc ('-', file);
+      output_pic_addr_const (file, XEXP (x, 1), code);
+      if (!TARGET_MACHO)
+	putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
+      break;
+
+     case UNSPEC:
+       if (XINT (x, 1) == UNSPEC_STACK_CHECK)
+	 {
+	   bool f = i386_asm_output_addr_const_extra (file, x);
+	   gcc_assert (f);
+	   break;
+	 }
+
+       gcc_assert (XVECLEN (x, 0) == 1);
+       output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
+       switch (XINT (x, 1))
+	{
+	case UNSPEC_GOT:
+	  fputs ("@GOT", file);
+	  break;
+	case UNSPEC_GOTOFF:
+	  fputs ("@GOTOFF", file);
+	  break;
+	case UNSPEC_PLTOFF:
+	  fputs ("@PLTOFF", file);
+	  break;
+	case UNSPEC_PCREL:
+	  fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+		 "(%rip)" : "[rip]", file);
+	  break;
+	case UNSPEC_GOTPCREL:
+	  fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+		 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
+	  break;
+	case UNSPEC_GOTTPOFF:
+	  /* FIXME: This might be @TPOFF in Sun ld too.  */
+	  fputs ("@gottpoff", file);
+	  break;
+	case UNSPEC_TPOFF:
+	  fputs ("@tpoff", file);
+	  break;
+	case UNSPEC_NTPOFF:
+	  if (TARGET_64BIT)
+	    fputs ("@tpoff", file);
+	  else
+	    fputs ("@ntpoff", file);
+	  break;
+	case UNSPEC_DTPOFF:
+	  fputs ("@dtpoff", file);
+	  break;
+	case UNSPEC_GOTNTPOFF:
+	  if (TARGET_64BIT)
+	    fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+		   "@gottpoff(%rip)": "@gottpoff[rip]", file);
+	  else
+	    fputs ("@gotntpoff", file);
+	  break;
+	case UNSPEC_INDNTPOFF:
+	  fputs ("@indntpoff", file);
+	  break;
+#if TARGET_MACHO
+	case UNSPEC_MACHOPIC_OFFSET:
+	  putc ('-', file);
+	  machopic_output_function_base_name (file);
+	  break;
+#endif
+	default:
+	  output_operand_lossage ("invalid UNSPEC as operand");
+	  break;
+	}
+       break;
+
+    default:
+      output_operand_lossage ("invalid expression as operand");
+    }
+}
+
+/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
+   We need to emit DTP-relative relocations.  */
+
+static void ATTRIBUTE_UNUSED
+i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
+{
+  fputs (ASM_LONG, file);
+  output_addr_const (file, x);
+  fputs ("@dtpoff", file);
+  switch (size)
+    {
+    case 4:
+      break;
+    case 8:
+      fputs (", 0", file);
+      break;
+    default:
+      gcc_unreachable ();
+   }
+}
+
+/* Return true if X is a representation of the PIC register.  This copes
+   with calls from ix86_find_base_term, where the register might have
+   been replaced by a cselib value.  */
+
+static bool
+ix86_pic_register_p (rtx x)
+{
+  if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
+    return (pic_offset_table_rtx
+	    && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
+  else
+    return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
+}
+
+/* Helper function for ix86_delegitimize_address.
+   Attempt to delegitimize TLS local-exec accesses.  */
+
+static rtx
+ix86_delegitimize_tls_address (rtx orig_x)
+{
+  rtx x = orig_x, unspec;
+  struct ix86_address addr;
+
+  if (!TARGET_TLS_DIRECT_SEG_REFS)
+    return orig_x;
+  if (MEM_P (x))
+    x = XEXP (x, 0);
+  if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
+    return orig_x;
+  if (ix86_decompose_address (x, &addr) == 0
+      || addr.seg != DEFAULT_TLS_SEG_REG
+      || addr.disp == NULL_RTX
+      || GET_CODE (addr.disp) != CONST)
+    return orig_x;
+  unspec = XEXP (addr.disp, 0);
+  if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
+    unspec = XEXP (unspec, 0);
+  if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
+    return orig_x;
+  x = XVECEXP (unspec, 0, 0);
+  gcc_assert (GET_CODE (x) == SYMBOL_REF);
+  if (unspec != XEXP (addr.disp, 0))
+    x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
+  if (addr.index)
+    {
+      rtx idx = addr.index;
+      if (addr.scale != 1)
+	idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
+      x = gen_rtx_PLUS (Pmode, idx, x);
+    }
+  if (addr.base)
+    x = gen_rtx_PLUS (Pmode, addr.base, x);
+  if (MEM_P (orig_x))
+    x = replace_equiv_address_nv (orig_x, x);
+  return x;
+}
+
+/* In the name of slightly smaller debug output, and to cater to
+   general assembler lossage, recognize PIC+GOTOFF and turn it back
+   into a direct symbol reference.
+
+   On Darwin, this is necessary to avoid a crash, because Darwin
+   has a different PIC label for each routine but the DWARF debugging
+   information is not associated with any particular routine, so it's
+   necessary to remove references to the PIC label from RTL stored by
+   the DWARF output code.  */
+
+static rtx
+ix86_delegitimize_address (rtx x)
+{
+  rtx orig_x = delegitimize_mem_from_attrs (x);
+  /* addend is NULL or some rtx if x is something+GOTOFF where
+     something doesn't include the PIC register.  */
+  rtx addend = NULL_RTX;
+  /* reg_addend is NULL or a multiple of some register.  */
+  rtx reg_addend = NULL_RTX;
+  /* const_addend is NULL or a const_int.  */
+  rtx const_addend = NULL_RTX;
+  /* This is the result, or NULL.  */
+  rtx result = NULL_RTX;
+
+  x = orig_x;
+
+  if (MEM_P (x))
+    x = XEXP (x, 0);
+
+  if (TARGET_64BIT)
+    {
+      if (GET_CODE (x) == CONST
+          && GET_CODE (XEXP (x, 0)) == PLUS
+          && GET_MODE (XEXP (x, 0)) == Pmode
+          && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+          && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
+          && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
+        {
+	  rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
+	  x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
+	  if (MEM_P (orig_x))
+	    x = replace_equiv_address_nv (orig_x, x);
+	  return x;
+	}
+
+      if (GET_CODE (x) == CONST
+	  && GET_CODE (XEXP (x, 0)) == UNSPEC
+	  && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
+	      || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
+	  && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
+	{
+	  x = XVECEXP (XEXP (x, 0), 0, 0);
+	  if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
+	    {
+	      x = simplify_gen_subreg (GET_MODE (orig_x), x,
+				       GET_MODE (x), 0);
+	      if (x == NULL_RTX)
+		return orig_x;
+	    }
+	  return x;
+	}
+
+      if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
+	return ix86_delegitimize_tls_address (orig_x);
+
+      /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
+	 and -mcmodel=medium -fpic.  */
+    }
+
+  if (GET_CODE (x) != PLUS
+      || GET_CODE (XEXP (x, 1)) != CONST)
+    return ix86_delegitimize_tls_address (orig_x);
+
+  if (ix86_pic_register_p (XEXP (x, 0)))
+    /* %ebx + GOT/GOTOFF */
+    ;
+  else if (GET_CODE (XEXP (x, 0)) == PLUS)
+    {
+      /* %ebx + %reg * scale + GOT/GOTOFF */
+      reg_addend = XEXP (x, 0);
+      if (ix86_pic_register_p (XEXP (reg_addend, 0)))
+	reg_addend = XEXP (reg_addend, 1);
+      else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
+	reg_addend = XEXP (reg_addend, 0);
+      else
+	{
+	  reg_addend = NULL_RTX;
+	  addend = XEXP (x, 0);
+	}
+    }
+  else
+    addend = XEXP (x, 0);
+
+  x = XEXP (XEXP (x, 1), 0);
+  if (GET_CODE (x) == PLUS
+      && CONST_INT_P (XEXP (x, 1)))
+    {
+      const_addend = XEXP (x, 1);
+      x = XEXP (x, 0);
+    }
+
+  if (GET_CODE (x) == UNSPEC
+      && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
+	  || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
+	  || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
+	      && !MEM_P (orig_x) && !addend)))
+    result = XVECEXP (x, 0, 0);
+
+  if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
+      && !MEM_P (orig_x))
+    result = XVECEXP (x, 0, 0);
+
+  if (! result)
+    return ix86_delegitimize_tls_address (orig_x);
+
+  if (const_addend)
+    result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
+  if (reg_addend)
+    result = gen_rtx_PLUS (Pmode, reg_addend, result);
+  if (addend)
+    {
+      /* If the rest of original X doesn't involve the PIC register, add
+	 addend and subtract pic_offset_table_rtx.  This can happen e.g.
+	 for code like:
+	 leal (%ebx, %ecx, 4), %ecx
+	 ...
+	 movl foo@GOTOFF(%ecx), %edx
+	 in which case we return (%ecx - %ebx) + foo.  */
+      if (pic_offset_table_rtx)
+        result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
+						     pic_offset_table_rtx),
+			       result);
+      else
+	return orig_x;
+    }
+  if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
+    {
+      result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
+      if (result == NULL_RTX)
+	return orig_x;
+    }
+  return result;
+}
+
+/* If X is a machine specific address (i.e. a symbol or label being
+   referenced as a displacement from the GOT implemented using an
+   UNSPEC), then return the base term.  Otherwise return X.  */
+
+rtx
+ix86_find_base_term (rtx x)
+{
+  rtx term;
+
+  if (TARGET_64BIT)
+    {
+      if (GET_CODE (x) != CONST)
+	return x;
+      term = XEXP (x, 0);
+      if (GET_CODE (term) == PLUS
+	  && (CONST_INT_P (XEXP (term, 1))
+	      || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
+	term = XEXP (term, 0);
+      if (GET_CODE (term) != UNSPEC
+	  || (XINT (term, 1) != UNSPEC_GOTPCREL
+	      && XINT (term, 1) != UNSPEC_PCREL))
+	return x;
+
+      return XVECEXP (term, 0, 0);
+    }
+
+  return ix86_delegitimize_address (x);
+}
+
+static void
+put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
+		    bool fp, FILE *file)
+{
+  const char *suffix;
+
+  if (mode == CCFPmode || mode == CCFPUmode)
+    {
+      code = ix86_fp_compare_code_to_integer (code);
+      mode = CCmode;
+    }
+  if (reverse)
+    code = reverse_condition (code);
+
+  switch (code)
+    {
+    case EQ:
+      switch (mode)
+	{
+	case CCAmode:
+	  suffix = "a";
+	  break;
+
+	case CCCmode:
+	  suffix = "c";
+	  break;
+
+	case CCOmode:
+	  suffix = "o";
+	  break;
+
+	case CCSmode:
+	  suffix = "s";
+	  break;
+
+	default:
+	  suffix = "e";
+	}
+      break;
+    case NE:
+      switch (mode)
+	{
+	case CCAmode:
+	  suffix = "na";
+	  break;
+
+	case CCCmode:
+	  suffix = "nc";
+	  break;
+
+	case CCOmode:
+	  suffix = "no";
+	  break;
+
+	case CCSmode:
+	  suffix = "ns";
+	  break;
+
+	default:
+	  suffix = "ne";
+	}
+      break;
+    case GT:
+      gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
+      suffix = "g";
+      break;
+    case GTU:
+      /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
+	 Those same assemblers have the same but opposite lossage on cmov.  */
+      if (mode == CCmode)
+	suffix = fp ? "nbe" : "a";
+      else
+	gcc_unreachable ();
+      break;
+    case LT:
+      switch (mode)
+	{
+	case CCNOmode:
+	case CCGOCmode:
+	  suffix = "s";
+	  break;
+
+	case CCmode:
+	case CCGCmode:
+	  suffix = "l";
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+      break;
+    case LTU:
+      if (mode == CCmode)
+	suffix = "b";
+      else if (mode == CCCmode)
+	suffix = "c";
+      else
+	gcc_unreachable ();
+      break;
+    case GE:
+      switch (mode)
+	{
+	case CCNOmode:
+	case CCGOCmode:
+	  suffix = "ns";
+	  break;
+
+	case CCmode:
+	case CCGCmode:
+	  suffix = "ge";
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+      break;
+    case GEU:
+      if (mode == CCmode)
+	suffix = fp ? "nb" : "ae";
+      else if (mode == CCCmode)
+	suffix = "nc";
+      else
+	gcc_unreachable ();
+      break;
+    case LE:
+      gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
+      suffix = "le";
+      break;
+    case LEU:
+      if (mode == CCmode)
+	suffix = "be";
+      else
+	gcc_unreachable ();
+      break;
+    case UNORDERED:
+      suffix = fp ? "u" : "p";
+      break;
+    case ORDERED:
+      suffix = fp ? "nu" : "np";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  fputs (suffix, file);
+}
+
+/* Print the name of register X to FILE based on its machine mode and number.
+   If CODE is 'w', pretend the mode is HImode.
+   If CODE is 'b', pretend the mode is QImode.
+   If CODE is 'k', pretend the mode is SImode.
+   If CODE is 'q', pretend the mode is DImode.
+   If CODE is 'x', pretend the mode is V4SFmode.
+   If CODE is 't', pretend the mode is V8SFmode.
+   If CODE is 'g', pretend the mode is V16SFmode.
+   If CODE is 'h', pretend the reg is the 'high' byte register.
+   If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
+   If CODE is 'd', duplicate the operand for AVX instruction.
+ */
+
+void
+print_reg (rtx x, int code, FILE *file)
+{
+  const char *reg;
+  unsigned int regno;
+  bool duplicated = code == 'd' && TARGET_AVX;
+
+  if (ASSEMBLER_DIALECT == ASM_ATT)
+    putc ('%', file);
+
+  if (x == pc_rtx)
+    {
+      gcc_assert (TARGET_64BIT);
+      fputs ("rip", file);
+      return;
+    }
+
+  regno = true_regnum (x);
+  gcc_assert (regno != ARG_POINTER_REGNUM
+	      && regno != FRAME_POINTER_REGNUM
+	      && regno != FLAGS_REG
+	      && regno != FPSR_REG
+	      && regno != FPCR_REG);
+
+  if (code == 'w' || MMX_REG_P (x))
+    code = 2;
+  else if (code == 'b')
+    code = 1;
+  else if (code == 'k')
+    code = 4;
+  else if (code == 'q')
+    code = 8;
+  else if (code == 'y')
+    code = 3;
+  else if (code == 'h')
+    code = 0;
+  else if (code == 'x')
+    code = 16;
+  else if (code == 't')
+    code = 32;
+  else if (code == 'g')
+    code = 64;
+  else
+    code = GET_MODE_SIZE (GET_MODE (x));
+
+  /* Irritatingly, AMD extended registers use different naming convention
+     from the normal registers: "r%d[bwd]"  */
+  if (REX_INT_REGNO_P (regno))
+    {
+      gcc_assert (TARGET_64BIT);
+      putc ('r', file);
+      fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
+      switch (code)
+	{
+	  case 0:
+	    error ("extended registers have no high halves");
+	    break;
+	  case 1:
+	    putc ('b', file);
+	    break;
+	  case 2:
+	    putc ('w', file);
+	    break;
+	  case 4:
+	    putc ('d', file);
+	    break;
+	  case 8:
+	    /* no suffix */
+	    break;
+	  default:
+	    error ("unsupported operand size for extended register");
+	    break;
+	}
+      return;
+    }
+
+  reg = NULL;
+  switch (code)
+    {
+    case 3:
+      if (STACK_TOP_P (x))
+	{
+	  reg = "st(0)";
+	  break;
+	}
+      /* FALLTHRU */
+    case 8:
+    case 4:
+    case 12:
+      if (! ANY_FP_REG_P (x))
+	putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
+      /* FALLTHRU */
+    case 16:
+    case 2:
+    normal:
+      reg = hi_reg_name[regno];
+      break;
+    case 1:
+      if (regno >= ARRAY_SIZE (qi_reg_name))
+	goto normal;
+      reg = qi_reg_name[regno];
+      break;
+    case 0:
+      if (regno >= ARRAY_SIZE (qi_high_reg_name))
+	goto normal;
+      reg = qi_high_reg_name[regno];
+      break;
+    case 32:
+      if (SSE_REG_P (x))
+	{
+	  gcc_assert (!duplicated);
+	  putc ('y', file);
+	  fputs (hi_reg_name[regno] + 1, file);
+	  return;
+	}
+    case 64:
+      if (SSE_REG_P (x))
+        {
+          gcc_assert (!duplicated);
+          putc ('z', file);
+          fputs (hi_reg_name[REGNO (x)] + 1, file);
+          return;
+        }
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  fputs (reg, file);
+  if (duplicated)
+    {
+      if (ASSEMBLER_DIALECT == ASM_ATT)
+	fprintf (file, ", %%%s", reg);
+      else
+	fprintf (file, ", %s", reg);
+    }
+}
+
+/* Locate some local-dynamic symbol still in use by this function
+   so that we can print its name in some tls_local_dynamic_base
+   pattern.  */
+
+static int
+get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
+{
+  rtx x = *px;
+
+  if (GET_CODE (x) == SYMBOL_REF
+      && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
+    {
+      cfun->machine->some_ld_name = XSTR (x, 0);
+      return 1;
+    }
+
+  return 0;
+}
+
+static const char *
+get_some_local_dynamic_name (void)
+{
+  rtx insn;
+
+  if (cfun->machine->some_ld_name)
+    return cfun->machine->some_ld_name;
+
+  for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
+    if (NONDEBUG_INSN_P (insn)
+	&& for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
+      return cfun->machine->some_ld_name;
+
+  return NULL;
+}
+
+/* Meaning of CODE:
+   L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
+   C -- print opcode suffix for set/cmov insn.
+   c -- like C, but print reversed condition
+   F,f -- likewise, but for floating-point.
+   O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
+	otherwise nothing
+   R -- print embeded rounding and sae.
+   r -- print only sae.
+   z -- print the opcode suffix for the size of the current operand.
+   Z -- likewise, with special suffixes for x87 instructions.
+   * -- print a star (in certain assembler syntax)
+   A -- print an absolute memory reference.
+   E -- print address with DImode register names if TARGET_64BIT.
+   w -- print the operand as if it's a "word" (HImode) even if it isn't.
+   s -- print a shift double count, followed by the assemblers argument
+	delimiter.
+   b -- print the QImode name of the register for the indicated operand.
+	%b0 would print %al if operands[0] is reg 0.
+   w --  likewise, print the HImode name of the register.
+   k --  likewise, print the SImode name of the register.
+   q --  likewise, print the DImode name of the register.
+   x --  likewise, print the V4SFmode name of the register.
+   t --  likewise, print the V8SFmode name of the register.
+   g --  likewise, print the V16SFmode name of the register.
+   h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
+   y -- print "st(0)" instead of "st" as a register.
+   d -- print duplicated register operand for AVX instruction.
+   D -- print condition for SSE cmp instruction.
+   P -- if PIC, print an @PLT suffix.
+   p -- print raw symbol name.
+   X -- don't print any sort of PIC '@' suffix for a symbol.
+   & -- print some in-use local-dynamic symbol name.
+   H -- print a memory address offset by 8; used for sse high-parts
+   Y -- print condition for XOP pcom* instruction.
+   + -- print a branch hint as 'cs' or 'ds' prefix
+   ; -- print a semicolon (after prefixes due to bug in older gas).
+   ~ -- print "i" if TARGET_AVX2, "f" otherwise.
+   @ -- print a segment register of thread base pointer load
+   ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
+ */
+
+void
+ix86_print_operand (FILE *file, rtx x, int code)
+{
+  if (code)
+    {
+      switch (code)
+	{
+	case 'A':
+	  switch (ASSEMBLER_DIALECT)
+	    {
+	    case ASM_ATT:
+	      putc ('*', file);
+	      break;
+
+	    case ASM_INTEL:
+	      /* Intel syntax. For absolute addresses, registers should not
+		 be surrounded by braces.  */
+	      if (!REG_P (x))
+		{
+		  putc ('[', file);
+		  ix86_print_operand (file, x, 0);
+		  putc (']', file);
+		  return;
+		}
+	      break;
+
+	    default:
+	      gcc_unreachable ();
+	    }
+
+	  ix86_print_operand (file, x, 0);
+	  return;
+
+	case 'E':
+	  /* Wrap address in an UNSPEC to declare special handling.  */
+	  if (TARGET_64BIT)
+	    x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
+
+	  output_address (x);
+	  return;
+
+	case 'L':
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    putc ('l', file);
+	  return;
+
+	case 'W':
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    putc ('w', file);
+	  return;
+
+	case 'B':
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    putc ('b', file);
+	  return;
+
+	case 'Q':
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    putc ('l', file);
+	  return;
+
+	case 'S':
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    putc ('s', file);
+	  return;
+
+	case 'T':
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    putc ('t', file);
+	  return;
+
+	case 'O':
+#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
+	  if (ASSEMBLER_DIALECT != ASM_ATT)
+	    return;
+
+	  switch (GET_MODE_SIZE (GET_MODE (x)))
+	    {
+	    case 2:
+	      putc ('w', file);
+	      break;
+  
+	    case 4:
+	      putc ('l', file);
+	      break;
+
+	    case 8:
+	      putc ('q', file);
+	      break;
+
+	    default:
+	      output_operand_lossage
+		("invalid operand size for operand code 'O'");
+	      return;
+	    }
+
+	  putc ('.', file);
+#endif
+	  return;
+
+	case 'z':
+	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
+	    {
+	      /* Opcodes don't get size suffixes if using Intel opcodes.  */
+	      if (ASSEMBLER_DIALECT == ASM_INTEL)
+		return;
+
+	      switch (GET_MODE_SIZE (GET_MODE (x)))
+		{
+		case 1:
+		  putc ('b', file);
+		  return;
+
+		case 2:
+		  putc ('w', file);
+		  return;
+
+		case 4:
+		  putc ('l', file);
+		  return;
+
+		case 8:
+		  putc ('q', file);
+		  return;
+
+		default:
+		  output_operand_lossage
+		    ("invalid operand size for operand code 'z'");
+		  return;
+		}
+	    }
+
+	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
+	    warning
+	      (0, "non-integer operand used with operand code 'z'");
+	  /* FALLTHRU */
+
+	case 'Z':
+	  /* 387 opcodes don't get size suffixes if using Intel opcodes.  */
+	  if (ASSEMBLER_DIALECT == ASM_INTEL)
+	    return;
+
+	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
+	    {
+	      switch (GET_MODE_SIZE (GET_MODE (x)))
+		{
+		case 2:
+#ifdef HAVE_AS_IX86_FILDS
+		  putc ('s', file);
+#endif
+		  return;
+
+		case 4:
+		  putc ('l', file);
+		  return;
+
+		case 8:
+#ifdef HAVE_AS_IX86_FILDQ
+		  putc ('q', file);
+#else
+		  fputs ("ll", file);
+#endif
+		  return;
+
+		default:
+		  break;
+		}
+	    }
+	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
+	    {
+	      /* 387 opcodes don't get size suffixes
+		 if the operands are registers.  */
+	      if (STACK_REG_P (x))
+		return;
+
+	      switch (GET_MODE_SIZE (GET_MODE (x)))
+		{
+		case 4:
+		  putc ('s', file);
+		  return;
+
+		case 8:
+		  putc ('l', file);
+		  return;
+
+		case 12:
+		case 16:
+		  putc ('t', file);
+		  return;
+
+		default:
+		  break;
+		}
+	    }
+	  else
+	    {
+	      output_operand_lossage
+		("invalid operand type used with operand code 'Z'");
+	      return;
+	    }
+
+	  output_operand_lossage
+	    ("invalid operand size for operand code 'Z'");
+	  return;
+
+	case 'd':
+	case 'b':
+	case 'w':
+	case 'k':
+	case 'q':
+	case 'h':
+	case 't':
+	case 'g':
+	case 'y':
+	case 'x':
+	case 'X':
+	case 'P':
+	case 'p':
+	  break;
+
+	case 's':
+	  if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
+	    {
+	      ix86_print_operand (file, x, 0);
+	      fputs (", ", file);
+	    }
+	  return;
+
+	case 'Y':
+	  switch (GET_CODE (x))
+	    {
+	    case NE:
+	      fputs ("neq", file);
+	      break;
+	    case EQ:
+	      fputs ("eq", file);
+	      break;
+	    case GE:
+	    case GEU:
+	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
+	      break;
+	    case GT:
+	    case GTU:
+	      fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
+	      break;
+	    case LE:
+	    case LEU:
+	      fputs ("le", file);
+	      break;
+	    case LT:
+	    case LTU:
+	      fputs ("lt", file);
+	      break;
+	    case UNORDERED:
+	      fputs ("unord", file);
+	      break;
+	    case ORDERED:
+	      fputs ("ord", file);
+	      break;
+	    case UNEQ:
+	      fputs ("ueq", file);
+	      break;
+	    case UNGE:
+	      fputs ("nlt", file);
+	      break;
+	    case UNGT:
+	      fputs ("nle", file);
+	      break;
+	    case UNLE:
+	      fputs ("ule", file);
+	      break;
+	    case UNLT:
+	      fputs ("ult", file);
+	      break;
+	    case LTGT:
+	      fputs ("une", file);
+	      break;
+	    default:
+	      output_operand_lossage ("operand is not a condition code, "
+				      "invalid operand code 'Y'");
+	      return;
+	    }
+	  return;
+
+	case 'D':
+	  /* Little bit of braindamage here.  The SSE compare instructions
+	     does use completely different names for the comparisons that the
+	     fp conditional moves.  */
+	  switch (GET_CODE (x))
+	    {
+	    case UNEQ:
+	      if (TARGET_AVX)
+		{
+		  fputs ("eq_us", file);
+		  break;
+		}
+	    case EQ:
+	      fputs ("eq", file);
+	      break;
+	    case UNLT:
+	      if (TARGET_AVX)
+		{
+		  fputs ("nge", file);
+		  break;
+		}
+	    case LT:
+	      fputs ("lt", file);
+	      break;
+	    case UNLE:
+	      if (TARGET_AVX)
+		{
+		  fputs ("ngt", file);
+		  break;
+		}
+	    case LE:
+	      fputs ("le", file);
+	      break;
+	    case UNORDERED:
+	      fputs ("unord", file);
+	      break;
+	    case LTGT:
+	      if (TARGET_AVX)
+		{
+		  fputs ("neq_oq", file);
+		  break;
+		}
+	    case NE:
+	      fputs ("neq", file);
+	      break;
+	    case GE:
+	      if (TARGET_AVX)
+		{
+		  fputs ("ge", file);
+		  break;
+		}
+	    case UNGE:
+	      fputs ("nlt", file);
+	      break;
+	    case GT:
+	      if (TARGET_AVX)
+		{
+		  fputs ("gt", file);
+		  break;
+		}
+	    case UNGT:
+	      fputs ("nle", file);
+	      break;
+	    case ORDERED:
+	      fputs ("ord", file);
+	      break;
+	    default:
+	      output_operand_lossage ("operand is not a condition code, "
+				      "invalid operand code 'D'");
+	      return;
+	    }
+	  return;
+
+	case 'F':
+	case 'f':
+#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    putc ('.', file);
+#endif
+
+	case 'C':
+	case 'c':
+	  if (!COMPARISON_P (x))
+	    {
+	      output_operand_lossage ("operand is not a condition code, "
+				      "invalid operand code '%c'", code);
+	      return;
+	    }
+	  put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
+			      code == 'c' || code == 'f',
+			      code == 'F' || code == 'f',
+			      file);
+	  return;
+
+	case 'H':
+	  if (!offsettable_memref_p (x))
+	    {
+	      output_operand_lossage ("operand is not an offsettable memory "
+				      "reference, invalid operand code 'H'");
+	      return;
+	    }
+	  /* It doesn't actually matter what mode we use here, as we're
+	     only going to use this for printing.  */
+	  x = adjust_address_nv (x, DImode, 8);
+	  /* Output 'qword ptr' for intel assembler dialect.  */
+	  if (ASSEMBLER_DIALECT == ASM_INTEL)
+	    code = 'q';
+	  break;
+
+	case 'K':
+	  gcc_assert (CONST_INT_P (x));
+
+	  if (INTVAL (x) & IX86_HLE_ACQUIRE)
+#ifdef HAVE_AS_IX86_HLE
+	    fputs ("xacquire ", file);
+#else
+	    fputs ("\n" ASM_BYTE "0xf2\n\t", file);
+#endif
+	  else if (INTVAL (x) & IX86_HLE_RELEASE)
+#ifdef HAVE_AS_IX86_HLE
+	    fputs ("xrelease ", file);
+#else
+	    fputs ("\n" ASM_BYTE "0xf3\n\t", file);
+#endif
+	  /* We do not want to print value of the operand.  */
+	  return;
+
+	case 'N':
+	  if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
+	    fputs ("{z}", file);
+	  return;
+
+	case 'r':
+	  gcc_assert (CONST_INT_P (x));
+	  gcc_assert (INTVAL (x) == ROUND_SAE);
+
+	  if (ASSEMBLER_DIALECT == ASM_INTEL)
+	    fputs (", ", file);
+
+	  fputs ("{sae}", file);
+
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    fputs (", ", file);
+
+	  return;
+
+	case 'R':
+	  gcc_assert (CONST_INT_P (x));
+
+	  if (ASSEMBLER_DIALECT == ASM_INTEL)
+	    fputs (", ", file);
+
+	  switch (INTVAL (x))
+	    {
+	    case ROUND_NEAREST_INT | ROUND_SAE:
+	      fputs ("{rn-sae}", file);
+	      break;
+	    case ROUND_NEG_INF | ROUND_SAE:
+	      fputs ("{rd-sae}", file);
+	      break;
+	    case ROUND_POS_INF | ROUND_SAE:
+	      fputs ("{ru-sae}", file);
+	      break;
+	    case ROUND_ZERO | ROUND_SAE:
+	      fputs ("{rz-sae}", file);
+	      break;
+	    default:
+	      gcc_unreachable ();
+	    }
+
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    fputs (", ", file);
+
+	  return;
+
+	case '*':
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    putc ('*', file);
+	  return;
+
+	case '&':
+	  {
+	    const char *name = get_some_local_dynamic_name ();
+	    if (name == NULL)
+	      output_operand_lossage ("'%%&' used without any "
+				      "local dynamic TLS references");
+	    else
+	      assemble_name (file, name);
+	    return;
+	  }
+
+	case '+':
+	  {
+	    rtx x;
+
+	    if (!optimize
+	        || optimize_function_for_size_p (cfun)
+		|| !TARGET_BRANCH_PREDICTION_HINTS)
+	      return;
+
+	    x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
+	    if (x)
+	      {
+		int pred_val = XINT (x, 0);
+
+		if (pred_val < REG_BR_PROB_BASE * 45 / 100
+		    || pred_val > REG_BR_PROB_BASE * 55 / 100)
+		  {
+		    bool taken = pred_val > REG_BR_PROB_BASE / 2;
+		    bool cputaken
+		      = final_forward_branch_p (current_output_insn) == 0;
+
+		    /* Emit hints only in the case default branch prediction
+		       heuristics would fail.  */
+		    if (taken != cputaken)
+		      {
+			/* We use 3e (DS) prefix for taken branches and
+			   2e (CS) prefix for not taken branches.  */
+			if (taken)
+			  fputs ("ds ; ", file);
+			else
+			  fputs ("cs ; ", file);
+		      }
+		  }
+	      }
+	    return;
+	  }
+
+	case ';':
+#ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
+	  putc (';', file);
+#endif
+	  return;
+
+	case '@':
+	  if (ASSEMBLER_DIALECT == ASM_ATT)
+	    putc ('%', file);
+
+	  /* The kernel uses a different segment register for performance
+	     reasons; a system call would not have to trash the userspace
+	     segment register, which would be expensive.  */
+	  if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
+	    fputs ("fs", file);
+	  else
+	    fputs ("gs", file);
+	  return;
+
+	case '~':
+	  putc (TARGET_AVX2 ? 'i' : 'f', file);
+	  return;
+
+	case '^':
+	  if (TARGET_64BIT && Pmode != word_mode)
+	    fputs ("addr32 ", file);
+	  return;
+
+	default:
+	    output_operand_lossage ("invalid operand code '%c'", code);
+	}
+    }
+
+  if (REG_P (x))
+    print_reg (x, code, file);
+
+  else if (MEM_P (x))
+    {
+      /* No `byte ptr' prefix for call instructions or BLKmode operands.  */
+      if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
+	  && GET_MODE (x) != BLKmode)
+	{
+	  const char * size;
+	  switch (GET_MODE_SIZE (GET_MODE (x)))
+	    {
+	    case 1: size = "BYTE"; break;
+	    case 2: size = "WORD"; break;
+	    case 4: size = "DWORD"; break;
+	    case 8: size = "QWORD"; break;
+	    case 12: size = "TBYTE"; break;
+	    case 16:
+	      if (GET_MODE (x) == XFmode)
+		size = "TBYTE";
+              else
+		size = "XMMWORD";
+              break;
+	    case 32: size = "YMMWORD"; break;
+	    case 64: size = "ZMMWORD"; break;
+	    default:
+	      gcc_unreachable ();
+	    }
+
+	  /* Check for explicit size override (codes 'b', 'w', 'k',
+	     'q' and 'x')  */
+	  if (code == 'b')
+	    size = "BYTE";
+	  else if (code == 'w')
+	    size = "WORD";
+	  else if (code == 'k')
+	    size = "DWORD";
+	  else if (code == 'q')
+	    size = "QWORD";
+	  else if (code == 'x')
+	    size = "XMMWORD";
+
+	  fputs (size, file);
+	  fputs (" PTR ", file);
+	}
+
+      x = XEXP (x, 0);
+      /* Avoid (%rip) for call operands.  */
+      if (CONSTANT_ADDRESS_P (x) && code == 'P'
+	  && !CONST_INT_P (x))
+	output_addr_const (file, x);
+      else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
+	output_operand_lossage ("invalid constraints for operand");
+      else
+	output_address (x);
+    }
+
+  else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
+    {
+      REAL_VALUE_TYPE r;
+      long l;
+
+      REAL_VALUE_FROM_CONST_DOUBLE (r, x);
+      REAL_VALUE_TO_TARGET_SINGLE (r, l);
+
+      if (ASSEMBLER_DIALECT == ASM_ATT)
+	putc ('$', file);
+      /* Sign extend 32bit SFmode immediate to 8 bytes.  */
+      if (code == 'q')
+	fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
+		 (unsigned long long) (int) l);
+      else
+	fprintf (file, "0x%08x", (unsigned int) l);
+    }
+
+  else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
+    {
+      REAL_VALUE_TYPE r;
+      long l[2];
+
+      REAL_VALUE_FROM_CONST_DOUBLE (r, x);
+      REAL_VALUE_TO_TARGET_DOUBLE (r, l);
+
+      if (ASSEMBLER_DIALECT == ASM_ATT)
+	putc ('$', file);
+      fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
+    }
+
+  /* These float cases don't actually occur as immediate operands.  */
+  else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
+    {
+      char dstr[30];
+
+      real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
+      fputs (dstr, file);
+    }
+
+  else
+    {
+      /* We have patterns that allow zero sets of memory, for instance.
+	 In 64-bit mode, we should probably support all 8-byte vectors,
+	 since we can in fact encode that into an immediate.  */
+      if (GET_CODE (x) == CONST_VECTOR)
+	{
+	  gcc_assert (x == CONST0_RTX (GET_MODE (x)));
+	  x = const0_rtx;
+	}
+
+      if (code != 'P' && code != 'p')
+	{
+	  if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
+	    {
+	      if (ASSEMBLER_DIALECT == ASM_ATT)
+		putc ('$', file);
+	    }
+	  else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
+		   || GET_CODE (x) == LABEL_REF)
+	    {
+	      if (ASSEMBLER_DIALECT == ASM_ATT)
+		putc ('$', file);
+	      else
+		fputs ("OFFSET FLAT:", file);
+	    }
+	}
+      if (CONST_INT_P (x))
+	fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
+      else if (flag_pic || MACHOPIC_INDIRECT)
+	output_pic_addr_const (file, x, code);
+      else
+	output_addr_const (file, x);
+    }
+}
+
+static bool
+ix86_print_operand_punct_valid_p (unsigned char code)
+{
+  return (code == '@' || code == '*' || code == '+' || code == '&'
+	  || code == ';' || code == '~' || code == '^');
+}
+
+/* Print a memory operand whose address is ADDR.  */
+
+static void
+ix86_print_operand_address (FILE *file, rtx addr)
+{
+  struct ix86_address parts;
+  rtx base, index, disp;
+  int scale;
+  int ok;
+  bool vsib = false;
+  int code = 0;
+
+  if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
+    {
+      ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
+      gcc_assert (parts.index == NULL_RTX);
+      parts.index = XVECEXP (addr, 0, 1);
+      parts.scale = INTVAL (XVECEXP (addr, 0, 2));
+      addr = XVECEXP (addr, 0, 0);
+      vsib = true;
+    }
+  else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
+    {
+      gcc_assert (TARGET_64BIT);
+      ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
+      code = 'q';
+    }
+  else
+    ok = ix86_decompose_address (addr, &parts);
+
+  gcc_assert (ok);
+
+  base = parts.base;
+  index = parts.index;
+  disp = parts.disp;
+  scale = parts.scale;
+
+  switch (parts.seg)
+    {
+    case SEG_DEFAULT:
+      break;
+    case SEG_FS:
+    case SEG_GS:
+      if (ASSEMBLER_DIALECT == ASM_ATT)
+	putc ('%', file);
+      fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Use one byte shorter RIP relative addressing for 64bit mode.  */
+  if (TARGET_64BIT && !base && !index)
+    {
+      rtx symbol = disp;
+
+      if (GET_CODE (disp) == CONST
+	  && GET_CODE (XEXP (disp, 0)) == PLUS
+	  && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
+	symbol = XEXP (XEXP (disp, 0), 0);
+
+      if (GET_CODE (symbol) == LABEL_REF
+	  || (GET_CODE (symbol) == SYMBOL_REF
+	      && SYMBOL_REF_TLS_MODEL (symbol) == 0))
+	base = pc_rtx;
+    }
+  if (!base && !index)
+    {
+      /* Displacement only requires special attention.  */
+
+      if (CONST_INT_P (disp))
+	{
+	  if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
+	    fputs ("ds:", file);
+	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
+	}
+      else if (flag_pic)
+	output_pic_addr_const (file, disp, 0);
+      else
+	output_addr_const (file, disp);
+    }
+  else
+    {
+      /* Print SImode register names to force addr32 prefix.  */
+      if (SImode_address_operand (addr, VOIDmode))
+	{
+#ifdef ENABLE_CHECKING
+	  gcc_assert (TARGET_64BIT);
+	  switch (GET_CODE (addr))
+	    {
+	    case SUBREG:
+	      gcc_assert (GET_MODE (addr) == SImode);
+	      gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
+	      break;
+	    case ZERO_EXTEND:
+	    case AND:
+	      gcc_assert (GET_MODE (addr) == DImode);
+	      break;
+	    default:
+	      gcc_unreachable ();
+	    }
+#endif
+	  gcc_assert (!code);
+	  code = 'k';
+	}
+      else if (code == 0
+	       && TARGET_X32
+	       && disp
+	       && CONST_INT_P (disp)
+	       && INTVAL (disp) < -16*1024*1024)
+	{
+	  /* X32 runs in 64-bit mode, where displacement, DISP, in
+	     address DISP(%r64), is encoded as 32-bit immediate sign-
+	     extended from 32-bit to 64-bit.  For -0x40000300(%r64),
+	     address is %r64 + 0xffffffffbffffd00.  When %r64 <
+	     0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
+	     which is invalid for x32.  The correct address is %r64
+	     - 0x40000300 == 0xf7ffdd64.  To properly encode
+	     -0x40000300(%r64) for x32, we zero-extend negative
+	     displacement by forcing addr32 prefix which truncates
+	     0xfffffffff7ffdd64 to 0xf7ffdd64.  In theory, we should
+	     zero-extend all negative displacements, including -1(%rsp).
+	     However, for small negative displacements, sign-extension
+	     won't cause overflow.  We only zero-extend negative
+	     displacements if they < -16*1024*1024, which is also used
+	     to check legitimate address displacements for PIC.  */
+	  code = 'k';
+	}
+
+      if (ASSEMBLER_DIALECT == ASM_ATT)
+	{
+	  if (disp)
+	    {
+	      if (flag_pic)
+		output_pic_addr_const (file, disp, 0);
+	      else if (GET_CODE (disp) == LABEL_REF)
+		output_asm_label (disp);
+	      else
+		output_addr_const (file, disp);
+	    }
+
+	  putc ('(', file);
+	  if (base)
+	    print_reg (base, code, file);
+	  if (index)
+	    {
+	      putc (',', file);
+	      print_reg (index, vsib ? 0 : code, file);
+	      if (scale != 1 || vsib)
+		fprintf (file, ",%d", scale);
+	    }
+	  putc (')', file);
+	}
+      else
+	{
+	  rtx offset = NULL_RTX;
+
+	  if (disp)
+	    {
+	      /* Pull out the offset of a symbol; print any symbol itself.  */
+	      if (GET_CODE (disp) == CONST
+		  && GET_CODE (XEXP (disp, 0)) == PLUS
+		  && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
+		{
+		  offset = XEXP (XEXP (disp, 0), 1);
+		  disp = gen_rtx_CONST (VOIDmode,
+					XEXP (XEXP (disp, 0), 0));
+		}
+
+	      if (flag_pic)
+		output_pic_addr_const (file, disp, 0);
+	      else if (GET_CODE (disp) == LABEL_REF)
+		output_asm_label (disp);
+	      else if (CONST_INT_P (disp))
+		offset = disp;
+	      else
+		output_addr_const (file, disp);
+	    }
+
+	  putc ('[', file);
+	  if (base)
+	    {
+	      print_reg (base, code, file);
+	      if (offset)
+		{
+		  if (INTVAL (offset) >= 0)
+		    putc ('+', file);
+		  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
+		}
+	    }
+	  else if (offset)
+	    fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
+	  else
+	    putc ('0', file);
+
+	  if (index)
+	    {
+	      putc ('+', file);
+	      print_reg (index, vsib ? 0 : code, file);
+	      if (scale != 1 || vsib)
+		fprintf (file, "*%d", scale);
+	    }
+	  putc (']', file);
+	}
+    }
+}
+
+/* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA.  */
+
+static bool
+i386_asm_output_addr_const_extra (FILE *file, rtx x)
+{
+  rtx op;
+
+  if (GET_CODE (x) != UNSPEC)
+    return false;
+
+  op = XVECEXP (x, 0, 0);
+  switch (XINT (x, 1))
+    {
+    case UNSPEC_GOTTPOFF:
+      output_addr_const (file, op);
+      /* FIXME: This might be @TPOFF in Sun ld.  */
+      fputs ("@gottpoff", file);
+      break;
+    case UNSPEC_TPOFF:
+      output_addr_const (file, op);
+      fputs ("@tpoff", file);
+      break;
+    case UNSPEC_NTPOFF:
+      output_addr_const (file, op);
+      if (TARGET_64BIT)
+	fputs ("@tpoff", file);
+      else
+	fputs ("@ntpoff", file);
+      break;
+    case UNSPEC_DTPOFF:
+      output_addr_const (file, op);
+      fputs ("@dtpoff", file);
+      break;
+    case UNSPEC_GOTNTPOFF:
+      output_addr_const (file, op);
+      if (TARGET_64BIT)
+	fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+	       "@gottpoff(%rip)" : "@gottpoff[rip]", file);
+      else
+	fputs ("@gotntpoff", file);
+      break;
+    case UNSPEC_INDNTPOFF:
+      output_addr_const (file, op);
+      fputs ("@indntpoff", file);
+      break;
+#if TARGET_MACHO
+    case UNSPEC_MACHOPIC_OFFSET:
+      output_addr_const (file, op);
+      putc ('-', file);
+      machopic_output_function_base_name (file);
+      break;
+#endif
+
+    case UNSPEC_STACK_CHECK:
+      {
+	int offset;
+
+	gcc_assert (flag_split_stack);
+
+#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
+	offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
+#else
+	gcc_unreachable ();
+#endif
+
+	fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
+      }
+      break;
+
+    default:
+      return false;
+    }
+
+  return true;
+}
+
+/* Split one or more double-mode RTL references into pairs of half-mode
+   references.  The RTL can be REG, offsettable MEM, integer constant, or
+   CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
+   split and "num" is its length.  lo_half and hi_half are output arrays
+   that parallel "operands".  */
+
+void
+split_double_mode (enum machine_mode mode, rtx operands[],
+		   int num, rtx lo_half[], rtx hi_half[])
+{
+  enum machine_mode half_mode;
+  unsigned int byte;
+
+  switch (mode)
+    {
+    case TImode:
+      half_mode = DImode;
+      break;
+    case DImode:
+      half_mode = SImode;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  byte = GET_MODE_SIZE (half_mode);
+
+  while (num--)
+    {
+      rtx op = operands[num];
+
+      /* simplify_subreg refuse to split volatile memory addresses,
+         but we still have to handle it.  */
+      if (MEM_P (op))
+	{
+	  lo_half[num] = adjust_address (op, half_mode, 0);
+	  hi_half[num] = adjust_address (op, half_mode, byte);
+	}
+      else
+	{
+	  lo_half[num] = simplify_gen_subreg (half_mode, op,
+					      GET_MODE (op) == VOIDmode
+					      ? mode : GET_MODE (op), 0);
+	  hi_half[num] = simplify_gen_subreg (half_mode, op,
+					      GET_MODE (op) == VOIDmode
+					      ? mode : GET_MODE (op), byte);
+	}
+    }
+}
+
+/* Output code to perform a 387 binary operation in INSN, one of PLUS,
+   MINUS, MULT or DIV.  OPERANDS are the insn operands, where operands[3]
+   is the expression of the binary operation.  The output may either be
+   emitted here, or returned to the caller, like all output_* functions.
+
+   There is no guarantee that the operands are the same mode, as they
+   might be within FLOAT or FLOAT_EXTEND expressions.  */
+
+#ifndef SYSV386_COMPAT
+/* Set to 1 for compatibility with brain-damaged assemblers.  No-one
+   wants to fix the assemblers because that causes incompatibility
+   with gcc.  No-one wants to fix gcc because that causes
+   incompatibility with assemblers...  You can use the option of
+   -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way.  */
+#define SYSV386_COMPAT 1
+#endif
+
+const char *
+output_387_binary_op (rtx insn, rtx *operands)
+{
+  static char buf[40];
+  const char *p;
+  const char *ssep;
+  int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
+
+#ifdef ENABLE_CHECKING
+  /* Even if we do not want to check the inputs, this documents input
+     constraints.  Which helps in understanding the following code.  */
+  if (STACK_REG_P (operands[0])
+      && ((REG_P (operands[1])
+	   && REGNO (operands[0]) == REGNO (operands[1])
+	   && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
+	  || (REG_P (operands[2])
+	      && REGNO (operands[0]) == REGNO (operands[2])
+	      && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
+      && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
+    ; /* ok */
+  else
+    gcc_assert (is_sse);
+#endif
+
+  switch (GET_CODE (operands[3]))
+    {
+    case PLUS:
+      if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
+	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
+	p = "fiadd";
+      else
+	p = "fadd";
+      ssep = "vadd";
+      break;
+
+    case MINUS:
+      if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
+	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
+	p = "fisub";
+      else
+	p = "fsub";
+      ssep = "vsub";
+      break;
+
+    case MULT:
+      if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
+	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
+	p = "fimul";
+      else
+	p = "fmul";
+      ssep = "vmul";
+      break;
+
+    case DIV:
+      if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
+	  || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
+	p = "fidiv";
+      else
+	p = "fdiv";
+      ssep = "vdiv";
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (is_sse)
+   {
+     if (TARGET_AVX)
+       {
+	 strcpy (buf, ssep);
+	 if (GET_MODE (operands[0]) == SFmode)
+	   strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
+	 else
+	   strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
+       }
+     else
+       {
+	 strcpy (buf, ssep + 1);
+	 if (GET_MODE (operands[0]) == SFmode)
+	   strcat (buf, "ss\t{%2, %0|%0, %2}");
+	 else
+	   strcat (buf, "sd\t{%2, %0|%0, %2}");
+       }
+      return buf;
+   }
+  strcpy (buf, p);
+
+  switch (GET_CODE (operands[3]))
+    {
+    case MULT:
+    case PLUS:
+      if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
+	{
+	  rtx temp = operands[2];
+	  operands[2] = operands[1];
+	  operands[1] = temp;
+	}
+
+      /* know operands[0] == operands[1].  */
+
+      if (MEM_P (operands[2]))
+	{
+	  p = "%Z2\t%2";
+	  break;
+	}
+
+      if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
+	{
+	  if (STACK_TOP_P (operands[0]))
+	    /* How is it that we are storing to a dead operand[2]?
+	       Well, presumably operands[1] is dead too.  We can't
+	       store the result to st(0) as st(0) gets popped on this
+	       instruction.  Instead store to operands[2] (which I
+	       think has to be st(1)).  st(1) will be popped later.
+	       gcc <= 2.8.1 didn't have this check and generated
+	       assembly code that the Unixware assembler rejected.  */
+	    p = "p\t{%0, %2|%2, %0}";	/* st(1) = st(0) op st(1); pop */
+	  else
+	    p = "p\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0); pop */
+	  break;
+	}
+
+      if (STACK_TOP_P (operands[0]))
+	p = "\t{%y2, %0|%0, %y2}";	/* st(0) = st(0) op st(r2) */
+      else
+	p = "\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0) */
+      break;
+
+    case MINUS:
+    case DIV:
+      if (MEM_P (operands[1]))
+	{
+	  p = "r%Z1\t%1";
+	  break;
+	}
+
+      if (MEM_P (operands[2]))
+	{
+	  p = "%Z2\t%2";
+	  break;
+	}
+
+      if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
+	{
+#if SYSV386_COMPAT
+	  /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
+	     derived assemblers, confusingly reverse the direction of
+	     the operation for fsub{r} and fdiv{r} when the
+	     destination register is not st(0).  The Intel assembler
+	     doesn't have this brain damage.  Read !SYSV386_COMPAT to
+	     figure out what the hardware really does.  */
+	  if (STACK_TOP_P (operands[0]))
+	    p = "{p\t%0, %2|rp\t%2, %0}";
+	  else
+	    p = "{rp\t%2, %0|p\t%0, %2}";
+#else
+	  if (STACK_TOP_P (operands[0]))
+	    /* As above for fmul/fadd, we can't store to st(0).  */
+	    p = "rp\t{%0, %2|%2, %0}";	/* st(1) = st(0) op st(1); pop */
+	  else
+	    p = "p\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0); pop */
+#endif
+	  break;
+	}
+
+      if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
+	{
+#if SYSV386_COMPAT
+	  if (STACK_TOP_P (operands[0]))
+	    p = "{rp\t%0, %1|p\t%1, %0}";
+	  else
+	    p = "{p\t%1, %0|rp\t%0, %1}";
+#else
+	  if (STACK_TOP_P (operands[0]))
+	    p = "p\t{%0, %1|%1, %0}";	/* st(1) = st(1) op st(0); pop */
+	  else
+	    p = "rp\t{%1, %0|%0, %1}";	/* st(r2) = st(0) op st(r2); pop */
+#endif
+	  break;
+	}
+
+      if (STACK_TOP_P (operands[0]))
+	{
+	  if (STACK_TOP_P (operands[1]))
+	    p = "\t{%y2, %0|%0, %y2}";	/* st(0) = st(0) op st(r2) */
+	  else
+	    p = "r\t{%y1, %0|%0, %y1}";	/* st(0) = st(r1) op st(0) */
+	  break;
+	}
+      else if (STACK_TOP_P (operands[1]))
+	{
+#if SYSV386_COMPAT
+	  p = "{\t%1, %0|r\t%0, %1}";
+#else
+	  p = "r\t{%1, %0|%0, %1}";	/* st(r2) = st(0) op st(r2) */
+#endif
+	}
+      else
+	{
+#if SYSV386_COMPAT
+	  p = "{r\t%2, %0|\t%0, %2}";
+#else
+	  p = "\t{%2, %0|%0, %2}";	/* st(r1) = st(r1) op st(0) */
+#endif
+	}
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  strcat (buf, p);
+  return buf;
+}
+
+/* Check if a 256bit AVX register is referenced inside of EXP.   */
+
+static int
+ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
+{
+  rtx exp = *pexp;
+
+  if (GET_CODE (exp) == SUBREG)
+    exp = SUBREG_REG (exp);
+
+  if (REG_P (exp)
+      && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
+    return 1;
+
+  return 0;
+}
+
+/* Return needed mode for entity in optimize_mode_switching pass.  */
+
+static int
+ix86_avx_u128_mode_needed (rtx insn)
+{
+  if (CALL_P (insn))
+    {
+      rtx link;
+
+      /* Needed mode is set to AVX_U128_CLEAN if there are
+	 no 256bit modes used in function arguments.  */
+      for (link = CALL_INSN_FUNCTION_USAGE (insn);
+	   link;
+	   link = XEXP (link, 1))
+	{
+	  if (GET_CODE (XEXP (link, 0)) == USE)
+	    {
+	      rtx arg = XEXP (XEXP (link, 0), 0);
+
+	      if (ix86_check_avx256_register (&arg, NULL))
+		return AVX_U128_DIRTY;
+	    }
+	}
+
+      return AVX_U128_CLEAN;
+    }
+
+  /* Require DIRTY mode if a 256bit AVX register is referenced.  Hardware
+     changes state only when a 256bit register is written to, but we need
+     to prevent the compiler from moving optimal insertion point above
+     eventual read from 256bit register.  */
+  if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
+    return AVX_U128_DIRTY;
+
+  return AVX_U128_ANY;
+}
+
+/* Return mode that i387 must be switched into
+   prior to the execution of insn.  */
+
+static int
+ix86_i387_mode_needed (int entity, rtx insn)
+{
+  enum attr_i387_cw mode;
+
+  /* The mode UNINITIALIZED is used to store control word after a
+     function call or ASM pattern.  The mode ANY specify that function
+     has no requirements on the control word and make no changes in the
+     bits we are interested in.  */
+
+  if (CALL_P (insn)
+      || (NONJUMP_INSN_P (insn)
+	  && (asm_noperands (PATTERN (insn)) >= 0
+	      || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
+    return I387_CW_UNINITIALIZED;
+
+  if (recog_memoized (insn) < 0)
+    return I387_CW_ANY;
+
+  mode = get_attr_i387_cw (insn);
+
+  switch (entity)
+    {
+    case I387_TRUNC:
+      if (mode == I387_CW_TRUNC)
+	return mode;
+      break;
+
+    case I387_FLOOR:
+      if (mode == I387_CW_FLOOR)
+	return mode;
+      break;
+
+    case I387_CEIL:
+      if (mode == I387_CW_CEIL)
+	return mode;
+      break;
+
+    case I387_MASK_PM:
+      if (mode == I387_CW_MASK_PM)
+	return mode;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return I387_CW_ANY;
+}
+
+/* Return mode that entity must be switched into
+   prior to the execution of insn.  */
+
+int
+ix86_mode_needed (int entity, rtx insn)
+{
+  switch (entity)
+    {
+    case AVX_U128:
+      return ix86_avx_u128_mode_needed (insn);
+    case I387_TRUNC:
+    case I387_FLOOR:
+    case I387_CEIL:
+    case I387_MASK_PM:
+      return ix86_i387_mode_needed (entity, insn);
+    default:
+      gcc_unreachable ();
+    }
+  return 0;
+}
+
+/* Check if a 256bit AVX register is referenced in stores.   */
+ 
+static void
+ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
+ {
+   if (ix86_check_avx256_register (&dest, NULL))
+    {
+      bool *used = (bool *) data;
+      *used = true;
+    }
+ } 
+
+/* Calculate mode of upper 128bit AVX registers after the insn.  */
+
+static int
+ix86_avx_u128_mode_after (int mode, rtx insn)
+{
+  rtx pat = PATTERN (insn);
+
+  if (vzeroupper_operation (pat, VOIDmode)
+      || vzeroall_operation (pat, VOIDmode))
+    return AVX_U128_CLEAN;
+
+  /* We know that state is clean after CALL insn if there are no
+     256bit registers used in the function return register.  */
+  if (CALL_P (insn))
+    {
+      bool avx_reg256_found = false;
+      note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
+
+      return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
+    }
+
+  /* Otherwise, return current mode.  Remember that if insn
+     references AVX 256bit registers, the mode was already changed
+     to DIRTY from MODE_NEEDED.  */
+  return mode;
+}
+
+/* Return the mode that an insn results in.  */
+
+int
+ix86_mode_after (int entity, int mode, rtx insn)
+{
+  switch (entity)
+    {
+    case AVX_U128:
+      return ix86_avx_u128_mode_after (mode, insn);
+    case I387_TRUNC:
+    case I387_FLOOR:
+    case I387_CEIL:
+    case I387_MASK_PM:
+      return mode;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+static int
+ix86_avx_u128_mode_entry (void)
+{
+  tree arg;
+
+  /* Entry mode is set to AVX_U128_DIRTY if there are
+     256bit modes used in function arguments.  */
+  for (arg = DECL_ARGUMENTS (current_function_decl); arg;
+       arg = TREE_CHAIN (arg))
+    {
+      rtx incoming = DECL_INCOMING_RTL (arg);
+
+      if (incoming && ix86_check_avx256_register (&incoming, NULL))
+	return AVX_U128_DIRTY;
+    }
+
+  return AVX_U128_CLEAN;
+}
+
+/* Return a mode that ENTITY is assumed to be
+   switched to at function entry.  */
+
+int
+ix86_mode_entry (int entity)
+{
+  switch (entity)
+    {
+    case AVX_U128:
+      return ix86_avx_u128_mode_entry ();
+    case I387_TRUNC:
+    case I387_FLOOR:
+    case I387_CEIL:
+    case I387_MASK_PM:
+      return I387_CW_ANY;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+static int
+ix86_avx_u128_mode_exit (void)
+{
+  rtx reg = crtl->return_rtx;
+
+  /* Exit mode is set to AVX_U128_DIRTY if there are
+     256bit modes used in the function return register.  */
+  if (reg && ix86_check_avx256_register (&reg, NULL))
+    return AVX_U128_DIRTY;
+
+  return AVX_U128_CLEAN;
+}
+
+/* Return a mode that ENTITY is assumed to be
+   switched to at function exit.  */
+
+int
+ix86_mode_exit (int entity)
+{
+  switch (entity)
+    {
+    case AVX_U128:
+      return ix86_avx_u128_mode_exit ();
+    case I387_TRUNC:
+    case I387_FLOOR:
+    case I387_CEIL:
+    case I387_MASK_PM:
+      return I387_CW_ANY;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Output code to initialize control word copies used by trunc?f?i and
+   rounding patterns.  CURRENT_MODE is set to current control word,
+   while NEW_MODE is set to new control word.  */
+
+static void
+emit_i387_cw_initialization (int mode)
+{
+  rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
+  rtx new_mode;
+
+  enum ix86_stack_slot slot;
+
+  rtx reg = gen_reg_rtx (HImode);
+
+  emit_insn (gen_x86_fnstcw_1 (stored_mode));
+  emit_move_insn (reg, copy_rtx (stored_mode));
+
+  if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
+      || optimize_insn_for_size_p ())
+    {
+      switch (mode)
+	{
+	case I387_CW_TRUNC:
+	  /* round toward zero (truncate) */
+	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
+	  slot = SLOT_CW_TRUNC;
+	  break;
+
+	case I387_CW_FLOOR:
+	  /* round down toward -oo */
+	  emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
+	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
+	  slot = SLOT_CW_FLOOR;
+	  break;
+
+	case I387_CW_CEIL:
+	  /* round up toward +oo */
+	  emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
+	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
+	  slot = SLOT_CW_CEIL;
+	  break;
+
+	case I387_CW_MASK_PM:
+	  /* mask precision exception for nearbyint() */
+	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
+	  slot = SLOT_CW_MASK_PM;
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+  else
+    {
+      switch (mode)
+	{
+	case I387_CW_TRUNC:
+	  /* round toward zero (truncate) */
+	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
+	  slot = SLOT_CW_TRUNC;
+	  break;
+
+	case I387_CW_FLOOR:
+	  /* round down toward -oo */
+	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
+	  slot = SLOT_CW_FLOOR;
+	  break;
+
+	case I387_CW_CEIL:
+	  /* round up toward +oo */
+	  emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
+	  slot = SLOT_CW_CEIL;
+	  break;
+
+	case I387_CW_MASK_PM:
+	  /* mask precision exception for nearbyint() */
+	  emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
+	  slot = SLOT_CW_MASK_PM;
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+
+  gcc_assert (slot < MAX_386_STACK_LOCALS);
+
+  new_mode = assign_386_stack_local (HImode, slot);
+  emit_move_insn (new_mode, reg);
+}
+
+/* Emit vzeroupper.  */
+
+void
+ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
+{
+  int i;
+
+  /* Cancel automatic vzeroupper insertion if there are
+     live call-saved SSE registers at the insertion point.  */
+
+  for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
+    if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
+      return;
+
+  if (TARGET_64BIT)
+    for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
+      if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
+	return;
+
+  emit_insn (gen_avx_vzeroupper ());
+}
+
+/* Generate one or more insns to set ENTITY to MODE.  */
+
+void
+ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
+{
+  switch (entity)
+    {
+    case AVX_U128:
+      if (mode == AVX_U128_CLEAN)
+	ix86_avx_emit_vzeroupper (regs_live);
+      break;
+    case I387_TRUNC:
+    case I387_FLOOR:
+    case I387_CEIL:
+    case I387_MASK_PM:
+      if (mode != I387_CW_ANY
+	  && mode != I387_CW_UNINITIALIZED)
+	emit_i387_cw_initialization (mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Output code for INSN to convert a float to a signed int.  OPERANDS
+   are the insn operands.  The output may be [HSD]Imode and the input
+   operand may be [SDX]Fmode.  */
+
+const char *
+output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
+{
+  int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
+  int dimode_p = GET_MODE (operands[0]) == DImode;
+  int round_mode = get_attr_i387_cw (insn);
+
+  /* Jump through a hoop or two for DImode, since the hardware has no
+     non-popping instruction.  We used to do this a different way, but
+     that was somewhat fragile and broke with post-reload splitters.  */
+  if ((dimode_p || fisttp) && !stack_top_dies)
+    output_asm_insn ("fld\t%y1", operands);
+
+  gcc_assert (STACK_TOP_P (operands[1]));
+  gcc_assert (MEM_P (operands[0]));
+  gcc_assert (GET_MODE (operands[1]) != TFmode);
+
+  if (fisttp)
+      output_asm_insn ("fisttp%Z0\t%0", operands);
+  else
+    {
+      if (round_mode != I387_CW_ANY)
+	output_asm_insn ("fldcw\t%3", operands);
+      if (stack_top_dies || dimode_p)
+	output_asm_insn ("fistp%Z0\t%0", operands);
+      else
+	output_asm_insn ("fist%Z0\t%0", operands);
+      if (round_mode != I387_CW_ANY)
+	output_asm_insn ("fldcw\t%2", operands);
+    }
+
+  return "";
+}
+
+/* Output code for x87 ffreep insn.  The OPNO argument, which may only
+   have the values zero or one, indicates the ffreep insn's operand
+   from the OPERANDS array.  */
+
+static const char *
+output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
+{
+  if (TARGET_USE_FFREEP)
+#ifdef HAVE_AS_IX86_FFREEP
+    return opno ? "ffreep\t%y1" : "ffreep\t%y0";
+#else
+    {
+      static char retval[32];
+      int regno = REGNO (operands[opno]);
+
+      gcc_assert (STACK_REGNO_P (regno));
+
+      regno -= FIRST_STACK_REG;
+
+      snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
+      return retval;
+    }
+#endif
+
+  return opno ? "fstp\t%y1" : "fstp\t%y0";
+}
+
+
+/* Output code for INSN to compare OPERANDS.  EFLAGS_P is 1 when fcomi
+   should be used.  UNORDERED_P is true when fucom should be used.  */
+
+const char *
+output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
+{
+  int stack_top_dies;
+  rtx cmp_op0, cmp_op1;
+  int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
+
+  if (eflags_p)
+    {
+      cmp_op0 = operands[0];
+      cmp_op1 = operands[1];
+    }
+  else
+    {
+      cmp_op0 = operands[1];
+      cmp_op1 = operands[2];
+    }
+
+  if (is_sse)
+    {
+      if (GET_MODE (operands[0]) == SFmode)
+	if (unordered_p)
+	  return "%vucomiss\t{%1, %0|%0, %1}";
+	else
+	  return "%vcomiss\t{%1, %0|%0, %1}";
+      else
+	if (unordered_p)
+	  return "%vucomisd\t{%1, %0|%0, %1}";
+	else
+	  return "%vcomisd\t{%1, %0|%0, %1}";
+    }
+
+  gcc_assert (STACK_TOP_P (cmp_op0));
+
+  stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
+
+  if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
+    {
+      if (stack_top_dies)
+	{
+	  output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
+	  return output_387_ffreep (operands, 1);
+	}
+      else
+	return "ftst\n\tfnstsw\t%0";
+    }
+
+  if (STACK_REG_P (cmp_op1)
+      && stack_top_dies
+      && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
+      && REGNO (cmp_op1) != FIRST_STACK_REG)
+    {
+      /* If both the top of the 387 stack dies, and the other operand
+	 is also a stack register that dies, then this must be a
+	 `fcompp' float compare */
+
+      if (eflags_p)
+	{
+	  /* There is no double popping fcomi variant.  Fortunately,
+	     eflags is immune from the fstp's cc clobbering.  */
+	  if (unordered_p)
+	    output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
+	  else
+	    output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
+	  return output_387_ffreep (operands, 0);
+	}
+      else
+	{
+	  if (unordered_p)
+	    return "fucompp\n\tfnstsw\t%0";
+	  else
+	    return "fcompp\n\tfnstsw\t%0";
+	}
+    }
+  else
+    {
+      /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies.  */
+
+      static const char * const alt[16] =
+      {
+	"fcom%Z2\t%y2\n\tfnstsw\t%0",
+	"fcomp%Z2\t%y2\n\tfnstsw\t%0",
+	"fucom%Z2\t%y2\n\tfnstsw\t%0",
+	"fucomp%Z2\t%y2\n\tfnstsw\t%0",
+
+	"ficom%Z2\t%y2\n\tfnstsw\t%0",
+	"ficomp%Z2\t%y2\n\tfnstsw\t%0",
+	NULL,
+	NULL,
+
+	"fcomi\t{%y1, %0|%0, %y1}",
+	"fcomip\t{%y1, %0|%0, %y1}",
+	"fucomi\t{%y1, %0|%0, %y1}",
+	"fucomip\t{%y1, %0|%0, %y1}",
+
+	NULL,
+	NULL,
+	NULL,
+	NULL
+      };
+
+      int mask;
+      const char *ret;
+
+      mask  = eflags_p << 3;
+      mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
+      mask |= unordered_p << 1;
+      mask |= stack_top_dies;
+
+      gcc_assert (mask < 16);
+      ret = alt[mask];
+      gcc_assert (ret);
+
+      return ret;
+    }
+}
+
+void
+ix86_output_addr_vec_elt (FILE *file, int value)
+{
+  const char *directive = ASM_LONG;
+
+#ifdef ASM_QUAD
+  if (TARGET_LP64)
+    directive = ASM_QUAD;
+#else
+  gcc_assert (!TARGET_64BIT);
+#endif
+
+  fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
+}
+
+void
+ix86_output_addr_diff_elt (FILE *file, int value, int rel)
+{
+  const char *directive = ASM_LONG;
+
+#ifdef ASM_QUAD
+  if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
+    directive = ASM_QUAD;
+#else
+  gcc_assert (!TARGET_64BIT);
+#endif
+  /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand.  */
+  if (TARGET_64BIT || TARGET_VXWORKS_RTP)
+    fprintf (file, "%s%s%d-%s%d\n",
+	     directive, LPREFIX, value, LPREFIX, rel);
+  else if (HAVE_AS_GOTOFF_IN_DATA)
+    fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
+#if TARGET_MACHO
+  else if (TARGET_MACHO)
+    {
+      fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
+      machopic_output_function_base_name (file);
+      putc ('\n', file);
+    }
+#endif
+  else
+    asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
+		 GOT_SYMBOL_NAME, LPREFIX, value);
+}
+
+/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
+   for the target.  */
+
+void
+ix86_expand_clear (rtx dest)
+{
+  rtx tmp;
+
+  /* We play register width games, which are only valid after reload.  */
+  gcc_assert (reload_completed);
+
+  /* Avoid HImode and its attendant prefix byte.  */
+  if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
+    dest = gen_rtx_REG (SImode, REGNO (dest));
+  tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
+
+  /* This predicate should match that for movsi_xor and movdi_xor_rex64.  */
+  if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
+    {
+      rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
+    }
+
+  emit_insn (tmp);
+}
+
+/* X is an unchanging MEM.  If it is a constant pool reference, return
+   the constant pool rtx, else NULL.  */
+
+rtx
+maybe_get_pool_constant (rtx x)
+{
+  x = ix86_delegitimize_address (XEXP (x, 0));
+
+  if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
+    return get_pool_constant (x);
+
+  return NULL_RTX;
+}
+
+void
+ix86_expand_move (enum machine_mode mode, rtx operands[])
+{
+  rtx op0, op1;
+  enum tls_model model;
+
+  op0 = operands[0];
+  op1 = operands[1];
+
+  if (GET_CODE (op1) == SYMBOL_REF)
+    {
+      rtx tmp;
+
+      model = SYMBOL_REF_TLS_MODEL (op1);
+      if (model)
+	{
+	  op1 = legitimize_tls_address (op1, model, true);
+	  op1 = force_operand (op1, op0);
+	  if (op1 == op0)
+	    return;
+	  op1 = convert_to_mode (mode, op1, 1);
+	}
+      else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
+	op1 = tmp;
+    }
+  else if (GET_CODE (op1) == CONST
+	   && GET_CODE (XEXP (op1, 0)) == PLUS
+	   && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
+    {
+      rtx addend = XEXP (XEXP (op1, 0), 1);
+      rtx symbol = XEXP (XEXP (op1, 0), 0);
+      rtx tmp;
+
+      model = SYMBOL_REF_TLS_MODEL (symbol);
+      if (model)
+	tmp = legitimize_tls_address (symbol, model, true);
+      else
+        tmp = legitimize_pe_coff_symbol (symbol, true);
+
+      if (tmp)
+	{
+	  tmp = force_operand (tmp, NULL);
+	  tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
+				     op0, 1, OPTAB_DIRECT);
+	  if (tmp == op0)
+	    return;
+	  op1 = convert_to_mode (mode, tmp, 1);
+	}
+    }
+
+  if ((flag_pic || MACHOPIC_INDIRECT)
+      && symbolic_operand (op1, mode))
+    {
+      if (TARGET_MACHO && !TARGET_64BIT)
+	{
+#if TARGET_MACHO
+	  /* dynamic-no-pic */
+	  if (MACHOPIC_INDIRECT)
+	    {
+	      rtx temp = ((reload_in_progress
+			   || ((op0 && REG_P (op0))
+			       && mode == Pmode))
+			  ? op0 : gen_reg_rtx (Pmode));
+	      op1 = machopic_indirect_data_reference (op1, temp);
+	      if (MACHOPIC_PURE)
+		op1 = machopic_legitimize_pic_address (op1, mode,
+						       temp == op1 ? 0 : temp);
+	    }
+	  if (op0 != op1 && GET_CODE (op0) != MEM)
+	    {
+	      rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
+	      emit_insn (insn);
+	      return;
+	    }
+	  if (GET_CODE (op0) == MEM)
+	    op1 = force_reg (Pmode, op1);
+	  else
+	    {
+	      rtx temp = op0;
+	      if (GET_CODE (temp) != REG)
+		temp = gen_reg_rtx (Pmode);
+	      temp = legitimize_pic_address (op1, temp);
+	      if (temp == op0)
+	    return;
+	      op1 = temp;
+	    }
+      /* dynamic-no-pic */
+#endif
+	}
+      else
+	{
+	  if (MEM_P (op0))
+	    op1 = force_reg (mode, op1);
+	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
+	    {
+	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
+	      op1 = legitimize_pic_address (op1, reg);
+	      if (op0 == op1)
+		return;
+	      op1 = convert_to_mode (mode, op1, 1);
+	    }
+	}
+    }
+  else
+    {
+      if (MEM_P (op0)
+	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
+	      || !push_operand (op0, mode))
+	  && MEM_P (op1))
+	op1 = force_reg (mode, op1);
+
+      if (push_operand (op0, mode)
+	  && ! general_no_elim_operand (op1, mode))
+	op1 = copy_to_mode_reg (mode, op1);
+
+      /* Force large constants in 64bit compilation into register
+	 to get them CSEed.  */
+      if (can_create_pseudo_p ()
+	  && (mode == DImode) && TARGET_64BIT
+	  && immediate_operand (op1, mode)
+	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
+	  && !register_operand (op0, mode)
+	  && optimize)
+	op1 = copy_to_mode_reg (mode, op1);
+
+      if (can_create_pseudo_p ()
+	  && FLOAT_MODE_P (mode)
+	  && GET_CODE (op1) == CONST_DOUBLE)
+	{
+	  /* If we are loading a floating point constant to a register,
+	     force the value to memory now, since we'll get better code
+	     out the back end.  */
+
+	  op1 = validize_mem (force_const_mem (mode, op1));
+	  if (!register_operand (op0, mode))
+	    {
+	      rtx temp = gen_reg_rtx (mode);
+	      emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
+	      emit_move_insn (op0, temp);
+	      return;
+	    }
+	}
+    }
+
+  emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
+}
+
+void
+ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
+{
+  rtx op0 = operands[0], op1 = operands[1];
+  unsigned int align = GET_MODE_ALIGNMENT (mode);
+
+  if (push_operand (op0, VOIDmode))
+    op0 = emit_move_resolve_push (mode, op0);
+
+  /* Force constants other than zero into memory.  We do not know how
+     the instructions used to build constants modify the upper 64 bits
+     of the register, once we have that information we may be able
+     to handle some of them more efficiently.  */
+  if (can_create_pseudo_p ()
+      && register_operand (op0, mode)
+      && (CONSTANT_P (op1)
+	  || (GET_CODE (op1) == SUBREG
+	      && CONSTANT_P (SUBREG_REG (op1))))
+      && !standard_sse_constant_p (op1))
+    op1 = validize_mem (force_const_mem (mode, op1));
+
+  /* We need to check memory alignment for SSE mode since attribute
+     can make operands unaligned.  */
+  if (can_create_pseudo_p ()
+      && SSE_REG_MODE_P (mode)
+      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
+	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
+    {
+      rtx tmp[2];
+
+      /* ix86_expand_vector_move_misalign() does not like constants ... */
+      if (CONSTANT_P (op1)
+	  || (GET_CODE (op1) == SUBREG
+	      && CONSTANT_P (SUBREG_REG (op1))))
+	op1 = validize_mem (force_const_mem (mode, op1));
+
+      /* ... nor both arguments in memory.  */
+      if (!register_operand (op0, mode)
+	  && !register_operand (op1, mode))
+	op1 = force_reg (mode, op1);
+
+      tmp[0] = op0; tmp[1] = op1;
+      ix86_expand_vector_move_misalign (mode, tmp);
+      return;
+    }
+
+  /* Make operand1 a register if it isn't already.  */
+  if (can_create_pseudo_p ()
+      && !register_operand (op0, mode)
+      && !register_operand (op1, mode))
+    {
+      emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
+      return;
+    }
+
+  emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
+}
+
+/* Split 32-byte AVX unaligned load and store if needed.  */
+
+static void
+ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
+{
+  rtx m;
+  rtx (*extract) (rtx, rtx, rtx);
+  rtx (*load_unaligned) (rtx, rtx);
+  rtx (*store_unaligned) (rtx, rtx);
+  enum machine_mode mode;
+
+  switch (GET_MODE (op0))
+    {
+    default:
+      gcc_unreachable ();
+    case V32QImode:
+      extract = gen_avx_vextractf128v32qi;
+      load_unaligned = gen_avx_loaddquv32qi;
+      store_unaligned = gen_avx_storedquv32qi;
+      mode = V16QImode;
+      break;
+    case V8SFmode:
+      extract = gen_avx_vextractf128v8sf;
+      load_unaligned = gen_avx_loadups256;
+      store_unaligned = gen_avx_storeups256;
+      mode = V4SFmode;
+      break;
+    case V4DFmode:
+      extract = gen_avx_vextractf128v4df;
+      load_unaligned = gen_avx_loadupd256;
+      store_unaligned = gen_avx_storeupd256;
+      mode = V2DFmode;
+      break;
+    }
+
+  if (MEM_P (op1))
+    {
+      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+	{
+	  rtx r = gen_reg_rtx (mode);
+	  m = adjust_address (op1, mode, 0);
+	  emit_move_insn (r, m);
+	  m = adjust_address (op1, mode, 16);
+	  r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
+	  emit_move_insn (op0, r);
+	}
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      else if (misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
+      else
+	emit_insn (load_unaligned (op0, op1));
+    }
+  else if (MEM_P (op0))
+    {
+      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
+	{
+	  m = adjust_address (op0, mode, 0);
+	  emit_insn (extract (m, op1, const0_rtx));
+	  m = adjust_address (op0, mode, 16);
+	  emit_insn (extract (m, op1, const1_rtx));
+	}
+      else
+	emit_insn (store_unaligned (op0, op1));
+    }
+  else
+    gcc_unreachable ();
+}
+
+/* Implement the movmisalign patterns for SSE.  Non-SSE modes go
+   straight to ix86_expand_vector_move.  */
+/* Code generation for scalar reg-reg moves of single and double precision data:
+     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
+       movaps reg, reg
+     else
+       movss reg, reg
+     if (x86_sse_partial_reg_dependency == true)
+       movapd reg, reg
+     else
+       movsd reg, reg
+
+   Code generation for scalar loads of double precision data:
+     if (x86_sse_split_regs == true)
+       movlpd mem, reg      (gas syntax)
+     else
+       movsd mem, reg
+
+   Code generation for unaligned packed loads of single precision data
+   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
+     if (x86_sse_unaligned_move_optimal)
+       movups mem, reg
+
+     if (x86_sse_partial_reg_dependency == true)
+       {
+         xorps  reg, reg
+         movlps mem, reg
+         movhps mem+8, reg
+       }
+     else
+       {
+         movlps mem, reg
+         movhps mem+8, reg
+       }
+
+   Code generation for unaligned packed loads of double precision data
+   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
+     if (x86_sse_unaligned_move_optimal)
+       movupd mem, reg
+
+     if (x86_sse_split_regs == true)
+       {
+         movlpd mem, reg
+         movhpd mem+8, reg
+       }
+     else
+       {
+         movsd  mem, reg
+         movhpd mem+8, reg
+       }
+ */
+
+void
+ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
+{
+  rtx op0, op1, orig_op0 = NULL_RTX, m;
+  rtx (*load_unaligned) (rtx, rtx);
+  rtx (*store_unaligned) (rtx, rtx);
+
+  op0 = operands[0];
+  op1 = operands[1];
+
+  if (GET_MODE_SIZE (mode) == 64)
+    {
+      switch (GET_MODE_CLASS (mode))
+	{
+	case MODE_VECTOR_INT:
+	case MODE_INT:
+	  if (GET_MODE (op0) != V16SImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V16SImode);
+		}
+	      else
+		op0 = gen_lowpart (V16SImode, op0);
+	    }
+	  op1 = gen_lowpart (V16SImode, op1);
+	  /* FALLTHRU */
+
+	case MODE_VECTOR_FLOAT:
+	  switch (GET_MODE (op0))
+	    {
+	    default:
+	      gcc_unreachable ();
+	    case V16SImode:
+	      load_unaligned = gen_avx512f_loaddquv16si;
+	      store_unaligned = gen_avx512f_storedquv16si;
+	      break;
+	    case V16SFmode:
+	      load_unaligned = gen_avx512f_loadups512;
+	      store_unaligned = gen_avx512f_storeups512;
+	      break;
+	    case V8DFmode:
+	      load_unaligned = gen_avx512f_loadupd512;
+	      store_unaligned = gen_avx512f_storeupd512;
+	      break;
+	    }
+
+	  if (MEM_P (op1))
+	    emit_insn (load_unaligned (op0, op1));
+	  else if (MEM_P (op0))
+	    emit_insn (store_unaligned (op0, op1));
+	  else
+	    gcc_unreachable ();
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+
+      return;
+    }
+
+  if (TARGET_AVX
+      && GET_MODE_SIZE (mode) == 32)
+    {
+      switch (GET_MODE_CLASS (mode))
+	{
+	case MODE_VECTOR_INT:
+	case MODE_INT:
+	  if (GET_MODE (op0) != V32QImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V32QImode);
+		}
+	      else
+		op0 = gen_lowpart (V32QImode, op0);
+	    }
+	  op1 = gen_lowpart (V32QImode, op1);
+	  /* FALLTHRU */
+
+	case MODE_VECTOR_FLOAT:
+	  ix86_avx256_split_vector_move_misalign (op0, op1);
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+
+      return;
+    }
+
+  if (MEM_P (op1))
+    {
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      if (TARGET_AVX
+	  && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+	      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+	  && misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
+      /* ??? If we have typed data, then it would appear that using
+	 movdqu is the only way to get unaligned data loaded with
+	 integer type.  */
+      else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+	{
+	  if (GET_MODE (op0) != V16QImode)
+	    {
+	      orig_op0 = op0;
+	      op0 = gen_reg_rtx (V16QImode);
+	    }
+	  op1 = gen_lowpart (V16QImode, op1);
+	  /* We will eventually emit movups based on insn attributes.  */
+	  emit_insn (gen_sse2_loaddquv16qi (op0, op1));
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
+	}
+      else if (TARGET_SSE2 && mode == V2DFmode)
+        {
+          rtx zero;
+
+	  if (TARGET_AVX
+	      || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+	      || optimize_insn_for_size_p ())
+	    {
+	      /* We will eventually emit movups based on insn attributes.  */
+	      emit_insn (gen_sse2_loadupd (op0, op1));
+	      return;
+	    }
+
+	  /* When SSE registers are split into halves, we can avoid
+	     writing to the top half twice.  */
+	  if (TARGET_SSE_SPLIT_REGS)
+	    {
+	      emit_clobber (op0);
+	      zero = op0;
+	    }
+	  else
+	    {
+	      /* ??? Not sure about the best option for the Intel chips.
+		 The following would seem to satisfy; the register is
+		 entirely cleared, breaking the dependency chain.  We
+		 then store to the upper half, with a dependency depth
+		 of one.  A rumor has it that Intel recommends two movsd
+		 followed by an unpacklpd, but this is unconfirmed.  And
+		 given that the dependency depth of the unpacklpd would
+		 still be one, I'm not sure why this would be better.  */
+	      zero = CONST0_RTX (V2DFmode);
+	    }
+
+	  m = adjust_address (op1, DFmode, 0);
+	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
+	  m = adjust_address (op1, DFmode, 8);
+	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
+	}
+      else
+        {
+	  rtx t;
+
+	  if (TARGET_AVX
+	      || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+	      || optimize_insn_for_size_p ())
+	    {
+	      if (GET_MODE (op0) != V4SFmode)
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V4SFmode);
+		}
+	      op1 = gen_lowpart (V4SFmode, op1);
+	      emit_insn (gen_sse_loadups (op0, op1));
+	      if (orig_op0)
+		emit_move_insn (orig_op0,
+				gen_lowpart (GET_MODE (orig_op0), op0));
+	      return;
+            }
+
+	  if (mode != V4SFmode)
+	    t = gen_reg_rtx (V4SFmode);
+	  else
+	    t = op0;
+	    
+	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
+	    emit_move_insn (t, CONST0_RTX (V4SFmode));
+	  else
+	    emit_clobber (t);
+
+	  m = adjust_address (op1, V2SFmode, 0);
+	  emit_insn (gen_sse_loadlps (t, t, m));
+	  m = adjust_address (op1, V2SFmode, 8);
+	  emit_insn (gen_sse_loadhps (t, t, m));
+	  if (mode != V4SFmode)
+	    emit_move_insn (op0, gen_lowpart (mode, t));
+	}
+    }
+  else if (MEM_P (op0))
+    {
+      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+        {
+	  op0 = gen_lowpart (V16QImode, op0);
+	  op1 = gen_lowpart (V16QImode, op1);
+	  /* We will eventually emit movups based on insn attributes.  */
+	  emit_insn (gen_sse2_storedquv16qi (op0, op1));
+	}
+      else if (TARGET_SSE2 && mode == V2DFmode)
+	{
+	  if (TARGET_AVX
+	      || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+	      || optimize_insn_for_size_p ())
+	    /* We will eventually emit movups based on insn attributes.  */
+	    emit_insn (gen_sse2_storeupd (op0, op1));
+	  else
+	    {
+	      m = adjust_address (op0, DFmode, 0);
+	      emit_insn (gen_sse2_storelpd (m, op1));
+	      m = adjust_address (op0, DFmode, 8);
+	      emit_insn (gen_sse2_storehpd (m, op1));
+	    }
+	}
+      else
+	{
+	  if (mode != V4SFmode)
+	    op1 = gen_lowpart (V4SFmode, op1);
+
+	  if (TARGET_AVX
+	      || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+	      || optimize_insn_for_size_p ())
+	    {
+	      op0 = gen_lowpart (V4SFmode, op0);
+	      emit_insn (gen_sse_storeups (op0, op1));
+	    }
+	  else
+	    {
+	      m = adjust_address (op0, V2SFmode, 0);
+	      emit_insn (gen_sse_storelps (m, op1));
+	      m = adjust_address (op0, V2SFmode, 8);
+	      emit_insn (gen_sse_storehps (m, op1));
+	    }
+	}
+    }
+  else
+    gcc_unreachable ();
+}
+
+/* Helper function of ix86_fixup_binary_operands to canonicalize
+   operand order.  Returns true if the operands should be swapped.  */
+
+static bool
+ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
+			     rtx operands[])
+{
+  rtx dst = operands[0];
+  rtx src1 = operands[1];
+  rtx src2 = operands[2];
+
+  /* If the operation is not commutative, we can't do anything.  */
+  if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
+    return false;
+
+  /* Highest priority is that src1 should match dst.  */
+  if (rtx_equal_p (dst, src1))
+    return false;
+  if (rtx_equal_p (dst, src2))
+    return true;
+
+  /* Next highest priority is that immediate constants come second.  */
+  if (immediate_operand (src2, mode))
+    return false;
+  if (immediate_operand (src1, mode))
+    return true;
+
+  /* Lowest priority is that memory references should come second.  */
+  if (MEM_P (src2))
+    return false;
+  if (MEM_P (src1))
+    return true;
+
+  return false;
+}
+
+
+/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
+   destination to use for the operation.  If different from the true
+   destination in operands[0], a copy operation will be required.  */
+
+rtx
+ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
+			    rtx operands[])
+{
+  rtx dst = operands[0];
+  rtx src1 = operands[1];
+  rtx src2 = operands[2];
+
+  /* Canonicalize operand order.  */
+  if (ix86_swap_binary_operands_p (code, mode, operands))
+    {
+      rtx temp;
+
+      /* It is invalid to swap operands of different modes.  */
+      gcc_assert (GET_MODE (src1) == GET_MODE (src2));
+
+      temp = src1;
+      src1 = src2;
+      src2 = temp;
+    }
+
+  /* Both source operands cannot be in memory.  */
+  if (MEM_P (src1) && MEM_P (src2))
+    {
+      /* Optimization: Only read from memory once.  */
+      if (rtx_equal_p (src1, src2))
+	{
+	  src2 = force_reg (mode, src2);
+	  src1 = src2;
+	}
+      else if (rtx_equal_p (dst, src1))
+	src2 = force_reg (mode, src2);
+      else
+	src1 = force_reg (mode, src1);
+    }
+
+  /* If the destination is memory, and we do not have matching source
+     operands, do things in registers.  */
+  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
+    dst = gen_reg_rtx (mode);
+
+  /* Source 1 cannot be a constant.  */
+  if (CONSTANT_P (src1))
+    src1 = force_reg (mode, src1);
+
+  /* Source 1 cannot be a non-matching memory.  */
+  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
+    src1 = force_reg (mode, src1);
+
+  /* Improve address combine.  */
+  if (code == PLUS
+      && GET_MODE_CLASS (mode) == MODE_INT
+      && MEM_P (src2))
+    src2 = force_reg (mode, src2);
+
+  operands[1] = src1;
+  operands[2] = src2;
+  return dst;
+}
+
+/* Similarly, but assume that the destination has already been
+   set up properly.  */
+
+void
+ix86_fixup_binary_operands_no_copy (enum rtx_code code,
+				    enum machine_mode mode, rtx operands[])
+{
+  rtx dst = ix86_fixup_binary_operands (code, mode, operands);
+  gcc_assert (dst == operands[0]);
+}
+
+/* Attempt to expand a binary operator.  Make the expansion closer to the
+   actual machine, then just general_operand, which will allow 3 separate
+   memory references (one output, two input) in a single insn.  */
+
+void
+ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
+			     rtx operands[])
+{
+  rtx src1, src2, dst, op, clob;
+
+  dst = ix86_fixup_binary_operands (code, mode, operands);
+  src1 = operands[1];
+  src2 = operands[2];
+
+ /* Emit the instruction.  */
+
+  op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
+  if (reload_in_progress)
+    {
+      /* Reload doesn't know about the flags register, and doesn't know that
+         it doesn't want to clobber it.  We can only do this with PLUS.  */
+      gcc_assert (code == PLUS);
+      emit_insn (op);
+    }
+  else if (reload_completed
+	   && code == PLUS
+	   && !rtx_equal_p (dst, src1))
+    {
+      /* This is going to be an LEA; avoid splitting it later.  */
+      emit_insn (op);
+    }
+  else
+    {
+      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+    }
+
+  /* Fix up the destination if needed.  */
+  if (dst != operands[0])
+    emit_move_insn (operands[0], dst);
+}
+
+/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
+   the given OPERANDS.  */
+
+void
+ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
+				     rtx operands[])
+{
+  rtx op1 = NULL_RTX, op2 = NULL_RTX;
+  if (GET_CODE (operands[1]) == SUBREG)
+    {
+      op1 = operands[1];
+      op2 = operands[2];
+    }
+  else if (GET_CODE (operands[2]) == SUBREG)
+    {
+      op1 = operands[2];
+      op2 = operands[1];
+    }
+  /* Optimize (__m128i) d | (__m128i) e and similar code
+     when d and e are float vectors into float vector logical
+     insn.  In C/C++ without using intrinsics there is no other way
+     to express vector logical operation on float vectors than
+     to cast them temporarily to integer vectors.  */
+  if (op1
+      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+      && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
+      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
+      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
+      && SUBREG_BYTE (op1) == 0
+      && (GET_CODE (op2) == CONST_VECTOR
+	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
+	      && SUBREG_BYTE (op2) == 0))
+      && can_create_pseudo_p ())
+    {
+      rtx dst;
+      switch (GET_MODE (SUBREG_REG (op1)))
+	{
+	case V4SFmode:
+	case V8SFmode:
+	case V2DFmode:
+	case V4DFmode:
+	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
+	  if (GET_CODE (op2) == CONST_VECTOR)
+	    {
+	      op2 = gen_lowpart (GET_MODE (dst), op2);
+	      op2 = force_reg (GET_MODE (dst), op2);
+	    }
+	  else
+	    {
+	      op1 = operands[1];
+	      op2 = SUBREG_REG (operands[2]);
+	      if (!nonimmediate_operand (op2, GET_MODE (dst)))
+		op2 = force_reg (GET_MODE (dst), op2);
+	    }
+	  op1 = SUBREG_REG (op1);
+	  if (!nonimmediate_operand (op1, GET_MODE (dst)))
+	    op1 = force_reg (GET_MODE (dst), op1);
+	  emit_insn (gen_rtx_SET (VOIDmode, dst,
+				  gen_rtx_fmt_ee (code, GET_MODE (dst),
+						  op1, op2)));
+	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
+	  return;
+	default:
+	  break;
+	}
+    }
+  if (!nonimmediate_operand (operands[1], mode))
+    operands[1] = force_reg (mode, operands[1]);
+  if (!nonimmediate_operand (operands[2], mode))
+    operands[2] = force_reg (mode, operands[2]);
+  ix86_fixup_binary_operands_no_copy (code, mode, operands);
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_fmt_ee (code, mode, operands[1],
+					  operands[2])));
+}
+
+/* Return TRUE or FALSE depending on whether the binary operator meets the
+   appropriate constraints.  */
+
+bool
+ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
+			 rtx operands[3])
+{
+  rtx dst = operands[0];
+  rtx src1 = operands[1];
+  rtx src2 = operands[2];
+
+  /* Both source operands cannot be in memory.  */
+  if (MEM_P (src1) && MEM_P (src2))
+    return false;
+
+  /* Canonicalize operand order for commutative operators.  */
+  if (ix86_swap_binary_operands_p (code, mode, operands))
+    {
+      rtx temp = src1;
+      src1 = src2;
+      src2 = temp;
+    }
+
+  /* If the destination is memory, we must have a matching source operand.  */
+  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
+      return false;
+
+  /* Source 1 cannot be a constant.  */
+  if (CONSTANT_P (src1))
+    return false;
+
+  /* Source 1 cannot be a non-matching memory.  */
+  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
+    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
+    return (code == AND
+	    && (mode == HImode
+		|| mode == SImode
+		|| (TARGET_64BIT && mode == DImode))
+	    && satisfies_constraint_L (src2));
+
+  return true;
+}
+
+/* Attempt to expand a unary operator.  Make the expansion closer to the
+   actual machine, then just general_operand, which will allow 2 separate
+   memory references (one output, one input) in a single insn.  */
+
+void
+ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
+			    rtx operands[])
+{
+  int matching_memory;
+  rtx src, dst, op, clob;
+
+  dst = operands[0];
+  src = operands[1];
+
+  /* If the destination is memory, and we do not have matching source
+     operands, do things in registers.  */
+  matching_memory = 0;
+  if (MEM_P (dst))
+    {
+      if (rtx_equal_p (dst, src))
+	matching_memory = 1;
+      else
+	dst = gen_reg_rtx (mode);
+    }
+
+  /* When source operand is memory, destination must match.  */
+  if (MEM_P (src) && !matching_memory)
+    src = force_reg (mode, src);
+
+  /* Emit the instruction.  */
+
+  op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
+  if (reload_in_progress || code == NOT)
+    {
+      /* Reload doesn't know about the flags register, and doesn't know that
+         it doesn't want to clobber it.  */
+      gcc_assert (code == NOT);
+      emit_insn (op);
+    }
+  else
+    {
+      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+    }
+
+  /* Fix up the destination if needed.  */
+  if (dst != operands[0])
+    emit_move_insn (operands[0], dst);
+}
+
+/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
+   divisor are within the range [0-255].  */
+
+void
+ix86_split_idivmod (enum machine_mode mode, rtx operands[],
+		    bool signed_p)
+{
+  rtx end_label, qimode_label;
+  rtx insn, div, mod;
+  rtx scratch, tmp0, tmp1, tmp2;
+  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
+  rtx (*gen_zero_extend) (rtx, rtx);
+  rtx (*gen_test_ccno_1) (rtx, rtx);
+
+  switch (mode)
+    {
+    case SImode:
+      gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
+      gen_test_ccno_1 = gen_testsi_ccno_1;
+      gen_zero_extend = gen_zero_extendqisi2;
+      break;
+    case DImode:
+      gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
+      gen_test_ccno_1 = gen_testdi_ccno_1;
+      gen_zero_extend = gen_zero_extendqidi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  end_label = gen_label_rtx ();
+  qimode_label = gen_label_rtx ();
+
+  scratch = gen_reg_rtx (mode);
+
+  /* Use 8bit unsigned divimod if dividend and divisor are within
+     the range [0-255].  */
+  emit_move_insn (scratch, operands[2]);
+  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
+				 scratch, 1, OPTAB_DIRECT);
+  emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
+  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
+  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
+  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
+			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
+			       pc_rtx);
+  insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
+  predict_jump (REG_BR_PROB_BASE * 50 / 100);
+  JUMP_LABEL (insn) = qimode_label;
+
+  /* Generate original signed/unsigned divimod.  */
+  div = gen_divmod4_1 (operands[0], operands[1],
+		       operands[2], operands[3]);
+  emit_insn (div);
+
+  /* Branch to the end.  */
+  emit_jump_insn (gen_jump (end_label));
+  emit_barrier ();
+
+  /* Generate 8bit unsigned divide.  */
+  emit_label (qimode_label);
+  /* Don't use operands[0] for result of 8bit divide since not all
+     registers support QImode ZERO_EXTRACT.  */
+  tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
+  tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
+  tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
+  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
+
+  if (signed_p)
+    {
+      div = gen_rtx_DIV (SImode, operands[2], operands[3]);
+      mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
+    }
+  else
+    {
+      div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
+      mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
+    }
+
+  /* Extract remainder from AH.  */
+  tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
+  if (REG_P (operands[1]))
+    insn = emit_move_insn (operands[1], tmp1);
+  else
+    {
+      /* Need a new scratch register since the old one has result
+	 of 8bit divide.  */
+      scratch = gen_reg_rtx (mode);
+      emit_move_insn (scratch, tmp1);
+      insn = emit_move_insn (operands[1], scratch);
+    }
+  set_unique_reg_note (insn, REG_EQUAL, mod);
+
+  /* Zero extend quotient from AL.  */
+  tmp1 = gen_lowpart (QImode, tmp0);
+  insn = emit_insn (gen_zero_extend (operands[0], tmp1));
+  set_unique_reg_note (insn, REG_EQUAL, div);
+
+  emit_label (end_label);
+}
+
+/* Whether it is OK to emit CFI directives when emitting asm code.  */
+
+bool
+ix86_emit_cfi ()
+{
+  return dwarf2out_do_cfi_asm ();
+}
+
+#define LEA_MAX_STALL (3)
+#define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
+
+/* Increase given DISTANCE in half-cycles according to
+   dependencies between PREV and NEXT instructions.
+   Add 1 half-cycle if there is no dependency and
+   go to next cycle if there is some dependecy.  */
+
+static unsigned int
+increase_distance (rtx prev, rtx next, unsigned int distance)
+{
+  df_ref *use_rec;
+  df_ref *def_rec;
+
+  if (!prev || !next)
+    return distance + (distance & 1) + 2;
+
+  if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
+    return distance + 1;
+
+  for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
+    for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
+      if (!DF_REF_IS_ARTIFICIAL (*def_rec)
+	  && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
+	return distance + (distance & 1) + 2;
+
+  return distance + 1;
+}
+
+/* Function checks if instruction INSN defines register number
+   REGNO1 or REGNO2.  */
+
+static bool
+insn_defines_reg (unsigned int regno1, unsigned int regno2,
+		  rtx insn)
+{
+  df_ref *def_rec;
+
+  for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
+    if (DF_REF_REG_DEF_P (*def_rec)
+	&& !DF_REF_IS_ARTIFICIAL (*def_rec)
+	&& (regno1 == DF_REF_REGNO (*def_rec)
+	    || regno2 == DF_REF_REGNO (*def_rec)))
+      {
+	return true;
+      }
+
+  return false;
+}
+
+/* Function checks if instruction INSN uses register number
+   REGNO as a part of address expression.  */
+
+static bool
+insn_uses_reg_mem (unsigned int regno, rtx insn)
+{
+  df_ref *use_rec;
+
+  for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
+    if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
+      return true;
+
+  return false;
+}
+
+/* Search backward for non-agu definition of register number REGNO1
+   or register number REGNO2 in basic block starting from instruction
+   START up to head of basic block or instruction INSN.
+
+   Function puts true value into *FOUND var if definition was found
+   and false otherwise.
+
+   Distance in half-cycles between START and found instruction or head
+   of BB is added to DISTANCE and returned.  */
+
+static int
+distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
+			       rtx insn, int distance,
+			       rtx start, bool *found)
+{
+  basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
+  rtx prev = start;
+  rtx next = NULL;
+
+  *found = false;
+
+  while (prev
+	 && prev != insn
+	 && distance < LEA_SEARCH_THRESHOLD)
+    {
+      if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
+	{
+	  distance = increase_distance (prev, next, distance);
+	  if (insn_defines_reg (regno1, regno2, prev))
+	    {
+	      if (recog_memoized (prev) < 0
+		  || get_attr_type (prev) != TYPE_LEA)
+		{
+		  *found = true;
+		  return distance;
+		}
+	    }
+
+	  next = prev;
+	}
+      if (prev == BB_HEAD (bb))
+	break;
+
+      prev = PREV_INSN (prev);
+    }
+
+  return distance;
+}
+
+/* Search backward for non-agu definition of register number REGNO1
+   or register number REGNO2 in INSN's basic block until
+   1. Pass LEA_SEARCH_THRESHOLD instructions, or
+   2. Reach neighbour BBs boundary, or
+   3. Reach agu definition.
+   Returns the distance between the non-agu definition point and INSN.
+   If no definition point, returns -1.  */
+
+static int
+distance_non_agu_define (unsigned int regno1, unsigned int regno2,
+			 rtx insn)
+{
+  basic_block bb = BLOCK_FOR_INSN (insn);
+  int distance = 0;
+  bool found = false;
+
+  if (insn != BB_HEAD (bb))
+    distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
+					      distance, PREV_INSN (insn),
+					      &found);
+
+  if (!found && distance < LEA_SEARCH_THRESHOLD)
+    {
+      edge e;
+      edge_iterator ei;
+      bool simple_loop = false;
+
+      FOR_EACH_EDGE (e, ei, bb->preds)
+	if (e->src == bb)
+	  {
+	    simple_loop = true;
+	    break;
+	  }
+
+      if (simple_loop)
+	distance = distance_non_agu_define_in_bb (regno1, regno2,
+						  insn, distance,
+						  BB_END (bb), &found);
+      else
+	{
+	  int shortest_dist = -1;
+	  bool found_in_bb = false;
+
+	  FOR_EACH_EDGE (e, ei, bb->preds)
+	    {
+	      int bb_dist
+		= distance_non_agu_define_in_bb (regno1, regno2,
+						 insn, distance,
+						 BB_END (e->src),
+						 &found_in_bb);
+	      if (found_in_bb)
+		{
+		  if (shortest_dist < 0)
+		    shortest_dist = bb_dist;
+		  else if (bb_dist > 0)
+		    shortest_dist = MIN (bb_dist, shortest_dist);
+
+		  found = true;
+		}
+	    }
+
+	  distance = shortest_dist;
+	}
+    }
+
+  /* get_attr_type may modify recog data.  We want to make sure
+     that recog data is valid for instruction INSN, on which
+     distance_non_agu_define is called.  INSN is unchanged here.  */
+  extract_insn_cached (insn);
+
+  if (!found)
+    return -1;
+
+  return distance >> 1;
+}
+
+/* Return the distance in half-cycles between INSN and the next
+   insn that uses register number REGNO in memory address added
+   to DISTANCE.  Return -1 if REGNO0 is set.
+
+   Put true value into *FOUND if register usage was found and
+   false otherwise.
+   Put true value into *REDEFINED if register redefinition was
+   found and false otherwise.  */
+
+static int
+distance_agu_use_in_bb (unsigned int regno,
+			rtx insn, int distance, rtx start,
+			bool *found, bool *redefined)
+{
+  basic_block bb = NULL;
+  rtx next = start;
+  rtx prev = NULL;
+
+  *found = false;
+  *redefined = false;
+
+  if (start != NULL_RTX)
+    {
+      bb = BLOCK_FOR_INSN (start);
+      if (start != BB_HEAD (bb))
+	/* If insn and start belong to the same bb, set prev to insn,
+	   so the call to increase_distance will increase the distance
+	   between insns by 1.  */
+	prev = insn;
+    }
+
+  while (next
+	 && next != insn
+	 && distance < LEA_SEARCH_THRESHOLD)
+    {
+      if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
+	{
+	  distance = increase_distance(prev, next, distance);
+	  if (insn_uses_reg_mem (regno, next))
+	    {
+	      /* Return DISTANCE if OP0 is used in memory
+		 address in NEXT.  */
+	      *found = true;
+	      return distance;
+	    }
+
+	  if (insn_defines_reg (regno, INVALID_REGNUM, next))
+	    {
+	      /* Return -1 if OP0 is set in NEXT.  */
+	      *redefined = true;
+	      return -1;
+	    }
+
+	  prev = next;
+	}
+
+      if (next == BB_END (bb))
+	break;
+
+      next = NEXT_INSN (next);
+    }
+
+  return distance;
+}
+
+/* Return the distance between INSN and the next insn that uses
+   register number REGNO0 in memory address.  Return -1 if no such
+   a use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
+
+static int
+distance_agu_use (unsigned int regno0, rtx insn)
+{
+  basic_block bb = BLOCK_FOR_INSN (insn);
+  int distance = 0;
+  bool found = false;
+  bool redefined = false;
+
+  if (insn != BB_END (bb))
+    distance = distance_agu_use_in_bb (regno0, insn, distance,
+				       NEXT_INSN (insn),
+				       &found, &redefined);
+
+  if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
+    {
+      edge e;
+      edge_iterator ei;
+      bool simple_loop = false;
+
+      FOR_EACH_EDGE (e, ei, bb->succs)
+        if (e->dest == bb)
+	  {
+	    simple_loop = true;
+	    break;
+	  }
+
+      if (simple_loop)
+	distance = distance_agu_use_in_bb (regno0, insn,
+					   distance, BB_HEAD (bb),
+					   &found, &redefined);
+      else
+	{
+	  int shortest_dist = -1;
+	  bool found_in_bb = false;
+	  bool redefined_in_bb = false;
+
+	  FOR_EACH_EDGE (e, ei, bb->succs)
+	    {
+	      int bb_dist
+		= distance_agu_use_in_bb (regno0, insn,
+					  distance, BB_HEAD (e->dest),
+					  &found_in_bb, &redefined_in_bb);
+	      if (found_in_bb)
+		{
+		  if (shortest_dist < 0)
+		    shortest_dist = bb_dist;
+		  else if (bb_dist > 0)
+		    shortest_dist = MIN (bb_dist, shortest_dist);
+
+		  found = true;
+		}
+	    }
+
+	  distance = shortest_dist;
+	}
+    }
+
+  if (!found || redefined)
+    return -1;
+
+  return distance >> 1;
+}
+
+/* Define this macro to tune LEA priority vs ADD, it take effect when
+   there is a dilemma of choicing LEA or ADD
+   Negative value: ADD is more preferred than LEA
+   Zero: Netrual
+   Positive value: LEA is more preferred than ADD*/
+#define IX86_LEA_PRIORITY 0
+
+/* Return true if usage of lea INSN has performance advantage
+   over a sequence of instructions.  Instructions sequence has
+   SPLIT_COST cycles higher latency than lea latency.  */
+
+static bool
+ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
+		      unsigned int regno2, int split_cost, bool has_scale)
+{
+  int dist_define, dist_use;
+
+  /* For Silvermont if using a 2-source or 3-source LEA for
+     non-destructive destination purposes, or due to wanting
+     ability to use SCALE, the use of LEA is justified.  */
+  if (TARGET_SILVERMONT || TARGET_INTEL)
+    {
+      if (has_scale)
+	return true;
+      if (split_cost < 1)
+	return false;
+      if (regno0 == regno1 || regno0 == regno2)
+	return false;
+      return true;
+    }
+
+  dist_define = distance_non_agu_define (regno1, regno2, insn);
+  dist_use = distance_agu_use (regno0, insn);
+
+  if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
+    {
+      /* If there is no non AGU operand definition, no AGU
+	 operand usage and split cost is 0 then both lea
+	 and non lea variants have same priority.  Currently
+	 we prefer lea for 64 bit code and non lea on 32 bit
+	 code.  */
+      if (dist_use < 0 && split_cost == 0)
+	return TARGET_64BIT || IX86_LEA_PRIORITY;
+      else
+	return true;
+    }
+
+  /* With longer definitions distance lea is more preferable.
+     Here we change it to take into account splitting cost and
+     lea priority.  */
+  dist_define += split_cost + IX86_LEA_PRIORITY;
+
+  /* If there is no use in memory addess then we just check
+     that split cost exceeds AGU stall.  */
+  if (dist_use < 0)
+    return dist_define > LEA_MAX_STALL;
+
+  /* If this insn has both backward non-agu dependence and forward
+     agu dependence, the one with short distance takes effect.  */
+  return dist_define >= dist_use;
+}
+
+/* Return true if it is legal to clobber flags by INSN and
+   false otherwise.  */
+
+static bool
+ix86_ok_to_clobber_flags (rtx insn)
+{
+  basic_block bb = BLOCK_FOR_INSN (insn);
+  df_ref *use;
+  bitmap live;
+
+  while (insn)
+    {
+      if (NONDEBUG_INSN_P (insn))
+	{
+	  for (use = DF_INSN_USES (insn); *use; use++)
+	    if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
+	      return false;
+
+	  if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
+	    return true;
+	}
+
+      if (insn == BB_END (bb))
+	break;
+
+      insn = NEXT_INSN (insn);
+    }
+
+  live = df_get_live_out(bb);
+  return !REGNO_REG_SET_P (live, FLAGS_REG);
+}
+
+/* Return true if we need to split op0 = op1 + op2 into a sequence of
+   move and add to avoid AGU stalls.  */
+
+bool
+ix86_avoid_lea_for_add (rtx insn, rtx operands[])
+{
+  unsigned int regno0, regno1, regno2;
+
+  /* Check if we need to optimize.  */
+  if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
+    return false;
+
+  /* Check it is correct to split here.  */
+  if (!ix86_ok_to_clobber_flags(insn))
+    return false;
+
+  regno0 = true_regnum (operands[0]);
+  regno1 = true_regnum (operands[1]);
+  regno2 = true_regnum (operands[2]);
+
+  /* We need to split only adds with non destructive
+     destination operand.  */
+  if (regno0 == regno1 || regno0 == regno2)
+    return false;
+  else
+    return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
+}
+
+/* Return true if we should emit lea instruction instead of mov
+   instruction.  */
+
+bool
+ix86_use_lea_for_mov (rtx insn, rtx operands[])
+{
+  unsigned int regno0, regno1;
+
+  /* Check if we need to optimize.  */
+  if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
+    return false;
+
+  /* Use lea for reg to reg moves only.  */
+  if (!REG_P (operands[0]) || !REG_P (operands[1]))
+    return false;
+
+  regno0 = true_regnum (operands[0]);
+  regno1 = true_regnum (operands[1]);
+
+  return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
+}
+
+/* Return true if we need to split lea into a sequence of
+   instructions to avoid AGU stalls. */
+
+bool
+ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
+{
+  unsigned int regno0, regno1, regno2;
+  int split_cost;
+  struct ix86_address parts;
+  int ok;
+
+  /* Check we need to optimize.  */
+  if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
+    return false;
+
+  /* The "at least two components" test below might not catch simple
+     move or zero extension insns if parts.base is non-NULL and parts.disp
+     is const0_rtx as the only components in the address, e.g. if the
+     register is %rbp or %r13.  As this test is much cheaper and moves or
+     zero extensions are the common case, do this check first.  */
+  if (REG_P (operands[1])
+      || (SImode_address_operand (operands[1], VOIDmode)
+	  && REG_P (XEXP (operands[1], 0))))
+    return false;
+
+  /* Check if it is OK to split here.  */
+  if (!ix86_ok_to_clobber_flags (insn))
+    return false;
+
+  ok = ix86_decompose_address (operands[1], &parts);
+  gcc_assert (ok);
+
+  /* There should be at least two components in the address.  */
+  if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
+      + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
+    return false;
+
+  /* We should not split into add if non legitimate pic
+     operand is used as displacement. */
+  if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
+    return false;
+
+  regno0 = true_regnum (operands[0]) ;
+  regno1 = INVALID_REGNUM;
+  regno2 = INVALID_REGNUM;
+
+  if (parts.base)
+    regno1 = true_regnum (parts.base);
+  if (parts.index)
+    regno2 = true_regnum (parts.index);
+
+  split_cost = 0;
+
+  /* Compute how many cycles we will add to execution time
+     if split lea into a sequence of instructions.  */
+  if (parts.base || parts.index)
+    {
+      /* Have to use mov instruction if non desctructive
+	 destination form is used.  */
+      if (regno1 != regno0 && regno2 != regno0)
+	split_cost += 1;
+
+      /* Have to add index to base if both exist.  */
+      if (parts.base && parts.index)
+	split_cost += 1;
+
+      /* Have to use shift and adds if scale is 2 or greater.  */
+      if (parts.scale > 1)
+	{
+	  if (regno0 != regno1)
+	    split_cost += 1;
+	  else if (regno2 == regno0)
+	    split_cost += 4;
+	  else
+	    split_cost += parts.scale;
+	}
+
+      /* Have to use add instruction with immediate if
+	 disp is non zero.  */
+      if (parts.disp && parts.disp != const0_rtx)
+	split_cost += 1;
+
+      /* Subtract the price of lea.  */
+      split_cost -= 1;
+    }
+
+  return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
+				parts.scale > 1);
+}
+
+/* Emit x86 binary operand CODE in mode MODE, where the first operand
+   matches destination.  RTX includes clobber of FLAGS_REG.  */
+
+static void
+ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
+		 rtx dst, rtx src)
+{
+  rtx op, clob;
+
+  op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
+  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+  
+  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+}
+
+/* Return true if regno1 def is nearest to the insn.  */
+
+static bool
+find_nearest_reg_def (rtx insn, int regno1, int regno2)
+{
+  rtx prev = insn;
+  rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
+
+  if (insn == start)
+    return false;
+  while (prev && prev != start)
+    {
+      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
+	{
+	  prev = PREV_INSN (prev);
+	  continue;
+	}
+      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
+	return true;
+      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
+	return false;
+      prev = PREV_INSN (prev);
+    }
+
+  /* None of the regs is defined in the bb.  */
+  return false;
+}
+
+/* Split lea instructions into a sequence of instructions
+   which are executed on ALU to avoid AGU stalls.
+   It is assumed that it is allowed to clobber flags register
+   at lea position.  */
+
+void
+ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
+{
+  unsigned int regno0, regno1, regno2;
+  struct ix86_address parts;
+  rtx target, tmp;
+  int ok, adds;
+
+  ok = ix86_decompose_address (operands[1], &parts);
+  gcc_assert (ok);
+
+  target = gen_lowpart (mode, operands[0]);
+
+  regno0 = true_regnum (target);
+  regno1 = INVALID_REGNUM;
+  regno2 = INVALID_REGNUM;
+
+  if (parts.base)
+    {
+      parts.base = gen_lowpart (mode, parts.base);
+      regno1 = true_regnum (parts.base);
+    }
+
+  if (parts.index)
+    {
+      parts.index = gen_lowpart (mode, parts.index);
+      regno2 = true_regnum (parts.index);
+    }
+
+  if (parts.disp)
+    parts.disp = gen_lowpart (mode, parts.disp);
+
+  if (parts.scale > 1)
+    {
+      /* Case r1 = r1 + ...  */
+      if (regno1 == regno0)
+	{
+	  /* If we have a case r1 = r1 + C * r2 then we
+	     should use multiplication which is very
+	     expensive.  Assume cost model is wrong if we
+	     have such case here.  */
+	  gcc_assert (regno2 != regno0);
+
+	  for (adds = parts.scale; adds > 0; adds--)
+	    ix86_emit_binop (PLUS, mode, target, parts.index);
+	}
+      else
+	{
+	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
+	  if (regno0 != regno2)
+	    emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
+
+	  /* Use shift for scaling.  */
+	  ix86_emit_binop (ASHIFT, mode, target,
+			   GEN_INT (exact_log2 (parts.scale)));
+
+	  if (parts.base)
+	    ix86_emit_binop (PLUS, mode, target, parts.base);
+
+	  if (parts.disp && parts.disp != const0_rtx)
+	    ix86_emit_binop (PLUS, mode, target, parts.disp);
+	}
+    }
+  else if (!parts.base && !parts.index)
+    {
+      gcc_assert(parts.disp);
+      emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
+    }
+  else
+    {
+      if (!parts.base)
+	{
+	  if (regno0 != regno2)
+	    emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
+	}
+      else if (!parts.index)
+	{
+	  if (regno0 != regno1)
+	    emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
+	}
+      else
+	{
+	  if (regno0 == regno1)
+	    tmp = parts.index;
+	  else if (regno0 == regno2)
+	    tmp = parts.base;
+	  else
+	    {
+	      rtx tmp1;
+
+	      /* Find better operand for SET instruction, depending
+		 on which definition is farther from the insn.  */
+	      if (find_nearest_reg_def (insn, regno1, regno2))
+		tmp = parts.index, tmp1 = parts.base;
+	      else
+		tmp = parts.base, tmp1 = parts.index;
+
+	      emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
+
+	      if (parts.disp && parts.disp != const0_rtx)
+		ix86_emit_binop (PLUS, mode, target, parts.disp);
+
+	      ix86_emit_binop (PLUS, mode, target, tmp1);
+	      return;
+	    }
+
+	  ix86_emit_binop (PLUS, mode, target, tmp);
+	}
+
+      if (parts.disp && parts.disp != const0_rtx)
+	ix86_emit_binop (PLUS, mode, target, parts.disp);
+    }
+}
+
+/* Return true if it is ok to optimize an ADD operation to LEA
+   operation to avoid flag register consumation.  For most processors,
+   ADD is faster than LEA.  For the processors like BONNELL, if the
+   destination register of LEA holds an actual address which will be
+   used soon, LEA is better and otherwise ADD is better.  */
+
+bool
+ix86_lea_for_add_ok (rtx insn, rtx operands[])
+{
+  unsigned int regno0 = true_regnum (operands[0]);
+  unsigned int regno1 = true_regnum (operands[1]);
+  unsigned int regno2 = true_regnum (operands[2]);
+
+  /* If a = b + c, (a!=b && a!=c), must use lea form. */
+  if (regno0 != regno1 && regno0 != regno2)
+    return true;
+
+  if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
+    return false;
+
+  return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
+}
+
+/* Return true if destination reg of SET_BODY is shift count of
+   USE_BODY.  */
+
+static bool
+ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
+{
+  rtx set_dest;
+  rtx shift_rtx;
+  int i;
+
+  /* Retrieve destination of SET_BODY.  */
+  switch (GET_CODE (set_body))
+    {
+    case SET:
+      set_dest = SET_DEST (set_body);
+      if (!set_dest || !REG_P (set_dest))
+	return false;
+      break;
+    case PARALLEL:
+      for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
+	if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
+					  use_body))
+	  return true;
+    default:
+      return false;
+      break;
+    }
+
+  /* Retrieve shift count of USE_BODY.  */
+  switch (GET_CODE (use_body))
+    {
+    case SET:
+      shift_rtx = XEXP (use_body, 1);
+      break;
+    case PARALLEL:
+      for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
+	if (ix86_dep_by_shift_count_body (set_body,
+					  XVECEXP (use_body, 0, i)))
+	  return true;
+    default:
+      return false;
+      break;
+    }
+
+  if (shift_rtx
+      && (GET_CODE (shift_rtx) == ASHIFT
+	  || GET_CODE (shift_rtx) == LSHIFTRT
+	  || GET_CODE (shift_rtx) == ASHIFTRT
+	  || GET_CODE (shift_rtx) == ROTATE
+	  || GET_CODE (shift_rtx) == ROTATERT))
+    {
+      rtx shift_count = XEXP (shift_rtx, 1);
+
+      /* Return true if shift count is dest of SET_BODY.  */
+      if (REG_P (shift_count))
+	{
+	  /* Add check since it can be invoked before register
+	     allocation in pre-reload schedule.  */
+	  if (reload_completed
+	      && true_regnum (set_dest) == true_regnum (shift_count))
+	    return true;
+	  else if (REGNO(set_dest) == REGNO(shift_count))
+	    return true;
+	}
+    }
+
+  return false;
+}
+
+/* Return true if destination reg of SET_INSN is shift count of
+   USE_INSN.  */
+
+bool
+ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
+{
+  return ix86_dep_by_shift_count_body (PATTERN (set_insn),
+				       PATTERN (use_insn));
+}
+
+/* Return TRUE or FALSE depending on whether the unary operator meets the
+   appropriate constraints.  */
+
+bool
+ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
+			enum machine_mode mode ATTRIBUTE_UNUSED,
+			rtx operands[2])
+{
+  /* If one of operands is memory, source and destination must match.  */
+  if ((MEM_P (operands[0])
+       || MEM_P (operands[1]))
+      && ! rtx_equal_p (operands[0], operands[1]))
+    return false;
+  return true;
+}
+
+/* Return TRUE if the operands to a vec_interleave_{high,low}v2df
+   are ok, keeping in mind the possible movddup alternative.  */
+
+bool
+ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
+{
+  if (MEM_P (operands[0]))
+    return rtx_equal_p (operands[0], operands[1 + high]);
+  if (MEM_P (operands[1]) && MEM_P (operands[2]))
+    return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
+  return true;
+}
+
+/* Post-reload splitter for converting an SF or DFmode value in an
+   SSE register into an unsigned SImode.  */
+
+void
+ix86_split_convert_uns_si_sse (rtx operands[])
+{
+  enum machine_mode vecmode;
+  rtx value, large, zero_or_two31, input, two31, x;
+
+  large = operands[1];
+  zero_or_two31 = operands[2];
+  input = operands[3];
+  two31 = operands[4];
+  vecmode = GET_MODE (large);
+  value = gen_rtx_REG (vecmode, REGNO (operands[0]));
+
+  /* Load up the value into the low element.  We must ensure that the other
+     elements are valid floats -- zero is the easiest such value.  */
+  if (MEM_P (input))
+    {
+      if (vecmode == V4SFmode)
+	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
+      else
+	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
+    }
+  else
+    {
+      input = gen_rtx_REG (vecmode, REGNO (input));
+      emit_move_insn (value, CONST0_RTX (vecmode));
+      if (vecmode == V4SFmode)
+	emit_insn (gen_sse_movss (value, value, input));
+      else
+	emit_insn (gen_sse2_movsd (value, value, input));
+    }
+
+  emit_move_insn (large, two31);
+  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
+
+  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
+  emit_insn (gen_rtx_SET (VOIDmode, large, x));
+
+  x = gen_rtx_AND (vecmode, zero_or_two31, large);
+  emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
+
+  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
+  emit_insn (gen_rtx_SET (VOIDmode, value, x));
+
+  large = gen_rtx_REG (V4SImode, REGNO (large));
+  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
+
+  x = gen_rtx_REG (V4SImode, REGNO (value));
+  if (vecmode == V4SFmode)
+    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
+  else
+    emit_insn (gen_sse2_cvttpd2dq (x, value));
+  value = x;
+
+  emit_insn (gen_xorv4si3 (value, value, large));
+}
+
+/* Convert an unsigned DImode value into a DFmode, using only SSE.
+   Expects the 64-bit DImode to be supplied in a pair of integral
+   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
+   -mfpmath=sse, !optimize_size only.  */
+
+void
+ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
+{
+  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
+  rtx int_xmm, fp_xmm;
+  rtx biases, exponents;
+  rtx x;
+
+  int_xmm = gen_reg_rtx (V4SImode);
+  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
+    emit_insn (gen_movdi_to_sse (int_xmm, input));
+  else if (TARGET_SSE_SPLIT_REGS)
+    {
+      emit_clobber (int_xmm);
+      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
+    }
+  else
+    {
+      x = gen_reg_rtx (V2DImode);
+      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
+      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
+    }
+
+  x = gen_rtx_CONST_VECTOR (V4SImode,
+			    gen_rtvec (4, GEN_INT (0x43300000UL),
+				       GEN_INT (0x45300000UL),
+				       const0_rtx, const0_rtx));
+  exponents = validize_mem (force_const_mem (V4SImode, x));
+
+  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
+  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
+
+  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
+     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
+     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
+     (0x1.0p84 + double(fp_value_hi_xmm)).
+     Note these exponents differ by 32.  */
+
+  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
+
+  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
+     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
+  real_ldexp (&bias_lo_rvt, &dconst1, 52);
+  real_ldexp (&bias_hi_rvt, &dconst1, 84);
+  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
+  x = const_double_from_real_value (bias_hi_rvt, DFmode);
+  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
+  biases = validize_mem (force_const_mem (V2DFmode, biases));
+  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
+
+  /* Add the upper and lower DFmode values together.  */
+  if (TARGET_SSE3)
+    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
+  else
+    {
+      x = copy_to_mode_reg (V2DFmode, fp_xmm);
+      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
+      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
+    }
+
+  ix86_expand_vector_extract (false, target, fp_xmm, 0);
+}
+
+/* Not used, but eases macroization of patterns.  */
+void
+ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
+				  rtx input ATTRIBUTE_UNUSED)
+{
+  gcc_unreachable ();
+}
+
+/* Convert an unsigned SImode value into a DFmode.  Only currently used
+   for SSE, but applicable anywhere.  */
+
+void
+ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
+{
+  REAL_VALUE_TYPE TWO31r;
+  rtx x, fp;
+
+  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
+			   NULL, 1, OPTAB_DIRECT);
+
+  fp = gen_reg_rtx (DFmode);
+  emit_insn (gen_floatsidf2 (fp, x));
+
+  real_ldexp (&TWO31r, &dconst1, 31);
+  x = const_double_from_real_value (TWO31r, DFmode);
+
+  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
+  if (x != target)
+    emit_move_insn (target, x);
+}
+
+/* Convert a signed DImode value into a DFmode.  Only used for SSE in
+   32-bit mode; otherwise we have a direct convert instruction.  */
+
+void
+ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
+{
+  REAL_VALUE_TYPE TWO32r;
+  rtx fp_lo, fp_hi, x;
+
+  fp_lo = gen_reg_rtx (DFmode);
+  fp_hi = gen_reg_rtx (DFmode);
+
+  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
+
+  real_ldexp (&TWO32r, &dconst1, 32);
+  x = const_double_from_real_value (TWO32r, DFmode);
+  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
+
+  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
+
+  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
+			   0, OPTAB_DIRECT);
+  if (x != target)
+    emit_move_insn (target, x);
+}
+
+/* Convert an unsigned SImode value into a SFmode, using only SSE.
+   For x86_32, -mfpmath=sse, !optimize_size only.  */
+void
+ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
+{
+  REAL_VALUE_TYPE ONE16r;
+  rtx fp_hi, fp_lo, int_hi, int_lo, x;
+
+  real_ldexp (&ONE16r, &dconst1, 16);
+  x = const_double_from_real_value (ONE16r, SFmode);
+  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
+				      NULL, 0, OPTAB_DIRECT);
+  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
+				      NULL, 0, OPTAB_DIRECT);
+  fp_hi = gen_reg_rtx (SFmode);
+  fp_lo = gen_reg_rtx (SFmode);
+  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
+  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
+  fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
+			       0, OPTAB_DIRECT);
+  fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
+			       0, OPTAB_DIRECT);
+  if (!rtx_equal_p (target, fp_hi))
+    emit_move_insn (target, fp_hi);
+}
+
+/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
+   a vector of unsigned ints VAL to vector of floats TARGET.  */
+
+void
+ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
+{
+  rtx tmp[8];
+  REAL_VALUE_TYPE TWO16r;
+  enum machine_mode intmode = GET_MODE (val);
+  enum machine_mode fltmode = GET_MODE (target);
+  rtx (*cvt) (rtx, rtx);
+
+  if (intmode == V4SImode)
+    cvt = gen_floatv4siv4sf2;
+  else
+    cvt = gen_floatv8siv8sf2;
+  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
+  tmp[0] = force_reg (intmode, tmp[0]);
+  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
+				OPTAB_DIRECT);
+  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
+				NULL_RTX, 1, OPTAB_DIRECT);
+  tmp[3] = gen_reg_rtx (fltmode);
+  emit_insn (cvt (tmp[3], tmp[1]));
+  tmp[4] = gen_reg_rtx (fltmode);
+  emit_insn (cvt (tmp[4], tmp[2]));
+  real_ldexp (&TWO16r, &dconst1, 16);
+  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
+  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
+  tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
+				OPTAB_DIRECT);
+  tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
+				OPTAB_DIRECT);
+  if (tmp[7] != target)
+    emit_move_insn (target, tmp[7]);
+}
+
+/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
+   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
+   This is done by doing just signed conversion if < 0x1p31, and otherwise by
+   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
+
+rtx
+ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
+{
+  REAL_VALUE_TYPE TWO31r;
+  rtx two31r, tmp[4];
+  enum machine_mode mode = GET_MODE (val);
+  enum machine_mode scalarmode = GET_MODE_INNER (mode);
+  enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
+  rtx (*cmp) (rtx, rtx, rtx, rtx);
+  int i;
+
+  for (i = 0; i < 3; i++)
+    tmp[i] = gen_reg_rtx (mode);
+  real_ldexp (&TWO31r, &dconst1, 31);
+  two31r = const_double_from_real_value (TWO31r, scalarmode);
+  two31r = ix86_build_const_vector (mode, 1, two31r);
+  two31r = force_reg (mode, two31r);
+  switch (mode)
+    {
+    case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
+    case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
+    case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
+    case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
+    default: gcc_unreachable ();
+    }
+  tmp[3] = gen_rtx_LE (mode, two31r, val);
+  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
+  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
+				0, OPTAB_DIRECT);
+  if (intmode == V4SImode || TARGET_AVX2)
+    *xorp = expand_simple_binop (intmode, ASHIFT,
+				 gen_lowpart (intmode, tmp[0]),
+				 GEN_INT (31), NULL_RTX, 0,
+				 OPTAB_DIRECT);
+  else
+    {
+      rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
+      two31 = ix86_build_const_vector (intmode, 1, two31);
+      *xorp = expand_simple_binop (intmode, AND,
+				   gen_lowpart (intmode, tmp[0]),
+				   two31, NULL_RTX, 0,
+				   OPTAB_DIRECT);
+    }
+  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
+			      0, OPTAB_DIRECT);
+}
+
+/* A subroutine of ix86_build_signbit_mask.  If VECT is true,
+   then replicate the value for all elements of the vector
+   register.  */
+
+rtx
+ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
+{
+  int i, n_elt;
+  rtvec v;
+  enum machine_mode scalar_mode;
+
+  switch (mode)
+    {
+    case V64QImode:
+    case V32QImode:
+    case V16QImode:
+    case V32HImode:
+    case V16HImode:
+    case V8HImode:
+    case V16SImode:
+    case V8SImode:
+    case V4SImode:
+    case V8DImode:
+    case V4DImode:
+    case V2DImode:
+      gcc_assert (vect);
+    case V16SFmode:
+    case V8SFmode:
+    case V4SFmode:
+    case V8DFmode:
+    case V4DFmode:
+    case V2DFmode:
+      n_elt = GET_MODE_NUNITS (mode);
+      v = rtvec_alloc (n_elt);
+      scalar_mode = GET_MODE_INNER (mode);
+
+      RTVEC_ELT (v, 0) = value;
+
+      for (i = 1; i < n_elt; ++i)
+	RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
+
+      return gen_rtx_CONST_VECTOR (mode, v);
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
+   and ix86_expand_int_vcond.  Create a mask for the sign bit in MODE
+   for an SSE register.  If VECT is true, then replicate the mask for
+   all elements of the vector register.  If INVERT is true, then create
+   a mask excluding the sign bit.  */
+
+rtx
+ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
+{
+  enum machine_mode vec_mode, imode;
+  HOST_WIDE_INT hi, lo;
+  int shift = 63;
+  rtx v;
+  rtx mask;
+
+  /* Find the sign bit, sign extended to 2*HWI.  */
+  switch (mode)
+    {
+    case V16SImode:
+    case V16SFmode:
+    case V8SImode:
+    case V4SImode:
+    case V8SFmode:
+    case V4SFmode:
+      vec_mode = mode;
+      mode = GET_MODE_INNER (mode);
+      imode = SImode;
+      lo = 0x80000000, hi = lo < 0;
+      break;
+
+    case V8DImode:
+    case V4DImode:
+    case V2DImode:
+    case V8DFmode:
+    case V4DFmode:
+    case V2DFmode:
+      vec_mode = mode;
+      mode = GET_MODE_INNER (mode);
+      imode = DImode;
+      if (HOST_BITS_PER_WIDE_INT >= 64)
+	lo = (HOST_WIDE_INT)1 << shift, hi = -1;
+      else
+	lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
+      break;
+
+    case TImode:
+    case TFmode:
+      vec_mode = VOIDmode;
+      if (HOST_BITS_PER_WIDE_INT >= 64)
+	{
+	  imode = TImode;
+	  lo = 0, hi = (HOST_WIDE_INT)1 << shift;
+	}
+      else
+	{
+	  rtvec vec;
+
+	  imode = DImode;
+	  lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
+
+	  if (invert)
+	    {
+	      lo = ~lo, hi = ~hi;
+	      v = constm1_rtx;
+	    }
+	  else
+	    v = const0_rtx;
+
+	  mask = immed_double_const (lo, hi, imode);
+
+	  vec = gen_rtvec (2, v, mask);
+	  v = gen_rtx_CONST_VECTOR (V2DImode, vec);
+	  v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
+
+	  return v;
+	}
+     break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (invert)
+    lo = ~lo, hi = ~hi;
+
+  /* Force this value into the low part of a fp vector constant.  */
+  mask = immed_double_const (lo, hi, imode);
+  mask = gen_lowpart (mode, mask);
+
+  if (vec_mode == VOIDmode)
+    return force_reg (mode, mask);
+
+  v = ix86_build_const_vector (vec_mode, vect, mask);
+  return force_reg (vec_mode, v);
+}
+
+/* Generate code for floating point ABS or NEG.  */
+
+void
+ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
+				rtx operands[])
+{
+  rtx mask, set, dst, src;
+  bool use_sse = false;
+  bool vector_mode = VECTOR_MODE_P (mode);
+  enum machine_mode vmode = mode;
+
+  if (vector_mode)
+    use_sse = true;
+  else if (mode == TFmode)
+    use_sse = true;
+  else if (TARGET_SSE_MATH)
+    {
+      use_sse = SSE_FLOAT_MODE_P (mode);
+      if (mode == SFmode)
+	vmode = V4SFmode;
+      else if (mode == DFmode)
+	vmode = V2DFmode;
+    }
+
+  /* NEG and ABS performed with SSE use bitwise mask operations.
+     Create the appropriate mask now.  */
+  if (use_sse)
+    mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
+  else
+    mask = NULL_RTX;
+
+  dst = operands[0];
+  src = operands[1];
+
+  set = gen_rtx_fmt_e (code, mode, src);
+  set = gen_rtx_SET (VOIDmode, dst, set);
+
+  if (mask)
+    {
+      rtx use, clob;
+      rtvec par;
+
+      use = gen_rtx_USE (VOIDmode, mask);
+      if (vector_mode)
+	par = gen_rtvec (2, set, use);
+      else
+	{
+          clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+	  par = gen_rtvec (3, set, use, clob);
+        }
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
+    }
+  else
+    emit_insn (set);
+}
+
+/* Expand a copysign operation.  Special case operand 0 being a constant.  */
+
+void
+ix86_expand_copysign (rtx operands[])
+{
+  enum machine_mode mode, vmode;
+  rtx dest, op0, op1, mask, nmask;
+
+  dest = operands[0];
+  op0 = operands[1];
+  op1 = operands[2];
+
+  mode = GET_MODE (dest);
+
+  if (mode == SFmode)
+    vmode = V4SFmode;
+  else if (mode == DFmode)
+    vmode = V2DFmode;
+  else
+    vmode = mode;
+
+  if (GET_CODE (op0) == CONST_DOUBLE)
+    {
+      rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
+
+      if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
+	op0 = simplify_unary_operation (ABS, mode, op0, mode);
+
+      if (mode == SFmode || mode == DFmode)
+	{
+	  if (op0 == CONST0_RTX (mode))
+	    op0 = CONST0_RTX (vmode);
+	  else
+	    {
+	      rtx v = ix86_build_const_vector (vmode, false, op0);
+
+	      op0 = force_reg (vmode, v);
+	    }
+	}
+      else if (op0 != CONST0_RTX (mode))
+	op0 = force_reg (mode, op0);
+
+      mask = ix86_build_signbit_mask (vmode, 0, 0);
+
+      if (mode == SFmode)
+	copysign_insn = gen_copysignsf3_const;
+      else if (mode == DFmode)
+	copysign_insn = gen_copysigndf3_const;
+      else
+	copysign_insn = gen_copysigntf3_const;
+
+	emit_insn (copysign_insn (dest, op0, op1, mask));
+    }
+  else
+    {
+      rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
+
+      nmask = ix86_build_signbit_mask (vmode, 0, 1);
+      mask = ix86_build_signbit_mask (vmode, 0, 0);
+
+      if (mode == SFmode)
+	copysign_insn = gen_copysignsf3_var;
+      else if (mode == DFmode)
+	copysign_insn = gen_copysigndf3_var;
+      else
+	copysign_insn = gen_copysigntf3_var;
+
+      emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
+    }
+}
+
+/* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
+   be a constant, and so has already been expanded into a vector constant.  */
+
+void
+ix86_split_copysign_const (rtx operands[])
+{
+  enum machine_mode mode, vmode;
+  rtx dest, op0, mask, x;
+
+  dest = operands[0];
+  op0 = operands[1];
+  mask = operands[3];
+
+  mode = GET_MODE (dest);
+  vmode = GET_MODE (mask);
+
+  dest = simplify_gen_subreg (vmode, dest, mode, 0);
+  x = gen_rtx_AND (vmode, dest, mask);
+  emit_insn (gen_rtx_SET (VOIDmode, dest, x));
+
+  if (op0 != CONST0_RTX (vmode))
+    {
+      x = gen_rtx_IOR (vmode, dest, op0);
+      emit_insn (gen_rtx_SET (VOIDmode, dest, x));
+    }
+}
+
+/* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
+   so we have to do two masks.  */
+
+void
+ix86_split_copysign_var (rtx operands[])
+{
+  enum machine_mode mode, vmode;
+  rtx dest, scratch, op0, op1, mask, nmask, x;
+
+  dest = operands[0];
+  scratch = operands[1];
+  op0 = operands[2];
+  op1 = operands[3];
+  nmask = operands[4];
+  mask = operands[5];
+
+  mode = GET_MODE (dest);
+  vmode = GET_MODE (mask);
+
+  if (rtx_equal_p (op0, op1))
+    {
+      /* Shouldn't happen often (it's useless, obviously), but when it does
+	 we'd generate incorrect code if we continue below.  */
+      emit_move_insn (dest, op0);
+      return;
+    }
+
+  if (REG_P (mask) && REGNO (dest) == REGNO (mask))	/* alternative 0 */
+    {
+      gcc_assert (REGNO (op1) == REGNO (scratch));
+
+      x = gen_rtx_AND (vmode, scratch, mask);
+      emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
+
+      dest = mask;
+      op0 = simplify_gen_subreg (vmode, op0, mode, 0);
+      x = gen_rtx_NOT (vmode, dest);
+      x = gen_rtx_AND (vmode, x, op0);
+      emit_insn (gen_rtx_SET (VOIDmode, dest, x));
+    }
+  else
+    {
+      if (REGNO (op1) == REGNO (scratch))		/* alternative 1,3 */
+	{
+	  x = gen_rtx_AND (vmode, scratch, mask);
+	}
+      else						/* alternative 2,4 */
+	{
+          gcc_assert (REGNO (mask) == REGNO (scratch));
+          op1 = simplify_gen_subreg (vmode, op1, mode, 0);
+	  x = gen_rtx_AND (vmode, scratch, op1);
+	}
+      emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
+
+      if (REGNO (op0) == REGNO (dest))			/* alternative 1,2 */
+	{
+	  dest = simplify_gen_subreg (vmode, op0, mode, 0);
+	  x = gen_rtx_AND (vmode, dest, nmask);
+	}
+      else						/* alternative 3,4 */
+	{
+          gcc_assert (REGNO (nmask) == REGNO (dest));
+	  dest = nmask;
+	  op0 = simplify_gen_subreg (vmode, op0, mode, 0);
+	  x = gen_rtx_AND (vmode, dest, op0);
+	}
+      emit_insn (gen_rtx_SET (VOIDmode, dest, x));
+    }
+
+  x = gen_rtx_IOR (vmode, dest, scratch);
+  emit_insn (gen_rtx_SET (VOIDmode, dest, x));
+}
+
+/* Return TRUE or FALSE depending on whether the first SET in INSN
+   has source and destination with matching CC modes, and that the
+   CC mode is at least as constrained as REQ_MODE.  */
+
+bool
+ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
+{
+  rtx set;
+  enum machine_mode set_mode;
+
+  set = PATTERN (insn);
+  if (GET_CODE (set) == PARALLEL)
+    set = XVECEXP (set, 0, 0);
+  gcc_assert (GET_CODE (set) == SET);
+  gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
+
+  set_mode = GET_MODE (SET_DEST (set));
+  switch (set_mode)
+    {
+    case CCNOmode:
+      if (req_mode != CCNOmode
+	  && (req_mode != CCmode
+	      || XEXP (SET_SRC (set), 1) != const0_rtx))
+	return false;
+      break;
+    case CCmode:
+      if (req_mode == CCGCmode)
+	return false;
+      /* FALLTHRU */
+    case CCGCmode:
+      if (req_mode == CCGOCmode || req_mode == CCNOmode)
+	return false;
+      /* FALLTHRU */
+    case CCGOCmode:
+      if (req_mode == CCZmode)
+	return false;
+      /* FALLTHRU */
+    case CCZmode:
+      break;
+
+    case CCAmode:
+    case CCCmode:
+    case CCOmode:
+    case CCSmode:
+      if (set_mode != req_mode)
+	return false;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return GET_MODE (SET_SRC (set)) == set_mode;
+}
+
+/* Generate insn patterns to do an integer compare of OPERANDS.  */
+
+static rtx
+ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
+{
+  enum machine_mode cmpmode;
+  rtx tmp, flags;
+
+  cmpmode = SELECT_CC_MODE (code, op0, op1);
+  flags = gen_rtx_REG (cmpmode, FLAGS_REG);
+
+  /* This is very simple, but making the interface the same as in the
+     FP case makes the rest of the code easier.  */
+  tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
+  emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
+
+  /* Return the test that should be put into the flags user, i.e.
+     the bcc, scc, or cmov instruction.  */
+  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
+}
+
+/* Figure out whether to use ordered or unordered fp comparisons.
+   Return the appropriate mode to use.  */
+
+enum machine_mode
+ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
+{
+  /* ??? In order to make all comparisons reversible, we do all comparisons
+     non-trapping when compiling for IEEE.  Once gcc is able to distinguish
+     all forms trapping and nontrapping comparisons, we can make inequality
+     comparisons trapping again, since it results in better code when using
+     FCOM based compares.  */
+  return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
+}
+
+enum machine_mode
+ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+
+  if (SCALAR_FLOAT_MODE_P (mode))
+    {
+      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
+      return ix86_fp_compare_mode (code);
+    }
+
+  switch (code)
+    {
+      /* Only zero flag is needed.  */
+    case EQ:			/* ZF=0 */
+    case NE:			/* ZF!=0 */
+      return CCZmode;
+      /* Codes needing carry flag.  */
+    case GEU:			/* CF=0 */
+    case LTU:			/* CF=1 */
+      /* Detect overflow checks.  They need just the carry flag.  */
+      if (GET_CODE (op0) == PLUS
+	  && rtx_equal_p (op1, XEXP (op0, 0)))
+	return CCCmode;
+      else
+	return CCmode;
+    case GTU:			/* CF=0 & ZF=0 */
+    case LEU:			/* CF=1 | ZF=1 */
+      return CCmode;
+      /* Codes possibly doable only with sign flag when
+         comparing against zero.  */
+    case GE:			/* SF=OF   or   SF=0 */
+    case LT:			/* SF<>OF  or   SF=1 */
+      if (op1 == const0_rtx)
+	return CCGOCmode;
+      else
+	/* For other cases Carry flag is not required.  */
+	return CCGCmode;
+      /* Codes doable only with sign flag when comparing
+         against zero, but we miss jump instruction for it
+         so we need to use relational tests against overflow
+         that thus needs to be zero.  */
+    case GT:			/* ZF=0 & SF=OF */
+    case LE:			/* ZF=1 | SF<>OF */
+      if (op1 == const0_rtx)
+	return CCNOmode;
+      else
+	return CCGCmode;
+      /* strcmp pattern do (use flags) and combine may ask us for proper
+	 mode.  */
+    case USE:
+      return CCmode;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Return the fixed registers used for condition codes.  */
+
+static bool
+ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
+{
+  *p1 = FLAGS_REG;
+  *p2 = FPSR_REG;
+  return true;
+}
+
+/* If two condition code modes are compatible, return a condition code
+   mode which is compatible with both.  Otherwise, return
+   VOIDmode.  */
+
+static enum machine_mode
+ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
+{
+  if (m1 == m2)
+    return m1;
+
+  if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
+    return VOIDmode;
+
+  if ((m1 == CCGCmode && m2 == CCGOCmode)
+      || (m1 == CCGOCmode && m2 == CCGCmode))
+    return CCGCmode;
+
+  if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
+    return m2;
+  else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
+    return m1;
+
+  switch (m1)
+    {
+    default:
+      gcc_unreachable ();
+
+    case CCmode:
+    case CCGCmode:
+    case CCGOCmode:
+    case CCNOmode:
+    case CCAmode:
+    case CCCmode:
+    case CCOmode:
+    case CCSmode:
+    case CCZmode:
+      switch (m2)
+	{
+	default:
+	  return VOIDmode;
+
+	case CCmode:
+	case CCGCmode:
+	case CCGOCmode:
+	case CCNOmode:
+	case CCAmode:
+	case CCCmode:
+	case CCOmode:
+	case CCSmode:
+	case CCZmode:
+	  return CCmode;
+	}
+
+    case CCFPmode:
+    case CCFPUmode:
+      /* These are only compatible with themselves, which we already
+	 checked above.  */
+      return VOIDmode;
+    }
+}
+
+
+/* Return a comparison we can do and that it is equivalent to
+   swap_condition (code) apart possibly from orderedness.
+   But, never change orderedness if TARGET_IEEE_FP, returning
+   UNKNOWN in that case if necessary.  */
+
+static enum rtx_code
+ix86_fp_swap_condition (enum rtx_code code)
+{
+  switch (code)
+    {
+    case GT:                   /* GTU - CF=0 & ZF=0 */
+      return TARGET_IEEE_FP ? UNKNOWN : UNLT;
+    case GE:                   /* GEU - CF=0 */
+      return TARGET_IEEE_FP ? UNKNOWN : UNLE;
+    case UNLT:                 /* LTU - CF=1 */
+      return TARGET_IEEE_FP ? UNKNOWN : GT;
+    case UNLE:                 /* LEU - CF=1 | ZF=1 */
+      return TARGET_IEEE_FP ? UNKNOWN : GE;
+    default:
+      return swap_condition (code);
+    }
+}
+
+/* Return cost of comparison CODE using the best strategy for performance.
+   All following functions do use number of instructions as a cost metrics.
+   In future this should be tweaked to compute bytes for optimize_size and
+   take into account performance of various instructions on various CPUs.  */
+
+static int
+ix86_fp_comparison_cost (enum rtx_code code)
+{
+  int arith_cost;
+
+  /* The cost of code using bit-twiddling on %ah.  */
+  switch (code)
+    {
+    case UNLE:
+    case UNLT:
+    case LTGT:
+    case GT:
+    case GE:
+    case UNORDERED:
+    case ORDERED:
+    case UNEQ:
+      arith_cost = 4;
+      break;
+    case LT:
+    case NE:
+    case EQ:
+    case UNGE:
+      arith_cost = TARGET_IEEE_FP ? 5 : 4;
+      break;
+    case LE:
+    case UNGT:
+      arith_cost = TARGET_IEEE_FP ? 6 : 4;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  switch (ix86_fp_comparison_strategy (code))
+    {
+    case IX86_FPCMP_COMI:
+      return arith_cost > 4 ? 3 : 2;
+    case IX86_FPCMP_SAHF:
+      return arith_cost > 4 ? 4 : 3;
+    default:
+      return arith_cost;
+    }
+}
+
+/* Return strategy to use for floating-point.  We assume that fcomi is always
+   preferrable where available, since that is also true when looking at size
+   (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
+
+enum ix86_fpcmp_strategy
+ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
+{
+  /* Do fcomi/sahf based test when profitable.  */
+
+  if (TARGET_CMOVE)
+    return IX86_FPCMP_COMI;
+
+  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
+    return IX86_FPCMP_SAHF;
+
+  return IX86_FPCMP_ARITH;
+}
+
+/* Swap, force into registers, or otherwise massage the two operands
+   to a fp comparison.  The operands are updated in place; the new
+   comparison code is returned.  */
+
+static enum rtx_code
+ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
+{
+  enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
+  rtx op0 = *pop0, op1 = *pop1;
+  enum machine_mode op_mode = GET_MODE (op0);
+  int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
+
+  /* All of the unordered compare instructions only work on registers.
+     The same is true of the fcomi compare instructions.  The XFmode
+     compare instructions require registers except when comparing
+     against zero or when converting operand 1 from fixed point to
+     floating point.  */
+
+  if (!is_sse
+      && (fpcmp_mode == CCFPUmode
+	  || (op_mode == XFmode
+	      && ! (standard_80387_constant_p (op0) == 1
+		    || standard_80387_constant_p (op1) == 1)
+	      && GET_CODE (op1) != FLOAT)
+	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
+    {
+      op0 = force_reg (op_mode, op0);
+      op1 = force_reg (op_mode, op1);
+    }
+  else
+    {
+      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
+	 things around if they appear profitable, otherwise force op0
+	 into a register.  */
+
+      if (standard_80387_constant_p (op0) == 0
+	  || (MEM_P (op0)
+	      && ! (standard_80387_constant_p (op1) == 0
+		    || MEM_P (op1))))
+	{
+	  enum rtx_code new_code = ix86_fp_swap_condition (code);
+	  if (new_code != UNKNOWN)
+	    {
+	      rtx tmp;
+	      tmp = op0, op0 = op1, op1 = tmp;
+	      code = new_code;
+	    }
+	}
+
+      if (!REG_P (op0))
+	op0 = force_reg (op_mode, op0);
+
+      if (CONSTANT_P (op1))
+	{
+	  int tmp = standard_80387_constant_p (op1);
+	  if (tmp == 0)
+	    op1 = validize_mem (force_const_mem (op_mode, op1));
+	  else if (tmp == 1)
+	    {
+	      if (TARGET_CMOVE)
+		op1 = force_reg (op_mode, op1);
+	    }
+	  else
+	    op1 = force_reg (op_mode, op1);
+	}
+    }
+
+  /* Try to rearrange the comparison to make it cheaper.  */
+  if (ix86_fp_comparison_cost (code)
+      > ix86_fp_comparison_cost (swap_condition (code))
+      && (REG_P (op1) || can_create_pseudo_p ()))
+    {
+      rtx tmp;
+      tmp = op0, op0 = op1, op1 = tmp;
+      code = swap_condition (code);
+      if (!REG_P (op0))
+	op0 = force_reg (op_mode, op0);
+    }
+
+  *pop0 = op0;
+  *pop1 = op1;
+  return code;
+}
+
+/* Convert comparison codes we use to represent FP comparison to integer
+   code that will result in proper branch.  Return UNKNOWN if no such code
+   is available.  */
+
+enum rtx_code
+ix86_fp_compare_code_to_integer (enum rtx_code code)
+{
+  switch (code)
+    {
+    case GT:
+      return GTU;
+    case GE:
+      return GEU;
+    case ORDERED:
+    case UNORDERED:
+      return code;
+      break;
+    case UNEQ:
+      return EQ;
+      break;
+    case UNLT:
+      return LTU;
+      break;
+    case UNLE:
+      return LEU;
+      break;
+    case LTGT:
+      return NE;
+      break;
+    default:
+      return UNKNOWN;
+    }
+}
+
+/* Generate insn patterns to do a floating point compare of OPERANDS.  */
+
+static rtx
+ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
+{
+  enum machine_mode fpcmp_mode, intcmp_mode;
+  rtx tmp, tmp2;
+
+  fpcmp_mode = ix86_fp_compare_mode (code);
+  code = ix86_prepare_fp_compare_args (code, &op0, &op1);
+
+  /* Do fcomi/sahf based test when profitable.  */
+  switch (ix86_fp_comparison_strategy (code))
+    {
+    case IX86_FPCMP_COMI:
+      intcmp_mode = fpcmp_mode;
+      tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
+      tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
+			 tmp);
+      emit_insn (tmp);
+      break;
+
+    case IX86_FPCMP_SAHF:
+      intcmp_mode = fpcmp_mode;
+      tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
+      tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
+			 tmp);
+
+      if (!scratch)
+	scratch = gen_reg_rtx (HImode);
+      tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
+      break;
+
+    case IX86_FPCMP_ARITH:
+      /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first.  */
+      tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
+      tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
+      if (!scratch)
+	scratch = gen_reg_rtx (HImode);
+      emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
+
+      /* In the unordered case, we have to check C2 for NaN's, which
+	 doesn't happen to work out to anything nice combination-wise.
+	 So do some bit twiddling on the value we've got in AH to come
+	 up with an appropriate set of condition codes.  */
+
+      intcmp_mode = CCNOmode;
+      switch (code)
+	{
+	case GT:
+	case UNGT:
+	  if (code == GT || !TARGET_IEEE_FP)
+	    {
+	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
+	      code = EQ;
+	    }
+	  else
+	    {
+	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
+	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
+	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
+	      intcmp_mode = CCmode;
+	      code = GEU;
+	    }
+	  break;
+	case LT:
+	case UNLT:
+	  if (code == LT && TARGET_IEEE_FP)
+	    {
+	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
+	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
+	      intcmp_mode = CCmode;
+	      code = EQ;
+	    }
+	  else
+	    {
+	      emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
+	      code = NE;
+	    }
+	  break;
+	case GE:
+	case UNGE:
+	  if (code == GE || !TARGET_IEEE_FP)
+	    {
+	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
+	      code = EQ;
+	    }
+	  else
+	    {
+	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
+	      emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
+	      code = NE;
+	    }
+	  break;
+	case LE:
+	case UNLE:
+	  if (code == LE && TARGET_IEEE_FP)
+	    {
+	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
+	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
+	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
+	      intcmp_mode = CCmode;
+	      code = LTU;
+	    }
+	  else
+	    {
+	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
+	      code = NE;
+	    }
+	  break;
+	case EQ:
+	case UNEQ:
+	  if (code == EQ && TARGET_IEEE_FP)
+	    {
+	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
+	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
+	      intcmp_mode = CCmode;
+	      code = EQ;
+	    }
+	  else
+	    {
+	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
+	      code = NE;
+	    }
+	  break;
+	case NE:
+	case LTGT:
+	  if (code == NE && TARGET_IEEE_FP)
+	    {
+	      emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
+	      emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
+					     GEN_INT (0x40)));
+	      code = NE;
+	    }
+	  else
+	    {
+	      emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
+	      code = EQ;
+	    }
+	  break;
+
+	case UNORDERED:
+	  emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
+	  code = NE;
+	  break;
+	case ORDERED:
+	  emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
+	  code = EQ;
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+	break;
+
+    default:
+      gcc_unreachable();
+    }
+
+  /* Return the test that should be put into the flags user, i.e.
+     the bcc, scc, or cmov instruction.  */
+  return gen_rtx_fmt_ee (code, VOIDmode,
+			 gen_rtx_REG (intcmp_mode, FLAGS_REG),
+			 const0_rtx);
+}
+
+static rtx
+ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
+{
+  rtx ret;
+
+  if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
+    ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
+
+  else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
+    {
+      gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
+      ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
+    }
+  else
+    ret = ix86_expand_int_compare (code, op0, op1);
+
+  return ret;
+}
+
+void
+ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx tmp;
+
+  switch (mode)
+    {
+    case SFmode:
+    case DFmode:
+    case XFmode:
+    case QImode:
+    case HImode:
+    case SImode:
+      simple:
+      tmp = ix86_expand_compare (code, op0, op1);
+      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+				  gen_rtx_LABEL_REF (VOIDmode, label),
+				  pc_rtx);
+      emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+      return;
+
+    case DImode:
+      if (TARGET_64BIT)
+	goto simple;
+    case TImode:
+      /* Expand DImode branch into multiple compare+branch.  */
+      {
+	rtx lo[2], hi[2], label2;
+	enum rtx_code code1, code2, code3;
+	enum machine_mode submode;
+
+	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
+	  {
+	    tmp = op0, op0 = op1, op1 = tmp;
+	    code = swap_condition (code);
+	  }
+
+	split_double_mode (mode, &op0, 1, lo+0, hi+0);
+	split_double_mode (mode, &op1, 1, lo+1, hi+1);
+
+	submode = mode == DImode ? SImode : DImode;
+
+	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
+	   avoid two branches.  This costs one extra insn, so disable when
+	   optimizing for size.  */
+
+	if ((code == EQ || code == NE)
+	    && (!optimize_insn_for_size_p ()
+	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
+	  {
+	    rtx xor0, xor1;
+
+	    xor1 = hi[0];
+	    if (hi[1] != const0_rtx)
+	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
+				   NULL_RTX, 0, OPTAB_WIDEN);
+
+	    xor0 = lo[0];
+	    if (lo[1] != const0_rtx)
+	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
+				   NULL_RTX, 0, OPTAB_WIDEN);
+
+	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
+				NULL_RTX, 0, OPTAB_WIDEN);
+
+	    ix86_expand_branch (code, tmp, const0_rtx, label);
+	    return;
+	  }
+
+	/* Otherwise, if we are doing less-than or greater-or-equal-than,
+	   op1 is a constant and the low word is zero, then we can just
+	   examine the high word.  Similarly for low word -1 and
+	   less-or-equal-than or greater-than.  */
+
+	if (CONST_INT_P (hi[1]))
+	  switch (code)
+	    {
+	    case LT: case LTU: case GE: case GEU:
+	      if (lo[1] == const0_rtx)
+		{
+		  ix86_expand_branch (code, hi[0], hi[1], label);
+		  return;
+		}
+	      break;
+	    case LE: case LEU: case GT: case GTU:
+	      if (lo[1] == constm1_rtx)
+		{
+		  ix86_expand_branch (code, hi[0], hi[1], label);
+		  return;
+		}
+	      break;
+	    default:
+	      break;
+	    }
+
+	/* Otherwise, we need two or three jumps.  */
+
+	label2 = gen_label_rtx ();
+
+	code1 = code;
+	code2 = swap_condition (code);
+	code3 = unsigned_condition (code);
+
+	switch (code)
+	  {
+	  case LT: case GT: case LTU: case GTU:
+	    break;
+
+	  case LE:   code1 = LT;  code2 = GT;  break;
+	  case GE:   code1 = GT;  code2 = LT;  break;
+	  case LEU:  code1 = LTU; code2 = GTU; break;
+	  case GEU:  code1 = GTU; code2 = LTU; break;
+
+	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
+	  case NE:   code2 = UNKNOWN; break;
+
+	  default:
+	    gcc_unreachable ();
+	  }
+
+	/*
+	 * a < b =>
+	 *    if (hi(a) < hi(b)) goto true;
+	 *    if (hi(a) > hi(b)) goto false;
+	 *    if (lo(a) < lo(b)) goto true;
+	 *  false:
+	 */
+
+	if (code1 != UNKNOWN)
+	  ix86_expand_branch (code1, hi[0], hi[1], label);
+	if (code2 != UNKNOWN)
+	  ix86_expand_branch (code2, hi[0], hi[1], label2);
+
+	ix86_expand_branch (code3, lo[0], lo[1], label);
+
+	if (code2 != UNKNOWN)
+	  emit_label (label2);
+	return;
+      }
+
+    default:
+      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
+      goto simple;
+    }
+}
+
+/* Split branch based on floating point condition.  */
+void
+ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
+		      rtx target1, rtx target2, rtx tmp)
+{
+  rtx condition;
+  rtx i;
+
+  if (target2 != pc_rtx)
+    {
+      rtx tmp = target2;
+      code = reverse_condition_maybe_unordered (code);
+      target2 = target1;
+      target1 = tmp;
+    }
+
+  condition = ix86_expand_fp_compare (code, op1, op2,
+				      tmp);
+
+  i = emit_jump_insn (gen_rtx_SET
+		      (VOIDmode, pc_rtx,
+		       gen_rtx_IF_THEN_ELSE (VOIDmode,
+					     condition, target1, target2)));
+  if (split_branch_probability >= 0)
+    add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
+}
+
+void
+ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
+{
+  rtx ret;
+
+  gcc_assert (GET_MODE (dest) == QImode);
+
+  ret = ix86_expand_compare (code, op0, op1);
+  PUT_MODE (ret, QImode);
+  emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
+}
+
+/* Expand comparison setting or clearing carry flag.  Return true when
+   successful and set pop for the operation.  */
+static bool
+ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
+{
+  enum machine_mode mode =
+    GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
+
+  /* Do not handle double-mode compares that go through special path.  */
+  if (mode == (TARGET_64BIT ? TImode : DImode))
+    return false;
+
+  if (SCALAR_FLOAT_MODE_P (mode))
+    {
+      rtx compare_op, compare_seq;
+
+      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
+
+      /* Shortcut:  following common codes never translate
+	 into carry flag compares.  */
+      if (code == EQ || code == NE || code == UNEQ || code == LTGT
+	  || code == ORDERED || code == UNORDERED)
+	return false;
+
+      /* These comparisons require zero flag; swap operands so they won't.  */
+      if ((code == GT || code == UNLE || code == LE || code == UNGT)
+	  && !TARGET_IEEE_FP)
+	{
+	  rtx tmp = op0;
+	  op0 = op1;
+	  op1 = tmp;
+	  code = swap_condition (code);
+	}
+
+      /* Try to expand the comparison and verify that we end up with
+	 carry flag based comparison.  This fails to be true only when
+	 we decide to expand comparison using arithmetic that is not
+	 too common scenario.  */
+      start_sequence ();
+      compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
+      compare_seq = get_insns ();
+      end_sequence ();
+
+      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
+	  || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
+        code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
+      else
+	code = GET_CODE (compare_op);
+
+      if (code != LTU && code != GEU)
+	return false;
+
+      emit_insn (compare_seq);
+      *pop = compare_op;
+      return true;
+    }
+
+  if (!INTEGRAL_MODE_P (mode))
+    return false;
+
+  switch (code)
+    {
+    case LTU:
+    case GEU:
+      break;
+
+    /* Convert a==0 into (unsigned)a<1.  */
+    case EQ:
+    case NE:
+      if (op1 != const0_rtx)
+	return false;
+      op1 = const1_rtx;
+      code = (code == EQ ? LTU : GEU);
+      break;
+
+    /* Convert a>b into b<a or a>=b-1.  */
+    case GTU:
+    case LEU:
+      if (CONST_INT_P (op1))
+	{
+	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
+	  /* Bail out on overflow.  We still can swap operands but that
+	     would force loading of the constant into register.  */
+	  if (op1 == const0_rtx
+	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
+	    return false;
+	  code = (code == GTU ? GEU : LTU);
+	}
+      else
+	{
+	  rtx tmp = op1;
+	  op1 = op0;
+	  op0 = tmp;
+	  code = (code == GTU ? LTU : GEU);
+	}
+      break;
+
+    /* Convert a>=0 into (unsigned)a<0x80000000.  */
+    case LT:
+    case GE:
+      if (mode == DImode || op1 != const0_rtx)
+	return false;
+      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
+      code = (code == LT ? GEU : LTU);
+      break;
+    case LE:
+    case GT:
+      if (mode == DImode || op1 != constm1_rtx)
+	return false;
+      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
+      code = (code == LE ? GEU : LTU);
+      break;
+
+    default:
+      return false;
+    }
+  /* Swapping operands may cause constant to appear as first operand.  */
+  if (!nonimmediate_operand (op0, VOIDmode))
+    {
+      if (!can_create_pseudo_p ())
+	return false;
+      op0 = force_reg (mode, op0);
+    }
+  *pop = ix86_expand_compare (code, op0, op1);
+  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
+  return true;
+}
+
+bool
+ix86_expand_int_movcc (rtx operands[])
+{
+  enum rtx_code code = GET_CODE (operands[1]), compare_code;
+  rtx compare_seq, compare_op;
+  enum machine_mode mode = GET_MODE (operands[0]);
+  bool sign_bit_compare_p = false;
+  rtx op0 = XEXP (operands[1], 0);
+  rtx op1 = XEXP (operands[1], 1);
+
+  if (GET_MODE (op0) == TImode
+      || (GET_MODE (op0) == DImode
+	  && !TARGET_64BIT))
+    return false;
+
+  start_sequence ();
+  compare_op = ix86_expand_compare (code, op0, op1);
+  compare_seq = get_insns ();
+  end_sequence ();
+
+  compare_code = GET_CODE (compare_op);
+
+  if ((op1 == const0_rtx && (code == GE || code == LT))
+      || (op1 == constm1_rtx && (code == GT || code == LE)))
+    sign_bit_compare_p = true;
+
+  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
+     HImode insns, we'd be swallowed in word prefix ops.  */
+
+  if ((mode != HImode || TARGET_FAST_PREFIX)
+      && (mode != (TARGET_64BIT ? TImode : DImode))
+      && CONST_INT_P (operands[2])
+      && CONST_INT_P (operands[3]))
+    {
+      rtx out = operands[0];
+      HOST_WIDE_INT ct = INTVAL (operands[2]);
+      HOST_WIDE_INT cf = INTVAL (operands[3]);
+      HOST_WIDE_INT diff;
+
+      diff = ct - cf;
+      /*  Sign bit compares are better done using shifts than we do by using
+	  sbb.  */
+      if (sign_bit_compare_p
+	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
+	{
+	  /* Detect overlap between destination and compare sources.  */
+	  rtx tmp = out;
+
+          if (!sign_bit_compare_p)
+	    {
+	      rtx flags;
+	      bool fpcmp = false;
+
+	      compare_code = GET_CODE (compare_op);
+
+	      flags = XEXP (compare_op, 0);
+
+	      if (GET_MODE (flags) == CCFPmode
+		  || GET_MODE (flags) == CCFPUmode)
+		{
+		  fpcmp = true;
+		  compare_code
+		    = ix86_fp_compare_code_to_integer (compare_code);
+		}
+
+	      /* To simplify rest of code, restrict to the GEU case.  */
+	      if (compare_code == LTU)
+		{
+		  HOST_WIDE_INT tmp = ct;
+		  ct = cf;
+		  cf = tmp;
+		  compare_code = reverse_condition (compare_code);
+		  code = reverse_condition (code);
+		}
+	      else
+		{
+		  if (fpcmp)
+		    PUT_CODE (compare_op,
+			      reverse_condition_maybe_unordered
+			        (GET_CODE (compare_op)));
+		  else
+		    PUT_CODE (compare_op,
+			      reverse_condition (GET_CODE (compare_op)));
+		}
+	      diff = ct - cf;
+
+	      if (reg_overlap_mentioned_p (out, op0)
+		  || reg_overlap_mentioned_p (out, op1))
+		tmp = gen_reg_rtx (mode);
+
+	      if (mode == DImode)
+		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
+	      else
+		emit_insn (gen_x86_movsicc_0_m1	(gen_lowpart (SImode, tmp),
+						 flags, compare_op));
+	    }
+	  else
+	    {
+	      if (code == GT || code == GE)
+		code = reverse_condition (code);
+	      else
+		{
+		  HOST_WIDE_INT tmp = ct;
+		  ct = cf;
+		  cf = tmp;
+		  diff = ct - cf;
+		}
+	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
+	    }
+
+	  if (diff == 1)
+	    {
+	      /*
+	       * cmpl op0,op1
+	       * sbbl dest,dest
+	       * [addl dest, ct]
+	       *
+	       * Size 5 - 8.
+	       */
+	      if (ct)
+		tmp = expand_simple_binop (mode, PLUS,
+					   tmp, GEN_INT (ct),
+					   copy_rtx (tmp), 1, OPTAB_DIRECT);
+	    }
+	  else if (cf == -1)
+	    {
+	      /*
+	       * cmpl op0,op1
+	       * sbbl dest,dest
+	       * orl $ct, dest
+	       *
+	       * Size 8.
+	       */
+	      tmp = expand_simple_binop (mode, IOR,
+					 tmp, GEN_INT (ct),
+					 copy_rtx (tmp), 1, OPTAB_DIRECT);
+	    }
+	  else if (diff == -1 && ct)
+	    {
+	      /*
+	       * cmpl op0,op1
+	       * sbbl dest,dest
+	       * notl dest
+	       * [addl dest, cf]
+	       *
+	       * Size 8 - 11.
+	       */
+	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
+	      if (cf)
+		tmp = expand_simple_binop (mode, PLUS,
+					   copy_rtx (tmp), GEN_INT (cf),
+					   copy_rtx (tmp), 1, OPTAB_DIRECT);
+	    }
+	  else
+	    {
+	      /*
+	       * cmpl op0,op1
+	       * sbbl dest,dest
+	       * [notl dest]
+	       * andl cf - ct, dest
+	       * [addl dest, ct]
+	       *
+	       * Size 8 - 11.
+	       */
+
+	      if (cf == 0)
+		{
+		  cf = ct;
+		  ct = 0;
+		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
+		}
+
+	      tmp = expand_simple_binop (mode, AND,
+					 copy_rtx (tmp),
+					 gen_int_mode (cf - ct, mode),
+					 copy_rtx (tmp), 1, OPTAB_DIRECT);
+	      if (ct)
+		tmp = expand_simple_binop (mode, PLUS,
+					   copy_rtx (tmp), GEN_INT (ct),
+					   copy_rtx (tmp), 1, OPTAB_DIRECT);
+	    }
+
+	  if (!rtx_equal_p (tmp, out))
+	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));
+
+	  return true;
+	}
+
+      if (diff < 0)
+	{
+	  enum machine_mode cmp_mode = GET_MODE (op0);
+
+	  HOST_WIDE_INT tmp;
+	  tmp = ct, ct = cf, cf = tmp;
+	  diff = -diff;
+
+	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
+	    {
+	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
+
+	      /* We may be reversing unordered compare to normal compare, that
+		 is not valid in general (we may convert non-trapping condition
+		 to trapping one), however on i386 we currently emit all
+		 comparisons unordered.  */
+	      compare_code = reverse_condition_maybe_unordered (compare_code);
+	      code = reverse_condition_maybe_unordered (code);
+	    }
+	  else
+	    {
+	      compare_code = reverse_condition (compare_code);
+	      code = reverse_condition (code);
+	    }
+	}
+
+      compare_code = UNKNOWN;
+      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
+	  && CONST_INT_P (op1))
+	{
+	  if (op1 == const0_rtx
+	      && (code == LT || code == GE))
+	    compare_code = code;
+	  else if (op1 == constm1_rtx)
+	    {
+	      if (code == LE)
+		compare_code = LT;
+	      else if (code == GT)
+		compare_code = GE;
+	    }
+	}
+
+      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
+      if (compare_code != UNKNOWN
+	  && GET_MODE (op0) == GET_MODE (out)
+	  && (cf == -1 || ct == -1))
+	{
+	  /* If lea code below could be used, only optimize
+	     if it results in a 2 insn sequence.  */
+
+	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
+		 || diff == 3 || diff == 5 || diff == 9)
+	      || (compare_code == LT && ct == -1)
+	      || (compare_code == GE && cf == -1))
+	    {
+	      /*
+	       * notl op1	(if necessary)
+	       * sarl $31, op1
+	       * orl cf, op1
+	       */
+	      if (ct != -1)
+		{
+		  cf = ct;
+		  ct = -1;
+		  code = reverse_condition (code);
+		}
+
+	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
+
+	      out = expand_simple_binop (mode, IOR,
+					 out, GEN_INT (cf),
+					 out, 1, OPTAB_DIRECT);
+	      if (out != operands[0])
+		emit_move_insn (operands[0], out);
+
+	      return true;
+	    }
+	}
+
+
+      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
+	   || diff == 3 || diff == 5 || diff == 9)
+	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
+	  && (mode != DImode
+	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
+	{
+	  /*
+	   * xorl dest,dest
+	   * cmpl op1,op2
+	   * setcc dest
+	   * lea cf(dest*(ct-cf)),dest
+	   *
+	   * Size 14.
+	   *
+	   * This also catches the degenerate setcc-only case.
+	   */
+
+	  rtx tmp;
+	  int nops;
+
+	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
+
+	  nops = 0;
+	  /* On x86_64 the lea instruction operates on Pmode, so we need
+	     to get arithmetics done in proper mode to match.  */
+	  if (diff == 1)
+	    tmp = copy_rtx (out);
+	  else
+	    {
+	      rtx out1;
+	      out1 = copy_rtx (out);
+	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
+	      nops++;
+	      if (diff & 1)
+		{
+		  tmp = gen_rtx_PLUS (mode, tmp, out1);
+		  nops++;
+		}
+	    }
+	  if (cf != 0)
+	    {
+	      tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
+	      nops++;
+	    }
+	  if (!rtx_equal_p (tmp, out))
+	    {
+	      if (nops == 1)
+		out = force_operand (tmp, copy_rtx (out));
+	      else
+		emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
+	    }
+	  if (!rtx_equal_p (out, operands[0]))
+	    emit_move_insn (operands[0], copy_rtx (out));
+
+	  return true;
+	}
+
+      /*
+       * General case:			Jumpful:
+       *   xorl dest,dest		cmpl op1, op2
+       *   cmpl op1, op2		movl ct, dest
+       *   setcc dest			jcc 1f
+       *   decl dest			movl cf, dest
+       *   andl (cf-ct),dest		1:
+       *   addl ct,dest
+       *
+       * Size 20.			Size 14.
+       *
+       * This is reasonably steep, but branch mispredict costs are
+       * high on modern cpus, so consider failing only if optimizing
+       * for space.
+       */
+
+      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
+	  && BRANCH_COST (optimize_insn_for_speed_p (),
+		  	  false) >= 2)
+	{
+	  if (cf == 0)
+	    {
+	      enum machine_mode cmp_mode = GET_MODE (op0);
+
+	      cf = ct;
+	      ct = 0;
+
+	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
+		{
+		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
+
+		  /* We may be reversing unordered compare to normal compare,
+		     that is not valid in general (we may convert non-trapping
+		     condition to trapping one), however on i386 we currently
+		     emit all comparisons unordered.  */
+		  code = reverse_condition_maybe_unordered (code);
+		}
+	      else
+		{
+		  code = reverse_condition (code);
+		  if (compare_code != UNKNOWN)
+		    compare_code = reverse_condition (compare_code);
+		}
+	    }
+
+	  if (compare_code != UNKNOWN)
+	    {
+	      /* notl op1	(if needed)
+		 sarl $31, op1
+		 andl (cf-ct), op1
+		 addl ct, op1
+
+		 For x < 0 (resp. x <= -1) there will be no notl,
+		 so if possible swap the constants to get rid of the
+		 complement.
+		 True/false will be -1/0 while code below (store flag
+		 followed by decrement) is 0/-1, so the constants need
+		 to be exchanged once more.  */
+
+	      if (compare_code == GE || !cf)
+		{
+		  code = reverse_condition (code);
+		  compare_code = LT;
+		}
+	      else
+		{
+		  HOST_WIDE_INT tmp = cf;
+		  cf = ct;
+		  ct = tmp;
+		}
+
+	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
+	    }
+	  else
+	    {
+	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
+
+	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
+					 constm1_rtx,
+					 copy_rtx (out), 1, OPTAB_DIRECT);
+	    }
+
+	  out = expand_simple_binop (mode, AND, copy_rtx (out),
+				     gen_int_mode (cf - ct, mode),
+				     copy_rtx (out), 1, OPTAB_DIRECT);
+	  if (ct)
+	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
+				       copy_rtx (out), 1, OPTAB_DIRECT);
+	  if (!rtx_equal_p (out, operands[0]))
+	    emit_move_insn (operands[0], copy_rtx (out));
+
+	  return true;
+	}
+    }
+
+  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
+    {
+      /* Try a few things more with specific constants and a variable.  */
+
+      optab op;
+      rtx var, orig_out, out, tmp;
+
+      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
+	return false;
+
+      /* If one of the two operands is an interesting constant, load a
+	 constant with the above and mask it in with a logical operation.  */
+
+      if (CONST_INT_P (operands[2]))
+	{
+	  var = operands[3];
+	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
+	    operands[3] = constm1_rtx, op = and_optab;
+	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
+	    operands[3] = const0_rtx, op = ior_optab;
+	  else
+	    return false;
+	}
+      else if (CONST_INT_P (operands[3]))
+	{
+	  var = operands[2];
+	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
+	    operands[2] = constm1_rtx, op = and_optab;
+	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
+	    operands[2] = const0_rtx, op = ior_optab;
+	  else
+	    return false;
+	}
+      else
+        return false;
+
+      orig_out = operands[0];
+      tmp = gen_reg_rtx (mode);
+      operands[0] = tmp;
+
+      /* Recurse to get the constant loaded.  */
+      if (ix86_expand_int_movcc (operands) == 0)
+        return false;
+
+      /* Mask in the interesting variable.  */
+      out = expand_binop (mode, op, var, tmp, orig_out, 0,
+			  OPTAB_WIDEN);
+      if (!rtx_equal_p (out, orig_out))
+	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
+
+      return true;
+    }
+
+  /*
+   * For comparison with above,
+   *
+   * movl cf,dest
+   * movl ct,tmp
+   * cmpl op1,op2
+   * cmovcc tmp,dest
+   *
+   * Size 15.
+   */
+
+  if (! nonimmediate_operand (operands[2], mode))
+    operands[2] = force_reg (mode, operands[2]);
+  if (! nonimmediate_operand (operands[3], mode))
+    operands[3] = force_reg (mode, operands[3]);
+
+  if (! register_operand (operands[2], VOIDmode)
+      && (mode == QImode
+          || ! register_operand (operands[3], VOIDmode)))
+    operands[2] = force_reg (mode, operands[2]);
+
+  if (mode == QImode
+      && ! register_operand (operands[3], VOIDmode))
+    operands[3] = force_reg (mode, operands[3]);
+
+  emit_insn (compare_seq);
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_IF_THEN_ELSE (mode,
+						compare_op, operands[2],
+						operands[3])));
+  return true;
+}
+
+/* Swap, force into registers, or otherwise massage the two operands
+   to an sse comparison with a mask result.  Thus we differ a bit from
+   ix86_prepare_fp_compare_args which expects to produce a flags result.
+
+   The DEST operand exists to help determine whether to commute commutative
+   operators.  The POP0/POP1 operands are updated in place.  The new
+   comparison code is returned, or UNKNOWN if not implementable.  */
+
+static enum rtx_code
+ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
+				  rtx *pop0, rtx *pop1)
+{
+  rtx tmp;
+
+  switch (code)
+    {
+    case LTGT:
+    case UNEQ:
+      /* AVX supports all the needed comparisons.  */
+      if (TARGET_AVX)
+	break;
+      /* We have no LTGT as an operator.  We could implement it with
+	 NE & ORDERED, but this requires an extra temporary.  It's
+	 not clear that it's worth it.  */
+      return UNKNOWN;
+
+    case LT:
+    case LE:
+    case UNGT:
+    case UNGE:
+      /* These are supported directly.  */
+      break;
+
+    case EQ:
+    case NE:
+    case UNORDERED:
+    case ORDERED:
+      /* AVX has 3 operand comparisons, no need to swap anything.  */
+      if (TARGET_AVX)
+	break;
+      /* For commutative operators, try to canonicalize the destination
+	 operand to be first in the comparison - this helps reload to
+	 avoid extra moves.  */
+      if (!dest || !rtx_equal_p (dest, *pop1))
+	break;
+      /* FALLTHRU */
+
+    case GE:
+    case GT:
+    case UNLE:
+    case UNLT:
+      /* These are not supported directly before AVX, and furthermore
+	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
+	 comparison operands to transform into something that is
+	 supported.  */
+      tmp = *pop0;
+      *pop0 = *pop1;
+      *pop1 = tmp;
+      code = swap_condition (code);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return code;
+}
+
+/* Detect conditional moves that exactly match min/max operational
+   semantics.  Note that this is IEEE safe, as long as we don't
+   interchange the operands.
+
+   Returns FALSE if this conditional move doesn't match a MIN/MAX,
+   and TRUE if the operation is successful and instructions are emitted.  */
+
+static bool
+ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
+			   rtx cmp_op1, rtx if_true, rtx if_false)
+{
+  enum machine_mode mode;
+  bool is_min;
+  rtx tmp;
+
+  if (code == LT)
+    ;
+  else if (code == UNGE)
+    {
+      tmp = if_true;
+      if_true = if_false;
+      if_false = tmp;
+    }
+  else
+    return false;
+
+  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
+    is_min = true;
+  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
+    is_min = false;
+  else
+    return false;
+
+  mode = GET_MODE (dest);
+
+  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
+     but MODE may be a vector mode and thus not appropriate.  */
+  if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
+    {
+      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
+      rtvec v;
+
+      if_true = force_reg (mode, if_true);
+      v = gen_rtvec (2, if_true, if_false);
+      tmp = gen_rtx_UNSPEC (mode, v, u);
+    }
+  else
+    {
+      code = is_min ? SMIN : SMAX;
+      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
+    }
+
+  emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
+  return true;
+}
+
+/* Expand an sse vector comparison.  Return the register with the result.  */
+
+static rtx
+ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
+		     rtx op_true, rtx op_false)
+{
+  enum machine_mode mode = GET_MODE (dest);
+  enum machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
+
+  /* In general case result of comparison can differ from operands' type.  */
+  enum machine_mode cmp_mode;
+
+  /* In AVX512F the result of comparison is an integer mask.  */
+  bool maskcmp = false;
+  rtx x;
+
+  if (GET_MODE_SIZE (cmp_ops_mode) == 64)
+    {
+      cmp_mode = mode_for_size (GET_MODE_NUNITS (cmp_ops_mode), MODE_INT, 0);
+      gcc_assert (cmp_mode != BLKmode);
+
+      maskcmp = true;
+    }
+  else
+    cmp_mode = cmp_ops_mode;
+
+
+  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
+  if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
+    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
+
+  if (optimize
+      || reg_overlap_mentioned_p (dest, op_true)
+      || reg_overlap_mentioned_p (dest, op_false))
+    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
+
+  /* Compare patterns for int modes are unspec in AVX512F only.  */
+  if (maskcmp && (code == GT || code == EQ))
+    {
+      rtx (*gen)(rtx, rtx, rtx);
+
+      switch (cmp_ops_mode)
+	{
+	case V16SImode:
+	  gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
+	  break;
+	case V8DImode:
+	  gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
+	  break;
+	default:
+	  gen = NULL;
+	}
+
+      if (gen)
+	{
+	  emit_insn (gen (dest, cmp_op0, cmp_op1));
+	  return dest;
+	}
+    }
+  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
+
+  if (cmp_mode != mode && !maskcmp)
+    {
+      x = force_reg (cmp_ops_mode, x);
+      convert_move (dest, x, false);
+    }
+  else
+    emit_insn (gen_rtx_SET (VOIDmode, dest, x));
+
+  return dest;
+}
+
+/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
+   operations.  This is used for both scalar and vector conditional moves.  */
+
+static void
+ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
+{
+  enum machine_mode mode = GET_MODE (dest);
+  enum machine_mode cmpmode = GET_MODE (cmp);
+
+  /* In AVX512F the result of comparison is an integer mask.  */
+  bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
+
+  rtx t2, t3, x;
+
+  if (vector_all_ones_operand (op_true, mode)
+      && rtx_equal_p (op_false, CONST0_RTX (mode))
+      && !maskcmp)
+    {
+      emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
+    }
+  else if (op_false == CONST0_RTX (mode)
+      && !maskcmp)
+    {
+      op_true = force_reg (mode, op_true);
+      x = gen_rtx_AND (mode, cmp, op_true);
+      emit_insn (gen_rtx_SET (VOIDmode, dest, x));
+    }
+  else if (op_true == CONST0_RTX (mode)
+      && !maskcmp)
+    {
+      op_false = force_reg (mode, op_false);
+      x = gen_rtx_NOT (mode, cmp);
+      x = gen_rtx_AND (mode, x, op_false);
+      emit_insn (gen_rtx_SET (VOIDmode, dest, x));
+    }
+  else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
+      && !maskcmp)
+    {
+      op_false = force_reg (mode, op_false);
+      x = gen_rtx_IOR (mode, cmp, op_false);
+      emit_insn (gen_rtx_SET (VOIDmode, dest, x));
+    }
+  else if (TARGET_XOP
+      && !maskcmp)
+    {
+      op_true = force_reg (mode, op_true);
+
+      if (!nonimmediate_operand (op_false, mode))
+	op_false = force_reg (mode, op_false);
+
+      emit_insn (gen_rtx_SET (mode, dest,
+			      gen_rtx_IF_THEN_ELSE (mode, cmp,
+						    op_true,
+						    op_false)));
+    }
+  else
+    {
+      rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+      rtx d = dest;
+
+      if (!nonimmediate_operand (op_true, mode))
+	op_true = force_reg (mode, op_true);
+
+      op_false = force_reg (mode, op_false);
+
+      switch (mode)
+	{
+	case V4SFmode:
+	  if (TARGET_SSE4_1)
+	    gen = gen_sse4_1_blendvps;
+	  break;
+	case V2DFmode:
+	  if (TARGET_SSE4_1)
+	    gen = gen_sse4_1_blendvpd;
+	  break;
+	case V16QImode:
+	case V8HImode:
+	case V4SImode:
+	case V2DImode:
+	  if (TARGET_SSE4_1)
+	    {
+	      gen = gen_sse4_1_pblendvb;
+	      if (mode != V16QImode)
+		d = gen_reg_rtx (V16QImode);
+	      op_false = gen_lowpart (V16QImode, op_false);
+	      op_true = gen_lowpart (V16QImode, op_true);
+	      cmp = gen_lowpart (V16QImode, cmp);
+	    }
+	  break;
+	case V8SFmode:
+	  if (TARGET_AVX)
+	    gen = gen_avx_blendvps256;
+	  break;
+	case V4DFmode:
+	  if (TARGET_AVX)
+	    gen = gen_avx_blendvpd256;
+	  break;
+	case V32QImode:
+	case V16HImode:
+	case V8SImode:
+	case V4DImode:
+	  if (TARGET_AVX2)
+	    {
+	      gen = gen_avx2_pblendvb;
+	      if (mode != V32QImode)
+		d = gen_reg_rtx (V32QImode);
+	      op_false = gen_lowpart (V32QImode, op_false);
+	      op_true = gen_lowpart (V32QImode, op_true);
+	      cmp = gen_lowpart (V32QImode, cmp);
+	    }
+	  break;
+
+	case V16SImode:
+	  gen = gen_avx512f_blendmv16si;
+	  break;
+	case V8DImode:
+	  gen = gen_avx512f_blendmv8di;
+	  break;
+	case V8DFmode:
+	  gen = gen_avx512f_blendmv8df;
+	  break;
+	case V16SFmode:
+	  gen = gen_avx512f_blendmv16sf;
+	  break;
+
+	default:
+	  break;
+	}
+
+      if (gen != NULL)
+	{
+	  emit_insn (gen (d, op_false, op_true, cmp));
+	  if (d != dest)
+	    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+	}
+      else
+	{
+	  op_true = force_reg (mode, op_true);
+
+	  t2 = gen_reg_rtx (mode);
+	  if (optimize)
+	    t3 = gen_reg_rtx (mode);
+	  else
+	    t3 = dest;
+
+	  x = gen_rtx_AND (mode, op_true, cmp);
+	  emit_insn (gen_rtx_SET (VOIDmode, t2, x));
+
+	  x = gen_rtx_NOT (mode, cmp);
+	  x = gen_rtx_AND (mode, x, op_false);
+	  emit_insn (gen_rtx_SET (VOIDmode, t3, x));
+
+	  x = gen_rtx_IOR (mode, t3, t2);
+	  emit_insn (gen_rtx_SET (VOIDmode, dest, x));
+	}
+    }
+}
+
+/* Expand a floating-point conditional move.  Return true if successful.  */
+
+bool
+ix86_expand_fp_movcc (rtx operands[])
+{
+  enum machine_mode mode = GET_MODE (operands[0]);
+  enum rtx_code code = GET_CODE (operands[1]);
+  rtx tmp, compare_op;
+  rtx op0 = XEXP (operands[1], 0);
+  rtx op1 = XEXP (operands[1], 1);
+
+  if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
+    {
+      enum machine_mode cmode;
+
+      /* Since we've no cmove for sse registers, don't force bad register
+	 allocation just to gain access to it.  Deny movcc when the
+	 comparison mode doesn't match the move mode.  */
+      cmode = GET_MODE (op0);
+      if (cmode == VOIDmode)
+	cmode = GET_MODE (op1);
+      if (cmode != mode)
+	return false;
+
+      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
+      if (code == UNKNOWN)
+	return false;
+
+      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
+				     operands[2], operands[3]))
+	return true;
+
+      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
+				 operands[2], operands[3]);
+      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
+      return true;
+    }
+
+  if (GET_MODE (op0) == TImode
+      || (GET_MODE (op0) == DImode
+	  && !TARGET_64BIT))
+    return false;
+
+  /* The floating point conditional move instructions don't directly
+     support conditions resulting from a signed integer comparison.  */
+
+  compare_op = ix86_expand_compare (code, op0, op1);
+  if (!fcmov_comparison_operator (compare_op, VOIDmode))
+    {
+      tmp = gen_reg_rtx (QImode);
+      ix86_expand_setcc (tmp, code, op0, op1);
+
+      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
+    }
+
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
+						operands[2], operands[3])));
+
+  return true;
+}
+
+/* Expand a floating-point vector conditional move; a vcond operation
+   rather than a movcc operation.  */
+
+bool
+ix86_expand_fp_vcond (rtx operands[])
+{
+  enum rtx_code code = GET_CODE (operands[3]);
+  rtx cmp;
+
+  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
+					   &operands[4], &operands[5]);
+  if (code == UNKNOWN)
+    {
+      rtx temp;
+      switch (GET_CODE (operands[3]))
+	{
+	case LTGT:
+	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
+				      operands[5], operands[0], operands[0]);
+	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
+				     operands[5], operands[1], operands[2]);
+	  code = AND;
+	  break;
+	case UNEQ:
+	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
+				      operands[5], operands[0], operands[0]);
+	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
+				     operands[5], operands[1], operands[2]);
+	  code = IOR;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
+				 OPTAB_DIRECT);
+      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
+      return true;
+    }
+
+  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
+				 operands[5], operands[1], operands[2]))
+    return true;
+
+  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
+			     operands[1], operands[2]);
+  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
+  return true;
+}
+
+/* Expand a signed/unsigned integral vector conditional move.  */
+
+bool
+ix86_expand_int_vcond (rtx operands[])
+{
+  enum machine_mode data_mode = GET_MODE (operands[0]);
+  enum machine_mode mode = GET_MODE (operands[4]);
+  enum rtx_code code = GET_CODE (operands[3]);
+  bool negate = false;
+  rtx x, cop0, cop1;
+
+  cop0 = operands[4];
+  cop1 = operands[5];
+
+  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
+     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
+  if ((code == LT || code == GE)
+      && data_mode == mode
+      && cop1 == CONST0_RTX (mode)
+      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
+      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
+      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
+      && (GET_MODE_SIZE (data_mode) == 16
+	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
+    {
+      rtx negop = operands[2 - (code == LT)];
+      int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
+      if (negop == CONST1_RTX (data_mode))
+	{
+	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
+					 operands[0], 1, OPTAB_DIRECT);
+	  if (res != operands[0])
+	    emit_move_insn (operands[0], res);
+	  return true;
+	}
+      else if (GET_MODE_INNER (data_mode) != DImode
+	       && vector_all_ones_operand (negop, data_mode))
+	{
+	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
+					 operands[0], 0, OPTAB_DIRECT);
+	  if (res != operands[0])
+	    emit_move_insn (operands[0], res);
+	  return true;
+	}
+    }
+
+  if (!nonimmediate_operand (cop1, mode))
+    cop1 = force_reg (mode, cop1);
+  if (!general_operand (operands[1], data_mode))
+    operands[1] = force_reg (data_mode, operands[1]);
+  if (!general_operand (operands[2], data_mode))
+    operands[2] = force_reg (data_mode, operands[2]);
+
+  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
+  if (TARGET_XOP
+      && (mode == V16QImode || mode == V8HImode
+	  || mode == V4SImode || mode == V2DImode))
+    ;
+  else
+    {
+      /* Canonicalize the comparison to EQ, GT, GTU.  */
+      switch (code)
+	{
+	case EQ:
+	case GT:
+	case GTU:
+	  break;
+
+	case NE:
+	case LE:
+	case LEU:
+	  code = reverse_condition (code);
+	  negate = true;
+	  break;
+
+	case GE:
+	case GEU:
+	  code = reverse_condition (code);
+	  negate = true;
+	  /* FALLTHRU */
+
+	case LT:
+	case LTU:
+	  code = swap_condition (code);
+	  x = cop0, cop0 = cop1, cop1 = x;
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+
+      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
+      if (mode == V2DImode)
+	{
+	  switch (code)
+	    {
+	    case EQ:
+	      /* SSE4.1 supports EQ.  */
+	      if (!TARGET_SSE4_1)
+		return false;
+	      break;
+
+	    case GT:
+	    case GTU:
+	      /* SSE4.2 supports GT/GTU.  */
+	      if (!TARGET_SSE4_2)
+		return false;
+	      break;
+
+	    default:
+	      gcc_unreachable ();
+	    }
+	}
+
+      /* Unsigned parallel compare is not supported by the hardware.
+	 Play some tricks to turn this into a signed comparison
+	 against 0.  */
+      if (code == GTU)
+	{
+	  cop0 = force_reg (mode, cop0);
+
+	  switch (mode)
+	    {
+	    case V16SImode:
+	    case V8DImode:
+	    case V8SImode:
+	    case V4DImode:
+	    case V4SImode:
+	    case V2DImode:
+		{
+		  rtx t1, t2, mask;
+		  rtx (*gen_sub3) (rtx, rtx, rtx);
+
+		  switch (mode)
+		    {
+		    case V16SImode: gen_sub3 = gen_subv16si3; break;
+		    case V8DImode: gen_sub3 = gen_subv8di3; break;
+		    case V8SImode: gen_sub3 = gen_subv8si3; break;
+		    case V4DImode: gen_sub3 = gen_subv4di3; break;
+		    case V4SImode: gen_sub3 = gen_subv4si3; break;
+		    case V2DImode: gen_sub3 = gen_subv2di3; break;
+		    default:
+		      gcc_unreachable ();
+		    }
+		  /* Subtract (-(INT MAX) - 1) from both operands to make
+		     them signed.  */
+		  mask = ix86_build_signbit_mask (mode, true, false);
+		  t1 = gen_reg_rtx (mode);
+		  emit_insn (gen_sub3 (t1, cop0, mask));
+
+		  t2 = gen_reg_rtx (mode);
+		  emit_insn (gen_sub3 (t2, cop1, mask));
+
+		  cop0 = t1;
+		  cop1 = t2;
+		  code = GT;
+		}
+	      break;
+
+	    case V32QImode:
+	    case V16HImode:
+	    case V16QImode:
+	    case V8HImode:
+	      /* Perform a parallel unsigned saturating subtraction.  */
+	      x = gen_reg_rtx (mode);
+	      emit_insn (gen_rtx_SET (VOIDmode, x,
+				      gen_rtx_US_MINUS (mode, cop0, cop1)));
+
+	      cop0 = x;
+	      cop1 = CONST0_RTX (mode);
+	      code = EQ;
+	      negate = !negate;
+	      break;
+
+	    default:
+	      gcc_unreachable ();
+	    }
+	}
+    }
+
+  /* Allow the comparison to be done in one mode, but the movcc to
+     happen in another mode.  */
+  if (data_mode == mode)
+    {
+      x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
+			       operands[1+negate], operands[2-negate]);
+    }
+  else
+    {
+      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
+      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
+			       operands[1+negate], operands[2-negate]);
+      if (GET_MODE (x) == mode)
+	x = gen_lowpart (data_mode, x);
+    }
+
+  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
+			 operands[2-negate]);
+  return true;
+}
+
+static bool
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  switch (mode)
+    {
+    case V16SImode:
+      emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
+					      force_reg (V16SImode, mask),
+					      op1));
+      return true;
+    case V16SFmode:
+      emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
+					      force_reg (V16SImode, mask),
+					      op1));
+      return true;
+    case V8DImode:
+      emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
+					     force_reg (V8DImode, mask), op1));
+      return true;
+    case V8DFmode:
+      emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
+					     force_reg (V8DImode, mask), op1));
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Expand a variable vector permutation.  */
+
+void
+ix86_expand_vec_perm (rtx operands[])
+{
+  rtx target = operands[0];
+  rtx op0 = operands[1];
+  rtx op1 = operands[2];
+  rtx mask = operands[3];
+  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
+  enum machine_mode mode = GET_MODE (op0);
+  enum machine_mode maskmode = GET_MODE (mask);
+  int w, e, i;
+  bool one_operand_shuffle = rtx_equal_p (op0, op1);
+
+  /* Number of elements in the vector.  */
+  w = GET_MODE_NUNITS (mode);
+  e = GET_MODE_UNIT_SIZE (mode);
+  gcc_assert (w <= 64);
+
+  if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1))
+    return;
+
+  if (TARGET_AVX2)
+    {
+      if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
+	{
+	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
+	     an constant shuffle operand.  With a tiny bit of effort we can
+	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
+	     unfortunate but there's no avoiding it.
+	     Similarly for V16HImode we don't have instructions for variable
+	     shuffling, while for V32QImode we can use after preparing suitable
+	     masks vpshufb; vpshufb; vpermq; vpor.  */
+
+	  if (mode == V16HImode)
+	    {
+	      maskmode = mode = V32QImode;
+	      w = 32;
+	      e = 1;
+	    }
+	  else
+	    {
+	      maskmode = mode = V8SImode;
+	      w = 8;
+	      e = 4;
+	    }
+	  t1 = gen_reg_rtx (maskmode);
+
+	  /* Replicate the low bits of the V4DImode mask into V8SImode:
+	       mask = { A B C D }
+	       t1 = { A A B B C C D D }.  */
+	  for (i = 0; i < w / 2; ++i)
+	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
+	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+	  vt = force_reg (maskmode, vt);
+	  mask = gen_lowpart (maskmode, mask);
+	  if (maskmode == V8SImode)
+	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
+	  else
+	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
+
+	  /* Multiply the shuffle indicies by two.  */
+	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
+				    OPTAB_DIRECT);
+
+	  /* Add one to the odd shuffle indicies:
+		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
+	  for (i = 0; i < w / 2; ++i)
+	    {
+	      vec[i * 2] = const0_rtx;
+	      vec[i * 2 + 1] = const1_rtx;
+	    }
+	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+	  vt = validize_mem (force_const_mem (maskmode, vt));
+	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
+				    OPTAB_DIRECT);
+
+	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
+	  operands[3] = mask = t1;
+	  target = gen_reg_rtx (mode);
+	  op0 = gen_lowpart (mode, op0);
+	  op1 = gen_lowpart (mode, op1);
+	}
+
+      switch (mode)
+	{
+	case V8SImode:
+	  /* The VPERMD and VPERMPS instructions already properly ignore
+	     the high bits of the shuffle elements.  No need for us to
+	     perform an AND ourselves.  */
+	  if (one_operand_shuffle)
+	    {
+	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
+	      if (target != operands[0])
+		emit_move_insn (operands[0],
+				gen_lowpart (GET_MODE (operands[0]), target));
+	    }
+	  else
+	    {
+	      t1 = gen_reg_rtx (V8SImode);
+	      t2 = gen_reg_rtx (V8SImode);
+	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
+	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
+	      goto merge_two;
+	    }
+	  return;
+
+	case V8SFmode:
+	  mask = gen_lowpart (V8SImode, mask);
+	  if (one_operand_shuffle)
+	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
+	  else
+	    {
+	      t1 = gen_reg_rtx (V8SFmode);
+	      t2 = gen_reg_rtx (V8SFmode);
+	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
+	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
+	      goto merge_two;
+	    }
+	  return;
+
+        case V4SImode:
+	  /* By combining the two 128-bit input vectors into one 256-bit
+	     input vector, we can use VPERMD and VPERMPS for the full
+	     two-operand shuffle.  */
+	  t1 = gen_reg_rtx (V8SImode);
+	  t2 = gen_reg_rtx (V8SImode);
+	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
+	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
+	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
+	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
+	  return;
+
+        case V4SFmode:
+	  t1 = gen_reg_rtx (V8SFmode);
+	  t2 = gen_reg_rtx (V8SImode);
+	  mask = gen_lowpart (V4SImode, mask);
+	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
+	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
+	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
+	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
+	  return;
+
+	case V32QImode:
+	  t1 = gen_reg_rtx (V32QImode);
+	  t2 = gen_reg_rtx (V32QImode);
+	  t3 = gen_reg_rtx (V32QImode);
+	  vt2 = GEN_INT (128);
+	  for (i = 0; i < 32; i++)
+	    vec[i] = vt2;
+	  vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
+	  vt = force_reg (V32QImode, vt);
+	  for (i = 0; i < 32; i++)
+	    vec[i] = i < 16 ? vt2 : const0_rtx;
+	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
+	  vt2 = force_reg (V32QImode, vt2);
+	  /* From mask create two adjusted masks, which contain the same
+	     bits as mask in the low 7 bits of each vector element.
+	     The first mask will have the most significant bit clear
+	     if it requests element from the same 128-bit lane
+	     and MSB set if it requests element from the other 128-bit lane.
+	     The second mask will have the opposite values of the MSB,
+	     and additionally will have its 128-bit lanes swapped.
+	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
+	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
+	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
+	     stands for other 12 bytes.  */
+	  /* The bit whether element is from the same lane or the other
+	     lane is bit 4, so shift it up by 3 to the MSB position.  */
+	  t5 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
+				    GEN_INT (3)));
+	  /* Clear MSB bits from the mask just in case it had them set.  */
+	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
+	  /* After this t1 will have MSB set for elements from other lane.  */
+	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
+	  /* Clear bits other than MSB.  */
+	  emit_insn (gen_andv32qi3 (t1, t1, vt));
+	  /* Or in the lower bits from mask into t3.  */
+	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
+	  /* And invert MSB bits in t1, so MSB is set for elements from the same
+	     lane.  */
+	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
+	  /* Swap 128-bit lanes in t3.  */
+	  t6 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
+					  const2_rtx, GEN_INT (3),
+					  const0_rtx, const1_rtx));
+	  /* And or in the lower bits from mask into t1.  */
+	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
+	  if (one_operand_shuffle)
+	    {
+	      /* Each of these shuffles will put 0s in places where
+		 element from the other 128-bit lane is needed, otherwise
+		 will shuffle in the requested value.  */
+	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
+						gen_lowpart (V32QImode, t6)));
+	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
+	      /* For t3 the 128-bit lanes are swapped again.  */
+	      t7 = gen_reg_rtx (V4DImode);
+	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
+					      const2_rtx, GEN_INT (3),
+					      const0_rtx, const1_rtx));
+	      /* And oring both together leads to the result.  */
+	      emit_insn (gen_iorv32qi3 (target, t1,
+					gen_lowpart (V32QImode, t7)));
+	      if (target != operands[0])
+		emit_move_insn (operands[0],
+				gen_lowpart (GET_MODE (operands[0]), target));
+	      return;
+	    }
+
+	  t4 = gen_reg_rtx (V32QImode);
+	  /* Similarly to the above one_operand_shuffle code,
+	     just for repeated twice for each operand.  merge_two:
+	     code will merge the two results together.  */
+	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
+					    gen_lowpart (V32QImode, t6)));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
+					    gen_lowpart (V32QImode, t6)));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
+	  t7 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
+					  const2_rtx, GEN_INT (3),
+					  const0_rtx, const1_rtx));
+	  t8 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
+					  const2_rtx, GEN_INT (3),
+					  const0_rtx, const1_rtx));
+	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
+	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
+	  t1 = t4;
+	  t2 = t3;
+	  goto merge_two;
+
+	default:
+	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
+	  break;
+	}
+    }
+
+  if (TARGET_XOP)
+    {
+      /* The XOP VPPERM insn supports three inputs.  By ignoring the 
+	 one_operand_shuffle special case, we avoid creating another
+	 set of constant vectors in memory.  */
+      one_operand_shuffle = false;
+
+      /* mask = mask & {2*w-1, ...} */
+      vt = GEN_INT (2*w - 1);
+    }
+  else
+    {
+      /* mask = mask & {w-1, ...} */
+      vt = GEN_INT (w - 1);
+    }
+
+  for (i = 0; i < w; i++)
+    vec[i] = vt;
+  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+  mask = expand_simple_binop (maskmode, AND, mask, vt,
+			      NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* For non-QImode operations, convert the word permutation control
+     into a byte permutation control.  */
+  if (mode != V16QImode)
+    {
+      mask = expand_simple_binop (maskmode, ASHIFT, mask,
+				  GEN_INT (exact_log2 (e)),
+				  NULL_RTX, 0, OPTAB_DIRECT);
+
+      /* Convert mask to vector of chars.  */
+      mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
+
+      /* Replicate each of the input bytes into byte positions:
+	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
+	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
+	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
+      for (i = 0; i < 16; ++i)
+	vec[i] = GEN_INT (i/e * e);
+      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+      vt = validize_mem (force_const_mem (V16QImode, vt));
+      if (TARGET_XOP)
+	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
+      else
+	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
+
+      /* Convert it into the byte positions by doing
+	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
+      for (i = 0; i < 16; ++i)
+	vec[i] = GEN_INT (i % e);
+      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+      vt = validize_mem (force_const_mem (V16QImode, vt));
+      emit_insn (gen_addv16qi3 (mask, mask, vt));
+    }
+
+  /* The actual shuffle operations all operate on V16QImode.  */
+  op0 = gen_lowpart (V16QImode, op0);
+  op1 = gen_lowpart (V16QImode, op1);
+
+  if (TARGET_XOP)
+    {
+      if (GET_MODE (target) != V16QImode)
+	target = gen_reg_rtx (V16QImode);
+      emit_insn (gen_xop_pperm (target, op0, op1, mask));
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
+    }
+  else if (one_operand_shuffle)
+    {
+      if (GET_MODE (target) != V16QImode)
+	target = gen_reg_rtx (V16QImode);
+      emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
+    }
+  else
+    {
+      rtx xops[6];
+      bool ok;
+
+      /* Shuffle the two input vectors independently.  */
+      t1 = gen_reg_rtx (V16QImode);
+      t2 = gen_reg_rtx (V16QImode);
+      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
+      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
+
+ merge_two:
+      /* Then merge them together.  The key is whether any given control
+         element contained a bit set that indicates the second word.  */
+      mask = operands[3];
+      vt = GEN_INT (w);
+      if (maskmode == V2DImode && !TARGET_SSE4_1)
+	{
+	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
+	     more shuffle to convert the V2DI input mask into a V4SI
+	     input mask.  At which point the masking that expand_int_vcond
+	     will work as desired.  */
+	  rtx t3 = gen_reg_rtx (V4SImode);
+	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
+				        const0_rtx, const0_rtx,
+				        const2_rtx, const2_rtx));
+	  mask = t3;
+	  maskmode = V4SImode;
+	  e = w = 4;
+	}
+
+      for (i = 0; i < w; i++)
+	vec[i] = vt;
+      vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+      vt = force_reg (maskmode, vt);
+      mask = expand_simple_binop (maskmode, AND, mask, vt,
+				  NULL_RTX, 0, OPTAB_DIRECT);
+
+      if (GET_MODE (target) != mode)
+	target = gen_reg_rtx (mode);
+      xops[0] = target;
+      xops[1] = gen_lowpart (mode, t2);
+      xops[2] = gen_lowpart (mode, t1);
+      xops[3] = gen_rtx_EQ (maskmode, mask, vt);
+      xops[4] = mask;
+      xops[5] = vt;
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
+    }
+}
+
+/* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
+   true if we should do zero extension, else sign extension.  HIGH_P is
+   true if we want the N/2 high elements, else the low elements.  */
+
+void
+ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
+{
+  enum machine_mode imode = GET_MODE (src);
+  rtx tmp;
+
+  if (TARGET_SSE4_1)
+    {
+      rtx (*unpack)(rtx, rtx);
+      rtx (*extract)(rtx, rtx) = NULL;
+      enum machine_mode halfmode = BLKmode;
+
+      switch (imode)
+	{
+	case V32QImode:
+	  if (unsigned_p)
+	    unpack = gen_avx2_zero_extendv16qiv16hi2;
+	  else
+	    unpack = gen_avx2_sign_extendv16qiv16hi2;
+	  halfmode = V16QImode;
+	  extract
+	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
+	  break;
+	case V32HImode:
+	  if (unsigned_p)
+	    unpack = gen_avx512f_zero_extendv16hiv16si2;
+	  else
+	    unpack = gen_avx512f_sign_extendv16hiv16si2;
+	  halfmode = V16HImode;
+	  extract
+	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
+	  break;
+	case V16HImode:
+	  if (unsigned_p)
+	    unpack = gen_avx2_zero_extendv8hiv8si2;
+	  else
+	    unpack = gen_avx2_sign_extendv8hiv8si2;
+	  halfmode = V8HImode;
+	  extract
+	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
+	  break;
+	case V16SImode:
+	  if (unsigned_p)
+	    unpack = gen_avx512f_zero_extendv8siv8di2;
+	  else
+	    unpack = gen_avx512f_sign_extendv8siv8di2;
+	  halfmode = V8SImode;
+	  extract
+	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
+	  break;
+	case V8SImode:
+	  if (unsigned_p)
+	    unpack = gen_avx2_zero_extendv4siv4di2;
+	  else
+	    unpack = gen_avx2_sign_extendv4siv4di2;
+	  halfmode = V4SImode;
+	  extract
+	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
+	  break;
+	case V16QImode:
+	  if (unsigned_p)
+	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
+	  else
+	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
+	  break;
+	case V8HImode:
+	  if (unsigned_p)
+	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
+	  else
+	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
+	  break;
+	case V4SImode:
+	  if (unsigned_p)
+	    unpack = gen_sse4_1_zero_extendv2siv2di2;
+	  else
+	    unpack = gen_sse4_1_sign_extendv2siv2di2;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      if (GET_MODE_SIZE (imode) >= 32)
+	{
+	  tmp = gen_reg_rtx (halfmode);
+	  emit_insn (extract (tmp, src));
+	}
+      else if (high_p)
+	{
+	  /* Shift higher 8 bytes to lower 8 bytes.  */
+	  tmp = gen_reg_rtx (V1TImode);
+	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
+					 GEN_INT (64)));
+	  tmp = gen_lowpart (imode, tmp);
+	}
+      else
+	tmp = src;
+
+      emit_insn (unpack (dest, tmp));
+    }
+  else
+    {
+      rtx (*unpack)(rtx, rtx, rtx);
+
+      switch (imode)
+	{
+	case V16QImode:
+	  if (high_p)
+	    unpack = gen_vec_interleave_highv16qi;
+	  else
+	    unpack = gen_vec_interleave_lowv16qi;
+	  break;
+	case V8HImode:
+	  if (high_p)
+	    unpack = gen_vec_interleave_highv8hi;
+	  else
+	    unpack = gen_vec_interleave_lowv8hi;
+	  break;
+	case V4SImode:
+	  if (high_p)
+	    unpack = gen_vec_interleave_highv4si;
+	  else
+	    unpack = gen_vec_interleave_lowv4si;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      if (unsigned_p)
+	tmp = force_reg (imode, CONST0_RTX (imode));
+      else
+	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
+				   src, pc_rtx, pc_rtx);
+
+      rtx tmp2 = gen_reg_rtx (imode);
+      emit_insn (unpack (tmp2, src, tmp));
+      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
+    }
+}
+
+/* Expand conditional increment or decrement using adb/sbb instructions.
+   The default case using setcc followed by the conditional move can be
+   done by generic code.  */
+bool
+ix86_expand_int_addcc (rtx operands[])
+{
+  enum rtx_code code = GET_CODE (operands[1]);
+  rtx flags;
+  rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
+  rtx compare_op;
+  rtx val = const0_rtx;
+  bool fpcmp = false;
+  enum machine_mode mode;
+  rtx op0 = XEXP (operands[1], 0);
+  rtx op1 = XEXP (operands[1], 1);
+
+  if (operands[3] != const1_rtx
+      && operands[3] != constm1_rtx)
+    return false;
+  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
+     return false;
+  code = GET_CODE (compare_op);
+
+  flags = XEXP (compare_op, 0);
+
+  if (GET_MODE (flags) == CCFPmode
+      || GET_MODE (flags) == CCFPUmode)
+    {
+      fpcmp = true;
+      code = ix86_fp_compare_code_to_integer (code);
+    }
+
+  if (code != LTU)
+    {
+      val = constm1_rtx;
+      if (fpcmp)
+	PUT_CODE (compare_op,
+		  reverse_condition_maybe_unordered
+		    (GET_CODE (compare_op)));
+      else
+	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
+    }
+
+  mode = GET_MODE (operands[0]);
+
+  /* Construct either adc or sbb insn.  */
+  if ((code == LTU) == (operands[3] == constm1_rtx))
+    {
+      switch (mode)
+	{
+	  case QImode:
+	    insn = gen_subqi3_carry;
+	    break;
+	  case HImode:
+	    insn = gen_subhi3_carry;
+	    break;
+	  case SImode:
+	    insn = gen_subsi3_carry;
+	    break;
+	  case DImode:
+	    insn = gen_subdi3_carry;
+	    break;
+	  default:
+	    gcc_unreachable ();
+	}
+    }
+  else
+    {
+      switch (mode)
+	{
+	  case QImode:
+	    insn = gen_addqi3_carry;
+	    break;
+	  case HImode:
+	    insn = gen_addhi3_carry;
+	    break;
+	  case SImode:
+	    insn = gen_addsi3_carry;
+	    break;
+	  case DImode:
+	    insn = gen_adddi3_carry;
+	    break;
+	  default:
+	    gcc_unreachable ();
+	}
+    }
+  emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
+
+  return true;
+}
+
+
+/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
+   but works for floating pointer parameters and nonoffsetable memories.
+   For pushes, it returns just stack offsets; the values will be saved
+   in the right order.  Maximally three parts are generated.  */
+
+static int
+ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
+{
+  int size;
+
+  if (!TARGET_64BIT)
+    size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
+  else
+    size = (GET_MODE_SIZE (mode) + 4) / 8;
+
+  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
+  gcc_assert (size >= 2 && size <= 4);
+
+  /* Optimize constant pool reference to immediates.  This is used by fp
+     moves, that force all constants to memory to allow combining.  */
+  if (MEM_P (operand) && MEM_READONLY_P (operand))
+    {
+      rtx tmp = maybe_get_pool_constant (operand);
+      if (tmp)
+	operand = tmp;
+    }
+
+  if (MEM_P (operand) && !offsettable_memref_p (operand))
+    {
+      /* The only non-offsetable memories we handle are pushes.  */
+      int ok = push_operand (operand, VOIDmode);
+
+      gcc_assert (ok);
+
+      operand = copy_rtx (operand);
+      PUT_MODE (operand, word_mode);
+      parts[0] = parts[1] = parts[2] = parts[3] = operand;
+      return size;
+    }
+
+  if (GET_CODE (operand) == CONST_VECTOR)
+    {
+      enum machine_mode imode = int_mode_for_mode (mode);
+      /* Caution: if we looked through a constant pool memory above,
+	 the operand may actually have a different mode now.  That's
+	 ok, since we want to pun this all the way back to an integer.  */
+      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
+      gcc_assert (operand != NULL);
+      mode = imode;
+    }
+
+  if (!TARGET_64BIT)
+    {
+      if (mode == DImode)
+	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
+      else
+	{
+	  int i;
+
+	  if (REG_P (operand))
+	    {
+	      gcc_assert (reload_completed);
+	      for (i = 0; i < size; i++)
+		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
+	    }
+	  else if (offsettable_memref_p (operand))
+	    {
+	      operand = adjust_address (operand, SImode, 0);
+	      parts[0] = operand;
+	      for (i = 1; i < size; i++)
+		parts[i] = adjust_address (operand, SImode, 4 * i);
+	    }
+	  else if (GET_CODE (operand) == CONST_DOUBLE)
+	    {
+	      REAL_VALUE_TYPE r;
+	      long l[4];
+
+	      REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
+	      switch (mode)
+		{
+		case TFmode:
+		  real_to_target (l, &r, mode);
+		  parts[3] = gen_int_mode (l[3], SImode);
+		  parts[2] = gen_int_mode (l[2], SImode);
+		  break;
+		case XFmode:
+		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
+		     long double may not be 80-bit.  */
+		  real_to_target (l, &r, mode);
+		  parts[2] = gen_int_mode (l[2], SImode);
+		  break;
+		case DFmode:
+		  REAL_VALUE_TO_TARGET_DOUBLE (r, l);
+		  break;
+		default:
+		  gcc_unreachable ();
+		}
+	      parts[1] = gen_int_mode (l[1], SImode);
+	      parts[0] = gen_int_mode (l[0], SImode);
+	    }
+	  else
+	    gcc_unreachable ();
+	}
+    }
+  else
+    {
+      if (mode == TImode)
+	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
+      if (mode == XFmode || mode == TFmode)
+	{
+	  enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
+	  if (REG_P (operand))
+	    {
+	      gcc_assert (reload_completed);
+	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
+	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
+	    }
+	  else if (offsettable_memref_p (operand))
+	    {
+	      operand = adjust_address (operand, DImode, 0);
+	      parts[0] = operand;
+	      parts[1] = adjust_address (operand, upper_mode, 8);
+	    }
+	  else if (GET_CODE (operand) == CONST_DOUBLE)
+	    {
+	      REAL_VALUE_TYPE r;
+	      long l[4];
+
+	      REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
+	      real_to_target (l, &r, mode);
+
+	      /* Do not use shift by 32 to avoid warning on 32bit systems.  */
+	      if (HOST_BITS_PER_WIDE_INT >= 64)
+	        parts[0]
+		  = gen_int_mode
+		      ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
+		       + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
+		       DImode);
+	      else
+	        parts[0] = immed_double_const (l[0], l[1], DImode);
+
+	      if (upper_mode == SImode)
+	        parts[1] = gen_int_mode (l[2], SImode);
+	      else if (HOST_BITS_PER_WIDE_INT >= 64)
+	        parts[1]
+		  = gen_int_mode
+		      ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
+		       + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
+		       DImode);
+	      else
+	        parts[1] = immed_double_const (l[2], l[3], DImode);
+	    }
+	  else
+	    gcc_unreachable ();
+	}
+    }
+
+  return size;
+}
+
+/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
+   Return false when normal moves are needed; true when all required
+   insns have been emitted.  Operands 2-4 contain the input values
+   int the correct order; operands 5-7 contain the output values.  */
+
+void
+ix86_split_long_move (rtx operands[])
+{
+  rtx part[2][4];
+  int nparts, i, j;
+  int push = 0;
+  int collisions = 0;
+  enum machine_mode mode = GET_MODE (operands[0]);
+  bool collisionparts[4];
+
+  /* The DFmode expanders may ask us to move double.
+     For 64bit target this is single move.  By hiding the fact
+     here we simplify i386.md splitters.  */
+  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
+    {
+      /* Optimize constant pool reference to immediates.  This is used by
+	 fp moves, that force all constants to memory to allow combining.  */
+
+      if (MEM_P (operands[1])
+	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
+	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
+	operands[1] = get_pool_constant (XEXP (operands[1], 0));
+      if (push_operand (operands[0], VOIDmode))
+	{
+	  operands[0] = copy_rtx (operands[0]);
+	  PUT_MODE (operands[0], word_mode);
+	}
+      else
+        operands[0] = gen_lowpart (DImode, operands[0]);
+      operands[1] = gen_lowpart (DImode, operands[1]);
+      emit_move_insn (operands[0], operands[1]);
+      return;
+    }
+
+  /* The only non-offsettable memory we handle is push.  */
+  if (push_operand (operands[0], VOIDmode))
+    push = 1;
+  else
+    gcc_assert (!MEM_P (operands[0])
+		|| offsettable_memref_p (operands[0]));
+
+  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
+  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
+
+  /* When emitting push, take care for source operands on the stack.  */
+  if (push && MEM_P (operands[1])
+      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
+    {
+      rtx src_base = XEXP (part[1][nparts - 1], 0);
+
+      /* Compensate for the stack decrement by 4.  */
+      if (!TARGET_64BIT && nparts == 3
+	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
+	src_base = plus_constant (Pmode, src_base, 4);
+
+      /* src_base refers to the stack pointer and is
+	 automatically decreased by emitted push.  */
+      for (i = 0; i < nparts; i++)
+	part[1][i] = change_address (part[1][i],
+				     GET_MODE (part[1][i]), src_base);
+    }
+
+  /* We need to do copy in the right order in case an address register
+     of the source overlaps the destination.  */
+  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
+    {
+      rtx tmp;
+
+      for (i = 0; i < nparts; i++)
+	{
+	  collisionparts[i]
+	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
+	  if (collisionparts[i])
+	    collisions++;
+	}
+
+      /* Collision in the middle part can be handled by reordering.  */
+      if (collisions == 1 && nparts == 3 && collisionparts [1])
+	{
+	  tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
+	  tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
+	}
+      else if (collisions == 1
+	       && nparts == 4
+	       && (collisionparts [1] || collisionparts [2]))
+	{
+	  if (collisionparts [1])
+	    {
+	      tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
+	      tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
+	    }
+	  else
+	    {
+	      tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
+	      tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
+	    }
+	}
+
+      /* If there are more collisions, we can't handle it by reordering.
+	 Do an lea to the last part and use only one colliding move.  */
+      else if (collisions > 1)
+	{
+	  rtx base;
+
+	  collisions = 1;
+
+	  base = part[0][nparts - 1];
+
+	  /* Handle the case when the last part isn't valid for lea.
+	     Happens in 64-bit mode storing the 12-byte XFmode.  */
+	  if (GET_MODE (base) != Pmode)
+	    base = gen_rtx_REG (Pmode, REGNO (base));
+
+	  emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
+	  part[1][0] = replace_equiv_address (part[1][0], base);
+	  for (i = 1; i < nparts; i++)
+	    {
+	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
+	      part[1][i] = replace_equiv_address (part[1][i], tmp);
+	    }
+	}
+    }
+
+  if (push)
+    {
+      if (!TARGET_64BIT)
+	{
+	  if (nparts == 3)
+	    {
+	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
+                emit_insn (ix86_gen_add3 (stack_pointer_rtx,
+					  stack_pointer_rtx, GEN_INT (-4)));
+	      emit_move_insn (part[0][2], part[1][2]);
+	    }
+	  else if (nparts == 4)
+	    {
+	      emit_move_insn (part[0][3], part[1][3]);
+	      emit_move_insn (part[0][2], part[1][2]);
+	    }
+	}
+      else
+	{
+	  /* In 64bit mode we don't have 32bit push available.  In case this is
+	     register, it is OK - we will just use larger counterpart.  We also
+	     retype memory - these comes from attempt to avoid REX prefix on
+	     moving of second half of TFmode value.  */
+	  if (GET_MODE (part[1][1]) == SImode)
+	    {
+	      switch (GET_CODE (part[1][1]))
+		{
+		case MEM:
+		  part[1][1] = adjust_address (part[1][1], DImode, 0);
+		  break;
+
+		case REG:
+		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
+		  break;
+
+		default:
+		  gcc_unreachable ();
+		}
+
+	      if (GET_MODE (part[1][0]) == SImode)
+		part[1][0] = part[1][1];
+	    }
+	}
+      emit_move_insn (part[0][1], part[1][1]);
+      emit_move_insn (part[0][0], part[1][0]);
+      return;
+    }
+
+  /* Choose correct order to not overwrite the source before it is copied.  */
+  if ((REG_P (part[0][0])
+       && REG_P (part[1][1])
+       && (REGNO (part[0][0]) == REGNO (part[1][1])
+	   || (nparts == 3
+	       && REGNO (part[0][0]) == REGNO (part[1][2]))
+	   || (nparts == 4
+	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
+      || (collisions > 0
+	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
+    {
+      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
+	{
+	  operands[2 + i] = part[0][j];
+	  operands[6 + i] = part[1][j];
+	}
+    }
+  else
+    {
+      for (i = 0; i < nparts; i++)
+	{
+	  operands[2 + i] = part[0][i];
+	  operands[6 + i] = part[1][i];
+	}
+    }
+
+  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
+  if (optimize_insn_for_size_p ())
+    {
+      for (j = 0; j < nparts - 1; j++)
+	if (CONST_INT_P (operands[6 + j])
+	    && operands[6 + j] != const0_rtx
+	    && REG_P (operands[2 + j]))
+	  for (i = j; i < nparts - 1; i++)
+	    if (CONST_INT_P (operands[7 + i])
+		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
+	      operands[7 + i] = operands[2 + j];
+    }
+
+  for (i = 0; i < nparts; i++)
+    emit_move_insn (operands[2 + i], operands[6 + i]);
+
+  return;
+}
+
+/* Helper function of ix86_split_ashl used to generate an SImode/DImode
+   left shift by a constant, either using a single shift or
+   a sequence of add instructions.  */
+
+static void
+ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
+{
+  rtx (*insn)(rtx, rtx, rtx);
+
+  if (count == 1
+      || (count * ix86_cost->add <= ix86_cost->shift_const
+	  && !optimize_insn_for_size_p ()))
+    {
+      insn = mode == DImode ? gen_addsi3 : gen_adddi3;
+      while (count-- > 0)
+	emit_insn (insn (operand, operand, operand));
+    }
+  else
+    {
+      insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
+      emit_insn (insn (operand, operand, GEN_INT (count)));
+    }
+}
+
+void
+ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
+{
+  rtx (*gen_ashl3)(rtx, rtx, rtx);
+  rtx (*gen_shld)(rtx, rtx, rtx);
+  int half_width = GET_MODE_BITSIZE (mode) >> 1;
+
+  rtx low[2], high[2];
+  int count;
+
+  if (CONST_INT_P (operands[2]))
+    {
+      split_double_mode (mode, operands, 2, low, high);
+      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
+
+      if (count >= half_width)
+	{
+	  emit_move_insn (high[0], low[1]);
+	  emit_move_insn (low[0], const0_rtx);
+
+	  if (count > half_width)
+	    ix86_expand_ashl_const (high[0], count - half_width, mode);
+	}
+      else
+	{
+	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
+
+	  if (!rtx_equal_p (operands[0], operands[1]))
+	    emit_move_insn (operands[0], operands[1]);
+
+	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
+	  ix86_expand_ashl_const (low[0], count, mode);
+	}
+      return;
+    }
+
+  split_double_mode (mode, operands, 1, low, high);
+
+  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
+
+  if (operands[1] == const1_rtx)
+    {
+      /* Assuming we've chosen a QImode capable registers, then 1 << N
+	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
+      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
+	{
+	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+
+	  ix86_expand_clear (low[0]);
+	  ix86_expand_clear (high[0]);
+	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
+
+	  d = gen_lowpart (QImode, low[0]);
+	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
+	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
+	  emit_insn (gen_rtx_SET (VOIDmode, d, s));
+
+	  d = gen_lowpart (QImode, high[0]);
+	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
+	  s = gen_rtx_NE (QImode, flags, const0_rtx);
+	  emit_insn (gen_rtx_SET (VOIDmode, d, s));
+	}
+
+      /* Otherwise, we can get the same results by manually performing
+	 a bit extract operation on bit 5/6, and then performing the two
+	 shifts.  The two methods of getting 0/1 into low/high are exactly
+	 the same size.  Avoiding the shift in the bit extract case helps
+	 pentium4 a bit; no one else seems to care much either way.  */
+      else
+	{
+	  enum machine_mode half_mode;
+	  rtx (*gen_lshr3)(rtx, rtx, rtx);
+	  rtx (*gen_and3)(rtx, rtx, rtx);
+	  rtx (*gen_xor3)(rtx, rtx, rtx);
+	  HOST_WIDE_INT bits;
+	  rtx x;
+
+	  if (mode == DImode)
+	    {
+	      half_mode = SImode;
+	      gen_lshr3 = gen_lshrsi3;
+	      gen_and3 = gen_andsi3;
+	      gen_xor3 = gen_xorsi3;
+	      bits = 5;
+	    }
+	  else
+	    {
+	      half_mode = DImode;
+	      gen_lshr3 = gen_lshrdi3;
+	      gen_and3 = gen_anddi3;
+	      gen_xor3 = gen_xordi3;
+	      bits = 6;
+	    }
+
+	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
+	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
+	  else
+	    x = gen_lowpart (half_mode, operands[2]);
+	  emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
+
+	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
+	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
+	  emit_move_insn (low[0], high[0]);
+	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
+	}
+
+      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
+      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
+      return;
+    }
+
+  if (operands[1] == constm1_rtx)
+    {
+      /* For -1 << N, we can avoid the shld instruction, because we
+	 know that we're shifting 0...31/63 ones into a -1.  */
+      emit_move_insn (low[0], constm1_rtx);
+      if (optimize_insn_for_size_p ())
+	emit_move_insn (high[0], low[0]);
+      else
+	emit_move_insn (high[0], constm1_rtx);
+    }
+  else
+    {
+      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
+
+      if (!rtx_equal_p (operands[0], operands[1]))
+	emit_move_insn (operands[0], operands[1]);
+
+      split_double_mode (mode, operands, 1, low, high);
+      emit_insn (gen_shld (high[0], low[0], operands[2]));
+    }
+
+  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
+
+  if (TARGET_CMOVE && scratch)
+    {
+      rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
+	= mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
+
+      ix86_expand_clear (scratch);
+      emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
+    }
+  else
+    {
+      rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
+	= mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
+
+      emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
+    }
+}
+
+void
+ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
+{
+  rtx (*gen_ashr3)(rtx, rtx, rtx)
+    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
+  rtx (*gen_shrd)(rtx, rtx, rtx);
+  int half_width = GET_MODE_BITSIZE (mode) >> 1;
+
+  rtx low[2], high[2];
+  int count;
+
+  if (CONST_INT_P (operands[2]))
+    {
+      split_double_mode (mode, operands, 2, low, high);
+      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
+
+      if (count == GET_MODE_BITSIZE (mode) - 1)
+	{
+	  emit_move_insn (high[0], high[1]);
+	  emit_insn (gen_ashr3 (high[0], high[0],
+				GEN_INT (half_width - 1)));
+	  emit_move_insn (low[0], high[0]);
+
+	}
+      else if (count >= half_width)
+	{
+	  emit_move_insn (low[0], high[1]);
+	  emit_move_insn (high[0], low[0]);
+	  emit_insn (gen_ashr3 (high[0], high[0],
+				GEN_INT (half_width - 1)));
+
+	  if (count > half_width)
+	    emit_insn (gen_ashr3 (low[0], low[0],
+				  GEN_INT (count - half_width)));
+	}
+      else
+	{
+	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
+
+	  if (!rtx_equal_p (operands[0], operands[1]))
+	    emit_move_insn (operands[0], operands[1]);
+
+	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
+	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
+	}
+    }
+  else
+    {
+      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
+
+     if (!rtx_equal_p (operands[0], operands[1]))
+	emit_move_insn (operands[0], operands[1]);
+
+      split_double_mode (mode, operands, 1, low, high);
+
+      emit_insn (gen_shrd (low[0], high[0], operands[2]));
+      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
+
+      if (TARGET_CMOVE && scratch)
+	{
+	  rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
+	    = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
+
+	  emit_move_insn (scratch, high[0]);
+	  emit_insn (gen_ashr3 (scratch, scratch,
+				GEN_INT (half_width - 1)));
+	  emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
+					  scratch));
+	}
+      else
+	{
+	  rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
+	    = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
+
+	  emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
+	}
+    }
+}
+
+void
+ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
+{
+  rtx (*gen_lshr3)(rtx, rtx, rtx)
+    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
+  rtx (*gen_shrd)(rtx, rtx, rtx);
+  int half_width = GET_MODE_BITSIZE (mode) >> 1;
+
+  rtx low[2], high[2];
+  int count;
+
+  if (CONST_INT_P (operands[2]))
+    {
+      split_double_mode (mode, operands, 2, low, high);
+      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
+
+      if (count >= half_width)
+	{
+	  emit_move_insn (low[0], high[1]);
+	  ix86_expand_clear (high[0]);
+
+	  if (count > half_width)
+	    emit_insn (gen_lshr3 (low[0], low[0],
+				  GEN_INT (count - half_width)));
+	}
+      else
+	{
+	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
+
+	  if (!rtx_equal_p (operands[0], operands[1]))
+	    emit_move_insn (operands[0], operands[1]);
+
+	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
+	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
+	}
+    }
+  else
+    {
+      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
+
+      if (!rtx_equal_p (operands[0], operands[1]))
+	emit_move_insn (operands[0], operands[1]);
+
+      split_double_mode (mode, operands, 1, low, high);
+
+      emit_insn (gen_shrd (low[0], high[0], operands[2]));
+      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
+
+      if (TARGET_CMOVE && scratch)
+	{
+	  rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
+	    = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
+
+	  ix86_expand_clear (scratch);
+	  emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
+					  scratch));
+	}
+      else
+	{
+	  rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
+	    = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
+
+	  emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
+	}
+    }
+}
+
+/* Predict just emitted jump instruction to be taken with probability PROB.  */
+static void
+predict_jump (int prob)
+{
+  rtx insn = get_last_insn ();
+  gcc_assert (JUMP_P (insn));
+  add_int_reg_note (insn, REG_BR_PROB, prob);
+}
+
+/* Helper function for the string operations below.  Dest VARIABLE whether
+   it is aligned to VALUE bytes.  If true, jump to the label.  */
+static rtx
+ix86_expand_aligntest (rtx variable, int value, bool epilogue)
+{
+  rtx label = gen_label_rtx ();
+  rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
+  if (GET_MODE (variable) == DImode)
+    emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
+  else
+    emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
+  emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
+			   1, label);
+  if (epilogue)
+    predict_jump (REG_BR_PROB_BASE * 50 / 100);
+  else
+    predict_jump (REG_BR_PROB_BASE * 90 / 100);
+  return label;
+}
+
+/* Adjust COUNTER by the VALUE.  */
+static void
+ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
+{
+  rtx (*gen_add)(rtx, rtx, rtx)
+    = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
+
+  emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
+}
+
+/* Zero extend possibly SImode EXP to Pmode register.  */
+rtx
+ix86_zero_extend_to_Pmode (rtx exp)
+{
+  return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
+}
+
+/* Divide COUNTREG by SCALE.  */
+static rtx
+scale_counter (rtx countreg, int scale)
+{
+  rtx sc;
+
+  if (scale == 1)
+    return countreg;
+  if (CONST_INT_P (countreg))
+    return GEN_INT (INTVAL (countreg) / scale);
+  gcc_assert (REG_P (countreg));
+
+  sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
+			    GEN_INT (exact_log2 (scale)),
+			    NULL, 1, OPTAB_DIRECT);
+  return sc;
+}
+
+/* Return mode for the memcpy/memset loop counter.  Prefer SImode over
+   DImode for constant loop counts.  */
+
+static enum machine_mode
+counter_mode (rtx count_exp)
+{
+  if (GET_MODE (count_exp) != VOIDmode)
+    return GET_MODE (count_exp);
+  if (!CONST_INT_P (count_exp))
+    return Pmode;
+  if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
+    return DImode;
+  return SImode;
+}
+
+/* Copy the address to a Pmode register.  This is used for x32 to
+   truncate DImode TLS address to a SImode register. */
+
+static rtx
+ix86_copy_addr_to_reg (rtx addr)
+{
+  if (GET_MODE (addr) == Pmode)
+    return copy_addr_to_reg (addr);
+  else
+    {
+      gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
+      return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
+    }
+}
+
+/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
+   to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
+   specified in bytes.  When ISSETMEM is TRUE, output the equivalent loop to set
+   memory by VALUE (supposed to be in MODE).
+
+   The size is rounded down to whole number of chunk size moved at once.
+   SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info.  */
+
+
+static void
+expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
+			       rtx destptr, rtx srcptr, rtx value,
+			       rtx count, enum machine_mode mode, int unroll,
+			       int expected_size, bool issetmem)
+{
+  rtx out_label, top_label, iter, tmp;
+  enum machine_mode iter_mode = counter_mode (count);
+  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
+  rtx piece_size = GEN_INT (piece_size_n);
+  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
+  rtx size;
+  int i;
+
+  top_label = gen_label_rtx ();
+  out_label = gen_label_rtx ();
+  iter = gen_reg_rtx (iter_mode);
+
+  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
+			      NULL, 1, OPTAB_DIRECT);
+  /* Those two should combine.  */
+  if (piece_size == const1_rtx)
+    {
+      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+			       true, out_label);
+      predict_jump (REG_BR_PROB_BASE * 10 / 100);
+    }
+  emit_move_insn (iter, const0_rtx);
+
+  emit_label (top_label);
+
+  tmp = convert_modes (Pmode, iter_mode, iter, true);
+
+  /* This assert could be relaxed - in this case we'll need to compute
+     smallest power of two, containing in PIECE_SIZE_N and pass it to
+     offset_address.  */
+  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
+  destmem = offset_address (destmem, tmp, piece_size_n);
+  destmem = adjust_address (destmem, mode, 0);
+
+  if (!issetmem)
+    {
+      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
+      srcmem = adjust_address (srcmem, mode, 0);
+
+      /* When unrolling for chips that reorder memory reads and writes,
+	 we can save registers by using single temporary.
+	 Also using 4 temporaries is overkill in 32bit mode.  */
+      if (!TARGET_64BIT && 0)
+	{
+	  for (i = 0; i < unroll; i++)
+	    {
+	      if (i)
+		{
+		  destmem =
+		    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
+		  srcmem =
+		    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
+		}
+	      emit_move_insn (destmem, srcmem);
+	    }
+	}
+      else
+	{
+	  rtx tmpreg[4];
+	  gcc_assert (unroll <= 4);
+	  for (i = 0; i < unroll; i++)
+	    {
+	      tmpreg[i] = gen_reg_rtx (mode);
+	      if (i)
+		{
+		  srcmem =
+		    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
+		}
+	      emit_move_insn (tmpreg[i], srcmem);
+	    }
+	  for (i = 0; i < unroll; i++)
+	    {
+	      if (i)
+		{
+		  destmem =
+		    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
+		}
+	      emit_move_insn (destmem, tmpreg[i]);
+	    }
+	}
+    }
+  else
+    for (i = 0; i < unroll; i++)
+      {
+	if (i)
+	  destmem =
+	    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
+	emit_move_insn (destmem, value);
+      }
+
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+			   true, top_label);
+  if (expected_size != -1)
+    {
+      expected_size /= GET_MODE_SIZE (mode) * unroll;
+      if (expected_size == 0)
+	predict_jump (0);
+      else if (expected_size > REG_BR_PROB_BASE)
+	predict_jump (REG_BR_PROB_BASE - 1);
+      else
+        predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
+    }
+  else
+    predict_jump (REG_BR_PROB_BASE * 80 / 100);
+  iter = ix86_zero_extend_to_Pmode (iter);
+  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != destptr)
+    emit_move_insn (destptr, tmp);
+  if (!issetmem)
+    {
+      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
+				 true, OPTAB_LIB_WIDEN);
+      if (tmp != srcptr)
+	emit_move_insn (srcptr, tmp);
+    }
+  emit_label (out_label);
+}
+
+/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
+   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
+   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
+   For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
+   ORIG_VALUE is the original value passed to memset to fill the memory with.
+   Other arguments have same meaning as for previous function.  */
+
+static void
+expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
+			   rtx destptr, rtx srcptr, rtx value, rtx orig_value,
+			   rtx count,
+			   enum machine_mode mode, bool issetmem)
+{
+  rtx destexp;
+  rtx srcexp;
+  rtx countreg;
+  HOST_WIDE_INT rounded_count;
+
+  /* If possible, it is shorter to use rep movs.
+     TODO: Maybe it is better to move this logic to decide_alg.  */
+  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
+      && (!issetmem || orig_value == const0_rtx))
+    mode = SImode;
+
+  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
+    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
+
+  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
+						       GET_MODE_SIZE (mode)));
+  if (mode != QImode)
+    {
+      destexp = gen_rtx_ASHIFT (Pmode, countreg,
+				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
+      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
+    }
+  else
+    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
+  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
+    {
+      rounded_count = (INTVAL (count)
+		       & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
+      destmem = shallow_copy_rtx (destmem);
+      set_mem_size (destmem, rounded_count);
+    }
+  else if (MEM_SIZE_KNOWN_P (destmem))
+    clear_mem_size (destmem);
+
+  if (issetmem)
+    {
+      value = force_reg (mode, gen_lowpart (mode, value));
+      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
+    }
+  else
+    {
+      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
+	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
+      if (mode != QImode)
+	{
+	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
+				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
+	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
+	}
+      else
+	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
+      if (CONST_INT_P (count))
+	{
+	  rounded_count = (INTVAL (count)
+			   & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
+	  srcmem = shallow_copy_rtx (srcmem);
+	  set_mem_size (srcmem, rounded_count);
+	}
+      else
+	{
+	  if (MEM_SIZE_KNOWN_P (srcmem))
+	    clear_mem_size (srcmem);
+	}
+      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
+			      destexp, srcexp));
+    }
+}
+
+/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
+   DESTMEM.
+   SRC is passed by pointer to be updated on return.
+   Return value is updated DST.  */
+static rtx
+emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
+	     HOST_WIDE_INT size_to_move)
+{
+  rtx dst = destmem, src = *srcmem, adjust, tempreg;
+  enum insn_code code;
+  enum machine_mode move_mode;
+  int piece_size, i;
+
+  /* Find the widest mode in which we could perform moves.
+     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
+     it until move of such size is supported.  */
+  piece_size = 1 << floor_log2 (size_to_move);
+  move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
+  code = optab_handler (mov_optab, move_mode);
+  while (code == CODE_FOR_nothing && piece_size > 1)
+    {
+      piece_size >>= 1;
+      move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
+      code = optab_handler (mov_optab, move_mode);
+    }
+
+  /* Find the corresponding vector mode with the same size as MOVE_MODE.
+     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
+  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
+    {
+      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
+      move_mode = mode_for_vector (word_mode, nunits);
+      code = optab_handler (mov_optab, move_mode);
+      if (code == CODE_FOR_nothing)
+	{
+	  move_mode = word_mode;
+	  piece_size = GET_MODE_SIZE (move_mode);
+	  code = optab_handler (mov_optab, move_mode);
+	}
+    }
+  gcc_assert (code != CODE_FOR_nothing);
+
+  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
+  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
+
+  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
+  gcc_assert (size_to_move % piece_size == 0);
+  adjust = GEN_INT (piece_size);
+  for (i = 0; i < size_to_move; i += piece_size)
+    {
+      /* We move from memory to memory, so we'll need to do it via
+	 a temporary register.  */
+      tempreg = gen_reg_rtx (move_mode);
+      emit_insn (GEN_FCN (code) (tempreg, src));
+      emit_insn (GEN_FCN (code) (dst, tempreg));
+
+      emit_move_insn (destptr,
+		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+      emit_move_insn (srcptr,
+		      gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
+
+      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
+					  piece_size);
+      src = adjust_automodify_address_nv (src, move_mode, srcptr,
+					  piece_size);
+    }
+
+  /* Update DST and SRC rtx.  */
+  *srcmem = src;
+  return dst;
+}
+
+/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
+static void
+expand_movmem_epilogue (rtx destmem, rtx srcmem,
+			rtx destptr, rtx srcptr, rtx count, int max_size)
+{
+  rtx src, dest;
+  if (CONST_INT_P (count))
+    {
+      HOST_WIDE_INT countval = INTVAL (count);
+      HOST_WIDE_INT epilogue_size = countval % max_size;
+      int i;
+
+      /* For now MAX_SIZE should be a power of 2.  This assert could be
+	 relaxed, but it'll require a bit more complicated epilogue
+	 expanding.  */
+      gcc_assert ((max_size & (max_size - 1)) == 0);
+      for (i = max_size; i >= 1; i >>= 1)
+	{
+	  if (epilogue_size & i)
+	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
+	}
+      return;
+    }
+  if (max_size > 8)
+    {
+      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
+				    count, 1, OPTAB_DIRECT);
+      expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
+				     count, QImode, 1, 4, false);
+      return;
+    }
+
+  /* When there are stringops, we can cheaply increase dest and src pointers.
+     Otherwise we save code size by maintaining offset (zero is readily
+     available from preceding rep operation) and using x86 addressing modes.
+   */
+  if (TARGET_SINGLE_STRINGOP)
+    {
+      if (max_size > 4)
+	{
+	  rtx label = ix86_expand_aligntest (count, 4, true);
+	  src = change_address (srcmem, SImode, srcptr);
+	  dest = change_address (destmem, SImode, destptr);
+	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+      if (max_size > 2)
+	{
+	  rtx label = ix86_expand_aligntest (count, 2, true);
+	  src = change_address (srcmem, HImode, srcptr);
+	  dest = change_address (destmem, HImode, destptr);
+	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+      if (max_size > 1)
+	{
+	  rtx label = ix86_expand_aligntest (count, 1, true);
+	  src = change_address (srcmem, QImode, srcptr);
+	  dest = change_address (destmem, QImode, destptr);
+	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+    }
+  else
+    {
+      rtx offset = force_reg (Pmode, const0_rtx);
+      rtx tmp;
+
+      if (max_size > 4)
+	{
+	  rtx label = ix86_expand_aligntest (count, 4, true);
+	  src = change_address (srcmem, SImode, srcptr);
+	  dest = change_address (destmem, SImode, destptr);
+	  emit_move_insn (dest, src);
+	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
+				     true, OPTAB_LIB_WIDEN);
+	  if (tmp != offset)
+	    emit_move_insn (offset, tmp);
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+      if (max_size > 2)
+	{
+	  rtx label = ix86_expand_aligntest (count, 2, true);
+	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
+	  src = change_address (srcmem, HImode, tmp);
+	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
+	  dest = change_address (destmem, HImode, tmp);
+	  emit_move_insn (dest, src);
+	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
+				     true, OPTAB_LIB_WIDEN);
+	  if (tmp != offset)
+	    emit_move_insn (offset, tmp);
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+      if (max_size > 1)
+	{
+	  rtx label = ix86_expand_aligntest (count, 1, true);
+	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
+	  src = change_address (srcmem, QImode, tmp);
+	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
+	  dest = change_address (destmem, QImode, tmp);
+	  emit_move_insn (dest, src);
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+    }
+}
+
+/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
+   with value PROMOTED_VAL.
+   SRC is passed by pointer to be updated on return.
+   Return value is updated DST.  */
+static rtx
+emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
+	     HOST_WIDE_INT size_to_move)
+{
+  rtx dst = destmem, adjust;
+  enum insn_code code;
+  enum machine_mode move_mode;
+  int piece_size, i;
+
+  /* Find the widest mode in which we could perform moves.
+     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
+     it until move of such size is supported.  */
+  move_mode = GET_MODE (promoted_val);
+  if (move_mode == VOIDmode)
+    move_mode = QImode;
+  if (size_to_move < GET_MODE_SIZE (move_mode))
+    {
+      move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
+      promoted_val = gen_lowpart (move_mode, promoted_val);
+    }
+  piece_size = GET_MODE_SIZE (move_mode);
+  code = optab_handler (mov_optab, move_mode);
+  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
+
+  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
+
+  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
+  gcc_assert (size_to_move % piece_size == 0);
+  adjust = GEN_INT (piece_size);
+  for (i = 0; i < size_to_move; i += piece_size)
+    {
+      if (piece_size <= GET_MODE_SIZE (word_mode))
+	{
+	  emit_insn (gen_strset (destptr, dst, promoted_val));
+	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
+					      piece_size);
+	  continue;
+	}
+
+      emit_insn (GEN_FCN (code) (dst, promoted_val));
+
+      emit_move_insn (destptr,
+		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+
+      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
+					  piece_size);
+    }
+
+  /* Update DST rtx.  */
+  return dst;
+}
+/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
+static void
+expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
+				 rtx count, int max_size)
+{
+  count =
+    expand_simple_binop (counter_mode (count), AND, count,
+			 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
+  expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
+				 gen_lowpart (QImode, value), count, QImode,
+				 1, max_size / 2, true);
+}
+
+/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
+static void
+expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
+			rtx count, int max_size)
+{
+  rtx dest;
+
+  if (CONST_INT_P (count))
+    {
+      HOST_WIDE_INT countval = INTVAL (count);
+      HOST_WIDE_INT epilogue_size = countval % max_size;
+      int i;
+
+      /* For now MAX_SIZE should be a power of 2.  This assert could be
+	 relaxed, but it'll require a bit more complicated epilogue
+	 expanding.  */
+      gcc_assert ((max_size & (max_size - 1)) == 0);
+      for (i = max_size; i >= 1; i >>= 1)
+	{
+	  if (epilogue_size & i)
+	    {
+	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
+		destmem = emit_memset (destmem, destptr, vec_value, i);
+	      else
+		destmem = emit_memset (destmem, destptr, value, i);
+	    }
+	}
+      return;
+    }
+  if (max_size > 32)
+    {
+      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
+      return;
+    }
+  if (max_size > 16)
+    {
+      rtx label = ix86_expand_aligntest (count, 16, true);
+      if (TARGET_64BIT)
+	{
+	  dest = change_address (destmem, DImode, destptr);
+	  emit_insn (gen_strset (destptr, dest, value));
+	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
+	  emit_insn (gen_strset (destptr, dest, value));
+	}
+      else
+	{
+	  dest = change_address (destmem, SImode, destptr);
+	  emit_insn (gen_strset (destptr, dest, value));
+	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
+	  emit_insn (gen_strset (destptr, dest, value));
+	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
+	  emit_insn (gen_strset (destptr, dest, value));
+	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
+	  emit_insn (gen_strset (destptr, dest, value));
+	}
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (max_size > 8)
+    {
+      rtx label = ix86_expand_aligntest (count, 8, true);
+      if (TARGET_64BIT)
+	{
+	  dest = change_address (destmem, DImode, destptr);
+	  emit_insn (gen_strset (destptr, dest, value));
+	}
+      else
+	{
+	  dest = change_address (destmem, SImode, destptr);
+	  emit_insn (gen_strset (destptr, dest, value));
+	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
+	  emit_insn (gen_strset (destptr, dest, value));
+	}
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (max_size > 4)
+    {
+      rtx label = ix86_expand_aligntest (count, 4, true);
+      dest = change_address (destmem, SImode, destptr);
+      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (max_size > 2)
+    {
+      rtx label = ix86_expand_aligntest (count, 2, true);
+      dest = change_address (destmem, HImode, destptr);
+      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (max_size > 1)
+    {
+      rtx label = ix86_expand_aligntest (count, 1, true);
+      dest = change_address (destmem, QImode, destptr);
+      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+}
+
+/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
+   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
+   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
+   ignored.
+   Return value is updated DESTMEM.  */
+static rtx
+expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
+				  rtx destptr, rtx srcptr, rtx value,
+				  rtx vec_value, rtx count, int align,
+				  int desired_alignment, bool issetmem)
+{
+  int i;
+  for (i = 1; i < desired_alignment; i <<= 1)
+    {
+      if (align <= i)
+	{
+	  rtx label = ix86_expand_aligntest (destptr, i, false);
+	  if (issetmem)
+	    {
+	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
+		destmem = emit_memset (destmem, destptr, vec_value, i);
+	      else
+		destmem = emit_memset (destmem, destptr, value, i);
+	    }
+	  else
+	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
+	  ix86_adjust_counter (count, i);
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
+	}
+    }
+  return destmem;
+}
+
+/* Test if COUNT&SIZE is nonzero and if so, expand movme
+   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
+   and jump to DONE_LABEL.  */
+static void
+expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
+			       rtx destptr, rtx srcptr,
+			       rtx value, rtx vec_value,
+			       rtx count, int size,
+			       rtx done_label, bool issetmem)
+{
+  rtx label = ix86_expand_aligntest (count, size, false);
+  enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
+  rtx modesize;
+  int n;
+
+  /* If we do not have vector value to copy, we must reduce size.  */
+  if (issetmem)
+    {
+      if (!vec_value)
+	{
+	  if (GET_MODE (value) == VOIDmode && size > 8)
+	    mode = Pmode;
+	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
+	    mode = GET_MODE (value);
+	}
+      else
+	mode = GET_MODE (vec_value), value = vec_value;
+    }
+  else
+    {
+      /* Choose appropriate vector mode.  */
+      if (size >= 32)
+	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
+      else if (size >= 16)
+	mode = TARGET_SSE ? V16QImode : DImode;
+      srcmem = change_address (srcmem, mode, srcptr);
+    }
+  destmem = change_address (destmem, mode, destptr);
+  modesize = GEN_INT (GET_MODE_SIZE (mode));
+  gcc_assert (GET_MODE_SIZE (mode) <= size);
+  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
+    {
+      if (issetmem)
+	emit_move_insn (destmem, gen_lowpart (mode, value));
+      else
+	{
+          emit_move_insn (destmem, srcmem);
+          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
+	}
+      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
+    }
+
+  destmem = offset_address (destmem, count, 1);
+  destmem = offset_address (destmem, GEN_INT (-2 * size),
+			    GET_MODE_SIZE (mode));
+  if (!issetmem)
+    {
+      srcmem = offset_address (srcmem, count, 1);
+      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
+			       GET_MODE_SIZE (mode));
+    }
+  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
+    {
+      if (issetmem)
+	emit_move_insn (destmem, gen_lowpart (mode, value));
+      else
+	{
+	  emit_move_insn (destmem, srcmem);
+	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
+	}
+      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
+    }
+  emit_jump_insn (gen_jump (done_label));
+  emit_barrier ();
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+}
+
+/* Handle small memcpy (up to SIZE that is supposed to be small power of 2.
+   and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN
+   bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can
+   proceed with an loop copying SIZE bytes at once. Do moves in MODE.
+   DONE_LABEL is a label after the whole copying sequence. The label is created
+   on demand if *DONE_LABEL is NULL.
+   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
+   bounds after the initial copies. 
+
+   DESTMEM/SRCMEM are memory expressions pointing to the copies block,
+   DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
+   we will dispatch to a library call for large blocks.
+
+   In pseudocode we do:
+
+   if (COUNT < SIZE)
+     {
+       Assume that SIZE is 4. Bigger sizes are handled analogously
+       if (COUNT & 4)
+	 {
+	    copy 4 bytes from SRCPTR to DESTPTR
+	    copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
+	    goto done_label
+	 }
+       if (!COUNT)
+	 goto done_label;
+       copy 1 byte from SRCPTR to DESTPTR
+       if (COUNT & 2)
+	 {
+	    copy 2 bytes from SRCPTR to DESTPTR
+	    copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
+	 }
+     }
+   else
+     {
+       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
+       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
+
+       OLD_DESPTR = DESTPTR;
+       Align DESTPTR up to DESIRED_ALIGN
+       SRCPTR += DESTPTR - OLD_DESTPTR
+       COUNT -= DEST_PTR - OLD_DESTPTR
+       if (DYNAMIC_CHECK)
+	 Round COUNT down to multiple of SIZE
+       << optional caller supplied zero size guard is here >>
+       << optional caller suppplied dynamic check is here >>
+       << caller supplied main copy loop is here >>
+     }
+   done_label:
+  */
+static void
+expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
+							    rtx *destptr, rtx *srcptr,
+							    enum machine_mode mode,
+							    rtx value, rtx vec_value,
+							    rtx *count,
+							    rtx *done_label,
+							    int size,
+							    int desired_align,
+							    int align,
+							    unsigned HOST_WIDE_INT *min_size,
+							    bool dynamic_check,
+							    bool issetmem)
+{
+  rtx loop_label = NULL, label;
+  int n;
+  rtx modesize;
+  int prolog_size = 0;
+  rtx mode_value;
+
+  /* Chose proper value to copy.  */
+  if (issetmem && VECTOR_MODE_P (mode))
+    mode_value = vec_value;
+  else
+    mode_value = value;
+  gcc_assert (GET_MODE_SIZE (mode) <= size);
+
+  /* See if block is big or small, handle small blocks.  */
+  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
+    {
+      int size2 = size;
+      loop_label = gen_label_rtx ();
+
+      if (!*done_label)
+	*done_label = gen_label_rtx ();
+
+      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
+			       1, loop_label);
+      size2 >>= 1;
+
+      /* Handle sizes > 3.  */
+      for (;size2 > 2; size2 >>= 1)
+	expand_small_movmem_or_setmem (destmem, srcmem,
+				       *destptr, *srcptr,
+				       value, vec_value,
+				       *count,
+				       size2, *done_label, issetmem);
+      /* Nothing to copy?  Jump to DONE_LABEL if so */
+      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
+			       1, *done_label);
+
+      /* Do a byte copy.  */
+      destmem = change_address (destmem, QImode, *destptr);
+      if (issetmem)
+	emit_move_insn (destmem, gen_lowpart (QImode, value));
+      else
+	{
+          srcmem = change_address (srcmem, QImode, *srcptr);
+          emit_move_insn (destmem, srcmem);
+	}
+
+      /* Handle sizes 2 and 3.  */
+      label = ix86_expand_aligntest (*count, 2, false);
+      destmem = change_address (destmem, HImode, *destptr);
+      destmem = offset_address (destmem, *count, 1);
+      destmem = offset_address (destmem, GEN_INT (-2), 2);
+      if (issetmem)
+        emit_move_insn (destmem, gen_lowpart (HImode, value));
+      else
+	{
+	  srcmem = change_address (srcmem, HImode, *srcptr);
+	  srcmem = offset_address (srcmem, *count, 1);
+	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
+	  emit_move_insn (destmem, srcmem);
+	}
+
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+      emit_jump_insn (gen_jump (*done_label));
+      emit_barrier ();
+    }
+  else
+    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
+		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
+
+  /* Start memcpy for COUNT >= SIZE.  */
+  if (loop_label)
+    {
+       emit_label (loop_label);
+       LABEL_NUSES (loop_label) = 1;
+    }
+
+  /* Copy first desired_align bytes.  */
+  if (!issetmem)
+    srcmem = change_address (srcmem, mode, *srcptr);
+  destmem = change_address (destmem, mode, *destptr);
+  modesize = GEN_INT (GET_MODE_SIZE (mode));
+  for (n = 0; prolog_size < desired_align - align; n++)
+    {
+      if (issetmem)
+        emit_move_insn (destmem, mode_value);
+      else
+	{
+          emit_move_insn (destmem, srcmem);
+          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
+	}
+      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
+      prolog_size += GET_MODE_SIZE (mode);
+    }
+
+
+  /* Copy last SIZE bytes.  */
+  destmem = offset_address (destmem, *count, 1);
+  destmem = offset_address (destmem,
+			    GEN_INT (-size - prolog_size),
+			    1);
+  if (issetmem)
+    emit_move_insn (destmem, mode_value);
+  else
+    {
+      srcmem = offset_address (srcmem, *count, 1);
+      srcmem = offset_address (srcmem,
+			       GEN_INT (-size - prolog_size),
+			       1);
+      emit_move_insn (destmem, srcmem);
+    }
+  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
+    {
+      destmem = offset_address (destmem, modesize, 1);
+      if (issetmem)
+	emit_move_insn (destmem, mode_value);
+      else
+	{
+          srcmem = offset_address (srcmem, modesize, 1);
+          emit_move_insn (destmem, srcmem);
+	}
+    }
+
+  /* Align destination.  */
+  if (desired_align > 1 && desired_align > align)
+    {
+      rtx saveddest = *destptr;
+
+      gcc_assert (desired_align <= size);
+      /* Align destptr up, place it to new register.  */
+      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
+				      GEN_INT (prolog_size),
+				      NULL_RTX, 1, OPTAB_DIRECT);
+      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
+				      GEN_INT (-desired_align),
+				      *destptr, 1, OPTAB_DIRECT);
+      /* See how many bytes we skipped.  */
+      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
+				       *destptr,
+				       saveddest, 1, OPTAB_DIRECT);
+      /* Adjust srcptr and count.  */
+      if (!issetmem)
+	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
+					*srcptr, 1, OPTAB_DIRECT);
+      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
+				    saveddest, *count, 1, OPTAB_DIRECT);
+      /* We copied at most size + prolog_size.  */
+      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
+	*min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
+      else
+	*min_size = 0;
+
+      /* Our loops always round down the bock size, but for dispatch to library
+	 we need precise value.  */
+      if (dynamic_check)
+	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
+				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
+    }
+  else
+    {
+      gcc_assert (prolog_size == 0);
+      /* Decrease count, so we won't end up copying last word twice.  */
+      if (!CONST_INT_P (*count))
+	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
+				      constm1_rtx, *count, 1, OPTAB_DIRECT);
+      else
+	*count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
+      if (*min_size)
+	*min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
+    }
+}
+
+
+/* This function is like the previous one, except here we know how many bytes
+   need to be copied.  That allows us to update alignment not only of DST, which
+   is returned, but also of SRC, which is passed as a pointer for that
+   reason.  */
+static rtx
+expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
+					   rtx srcreg, rtx value, rtx vec_value,
+					   int desired_align, int align_bytes,
+					   bool issetmem)
+{
+  rtx src = NULL;
+  rtx orig_dst = dst;
+  rtx orig_src = NULL;
+  int piece_size = 1;
+  int copied_bytes = 0;
+
+  if (!issetmem)
+    {
+      gcc_assert (srcp != NULL);
+      src = *srcp;
+      orig_src = src;
+    }
+
+  for (piece_size = 1;
+       piece_size <= desired_align && copied_bytes < align_bytes;
+       piece_size <<= 1)
+    {
+      if (align_bytes & piece_size)
+	{
+	  if (issetmem)
+	    {
+	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
+		dst = emit_memset (dst, destreg, vec_value, piece_size);
+	      else
+		dst = emit_memset (dst, destreg, value, piece_size);
+	    }
+	  else
+	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
+	  copied_bytes += piece_size;
+	}
+    }
+  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
+    set_mem_align (dst, desired_align * BITS_PER_UNIT);
+  if (MEM_SIZE_KNOWN_P (orig_dst))
+    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
+
+  if (!issetmem)
+    {
+      int src_align_bytes = get_mem_align_offset (src, desired_align
+						       * BITS_PER_UNIT);
+      if (src_align_bytes >= 0)
+	src_align_bytes = desired_align - src_align_bytes;
+      if (src_align_bytes >= 0)
+	{
+	  unsigned int src_align;
+	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
+	    {
+	      if ((src_align_bytes & (src_align - 1))
+		   == (align_bytes & (src_align - 1)))
+		break;
+	    }
+	  if (src_align > (unsigned int) desired_align)
+	    src_align = desired_align;
+	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
+	    set_mem_align (src, src_align * BITS_PER_UNIT);
+	}
+      if (MEM_SIZE_KNOWN_P (orig_src))
+	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
+      *srcp = src;
+    }
+
+  return dst;
+}
+
+/* Return true if ALG can be used in current context.  
+   Assume we expand memset if MEMSET is true.  */
+static bool
+alg_usable_p (enum stringop_alg alg, bool memset)
+{
+  if (alg == no_stringop)
+    return false;
+  if (alg == vector_loop)
+    return TARGET_SSE || TARGET_AVX;
+  /* Algorithms using the rep prefix want at least edi and ecx;
+     additionally, memset wants eax and memcpy wants esi.  Don't
+     consider such algorithms if the user has appropriated those
+     registers for their own purposes.	*/
+  if (alg == rep_prefix_1_byte
+      || alg == rep_prefix_4_byte
+      || alg == rep_prefix_8_byte)
+    return !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
+             || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
+  return true;
+}
+
+/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
+static enum stringop_alg
+decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
+	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
+	    bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
+{
+  const struct stringop_algs * algs;
+  bool optimize_for_speed;
+  int max = -1;
+  const struct processor_costs *cost;
+  int i;
+  bool any_alg_usable_p = false;
+
+  *noalign = false;
+  *dynamic_check = -1;
+
+  /* Even if the string operation call is cold, we still might spend a lot
+     of time processing large blocks.  */
+  if (optimize_function_for_size_p (cfun)
+      || (optimize_insn_for_size_p ()
+ 	  && (max_size < 256
+              || (expected_size != -1 && expected_size < 256))))
+    optimize_for_speed = false;
+  else
+    optimize_for_speed = true;
+
+  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
+  if (memset)
+    algs = &cost->memset[TARGET_64BIT != 0];
+  else
+    algs = &cost->memcpy[TARGET_64BIT != 0];
+
+  /* See maximal size for user defined algorithm.  */
+  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
+    {
+      enum stringop_alg candidate = algs->size[i].alg;
+      bool usable = alg_usable_p (candidate, memset);
+      any_alg_usable_p |= usable;
+
+      if (candidate != libcall && candidate && usable)
+	  max = algs->size[i].max;
+    }
+
+  /* If expected size is not known but max size is small enough
+     so inline version is a win, set expected size into
+     the range.  */
+  if (max > 1 && (unsigned HOST_WIDE_INT) max >= max_size
+      && expected_size == -1)
+    expected_size = min_size / 2 + max_size / 2;
+
+  /* If user specified the algorithm, honnor it if possible.  */
+  if (ix86_stringop_alg != no_stringop
+      && alg_usable_p (ix86_stringop_alg, memset))
+    return ix86_stringop_alg;
+  /* rep; movq or rep; movl is the smallest variant.  */
+  else if (!optimize_for_speed)
+    {
+      *noalign = true;
+      if (!count || (count & 3) || (memset && !zero_memset))
+	return alg_usable_p (rep_prefix_1_byte, memset)
+	       ? rep_prefix_1_byte : loop_1_byte;
+      else
+	return alg_usable_p (rep_prefix_4_byte, memset)
+	       ? rep_prefix_4_byte : loop;
+    }
+  /* Very tiny blocks are best handled via the loop, REP is expensive to
+     setup.  */
+  else if (expected_size != -1 && expected_size < 4)
+    return loop_1_byte;
+  else if (expected_size != -1)
+    {
+      enum stringop_alg alg = libcall;
+      bool alg_noalign = false;
+      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
+	{
+	  /* We get here if the algorithms that were not libcall-based
+	     were rep-prefix based and we are unable to use rep prefixes
+	     based on global register usage.  Break out of the loop and
+	     use the heuristic below.  */
+	  if (algs->size[i].max == 0)
+	    break;
+	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
+	    {
+	      enum stringop_alg candidate = algs->size[i].alg;
+
+	      if (candidate != libcall && alg_usable_p (candidate, memset))
+		{
+		  alg = candidate;
+		  alg_noalign = algs->size[i].noalign;
+		}
+	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
+		 last non-libcall inline algorithm.  */
+	      if (TARGET_INLINE_ALL_STRINGOPS)
+		{
+		  /* When the current size is best to be copied by a libcall,
+		     but we are still forced to inline, run the heuristic below
+		     that will pick code for medium sized blocks.  */
+		  if (alg != libcall)
+		    {
+		      *noalign = alg_noalign;
+		      return alg;
+		    }
+		  break;
+		}
+	      else if (alg_usable_p (candidate, memset))
+		{
+		  *noalign = algs->size[i].noalign;
+		  return candidate;
+		}
+	    }
+	}
+    }
+  /* When asked to inline the call anyway, try to pick meaningful choice.
+     We look for maximal size of block that is faster to copy by hand and
+     take blocks of at most of that size guessing that average size will
+     be roughly half of the block.
+
+     If this turns out to be bad, we might simply specify the preferred
+     choice in ix86_costs.  */
+  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+      && (algs->unknown_size == libcall
+	  || !alg_usable_p (algs->unknown_size, memset)))
+    {
+      enum stringop_alg alg;
+
+      /* If there aren't any usable algorithms, then recursing on
+         smaller sizes isn't going to find anything.  Just return the
+         simple byte-at-a-time copy loop.  */
+      if (!any_alg_usable_p)
+        {
+          /* Pick something reasonable.  */
+          if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+            *dynamic_check = 128;
+          return loop_1_byte;
+        }
+      if (max == -1)
+	max = 4096;
+      alg = decide_alg (count, max / 2, min_size, max_size, memset,
+			zero_memset, dynamic_check, noalign);
+      gcc_assert (*dynamic_check == -1);
+      gcc_assert (alg != libcall);
+      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+	*dynamic_check = max;
+      return alg;
+    }
+  return (alg_usable_p (algs->unknown_size, memset)
+	  ? algs->unknown_size : libcall);
+}
+
+/* Decide on alignment.  We know that the operand is already aligned to ALIGN
+   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
+static int
+decide_alignment (int align,
+		  enum stringop_alg alg,
+		  int expected_size,
+		  enum machine_mode move_mode)
+{
+  int desired_align = 0;
+
+  gcc_assert (alg != no_stringop);
+
+  if (alg == libcall)
+    return 0;
+  if (move_mode == VOIDmode)
+    return 0;
+
+  desired_align = GET_MODE_SIZE (move_mode);
+  /* PentiumPro has special logic triggering for 8 byte aligned blocks.
+     copying whole cacheline at once.  */
+  if (TARGET_PENTIUMPRO
+      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
+    desired_align = 8;
+
+  if (optimize_size)
+    desired_align = 1;
+  if (desired_align < align)
+    desired_align = align;
+  if (expected_size != -1 && expected_size < 4)
+    desired_align = align;
+
+  return desired_align;
+}
+
+
+/* Helper function for memcpy.  For QImode value 0xXY produce
+   0xXYXYXYXY of wide specified by MODE.  This is essentially
+   a * 0x10101010, but we can do slightly better than
+   synth_mult by unwinding the sequence by hand on CPUs with
+   slow multiply.  */
+static rtx
+promote_duplicated_reg (enum machine_mode mode, rtx val)
+{
+  enum machine_mode valmode = GET_MODE (val);
+  rtx tmp;
+  int nops = mode == DImode ? 3 : 2;
+
+  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
+  if (val == const0_rtx)
+    return copy_to_mode_reg (mode, CONST0_RTX (mode));
+  if (CONST_INT_P (val))
+    {
+      HOST_WIDE_INT v = INTVAL (val) & 255;
+
+      v |= v << 8;
+      v |= v << 16;
+      if (mode == DImode)
+        v |= (v << 16) << 16;
+      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
+    }
+
+  if (valmode == VOIDmode)
+    valmode = QImode;
+  if (valmode != QImode)
+    val = gen_lowpart (QImode, val);
+  if (mode == QImode)
+    return val;
+  if (!TARGET_PARTIAL_REG_STALL)
+    nops--;
+  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
+      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
+      <= (ix86_cost->shift_const + ix86_cost->add) * nops
+          + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
+    {
+      rtx reg = convert_modes (mode, QImode, val, true);
+      tmp = promote_duplicated_reg (mode, const1_rtx);
+      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
+				  OPTAB_DIRECT);
+    }
+  else
+    {
+      rtx reg = convert_modes (mode, QImode, val, true);
+
+      if (!TARGET_PARTIAL_REG_STALL)
+	if (mode == SImode)
+	  emit_insn (gen_movsi_insv_1 (reg, reg));
+	else
+	  emit_insn (gen_movdi_insv_1 (reg, reg));
+      else
+	{
+	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
+				     NULL, 1, OPTAB_DIRECT);
+	  reg =
+	    expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+	}
+      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
+			         NULL, 1, OPTAB_DIRECT);
+      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+      if (mode == SImode)
+	return reg;
+      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
+				 NULL, 1, OPTAB_DIRECT);
+      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+      return reg;
+    }
+}
+
+/* Duplicate value VAL using promote_duplicated_reg into maximal size that will
+   be needed by main loop copying SIZE_NEEDED chunks and prologue getting
+   alignment from ALIGN to DESIRED_ALIGN.  */
+static rtx
+promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
+				int align)
+{
+  rtx promoted_val;
+
+  if (TARGET_64BIT
+      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
+    promoted_val = promote_duplicated_reg (DImode, val);
+  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
+    promoted_val = promote_duplicated_reg (SImode, val);
+  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
+    promoted_val = promote_duplicated_reg (HImode, val);
+  else
+    promoted_val = val;
+
+  return promoted_val;
+}
+
+/* Expand string move (memcpy) ot store (memset) operation.  Use i386 string
+   operations when profitable.  The code depends upon architecture, block size
+   and alignment, but always has one of the following overall structures:
+
+   Aligned move sequence:
+
+     1) Prologue guard: Conditional that jumps up to epilogues for small
+	blocks that can be handled by epilogue alone.  This is faster
+	but also needed for correctness, since prologue assume the block
+	is larger than the desired alignment.
+
+	Optional dynamic check for size and libcall for large
+	blocks is emitted here too, with -minline-stringops-dynamically.
+
+     2) Prologue: copy first few bytes in order to get destination
+	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
+	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
+	copied.  We emit either a jump tree on power of two sized
+	blocks, or a byte loop.
+
+     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
+	with specified algorithm.
+
+     4) Epilogue: code copying tail of the block that is too small to be
+	handled by main body (or up to size guarded by prologue guard). 
+
+  Misaligned move sequence
+
+     1) missaligned move prologue/epilogue containing:
+        a) Prologue handling small memory blocks and jumping to done_label
+	   (skipped if blocks are known to be large enough)
+	b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
+           needed by single possibly misaligned move
+	   (skipped if alignment is not needed)
+        c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
+
+     2) Zero size guard dispatching to done_label, if needed
+
+     3) dispatch to library call, if needed,
+
+     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
+	with specified algorithm.  */
+bool
+ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
+			   rtx align_exp, rtx expected_align_exp,
+			   rtx expected_size_exp, rtx min_size_exp,
+			   rtx max_size_exp, rtx probable_max_size_exp,
+			   bool issetmem)
+{
+  rtx destreg;
+  rtx srcreg = NULL;
+  rtx label = NULL;
+  rtx tmp;
+  rtx jump_around_label = NULL;
+  HOST_WIDE_INT align = 1;
+  unsigned HOST_WIDE_INT count = 0;
+  HOST_WIDE_INT expected_size = -1;
+  int size_needed = 0, epilogue_size_needed;
+  int desired_align = 0, align_bytes = 0;
+  enum stringop_alg alg;
+  rtx promoted_val = NULL;
+  rtx vec_promoted_val = NULL;
+  bool force_loopy_epilogue = false;
+  int dynamic_check;
+  bool need_zero_guard = false;
+  bool noalign;
+  enum machine_mode move_mode = VOIDmode;
+  int unroll_factor = 1;
+  /* TODO: Once value ranges are available, fill in proper data.  */
+  unsigned HOST_WIDE_INT min_size = 0;
+  unsigned HOST_WIDE_INT max_size = -1;
+  unsigned HOST_WIDE_INT probable_max_size = -1;
+  bool misaligned_prologue_used = false;
+
+  if (CONST_INT_P (align_exp))
+    align = INTVAL (align_exp);
+  /* i386 can do misaligned access on reasonably increased cost.  */
+  if (CONST_INT_P (expected_align_exp)
+      && INTVAL (expected_align_exp) > align)
+    align = INTVAL (expected_align_exp);
+  /* ALIGN is the minimum of destination and source alignment, but we care here
+     just about destination alignment.  */
+  else if (!issetmem
+	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
+    align = MEM_ALIGN (dst) / BITS_PER_UNIT;
+
+  if (CONST_INT_P (count_exp))
+    min_size = max_size = probable_max_size = count = expected_size
+      = INTVAL (count_exp);
+  else
+    {
+      if (min_size_exp)
+	min_size = INTVAL (min_size_exp);
+      if (max_size_exp)
+	max_size = INTVAL (max_size_exp);
+      if (probable_max_size_exp)
+	probable_max_size = INTVAL (probable_max_size_exp);
+      if (CONST_INT_P (expected_size_exp) && count == 0)
+	expected_size = INTVAL (expected_size_exp);
+     }
+
+  /* Make sure we don't need to care about overflow later on.  */
+  if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
+    return false;
+
+  /* Step 0: Decide on preferred algorithm, desired alignment and
+     size of chunks to be copied by main loop.  */
+  alg = decide_alg (count, expected_size, min_size, probable_max_size,
+		    issetmem,
+		    issetmem && val_exp == const0_rtx,
+		    &dynamic_check, &noalign);
+  if (alg == libcall)
+    return false;
+  gcc_assert (alg != no_stringop);
+
+  /* For now vector-version of memset is generated only for memory zeroing, as
+     creating of promoted vector value is very cheap in this case.  */
+  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
+    alg = unrolled_loop;
+
+  if (!count)
+    count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
+  destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
+  if (!issetmem)
+    srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
+
+  unroll_factor = 1;
+  move_mode = word_mode;
+  switch (alg)
+    {
+    case libcall:
+    case no_stringop:
+    case last_alg:
+      gcc_unreachable ();
+    case loop_1_byte:
+      need_zero_guard = true;
+      move_mode = QImode;
+      break;
+    case loop:
+      need_zero_guard = true;
+      break;
+    case unrolled_loop:
+      need_zero_guard = true;
+      unroll_factor = (TARGET_64BIT ? 4 : 2);
+      break;
+    case vector_loop:
+      need_zero_guard = true;
+      unroll_factor = 4;
+      /* Find the widest supported mode.  */
+      move_mode = word_mode;
+      while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
+	     != CODE_FOR_nothing)
+	  move_mode = GET_MODE_WIDER_MODE (move_mode);
+
+      /* Find the corresponding vector mode with the same size as MOVE_MODE.
+	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
+      if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
+	{
+	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
+	  move_mode = mode_for_vector (word_mode, nunits);
+	  if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
+	    move_mode = word_mode;
+	}
+      gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
+      break;
+    case rep_prefix_8_byte:
+      move_mode = DImode;
+      break;
+    case rep_prefix_4_byte:
+      move_mode = SImode;
+      break;
+    case rep_prefix_1_byte:
+      move_mode = QImode;
+      break;
+    }
+  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
+  epilogue_size_needed = size_needed;
+
+  desired_align = decide_alignment (align, alg, expected_size, move_mode);
+  if (!TARGET_ALIGN_STRINGOPS || noalign)
+    align = desired_align;
+
+  /* Step 1: Prologue guard.  */
+
+  /* Alignment code needs count to be in register.  */
+  if (CONST_INT_P (count_exp) && desired_align > align)
+    {
+      if (INTVAL (count_exp) > desired_align
+	  && INTVAL (count_exp) > size_needed)
+	{
+	  align_bytes
+	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
+	  if (align_bytes <= 0)
+	    align_bytes = 0;
+	  else
+	    align_bytes = desired_align - align_bytes;
+	}
+      if (align_bytes == 0)
+	count_exp = force_reg (counter_mode (count_exp), count_exp);
+    }
+  gcc_assert (desired_align >= 1 && align >= 1);
+
+  /* Misaligned move sequences handle both prologue and epilogue at once.
+     Default code generation results in a smaller code for large alignments
+     and also avoids redundant job when sizes are known precisely.  */
+  misaligned_prologue_used
+    = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
+       && MAX (desired_align, epilogue_size_needed) <= 32
+       && desired_align <= epilogue_size_needed
+       && ((desired_align > align && !align_bytes)
+	   || (!count && epilogue_size_needed > 1)));
+
+  /* Do the cheap promotion to allow better CSE across the
+     main loop and epilogue (ie one load of the big constant in the
+     front of all code.  
+     For now the misaligned move sequences do not have fast path
+     without broadcasting.  */
+  if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
+    {
+      if (alg == vector_loop)
+	{
+	  gcc_assert (val_exp == const0_rtx);
+	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
+	  promoted_val = promote_duplicated_reg_to_size (val_exp,
+							 GET_MODE_SIZE (word_mode),
+							 desired_align, align);
+	}
+      else
+	{
+	  promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+							 desired_align, align);
+	}
+    }
+  /* Misaligned move sequences handles both prologues and epilogues at once.
+     Default code generation results in smaller code for large alignments and
+     also avoids redundant job when sizes are known precisely.  */
+  if (misaligned_prologue_used)
+    {
+      /* Misaligned move prologue handled small blocks by itself.  */
+      expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
+	   (dst, src, &destreg, &srcreg,
+	    move_mode, promoted_val, vec_promoted_val,
+	    &count_exp,
+	    &jump_around_label,
+            desired_align < align
+	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
+	    desired_align, align, &min_size, dynamic_check, issetmem);
+      if (!issetmem)
+        src = change_address (src, BLKmode, srcreg);
+      dst = change_address (dst, BLKmode, destreg);
+      set_mem_align (dst, desired_align * BITS_PER_UNIT);
+      epilogue_size_needed = 0;
+      if (need_zero_guard && !min_size)
+	{
+	  /* It is possible that we copied enough so the main loop will not
+	     execute.  */
+	  gcc_assert (size_needed > 1);
+	  if (jump_around_label == NULL_RTX)
+	    jump_around_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (count_exp,
+				   GEN_INT (size_needed),
+				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
+	  if (expected_size == -1
+	      || expected_size < (desired_align - align) / 2 + size_needed)
+	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
+	  else
+	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
+	}
+    }
+  /* Ensure that alignment prologue won't copy past end of block.  */
+  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
+    {
+      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
+      /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
+	 Make sure it is power of 2.  */
+      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
+
+      /* To improve performance of small blocks, we jump around the VAL
+	 promoting mode.  This mean that if the promoted VAL is not constant,
+	 we might not use it in the epilogue and have to use byte
+	 loop variant.  */
+      if (issetmem && epilogue_size_needed > 2 && !promoted_val)
+	force_loopy_epilogue = true;
+      if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
+	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
+	{
+	  /* If main algorithm works on QImode, no epilogue is needed.
+	     For small sizes just don't align anything.  */
+	  if (size_needed == 1)
+	    desired_align = align;
+	  else
+	    goto epilogue;
+	}
+      else if (!count
+	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
+	{
+	  label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (count_exp,
+				   GEN_INT (epilogue_size_needed),
+				   LTU, 0, counter_mode (count_exp), 1, label);
+	  if (expected_size == -1 || expected_size < epilogue_size_needed)
+	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
+	  else
+	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
+	}
+    }
+
+  /* Emit code to decide on runtime whether library call or inline should be
+     used.  */
+  if (dynamic_check != -1)
+    {
+      if (!issetmem && CONST_INT_P (count_exp))
+	{
+	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
+	    {
+	      emit_block_move_via_libcall (dst, src, count_exp, false);
+	      count_exp = const0_rtx;
+	      goto epilogue;
+	    }
+	}
+      else
+	{
+	  rtx hot_label = gen_label_rtx ();
+	  if (jump_around_label == NULL_RTX)
+	    jump_around_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
+				   LEU, 0, GET_MODE (count_exp), 1, hot_label);
+	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
+	  if (issetmem)
+	    set_storage_via_libcall (dst, count_exp, val_exp, false);
+	  else
+	    emit_block_move_via_libcall (dst, src, count_exp, false);
+	  emit_jump (jump_around_label);
+	  emit_label (hot_label);
+	}
+    }
+
+  /* Step 2: Alignment prologue.  */
+  /* Do the expensive promotion once we branched off the small blocks.  */
+  if (issetmem && !promoted_val)
+    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+						   desired_align, align);
+
+  if (desired_align > align && !misaligned_prologue_used)
+    {
+      if (align_bytes == 0)
+	{
+	  /* Except for the first move in prologue, we no longer know
+	     constant offset in aliasing info.  It don't seems to worth
+	     the pain to maintain it for the first move, so throw away
+	     the info early.  */
+	  dst = change_address (dst, BLKmode, destreg);
+	  if (!issetmem)
+	    src = change_address (src, BLKmode, srcreg);
+	  dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
+					    promoted_val, vec_promoted_val,
+					    count_exp, align, desired_align,
+					    issetmem);
+	  /* At most desired_align - align bytes are copied.  */
+	  if (min_size < (unsigned)(desired_align - align))
+	    min_size = 0;
+	  else
+	    min_size -= desired_align - align;
+	}
+      else
+	{
+	  /* If we know how many bytes need to be stored before dst is
+	     sufficiently aligned, maintain aliasing info accurately.  */
+	  dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
+							   srcreg,
+							   promoted_val,
+							   vec_promoted_val,
+							   desired_align,
+							   align_bytes,
+							   issetmem);
+
+	  count_exp = plus_constant (counter_mode (count_exp),
+				     count_exp, -align_bytes);
+	  count -= align_bytes;
+	  min_size -= align_bytes;
+	  max_size -= align_bytes;
+	}
+      if (need_zero_guard
+	  && !min_size
+	  && (count < (unsigned HOST_WIDE_INT) size_needed
+	      || (align_bytes == 0
+		  && count < ((unsigned HOST_WIDE_INT) size_needed
+			      + desired_align - align))))
+	{
+	  /* It is possible that we copied enough so the main loop will not
+	     execute.  */
+	  gcc_assert (size_needed > 1);
+	  if (label == NULL_RTX)
+	    label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (count_exp,
+				   GEN_INT (size_needed),
+				   LTU, 0, counter_mode (count_exp), 1, label);
+	  if (expected_size == -1
+	      || expected_size < (desired_align - align) / 2 + size_needed)
+	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
+	  else
+	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
+	}
+    }
+  if (label && size_needed == 1)
+    {
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+      label = NULL;
+      epilogue_size_needed = 1;
+      if (issetmem)
+	promoted_val = val_exp;
+    }
+  else if (label == NULL_RTX && !misaligned_prologue_used)
+    epilogue_size_needed = size_needed;
+
+  /* Step 3: Main loop.  */
+
+  switch (alg)
+    {
+    case libcall:
+    case no_stringop:
+    case last_alg:
+      gcc_unreachable ();
+    case loop_1_byte:
+    case loop:
+    case unrolled_loop:
+      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
+				     count_exp, move_mode, unroll_factor,
+				     expected_size, issetmem);
+      break;
+    case vector_loop:
+      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
+				     vec_promoted_val, count_exp, move_mode,
+				     unroll_factor, expected_size, issetmem);
+      break;
+    case rep_prefix_8_byte:
+    case rep_prefix_4_byte:
+    case rep_prefix_1_byte:
+      expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
+				       val_exp, count_exp, move_mode, issetmem);
+      break;
+    }
+  /* Adjust properly the offset of src and dest memory for aliasing.  */
+  if (CONST_INT_P (count_exp))
+    {
+      if (!issetmem)
+	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
+					    (count / size_needed) * size_needed);
+      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
+					  (count / size_needed) * size_needed);
+    }
+  else
+    {
+      if (!issetmem)
+	src = change_address (src, BLKmode, srcreg);
+      dst = change_address (dst, BLKmode, destreg);
+    }
+
+  /* Step 4: Epilogue to copy the remaining bytes.  */
+ epilogue:
+  if (label)
+    {
+      /* When the main loop is done, COUNT_EXP might hold original count,
+	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
+	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
+	 bytes. Compensate if needed.  */
+
+      if (size_needed < epilogue_size_needed)
+	{
+	  tmp =
+	    expand_simple_binop (counter_mode (count_exp), AND, count_exp,
+				 GEN_INT (size_needed - 1), count_exp, 1,
+				 OPTAB_DIRECT);
+	  if (tmp != count_exp)
+	    emit_move_insn (count_exp, tmp);
+	}
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+
+  if (count_exp != const0_rtx && epilogue_size_needed > 1)
+    {
+      if (force_loopy_epilogue)
+	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
+					 epilogue_size_needed);
+      else
+	{
+	  if (issetmem)
+	    expand_setmem_epilogue (dst, destreg, promoted_val,
+				    vec_promoted_val, count_exp,
+				    epilogue_size_needed);
+	  else
+	    expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
+				    epilogue_size_needed);
+	}
+    }
+  if (jump_around_label)
+    emit_label (jump_around_label);
+  return true;
+}
+
+
+/* Expand the appropriate insns for doing strlen if not just doing
+   repnz; scasb
+
+   out = result, initialized with the start address
+   align_rtx = alignment of the address.
+   scratch = scratch register, initialized with the startaddress when
+	not aligned, otherwise undefined
+
+   This is just the body. It needs the initializations mentioned above and
+   some address computing at the end.  These things are done in i386.md.  */
+
+static void
+ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
+{
+  int align;
+  rtx tmp;
+  rtx align_2_label = NULL_RTX;
+  rtx align_3_label = NULL_RTX;
+  rtx align_4_label = gen_label_rtx ();
+  rtx end_0_label = gen_label_rtx ();
+  rtx mem;
+  rtx tmpreg = gen_reg_rtx (SImode);
+  rtx scratch = gen_reg_rtx (SImode);
+  rtx cmp;
+
+  align = 0;
+  if (CONST_INT_P (align_rtx))
+    align = INTVAL (align_rtx);
+
+  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
+
+  /* Is there a known alignment and is it less than 4?  */
+  if (align < 4)
+    {
+      rtx scratch1 = gen_reg_rtx (Pmode);
+      emit_move_insn (scratch1, out);
+      /* Is there a known alignment and is it not 2? */
+      if (align != 2)
+	{
+	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
+	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
+
+	  /* Leave just the 3 lower bits.  */
+	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
+				    NULL_RTX, 0, OPTAB_WIDEN);
+
+	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
+				   Pmode, 1, align_4_label);
+	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
+				   Pmode, 1, align_2_label);
+	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
+				   Pmode, 1, align_3_label);
+	}
+      else
+        {
+	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
+	     check if is aligned to 4 - byte.  */
+
+	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
+				    NULL_RTX, 0, OPTAB_WIDEN);
+
+	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
+				   Pmode, 1, align_4_label);
+        }
+
+      mem = change_address (src, QImode, out);
+
+      /* Now compare the bytes.  */
+
+      /* Compare the first n unaligned byte on a byte per byte basis.  */
+      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
+			       QImode, 1, end_0_label);
+
+      /* Increment the address.  */
+      emit_insn (ix86_gen_add3 (out, out, const1_rtx));
+
+      /* Not needed with an alignment of 2 */
+      if (align != 2)
+	{
+	  emit_label (align_2_label);
+
+	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
+				   end_0_label);
+
+	  emit_insn (ix86_gen_add3 (out, out, const1_rtx));
+
+	  emit_label (align_3_label);
+	}
+
+      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
+			       end_0_label);
+
+      emit_insn (ix86_gen_add3 (out, out, const1_rtx));
+    }
+
+  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
+     align this loop.  It gives only huge programs, but does not help to
+     speed up.  */
+  emit_label (align_4_label);
+
+  mem = change_address (src, SImode, out);
+  emit_move_insn (scratch, mem);
+  emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
+
+  /* This formula yields a nonzero result iff one of the bytes is zero.
+     This saves three branches inside loop and many cycles.  */
+
+  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
+  emit_insn (gen_one_cmplsi2 (scratch, scratch));
+  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
+  emit_insn (gen_andsi3 (tmpreg, tmpreg,
+			 gen_int_mode (0x80808080, SImode)));
+  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
+			   align_4_label);
+
+  if (TARGET_CMOVE)
+    {
+       rtx reg = gen_reg_rtx (SImode);
+       rtx reg2 = gen_reg_rtx (Pmode);
+       emit_move_insn (reg, tmpreg);
+       emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
+
+       /* If zero is not in the first two bytes, move two bytes forward.  */
+       emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
+       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+       tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
+       emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
+			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
+						     reg,
+						     tmpreg)));
+       /* Emit lea manually to avoid clobbering of flags.  */
+       emit_insn (gen_rtx_SET (SImode, reg2,
+			       gen_rtx_PLUS (Pmode, out, const2_rtx)));
+
+       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+       tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
+       emit_insn (gen_rtx_SET (VOIDmode, out,
+			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
+						     reg2,
+						     out)));
+    }
+  else
+    {
+       rtx end_2_label = gen_label_rtx ();
+       /* Is zero in the first two bytes? */
+
+       emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
+       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+       tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
+       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+                            gen_rtx_LABEL_REF (VOIDmode, end_2_label),
+                            pc_rtx);
+       tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+       JUMP_LABEL (tmp) = end_2_label;
+
+       /* Not in the first two.  Move two bytes forward.  */
+       emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
+       emit_insn (ix86_gen_add3 (out, out, const2_rtx));
+
+       emit_label (end_2_label);
+
+    }
+
+  /* Avoid branch in fixing the byte.  */
+  tmpreg = gen_lowpart (QImode, tmpreg);
+  emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
+  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
+  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
+  emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
+
+  emit_label (end_0_label);
+}
+
+/* Expand strlen.  */
+
+bool
+ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
+{
+  rtx addr, scratch1, scratch2, scratch3, scratch4;
+
+  /* The generic case of strlen expander is long.  Avoid it's
+     expanding unless TARGET_INLINE_ALL_STRINGOPS.  */
+
+  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
+      && !TARGET_INLINE_ALL_STRINGOPS
+      && !optimize_insn_for_size_p ()
+      && (!CONST_INT_P (align) || INTVAL (align) < 4))
+    return false;
+
+  addr = force_reg (Pmode, XEXP (src, 0));
+  scratch1 = gen_reg_rtx (Pmode);
+
+  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
+      && !optimize_insn_for_size_p ())
+    {
+      /* Well it seems that some optimizer does not combine a call like
+         foo(strlen(bar), strlen(bar));
+         when the move and the subtraction is done here.  It does calculate
+         the length just once when these instructions are done inside of
+         output_strlen_unroll().  But I think since &bar[strlen(bar)] is
+         often used and I use one fewer register for the lifetime of
+         output_strlen_unroll() this is better.  */
+
+      emit_move_insn (out, addr);
+
+      ix86_expand_strlensi_unroll_1 (out, src, align);
+
+      /* strlensi_unroll_1 returns the address of the zero at the end of
+         the string, like memchr(), so compute the length by subtracting
+         the start address.  */
+      emit_insn (ix86_gen_sub3 (out, out, addr));
+    }
+  else
+    {
+      rtx unspec;
+
+      /* Can't use this if the user has appropriated eax, ecx, or edi.  */
+      if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
+        return false;
+
+      scratch2 = gen_reg_rtx (Pmode);
+      scratch3 = gen_reg_rtx (Pmode);
+      scratch4 = force_reg (Pmode, constm1_rtx);
+
+      emit_move_insn (scratch3, addr);
+      eoschar = force_reg (QImode, eoschar);
+
+      src = replace_equiv_address_nv (src, scratch3);
+
+      /* If .md starts supporting :P, this can be done in .md.  */
+      unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
+						 scratch4), UNSPEC_SCAS);
+      emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
+      emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
+      emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
+    }
+  return true;
+}
+
+/* For given symbol (function) construct code to compute address of it's PLT
+   entry in large x86-64 PIC model.  */
+static rtx
+construct_plt_address (rtx symbol)
+{
+  rtx tmp, unspec;
+
+  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
+  gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
+  gcc_assert (Pmode == DImode);
+
+  tmp = gen_reg_rtx (Pmode);
+  unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
+
+  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
+  emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
+  return tmp;
+}
+
+rtx
+ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
+		  rtx callarg2,
+		  rtx pop, bool sibcall)
+{
+  unsigned int const cregs_size
+    = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
+  rtx vec[3 + cregs_size];
+  rtx use = NULL, call;
+  unsigned int vec_len = 0;
+
+  if (pop == const0_rtx)
+    pop = NULL;
+  gcc_assert (!TARGET_64BIT || !pop);
+
+  if (TARGET_MACHO && !TARGET_64BIT)
+    {
+#if TARGET_MACHO
+      if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
+	fnaddr = machopic_indirect_call_target (fnaddr);
+#endif
+    }
+  else
+    {
+      /* Static functions and indirect calls don't need the pic register.  */
+      if (flag_pic
+	  && (!TARGET_64BIT
+	      || (ix86_cmodel == CM_LARGE_PIC
+		  && DEFAULT_ABI != MS_ABI))
+	  && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
+	  && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
+	use_reg (&use, pic_offset_table_rtx);
+    }
+
+  if (TARGET_64BIT && INTVAL (callarg2) >= 0)
+    {
+      rtx al = gen_rtx_REG (QImode, AX_REG);
+      emit_move_insn (al, callarg2);
+      use_reg (&use, al);
+    }
+
+  if (ix86_cmodel == CM_LARGE_PIC
+      && !TARGET_PECOFF
+      && MEM_P (fnaddr)
+      && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
+      && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
+    fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
+  else if (sibcall
+	   ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
+	   : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
+    {
+      fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
+      fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
+    }
+
+  call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
+  if (retval)
+    call = gen_rtx_SET (VOIDmode, retval, call);
+  vec[vec_len++] = call;
+
+  if (pop)
+    {
+      pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
+      pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
+      vec[vec_len++] = pop;
+    }
+
+  if (TARGET_64BIT_MS_ABI
+      && (!callarg2 || INTVAL (callarg2) != -2))
+    {
+      unsigned i;
+
+      vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
+				       UNSPEC_MS_TO_SYSV_CALL);
+
+      for (i = 0; i < cregs_size; i++)
+	{
+	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
+	  enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
+
+	  vec[vec_len++]
+	    = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
+	}
+    }
+
+  if (vec_len > 1)
+    call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
+  call = emit_call_insn (call);
+  if (use)
+    CALL_INSN_FUNCTION_USAGE (call) = use;
+
+  return call;
+}
+
+/* Output the assembly for a call instruction.  */
+
+const char *
+ix86_output_call_insn (rtx insn, rtx call_op)
+{
+  bool direct_p = constant_call_address_operand (call_op, VOIDmode);
+  bool seh_nop_p = false;
+  const char *xasm;
+
+  if (SIBLING_CALL_P (insn))
+    {
+      if (direct_p)
+	xasm = "jmp\t%P0";
+      /* SEH epilogue detection requires the indirect branch case
+	 to include REX.W.  */
+      else if (TARGET_SEH)
+	xasm = "rex.W jmp %A0";
+      else
+	xasm = "jmp\t%A0";
+
+      output_asm_insn (xasm, &call_op);
+      return "";
+    }
+
+  /* SEH unwinding can require an extra nop to be emitted in several
+     circumstances.  Determine if we have one of those.  */
+  if (TARGET_SEH)
+    {
+      rtx i;
+
+      for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
+	{
+	  /* If we get to another real insn, we don't need the nop.  */
+	  if (INSN_P (i))
+	    break;
+
+	  /* If we get to the epilogue note, prevent a catch region from
+	     being adjacent to the standard epilogue sequence.  If non-
+	     call-exceptions, we'll have done this during epilogue emission. */
+	  if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
+	      && !flag_non_call_exceptions
+	      && !can_throw_internal (insn))
+	    {
+	      seh_nop_p = true;
+	      break;
+	    }
+	}
+
+      /* If we didn't find a real insn following the call, prevent the
+	 unwinder from looking into the next function.  */
+      if (i == NULL)
+	seh_nop_p = true;
+    }
+
+  if (direct_p)
+    xasm = "call\t%P0";
+  else
+    xasm = "call\t%A0";
+
+  output_asm_insn (xasm, &call_op);
+
+  if (seh_nop_p)
+    return "nop";
+
+  return "";
+}
+
+/* Clear stack slot assignments remembered from previous functions.
+   This is called from INIT_EXPANDERS once before RTL is emitted for each
+   function.  */
+
+static struct machine_function *
+ix86_init_machine_status (void)
+{
+  struct machine_function *f;
+
+  f = ggc_alloc_cleared_machine_function ();
+  f->use_fast_prologue_epilogue_nregs = -1;
+  f->call_abi = ix86_abi;
+
+  return f;
+}
+
+/* Return a MEM corresponding to a stack slot with mode MODE.
+   Allocate a new slot if necessary.
+
+   The RTL for a function can have several slots available: N is
+   which slot to use.  */
+
+rtx
+assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
+{
+  struct stack_local_entry *s;
+
+  gcc_assert (n < MAX_386_STACK_LOCALS);
+
+  for (s = ix86_stack_locals; s; s = s->next)
+    if (s->mode == mode && s->n == n)
+      return validize_mem (copy_rtx (s->rtl));
+
+  s = ggc_alloc_stack_local_entry ();
+  s->n = n;
+  s->mode = mode;
+  s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
+
+  s->next = ix86_stack_locals;
+  ix86_stack_locals = s;
+  return validize_mem (s->rtl);
+}
+
+static void
+ix86_instantiate_decls (void)
+{
+  struct stack_local_entry *s;
+
+  for (s = ix86_stack_locals; s; s = s->next)
+    if (s->rtl != NULL_RTX)
+      instantiate_decl_rtl (s->rtl);
+}
+
+/* Check whether x86 address PARTS is a pc-relative address.  */
+
+static bool
+rip_relative_addr_p (struct ix86_address *parts)
+{
+  rtx base, index, disp;
+
+  base = parts->base;
+  index = parts->index;
+  disp = parts->disp;
+
+  if (disp && !base && !index)
+    {
+      if (TARGET_64BIT)
+	{
+	  rtx symbol = disp;
+
+	  if (GET_CODE (disp) == CONST)
+	    symbol = XEXP (disp, 0);
+	  if (GET_CODE (symbol) == PLUS
+	      && CONST_INT_P (XEXP (symbol, 1)))
+	    symbol = XEXP (symbol, 0);
+
+	  if (GET_CODE (symbol) == LABEL_REF
+	      || (GET_CODE (symbol) == SYMBOL_REF
+		  && SYMBOL_REF_TLS_MODEL (symbol) == 0)
+	      || (GET_CODE (symbol) == UNSPEC
+		  && (XINT (symbol, 1) == UNSPEC_GOTPCREL
+		      || XINT (symbol, 1) == UNSPEC_PCREL
+		      || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
+	    return true;
+	}
+    }
+  return false;
+}
+
+/* Calculate the length of the memory address in the instruction encoding.
+   Includes addr32 prefix, does not include the one-byte modrm, opcode,
+   or other prefixes.  We never generate addr32 prefix for LEA insn.  */
+
+int
+memory_address_length (rtx addr, bool lea)
+{
+  struct ix86_address parts;
+  rtx base, index, disp;
+  int len;
+  int ok;
+
+  if (GET_CODE (addr) == PRE_DEC
+      || GET_CODE (addr) == POST_INC
+      || GET_CODE (addr) == PRE_MODIFY
+      || GET_CODE (addr) == POST_MODIFY)
+    return 0;
+
+  ok = ix86_decompose_address (addr, &parts);
+  gcc_assert (ok);
+
+  len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
+
+  /*  If this is not LEA instruction, add the length of addr32 prefix.  */
+  if (TARGET_64BIT && !lea
+      && (SImode_address_operand (addr, VOIDmode)
+	  || (parts.base && GET_MODE (parts.base) == SImode)
+	  || (parts.index && GET_MODE (parts.index) == SImode)))
+    len++;
+
+  base = parts.base;
+  index = parts.index;
+  disp = parts.disp;
+
+  if (base && GET_CODE (base) == SUBREG)
+    base = SUBREG_REG (base);
+  if (index && GET_CODE (index) == SUBREG)
+    index = SUBREG_REG (index);
+
+  gcc_assert (base == NULL_RTX || REG_P (base));
+  gcc_assert (index == NULL_RTX || REG_P (index));
+
+  /* Rule of thumb:
+       - esp as the base always wants an index,
+       - ebp as the base always wants a displacement,
+       - r12 as the base always wants an index,
+       - r13 as the base always wants a displacement.  */
+
+  /* Register Indirect.  */
+  if (base && !index && !disp)
+    {
+      /* esp (for its index) and ebp (for its displacement) need
+	 the two-byte modrm form.  Similarly for r12 and r13 in 64-bit
+	 code.  */
+      if (base == arg_pointer_rtx
+	  || base == frame_pointer_rtx
+	  || REGNO (base) == SP_REG
+	  || REGNO (base) == BP_REG
+	  || REGNO (base) == R12_REG
+	  || REGNO (base) == R13_REG)
+	len++;
+    }
+
+  /* Direct Addressing.  In 64-bit mode mod 00 r/m 5
+     is not disp32, but disp32(%rip), so for disp32
+     SIB byte is needed, unless print_operand_address
+     optimizes it into disp32(%rip) or (%rip) is implied
+     by UNSPEC.  */
+  else if (disp && !base && !index)
+    {
+      len += 4;
+      if (rip_relative_addr_p (&parts))
+	len++;
+    }
+  else
+    {
+      /* Find the length of the displacement constant.  */
+      if (disp)
+	{
+	  if (base && satisfies_constraint_K (disp))
+	    len += 1;
+	  else
+	    len += 4;
+	}
+      /* ebp always wants a displacement.  Similarly r13.  */
+      else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
+	len++;
+
+      /* An index requires the two-byte modrm form....  */
+      if (index
+	  /* ...like esp (or r12), which always wants an index.  */
+	  || base == arg_pointer_rtx
+	  || base == frame_pointer_rtx
+	  || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
+	len++;
+    }
+
+  return len;
+}
+
+/* Compute default value for "length_immediate" attribute.  When SHORTFORM
+   is set, expect that insn have 8bit immediate alternative.  */
+int
+ix86_attr_length_immediate_default (rtx insn, bool shortform)
+{
+  int len = 0;
+  int i;
+  extract_insn_cached (insn);
+  for (i = recog_data.n_operands - 1; i >= 0; --i)
+    if (CONSTANT_P (recog_data.operand[i]))
+      {
+        enum attr_mode mode = get_attr_mode (insn);
+
+	gcc_assert (!len);
+	if (shortform && CONST_INT_P (recog_data.operand[i]))
+	  {
+	    HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
+	    switch (mode)
+	      {
+	      case MODE_QI:
+		len = 1;
+		continue;
+	      case MODE_HI:
+		ival = trunc_int_for_mode (ival, HImode);
+		break;
+	      case MODE_SI:
+		ival = trunc_int_for_mode (ival, SImode);
+		break;
+	      default:
+		break;
+	      }
+	    if (IN_RANGE (ival, -128, 127))
+	      {
+		len = 1;
+		continue;
+	      }
+	  }
+	switch (mode)
+	  {
+	  case MODE_QI:
+	    len = 1;
+	    break;
+	  case MODE_HI:
+	    len = 2;
+	    break;
+	  case MODE_SI:
+	    len = 4;
+	    break;
+	  /* Immediates for DImode instructions are encoded
+	     as 32bit sign extended values.  */
+	  case MODE_DI:
+	    len = 4;
+	    break;
+	  default:
+	    fatal_insn ("unknown insn mode", insn);
+	}
+      }
+  return len;
+}
+
+/* Compute default value for "length_address" attribute.  */
+int
+ix86_attr_length_address_default (rtx insn)
+{
+  int i;
+
+  if (get_attr_type (insn) == TYPE_LEA)
+    {
+      rtx set = PATTERN (insn), addr;
+
+      if (GET_CODE (set) == PARALLEL)
+	set = XVECEXP (set, 0, 0);
+
+      gcc_assert (GET_CODE (set) == SET);
+
+      addr = SET_SRC (set);
+
+      return memory_address_length (addr, true);
+    }
+
+  extract_insn_cached (insn);
+  for (i = recog_data.n_operands - 1; i >= 0; --i)
+    if (MEM_P (recog_data.operand[i]))
+      {
+        constrain_operands_cached (reload_completed);
+        if (which_alternative != -1)
+	  {
+	    const char *constraints = recog_data.constraints[i];
+	    int alt = which_alternative;
+
+	    while (*constraints == '=' || *constraints == '+')
+	      constraints++;
+	    while (alt-- > 0)
+	      while (*constraints++ != ',')
+		;
+	    /* Skip ignored operands.  */
+	    if (*constraints == 'X')
+	      continue;
+	  }
+	return memory_address_length (XEXP (recog_data.operand[i], 0), false);
+      }
+  return 0;
+}
+
+/* Compute default value for "length_vex" attribute. It includes
+   2 or 3 byte VEX prefix and 1 opcode byte.  */
+
+int
+ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
+{
+  int i;
+
+  /* Only 0f opcode can use 2 byte VEX prefix and  VEX W bit uses 3
+     byte VEX prefix.  */
+  if (!has_0f_opcode || has_vex_w)
+    return 3 + 1;
+
+ /* We can always use 2 byte VEX prefix in 32bit.  */
+  if (!TARGET_64BIT)
+    return 2 + 1;
+
+  extract_insn_cached (insn);
+
+  for (i = recog_data.n_operands - 1; i >= 0; --i)
+    if (REG_P (recog_data.operand[i]))
+      {
+	/* REX.W bit uses 3 byte VEX prefix.  */
+	if (GET_MODE (recog_data.operand[i]) == DImode
+	    && GENERAL_REG_P (recog_data.operand[i]))
+	  return 3 + 1;
+      }
+    else
+      {
+	/* REX.X or REX.B bits use 3 byte VEX prefix.  */
+	if (MEM_P (recog_data.operand[i])
+	    && x86_extended_reg_mentioned_p (recog_data.operand[i]))
+	  return 3 + 1;
+      }
+
+  return 2 + 1;
+}
+
+/* Return the maximum number of instructions a cpu can issue.  */
+
+static int
+ix86_issue_rate (void)
+{
+  switch (ix86_tune)
+    {
+    case PROCESSOR_PENTIUM:
+    case PROCESSOR_BONNELL:
+    case PROCESSOR_SILVERMONT:
+    case PROCESSOR_INTEL:
+    case PROCESSOR_K6:
+    case PROCESSOR_BTVER2:
+    case PROCESSOR_PENTIUM4:
+    case PROCESSOR_NOCONA:
+      return 2;
+
+    case PROCESSOR_PENTIUMPRO:
+    case PROCESSOR_ATHLON:
+    case PROCESSOR_K8:
+    case PROCESSOR_AMDFAM10:
+    case PROCESSOR_GENERIC:
+    case PROCESSOR_BTVER1:
+      return 3;
+
+    case PROCESSOR_BDVER1:
+    case PROCESSOR_BDVER2:
+    case PROCESSOR_BDVER3:
+    case PROCESSOR_BDVER4:
+    case PROCESSOR_CORE2:
+    case PROCESSOR_NEHALEM:
+    case PROCESSOR_SANDYBRIDGE:
+    case PROCESSOR_HASWELL:
+      return 4;
+
+    default:
+      return 1;
+    }
+}
+
+/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
+   by DEP_INSN and nothing set by DEP_INSN.  */
+
+static bool
+ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
+{
+  rtx set, set2;
+
+  /* Simplify the test for uninteresting insns.  */
+  if (insn_type != TYPE_SETCC
+      && insn_type != TYPE_ICMOV
+      && insn_type != TYPE_FCMOV
+      && insn_type != TYPE_IBR)
+    return false;
+
+  if ((set = single_set (dep_insn)) != 0)
+    {
+      set = SET_DEST (set);
+      set2 = NULL_RTX;
+    }
+  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
+	   && XVECLEN (PATTERN (dep_insn), 0) == 2
+	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
+	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
+    {
+      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
+      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
+    }
+  else
+    return false;
+
+  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
+    return false;
+
+  /* This test is true if the dependent insn reads the flags but
+     not any other potentially set register.  */
+  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
+    return false;
+
+  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
+    return false;
+
+  return true;
+}
+
+/* Return true iff USE_INSN has a memory address with operands set by
+   SET_INSN.  */
+
+bool
+ix86_agi_dependent (rtx set_insn, rtx use_insn)
+{
+  int i;
+  extract_insn_cached (use_insn);
+  for (i = recog_data.n_operands - 1; i >= 0; --i)
+    if (MEM_P (recog_data.operand[i]))
+      {
+	rtx addr = XEXP (recog_data.operand[i], 0);
+	return modified_in_p (addr, set_insn) != 0;
+      }
+  return false;
+}
+
+/* Helper function for exact_store_load_dependency.
+   Return true if addr is found in insn.  */
+static bool
+exact_dependency_1 (rtx addr, rtx insn)
+{
+  enum rtx_code code;
+  const char *format_ptr;
+  int i, j;
+
+  code = GET_CODE (insn);
+  switch (code)
+    {
+    case MEM:
+      if (rtx_equal_p (addr, insn))
+	return true;
+      break;
+    case REG:
+    CASE_CONST_ANY:
+    case SYMBOL_REF:
+    case CODE_LABEL:
+    case PC:
+    case CC0:
+    case EXPR_LIST:
+      return false;
+    default:
+      break;
+    }
+
+  format_ptr = GET_RTX_FORMAT (code);
+  for (i = 0; i < GET_RTX_LENGTH (code); i++)
+    {
+      switch (*format_ptr++)
+	{
+	case 'e':
+	  if (exact_dependency_1 (addr, XEXP (insn, i)))
+	    return true;
+	  break;
+	case 'E':
+	  for (j = 0; j < XVECLEN (insn, i); j++)
+	    if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
+	      return true;
+	    break;
+	}
+    }
+  return false;
+}
+
+/* Return true if there exists exact dependency for store & load, i.e.
+   the same memory address is used in them.  */
+static bool
+exact_store_load_dependency (rtx store, rtx load)
+{
+  rtx set1, set2;
+
+  set1 = single_set (store);
+  if (!set1)
+    return false;
+  if (!MEM_P (SET_DEST (set1)))
+    return false;
+  set2 = single_set (load);
+  if (!set2)
+    return false;
+  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
+    return true;
+  return false;
+}
+
+static int
+ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
+{
+  enum attr_type insn_type, dep_insn_type;
+  enum attr_memory memory;
+  rtx set, set2;
+  int dep_insn_code_number;
+
+  /* Anti and output dependencies have zero cost on all CPUs.  */
+  if (REG_NOTE_KIND (link) != 0)
+    return 0;
+
+  dep_insn_code_number = recog_memoized (dep_insn);
+
+  /* If we can't recognize the insns, we can't really do anything.  */
+  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
+    return cost;
+
+  insn_type = get_attr_type (insn);
+  dep_insn_type = get_attr_type (dep_insn);
+
+  switch (ix86_tune)
+    {
+    case PROCESSOR_PENTIUM:
+      /* Address Generation Interlock adds a cycle of latency.  */
+      if (insn_type == TYPE_LEA)
+	{
+	  rtx addr = PATTERN (insn);
+
+	  if (GET_CODE (addr) == PARALLEL)
+	    addr = XVECEXP (addr, 0, 0);
+
+	  gcc_assert (GET_CODE (addr) == SET);
+
+	  addr = SET_SRC (addr);
+	  if (modified_in_p (addr, dep_insn))
+	    cost += 1;
+	}
+      else if (ix86_agi_dependent (dep_insn, insn))
+	cost += 1;
+
+      /* ??? Compares pair with jump/setcc.  */
+      if (ix86_flags_dependent (insn, dep_insn, insn_type))
+	cost = 0;
+
+      /* Floating point stores require value to be ready one cycle earlier.  */
+      if (insn_type == TYPE_FMOV
+	  && get_attr_memory (insn) == MEMORY_STORE
+	  && !ix86_agi_dependent (dep_insn, insn))
+	cost += 1;
+      break;
+
+    case PROCESSOR_PENTIUMPRO:
+      /* INT->FP conversion is expensive.  */
+      if (get_attr_fp_int_src (dep_insn))
+	cost += 5;
+
+      /* There is one cycle extra latency between an FP op and a store.  */
+      if (insn_type == TYPE_FMOV
+	  && (set = single_set (dep_insn)) != NULL_RTX
+	  && (set2 = single_set (insn)) != NULL_RTX
+	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
+	  && MEM_P (SET_DEST (set2)))
+	cost += 1;
+
+      memory = get_attr_memory (insn);
+
+      /* Show ability of reorder buffer to hide latency of load by executing
+	 in parallel with previous instruction in case
+	 previous instruction is not needed to compute the address.  */
+      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	  && !ix86_agi_dependent (dep_insn, insn))
+	{
+	  /* Claim moves to take one cycle, as core can issue one load
+	     at time and the next load can start cycle later.  */
+	  if (dep_insn_type == TYPE_IMOV
+	      || dep_insn_type == TYPE_FMOV)
+	    cost = 1;
+	  else if (cost > 1)
+	    cost--;
+	}
+      break;
+
+    case PROCESSOR_K6:
+     /* The esp dependency is resolved before
+	the instruction is really finished.  */
+      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+	return 1;
+
+      /* INT->FP conversion is expensive.  */
+      if (get_attr_fp_int_src (dep_insn))
+	cost += 5;
+
+      memory = get_attr_memory (insn);
+
+      /* Show ability of reorder buffer to hide latency of load by executing
+	 in parallel with previous instruction in case
+	 previous instruction is not needed to compute the address.  */
+      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	  && !ix86_agi_dependent (dep_insn, insn))
+	{
+	  /* Claim moves to take one cycle, as core can issue one load
+	     at time and the next load can start cycle later.  */
+	  if (dep_insn_type == TYPE_IMOV
+	      || dep_insn_type == TYPE_FMOV)
+	    cost = 1;
+	  else if (cost > 2)
+	    cost -= 2;
+	  else
+	    cost = 1;
+	}
+      break;
+
+    case PROCESSOR_AMDFAM10:
+    case PROCESSOR_BDVER1:
+    case PROCESSOR_BDVER2:
+    case PROCESSOR_BDVER3:
+    case PROCESSOR_BDVER4:
+    case PROCESSOR_BTVER1:
+    case PROCESSOR_BTVER2:
+    case PROCESSOR_GENERIC:
+      /* Stack engine allows to execute push&pop instructions in parall.  */
+      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+	return 0;
+      /* FALLTHRU */
+
+    case PROCESSOR_ATHLON:
+    case PROCESSOR_K8:
+      memory = get_attr_memory (insn);
+
+      /* Show ability of reorder buffer to hide latency of load by executing
+	 in parallel with previous instruction in case
+	 previous instruction is not needed to compute the address.  */
+      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	  && !ix86_agi_dependent (dep_insn, insn))
+	{
+	  enum attr_unit unit = get_attr_unit (insn);
+	  int loadcost = 3;
+
+	  /* Because of the difference between the length of integer and
+	     floating unit pipeline preparation stages, the memory operands
+	     for floating point are cheaper.
+
+	     ??? For Athlon it the difference is most probably 2.  */
+	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
+	    loadcost = 3;
+	  else
+	    loadcost = TARGET_ATHLON ? 2 : 0;
+
+	  if (cost >= loadcost)
+	    cost -= loadcost;
+	  else
+	    cost = 0;
+	}
+      break;
+
+    case PROCESSOR_CORE2:
+    case PROCESSOR_NEHALEM:
+    case PROCESSOR_SANDYBRIDGE:
+    case PROCESSOR_HASWELL:
+      /* Stack engine allows to execute push&pop instructions in parall.  */
+      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
+	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
+	return 0;
+
+      memory = get_attr_memory (insn);
+
+      /* Show ability of reorder buffer to hide latency of load by executing
+	 in parallel with previous instruction in case
+	 previous instruction is not needed to compute the address.  */
+      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	  && !ix86_agi_dependent (dep_insn, insn))
+	{
+	  if (cost >= 4)
+	    cost -= 4;
+	  else
+	    cost = 0;
+	}
+      break;
+
+    case PROCESSOR_SILVERMONT:
+    case PROCESSOR_INTEL:
+      if (!reload_completed)
+	return cost;
+
+      /* Increase cost of integer loads.  */
+      memory = get_attr_memory (dep_insn);
+      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
+	{
+	  enum attr_unit unit = get_attr_unit (dep_insn);
+	  if (unit == UNIT_INTEGER && cost == 1)
+	    {
+	      if (memory == MEMORY_LOAD)
+		cost = 3;
+	      else
+		{
+		  /* Increase cost of ld/st for short int types only
+		     because of store forwarding issue.  */
+		  rtx set = single_set (dep_insn);
+		  if (set && (GET_MODE (SET_DEST (set)) == QImode
+			      || GET_MODE (SET_DEST (set)) == HImode))
+		    {
+		      /* Increase cost of store/load insn if exact
+			 dependence exists and it is load insn.  */
+		      enum attr_memory insn_memory = get_attr_memory (insn);
+		      if (insn_memory == MEMORY_LOAD
+			  && exact_store_load_dependency (dep_insn, insn))
+			cost = 3;
+		    }
+		}
+	    }
+	}
+
+    default:
+      break;
+    }
+
+  return cost;
+}
+
+/* How many alternative schedules to try.  This should be as wide as the
+   scheduling freedom in the DFA, but no wider.  Making this value too
+   large results extra work for the scheduler.  */
+
+static int
+ia32_multipass_dfa_lookahead (void)
+{
+  switch (ix86_tune)
+    {
+    case PROCESSOR_PENTIUM:
+      return 2;
+
+    case PROCESSOR_PENTIUMPRO:
+    case PROCESSOR_K6:
+      return 1;
+
+    case PROCESSOR_BDVER1:
+    case PROCESSOR_BDVER2:
+    case PROCESSOR_BDVER3:
+    case PROCESSOR_BDVER4:
+      /* We use lookahead value 4 for BD both before and after reload
+	 schedules. Plan is to have value 8 included for O3. */
+        return 4;
+
+    case PROCESSOR_CORE2:
+    case PROCESSOR_NEHALEM:
+    case PROCESSOR_SANDYBRIDGE:
+    case PROCESSOR_HASWELL:
+    case PROCESSOR_BONNELL:
+    case PROCESSOR_SILVERMONT:
+    case PROCESSOR_INTEL:
+      /* Generally, we want haifa-sched:max_issue() to look ahead as far
+	 as many instructions can be executed on a cycle, i.e.,
+	 issue_rate.  I wonder why tuning for many CPUs does not do this.  */
+      if (reload_completed)
+        return ix86_issue_rate ();
+      /* Don't use lookahead for pre-reload schedule to save compile time.  */
+      return 0;
+
+    default:
+      return 0;
+    }
+}
+
+/* Return true if target platform supports macro-fusion.  */
+
+static bool
+ix86_macro_fusion_p ()
+{
+  return TARGET_FUSE_CMP_AND_BRANCH;
+}
+
+/* Check whether current microarchitecture support macro fusion
+   for insn pair "CONDGEN + CONDJMP". Refer to
+   "Intel Architectures Optimization Reference Manual". */
+
+static bool
+ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
+{
+  rtx src, dest;
+  rtx single_set = single_set (condgen);
+  enum rtx_code ccode;
+  rtx compare_set = NULL_RTX, test_if, cond;
+  rtx alu_set = NULL_RTX, addr = NULL_RTX;
+
+  if (get_attr_type (condgen) != TYPE_TEST
+      && get_attr_type (condgen) != TYPE_ICMP
+      && get_attr_type (condgen) != TYPE_INCDEC
+      && get_attr_type (condgen) != TYPE_ALU)
+    return false;
+
+  if (single_set == NULL_RTX
+      && !TARGET_FUSE_ALU_AND_BRANCH)
+    return false;
+
+  if (single_set != NULL_RTX)
+    compare_set = single_set;
+  else
+    {
+      int i;
+      rtx pat = PATTERN (condgen);
+      for (i = 0; i < XVECLEN (pat, 0); i++)
+	if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
+	  {
+	    rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
+	    if (GET_CODE (set_src) == COMPARE)
+	      compare_set = XVECEXP (pat, 0, i);
+	    else
+	      alu_set = XVECEXP (pat, 0, i);
+	  }
+    }
+  if (compare_set == NULL_RTX)
+    return false;
+  src = SET_SRC (compare_set);
+  if (GET_CODE (src) != COMPARE)
+    return false;
+
+  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
+     supported.  */
+  if ((MEM_P (XEXP (src, 0))
+       && CONST_INT_P (XEXP (src, 1)))
+      || (MEM_P (XEXP (src, 1))
+	  && CONST_INT_P (XEXP (src, 0))))
+    return false;
+
+  /* No fusion for RIP-relative address.  */
+  if (MEM_P (XEXP (src, 0)))
+    addr = XEXP (XEXP (src, 0), 0);
+  else if (MEM_P (XEXP (src, 1)))
+    addr = XEXP (XEXP (src, 1), 0);
+
+  if (addr) {
+    ix86_address parts;
+    int ok = ix86_decompose_address (addr, &parts);
+    gcc_assert (ok);
+
+    if (rip_relative_addr_p (&parts))
+      return false;
+  }
+
+  test_if = SET_SRC (pc_set (condjmp));
+  cond = XEXP (test_if, 0);
+  ccode = GET_CODE (cond);
+  /* Check whether conditional jump use Sign or Overflow Flags.  */
+  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
+      && (ccode == GE
+          || ccode == GT
+	  || ccode == LE
+	  || ccode == LT))
+    return false;
+
+  /* Return true for TYPE_TEST and TYPE_ICMP.  */
+  if (get_attr_type (condgen) == TYPE_TEST
+      || get_attr_type (condgen) == TYPE_ICMP)
+    return true;
+
+  /* The following is the case that macro-fusion for alu + jmp.  */
+  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
+    return false;
+
+  /* No fusion for alu op with memory destination operand.  */
+  dest = SET_DEST (alu_set);
+  if (MEM_P (dest))
+    return false;
+
+  /* Macro-fusion for inc/dec + unsigned conditional jump is not
+     supported.  */
+  if (get_attr_type (condgen) == TYPE_INCDEC
+      && (ccode == GEU
+	  || ccode == GTU
+	  || ccode == LEU
+	  || ccode == LTU))
+    return false;
+
+  return true;
+}
+
+/* Try to reorder ready list to take advantage of Atom pipelined IMUL
+   execution. It is applied if
+   (1) IMUL instruction is on the top of list;
+   (2) There exists the only producer of independent IMUL instruction in
+       ready list.
+   Return index of IMUL producer if it was found and -1 otherwise.  */
+static int
+do_reorder_for_imul (rtx *ready, int n_ready)
+{
+  rtx insn, set, insn1, insn2;
+  sd_iterator_def sd_it;
+  dep_t dep;
+  int index = -1;
+  int i;
+
+  if (!TARGET_BONNELL)
+    return index;
+
+  /* Check that IMUL instruction is on the top of ready list.  */
+  insn = ready[n_ready - 1];
+  set = single_set (insn);
+  if (!set)
+    return index;
+  if (!(GET_CODE (SET_SRC (set)) == MULT
+      && GET_MODE (SET_SRC (set)) == SImode))
+    return index;
+
+  /* Search for producer of independent IMUL instruction.  */
+  for (i = n_ready - 2; i >= 0; i--)
+    {
+      insn = ready[i];
+      if (!NONDEBUG_INSN_P (insn))
+	continue;
+      /* Skip IMUL instruction.  */
+      insn2 = PATTERN (insn);
+      if (GET_CODE (insn2) == PARALLEL)
+	insn2 = XVECEXP (insn2, 0, 0);
+      if (GET_CODE (insn2) == SET
+	  && GET_CODE (SET_SRC (insn2)) == MULT
+	  && GET_MODE (SET_SRC (insn2)) == SImode)
+	continue;
+
+      FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
+	{
+	  rtx con;
+	  con = DEP_CON (dep);
+	  if (!NONDEBUG_INSN_P (con))
+	    continue;
+	  insn1 = PATTERN (con);
+	  if (GET_CODE (insn1) == PARALLEL)
+	    insn1 = XVECEXP (insn1, 0, 0);
+
+	  if (GET_CODE (insn1) == SET
+	      && GET_CODE (SET_SRC (insn1)) == MULT
+	      && GET_MODE (SET_SRC (insn1)) == SImode)
+	    {
+	      sd_iterator_def sd_it1;
+	      dep_t dep1;
+	      /* Check if there is no other dependee for IMUL.  */
+	      index = i;
+	      FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
+		{
+		  rtx pro;
+		  pro = DEP_PRO (dep1);
+		  if (!NONDEBUG_INSN_P (pro))
+		    continue;
+		  if (pro != insn)
+		    index = -1;
+		}
+	      if (index >= 0)
+		break;
+	    }
+	}
+      if (index >= 0)
+	break;
+    }
+  return index;
+}
+
+/* Try to find the best candidate on the top of ready list if two insns
+   have the same priority - candidate is best if its dependees were
+   scheduled earlier. Applied for Silvermont only.
+   Return true if top 2 insns must be interchanged.  */
+static bool
+swap_top_of_ready_list (rtx *ready, int n_ready)
+{
+  rtx top = ready[n_ready - 1];
+  rtx next = ready[n_ready - 2];
+  rtx set;
+  sd_iterator_def sd_it;
+  dep_t dep;
+  int clock1 = -1;
+  int clock2 = -1;
+  #define INSN_TICK(INSN) (HID (INSN)->tick)
+
+  if (!TARGET_SILVERMONT && !TARGET_INTEL)
+    return false;
+
+  if (!NONDEBUG_INSN_P (top))
+    return false;
+  if (!NONJUMP_INSN_P (top))
+    return false;
+  if (!NONDEBUG_INSN_P (next))
+    return false;
+  if (!NONJUMP_INSN_P (next))
+    return false;
+  set = single_set (top);
+  if (!set)
+    return false;
+  set = single_set (next);
+  if (!set)
+    return false;
+
+  if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
+    {
+      if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
+	return false;
+      /* Determine winner more precise.  */
+      FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
+	{
+	  rtx pro;
+	  pro = DEP_PRO (dep);
+	  if (!NONDEBUG_INSN_P (pro))
+	    continue;
+	  if (INSN_TICK (pro) > clock1)
+	    clock1 = INSN_TICK (pro);
+	}
+      FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
+	{
+	  rtx pro;
+	  pro = DEP_PRO (dep);
+	  if (!NONDEBUG_INSN_P (pro))
+	    continue;
+	  if (INSN_TICK (pro) > clock2)
+	    clock2 = INSN_TICK (pro);
+	}
+
+      if (clock1 == clock2)
+	{
+	  /* Determine winner - load must win. */
+	  enum attr_memory memory1, memory2;
+	  memory1 = get_attr_memory (top);
+	  memory2 = get_attr_memory (next);
+	  if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
+	    return true;
+	}
+	return (bool) (clock2 < clock1);
+    }
+  return false;
+  #undef INSN_TICK
+}
+
+/* Perform possible reodering of ready list for Atom/Silvermont only.
+   Return issue rate.  */
+static int
+ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
+		   int clock_var)
+{
+  int issue_rate = -1;
+  int n_ready = *pn_ready;
+  int i;
+  rtx insn;
+  int index = -1;
+
+  /* Set up issue rate.  */
+  issue_rate = ix86_issue_rate ();
+
+  /* Do reodering for BONNELL/SILVERMONT only.  */
+  if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
+    return issue_rate;
+
+  /* Nothing to do if ready list contains only 1 instruction.  */
+  if (n_ready <= 1)
+    return issue_rate;
+
+  /* Do reodering for post-reload scheduler only.  */
+  if (!reload_completed)
+    return issue_rate;
+
+  if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
+    {
+      if (sched_verbose > 1)
+	fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
+		 INSN_UID (ready[index]));
+
+      /* Put IMUL producer (ready[index]) at the top of ready list.  */
+      insn = ready[index];
+      for (i = index; i < n_ready - 1; i++)
+	ready[i] = ready[i + 1];
+      ready[n_ready - 1] = insn;
+      return issue_rate;
+    }
+  if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
+    {
+      if (sched_verbose > 1)
+	fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
+		 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
+      /* Swap 2 top elements of ready list.  */
+      insn = ready[n_ready - 1];
+      ready[n_ready - 1] = ready[n_ready - 2];
+      ready[n_ready - 2] = insn;
+    }
+  return issue_rate;
+}
+
+static bool
+ix86_class_likely_spilled_p (reg_class_t);
+
+/* Returns true if lhs of insn is HW function argument register and set up
+   is_spilled to true if it is likely spilled HW register.  */
+static bool
+insn_is_function_arg (rtx insn, bool* is_spilled)
+{
+  rtx dst;
+
+  if (!NONDEBUG_INSN_P (insn))
+    return false;
+  /* Call instructions are not movable, ignore it.  */
+  if (CALL_P (insn))
+    return false;
+  insn = PATTERN (insn);
+  if (GET_CODE (insn) == PARALLEL)
+    insn = XVECEXP (insn, 0, 0);
+  if (GET_CODE (insn) != SET)
+    return false;
+  dst = SET_DEST (insn);
+  if (REG_P (dst) && HARD_REGISTER_P (dst)
+      && ix86_function_arg_regno_p (REGNO (dst)))
+    {
+      /* Is it likely spilled HW register?  */
+      if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
+	  && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
+	*is_spilled = true;
+      return true;
+    }
+  return false;
+}
+
+/* Add output dependencies for chain of function adjacent arguments if only
+   there is a move to likely spilled HW register.  Return first argument
+   if at least one dependence was added or NULL otherwise.  */
+static rtx
+add_parameter_dependencies (rtx call, rtx head)
+{
+  rtx insn;
+  rtx last = call;
+  rtx first_arg = NULL;
+  bool is_spilled = false;
+
+  head = PREV_INSN (head);
+
+  /* Find nearest to call argument passing instruction.  */
+  while (true)
+    {
+      last = PREV_INSN (last);
+      if (last == head)
+	return NULL;
+      if (!NONDEBUG_INSN_P (last))
+	continue;
+      if (insn_is_function_arg (last, &is_spilled))
+	break;
+      return NULL;
+    }
+
+  first_arg = last;
+  while (true)
+    {
+      insn = PREV_INSN (last);
+      if (!INSN_P (insn))
+	break;
+      if (insn == head)
+	break;
+      if (!NONDEBUG_INSN_P (insn))
+	{
+	  last = insn;
+	  continue;
+	}
+      if (insn_is_function_arg (insn, &is_spilled))
+	{
+	  /* Add output depdendence between two function arguments if chain
+	     of output arguments contains likely spilled HW registers.  */
+	  if (is_spilled)
+	    add_dependence (first_arg, insn, REG_DEP_OUTPUT);
+	  first_arg = last = insn;
+	}
+      else
+	break;
+    }
+  if (!is_spilled)
+    return NULL;
+  return first_arg;
+}
+
+/* Add output or anti dependency from insn to first_arg to restrict its code
+   motion.  */
+static void
+avoid_func_arg_motion (rtx first_arg, rtx insn)
+{
+  rtx set;
+  rtx tmp;
+
+  set = single_set (insn);
+  if (!set)
+    return;
+  tmp = SET_DEST (set);
+  if (REG_P (tmp))
+    {
+      /* Add output dependency to the first function argument.  */
+      add_dependence (first_arg, insn, REG_DEP_OUTPUT);
+      return;
+    }
+  /* Add anti dependency.  */
+  add_dependence (first_arg, insn, REG_DEP_ANTI);
+}
+
+/* Avoid cross block motion of function argument through adding dependency
+   from the first non-jump instruction in bb.  */
+static void
+add_dependee_for_func_arg (rtx arg, basic_block bb)
+{
+  rtx insn = BB_END (bb);
+
+  while (insn)
+    {
+      if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
+	{
+	  rtx set = single_set (insn);
+	  if (set)
+	    {
+	      avoid_func_arg_motion (arg, insn);
+	      return;
+	    }
+	}
+      if (insn == BB_HEAD (bb))
+	return;
+      insn = PREV_INSN (insn);
+    }
+}
+
+/* Hook for pre-reload schedule - avoid motion of function arguments
+   passed in likely spilled HW registers.  */
+static void
+ix86_dependencies_evaluation_hook (rtx head, rtx tail)
+{
+  rtx insn;
+  rtx first_arg = NULL;
+  if (reload_completed)
+    return;
+  while (head != tail && DEBUG_INSN_P (head))
+    head = NEXT_INSN (head);
+  for (insn = tail; insn != head; insn = PREV_INSN (insn))
+    if (INSN_P (insn) && CALL_P (insn))
+      {
+	first_arg = add_parameter_dependencies (insn, head);
+	if (first_arg)
+	  {
+	    /* Add dependee for first argument to predecessors if only
+	       region contains more than one block.  */
+	    basic_block bb =  BLOCK_FOR_INSN (insn);
+	    int rgn = CONTAINING_RGN (bb->index);
+	    int nr_blks = RGN_NR_BLOCKS (rgn);
+	    /* Skip trivial regions and region head blocks that can have
+	       predecessors outside of region.  */
+	    if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
+	      {
+		edge e;
+		edge_iterator ei;
+		/* Assume that region is SCC, i.e. all immediate predecessors
+	           of non-head block are in the same region.  */
+		FOR_EACH_EDGE (e, ei, bb->preds)
+		  {
+		    /* Avoid creating of loop-carried dependencies through
+		       using topological odering in region.  */
+		    if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
+		      add_dependee_for_func_arg (first_arg, e->src); 
+		  }
+	      }
+	    insn = first_arg;
+	    if (insn == head)
+	      break;
+	  }
+      }
+    else if (first_arg)
+      avoid_func_arg_motion (first_arg, insn);
+}
+
+/* Hook for pre-reload schedule - set priority of moves from likely spilled
+   HW registers to maximum, to schedule them at soon as possible. These are
+   moves from function argument registers at the top of the function entry
+   and moves from function return value registers after call.  */
+static int
+ix86_adjust_priority (rtx insn, int priority)
+{
+  rtx set;
+
+  if (reload_completed)
+    return priority;
+
+  if (!NONDEBUG_INSN_P (insn))
+    return priority;
+
+  set = single_set (insn);
+  if (set)
+    {
+      rtx tmp = SET_SRC (set);
+      if (REG_P (tmp)
+          && HARD_REGISTER_P (tmp)
+          && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
+          && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
+	return current_sched_info->sched_max_insns_priority;
+    }
+
+  return priority;
+}
+
+/* Model decoder of Core 2/i7.
+   Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
+   track the instruction fetch block boundaries and make sure that long
+   (9+ bytes) instructions are assigned to D0.  */
+
+/* Maximum length of an insn that can be handled by
+   a secondary decoder unit.  '8' for Core 2/i7.  */
+static int core2i7_secondary_decoder_max_insn_size;
+
+/* Ifetch block size, i.e., number of bytes decoder reads per cycle.
+   '16' for Core 2/i7.  */
+static int core2i7_ifetch_block_size;
+
+/* Maximum number of instructions decoder can handle per cycle.
+   '6' for Core 2/i7.  */
+static int core2i7_ifetch_block_max_insns;
+
+typedef struct ix86_first_cycle_multipass_data_ *
+  ix86_first_cycle_multipass_data_t;
+typedef const struct ix86_first_cycle_multipass_data_ *
+  const_ix86_first_cycle_multipass_data_t;
+
+/* A variable to store target state across calls to max_issue within
+   one cycle.  */
+static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
+  *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
+
+/* Initialize DATA.  */
+static void
+core2i7_first_cycle_multipass_init (void *_data)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+
+  data->ifetch_block_len = 0;
+  data->ifetch_block_n_insns = 0;
+  data->ready_try_change = NULL;
+  data->ready_try_change_size = 0;
+}
+
+/* Advancing the cycle; reset ifetch block counts.  */
+static void
+core2i7_dfa_post_advance_cycle (void)
+{
+  ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
+
+  gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+  data->ifetch_block_len = 0;
+  data->ifetch_block_n_insns = 0;
+}
+
+static int min_insn_size (rtx);
+
+/* Filter out insns from ready_try that the core will not be able to issue
+   on current cycle due to decoder.  */
+static void
+core2i7_first_cycle_multipass_filter_ready_try
+(const_ix86_first_cycle_multipass_data_t data,
+ char *ready_try, int n_ready, bool first_cycle_insn_p)
+{
+  while (n_ready--)
+    {
+      rtx insn;
+      int insn_size;
+
+      if (ready_try[n_ready])
+	continue;
+
+      insn = get_ready_element (n_ready);
+      insn_size = min_insn_size (insn);
+
+      if (/* If this is a too long an insn for a secondary decoder ...  */
+	  (!first_cycle_insn_p
+	   && insn_size > core2i7_secondary_decoder_max_insn_size)
+	  /* ... or it would not fit into the ifetch block ...  */
+	  || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
+	  /* ... or the decoder is full already ...  */
+	  || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
+	/* ... mask the insn out.  */
+	{
+	  ready_try[n_ready] = 1;
+
+	  if (data->ready_try_change)
+	    bitmap_set_bit (data->ready_try_change, n_ready);
+	}
+    }
+}
+
+/* Prepare for a new round of multipass lookahead scheduling.  */
+static void
+core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
+				     bool first_cycle_insn_p)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+  const_ix86_first_cycle_multipass_data_t prev_data
+    = ix86_first_cycle_multipass_data;
+
+  /* Restore the state from the end of the previous round.  */
+  data->ifetch_block_len = prev_data->ifetch_block_len;
+  data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
+
+  /* Filter instructions that cannot be issued on current cycle due to
+     decoder restrictions.  */
+  core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+						  first_cycle_insn_p);
+}
+
+/* INSN is being issued in current solution.  Account for its impact on
+   the decoder model.  */
+static void
+core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
+				     rtx insn, const void *_prev_data)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+  const_ix86_first_cycle_multipass_data_t prev_data
+    = (const_ix86_first_cycle_multipass_data_t) _prev_data;
+
+  int insn_size = min_insn_size (insn);
+
+  data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
+  data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
+  gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
+	      && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+  /* Allocate or resize the bitmap for storing INSN's effect on ready_try.  */
+  if (!data->ready_try_change)
+    {
+      data->ready_try_change = sbitmap_alloc (n_ready);
+      data->ready_try_change_size = n_ready;
+    }
+  else if (data->ready_try_change_size < n_ready)
+    {
+      data->ready_try_change = sbitmap_resize (data->ready_try_change,
+					       n_ready, 0);
+      data->ready_try_change_size = n_ready;
+    }
+  bitmap_clear (data->ready_try_change);
+
+  /* Filter out insns from ready_try that the core will not be able to issue
+     on current cycle due to decoder.  */
+  core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+						  false);
+}
+
+/* Revert the effect on ready_try.  */
+static void
+core2i7_first_cycle_multipass_backtrack (const void *_data,
+					 char *ready_try,
+					 int n_ready ATTRIBUTE_UNUSED)
+{
+  const_ix86_first_cycle_multipass_data_t data
+    = (const_ix86_first_cycle_multipass_data_t) _data;
+  unsigned int i = 0;
+  sbitmap_iterator sbi;
+
+  gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
+  EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
+    {
+      ready_try[i] = 0;
+    }
+}
+
+/* Save the result of multipass lookahead scheduling for the next round.  */
+static void
+core2i7_first_cycle_multipass_end (const void *_data)
+{
+  const_ix86_first_cycle_multipass_data_t data
+    = (const_ix86_first_cycle_multipass_data_t) _data;
+  ix86_first_cycle_multipass_data_t next_data
+    = ix86_first_cycle_multipass_data;
+
+  if (data != NULL)
+    {
+      next_data->ifetch_block_len = data->ifetch_block_len;
+      next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
+    }
+}
+
+/* Deallocate target data.  */
+static void
+core2i7_first_cycle_multipass_fini (void *_data)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+
+  if (data->ready_try_change)
+    {
+      sbitmap_free (data->ready_try_change);
+      data->ready_try_change = NULL;
+      data->ready_try_change_size = 0;
+    }
+}
+
+/* Prepare for scheduling pass.  */
+static void
+ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
+			int verbose ATTRIBUTE_UNUSED,
+			int max_uid ATTRIBUTE_UNUSED)
+{
+  /* Install scheduling hooks for current CPU.  Some of these hooks are used
+     in time-critical parts of the scheduler, so we only set them up when
+     they are actually used.  */
+  switch (ix86_tune)
+    {
+    case PROCESSOR_CORE2:
+    case PROCESSOR_NEHALEM:
+    case PROCESSOR_SANDYBRIDGE:
+    case PROCESSOR_HASWELL:
+      /* Do not perform multipass scheduling for pre-reload schedule
+         to save compile time.  */
+      if (reload_completed)
+	{
+	  targetm.sched.dfa_post_advance_cycle
+	    = core2i7_dfa_post_advance_cycle;
+	  targetm.sched.first_cycle_multipass_init
+	    = core2i7_first_cycle_multipass_init;
+	  targetm.sched.first_cycle_multipass_begin
+	    = core2i7_first_cycle_multipass_begin;
+	  targetm.sched.first_cycle_multipass_issue
+	    = core2i7_first_cycle_multipass_issue;
+	  targetm.sched.first_cycle_multipass_backtrack
+	    = core2i7_first_cycle_multipass_backtrack;
+	  targetm.sched.first_cycle_multipass_end
+	    = core2i7_first_cycle_multipass_end;
+	  targetm.sched.first_cycle_multipass_fini
+	    = core2i7_first_cycle_multipass_fini;
+
+	  /* Set decoder parameters.  */
+	  core2i7_secondary_decoder_max_insn_size = 8;
+	  core2i7_ifetch_block_size = 16;
+	  core2i7_ifetch_block_max_insns = 6;
+	  break;
+	}
+      /* ... Fall through ...  */
+    default:
+      targetm.sched.dfa_post_advance_cycle = NULL;
+      targetm.sched.first_cycle_multipass_init = NULL;
+      targetm.sched.first_cycle_multipass_begin = NULL;
+      targetm.sched.first_cycle_multipass_issue = NULL;
+      targetm.sched.first_cycle_multipass_backtrack = NULL;
+      targetm.sched.first_cycle_multipass_end = NULL;
+      targetm.sched.first_cycle_multipass_fini = NULL;
+      break;
+    }
+}
+
+
+/* Compute the alignment given to a constant that is being placed in memory.
+   EXP is the constant and ALIGN is the alignment that the object would
+   ordinarily have.
+   The value of this function is used instead of that alignment to align
+   the object.  */
+
+int
+ix86_constant_alignment (tree exp, int align)
+{
+  if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
+      || TREE_CODE (exp) == INTEGER_CST)
+    {
+      if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
+	return 64;
+      else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
+	return 128;
+    }
+  else if (!optimize_size && TREE_CODE (exp) == STRING_CST
+	   && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
+    return BITS_PER_WORD;
+
+  return align;
+}
+
+/* Compute the alignment for a static variable.
+   TYPE is the data type, and ALIGN is the alignment that
+   the object would ordinarily have.  The value of this function is used
+   instead of that alignment to align the object.  */
+
+int
+ix86_data_alignment (tree type, int align, bool opt)
+{
+  /* GCC 4.8 and earlier used to incorrectly assume this alignment even
+     for symbols from other compilation units or symbols that don't need
+     to bind locally.  In order to preserve some ABI compatibility with
+     those compilers, ensure we don't decrease alignment from what we
+     used to assume.  */
+
+  int max_align_compat
+    = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
+
+  /* A data structure, equal or greater than the size of a cache line
+     (64 bytes in the Pentium 4 and other recent Intel processors, including
+     processors based on Intel Core microarchitecture) should be aligned
+     so that its base address is a multiple of a cache line size.  */
+
+  int max_align
+    = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
+
+  if (max_align < BITS_PER_WORD)
+    max_align = BITS_PER_WORD;
+
+  if (opt
+      && AGGREGATE_TYPE_P (type)
+      && TYPE_SIZE (type)
+      && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
+    {
+      if ((TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align_compat
+	   || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
+	  && align < max_align_compat)
+	align = max_align_compat;
+      if ((TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
+	   || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
+	  && align < max_align)
+	align = max_align;
+    }
+
+  /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
+     to 16byte boundary.  */
+  if (TARGET_64BIT)
+    {
+      if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
+	  && TYPE_SIZE (type)
+	  && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
+	  && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
+	      || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
+	return 128;
+    }
+
+  if (!opt)
+    return align;
+
+  if (TREE_CODE (type) == ARRAY_TYPE)
+    {
+      if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
+	return 64;
+      if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
+	return 128;
+    }
+  else if (TREE_CODE (type) == COMPLEX_TYPE)
+    {
+
+      if (TYPE_MODE (type) == DCmode && align < 64)
+	return 64;
+      if ((TYPE_MODE (type) == XCmode
+	   || TYPE_MODE (type) == TCmode) && align < 128)
+	return 128;
+    }
+  else if ((TREE_CODE (type) == RECORD_TYPE
+	    || TREE_CODE (type) == UNION_TYPE
+	    || TREE_CODE (type) == QUAL_UNION_TYPE)
+	   && TYPE_FIELDS (type))
+    {
+      if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
+	return 64;
+      if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
+	return 128;
+    }
+  else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
+	   || TREE_CODE (type) == INTEGER_TYPE)
+    {
+      if (TYPE_MODE (type) == DFmode && align < 64)
+	return 64;
+      if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
+	return 128;
+    }
+
+  return align;
+}
+
+/* Compute the alignment for a local variable or a stack slot.  EXP is
+   the data type or decl itself, MODE is the widest mode available and
+   ALIGN is the alignment that the object would ordinarily have.  The
+   value of this macro is used instead of that alignment to align the
+   object.  */
+
+unsigned int
+ix86_local_alignment (tree exp, enum machine_mode mode,
+		      unsigned int align)
+{
+  tree type, decl;
+
+  if (exp && DECL_P (exp))
+    {
+      type = TREE_TYPE (exp);
+      decl = exp;
+    }
+  else
+    {
+      type = exp;
+      decl = NULL;
+    }
+
+  /* Don't do dynamic stack realignment for long long objects with
+     -mpreferred-stack-boundary=2.  */
+  if (!TARGET_64BIT
+      && align == 64
+      && ix86_preferred_stack_boundary < 64
+      && (mode == DImode || (type && TYPE_MODE (type) == DImode))
+      && (!type || !TYPE_USER_ALIGN (type))
+      && (!decl || !DECL_USER_ALIGN (decl)))
+    align = 32;
+
+  /* If TYPE is NULL, we are allocating a stack slot for caller-save
+     register in MODE.  We will return the largest alignment of XF
+     and DF.  */
+  if (!type)
+    {
+      if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
+	align = GET_MODE_ALIGNMENT (DFmode);
+      return align;
+    }
+
+  /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
+     to 16byte boundary.  Exact wording is:
+
+     An array uses the same alignment as its elements, except that a local or
+     global array variable of length at least 16 bytes or
+     a C99 variable-length array variable always has alignment of at least 16 bytes.
+
+     This was added to allow use of aligned SSE instructions at arrays.  This
+     rule is meant for static storage (where compiler can not do the analysis
+     by itself).  We follow it for automatic variables only when convenient.
+     We fully control everything in the function compiled and functions from
+     other unit can not rely on the alignment.
+
+     Exclude va_list type.  It is the common case of local array where
+     we can not benefit from the alignment.  
+
+     TODO: Probably one should optimize for size only when var is not escaping.  */
+  if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
+      && TARGET_SSE)
+    {
+      if (AGGREGATE_TYPE_P (type)
+	   && (va_list_type_node == NULL_TREE
+	       || (TYPE_MAIN_VARIANT (type)
+		   != TYPE_MAIN_VARIANT (va_list_type_node)))
+	   && TYPE_SIZE (type)
+	   && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
+	   && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
+	       || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
+	return 128;
+    }
+  if (TREE_CODE (type) == ARRAY_TYPE)
+    {
+      if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
+	return 64;
+      if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
+	return 128;
+    }
+  else if (TREE_CODE (type) == COMPLEX_TYPE)
+    {
+      if (TYPE_MODE (type) == DCmode && align < 64)
+	return 64;
+      if ((TYPE_MODE (type) == XCmode
+	   || TYPE_MODE (type) == TCmode) && align < 128)
+	return 128;
+    }
+  else if ((TREE_CODE (type) == RECORD_TYPE
+	    || TREE_CODE (type) == UNION_TYPE
+	    || TREE_CODE (type) == QUAL_UNION_TYPE)
+	   && TYPE_FIELDS (type))
+    {
+      if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
+	return 64;
+      if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
+	return 128;
+    }
+  else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
+	   || TREE_CODE (type) == INTEGER_TYPE)
+    {
+
+      if (TYPE_MODE (type) == DFmode && align < 64)
+	return 64;
+      if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
+	return 128;
+    }
+  return align;
+}
+
+/* Compute the minimum required alignment for dynamic stack realignment
+   purposes for a local variable, parameter or a stack slot.  EXP is
+   the data type or decl itself, MODE is its mode and ALIGN is the
+   alignment that the object would ordinarily have.  */
+
+unsigned int
+ix86_minimum_alignment (tree exp, enum machine_mode mode,
+			unsigned int align)
+{
+  tree type, decl;
+
+  if (exp && DECL_P (exp))
+    {
+      type = TREE_TYPE (exp);
+      decl = exp;
+    }
+  else
+    {
+      type = exp;
+      decl = NULL;
+    }
+
+  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
+    return align;
+
+  /* Don't do dynamic stack realignment for long long objects with
+     -mpreferred-stack-boundary=2.  */
+  if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
+      && (!type || !TYPE_USER_ALIGN (type))
+      && (!decl || !DECL_USER_ALIGN (decl)))
+    return 32;
+
+  return align;
+}
+
+/* Find a location for the static chain incoming to a nested function.
+   This is a register, unless all free registers are used by arguments.  */
+
+static rtx
+ix86_static_chain (const_tree fndecl, bool incoming_p)
+{
+  unsigned regno;
+
+  if (!DECL_STATIC_CHAIN (fndecl))
+    return NULL;
+
+  if (TARGET_64BIT)
+    {
+      /* We always use R10 in 64-bit mode.  */
+      regno = R10_REG;
+    }
+  else
+    {
+      tree fntype;
+      unsigned int ccvt;
+
+      /* By default in 32-bit mode we use ECX to pass the static chain.  */
+      regno = CX_REG;
+
+      fntype = TREE_TYPE (fndecl);
+      ccvt = ix86_get_callcvt (fntype);
+      if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
+	{
+	  /* Fastcall functions use ecx/edx for arguments, which leaves
+	     us with EAX for the static chain.
+	     Thiscall functions use ecx for arguments, which also
+	     leaves us with EAX for the static chain.  */
+	  regno = AX_REG;
+	}
+      else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
+	{
+	  /* Thiscall functions use ecx for arguments, which leaves
+	     us with EAX and EDX for the static chain.
+	     We are using for abi-compatibility EAX.  */
+	  regno = AX_REG;
+	}
+      else if (ix86_function_regparm (fntype, fndecl) == 3)
+	{
+	  /* For regparm 3, we have no free call-clobbered registers in
+	     which to store the static chain.  In order to implement this,
+	     we have the trampoline push the static chain to the stack.
+	     However, we can't push a value below the return address when
+	     we call the nested function directly, so we have to use an
+	     alternate entry point.  For this we use ESI, and have the
+	     alternate entry point push ESI, so that things appear the
+	     same once we're executing the nested function.  */
+	  if (incoming_p)
+	    {
+	      if (fndecl == current_function_decl)
+		ix86_static_chain_on_stack = true;
+	      return gen_frame_mem (SImode,
+				    plus_constant (Pmode,
+						   arg_pointer_rtx, -8));
+	    }
+	  regno = SI_REG;
+	}
+    }
+
+  return gen_rtx_REG (Pmode, regno);
+}
+
+/* Emit RTL insns to initialize the variable parts of a trampoline.
+   FNDECL is the decl of the target address; M_TRAMP is a MEM for
+   the trampoline, and CHAIN_VALUE is an RTX for the static chain
+   to be passed to the target function.  */
+
+static void
+ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
+{
+  rtx mem, fnaddr;
+  int opcode;
+  int offset = 0;
+
+  fnaddr = XEXP (DECL_RTL (fndecl), 0);
+
+  if (TARGET_64BIT)
+    {
+      int size;
+
+      /* Load the function address to r11.  Try to load address using
+	 the shorter movl instead of movabs.  We may want to support
+	 movq for kernel mode, but kernel does not use trampolines at
+	 the moment.  FNADDR is a 32bit address and may not be in
+	 DImode when ptr_mode == SImode.  Always use movl in this
+	 case.  */
+      if (ptr_mode == SImode
+	  || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
+	{
+	  fnaddr = copy_addr_to_reg (fnaddr);
+
+	  mem = adjust_address (m_tramp, HImode, offset);
+	  emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
+
+	  mem = adjust_address (m_tramp, SImode, offset + 2);
+	  emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
+	  offset += 6;
+	}
+      else
+	{
+	  mem = adjust_address (m_tramp, HImode, offset);
+	  emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
+
+	  mem = adjust_address (m_tramp, DImode, offset + 2);
+	  emit_move_insn (mem, fnaddr);
+	  offset += 10;
+	}
+
+      /* Load static chain using movabs to r10.  Use the shorter movl
+         instead of movabs when ptr_mode == SImode.  */
+      if (ptr_mode == SImode)
+	{
+	  opcode = 0xba41;
+	  size = 6;
+	}
+      else
+	{
+	  opcode = 0xba49;
+	  size = 10;
+	}
+
+      mem = adjust_address (m_tramp, HImode, offset);
+      emit_move_insn (mem, gen_int_mode (opcode, HImode));
+
+      mem = adjust_address (m_tramp, ptr_mode, offset + 2);
+      emit_move_insn (mem, chain_value);
+      offset += size;
+
+      /* Jump to r11; the last (unused) byte is a nop, only there to
+	 pad the write out to a single 32-bit store.  */
+      mem = adjust_address (m_tramp, SImode, offset);
+      emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
+      offset += 4;
+    }
+  else
+    {
+      rtx disp, chain;
+
+      /* Depending on the static chain location, either load a register
+	 with a constant, or push the constant to the stack.  All of the
+	 instructions are the same size.  */
+      chain = ix86_static_chain (fndecl, true);
+      if (REG_P (chain))
+	{
+	  switch (REGNO (chain))
+	    {
+	    case AX_REG:
+	      opcode = 0xb8; break;
+	    case CX_REG:
+	      opcode = 0xb9; break;
+	    default:
+	      gcc_unreachable ();
+	    }
+	}
+      else
+	opcode = 0x68;
+
+      mem = adjust_address (m_tramp, QImode, offset);
+      emit_move_insn (mem, gen_int_mode (opcode, QImode));
+
+      mem = adjust_address (m_tramp, SImode, offset + 1);
+      emit_move_insn (mem, chain_value);
+      offset += 5;
+
+      mem = adjust_address (m_tramp, QImode, offset);
+      emit_move_insn (mem, gen_int_mode (0xe9, QImode));
+
+      mem = adjust_address (m_tramp, SImode, offset + 1);
+
+      /* Compute offset from the end of the jmp to the target function.
+	 In the case in which the trampoline stores the static chain on
+	 the stack, we need to skip the first insn which pushes the
+	 (call-saved) register static chain; this push is 1 byte.  */
+      offset += 5;
+      disp = expand_binop (SImode, sub_optab, fnaddr,
+			   plus_constant (Pmode, XEXP (m_tramp, 0),
+					  offset - (MEM_P (chain) ? 1 : 0)),
+			   NULL_RTX, 1, OPTAB_DIRECT);
+      emit_move_insn (mem, disp);
+    }
+
+  gcc_assert (offset <= TRAMPOLINE_SIZE);
+
+#ifdef HAVE_ENABLE_EXECUTE_STACK
+#ifdef CHECK_EXECUTE_STACK_ENABLED
+  if (CHECK_EXECUTE_STACK_ENABLED)
+#endif
+  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
+		     LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
+#endif
+}
+
+/* The following file contains several enumerations and data structures
+   built from the definitions in i386-builtin-types.def.  */
+
+#include "i386-builtin-types.inc"
+
+/* Table for the ix86 builtin non-function types.  */
+static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
+
+/* Retrieve an element from the above table, building some of
+   the types lazily.  */
+
+static tree
+ix86_get_builtin_type (enum ix86_builtin_type tcode)
+{
+  unsigned int index;
+  tree type, itype;
+
+  gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
+
+  type = ix86_builtin_type_tab[(int) tcode];
+  if (type != NULL)
+    return type;
+
+  gcc_assert (tcode > IX86_BT_LAST_PRIM);
+  if (tcode <= IX86_BT_LAST_VECT)
+    {
+      enum machine_mode mode;
+
+      index = tcode - IX86_BT_LAST_PRIM - 1;
+      itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
+      mode = ix86_builtin_type_vect_mode[index];
+
+      type = build_vector_type_for_mode (itype, mode);
+    }
+  else
+    {
+      int quals;
+
+      index = tcode - IX86_BT_LAST_VECT - 1;
+      if (tcode <= IX86_BT_LAST_PTR)
+	quals = TYPE_UNQUALIFIED;
+      else
+	quals = TYPE_QUAL_CONST;
+
+      itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
+      if (quals != TYPE_UNQUALIFIED)
+	itype = build_qualified_type (itype, quals);
+
+      type = build_pointer_type (itype);
+    }
+
+  ix86_builtin_type_tab[(int) tcode] = type;
+  return type;
+}
+
+/* Table for the ix86 builtin function types.  */
+static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
+
+/* Retrieve an element from the above table, building some of
+   the types lazily.  */
+
+static tree
+ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
+{
+  tree type;
+
+  gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
+
+  type = ix86_builtin_func_type_tab[(int) tcode];
+  if (type != NULL)
+    return type;
+
+  if (tcode <= IX86_BT_LAST_FUNC)
+    {
+      unsigned start = ix86_builtin_func_start[(int) tcode];
+      unsigned after = ix86_builtin_func_start[(int) tcode + 1];
+      tree rtype, atype, args = void_list_node;
+      unsigned i;
+
+      rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
+      for (i = after - 1; i > start; --i)
+	{
+	  atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
+	  args = tree_cons (NULL, atype, args);
+	}
+
+      type = build_function_type (rtype, args);
+    }
+  else
+    {
+      unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
+      enum ix86_builtin_func_type icode;
+
+      icode = ix86_builtin_func_alias_base[index];
+      type = ix86_get_builtin_func_type (icode);
+    }
+
+  ix86_builtin_func_type_tab[(int) tcode] = type;
+  return type;
+}
+
+
+/* Codes for all the SSE/MMX builtins.  */
+enum ix86_builtins
+{
+  IX86_BUILTIN_ADDPS,
+  IX86_BUILTIN_ADDSS,
+  IX86_BUILTIN_DIVPS,
+  IX86_BUILTIN_DIVSS,
+  IX86_BUILTIN_MULPS,
+  IX86_BUILTIN_MULSS,
+  IX86_BUILTIN_SUBPS,
+  IX86_BUILTIN_SUBSS,
+
+  IX86_BUILTIN_CMPEQPS,
+  IX86_BUILTIN_CMPLTPS,
+  IX86_BUILTIN_CMPLEPS,
+  IX86_BUILTIN_CMPGTPS,
+  IX86_BUILTIN_CMPGEPS,
+  IX86_BUILTIN_CMPNEQPS,
+  IX86_BUILTIN_CMPNLTPS,
+  IX86_BUILTIN_CMPNLEPS,
+  IX86_BUILTIN_CMPNGTPS,
+  IX86_BUILTIN_CMPNGEPS,
+  IX86_BUILTIN_CMPORDPS,
+  IX86_BUILTIN_CMPUNORDPS,
+  IX86_BUILTIN_CMPEQSS,
+  IX86_BUILTIN_CMPLTSS,
+  IX86_BUILTIN_CMPLESS,
+  IX86_BUILTIN_CMPNEQSS,
+  IX86_BUILTIN_CMPNLTSS,
+  IX86_BUILTIN_CMPNLESS,
+  IX86_BUILTIN_CMPORDSS,
+  IX86_BUILTIN_CMPUNORDSS,
+
+  IX86_BUILTIN_COMIEQSS,
+  IX86_BUILTIN_COMILTSS,
+  IX86_BUILTIN_COMILESS,
+  IX86_BUILTIN_COMIGTSS,
+  IX86_BUILTIN_COMIGESS,
+  IX86_BUILTIN_COMINEQSS,
+  IX86_BUILTIN_UCOMIEQSS,
+  IX86_BUILTIN_UCOMILTSS,
+  IX86_BUILTIN_UCOMILESS,
+  IX86_BUILTIN_UCOMIGTSS,
+  IX86_BUILTIN_UCOMIGESS,
+  IX86_BUILTIN_UCOMINEQSS,
+
+  IX86_BUILTIN_CVTPI2PS,
+  IX86_BUILTIN_CVTPS2PI,
+  IX86_BUILTIN_CVTSI2SS,
+  IX86_BUILTIN_CVTSI642SS,
+  IX86_BUILTIN_CVTSS2SI,
+  IX86_BUILTIN_CVTSS2SI64,
+  IX86_BUILTIN_CVTTPS2PI,
+  IX86_BUILTIN_CVTTSS2SI,
+  IX86_BUILTIN_CVTTSS2SI64,
+
+  IX86_BUILTIN_MAXPS,
+  IX86_BUILTIN_MAXSS,
+  IX86_BUILTIN_MINPS,
+  IX86_BUILTIN_MINSS,
+
+  IX86_BUILTIN_LOADUPS,
+  IX86_BUILTIN_STOREUPS,
+  IX86_BUILTIN_MOVSS,
+
+  IX86_BUILTIN_MOVHLPS,
+  IX86_BUILTIN_MOVLHPS,
+  IX86_BUILTIN_LOADHPS,
+  IX86_BUILTIN_LOADLPS,
+  IX86_BUILTIN_STOREHPS,
+  IX86_BUILTIN_STORELPS,
+
+  IX86_BUILTIN_MASKMOVQ,
+  IX86_BUILTIN_MOVMSKPS,
+  IX86_BUILTIN_PMOVMSKB,
+
+  IX86_BUILTIN_MOVNTPS,
+  IX86_BUILTIN_MOVNTQ,
+
+  IX86_BUILTIN_LOADDQU,
+  IX86_BUILTIN_STOREDQU,
+
+  IX86_BUILTIN_PACKSSWB,
+  IX86_BUILTIN_PACKSSDW,
+  IX86_BUILTIN_PACKUSWB,
+
+  IX86_BUILTIN_PADDB,
+  IX86_BUILTIN_PADDW,
+  IX86_BUILTIN_PADDD,
+  IX86_BUILTIN_PADDQ,
+  IX86_BUILTIN_PADDSB,
+  IX86_BUILTIN_PADDSW,
+  IX86_BUILTIN_PADDUSB,
+  IX86_BUILTIN_PADDUSW,
+  IX86_BUILTIN_PSUBB,
+  IX86_BUILTIN_PSUBW,
+  IX86_BUILTIN_PSUBD,
+  IX86_BUILTIN_PSUBQ,
+  IX86_BUILTIN_PSUBSB,
+  IX86_BUILTIN_PSUBSW,
+  IX86_BUILTIN_PSUBUSB,
+  IX86_BUILTIN_PSUBUSW,
+
+  IX86_BUILTIN_PAND,
+  IX86_BUILTIN_PANDN,
+  IX86_BUILTIN_POR,
+  IX86_BUILTIN_PXOR,
+
+  IX86_BUILTIN_PAVGB,
+  IX86_BUILTIN_PAVGW,
+
+  IX86_BUILTIN_PCMPEQB,
+  IX86_BUILTIN_PCMPEQW,
+  IX86_BUILTIN_PCMPEQD,
+  IX86_BUILTIN_PCMPGTB,
+  IX86_BUILTIN_PCMPGTW,
+  IX86_BUILTIN_PCMPGTD,
+
+  IX86_BUILTIN_PMADDWD,
+
+  IX86_BUILTIN_PMAXSW,
+  IX86_BUILTIN_PMAXUB,
+  IX86_BUILTIN_PMINSW,
+  IX86_BUILTIN_PMINUB,
+
+  IX86_BUILTIN_PMULHUW,
+  IX86_BUILTIN_PMULHW,
+  IX86_BUILTIN_PMULLW,
+
+  IX86_BUILTIN_PSADBW,
+  IX86_BUILTIN_PSHUFW,
+
+  IX86_BUILTIN_PSLLW,
+  IX86_BUILTIN_PSLLD,
+  IX86_BUILTIN_PSLLQ,
+  IX86_BUILTIN_PSRAW,
+  IX86_BUILTIN_PSRAD,
+  IX86_BUILTIN_PSRLW,
+  IX86_BUILTIN_PSRLD,
+  IX86_BUILTIN_PSRLQ,
+  IX86_BUILTIN_PSLLWI,
+  IX86_BUILTIN_PSLLDI,
+  IX86_BUILTIN_PSLLQI,
+  IX86_BUILTIN_PSRAWI,
+  IX86_BUILTIN_PSRADI,
+  IX86_BUILTIN_PSRLWI,
+  IX86_BUILTIN_PSRLDI,
+  IX86_BUILTIN_PSRLQI,
+
+  IX86_BUILTIN_PUNPCKHBW,
+  IX86_BUILTIN_PUNPCKHWD,
+  IX86_BUILTIN_PUNPCKHDQ,
+  IX86_BUILTIN_PUNPCKLBW,
+  IX86_BUILTIN_PUNPCKLWD,
+  IX86_BUILTIN_PUNPCKLDQ,
+
+  IX86_BUILTIN_SHUFPS,
+
+  IX86_BUILTIN_RCPPS,
+  IX86_BUILTIN_RCPSS,
+  IX86_BUILTIN_RSQRTPS,
+  IX86_BUILTIN_RSQRTPS_NR,
+  IX86_BUILTIN_RSQRTSS,
+  IX86_BUILTIN_RSQRTF,
+  IX86_BUILTIN_SQRTPS,
+  IX86_BUILTIN_SQRTPS_NR,
+  IX86_BUILTIN_SQRTSS,
+
+  IX86_BUILTIN_UNPCKHPS,
+  IX86_BUILTIN_UNPCKLPS,
+
+  IX86_BUILTIN_ANDPS,
+  IX86_BUILTIN_ANDNPS,
+  IX86_BUILTIN_ORPS,
+  IX86_BUILTIN_XORPS,
+
+  IX86_BUILTIN_EMMS,
+  IX86_BUILTIN_LDMXCSR,
+  IX86_BUILTIN_STMXCSR,
+  IX86_BUILTIN_SFENCE,
+
+  IX86_BUILTIN_FXSAVE,
+  IX86_BUILTIN_FXRSTOR,
+  IX86_BUILTIN_FXSAVE64,
+  IX86_BUILTIN_FXRSTOR64,
+
+  IX86_BUILTIN_XSAVE,
+  IX86_BUILTIN_XRSTOR,
+  IX86_BUILTIN_XSAVE64,
+  IX86_BUILTIN_XRSTOR64,
+
+  IX86_BUILTIN_XSAVEOPT,
+  IX86_BUILTIN_XSAVEOPT64,
+
+  /* 3DNow! Original */
+  IX86_BUILTIN_FEMMS,
+  IX86_BUILTIN_PAVGUSB,
+  IX86_BUILTIN_PF2ID,
+  IX86_BUILTIN_PFACC,
+  IX86_BUILTIN_PFADD,
+  IX86_BUILTIN_PFCMPEQ,
+  IX86_BUILTIN_PFCMPGE,
+  IX86_BUILTIN_PFCMPGT,
+  IX86_BUILTIN_PFMAX,
+  IX86_BUILTIN_PFMIN,
+  IX86_BUILTIN_PFMUL,
+  IX86_BUILTIN_PFRCP,
+  IX86_BUILTIN_PFRCPIT1,
+  IX86_BUILTIN_PFRCPIT2,
+  IX86_BUILTIN_PFRSQIT1,
+  IX86_BUILTIN_PFRSQRT,
+  IX86_BUILTIN_PFSUB,
+  IX86_BUILTIN_PFSUBR,
+  IX86_BUILTIN_PI2FD,
+  IX86_BUILTIN_PMULHRW,
+
+  /* 3DNow! Athlon Extensions */
+  IX86_BUILTIN_PF2IW,
+  IX86_BUILTIN_PFNACC,
+  IX86_BUILTIN_PFPNACC,
+  IX86_BUILTIN_PI2FW,
+  IX86_BUILTIN_PSWAPDSI,
+  IX86_BUILTIN_PSWAPDSF,
+
+  /* SSE2 */
+  IX86_BUILTIN_ADDPD,
+  IX86_BUILTIN_ADDSD,
+  IX86_BUILTIN_DIVPD,
+  IX86_BUILTIN_DIVSD,
+  IX86_BUILTIN_MULPD,
+  IX86_BUILTIN_MULSD,
+  IX86_BUILTIN_SUBPD,
+  IX86_BUILTIN_SUBSD,
+
+  IX86_BUILTIN_CMPEQPD,
+  IX86_BUILTIN_CMPLTPD,
+  IX86_BUILTIN_CMPLEPD,
+  IX86_BUILTIN_CMPGTPD,
+  IX86_BUILTIN_CMPGEPD,
+  IX86_BUILTIN_CMPNEQPD,
+  IX86_BUILTIN_CMPNLTPD,
+  IX86_BUILTIN_CMPNLEPD,
+  IX86_BUILTIN_CMPNGTPD,
+  IX86_BUILTIN_CMPNGEPD,
+  IX86_BUILTIN_CMPORDPD,
+  IX86_BUILTIN_CMPUNORDPD,
+  IX86_BUILTIN_CMPEQSD,
+  IX86_BUILTIN_CMPLTSD,
+  IX86_BUILTIN_CMPLESD,
+  IX86_BUILTIN_CMPNEQSD,
+  IX86_BUILTIN_CMPNLTSD,
+  IX86_BUILTIN_CMPNLESD,
+  IX86_BUILTIN_CMPORDSD,
+  IX86_BUILTIN_CMPUNORDSD,
+
+  IX86_BUILTIN_COMIEQSD,
+  IX86_BUILTIN_COMILTSD,
+  IX86_BUILTIN_COMILESD,
+  IX86_BUILTIN_COMIGTSD,
+  IX86_BUILTIN_COMIGESD,
+  IX86_BUILTIN_COMINEQSD,
+  IX86_BUILTIN_UCOMIEQSD,
+  IX86_BUILTIN_UCOMILTSD,
+  IX86_BUILTIN_UCOMILESD,
+  IX86_BUILTIN_UCOMIGTSD,
+  IX86_BUILTIN_UCOMIGESD,
+  IX86_BUILTIN_UCOMINEQSD,
+
+  IX86_BUILTIN_MAXPD,
+  IX86_BUILTIN_MAXSD,
+  IX86_BUILTIN_MINPD,
+  IX86_BUILTIN_MINSD,
+
+  IX86_BUILTIN_ANDPD,
+  IX86_BUILTIN_ANDNPD,
+  IX86_BUILTIN_ORPD,
+  IX86_BUILTIN_XORPD,
+
+  IX86_BUILTIN_SQRTPD,
+  IX86_BUILTIN_SQRTSD,
+
+  IX86_BUILTIN_UNPCKHPD,
+  IX86_BUILTIN_UNPCKLPD,
+
+  IX86_BUILTIN_SHUFPD,
+
+  IX86_BUILTIN_LOADUPD,
+  IX86_BUILTIN_STOREUPD,
+  IX86_BUILTIN_MOVSD,
+
+  IX86_BUILTIN_LOADHPD,
+  IX86_BUILTIN_LOADLPD,
+
+  IX86_BUILTIN_CVTDQ2PD,
+  IX86_BUILTIN_CVTDQ2PS,
+
+  IX86_BUILTIN_CVTPD2DQ,
+  IX86_BUILTIN_CVTPD2PI,
+  IX86_BUILTIN_CVTPD2PS,
+  IX86_BUILTIN_CVTTPD2DQ,
+  IX86_BUILTIN_CVTTPD2PI,
+
+  IX86_BUILTIN_CVTPI2PD,
+  IX86_BUILTIN_CVTSI2SD,
+  IX86_BUILTIN_CVTSI642SD,
+
+  IX86_BUILTIN_CVTSD2SI,
+  IX86_BUILTIN_CVTSD2SI64,
+  IX86_BUILTIN_CVTSD2SS,
+  IX86_BUILTIN_CVTSS2SD,
+  IX86_BUILTIN_CVTTSD2SI,
+  IX86_BUILTIN_CVTTSD2SI64,
+
+  IX86_BUILTIN_CVTPS2DQ,
+  IX86_BUILTIN_CVTPS2PD,
+  IX86_BUILTIN_CVTTPS2DQ,
+
+  IX86_BUILTIN_MOVNTI,
+  IX86_BUILTIN_MOVNTI64,
+  IX86_BUILTIN_MOVNTPD,
+  IX86_BUILTIN_MOVNTDQ,
+
+  IX86_BUILTIN_MOVQ128,
+
+  /* SSE2 MMX */
+  IX86_BUILTIN_MASKMOVDQU,
+  IX86_BUILTIN_MOVMSKPD,
+  IX86_BUILTIN_PMOVMSKB128,
+
+  IX86_BUILTIN_PACKSSWB128,
+  IX86_BUILTIN_PACKSSDW128,
+  IX86_BUILTIN_PACKUSWB128,
+
+  IX86_BUILTIN_PADDB128,
+  IX86_BUILTIN_PADDW128,
+  IX86_BUILTIN_PADDD128,
+  IX86_BUILTIN_PADDQ128,
+  IX86_BUILTIN_PADDSB128,
+  IX86_BUILTIN_PADDSW128,
+  IX86_BUILTIN_PADDUSB128,
+  IX86_BUILTIN_PADDUSW128,
+  IX86_BUILTIN_PSUBB128,
+  IX86_BUILTIN_PSUBW128,
+  IX86_BUILTIN_PSUBD128,
+  IX86_BUILTIN_PSUBQ128,
+  IX86_BUILTIN_PSUBSB128,
+  IX86_BUILTIN_PSUBSW128,
+  IX86_BUILTIN_PSUBUSB128,
+  IX86_BUILTIN_PSUBUSW128,
+
+  IX86_BUILTIN_PAND128,
+  IX86_BUILTIN_PANDN128,
+  IX86_BUILTIN_POR128,
+  IX86_BUILTIN_PXOR128,
+
+  IX86_BUILTIN_PAVGB128,
+  IX86_BUILTIN_PAVGW128,
+
+  IX86_BUILTIN_PCMPEQB128,
+  IX86_BUILTIN_PCMPEQW128,
+  IX86_BUILTIN_PCMPEQD128,
+  IX86_BUILTIN_PCMPGTB128,
+  IX86_BUILTIN_PCMPGTW128,
+  IX86_BUILTIN_PCMPGTD128,
+
+  IX86_BUILTIN_PMADDWD128,
+
+  IX86_BUILTIN_PMAXSW128,
+  IX86_BUILTIN_PMAXUB128,
+  IX86_BUILTIN_PMINSW128,
+  IX86_BUILTIN_PMINUB128,
+
+  IX86_BUILTIN_PMULUDQ,
+  IX86_BUILTIN_PMULUDQ128,
+  IX86_BUILTIN_PMULHUW128,
+  IX86_BUILTIN_PMULHW128,
+  IX86_BUILTIN_PMULLW128,
+
+  IX86_BUILTIN_PSADBW128,
+  IX86_BUILTIN_PSHUFHW,
+  IX86_BUILTIN_PSHUFLW,
+  IX86_BUILTIN_PSHUFD,
+
+  IX86_BUILTIN_PSLLDQI128,
+  IX86_BUILTIN_PSLLWI128,
+  IX86_BUILTIN_PSLLDI128,
+  IX86_BUILTIN_PSLLQI128,
+  IX86_BUILTIN_PSRAWI128,
+  IX86_BUILTIN_PSRADI128,
+  IX86_BUILTIN_PSRLDQI128,
+  IX86_BUILTIN_PSRLWI128,
+  IX86_BUILTIN_PSRLDI128,
+  IX86_BUILTIN_PSRLQI128,
+
+  IX86_BUILTIN_PSLLDQ128,
+  IX86_BUILTIN_PSLLW128,
+  IX86_BUILTIN_PSLLD128,
+  IX86_BUILTIN_PSLLQ128,
+  IX86_BUILTIN_PSRAW128,
+  IX86_BUILTIN_PSRAD128,
+  IX86_BUILTIN_PSRLW128,
+  IX86_BUILTIN_PSRLD128,
+  IX86_BUILTIN_PSRLQ128,
+
+  IX86_BUILTIN_PUNPCKHBW128,
+  IX86_BUILTIN_PUNPCKHWD128,
+  IX86_BUILTIN_PUNPCKHDQ128,
+  IX86_BUILTIN_PUNPCKHQDQ128,
+  IX86_BUILTIN_PUNPCKLBW128,
+  IX86_BUILTIN_PUNPCKLWD128,
+  IX86_BUILTIN_PUNPCKLDQ128,
+  IX86_BUILTIN_PUNPCKLQDQ128,
+
+  IX86_BUILTIN_CLFLUSH,
+  IX86_BUILTIN_MFENCE,
+  IX86_BUILTIN_LFENCE,
+  IX86_BUILTIN_PAUSE,
+
+  IX86_BUILTIN_FNSTENV,
+  IX86_BUILTIN_FLDENV,
+  IX86_BUILTIN_FNSTSW,
+  IX86_BUILTIN_FNCLEX,
+
+  IX86_BUILTIN_BSRSI,
+  IX86_BUILTIN_BSRDI,
+  IX86_BUILTIN_RDPMC,
+  IX86_BUILTIN_RDTSC,
+  IX86_BUILTIN_RDTSCP,
+  IX86_BUILTIN_ROLQI,
+  IX86_BUILTIN_ROLHI,
+  IX86_BUILTIN_RORQI,
+  IX86_BUILTIN_RORHI,
+
+  /* SSE3.  */
+  IX86_BUILTIN_ADDSUBPS,
+  IX86_BUILTIN_HADDPS,
+  IX86_BUILTIN_HSUBPS,
+  IX86_BUILTIN_MOVSHDUP,
+  IX86_BUILTIN_MOVSLDUP,
+  IX86_BUILTIN_ADDSUBPD,
+  IX86_BUILTIN_HADDPD,
+  IX86_BUILTIN_HSUBPD,
+  IX86_BUILTIN_LDDQU,
+
+  IX86_BUILTIN_MONITOR,
+  IX86_BUILTIN_MWAIT,
+
+  /* SSSE3.  */
+  IX86_BUILTIN_PHADDW,
+  IX86_BUILTIN_PHADDD,
+  IX86_BUILTIN_PHADDSW,
+  IX86_BUILTIN_PHSUBW,
+  IX86_BUILTIN_PHSUBD,
+  IX86_BUILTIN_PHSUBSW,
+  IX86_BUILTIN_PMADDUBSW,
+  IX86_BUILTIN_PMULHRSW,
+  IX86_BUILTIN_PSHUFB,
+  IX86_BUILTIN_PSIGNB,
+  IX86_BUILTIN_PSIGNW,
+  IX86_BUILTIN_PSIGND,
+  IX86_BUILTIN_PALIGNR,
+  IX86_BUILTIN_PABSB,
+  IX86_BUILTIN_PABSW,
+  IX86_BUILTIN_PABSD,
+
+  IX86_BUILTIN_PHADDW128,
+  IX86_BUILTIN_PHADDD128,
+  IX86_BUILTIN_PHADDSW128,
+  IX86_BUILTIN_PHSUBW128,
+  IX86_BUILTIN_PHSUBD128,
+  IX86_BUILTIN_PHSUBSW128,
+  IX86_BUILTIN_PMADDUBSW128,
+  IX86_BUILTIN_PMULHRSW128,
+  IX86_BUILTIN_PSHUFB128,
+  IX86_BUILTIN_PSIGNB128,
+  IX86_BUILTIN_PSIGNW128,
+  IX86_BUILTIN_PSIGND128,
+  IX86_BUILTIN_PALIGNR128,
+  IX86_BUILTIN_PABSB128,
+  IX86_BUILTIN_PABSW128,
+  IX86_BUILTIN_PABSD128,
+
+  /* AMDFAM10 - SSE4A New Instructions.  */
+  IX86_BUILTIN_MOVNTSD,
+  IX86_BUILTIN_MOVNTSS,
+  IX86_BUILTIN_EXTRQI,
+  IX86_BUILTIN_EXTRQ,
+  IX86_BUILTIN_INSERTQI,
+  IX86_BUILTIN_INSERTQ,
+
+  /* SSE4.1.  */
+  IX86_BUILTIN_BLENDPD,
+  IX86_BUILTIN_BLENDPS,
+  IX86_BUILTIN_BLENDVPD,
+  IX86_BUILTIN_BLENDVPS,
+  IX86_BUILTIN_PBLENDVB128,
+  IX86_BUILTIN_PBLENDW128,
+
+  IX86_BUILTIN_DPPD,
+  IX86_BUILTIN_DPPS,
+
+  IX86_BUILTIN_INSERTPS128,
+
+  IX86_BUILTIN_MOVNTDQA,
+  IX86_BUILTIN_MPSADBW128,
+  IX86_BUILTIN_PACKUSDW128,
+  IX86_BUILTIN_PCMPEQQ,
+  IX86_BUILTIN_PHMINPOSUW128,
+
+  IX86_BUILTIN_PMAXSB128,
+  IX86_BUILTIN_PMAXSD128,
+  IX86_BUILTIN_PMAXUD128,
+  IX86_BUILTIN_PMAXUW128,
+
+  IX86_BUILTIN_PMINSB128,
+  IX86_BUILTIN_PMINSD128,
+  IX86_BUILTIN_PMINUD128,
+  IX86_BUILTIN_PMINUW128,
+
+  IX86_BUILTIN_PMOVSXBW128,
+  IX86_BUILTIN_PMOVSXBD128,
+  IX86_BUILTIN_PMOVSXBQ128,
+  IX86_BUILTIN_PMOVSXWD128,
+  IX86_BUILTIN_PMOVSXWQ128,
+  IX86_BUILTIN_PMOVSXDQ128,
+
+  IX86_BUILTIN_PMOVZXBW128,
+  IX86_BUILTIN_PMOVZXBD128,
+  IX86_BUILTIN_PMOVZXBQ128,
+  IX86_BUILTIN_PMOVZXWD128,
+  IX86_BUILTIN_PMOVZXWQ128,
+  IX86_BUILTIN_PMOVZXDQ128,
+
+  IX86_BUILTIN_PMULDQ128,
+  IX86_BUILTIN_PMULLD128,
+
+  IX86_BUILTIN_ROUNDSD,
+  IX86_BUILTIN_ROUNDSS,
+
+  IX86_BUILTIN_ROUNDPD,
+  IX86_BUILTIN_ROUNDPS,
+
+  IX86_BUILTIN_FLOORPD,
+  IX86_BUILTIN_CEILPD,
+  IX86_BUILTIN_TRUNCPD,
+  IX86_BUILTIN_RINTPD,
+  IX86_BUILTIN_ROUNDPD_AZ,
+
+  IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
+  IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
+  IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
+
+  IX86_BUILTIN_FLOORPS,
+  IX86_BUILTIN_CEILPS,
+  IX86_BUILTIN_TRUNCPS,
+  IX86_BUILTIN_RINTPS,
+  IX86_BUILTIN_ROUNDPS_AZ,
+
+  IX86_BUILTIN_FLOORPS_SFIX,
+  IX86_BUILTIN_CEILPS_SFIX,
+  IX86_BUILTIN_ROUNDPS_AZ_SFIX,
+
+  IX86_BUILTIN_PTESTZ,
+  IX86_BUILTIN_PTESTC,
+  IX86_BUILTIN_PTESTNZC,
+
+  IX86_BUILTIN_VEC_INIT_V2SI,
+  IX86_BUILTIN_VEC_INIT_V4HI,
+  IX86_BUILTIN_VEC_INIT_V8QI,
+  IX86_BUILTIN_VEC_EXT_V2DF,
+  IX86_BUILTIN_VEC_EXT_V2DI,
+  IX86_BUILTIN_VEC_EXT_V4SF,
+  IX86_BUILTIN_VEC_EXT_V4SI,
+  IX86_BUILTIN_VEC_EXT_V8HI,
+  IX86_BUILTIN_VEC_EXT_V2SI,
+  IX86_BUILTIN_VEC_EXT_V4HI,
+  IX86_BUILTIN_VEC_EXT_V16QI,
+  IX86_BUILTIN_VEC_SET_V2DI,
+  IX86_BUILTIN_VEC_SET_V4SF,
+  IX86_BUILTIN_VEC_SET_V4SI,
+  IX86_BUILTIN_VEC_SET_V8HI,
+  IX86_BUILTIN_VEC_SET_V4HI,
+  IX86_BUILTIN_VEC_SET_V16QI,
+
+  IX86_BUILTIN_VEC_PACK_SFIX,
+  IX86_BUILTIN_VEC_PACK_SFIX256,
+
+  /* SSE4.2.  */
+  IX86_BUILTIN_CRC32QI,
+  IX86_BUILTIN_CRC32HI,
+  IX86_BUILTIN_CRC32SI,
+  IX86_BUILTIN_CRC32DI,
+
+  IX86_BUILTIN_PCMPESTRI128,
+  IX86_BUILTIN_PCMPESTRM128,
+  IX86_BUILTIN_PCMPESTRA128,
+  IX86_BUILTIN_PCMPESTRC128,
+  IX86_BUILTIN_PCMPESTRO128,
+  IX86_BUILTIN_PCMPESTRS128,
+  IX86_BUILTIN_PCMPESTRZ128,
+  IX86_BUILTIN_PCMPISTRI128,
+  IX86_BUILTIN_PCMPISTRM128,
+  IX86_BUILTIN_PCMPISTRA128,
+  IX86_BUILTIN_PCMPISTRC128,
+  IX86_BUILTIN_PCMPISTRO128,
+  IX86_BUILTIN_PCMPISTRS128,
+  IX86_BUILTIN_PCMPISTRZ128,
+
+  IX86_BUILTIN_PCMPGTQ,
+
+  /* AES instructions */
+  IX86_BUILTIN_AESENC128,
+  IX86_BUILTIN_AESENCLAST128,
+  IX86_BUILTIN_AESDEC128,
+  IX86_BUILTIN_AESDECLAST128,
+  IX86_BUILTIN_AESIMC128,
+  IX86_BUILTIN_AESKEYGENASSIST128,
+
+  /* PCLMUL instruction */
+  IX86_BUILTIN_PCLMULQDQ128,
+
+  /* AVX */
+  IX86_BUILTIN_ADDPD256,
+  IX86_BUILTIN_ADDPS256,
+  IX86_BUILTIN_ADDSUBPD256,
+  IX86_BUILTIN_ADDSUBPS256,
+  IX86_BUILTIN_ANDPD256,
+  IX86_BUILTIN_ANDPS256,
+  IX86_BUILTIN_ANDNPD256,
+  IX86_BUILTIN_ANDNPS256,
+  IX86_BUILTIN_BLENDPD256,
+  IX86_BUILTIN_BLENDPS256,
+  IX86_BUILTIN_BLENDVPD256,
+  IX86_BUILTIN_BLENDVPS256,
+  IX86_BUILTIN_DIVPD256,
+  IX86_BUILTIN_DIVPS256,
+  IX86_BUILTIN_DPPS256,
+  IX86_BUILTIN_HADDPD256,
+  IX86_BUILTIN_HADDPS256,
+  IX86_BUILTIN_HSUBPD256,
+  IX86_BUILTIN_HSUBPS256,
+  IX86_BUILTIN_MAXPD256,
+  IX86_BUILTIN_MAXPS256,
+  IX86_BUILTIN_MINPD256,
+  IX86_BUILTIN_MINPS256,
+  IX86_BUILTIN_MULPD256,
+  IX86_BUILTIN_MULPS256,
+  IX86_BUILTIN_ORPD256,
+  IX86_BUILTIN_ORPS256,
+  IX86_BUILTIN_SHUFPD256,
+  IX86_BUILTIN_SHUFPS256,
+  IX86_BUILTIN_SUBPD256,
+  IX86_BUILTIN_SUBPS256,
+  IX86_BUILTIN_XORPD256,
+  IX86_BUILTIN_XORPS256,
+  IX86_BUILTIN_CMPSD,
+  IX86_BUILTIN_CMPSS,
+  IX86_BUILTIN_CMPPD,
+  IX86_BUILTIN_CMPPS,
+  IX86_BUILTIN_CMPPD256,
+  IX86_BUILTIN_CMPPS256,
+  IX86_BUILTIN_CVTDQ2PD256,
+  IX86_BUILTIN_CVTDQ2PS256,
+  IX86_BUILTIN_CVTPD2PS256,
+  IX86_BUILTIN_CVTPS2DQ256,
+  IX86_BUILTIN_CVTPS2PD256,
+  IX86_BUILTIN_CVTTPD2DQ256,
+  IX86_BUILTIN_CVTPD2DQ256,
+  IX86_BUILTIN_CVTTPS2DQ256,
+  IX86_BUILTIN_EXTRACTF128PD256,
+  IX86_BUILTIN_EXTRACTF128PS256,
+  IX86_BUILTIN_EXTRACTF128SI256,
+  IX86_BUILTIN_VZEROALL,
+  IX86_BUILTIN_VZEROUPPER,
+  IX86_BUILTIN_VPERMILVARPD,
+  IX86_BUILTIN_VPERMILVARPS,
+  IX86_BUILTIN_VPERMILVARPD256,
+  IX86_BUILTIN_VPERMILVARPS256,
+  IX86_BUILTIN_VPERMILPD,
+  IX86_BUILTIN_VPERMILPS,
+  IX86_BUILTIN_VPERMILPD256,
+  IX86_BUILTIN_VPERMILPS256,
+  IX86_BUILTIN_VPERMIL2PD,
+  IX86_BUILTIN_VPERMIL2PS,
+  IX86_BUILTIN_VPERMIL2PD256,
+  IX86_BUILTIN_VPERMIL2PS256,
+  IX86_BUILTIN_VPERM2F128PD256,
+  IX86_BUILTIN_VPERM2F128PS256,
+  IX86_BUILTIN_VPERM2F128SI256,
+  IX86_BUILTIN_VBROADCASTSS,
+  IX86_BUILTIN_VBROADCASTSD256,
+  IX86_BUILTIN_VBROADCASTSS256,
+  IX86_BUILTIN_VBROADCASTPD256,
+  IX86_BUILTIN_VBROADCASTPS256,
+  IX86_BUILTIN_VINSERTF128PD256,
+  IX86_BUILTIN_VINSERTF128PS256,
+  IX86_BUILTIN_VINSERTF128SI256,
+  IX86_BUILTIN_LOADUPD256,
+  IX86_BUILTIN_LOADUPS256,
+  IX86_BUILTIN_STOREUPD256,
+  IX86_BUILTIN_STOREUPS256,
+  IX86_BUILTIN_LDDQU256,
+  IX86_BUILTIN_MOVNTDQ256,
+  IX86_BUILTIN_MOVNTPD256,
+  IX86_BUILTIN_MOVNTPS256,
+  IX86_BUILTIN_LOADDQU256,
+  IX86_BUILTIN_STOREDQU256,
+  IX86_BUILTIN_MASKLOADPD,
+  IX86_BUILTIN_MASKLOADPS,
+  IX86_BUILTIN_MASKSTOREPD,
+  IX86_BUILTIN_MASKSTOREPS,
+  IX86_BUILTIN_MASKLOADPD256,
+  IX86_BUILTIN_MASKLOADPS256,
+  IX86_BUILTIN_MASKSTOREPD256,
+  IX86_BUILTIN_MASKSTOREPS256,
+  IX86_BUILTIN_MOVSHDUP256,
+  IX86_BUILTIN_MOVSLDUP256,
+  IX86_BUILTIN_MOVDDUP256,
+
+  IX86_BUILTIN_SQRTPD256,
+  IX86_BUILTIN_SQRTPS256,
+  IX86_BUILTIN_SQRTPS_NR256,
+  IX86_BUILTIN_RSQRTPS256,
+  IX86_BUILTIN_RSQRTPS_NR256,
+
+  IX86_BUILTIN_RCPPS256,
+
+  IX86_BUILTIN_ROUNDPD256,
+  IX86_BUILTIN_ROUNDPS256,
+
+  IX86_BUILTIN_FLOORPD256,
+  IX86_BUILTIN_CEILPD256,
+  IX86_BUILTIN_TRUNCPD256,
+  IX86_BUILTIN_RINTPD256,
+  IX86_BUILTIN_ROUNDPD_AZ256,
+
+  IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
+  IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
+  IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
+
+  IX86_BUILTIN_FLOORPS256,
+  IX86_BUILTIN_CEILPS256,
+  IX86_BUILTIN_TRUNCPS256,
+  IX86_BUILTIN_RINTPS256,
+  IX86_BUILTIN_ROUNDPS_AZ256,
+
+  IX86_BUILTIN_FLOORPS_SFIX256,
+  IX86_BUILTIN_CEILPS_SFIX256,
+  IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
+
+  IX86_BUILTIN_UNPCKHPD256,
+  IX86_BUILTIN_UNPCKLPD256,
+  IX86_BUILTIN_UNPCKHPS256,
+  IX86_BUILTIN_UNPCKLPS256,
+
+  IX86_BUILTIN_SI256_SI,
+  IX86_BUILTIN_PS256_PS,
+  IX86_BUILTIN_PD256_PD,
+  IX86_BUILTIN_SI_SI256,
+  IX86_BUILTIN_PS_PS256,
+  IX86_BUILTIN_PD_PD256,
+
+  IX86_BUILTIN_VTESTZPD,
+  IX86_BUILTIN_VTESTCPD,
+  IX86_BUILTIN_VTESTNZCPD,
+  IX86_BUILTIN_VTESTZPS,
+  IX86_BUILTIN_VTESTCPS,
+  IX86_BUILTIN_VTESTNZCPS,
+  IX86_BUILTIN_VTESTZPD256,
+  IX86_BUILTIN_VTESTCPD256,
+  IX86_BUILTIN_VTESTNZCPD256,
+  IX86_BUILTIN_VTESTZPS256,
+  IX86_BUILTIN_VTESTCPS256,
+  IX86_BUILTIN_VTESTNZCPS256,
+  IX86_BUILTIN_PTESTZ256,
+  IX86_BUILTIN_PTESTC256,
+  IX86_BUILTIN_PTESTNZC256,
+
+  IX86_BUILTIN_MOVMSKPD256,
+  IX86_BUILTIN_MOVMSKPS256,
+
+  /* AVX2 */
+  IX86_BUILTIN_MPSADBW256,
+  IX86_BUILTIN_PABSB256,
+  IX86_BUILTIN_PABSW256,
+  IX86_BUILTIN_PABSD256,
+  IX86_BUILTIN_PACKSSDW256,
+  IX86_BUILTIN_PACKSSWB256,
+  IX86_BUILTIN_PACKUSDW256,
+  IX86_BUILTIN_PACKUSWB256,
+  IX86_BUILTIN_PADDB256,
+  IX86_BUILTIN_PADDW256,
+  IX86_BUILTIN_PADDD256,
+  IX86_BUILTIN_PADDQ256,
+  IX86_BUILTIN_PADDSB256,
+  IX86_BUILTIN_PADDSW256,
+  IX86_BUILTIN_PADDUSB256,
+  IX86_BUILTIN_PADDUSW256,
+  IX86_BUILTIN_PALIGNR256,
+  IX86_BUILTIN_AND256I,
+  IX86_BUILTIN_ANDNOT256I,
+  IX86_BUILTIN_PAVGB256,
+  IX86_BUILTIN_PAVGW256,
+  IX86_BUILTIN_PBLENDVB256,
+  IX86_BUILTIN_PBLENDVW256,
+  IX86_BUILTIN_PCMPEQB256,
+  IX86_BUILTIN_PCMPEQW256,
+  IX86_BUILTIN_PCMPEQD256,
+  IX86_BUILTIN_PCMPEQQ256,
+  IX86_BUILTIN_PCMPGTB256,
+  IX86_BUILTIN_PCMPGTW256,
+  IX86_BUILTIN_PCMPGTD256,
+  IX86_BUILTIN_PCMPGTQ256,
+  IX86_BUILTIN_PHADDW256,
+  IX86_BUILTIN_PHADDD256,
+  IX86_BUILTIN_PHADDSW256,
+  IX86_BUILTIN_PHSUBW256,
+  IX86_BUILTIN_PHSUBD256,
+  IX86_BUILTIN_PHSUBSW256,
+  IX86_BUILTIN_PMADDUBSW256,
+  IX86_BUILTIN_PMADDWD256,
+  IX86_BUILTIN_PMAXSB256,
+  IX86_BUILTIN_PMAXSW256,
+  IX86_BUILTIN_PMAXSD256,
+  IX86_BUILTIN_PMAXUB256,
+  IX86_BUILTIN_PMAXUW256,
+  IX86_BUILTIN_PMAXUD256,
+  IX86_BUILTIN_PMINSB256,
+  IX86_BUILTIN_PMINSW256,
+  IX86_BUILTIN_PMINSD256,
+  IX86_BUILTIN_PMINUB256,
+  IX86_BUILTIN_PMINUW256,
+  IX86_BUILTIN_PMINUD256,
+  IX86_BUILTIN_PMOVMSKB256,
+  IX86_BUILTIN_PMOVSXBW256,
+  IX86_BUILTIN_PMOVSXBD256,
+  IX86_BUILTIN_PMOVSXBQ256,
+  IX86_BUILTIN_PMOVSXWD256,
+  IX86_BUILTIN_PMOVSXWQ256,
+  IX86_BUILTIN_PMOVSXDQ256,
+  IX86_BUILTIN_PMOVZXBW256,
+  IX86_BUILTIN_PMOVZXBD256,
+  IX86_BUILTIN_PMOVZXBQ256,
+  IX86_BUILTIN_PMOVZXWD256,
+  IX86_BUILTIN_PMOVZXWQ256,
+  IX86_BUILTIN_PMOVZXDQ256,
+  IX86_BUILTIN_PMULDQ256,
+  IX86_BUILTIN_PMULHRSW256,
+  IX86_BUILTIN_PMULHUW256,
+  IX86_BUILTIN_PMULHW256,
+  IX86_BUILTIN_PMULLW256,
+  IX86_BUILTIN_PMULLD256,
+  IX86_BUILTIN_PMULUDQ256,
+  IX86_BUILTIN_POR256,
+  IX86_BUILTIN_PSADBW256,
+  IX86_BUILTIN_PSHUFB256,
+  IX86_BUILTIN_PSHUFD256,
+  IX86_BUILTIN_PSHUFHW256,
+  IX86_BUILTIN_PSHUFLW256,
+  IX86_BUILTIN_PSIGNB256,
+  IX86_BUILTIN_PSIGNW256,
+  IX86_BUILTIN_PSIGND256,
+  IX86_BUILTIN_PSLLDQI256,
+  IX86_BUILTIN_PSLLWI256,
+  IX86_BUILTIN_PSLLW256,
+  IX86_BUILTIN_PSLLDI256,
+  IX86_BUILTIN_PSLLD256,
+  IX86_BUILTIN_PSLLQI256,
+  IX86_BUILTIN_PSLLQ256,
+  IX86_BUILTIN_PSRAWI256,
+  IX86_BUILTIN_PSRAW256,
+  IX86_BUILTIN_PSRADI256,
+  IX86_BUILTIN_PSRAD256,
+  IX86_BUILTIN_PSRLDQI256,
+  IX86_BUILTIN_PSRLWI256,
+  IX86_BUILTIN_PSRLW256,
+  IX86_BUILTIN_PSRLDI256,
+  IX86_BUILTIN_PSRLD256,
+  IX86_BUILTIN_PSRLQI256,
+  IX86_BUILTIN_PSRLQ256,
+  IX86_BUILTIN_PSUBB256,
+  IX86_BUILTIN_PSUBW256,
+  IX86_BUILTIN_PSUBD256,
+  IX86_BUILTIN_PSUBQ256,
+  IX86_BUILTIN_PSUBSB256,
+  IX86_BUILTIN_PSUBSW256,
+  IX86_BUILTIN_PSUBUSB256,
+  IX86_BUILTIN_PSUBUSW256,
+  IX86_BUILTIN_PUNPCKHBW256,
+  IX86_BUILTIN_PUNPCKHWD256,
+  IX86_BUILTIN_PUNPCKHDQ256,
+  IX86_BUILTIN_PUNPCKHQDQ256,
+  IX86_BUILTIN_PUNPCKLBW256,
+  IX86_BUILTIN_PUNPCKLWD256,
+  IX86_BUILTIN_PUNPCKLDQ256,
+  IX86_BUILTIN_PUNPCKLQDQ256,
+  IX86_BUILTIN_PXOR256,
+  IX86_BUILTIN_MOVNTDQA256,
+  IX86_BUILTIN_VBROADCASTSS_PS,
+  IX86_BUILTIN_VBROADCASTSS_PS256,
+  IX86_BUILTIN_VBROADCASTSD_PD256,
+  IX86_BUILTIN_VBROADCASTSI256,
+  IX86_BUILTIN_PBLENDD256,
+  IX86_BUILTIN_PBLENDD128,
+  IX86_BUILTIN_PBROADCASTB256,
+  IX86_BUILTIN_PBROADCASTW256,
+  IX86_BUILTIN_PBROADCASTD256,
+  IX86_BUILTIN_PBROADCASTQ256,
+  IX86_BUILTIN_PBROADCASTB128,
+  IX86_BUILTIN_PBROADCASTW128,
+  IX86_BUILTIN_PBROADCASTD128,
+  IX86_BUILTIN_PBROADCASTQ128,
+  IX86_BUILTIN_VPERMVARSI256,
+  IX86_BUILTIN_VPERMDF256,
+  IX86_BUILTIN_VPERMVARSF256,
+  IX86_BUILTIN_VPERMDI256,
+  IX86_BUILTIN_VPERMTI256,
+  IX86_BUILTIN_VEXTRACT128I256,
+  IX86_BUILTIN_VINSERT128I256,
+  IX86_BUILTIN_MASKLOADD,
+  IX86_BUILTIN_MASKLOADQ,
+  IX86_BUILTIN_MASKLOADD256,
+  IX86_BUILTIN_MASKLOADQ256,
+  IX86_BUILTIN_MASKSTORED,
+  IX86_BUILTIN_MASKSTOREQ,
+  IX86_BUILTIN_MASKSTORED256,
+  IX86_BUILTIN_MASKSTOREQ256,
+  IX86_BUILTIN_PSLLVV4DI,
+  IX86_BUILTIN_PSLLVV2DI,
+  IX86_BUILTIN_PSLLVV8SI,
+  IX86_BUILTIN_PSLLVV4SI,
+  IX86_BUILTIN_PSRAVV8SI,
+  IX86_BUILTIN_PSRAVV4SI,
+  IX86_BUILTIN_PSRLVV4DI,
+  IX86_BUILTIN_PSRLVV2DI,
+  IX86_BUILTIN_PSRLVV8SI,
+  IX86_BUILTIN_PSRLVV4SI,
+
+  IX86_BUILTIN_GATHERSIV2DF,
+  IX86_BUILTIN_GATHERSIV4DF,
+  IX86_BUILTIN_GATHERDIV2DF,
+  IX86_BUILTIN_GATHERDIV4DF,
+  IX86_BUILTIN_GATHERSIV4SF,
+  IX86_BUILTIN_GATHERSIV8SF,
+  IX86_BUILTIN_GATHERDIV4SF,
+  IX86_BUILTIN_GATHERDIV8SF,
+  IX86_BUILTIN_GATHERSIV2DI,
+  IX86_BUILTIN_GATHERSIV4DI,
+  IX86_BUILTIN_GATHERDIV2DI,
+  IX86_BUILTIN_GATHERDIV4DI,
+  IX86_BUILTIN_GATHERSIV4SI,
+  IX86_BUILTIN_GATHERSIV8SI,
+  IX86_BUILTIN_GATHERDIV4SI,
+  IX86_BUILTIN_GATHERDIV8SI,
+
+  /* AVX512F */
+  IX86_BUILTIN_ADDPD512,
+  IX86_BUILTIN_ADDPS512,
+  IX86_BUILTIN_ADDSD_ROUND,
+  IX86_BUILTIN_ADDSS_ROUND,
+  IX86_BUILTIN_ALIGND512,
+  IX86_BUILTIN_ALIGNQ512,
+  IX86_BUILTIN_BLENDMD512,
+  IX86_BUILTIN_BLENDMPD512,
+  IX86_BUILTIN_BLENDMPS512,
+  IX86_BUILTIN_BLENDMQ512,
+  IX86_BUILTIN_BROADCASTF32X4_512,
+  IX86_BUILTIN_BROADCASTF64X4_512,
+  IX86_BUILTIN_BROADCASTI32X4_512,
+  IX86_BUILTIN_BROADCASTI64X4_512,
+  IX86_BUILTIN_BROADCASTSD512,
+  IX86_BUILTIN_BROADCASTSS512,
+  IX86_BUILTIN_CMPD512,
+  IX86_BUILTIN_CMPPD512,
+  IX86_BUILTIN_CMPPS512,
+  IX86_BUILTIN_CMPQ512,
+  IX86_BUILTIN_CMPSD_MASK,
+  IX86_BUILTIN_CMPSS_MASK,
+  IX86_BUILTIN_COMIDF,
+  IX86_BUILTIN_COMISF,
+  IX86_BUILTIN_COMPRESSPD512,
+  IX86_BUILTIN_COMPRESSPDSTORE512,
+  IX86_BUILTIN_COMPRESSPS512,
+  IX86_BUILTIN_COMPRESSPSSTORE512,
+  IX86_BUILTIN_CVTDQ2PD512,
+  IX86_BUILTIN_CVTDQ2PS512,
+  IX86_BUILTIN_CVTPD2DQ512,
+  IX86_BUILTIN_CVTPD2PS512,
+  IX86_BUILTIN_CVTPD2UDQ512,
+  IX86_BUILTIN_CVTPH2PS512,
+  IX86_BUILTIN_CVTPS2DQ512,
+  IX86_BUILTIN_CVTPS2PD512,
+  IX86_BUILTIN_CVTPS2PH512,
+  IX86_BUILTIN_CVTPS2UDQ512,
+  IX86_BUILTIN_CVTSD2SS_ROUND,
+  IX86_BUILTIN_CVTSI2SD64,
+  IX86_BUILTIN_CVTSI2SS32,
+  IX86_BUILTIN_CVTSI2SS64,
+  IX86_BUILTIN_CVTSS2SD_ROUND,
+  IX86_BUILTIN_CVTTPD2DQ512,
+  IX86_BUILTIN_CVTTPD2UDQ512,
+  IX86_BUILTIN_CVTTPS2DQ512,
+  IX86_BUILTIN_CVTTPS2UDQ512,
+  IX86_BUILTIN_CVTUDQ2PD512,
+  IX86_BUILTIN_CVTUDQ2PS512,
+  IX86_BUILTIN_CVTUSI2SD32,
+  IX86_BUILTIN_CVTUSI2SD64,
+  IX86_BUILTIN_CVTUSI2SS32,
+  IX86_BUILTIN_CVTUSI2SS64,
+  IX86_BUILTIN_DIVPD512,
+  IX86_BUILTIN_DIVPS512,
+  IX86_BUILTIN_DIVSD_ROUND,
+  IX86_BUILTIN_DIVSS_ROUND,
+  IX86_BUILTIN_EXPANDPD512,
+  IX86_BUILTIN_EXPANDPD512Z,
+  IX86_BUILTIN_EXPANDPDLOAD512,
+  IX86_BUILTIN_EXPANDPDLOAD512Z,
+  IX86_BUILTIN_EXPANDPS512,
+  IX86_BUILTIN_EXPANDPS512Z,
+  IX86_BUILTIN_EXPANDPSLOAD512,
+  IX86_BUILTIN_EXPANDPSLOAD512Z,
+  IX86_BUILTIN_EXTRACTF32X4,
+  IX86_BUILTIN_EXTRACTF64X4,
+  IX86_BUILTIN_EXTRACTI32X4,
+  IX86_BUILTIN_EXTRACTI64X4,
+  IX86_BUILTIN_FIXUPIMMPD512_MASK,
+  IX86_BUILTIN_FIXUPIMMPD512_MASKZ,
+  IX86_BUILTIN_FIXUPIMMPS512_MASK,
+  IX86_BUILTIN_FIXUPIMMPS512_MASKZ,
+  IX86_BUILTIN_FIXUPIMMSD128_MASK,
+  IX86_BUILTIN_FIXUPIMMSD128_MASKZ,
+  IX86_BUILTIN_FIXUPIMMSS128_MASK,
+  IX86_BUILTIN_FIXUPIMMSS128_MASKZ,
+  IX86_BUILTIN_GETEXPPD512,
+  IX86_BUILTIN_GETEXPPS512,
+  IX86_BUILTIN_GETEXPSD128,
+  IX86_BUILTIN_GETEXPSS128,
+  IX86_BUILTIN_GETMANTPD512,
+  IX86_BUILTIN_GETMANTPS512,
+  IX86_BUILTIN_GETMANTSD128,
+  IX86_BUILTIN_GETMANTSS128,
+  IX86_BUILTIN_INSERTF32X4,
+  IX86_BUILTIN_INSERTF64X4,
+  IX86_BUILTIN_INSERTI32X4,
+  IX86_BUILTIN_INSERTI64X4,
+  IX86_BUILTIN_LOADAPD512,
+  IX86_BUILTIN_LOADAPS512,
+  IX86_BUILTIN_LOADDQUDI512,
+  IX86_BUILTIN_LOADDQUSI512,
+  IX86_BUILTIN_LOADUPD512,
+  IX86_BUILTIN_LOADUPS512,
+  IX86_BUILTIN_MAXPD512,
+  IX86_BUILTIN_MAXPS512,
+  IX86_BUILTIN_MAXSD_ROUND,
+  IX86_BUILTIN_MAXSS_ROUND,
+  IX86_BUILTIN_MINPD512,
+  IX86_BUILTIN_MINPS512,
+  IX86_BUILTIN_MINSD_ROUND,
+  IX86_BUILTIN_MINSS_ROUND,
+  IX86_BUILTIN_MOVAPD512,
+  IX86_BUILTIN_MOVAPS512,
+  IX86_BUILTIN_MOVDDUP512,
+  IX86_BUILTIN_MOVDQA32LOAD512,
+  IX86_BUILTIN_MOVDQA32STORE512,
+  IX86_BUILTIN_MOVDQA32_512,
+  IX86_BUILTIN_MOVDQA64LOAD512,
+  IX86_BUILTIN_MOVDQA64STORE512,
+  IX86_BUILTIN_MOVDQA64_512,
+  IX86_BUILTIN_MOVNTDQ512,
+  IX86_BUILTIN_MOVNTDQA512,
+  IX86_BUILTIN_MOVNTPD512,
+  IX86_BUILTIN_MOVNTPS512,
+  IX86_BUILTIN_MOVSHDUP512,
+  IX86_BUILTIN_MOVSLDUP512,
+  IX86_BUILTIN_MULPD512,
+  IX86_BUILTIN_MULPS512,
+  IX86_BUILTIN_MULSD_ROUND,
+  IX86_BUILTIN_MULSS_ROUND,
+  IX86_BUILTIN_PABSD512,
+  IX86_BUILTIN_PABSQ512,
+  IX86_BUILTIN_PADDD512,
+  IX86_BUILTIN_PADDQ512,
+  IX86_BUILTIN_PANDD512,
+  IX86_BUILTIN_PANDND512,
+  IX86_BUILTIN_PANDNQ512,
+  IX86_BUILTIN_PANDQ512,
+  IX86_BUILTIN_PBROADCASTD512,
+  IX86_BUILTIN_PBROADCASTD512_GPR,
+  IX86_BUILTIN_PBROADCASTMB512,
+  IX86_BUILTIN_PBROADCASTMW512,
+  IX86_BUILTIN_PBROADCASTQ512,
+  IX86_BUILTIN_PBROADCASTQ512_GPR,
+  IX86_BUILTIN_PBROADCASTQ512_MEM,
+  IX86_BUILTIN_PCMPEQD512_MASK,
+  IX86_BUILTIN_PCMPEQQ512_MASK,
+  IX86_BUILTIN_PCMPGTD512_MASK,
+  IX86_BUILTIN_PCMPGTQ512_MASK,
+  IX86_BUILTIN_PCOMPRESSD512,
+  IX86_BUILTIN_PCOMPRESSDSTORE512,
+  IX86_BUILTIN_PCOMPRESSQ512,
+  IX86_BUILTIN_PCOMPRESSQSTORE512,
+  IX86_BUILTIN_PEXPANDD512,
+  IX86_BUILTIN_PEXPANDD512Z,
+  IX86_BUILTIN_PEXPANDDLOAD512,
+  IX86_BUILTIN_PEXPANDDLOAD512Z,
+  IX86_BUILTIN_PEXPANDQ512,
+  IX86_BUILTIN_PEXPANDQ512Z,
+  IX86_BUILTIN_PEXPANDQLOAD512,
+  IX86_BUILTIN_PEXPANDQLOAD512Z,
+  IX86_BUILTIN_PMAXSD512,
+  IX86_BUILTIN_PMAXSQ512,
+  IX86_BUILTIN_PMAXUD512,
+  IX86_BUILTIN_PMAXUQ512,
+  IX86_BUILTIN_PMINSD512,
+  IX86_BUILTIN_PMINSQ512,
+  IX86_BUILTIN_PMINUD512,
+  IX86_BUILTIN_PMINUQ512,
+  IX86_BUILTIN_PMOVDB512,
+  IX86_BUILTIN_PMOVDB512_MEM,
+  IX86_BUILTIN_PMOVDW512,
+  IX86_BUILTIN_PMOVDW512_MEM,
+  IX86_BUILTIN_PMOVQB512,
+  IX86_BUILTIN_PMOVQB512_MEM,
+  IX86_BUILTIN_PMOVQD512,
+  IX86_BUILTIN_PMOVQD512_MEM,
+  IX86_BUILTIN_PMOVQW512,
+  IX86_BUILTIN_PMOVQW512_MEM,
+  IX86_BUILTIN_PMOVSDB512,
+  IX86_BUILTIN_PMOVSDB512_MEM,
+  IX86_BUILTIN_PMOVSDW512,
+  IX86_BUILTIN_PMOVSDW512_MEM,
+  IX86_BUILTIN_PMOVSQB512,
+  IX86_BUILTIN_PMOVSQB512_MEM,
+  IX86_BUILTIN_PMOVSQD512,
+  IX86_BUILTIN_PMOVSQD512_MEM,
+  IX86_BUILTIN_PMOVSQW512,
+  IX86_BUILTIN_PMOVSQW512_MEM,
+  IX86_BUILTIN_PMOVSXBD512,
+  IX86_BUILTIN_PMOVSXBQ512,
+  IX86_BUILTIN_PMOVSXDQ512,
+  IX86_BUILTIN_PMOVSXWD512,
+  IX86_BUILTIN_PMOVSXWQ512,
+  IX86_BUILTIN_PMOVUSDB512,
+  IX86_BUILTIN_PMOVUSDB512_MEM,
+  IX86_BUILTIN_PMOVUSDW512,
+  IX86_BUILTIN_PMOVUSDW512_MEM,
+  IX86_BUILTIN_PMOVUSQB512,
+  IX86_BUILTIN_PMOVUSQB512_MEM,
+  IX86_BUILTIN_PMOVUSQD512,
+  IX86_BUILTIN_PMOVUSQD512_MEM,
+  IX86_BUILTIN_PMOVUSQW512,
+  IX86_BUILTIN_PMOVUSQW512_MEM,
+  IX86_BUILTIN_PMOVZXBD512,
+  IX86_BUILTIN_PMOVZXBQ512,
+  IX86_BUILTIN_PMOVZXDQ512,
+  IX86_BUILTIN_PMOVZXWD512,
+  IX86_BUILTIN_PMOVZXWQ512,
+  IX86_BUILTIN_PMULDQ512,
+  IX86_BUILTIN_PMULLD512,
+  IX86_BUILTIN_PMULUDQ512,
+  IX86_BUILTIN_PORD512,
+  IX86_BUILTIN_PORQ512,
+  IX86_BUILTIN_PROLD512,
+  IX86_BUILTIN_PROLQ512,
+  IX86_BUILTIN_PROLVD512,
+  IX86_BUILTIN_PROLVQ512,
+  IX86_BUILTIN_PRORD512,
+  IX86_BUILTIN_PRORQ512,
+  IX86_BUILTIN_PRORVD512,
+  IX86_BUILTIN_PRORVQ512,
+  IX86_BUILTIN_PSHUFD512,
+  IX86_BUILTIN_PSLLD512,
+  IX86_BUILTIN_PSLLDI512,
+  IX86_BUILTIN_PSLLQ512,
+  IX86_BUILTIN_PSLLQI512,
+  IX86_BUILTIN_PSLLVV16SI,
+  IX86_BUILTIN_PSLLVV8DI,
+  IX86_BUILTIN_PSRAD512,
+  IX86_BUILTIN_PSRADI512,
+  IX86_BUILTIN_PSRAQ512,
+  IX86_BUILTIN_PSRAQI512,
+  IX86_BUILTIN_PSRAVV16SI,
+  IX86_BUILTIN_PSRAVV8DI,
+  IX86_BUILTIN_PSRLD512,
+  IX86_BUILTIN_PSRLDI512,
+  IX86_BUILTIN_PSRLQ512,
+  IX86_BUILTIN_PSRLQI512,
+  IX86_BUILTIN_PSRLVV16SI,
+  IX86_BUILTIN_PSRLVV8DI,
+  IX86_BUILTIN_PSUBD512,
+  IX86_BUILTIN_PSUBQ512,
+  IX86_BUILTIN_PTESTMD512,
+  IX86_BUILTIN_PTESTMQ512,
+  IX86_BUILTIN_PTESTNMD512,
+  IX86_BUILTIN_PTESTNMQ512,
+  IX86_BUILTIN_PUNPCKHDQ512,
+  IX86_BUILTIN_PUNPCKHQDQ512,
+  IX86_BUILTIN_PUNPCKLDQ512,
+  IX86_BUILTIN_PUNPCKLQDQ512,
+  IX86_BUILTIN_PXORD512,
+  IX86_BUILTIN_PXORQ512,
+  IX86_BUILTIN_RCP14PD512,
+  IX86_BUILTIN_RCP14PS512,
+  IX86_BUILTIN_RCP14SD,
+  IX86_BUILTIN_RCP14SS,
+  IX86_BUILTIN_RNDSCALEPD,
+  IX86_BUILTIN_RNDSCALEPS,
+  IX86_BUILTIN_RNDSCALESD,
+  IX86_BUILTIN_RNDSCALESS,
+  IX86_BUILTIN_RSQRT14PD512,
+  IX86_BUILTIN_RSQRT14PS512,
+  IX86_BUILTIN_RSQRT14SD,
+  IX86_BUILTIN_RSQRT14SS,
+  IX86_BUILTIN_SCALEFPD512,
+  IX86_BUILTIN_SCALEFPS512,
+  IX86_BUILTIN_SCALEFSD,
+  IX86_BUILTIN_SCALEFSS,
+  IX86_BUILTIN_SHUFPD512,
+  IX86_BUILTIN_SHUFPS512,
+  IX86_BUILTIN_SHUF_F32x4,
+  IX86_BUILTIN_SHUF_F64x2,
+  IX86_BUILTIN_SHUF_I32x4,
+  IX86_BUILTIN_SHUF_I64x2,
+  IX86_BUILTIN_SQRTPD512,
+  IX86_BUILTIN_SQRTPD512_MASK,
+  IX86_BUILTIN_SQRTPS512_MASK,
+  IX86_BUILTIN_SQRTPS_NR512,
+  IX86_BUILTIN_SQRTSD_ROUND,
+  IX86_BUILTIN_SQRTSS_ROUND,
+  IX86_BUILTIN_STOREAPD512,
+  IX86_BUILTIN_STOREAPS512,
+  IX86_BUILTIN_STOREDQUDI512,
+  IX86_BUILTIN_STOREDQUSI512,
+  IX86_BUILTIN_STOREUPD512,
+  IX86_BUILTIN_STOREUPS512,
+  IX86_BUILTIN_SUBPD512,
+  IX86_BUILTIN_SUBPS512,
+  IX86_BUILTIN_SUBSD_ROUND,
+  IX86_BUILTIN_SUBSS_ROUND,
+  IX86_BUILTIN_UCMPD512,
+  IX86_BUILTIN_UCMPQ512,
+  IX86_BUILTIN_UNPCKHPD512,
+  IX86_BUILTIN_UNPCKHPS512,
+  IX86_BUILTIN_UNPCKLPD512,
+  IX86_BUILTIN_UNPCKLPS512,
+  IX86_BUILTIN_VCVTSD2SI32,
+  IX86_BUILTIN_VCVTSD2SI64,
+  IX86_BUILTIN_VCVTSD2USI32,
+  IX86_BUILTIN_VCVTSD2USI64,
+  IX86_BUILTIN_VCVTSS2SI32,
+  IX86_BUILTIN_VCVTSS2SI64,
+  IX86_BUILTIN_VCVTSS2USI32,
+  IX86_BUILTIN_VCVTSS2USI64,
+  IX86_BUILTIN_VCVTTSD2SI32,
+  IX86_BUILTIN_VCVTTSD2SI64,
+  IX86_BUILTIN_VCVTTSD2USI32,
+  IX86_BUILTIN_VCVTTSD2USI64,
+  IX86_BUILTIN_VCVTTSS2SI32,
+  IX86_BUILTIN_VCVTTSS2SI64,
+  IX86_BUILTIN_VCVTTSS2USI32,
+  IX86_BUILTIN_VCVTTSS2USI64,
+  IX86_BUILTIN_VFMADDPD512_MASK,
+  IX86_BUILTIN_VFMADDPD512_MASK3,
+  IX86_BUILTIN_VFMADDPD512_MASKZ,
+  IX86_BUILTIN_VFMADDPS512_MASK,
+  IX86_BUILTIN_VFMADDPS512_MASK3,
+  IX86_BUILTIN_VFMADDPS512_MASKZ,
+  IX86_BUILTIN_VFMADDSD3_ROUND,
+  IX86_BUILTIN_VFMADDSS3_ROUND,
+  IX86_BUILTIN_VFMADDSUBPD512_MASK,
+  IX86_BUILTIN_VFMADDSUBPD512_MASK3,
+  IX86_BUILTIN_VFMADDSUBPD512_MASKZ,
+  IX86_BUILTIN_VFMADDSUBPS512_MASK,
+  IX86_BUILTIN_VFMADDSUBPS512_MASK3,
+  IX86_BUILTIN_VFMADDSUBPS512_MASKZ,
+  IX86_BUILTIN_VFMSUBADDPD512_MASK3,
+  IX86_BUILTIN_VFMSUBADDPS512_MASK3,
+  IX86_BUILTIN_VFMSUBPD512_MASK3,
+  IX86_BUILTIN_VFMSUBPS512_MASK3,
+  IX86_BUILTIN_VFMSUBSD3_MASK3,
+  IX86_BUILTIN_VFMSUBSS3_MASK3,
+  IX86_BUILTIN_VFNMADDPD512_MASK,
+  IX86_BUILTIN_VFNMADDPS512_MASK,
+  IX86_BUILTIN_VFNMSUBPD512_MASK,
+  IX86_BUILTIN_VFNMSUBPD512_MASK3,
+  IX86_BUILTIN_VFNMSUBPS512_MASK,
+  IX86_BUILTIN_VFNMSUBPS512_MASK3,
+  IX86_BUILTIN_VPCLZCNTD512,
+  IX86_BUILTIN_VPCLZCNTQ512,
+  IX86_BUILTIN_VPCONFLICTD512,
+  IX86_BUILTIN_VPCONFLICTQ512,
+  IX86_BUILTIN_VPERMDF512,
+  IX86_BUILTIN_VPERMDI512,
+  IX86_BUILTIN_VPERMI2VARD512,
+  IX86_BUILTIN_VPERMI2VARPD512,
+  IX86_BUILTIN_VPERMI2VARPS512,
+  IX86_BUILTIN_VPERMI2VARQ512,
+  IX86_BUILTIN_VPERMILPD512,
+  IX86_BUILTIN_VPERMILPS512,
+  IX86_BUILTIN_VPERMILVARPD512,
+  IX86_BUILTIN_VPERMILVARPS512,
+  IX86_BUILTIN_VPERMT2VARD512,
+  IX86_BUILTIN_VPERMT2VARD512_MASKZ,
+  IX86_BUILTIN_VPERMT2VARPD512,
+  IX86_BUILTIN_VPERMT2VARPD512_MASKZ,
+  IX86_BUILTIN_VPERMT2VARPS512,
+  IX86_BUILTIN_VPERMT2VARPS512_MASKZ,
+  IX86_BUILTIN_VPERMT2VARQ512,
+  IX86_BUILTIN_VPERMT2VARQ512_MASKZ,
+  IX86_BUILTIN_VPERMVARDF512,
+  IX86_BUILTIN_VPERMVARDI512,
+  IX86_BUILTIN_VPERMVARSF512,
+  IX86_BUILTIN_VPERMVARSI512,
+  IX86_BUILTIN_VTERNLOGD512_MASK,
+  IX86_BUILTIN_VTERNLOGD512_MASKZ,
+  IX86_BUILTIN_VTERNLOGQ512_MASK,
+  IX86_BUILTIN_VTERNLOGQ512_MASKZ,
+
+  /* Mask arithmetic operations */
+  IX86_BUILTIN_KAND16,
+  IX86_BUILTIN_KANDN16,
+  IX86_BUILTIN_KNOT16,
+  IX86_BUILTIN_KOR16,
+  IX86_BUILTIN_KORTESTC16,
+  IX86_BUILTIN_KORTESTZ16,
+  IX86_BUILTIN_KUNPCKBW,
+  IX86_BUILTIN_KXNOR16,
+  IX86_BUILTIN_KXOR16,
+  IX86_BUILTIN_KMOV16,
+
+  /* Alternate 4 and 8 element gather/scatter for the vectorizer
+     where all operands are 32-byte or 64-byte wide respectively.  */
+  IX86_BUILTIN_GATHERALTSIV4DF,
+  IX86_BUILTIN_GATHERALTDIV8SF,
+  IX86_BUILTIN_GATHERALTSIV4DI,
+  IX86_BUILTIN_GATHERALTDIV8SI,
+  IX86_BUILTIN_GATHER3ALTDIV16SF,
+  IX86_BUILTIN_GATHER3ALTDIV16SI,
+  IX86_BUILTIN_GATHER3ALTSIV8DF,
+  IX86_BUILTIN_GATHER3ALTSIV8DI,
+  IX86_BUILTIN_GATHER3DIV16SF,
+  IX86_BUILTIN_GATHER3DIV16SI,
+  IX86_BUILTIN_GATHER3DIV8DF,
+  IX86_BUILTIN_GATHER3DIV8DI,
+  IX86_BUILTIN_GATHER3SIV16SF,
+  IX86_BUILTIN_GATHER3SIV16SI,
+  IX86_BUILTIN_GATHER3SIV8DF,
+  IX86_BUILTIN_GATHER3SIV8DI,
+  IX86_BUILTIN_SCATTERDIV16SF,
+  IX86_BUILTIN_SCATTERDIV16SI,
+  IX86_BUILTIN_SCATTERDIV8DF,
+  IX86_BUILTIN_SCATTERDIV8DI,
+  IX86_BUILTIN_SCATTERSIV16SF,
+  IX86_BUILTIN_SCATTERSIV16SI,
+  IX86_BUILTIN_SCATTERSIV8DF,
+  IX86_BUILTIN_SCATTERSIV8DI,
+
+  /* AVX512PF */
+  IX86_BUILTIN_GATHERPFQPD,
+  IX86_BUILTIN_GATHERPFDPS,
+  IX86_BUILTIN_GATHERPFDPD,
+  IX86_BUILTIN_GATHERPFQPS,
+  IX86_BUILTIN_SCATTERPFDPD,
+  IX86_BUILTIN_SCATTERPFDPS,
+  IX86_BUILTIN_SCATTERPFQPD,
+  IX86_BUILTIN_SCATTERPFQPS,
+
+  /* AVX-512ER */
+  IX86_BUILTIN_EXP2PD_MASK,
+  IX86_BUILTIN_EXP2PS_MASK,
+  IX86_BUILTIN_EXP2PS,
+  IX86_BUILTIN_RCP28PD,
+  IX86_BUILTIN_RCP28PS,
+  IX86_BUILTIN_RCP28SD,
+  IX86_BUILTIN_RCP28SS,
+  IX86_BUILTIN_RSQRT28PD,
+  IX86_BUILTIN_RSQRT28PS,
+  IX86_BUILTIN_RSQRT28SD,
+  IX86_BUILTIN_RSQRT28SS,
+
+  /* SHA builtins.  */
+  IX86_BUILTIN_SHA1MSG1,
+  IX86_BUILTIN_SHA1MSG2,
+  IX86_BUILTIN_SHA1NEXTE,
+  IX86_BUILTIN_SHA1RNDS4,
+  IX86_BUILTIN_SHA256MSG1,
+  IX86_BUILTIN_SHA256MSG2,
+  IX86_BUILTIN_SHA256RNDS2,
+
+  /* TFmode support builtins.  */
+  IX86_BUILTIN_INFQ,
+  IX86_BUILTIN_HUGE_VALQ,
+  IX86_BUILTIN_FABSQ,
+  IX86_BUILTIN_COPYSIGNQ,
+
+  /* Vectorizer support builtins.  */
+  IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512,
+  IX86_BUILTIN_CPYSGNPS,
+  IX86_BUILTIN_CPYSGNPD,
+  IX86_BUILTIN_CPYSGNPS256,
+  IX86_BUILTIN_CPYSGNPS512,
+  IX86_BUILTIN_CPYSGNPD256,
+  IX86_BUILTIN_CPYSGNPD512,
+  IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512,
+  IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512,
+
+
+  /* FMA4 instructions.  */
+  IX86_BUILTIN_VFMADDSS,
+  IX86_BUILTIN_VFMADDSD,
+  IX86_BUILTIN_VFMADDPS,
+  IX86_BUILTIN_VFMADDPD,
+  IX86_BUILTIN_VFMADDPS256,
+  IX86_BUILTIN_VFMADDPD256,
+  IX86_BUILTIN_VFMADDSUBPS,
+  IX86_BUILTIN_VFMADDSUBPD,
+  IX86_BUILTIN_VFMADDSUBPS256,
+  IX86_BUILTIN_VFMADDSUBPD256,
+
+  /* FMA3 instructions.  */
+  IX86_BUILTIN_VFMADDSS3,
+  IX86_BUILTIN_VFMADDSD3,
+
+  /* XOP instructions.  */
+  IX86_BUILTIN_VPCMOV,
+  IX86_BUILTIN_VPCMOV_V2DI,
+  IX86_BUILTIN_VPCMOV_V4SI,
+  IX86_BUILTIN_VPCMOV_V8HI,
+  IX86_BUILTIN_VPCMOV_V16QI,
+  IX86_BUILTIN_VPCMOV_V4SF,
+  IX86_BUILTIN_VPCMOV_V2DF,
+  IX86_BUILTIN_VPCMOV256,
+  IX86_BUILTIN_VPCMOV_V4DI256,
+  IX86_BUILTIN_VPCMOV_V8SI256,
+  IX86_BUILTIN_VPCMOV_V16HI256,
+  IX86_BUILTIN_VPCMOV_V32QI256,
+  IX86_BUILTIN_VPCMOV_V8SF256,
+  IX86_BUILTIN_VPCMOV_V4DF256,
+
+  IX86_BUILTIN_VPPERM,
+
+  IX86_BUILTIN_VPMACSSWW,
+  IX86_BUILTIN_VPMACSWW,
+  IX86_BUILTIN_VPMACSSWD,
+  IX86_BUILTIN_VPMACSWD,
+  IX86_BUILTIN_VPMACSSDD,
+  IX86_BUILTIN_VPMACSDD,
+  IX86_BUILTIN_VPMACSSDQL,
+  IX86_BUILTIN_VPMACSSDQH,
+  IX86_BUILTIN_VPMACSDQL,
+  IX86_BUILTIN_VPMACSDQH,
+  IX86_BUILTIN_VPMADCSSWD,
+  IX86_BUILTIN_VPMADCSWD,
+
+  IX86_BUILTIN_VPHADDBW,
+  IX86_BUILTIN_VPHADDBD,
+  IX86_BUILTIN_VPHADDBQ,
+  IX86_BUILTIN_VPHADDWD,
+  IX86_BUILTIN_VPHADDWQ,
+  IX86_BUILTIN_VPHADDDQ,
+  IX86_BUILTIN_VPHADDUBW,
+  IX86_BUILTIN_VPHADDUBD,
+  IX86_BUILTIN_VPHADDUBQ,
+  IX86_BUILTIN_VPHADDUWD,
+  IX86_BUILTIN_VPHADDUWQ,
+  IX86_BUILTIN_VPHADDUDQ,
+  IX86_BUILTIN_VPHSUBBW,
+  IX86_BUILTIN_VPHSUBWD,
+  IX86_BUILTIN_VPHSUBDQ,
+
+  IX86_BUILTIN_VPROTB,
+  IX86_BUILTIN_VPROTW,
+  IX86_BUILTIN_VPROTD,
+  IX86_BUILTIN_VPROTQ,
+  IX86_BUILTIN_VPROTB_IMM,
+  IX86_BUILTIN_VPROTW_IMM,
+  IX86_BUILTIN_VPROTD_IMM,
+  IX86_BUILTIN_VPROTQ_IMM,
+
+  IX86_BUILTIN_VPSHLB,
+  IX86_BUILTIN_VPSHLW,
+  IX86_BUILTIN_VPSHLD,
+  IX86_BUILTIN_VPSHLQ,
+  IX86_BUILTIN_VPSHAB,
+  IX86_BUILTIN_VPSHAW,
+  IX86_BUILTIN_VPSHAD,
+  IX86_BUILTIN_VPSHAQ,
+
+  IX86_BUILTIN_VFRCZSS,
+  IX86_BUILTIN_VFRCZSD,
+  IX86_BUILTIN_VFRCZPS,
+  IX86_BUILTIN_VFRCZPD,
+  IX86_BUILTIN_VFRCZPS256,
+  IX86_BUILTIN_VFRCZPD256,
+
+  IX86_BUILTIN_VPCOMEQUB,
+  IX86_BUILTIN_VPCOMNEUB,
+  IX86_BUILTIN_VPCOMLTUB,
+  IX86_BUILTIN_VPCOMLEUB,
+  IX86_BUILTIN_VPCOMGTUB,
+  IX86_BUILTIN_VPCOMGEUB,
+  IX86_BUILTIN_VPCOMFALSEUB,
+  IX86_BUILTIN_VPCOMTRUEUB,
+
+  IX86_BUILTIN_VPCOMEQUW,
+  IX86_BUILTIN_VPCOMNEUW,
+  IX86_BUILTIN_VPCOMLTUW,
+  IX86_BUILTIN_VPCOMLEUW,
+  IX86_BUILTIN_VPCOMGTUW,
+  IX86_BUILTIN_VPCOMGEUW,
+  IX86_BUILTIN_VPCOMFALSEUW,
+  IX86_BUILTIN_VPCOMTRUEUW,
+
+  IX86_BUILTIN_VPCOMEQUD,
+  IX86_BUILTIN_VPCOMNEUD,
+  IX86_BUILTIN_VPCOMLTUD,
+  IX86_BUILTIN_VPCOMLEUD,
+  IX86_BUILTIN_VPCOMGTUD,
+  IX86_BUILTIN_VPCOMGEUD,
+  IX86_BUILTIN_VPCOMFALSEUD,
+  IX86_BUILTIN_VPCOMTRUEUD,
+
+  IX86_BUILTIN_VPCOMEQUQ,
+  IX86_BUILTIN_VPCOMNEUQ,
+  IX86_BUILTIN_VPCOMLTUQ,
+  IX86_BUILTIN_VPCOMLEUQ,
+  IX86_BUILTIN_VPCOMGTUQ,
+  IX86_BUILTIN_VPCOMGEUQ,
+  IX86_BUILTIN_VPCOMFALSEUQ,
+  IX86_BUILTIN_VPCOMTRUEUQ,
+
+  IX86_BUILTIN_VPCOMEQB,
+  IX86_BUILTIN_VPCOMNEB,
+  IX86_BUILTIN_VPCOMLTB,
+  IX86_BUILTIN_VPCOMLEB,
+  IX86_BUILTIN_VPCOMGTB,
+  IX86_BUILTIN_VPCOMGEB,
+  IX86_BUILTIN_VPCOMFALSEB,
+  IX86_BUILTIN_VPCOMTRUEB,
+
+  IX86_BUILTIN_VPCOMEQW,
+  IX86_BUILTIN_VPCOMNEW,
+  IX86_BUILTIN_VPCOMLTW,
+  IX86_BUILTIN_VPCOMLEW,
+  IX86_BUILTIN_VPCOMGTW,
+  IX86_BUILTIN_VPCOMGEW,
+  IX86_BUILTIN_VPCOMFALSEW,
+  IX86_BUILTIN_VPCOMTRUEW,
+
+  IX86_BUILTIN_VPCOMEQD,
+  IX86_BUILTIN_VPCOMNED,
+  IX86_BUILTIN_VPCOMLTD,
+  IX86_BUILTIN_VPCOMLED,
+  IX86_BUILTIN_VPCOMGTD,
+  IX86_BUILTIN_VPCOMGED,
+  IX86_BUILTIN_VPCOMFALSED,
+  IX86_BUILTIN_VPCOMTRUED,
+
+  IX86_BUILTIN_VPCOMEQQ,
+  IX86_BUILTIN_VPCOMNEQ,
+  IX86_BUILTIN_VPCOMLTQ,
+  IX86_BUILTIN_VPCOMLEQ,
+  IX86_BUILTIN_VPCOMGTQ,
+  IX86_BUILTIN_VPCOMGEQ,
+  IX86_BUILTIN_VPCOMFALSEQ,
+  IX86_BUILTIN_VPCOMTRUEQ,
+
+  /* LWP instructions.  */
+  IX86_BUILTIN_LLWPCB,
+  IX86_BUILTIN_SLWPCB,
+  IX86_BUILTIN_LWPVAL32,
+  IX86_BUILTIN_LWPVAL64,
+  IX86_BUILTIN_LWPINS32,
+  IX86_BUILTIN_LWPINS64,
+
+  IX86_BUILTIN_CLZS,
+
+  /* RTM */
+  IX86_BUILTIN_XBEGIN,
+  IX86_BUILTIN_XEND,
+  IX86_BUILTIN_XABORT,
+  IX86_BUILTIN_XTEST,
+
+  /* BMI instructions.  */
+  IX86_BUILTIN_BEXTR32,
+  IX86_BUILTIN_BEXTR64,
+  IX86_BUILTIN_CTZS,
+
+  /* TBM instructions.  */
+  IX86_BUILTIN_BEXTRI32,
+  IX86_BUILTIN_BEXTRI64,
+
+  /* BMI2 instructions. */
+  IX86_BUILTIN_BZHI32,
+  IX86_BUILTIN_BZHI64,
+  IX86_BUILTIN_PDEP32,
+  IX86_BUILTIN_PDEP64,
+  IX86_BUILTIN_PEXT32,
+  IX86_BUILTIN_PEXT64,
+
+  /* ADX instructions.  */
+  IX86_BUILTIN_ADDCARRYX32,
+  IX86_BUILTIN_ADDCARRYX64,
+
+  /* FSGSBASE instructions.  */
+  IX86_BUILTIN_RDFSBASE32,
+  IX86_BUILTIN_RDFSBASE64,
+  IX86_BUILTIN_RDGSBASE32,
+  IX86_BUILTIN_RDGSBASE64,
+  IX86_BUILTIN_WRFSBASE32,
+  IX86_BUILTIN_WRFSBASE64,
+  IX86_BUILTIN_WRGSBASE32,
+  IX86_BUILTIN_WRGSBASE64,
+
+  /* RDRND instructions.  */
+  IX86_BUILTIN_RDRAND16_STEP,
+  IX86_BUILTIN_RDRAND32_STEP,
+  IX86_BUILTIN_RDRAND64_STEP,
+
+  /* RDSEED instructions.  */
+  IX86_BUILTIN_RDSEED16_STEP,
+  IX86_BUILTIN_RDSEED32_STEP,
+  IX86_BUILTIN_RDSEED64_STEP,
+
+  /* F16C instructions.  */
+  IX86_BUILTIN_CVTPH2PS,
+  IX86_BUILTIN_CVTPH2PS256,
+  IX86_BUILTIN_CVTPS2PH,
+  IX86_BUILTIN_CVTPS2PH256,
+
+  /* CFString built-in for darwin */
+  IX86_BUILTIN_CFSTRING,
+
+  /* Builtins to get CPU type and supported features. */
+  IX86_BUILTIN_CPU_INIT,
+  IX86_BUILTIN_CPU_IS,
+  IX86_BUILTIN_CPU_SUPPORTS,
+
+  /* Read/write FLAGS register built-ins.  */
+  IX86_BUILTIN_READ_FLAGS,
+  IX86_BUILTIN_WRITE_FLAGS,
+
+  IX86_BUILTIN_MAX
+};
+
+/* Table for the ix86 builtin decls.  */
+static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
+
+/* Table of all of the builtin functions that are possible with different ISA's
+   but are waiting to be built until a function is declared to use that
+   ISA.  */
+struct builtin_isa {
+  const char *name;		/* function name */
+  enum ix86_builtin_func_type tcode; /* type to use in the declaration */
+  HOST_WIDE_INT isa;		/* isa_flags this builtin is defined for */
+  bool const_p;			/* true if the declaration is constant */
+  bool set_and_not_built_p;
+};
+
+static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
+
+
+/* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save the MASK
+   of which isa_flags to use in the ix86_builtins_isa array.  Stores the
+   function decl in the ix86_builtins array.  Returns the function decl or
+   NULL_TREE, if the builtin was not added.
+
+   If the front end has a special hook for builtin functions, delay adding
+   builtin functions that aren't in the current ISA until the ISA is changed
+   with function specific optimization.  Doing so, can save about 300K for the
+   default compiler.  When the builtin is expanded, check at that time whether
+   it is valid.
+
+   If the front end doesn't have a special hook, record all builtins, even if
+   it isn't an instruction set in the current ISA in case the user uses
+   function specific options for a different ISA, so that we don't get scope
+   errors if a builtin is added in the middle of a function scope.  */
+
+static inline tree
+def_builtin (HOST_WIDE_INT mask, const char *name,
+	     enum ix86_builtin_func_type tcode,
+	     enum ix86_builtins code)
+{
+  tree decl = NULL_TREE;
+
+  if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
+    {
+      ix86_builtins_isa[(int) code].isa = mask;
+
+      mask &= ~OPTION_MASK_ISA_64BIT;
+      if (mask == 0
+	  || (mask & ix86_isa_flags) != 0
+	  || (lang_hooks.builtin_function
+	      == lang_hooks.builtin_function_ext_scope))
+
+	{
+	  tree type = ix86_get_builtin_func_type (tcode);
+	  decl = add_builtin_function (name, type, code, BUILT_IN_MD,
+				       NULL, NULL_TREE);
+	  ix86_builtins[(int) code] = decl;
+	  ix86_builtins_isa[(int) code].set_and_not_built_p = false;
+	}
+      else
+	{
+	  ix86_builtins[(int) code] = NULL_TREE;
+	  ix86_builtins_isa[(int) code].tcode = tcode;
+	  ix86_builtins_isa[(int) code].name = name;
+	  ix86_builtins_isa[(int) code].const_p = false;
+	  ix86_builtins_isa[(int) code].set_and_not_built_p = true;
+	}
+    }
+
+  return decl;
+}
+
+/* Like def_builtin, but also marks the function decl "const".  */
+
+static inline tree
+def_builtin_const (HOST_WIDE_INT mask, const char *name,
+		   enum ix86_builtin_func_type tcode, enum ix86_builtins code)
+{
+  tree decl = def_builtin (mask, name, tcode, code);
+  if (decl)
+    TREE_READONLY (decl) = 1;
+  else
+    ix86_builtins_isa[(int) code].const_p = true;
+
+  return decl;
+}
+
+/* Add any new builtin functions for a given ISA that may not have been
+   declared.  This saves a bit of space compared to adding all of the
+   declarations to the tree, even if we didn't use them.  */
+
+static void
+ix86_add_new_builtins (HOST_WIDE_INT isa)
+{
+  int i;
+
+  for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
+    {
+      if ((ix86_builtins_isa[i].isa & isa) != 0
+	  && ix86_builtins_isa[i].set_and_not_built_p)
+	{
+	  tree decl, type;
+
+	  /* Don't define the builtin again.  */
+	  ix86_builtins_isa[i].set_and_not_built_p = false;
+
+	  type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
+	  decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
+						 type, i, BUILT_IN_MD, NULL,
+						 NULL_TREE);
+
+	  ix86_builtins[i] = decl;
+	  if (ix86_builtins_isa[i].const_p)
+	    TREE_READONLY (decl) = 1;
+	}
+    }
+}
+
+/* Bits for builtin_description.flag.  */
+
+/* Set when we don't support the comparison natively, and should
+   swap_comparison in order to support it.  */
+#define BUILTIN_DESC_SWAP_OPERANDS	1
+
+struct builtin_description
+{
+  const HOST_WIDE_INT mask;
+  const enum insn_code icode;
+  const char *const name;
+  const enum ix86_builtins code;
+  const enum rtx_code comparison;
+  const int flag;
+};
+
+static const struct builtin_description bdesc_comi[] =
+{
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
+};
+
+static const struct builtin_description bdesc_pcmpestr[] =
+{
+  /* SSE4.2 */
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
+};
+
+static const struct builtin_description bdesc_pcmpistr[] =
+{
+  /* SSE4.2 */
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
+};
+
+/* Special builtins with variable number of arguments.  */
+static const struct builtin_description bdesc_special_args[] =
+{
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
+
+  /* 80387 (for use internally for atomic compound assignment).  */
+  { 0, CODE_FOR_fnstenv, "__builtin_ia32_fnstenv", IX86_BUILTIN_FNSTENV, UNKNOWN, (int) VOID_FTYPE_PVOID },
+  { 0, CODE_FOR_fldenv, "__builtin_ia32_fldenv", IX86_BUILTIN_FLDENV, UNKNOWN, (int) VOID_FTYPE_PCVOID },
+  { 0, CODE_FOR_fnstsw, "__builtin_ia32_fnstsw", IX86_BUILTIN_FNSTSW, UNKNOWN, (int) VOID_FTYPE_PUSHORT },
+  { 0, CODE_FOR_fnclex, "__builtin_ia32_fnclex", IX86_BUILTIN_FNCLEX, UNKNOWN, (int) VOID_FTYPE_VOID },
+
+  /* MMX */
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
+
+  /* 3DNow! */
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
+
+  /* FXSR, XSAVE and XSAVEOPT */
+  { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
+  { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
+  { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
+  { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
+  { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
+
+  { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
+  { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
+  { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
+  { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
+  { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
+
+  /* SSE */
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
+
+  /* SSE or 3DNow!A  */
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
+
+  /* SSE2 */
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
+  { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
+
+  /* SSE3 */
+  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
+
+  /* SSE4.1 */
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
+
+  /* SSE4A */
+  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
+  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
+
+  /* AVX */
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
+
+  /* AVX2 */
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
+
+  /* AVX512F */
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressstorev16sf_mask, "__builtin_ia32_compressstoresf512_mask", IX86_BUILTIN_COMPRESSPSSTORE512, UNKNOWN, (int) VOID_FTYPE_PV16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressstorev16si_mask, "__builtin_ia32_compressstoresi512_mask", IX86_BUILTIN_PCOMPRESSDSTORE512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressstorev8df_mask, "__builtin_ia32_compressstoredf512_mask", IX86_BUILTIN_COMPRESSPDSTORE512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressstorev8di_mask, "__builtin_ia32_compressstoredi512_mask", IX86_BUILTIN_PCOMPRESSQSTORE512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16sf_mask, "__builtin_ia32_expandloadsf512_mask", IX86_BUILTIN_EXPANDPSLOAD512, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16sf_maskz, "__builtin_ia32_expandloadsf512_maskz", IX86_BUILTIN_EXPANDPSLOAD512Z, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16si_mask, "__builtin_ia32_expandloadsi512_mask", IX86_BUILTIN_PEXPANDDLOAD512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16si_maskz, "__builtin_ia32_expandloadsi512_maskz", IX86_BUILTIN_PEXPANDDLOAD512Z, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8df_mask, "__builtin_ia32_expandloaddf512_mask", IX86_BUILTIN_EXPANDPDLOAD512, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8df_maskz, "__builtin_ia32_expandloaddf512_maskz", IX86_BUILTIN_EXPANDPDLOAD512Z, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8di_mask, "__builtin_ia32_expandloaddi512_mask", IX86_BUILTIN_PEXPANDQLOAD512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8di_maskz, "__builtin_ia32_expandloaddi512_maskz", IX86_BUILTIN_PEXPANDQLOAD512Z, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loaddquv16si_mask, "__builtin_ia32_loaddqusi512_mask", IX86_BUILTIN_LOADDQUSI512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loaddquv8di_mask, "__builtin_ia32_loaddqudi512_mask", IX86_BUILTIN_LOADDQUDI512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadupd512_mask, "__builtin_ia32_loadupd512_mask", IX86_BUILTIN_LOADUPD512, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadups512_mask, "__builtin_ia32_loadups512_mask", IX86_BUILTIN_LOADUPS512, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv16sf_mask, "__builtin_ia32_loadaps512_mask", IX86_BUILTIN_LOADAPS512, UNKNOWN, (int) V16SF_FTYPE_PCV16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv16si_mask, "__builtin_ia32_movdqa32load512_mask", IX86_BUILTIN_MOVDQA32LOAD512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv8df_mask, "__builtin_ia32_loadapd512_mask", IX86_BUILTIN_LOADAPD512, UNKNOWN, (int) V8DF_FTYPE_PCV8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv8di_mask, "__builtin_ia32_movdqa64load512_mask", IX86_BUILTIN_MOVDQA64LOAD512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv16sf, "__builtin_ia32_movntps512", IX86_BUILTIN_MOVNTPS512, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V16SF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv8df, "__builtin_ia32_movntpd512", IX86_BUILTIN_MOVNTPD512, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V8DF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntv8di, "__builtin_ia32_movntdq512", IX86_BUILTIN_MOVNTDQ512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movntdqa, "__builtin_ia32_movntdqa512", IX86_BUILTIN_MOVNTDQA512, UNKNOWN, (int) V8DI_FTYPE_PV8DI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storedquv16si_mask, "__builtin_ia32_storedqusi512_mask", IX86_BUILTIN_STOREDQUSI512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storedquv8di_mask, "__builtin_ia32_storedqudi512_mask", IX86_BUILTIN_STOREDQUDI512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storeupd512_mask, "__builtin_ia32_storeupd512_mask", IX86_BUILTIN_STOREUPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev8div8si2_mask_store, "__builtin_ia32_pmovusqd512mem_mask", IX86_BUILTIN_PMOVUSQD512_MEM, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev8div8si2_mask_store, "__builtin_ia32_pmovsqd512mem_mask", IX86_BUILTIN_PMOVSQD512_MEM, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev8div8si2_mask_store, "__builtin_ia32_pmovqd512mem_mask", IX86_BUILTIN_PMOVQD512_MEM, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev8div8hi2_mask_store, "__builtin_ia32_pmovusqw512mem_mask", IX86_BUILTIN_PMOVUSQW512_MEM, UNKNOWN, (int) VOID_FTYPE_PV8HI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev8div8hi2_mask_store, "__builtin_ia32_pmovsqw512mem_mask", IX86_BUILTIN_PMOVSQW512_MEM, UNKNOWN, (int) VOID_FTYPE_PV8HI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev8div8hi2_mask_store, "__builtin_ia32_pmovqw512mem_mask", IX86_BUILTIN_PMOVQW512_MEM, UNKNOWN, (int) VOID_FTYPE_PV8HI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev16siv16hi2_mask_store, "__builtin_ia32_pmovusdw512mem_mask", IX86_BUILTIN_PMOVUSDW512_MEM, UNKNOWN, (int) VOID_FTYPE_PV16HI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev16siv16hi2_mask_store, "__builtin_ia32_pmovsdw512mem_mask", IX86_BUILTIN_PMOVSDW512_MEM, UNKNOWN, (int) VOID_FTYPE_PV16HI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev16siv16hi2_mask_store, "__builtin_ia32_pmovdw512mem_mask", IX86_BUILTIN_PMOVDW512_MEM, UNKNOWN, (int) VOID_FTYPE_PV16HI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev8div16qi2_mask_store, "__builtin_ia32_pmovqb512mem_mask", IX86_BUILTIN_PMOVQB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV16QI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev8div16qi2_mask_store, "__builtin_ia32_pmovusqb512mem_mask", IX86_BUILTIN_PMOVUSQB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV16QI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev8div16qi2_mask_store, "__builtin_ia32_pmovsqb512mem_mask", IX86_BUILTIN_PMOVSQB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV16QI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev16siv16qi2_mask_store, "__builtin_ia32_pmovusdb512mem_mask", IX86_BUILTIN_PMOVUSDB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV16QI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev16siv16qi2_mask_store, "__builtin_ia32_pmovsdb512mem_mask", IX86_BUILTIN_PMOVSDB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV16QI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev16siv16qi2_mask_store, "__builtin_ia32_pmovdb512mem_mask", IX86_BUILTIN_PMOVDB512_MEM, UNKNOWN, (int) VOID_FTYPE_PV16QI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storeups512_mask, "__builtin_ia32_storeups512_mask", IX86_BUILTIN_STOREUPS512, UNKNOWN, (int) VOID_FTYPE_PV16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storev16sf_mask, "__builtin_ia32_storeaps512_mask", IX86_BUILTIN_STOREAPS512, UNKNOWN, (int) VOID_FTYPE_PV16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storev16si_mask, "__builtin_ia32_movdqa32store512_mask", IX86_BUILTIN_MOVDQA32STORE512, UNKNOWN, (int) VOID_FTYPE_PV16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storev8df_mask, "__builtin_ia32_storeapd512_mask", IX86_BUILTIN_STOREAPD512, UNKNOWN, (int) VOID_FTYPE_PV8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_storev8di_mask, "__builtin_ia32_movdqa64store512_mask", IX86_BUILTIN_MOVDQA64STORE512, UNKNOWN, (int) VOID_FTYPE_PV8DI_V8DI_QI },
+
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
+
+  /* FSGSBASE */
+  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
+  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
+  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
+  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
+  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
+  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
+  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
+  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
+
+  /* RTM */
+  { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
+  { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
+};
+
+/* Builtins with variable number of arguments.  */
+static const struct builtin_description bdesc_args[] =
+{
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
+  { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
+  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
+
+  /* MMX */
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
+
+  /* 3DNow! */
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
+
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+
+  /* 3DNow!A */
+  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
+  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
+  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
+  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
+  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
+
+  /* SSE */
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3,  "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3,  "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3,  "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3,  "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3,  "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3,  "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3,  "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss,  "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp,  "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp,  "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, V4SF_FTYPE_V4SF_DI },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
+
+  /* SSE MMX or 3Dnow!A */
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
+
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
+
+  /* SSE2 */
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF  },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
+  { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
+  { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3,  "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3,  "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3,  "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3,  "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3,  "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3,  "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3,  "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd,  "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN,(int) V8HI_FTYPE_V8HI_V8HI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI  },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI  },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI  },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN,  (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
+  { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
+
+  /* SSE2 MMX */
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
+
+  /* SSE3 */
+  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
+  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+
+  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+
+  /* SSSE3 */
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
+
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
+
+  /* SSSE3.  */
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
+  { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
+
+  /* SSE4.1 */
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
+
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
+
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+
+  /* SSE4.1 */
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
+
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
+
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
+
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
+
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
+
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
+
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
+
+  /* SSE4.2 */
+  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
+  { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
+  { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+
+  /* SSE4A */
+  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
+  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
+  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
+  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+
+  /* AES */
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
+
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+
+  /* PCLMUL */
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
+
+  /* AVX */
+  { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256,  "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256,  "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256,  "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256,  "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF  },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3,  "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3,  "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
+
+  /* AVX2 */
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256",  IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256",  IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256",  IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256",  IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256",  IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256",  IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI  },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI  },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI  },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI  },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2  , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2  , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2  , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2  , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2  , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2  , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2  , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2  , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2  , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2  , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256"  , IX86_BUILTIN_PMULHW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256"  , IX86_BUILTIN_PMULLW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256"  , IX86_BUILTIN_PMULLD256  , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI  },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN,  (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+
+  { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt,   "__builtin_clzs",   IX86_BUILTIN_CLZS,    UNKNOWN,     (int) UINT16_FTYPE_UINT16 },
+
+  /* BMI */
+  { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2,       "__builtin_ctzs",           IX86_BUILTIN_CTZS,    UNKNOWN, (int) UINT16_FTYPE_UINT16 },
+
+  /* TBM */
+  { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+
+  /* F16C */
+  { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
+  { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
+  { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
+
+  /* BMI2 */
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
+  { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
+
+  /* AVX512F */
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_alignv16si_mask, "__builtin_ia32_alignd512_mask", IX86_BUILTIN_ALIGND512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_INT_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_alignv8di_mask, "__builtin_ia32_alignq512_mask", IX86_BUILTIN_ALIGNQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_INT_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_blendmv16si, "__builtin_ia32_blendmd_512_mask", IX86_BUILTIN_BLENDMD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_blendmv8df, "__builtin_ia32_blendmpd_512_mask", IX86_BUILTIN_BLENDMPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_blendmv16sf, "__builtin_ia32_blendmps_512_mask", IX86_BUILTIN_BLENDMPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_blendmv8di, "__builtin_ia32_blendmq_512_mask", IX86_BUILTIN_BLENDMQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_broadcastv16sf_mask, "__builtin_ia32_broadcastf32x4_512", IX86_BUILTIN_BROADCASTF32X4_512, UNKNOWN, (int) V16SF_FTYPE_V4SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_broadcastv8df_mask, "__builtin_ia32_broadcastf64x4_512", IX86_BUILTIN_BROADCASTF64X4_512, UNKNOWN, (int) V8DF_FTYPE_V4DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_broadcastv16si_mask, "__builtin_ia32_broadcasti32x4_512", IX86_BUILTIN_BROADCASTI32X4_512, UNKNOWN, (int) V16SI_FTYPE_V4SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_broadcastv8di_mask, "__builtin_ia32_broadcasti64x4_512", IX86_BUILTIN_BROADCASTI64X4_512, UNKNOWN, (int) V8DI_FTYPE_V4DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_dupv8df_mask, "__builtin_ia32_broadcastsd512", IX86_BUILTIN_BROADCASTSD512, UNKNOWN, (int) V8DF_FTYPE_V2DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_dupv16sf_mask, "__builtin_ia32_broadcastss512", IX86_BUILTIN_BROADCASTSS512, UNKNOWN, (int) V16SF_FTYPE_V4SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cmpv16si3_mask, "__builtin_ia32_cmpd512_mask", IX86_BUILTIN_CMPD512, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_INT_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cmpv8di3_mask, "__builtin_ia32_cmpq512_mask", IX86_BUILTIN_CMPQ512, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_INT_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressv8df_mask, "__builtin_ia32_compressdf512_mask", IX86_BUILTIN_COMPRESSPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressv16sf_mask, "__builtin_ia32_compresssf512_mask", IX86_BUILTIN_COMPRESSPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_floatv8siv8df2_mask, "__builtin_ia32_cvtdq2pd512_mask", IX86_BUILTIN_CVTDQ2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SI_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvtps2ph512_mask,  "__builtin_ia32_vcvtps2ph512_mask", IX86_BUILTIN_CVTPS2PH512, UNKNOWN, (int) V16HI_FTYPE_V16SF_INT_V16HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ufloatv8siv8df_mask, "__builtin_ia32_cvtudq2pd512_mask", IX86_BUILTIN_CVTUDQ2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SI_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_cvtusi2sd32, "__builtin_ia32_cvtusi2sd32", IX86_BUILTIN_CVTUSI2SD32, UNKNOWN, (int) V2DF_FTYPE_V2DF_UINT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8df_mask, "__builtin_ia32_expanddf512_mask", IX86_BUILTIN_EXPANDPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8df_maskz, "__builtin_ia32_expanddf512_maskz", IX86_BUILTIN_EXPANDPD512Z, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16sf_mask, "__builtin_ia32_expandsf512_mask", IX86_BUILTIN_EXPANDPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16sf_maskz, "__builtin_ia32_expandsf512_maskz", IX86_BUILTIN_EXPANDPS512Z, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vextractf32x4_mask, "__builtin_ia32_extractf32x4_mask", IX86_BUILTIN_EXTRACTF32X4, UNKNOWN, (int) V4SF_FTYPE_V16SF_INT_V4SF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vextractf64x4_mask, "__builtin_ia32_extractf64x4_mask", IX86_BUILTIN_EXTRACTF64X4, UNKNOWN, (int) V4DF_FTYPE_V8DF_INT_V4DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vextracti32x4_mask, "__builtin_ia32_extracti32x4_mask", IX86_BUILTIN_EXTRACTI32X4, UNKNOWN, (int) V4SI_FTYPE_V16SI_INT_V4SI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vextracti64x4_mask, "__builtin_ia32_extracti64x4_mask", IX86_BUILTIN_EXTRACTI64X4, UNKNOWN, (int) V4DI_FTYPE_V8DI_INT_V4DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vinsertf32x4_mask, "__builtin_ia32_insertf32x4_mask", IX86_BUILTIN_INSERTF32X4, UNKNOWN, (int) V16SF_FTYPE_V16SF_V4SF_INT_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vinsertf64x4_mask, "__builtin_ia32_insertf64x4_mask", IX86_BUILTIN_INSERTF64X4, UNKNOWN, (int) V8DF_FTYPE_V8DF_V4DF_INT_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vinserti32x4_mask, "__builtin_ia32_inserti32x4_mask", IX86_BUILTIN_INSERTI32X4, UNKNOWN, (int) V16SI_FTYPE_V16SI_V4SI_INT_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vinserti64x4_mask, "__builtin_ia32_inserti64x4_mask", IX86_BUILTIN_INSERTI64X4, UNKNOWN, (int) V8DI_FTYPE_V8DI_V4DI_INT_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv8df_mask, "__builtin_ia32_movapd512_mask", IX86_BUILTIN_MOVAPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv16sf_mask, "__builtin_ia32_movaps512_mask", IX86_BUILTIN_MOVAPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movddup512_mask, "__builtin_ia32_movddup512_mask", IX86_BUILTIN_MOVDDUP512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv16si_mask, "__builtin_ia32_movdqa32_512_mask", IX86_BUILTIN_MOVDQA32_512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_loadv8di_mask, "__builtin_ia32_movdqa64_512_mask", IX86_BUILTIN_MOVDQA64_512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movshdup512_mask, "__builtin_ia32_movshdup512_mask", IX86_BUILTIN_MOVSHDUP512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_movsldup512_mask, "__builtin_ia32_movsldup512_mask", IX86_BUILTIN_MOVSLDUP512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_absv16si2_mask, "__builtin_ia32_pabsd512_mask", IX86_BUILTIN_PABSD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_absv8di2_mask, "__builtin_ia32_pabsq512_mask", IX86_BUILTIN_PABSQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_addv16si3_mask, "__builtin_ia32_paddd512_mask", IX86_BUILTIN_PADDD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_addv8di3_mask, "__builtin_ia32_paddq512_mask", IX86_BUILTIN_PADDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_andv16si3_mask, "__builtin_ia32_pandd512_mask", IX86_BUILTIN_PANDD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_andnotv16si3_mask, "__builtin_ia32_pandnd512_mask", IX86_BUILTIN_PANDND512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_andnotv8di3_mask, "__builtin_ia32_pandnq512_mask", IX86_BUILTIN_PANDNQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_andv8di3_mask, "__builtin_ia32_pandq512_mask", IX86_BUILTIN_PANDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_dupv16si_mask, "__builtin_ia32_pbroadcastd512", IX86_BUILTIN_PBROADCASTD512, UNKNOWN, (int) V16SI_FTYPE_V4SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_dup_gprv16si_mask, "__builtin_ia32_pbroadcastd512_gpr_mask", IX86_BUILTIN_PBROADCASTD512_GPR, UNKNOWN, (int) V16SI_FTYPE_SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512CD, CODE_FOR_avx512cd_maskb_vec_dupv8di, "__builtin_ia32_broadcastmb512", IX86_BUILTIN_PBROADCASTMB512, UNKNOWN, (int) V8DI_FTYPE_QI },
+  { OPTION_MASK_ISA_AVX512CD, CODE_FOR_avx512cd_maskw_vec_dupv16si, "__builtin_ia32_broadcastmw512", IX86_BUILTIN_PBROADCASTMW512, UNKNOWN, (int) V16SI_FTYPE_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vec_dupv8di_mask, "__builtin_ia32_pbroadcastq512", IX86_BUILTIN_PBROADCASTQ512, UNKNOWN, (int) V8DI_FTYPE_V2DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vec_dup_gprv8di_mask, "__builtin_ia32_pbroadcastq512_gpr_mask", IX86_BUILTIN_PBROADCASTQ512_GPR, UNKNOWN, (int) V8DI_FTYPE_DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F & ~OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vec_dup_memv8di_mask, "__builtin_ia32_pbroadcastq512_mem_mask", IX86_BUILTIN_PBROADCASTQ512_MEM, UNKNOWN, (int) V8DI_FTYPE_DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_eqv16si3_mask, "__builtin_ia32_pcmpeqd512_mask", IX86_BUILTIN_PCMPEQD512_MASK, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_eqv8di3_mask, "__builtin_ia32_pcmpeqq512_mask", IX86_BUILTIN_PCMPEQQ512_MASK, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_gtv16si3_mask, "__builtin_ia32_pcmpgtd512_mask", IX86_BUILTIN_PCMPGTD512_MASK, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_gtv8di3_mask, "__builtin_ia32_pcmpgtq512_mask", IX86_BUILTIN_PCMPGTQ512_MASK, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressv16si_mask, "__builtin_ia32_compresssi512_mask", IX86_BUILTIN_PCOMPRESSD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_compressv8di_mask, "__builtin_ia32_compressdi512_mask", IX86_BUILTIN_PCOMPRESSQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16si_mask, "__builtin_ia32_expandsi512_mask", IX86_BUILTIN_PEXPANDD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv16si_maskz, "__builtin_ia32_expandsi512_maskz", IX86_BUILTIN_PEXPANDD512Z, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8di_mask, "__builtin_ia32_expanddi512_mask", IX86_BUILTIN_PEXPANDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_expandv8di_maskz, "__builtin_ia32_expanddi512_maskz", IX86_BUILTIN_PEXPANDQ512Z, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_smaxv16si3_mask, "__builtin_ia32_pmaxsd512_mask", IX86_BUILTIN_PMAXSD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_smaxv8di3_mask, "__builtin_ia32_pmaxsq512_mask", IX86_BUILTIN_PMAXSQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_umaxv16si3_mask, "__builtin_ia32_pmaxud512_mask", IX86_BUILTIN_PMAXUD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_umaxv8di3_mask, "__builtin_ia32_pmaxuq512_mask", IX86_BUILTIN_PMAXUQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sminv16si3_mask, "__builtin_ia32_pminsd512_mask", IX86_BUILTIN_PMINSD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sminv8di3_mask, "__builtin_ia32_pminsq512_mask", IX86_BUILTIN_PMINSQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_uminv16si3_mask, "__builtin_ia32_pminud512_mask", IX86_BUILTIN_PMINUD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_uminv8di3_mask, "__builtin_ia32_pminuq512_mask", IX86_BUILTIN_PMINUQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev16siv16qi2_mask, "__builtin_ia32_pmovdb512_mask", IX86_BUILTIN_PMOVDB512, UNKNOWN, (int) V16QI_FTYPE_V16SI_V16QI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev16siv16hi2_mask, "__builtin_ia32_pmovdw512_mask", IX86_BUILTIN_PMOVDW512, UNKNOWN, (int) V16HI_FTYPE_V16SI_V16HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev8div16qi2_mask, "__builtin_ia32_pmovqb512_mask", IX86_BUILTIN_PMOVQB512, UNKNOWN, (int) V16QI_FTYPE_V8DI_V16QI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev8div8si2_mask, "__builtin_ia32_pmovqd512_mask", IX86_BUILTIN_PMOVQD512, UNKNOWN, (int) V8SI_FTYPE_V8DI_V8SI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_truncatev8div8hi2_mask, "__builtin_ia32_pmovqw512_mask", IX86_BUILTIN_PMOVQW512, UNKNOWN, (int) V8HI_FTYPE_V8DI_V8HI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev16siv16qi2_mask, "__builtin_ia32_pmovsdb512_mask", IX86_BUILTIN_PMOVSDB512, UNKNOWN, (int) V16QI_FTYPE_V16SI_V16QI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev16siv16hi2_mask, "__builtin_ia32_pmovsdw512_mask", IX86_BUILTIN_PMOVSDW512, UNKNOWN, (int) V16HI_FTYPE_V16SI_V16HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev8div16qi2_mask, "__builtin_ia32_pmovsqb512_mask", IX86_BUILTIN_PMOVSQB512, UNKNOWN, (int) V16QI_FTYPE_V8DI_V16QI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev8div8si2_mask, "__builtin_ia32_pmovsqd512_mask", IX86_BUILTIN_PMOVSQD512, UNKNOWN, (int) V8SI_FTYPE_V8DI_V8SI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ss_truncatev8div8hi2_mask, "__builtin_ia32_pmovsqw512_mask", IX86_BUILTIN_PMOVSQW512, UNKNOWN, (int) V8HI_FTYPE_V8DI_V8HI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sign_extendv16qiv16si2_mask, "__builtin_ia32_pmovsxbd512_mask", IX86_BUILTIN_PMOVSXBD512, UNKNOWN, (int) V16SI_FTYPE_V16QI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sign_extendv8qiv8di2_mask, "__builtin_ia32_pmovsxbq512_mask", IX86_BUILTIN_PMOVSXBQ512, UNKNOWN, (int) V8DI_FTYPE_V16QI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sign_extendv8siv8di2_mask, "__builtin_ia32_pmovsxdq512_mask", IX86_BUILTIN_PMOVSXDQ512, UNKNOWN, (int) V8DI_FTYPE_V8SI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sign_extendv16hiv16si2_mask, "__builtin_ia32_pmovsxwd512_mask", IX86_BUILTIN_PMOVSXWD512, UNKNOWN, (int) V16SI_FTYPE_V16HI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sign_extendv8hiv8di2_mask, "__builtin_ia32_pmovsxwq512_mask", IX86_BUILTIN_PMOVSXWQ512, UNKNOWN, (int) V8DI_FTYPE_V8HI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev16siv16qi2_mask, "__builtin_ia32_pmovusdb512_mask", IX86_BUILTIN_PMOVUSDB512, UNKNOWN, (int) V16QI_FTYPE_V16SI_V16QI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev16siv16hi2_mask, "__builtin_ia32_pmovusdw512_mask", IX86_BUILTIN_PMOVUSDW512, UNKNOWN, (int) V16HI_FTYPE_V16SI_V16HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev8div16qi2_mask, "__builtin_ia32_pmovusqb512_mask", IX86_BUILTIN_PMOVUSQB512, UNKNOWN, (int) V16QI_FTYPE_V8DI_V16QI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev8div8si2_mask, "__builtin_ia32_pmovusqd512_mask", IX86_BUILTIN_PMOVUSQD512, UNKNOWN, (int) V8SI_FTYPE_V8DI_V8SI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_us_truncatev8div8hi2_mask, "__builtin_ia32_pmovusqw512_mask", IX86_BUILTIN_PMOVUSQW512, UNKNOWN, (int) V8HI_FTYPE_V8DI_V8HI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_zero_extendv16qiv16si2_mask, "__builtin_ia32_pmovzxbd512_mask", IX86_BUILTIN_PMOVZXBD512, UNKNOWN, (int) V16SI_FTYPE_V16QI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_zero_extendv8qiv8di2_mask, "__builtin_ia32_pmovzxbq512_mask", IX86_BUILTIN_PMOVZXBQ512, UNKNOWN, (int) V8DI_FTYPE_V16QI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_zero_extendv8siv8di2_mask, "__builtin_ia32_pmovzxdq512_mask", IX86_BUILTIN_PMOVZXDQ512, UNKNOWN, (int) V8DI_FTYPE_V8SI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_zero_extendv16hiv16si2_mask, "__builtin_ia32_pmovzxwd512_mask", IX86_BUILTIN_PMOVZXWD512, UNKNOWN, (int) V16SI_FTYPE_V16HI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_zero_extendv8hiv8di2_mask, "__builtin_ia32_pmovzxwq512_mask", IX86_BUILTIN_PMOVZXWQ512, UNKNOWN, (int) V8DI_FTYPE_V8HI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_vec_widen_smult_even_v16si_mask, "__builtin_ia32_pmuldq512_mask", IX86_BUILTIN_PMULDQ512, UNKNOWN, (int) V8DI_FTYPE_V16SI_V16SI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_mulv16si3_mask, "__builtin_ia32_pmulld512_mask"  , IX86_BUILTIN_PMULLD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_vec_widen_umult_even_v16si_mask, "__builtin_ia32_pmuludq512_mask", IX86_BUILTIN_PMULUDQ512, UNKNOWN, (int) V8DI_FTYPE_V16SI_V16SI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_iorv16si3_mask, "__builtin_ia32_pord512_mask", IX86_BUILTIN_PORD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_iorv8di3_mask, "__builtin_ia32_porq512_mask", IX86_BUILTIN_PORQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rolv16si_mask, "__builtin_ia32_prold512_mask", IX86_BUILTIN_PROLD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rolv8di_mask, "__builtin_ia32_prolq512_mask", IX86_BUILTIN_PROLQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rolvv16si_mask, "__builtin_ia32_prolvd512_mask", IX86_BUILTIN_PROLVD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rolvv8di_mask, "__builtin_ia32_prolvq512_mask", IX86_BUILTIN_PROLVQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rorv16si_mask, "__builtin_ia32_prord512_mask", IX86_BUILTIN_PRORD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rorv8di_mask, "__builtin_ia32_prorq512_mask", IX86_BUILTIN_PRORQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rorvv16si_mask, "__builtin_ia32_prorvd512_mask", IX86_BUILTIN_PRORVD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rorvv8di_mask, "__builtin_ia32_prorvq512_mask", IX86_BUILTIN_PRORVQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_pshufdv3_mask, "__builtin_ia32_pshufd512_mask", IX86_BUILTIN_PSHUFD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashlv16si3_mask, "__builtin_ia32_pslld512_mask", IX86_BUILTIN_PSLLD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V4SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashlv16si3_mask, "__builtin_ia32_pslldi512_mask", IX86_BUILTIN_PSLLDI512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashlv8di3_mask, "__builtin_ia32_psllq512_mask", IX86_BUILTIN_PSLLQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V2DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashlv8di3_mask, "__builtin_ia32_psllqi512_mask", IX86_BUILTIN_PSLLQI512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ashlvv16si_mask, "__builtin_ia32_psllv16si_mask", IX86_BUILTIN_PSLLVV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ashlvv8di_mask, "__builtin_ia32_psllv8di_mask", IX86_BUILTIN_PSLLVV8DI, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashrv16si3_mask, "__builtin_ia32_psrad512_mask", IX86_BUILTIN_PSRAD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V4SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashrv16si3_mask, "__builtin_ia32_psradi512_mask", IX86_BUILTIN_PSRADI512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashrv8di3_mask, "__builtin_ia32_psraq512_mask", IX86_BUILTIN_PSRAQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V2DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ashrv8di3_mask, "__builtin_ia32_psraqi512_mask", IX86_BUILTIN_PSRAQI512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ashrvv16si_mask, "__builtin_ia32_psrav16si_mask", IX86_BUILTIN_PSRAVV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ashrvv8di_mask, "__builtin_ia32_psrav8di_mask", IX86_BUILTIN_PSRAVV8DI, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_lshrv16si3_mask, "__builtin_ia32_psrld512_mask", IX86_BUILTIN_PSRLD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V4SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_lshrv16si3_mask, "__builtin_ia32_psrldi512_mask", IX86_BUILTIN_PSRLDI512, UNKNOWN, (int) V16SI_FTYPE_V16SI_INT_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_lshrv8di3_mask, "__builtin_ia32_psrlq512_mask", IX86_BUILTIN_PSRLQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V2DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_lshrv8di3_mask, "__builtin_ia32_psrlqi512_mask", IX86_BUILTIN_PSRLQI512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_lshrvv16si_mask, "__builtin_ia32_psrlv16si_mask", IX86_BUILTIN_PSRLVV16SI, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_lshrvv8di_mask, "__builtin_ia32_psrlv8di_mask", IX86_BUILTIN_PSRLVV8DI, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_subv16si3_mask, "__builtin_ia32_psubd512_mask", IX86_BUILTIN_PSUBD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_subv8di3_mask, "__builtin_ia32_psubq512_mask", IX86_BUILTIN_PSUBQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_testmv16si3_mask, "__builtin_ia32_ptestmd512", IX86_BUILTIN_PTESTMD512, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_testmv8di3_mask, "__builtin_ia32_ptestmq512", IX86_BUILTIN_PTESTMQ512, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_testnmv16si3_mask, "__builtin_ia32_ptestnmd512", IX86_BUILTIN_PTESTNMD512, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_testnmv8di3_mask, "__builtin_ia32_ptestnmq512", IX86_BUILTIN_PTESTNMQ512, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_interleave_highv16si_mask, "__builtin_ia32_punpckhdq512_mask", IX86_BUILTIN_PUNPCKHDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_interleave_highv8di_mask, "__builtin_ia32_punpckhqdq512_mask", IX86_BUILTIN_PUNPCKHQDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_interleave_lowv16si_mask, "__builtin_ia32_punpckldq512_mask", IX86_BUILTIN_PUNPCKLDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_interleave_lowv8di_mask, "__builtin_ia32_punpcklqdq512_mask", IX86_BUILTIN_PUNPCKLQDQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_xorv16si3_mask, "__builtin_ia32_pxord512_mask", IX86_BUILTIN_PXORD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_xorv8di3_mask, "__builtin_ia32_pxorq512_mask", IX86_BUILTIN_PXORQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_rcp14v8df_mask, "__builtin_ia32_rcp14pd512_mask", IX86_BUILTIN_RCP14PD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_rcp14v16sf_mask, "__builtin_ia32_rcp14ps512_mask", IX86_BUILTIN_RCP14PS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_srcp14v2df, "__builtin_ia32_rcp14sd", IX86_BUILTIN_RCP14SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_srcp14v4sf, "__builtin_ia32_rcp14ss", IX86_BUILTIN_RCP14SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_rsqrt14v8df_mask, "__builtin_ia32_rsqrt14pd512_mask", IX86_BUILTIN_RSQRT14PD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_rsqrt14v16sf_mask, "__builtin_ia32_rsqrt14ps512_mask", IX86_BUILTIN_RSQRT14PS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_rsqrt14v2df, "__builtin_ia32_rsqrt14sd", IX86_BUILTIN_RSQRT14SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_rsqrt14v4sf, "__builtin_ia32_rsqrt14ss", IX86_BUILTIN_RSQRT14SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shufpd512_mask, "__builtin_ia32_shufpd512_mask", IX86_BUILTIN_SHUFPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shufps512_mask, "__builtin_ia32_shufps512_mask", IX86_BUILTIN_SHUFPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shuf_f32x4_mask, "__builtin_ia32_shuf_f32x4_mask", IX86_BUILTIN_SHUF_F32x4, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shuf_f64x2_mask, "__builtin_ia32_shuf_f64x2_mask", IX86_BUILTIN_SHUF_F64x2, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shuf_i32x4_mask, "__builtin_ia32_shuf_i32x4_mask", IX86_BUILTIN_SHUF_I32x4, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_INT_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_shuf_i64x2_mask, "__builtin_ia32_shuf_i64x2_mask", IX86_BUILTIN_SHUF_I64x2, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_INT_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ucmpv16si3_mask, "__builtin_ia32_ucmpd512_mask", IX86_BUILTIN_UCMPD512, UNKNOWN, (int) HI_FTYPE_V16SI_V16SI_INT_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ucmpv8di3_mask, "__builtin_ia32_ucmpq512_mask", IX86_BUILTIN_UCMPQ512, UNKNOWN, (int) QI_FTYPE_V8DI_V8DI_INT_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_unpckhpd512_mask, "__builtin_ia32_unpckhpd512_mask", IX86_BUILTIN_UNPCKHPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_unpckhps512_mask, "__builtin_ia32_unpckhps512_mask", IX86_BUILTIN_UNPCKHPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_unpcklpd512_mask, "__builtin_ia32_unpcklpd512_mask", IX86_BUILTIN_UNPCKLPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_unpcklps512_mask,  "__builtin_ia32_unpcklps512_mask", IX86_BUILTIN_UNPCKLPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512CD, CODE_FOR_clzv16si2_mask, "__builtin_ia32_vplzcntd_512_mask", IX86_BUILTIN_VPCLZCNTD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512CD, CODE_FOR_clzv8di2_mask, "__builtin_ia32_vplzcntq_512_mask", IX86_BUILTIN_VPCLZCNTQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512CD, CODE_FOR_conflictv16si_mask, "__builtin_ia32_vpconflictsi_512_mask", IX86_BUILTIN_VPCONFLICTD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512CD, CODE_FOR_conflictv8di_mask, "__builtin_ia32_vpconflictdi_512_mask", IX86_BUILTIN_VPCONFLICTQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permv8df_mask, "__builtin_ia32_permdf512_mask", IX86_BUILTIN_VPERMDF512, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permv8di_mask, "__builtin_ia32_permdi512_mask", IX86_BUILTIN_VPERMDI512, UNKNOWN, (int) V8DI_FTYPE_V8DI_INT_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermi2varv16si3_mask, "__builtin_ia32_vpermi2vard512_mask", IX86_BUILTIN_VPERMI2VARD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermi2varv8df3_mask, "__builtin_ia32_vpermi2varpd512_mask", IX86_BUILTIN_VPERMI2VARPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DI_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermi2varv16sf3_mask, "__builtin_ia32_vpermi2varps512_mask", IX86_BUILTIN_VPERMI2VARPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SI_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermi2varv8di3_mask, "__builtin_ia32_vpermi2varq512_mask", IX86_BUILTIN_VPERMI2VARQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermilv8df_mask, "__builtin_ia32_vpermilpd512_mask", IX86_BUILTIN_VPERMILPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermilv16sf_mask, "__builtin_ia32_vpermilps512_mask", IX86_BUILTIN_VPERMILPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_INT_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermilvarv8df3_mask, "__builtin_ia32_vpermilvarpd512_mask", IX86_BUILTIN_VPERMILVARPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DI_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermilvarv16sf3_mask, "__builtin_ia32_vpermilvarps512_mask", IX86_BUILTIN_VPERMILVARPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SI_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv16si3_mask, "__builtin_ia32_vpermt2vard512_mask", IX86_BUILTIN_VPERMT2VARD512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv16si3_maskz, "__builtin_ia32_vpermt2vard512_maskz", IX86_BUILTIN_VPERMT2VARD512_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv8df3_mask, "__builtin_ia32_vpermt2varpd512_mask", IX86_BUILTIN_VPERMT2VARPD512, UNKNOWN, (int) V8DF_FTYPE_V8DI_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv8df3_maskz, "__builtin_ia32_vpermt2varpd512_maskz", IX86_BUILTIN_VPERMT2VARPD512_MASKZ, UNKNOWN, (int) V8DF_FTYPE_V8DI_V8DF_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv16sf3_mask, "__builtin_ia32_vpermt2varps512_mask", IX86_BUILTIN_VPERMT2VARPS512, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv16sf3_maskz, "__builtin_ia32_vpermt2varps512_maskz", IX86_BUILTIN_VPERMT2VARPS512_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv8di3_mask, "__builtin_ia32_vpermt2varq512_mask", IX86_BUILTIN_VPERMT2VARQ512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vpermt2varv8di3_maskz, "__builtin_ia32_vpermt2varq512_maskz", IX86_BUILTIN_VPERMT2VARQ512_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permvarv8df_mask, "__builtin_ia32_permvardf512_mask", IX86_BUILTIN_VPERMVARDF512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DI_V8DF_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permvarv8di_mask, "__builtin_ia32_permvardi512_mask", IX86_BUILTIN_VPERMVARDI512, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permvarv16sf_mask, "__builtin_ia32_permvarsf512_mask", IX86_BUILTIN_VPERMVARSF512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SI_V16SF_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_permvarv16si_mask, "__builtin_ia32_permvarsi512_mask", IX86_BUILTIN_VPERMVARSI512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vternlogv16si_mask, "__builtin_ia32_pternlogd512_mask", IX86_BUILTIN_VTERNLOGD512_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vternlogv16si_maskz, "__builtin_ia32_pternlogd512_maskz", IX86_BUILTIN_VTERNLOGD512_MASKZ, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_INT_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vternlogv8di_mask, "__builtin_ia32_pternlogq512_mask", IX86_BUILTIN_VTERNLOGQ512_MASK, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_QI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vternlogv8di_maskz, "__builtin_ia32_pternlogq512_maskz", IX86_BUILTIN_VTERNLOGQ512_MASKZ, UNKNOWN, (int) V8DI_FTYPE_V8DI_V8DI_V8DI_INT_QI },
+
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_copysignv16sf3,  "__builtin_ia32_copysignps512", IX86_BUILTIN_CPYSGNPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_copysignv8df3,  "__builtin_ia32_copysignpd512", IX86_BUILTIN_CPYSGNPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sqrtv8df2, "__builtin_ia32_sqrtpd512", IX86_BUILTIN_SQRTPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sqrtv16sf2, "__builtin_ia32_sqrtps512", IX86_BUILTIN_SQRTPS_NR512, UNKNOWN, (int) V16SF_FTYPE_V16SF },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_exp2v16sf, "__builtin_ia32_exp2ps", IX86_BUILTIN_EXP2PS, UNKNOWN, (int) V16SF_FTYPE_V16SF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_roundv8df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix512", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512, UNKNOWN, (int) V16SI_FTYPE_V8DF_V8DF },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd_vec_pack_sfix512, "__builtin_ia32_floorpd_vec_pack_sfix512", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512, (enum rtx_code) ROUND_FLOOR, (int) V16SI_FTYPE_V8DF_V8DF_ROUND },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_roundpd_vec_pack_sfix512, "__builtin_ia32_ceilpd_vec_pack_sfix512", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512, (enum rtx_code) ROUND_CEIL, (int) V16SI_FTYPE_V8DF_V8DF_ROUND },
+
+  /* Mask arithmetic operations */
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_andhi3, "__builtin_ia32_kandhi", IX86_BUILTIN_KAND16, UNKNOWN, (int) HI_FTYPE_HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_kandnhi, "__builtin_ia32_kandnhi", IX86_BUILTIN_KANDN16, UNKNOWN, (int) HI_FTYPE_HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_one_cmplhi2, "__builtin_ia32_knothi", IX86_BUILTIN_KNOT16, UNKNOWN, (int) HI_FTYPE_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_iorhi3, "__builtin_ia32_korhi", IX86_BUILTIN_KOR16, UNKNOWN, (int) HI_FTYPE_HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_kortestchi, "__builtin_ia32_kortestchi", IX86_BUILTIN_KORTESTC16, UNKNOWN, (int) HI_FTYPE_HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_kortestzhi, "__builtin_ia32_kortestzhi", IX86_BUILTIN_KORTESTZ16, UNKNOWN, (int) HI_FTYPE_HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_kunpckhi, "__builtin_ia32_kunpckhi", IX86_BUILTIN_KUNPCKBW, UNKNOWN, (int) HI_FTYPE_HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_kxnorhi, "__builtin_ia32_kxnorhi", IX86_BUILTIN_KXNOR16, UNKNOWN, (int) HI_FTYPE_HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_xorhi3, "__builtin_ia32_kxorhi", IX86_BUILTIN_KXOR16, UNKNOWN, (int) HI_FTYPE_HI_HI },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_kmovw, "__builtin_ia32_kmov16", IX86_BUILTIN_KMOV16, UNKNOWN, (int) HI_FTYPE_HI },
+
+  /* SHA */
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sha1msg1, 0, IX86_BUILTIN_SHA1MSG1, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sha1msg2, 0, IX86_BUILTIN_SHA1MSG2, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sha1nexte, 0, IX86_BUILTIN_SHA1NEXTE, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sha1rnds4, 0, IX86_BUILTIN_SHA1RNDS4, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sha256msg1, 0, IX86_BUILTIN_SHA256MSG1, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sha256msg2, 0, IX86_BUILTIN_SHA256MSG2, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sha256rnds2, 0, IX86_BUILTIN_SHA256RNDS2, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI },
+};
+
+/* Builtins with rounding support.  */
+static const struct builtin_description bdesc_round_args[] =
+{
+  /* AVX512F */
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_addv8df3_mask_round, "__builtin_ia32_addpd512_mask", IX86_BUILTIN_ADDPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_addv16sf3_mask_round, "__builtin_ia32_addps512_mask", IX86_BUILTIN_ADDPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmaddv2df3_round, "__builtin_ia32_addsd_round", IX86_BUILTIN_ADDSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmaddv4sf3_round, "__builtin_ia32_addss_round", IX86_BUILTIN_ADDSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cmpv8df3_mask_round, "__builtin_ia32_cmppd512_mask", IX86_BUILTIN_CMPPD512, UNKNOWN, (int) QI_FTYPE_V8DF_V8DF_INT_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cmpv16sf3_mask_round, "__builtin_ia32_cmpps512_mask", IX86_BUILTIN_CMPPS512, UNKNOWN, (int) HI_FTYPE_V16SF_V16SF_INT_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vmcmpv2df3_mask_round, "__builtin_ia32_cmpsd_mask", IX86_BUILTIN_CMPSD_MASK, UNKNOWN, (int) QI_FTYPE_V2DF_V2DF_INT_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vmcmpv4sf3_mask_round, "__builtin_ia32_cmpss_mask", IX86_BUILTIN_CMPSS_MASK, UNKNOWN, (int) QI_FTYPE_V4SF_V4SF_INT_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_comi_round, "__builtin_ia32_vcomisd", IX86_BUILTIN_COMIDF, UNKNOWN, (int) INT_FTYPE_V2DF_V2DF_INT_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_comi_round, "__builtin_ia32_vcomiss", IX86_BUILTIN_COMISF, UNKNOWN, (int) INT_FTYPE_V4SF_V4SF_INT_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_floatv16siv16sf2_mask_round, "__builtin_ia32_cvtdq2ps512_mask", IX86_BUILTIN_CVTDQ2PS512, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cvtpd2dq512_mask_round, "__builtin_ia32_cvtpd2dq512_mask", IX86_BUILTIN_CVTPD2DQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cvtpd2ps512_mask_round,  "__builtin_ia32_cvtpd2ps512_mask", IX86_BUILTIN_CVTPD2PS512, UNKNOWN, (int) V8SF_FTYPE_V8DF_V8SF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ufix_notruncv8dfv8si_mask_round, "__builtin_ia32_cvtpd2udq512_mask", IX86_BUILTIN_CVTPD2UDQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvtph2ps512_mask_round,  "__builtin_ia32_vcvtph2ps512_mask", IX86_BUILTIN_CVTPH2PS512, UNKNOWN, (int) V16SF_FTYPE_V16HI_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2dq512_mask", IX86_BUILTIN_CVTPS2DQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_cvtps2pd512_mask_round, "__builtin_ia32_cvtps2pd512_mask", IX86_BUILTIN_CVTPS2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_ufix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2udq512_mask", IX86_BUILTIN_CVTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_cvtsd2ss_round, "__builtin_ia32_cvtsd2ss_round", IX86_BUILTIN_CVTSD2SS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq_round, "__builtin_ia32_cvtsi2sd64", IX86_BUILTIN_CVTSI2SD64, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT64_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_cvtsi2ss_round, "__builtin_ia32_cvtsi2ss32", IX86_BUILTIN_CVTSI2SS32, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq_round, "__builtin_ia32_cvtsi2ss64", IX86_BUILTIN_CVTSI2SS64, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT64_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_cvtss2sd_round, "__builtin_ia32_cvtss2sd_round", IX86_BUILTIN_CVTSS2SD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_fix_truncv8dfv8si2_mask_round, "__builtin_ia32_cvttpd2dq512_mask", IX86_BUILTIN_CVTTPD2DQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ufix_truncv8dfv8si2_mask_round, "__builtin_ia32_cvttpd2udq512_mask", IX86_BUILTIN_CVTTPD2UDQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_fix_truncv16sfv16si2_mask_round, "__builtin_ia32_cvttps2dq512_mask", IX86_BUILTIN_CVTTPS2DQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ufix_truncv16sfv16si2_mask_round, "__builtin_ia32_cvttps2udq512_mask", IX86_BUILTIN_CVTTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_ufloatv16siv16sf2_mask_round, "__builtin_ia32_cvtudq2ps512_mask", IX86_BUILTIN_CVTUDQ2PS512, UNKNOWN, (int) V16SF_FTYPE_V16SI_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_cvtusi2sd64_round, "__builtin_ia32_cvtusi2sd64", IX86_BUILTIN_CVTUSI2SD64, UNKNOWN, (int) V2DF_FTYPE_V2DF_UINT64_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_cvtusi2ss32_round, "__builtin_ia32_cvtusi2ss32", IX86_BUILTIN_CVTUSI2SS32, UNKNOWN, (int) V4SF_FTYPE_V4SF_UINT_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_cvtusi2ss64_round, "__builtin_ia32_cvtusi2ss64", IX86_BUILTIN_CVTUSI2SS64, UNKNOWN, (int) V4SF_FTYPE_V4SF_UINT64_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_divv8df3_mask_round, "__builtin_ia32_divpd512_mask", IX86_BUILTIN_DIVPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_divv16sf3_mask_round, "__builtin_ia32_divps512_mask", IX86_BUILTIN_DIVPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmdivv2df3_round, "__builtin_ia32_divsd_round", IX86_BUILTIN_DIVSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmdivv4sf3_round, "__builtin_ia32_divss_round", IX86_BUILTIN_DIVSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fixupimmv8df_mask_round, "__builtin_ia32_fixupimmpd512_mask", IX86_BUILTIN_FIXUPIMMPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fixupimmv8df_maskz_round, "__builtin_ia32_fixupimmpd512_maskz", IX86_BUILTIN_FIXUPIMMPD512_MASKZ, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fixupimmv16sf_mask_round, "__builtin_ia32_fixupimmps512_mask", IX86_BUILTIN_FIXUPIMMPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fixupimmv16sf_maskz_round, "__builtin_ia32_fixupimmps512_maskz", IX86_BUILTIN_FIXUPIMMPS512_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sfixupimmv2df_mask_round, "__builtin_ia32_fixupimmsd_mask", IX86_BUILTIN_FIXUPIMMSD128_MASK, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sfixupimmv2df_maskz_round, "__builtin_ia32_fixupimmsd_maskz", IX86_BUILTIN_FIXUPIMMSD128_MASKZ, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sfixupimmv4sf_mask_round, "__builtin_ia32_fixupimmss_mask", IX86_BUILTIN_FIXUPIMMSS128_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sfixupimmv4sf_maskz_round, "__builtin_ia32_fixupimmss_maskz", IX86_BUILTIN_FIXUPIMMSS128_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getexpv8df_mask_round, "__builtin_ia32_getexppd512_mask", IX86_BUILTIN_GETEXPPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getexpv16sf_mask_round, "__builtin_ia32_getexpps512_mask", IX86_BUILTIN_GETEXPPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sgetexpv2df_round, "__builtin_ia32_getexpsd128_round", IX86_BUILTIN_GETEXPSD128, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sgetexpv4sf_round, "__builtin_ia32_getexpss128_round", IX86_BUILTIN_GETEXPSS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getmantv8df_mask_round, "__builtin_ia32_getmantpd512_mask", IX86_BUILTIN_GETMANTPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getmantv16sf_mask_round, "__builtin_ia32_getmantps512_mask", IX86_BUILTIN_GETMANTPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_INT_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getmantv2df_round, "__builtin_ia32_getmantsd_round", IX86_BUILTIN_GETMANTSD128, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_getmantv4sf_round, "__builtin_ia32_getmantss_round", IX86_BUILTIN_GETMANTSS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_smaxv8df3_mask_round, "__builtin_ia32_maxpd512_mask", IX86_BUILTIN_MAXPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_smaxv16sf3_mask_round, "__builtin_ia32_maxps512_mask", IX86_BUILTIN_MAXPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmsmaxv2df3_round, "__builtin_ia32_maxsd_round", IX86_BUILTIN_MAXSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmsmaxv4sf3_round, "__builtin_ia32_maxss_round", IX86_BUILTIN_MAXSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sminv8df3_mask_round, "__builtin_ia32_minpd512_mask", IX86_BUILTIN_MINPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sminv16sf3_mask_round, "__builtin_ia32_minps512_mask", IX86_BUILTIN_MINPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmsminv2df3_round, "__builtin_ia32_minsd_round", IX86_BUILTIN_MINSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmsminv4sf3_round, "__builtin_ia32_minss_round", IX86_BUILTIN_MINSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_mulv8df3_mask_round, "__builtin_ia32_mulpd512_mask", IX86_BUILTIN_MULPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_mulv16sf3_mask_round, "__builtin_ia32_mulps512_mask", IX86_BUILTIN_MULPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmmulv2df3_round, "__builtin_ia32_mulsd_round", IX86_BUILTIN_MULSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmmulv4sf3_round, "__builtin_ia32_mulss_round", IX86_BUILTIN_MULSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rndscalev8df_mask_round, "__builtin_ia32_rndscalepd_mask", IX86_BUILTIN_RNDSCALEPD, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rndscalev16sf_mask_round, "__builtin_ia32_rndscaleps_mask", IX86_BUILTIN_RNDSCALEPS, UNKNOWN, (int) V16SF_FTYPE_V16SF_INT_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rndscalev2df_round, "__builtin_ia32_rndscalesd_round", IX86_BUILTIN_RNDSCALESD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_rndscalev4sf_round, "__builtin_ia32_rndscaless_round", IX86_BUILTIN_RNDSCALESS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_scalefv8df_mask_round, "__builtin_ia32_scalefpd512_mask", IX86_BUILTIN_SCALEFPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_scalefv16sf_mask_round, "__builtin_ia32_scalefps512_mask", IX86_BUILTIN_SCALEFPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vmscalefv2df_round, "__builtin_ia32_scalefsd_round", IX86_BUILTIN_SCALEFSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vmscalefv4sf_round, "__builtin_ia32_scalefss_round", IX86_BUILTIN_SCALEFSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sqrtv8df2_mask_round, "__builtin_ia32_sqrtpd512_mask", IX86_BUILTIN_SQRTPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_sqrtv16sf2_mask_round, "__builtin_ia32_sqrtps512_mask", IX86_BUILTIN_SQRTPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmsqrtv2df2_round, "__builtin_ia32_sqrtsd_round", IX86_BUILTIN_SQRTSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmsqrtv4sf2_round, "__builtin_ia32_sqrtss_round", IX86_BUILTIN_SQRTSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_subv8df3_mask_round, "__builtin_ia32_subpd512_mask", IX86_BUILTIN_SUBPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_subv16sf3_mask_round, "__builtin_ia32_subps512_mask", IX86_BUILTIN_SUBPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_vmsubv2df3_round, "__builtin_ia32_subsd_round", IX86_BUILTIN_SUBSD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_vmsubv4sf3_round, "__builtin_ia32_subss_round", IX86_BUILTIN_SUBSS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_cvtsd2si_round, "__builtin_ia32_vcvtsd2si32", IX86_BUILTIN_VCVTSD2SI32, UNKNOWN, (int) INT_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq_round, "__builtin_ia32_vcvtsd2si64", IX86_BUILTIN_VCVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvtsd2usi_round, "__builtin_ia32_vcvtsd2usi32", IX86_BUILTIN_VCVTSD2USI32, UNKNOWN, (int) UINT_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vcvtsd2usiq_round, "__builtin_ia32_vcvtsd2usi64", IX86_BUILTIN_VCVTSD2USI64, UNKNOWN, (int) UINT64_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_cvtss2si_round, "__builtin_ia32_vcvtss2si32", IX86_BUILTIN_VCVTSS2SI32, UNKNOWN, (int) INT_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq_round, "__builtin_ia32_vcvtss2si64", IX86_BUILTIN_VCVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvtss2usi_round, "__builtin_ia32_vcvtss2usi32", IX86_BUILTIN_VCVTSS2USI32, UNKNOWN, (int) UINT_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vcvtss2usiq_round, "__builtin_ia32_vcvtss2usi64", IX86_BUILTIN_VCVTSS2USI64, UNKNOWN, (int) UINT64_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse2_cvttsd2si_round, "__builtin_ia32_vcvttsd2si32", IX86_BUILTIN_VCVTTSD2SI32, UNKNOWN, (int) INT_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq_round, "__builtin_ia32_vcvttsd2si64", IX86_BUILTIN_VCVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvttsd2usi_round, "__builtin_ia32_vcvttsd2usi32", IX86_BUILTIN_VCVTTSD2USI32, UNKNOWN, (int) UINT_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vcvttsd2usiq_round, "__builtin_ia32_vcvttsd2usi64", IX86_BUILTIN_VCVTTSD2USI64, UNKNOWN, (int) UINT64_FTYPE_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_sse_cvttss2si_round, "__builtin_ia32_vcvttss2si32", IX86_BUILTIN_VCVTTSS2SI32, UNKNOWN, (int) INT_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq_round, "__builtin_ia32_vcvttss2si64", IX86_BUILTIN_VCVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_vcvttss2usi_round, "__builtin_ia32_vcvttss2usi32", IX86_BUILTIN_VCVTTSS2USI32, UNKNOWN, (int) UINT_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, CODE_FOR_avx512f_vcvttss2usiq_round, "__builtin_ia32_vcvttss2usi64", IX86_BUILTIN_VCVTTSS2USI64, UNKNOWN, (int) UINT64_FTYPE_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v8df_mask_round, "__builtin_ia32_vfmaddpd512_mask", IX86_BUILTIN_VFMADDPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v8df_mask3_round, "__builtin_ia32_vfmaddpd512_mask3", IX86_BUILTIN_VFMADDPD512_MASK3, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v8df_maskz_round, "__builtin_ia32_vfmaddpd512_maskz", IX86_BUILTIN_VFMADDPD512_MASKZ, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v16sf_mask_round, "__builtin_ia32_vfmaddps512_mask", IX86_BUILTIN_VFMADDPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v16sf_mask3_round, "__builtin_ia32_vfmaddps512_mask3", IX86_BUILTIN_VFMADDPS512_MASK3, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmadd_v16sf_maskz_round, "__builtin_ia32_vfmaddps512_maskz", IX86_BUILTIN_VFMADDPS512_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_fmai_vmfmadd_v2df_round, "__builtin_ia32_vfmaddsd3_round", IX86_BUILTIN_VFMADDSD3_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_fmai_vmfmadd_v4sf_round, "__builtin_ia32_vfmaddss3_round", IX86_BUILTIN_VFMADDSS3_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v8df_mask_round, "__builtin_ia32_vfmaddsubpd512_mask", IX86_BUILTIN_VFMADDSUBPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v8df_mask3_round, "__builtin_ia32_vfmaddsubpd512_mask3", IX86_BUILTIN_VFMADDSUBPD512_MASK3, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v8df_maskz_round, "__builtin_ia32_vfmaddsubpd512_maskz", IX86_BUILTIN_VFMADDSUBPD512_MASKZ, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v16sf_mask_round, "__builtin_ia32_vfmaddsubps512_mask", IX86_BUILTIN_VFMADDSUBPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v16sf_mask3_round, "__builtin_ia32_vfmaddsubps512_mask3", IX86_BUILTIN_VFMADDSUBPS512_MASK3, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmaddsub_v16sf_maskz_round, "__builtin_ia32_vfmaddsubps512_maskz", IX86_BUILTIN_VFMADDSUBPS512_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsubadd_v8df_mask3_round, "__builtin_ia32_vfmsubaddpd512_mask3", IX86_BUILTIN_VFMSUBADDPD512_MASK3, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsubadd_v16sf_mask3_round, "__builtin_ia32_vfmsubaddps512_mask3", IX86_BUILTIN_VFMSUBADDPS512_MASK3, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsub_v8df_mask3_round, "__builtin_ia32_vfmsubpd512_mask3", IX86_BUILTIN_VFMSUBPD512_MASK3, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fmsub_v16sf_mask3_round, "__builtin_ia32_vfmsubps512_mask3", IX86_BUILTIN_VFMSUBPS512_MASK3, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmadd_v8df_mask_round, "__builtin_ia32_vfnmaddpd512_mask", IX86_BUILTIN_VFNMADDPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmadd_v16sf_mask_round, "__builtin_ia32_vfnmaddps512_mask", IX86_BUILTIN_VFNMADDPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmsub_v8df_mask_round, "__builtin_ia32_vfnmsubpd512_mask", IX86_BUILTIN_VFNMSUBPD512_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmsub_v8df_mask3_round, "__builtin_ia32_vfnmsubpd512_mask3", IX86_BUILTIN_VFNMSUBPD512_MASK3, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmsub_v16sf_mask_round, "__builtin_ia32_vfnmsubps512_mask", IX86_BUILTIN_VFNMSUBPS512_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512F, CODE_FOR_avx512f_fnmsub_v16sf_mask3_round, "__builtin_ia32_vfnmsubps512_mask3", IX86_BUILTIN_VFNMSUBPS512_MASK3, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT },
+
+  /* AVX512ER */
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_exp2v8df_mask_round, "__builtin_ia32_exp2pd_mask", IX86_BUILTIN_EXP2PD_MASK, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_exp2v16sf_mask_round, "__builtin_ia32_exp2ps_mask", IX86_BUILTIN_EXP2PS_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rcp28v8df_mask_round, "__builtin_ia32_rcp28pd_mask", IX86_BUILTIN_RCP28PD, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rcp28v16sf_mask_round, "__builtin_ia32_rcp28ps_mask", IX86_BUILTIN_RCP28PS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_vmrcp28v2df_round, "__builtin_ia32_rcp28sd_round", IX86_BUILTIN_RCP28SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_vmrcp28v4sf_round, "__builtin_ia32_rcp28ss_round", IX86_BUILTIN_RCP28SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rsqrt28v8df_mask_round, "__builtin_ia32_rsqrt28pd_mask", IX86_BUILTIN_RSQRT28PD, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_rsqrt28v16sf_mask_round, "__builtin_ia32_rsqrt28ps_mask", IX86_BUILTIN_RSQRT28PS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_vmrsqrt28v2df_round, "__builtin_ia32_rsqrt28sd_round", IX86_BUILTIN_RSQRT28SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+  { OPTION_MASK_ISA_AVX512ER, CODE_FOR_avx512er_vmrsqrt28v4sf_round, "__builtin_ia32_rsqrt28ss_round", IX86_BUILTIN_RSQRT28SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
+};
+
+/* FMA4 and XOP.  */
+#define MULTI_ARG_4_DF2_DI_I	V2DF_FTYPE_V2DF_V2DF_V2DI_INT
+#define MULTI_ARG_4_DF2_DI_I1	V4DF_FTYPE_V4DF_V4DF_V4DI_INT
+#define MULTI_ARG_4_SF2_SI_I	V4SF_FTYPE_V4SF_V4SF_V4SI_INT
+#define MULTI_ARG_4_SF2_SI_I1	V8SF_FTYPE_V8SF_V8SF_V8SI_INT
+#define MULTI_ARG_3_SF		V4SF_FTYPE_V4SF_V4SF_V4SF
+#define MULTI_ARG_3_DF		V2DF_FTYPE_V2DF_V2DF_V2DF
+#define MULTI_ARG_3_SF2		V8SF_FTYPE_V8SF_V8SF_V8SF
+#define MULTI_ARG_3_DF2		V4DF_FTYPE_V4DF_V4DF_V4DF
+#define MULTI_ARG_3_DI		V2DI_FTYPE_V2DI_V2DI_V2DI
+#define MULTI_ARG_3_SI		V4SI_FTYPE_V4SI_V4SI_V4SI
+#define MULTI_ARG_3_SI_DI	V4SI_FTYPE_V4SI_V4SI_V2DI
+#define MULTI_ARG_3_HI		V8HI_FTYPE_V8HI_V8HI_V8HI
+#define MULTI_ARG_3_HI_SI	V8HI_FTYPE_V8HI_V8HI_V4SI
+#define MULTI_ARG_3_QI		V16QI_FTYPE_V16QI_V16QI_V16QI
+#define MULTI_ARG_3_DI2		V4DI_FTYPE_V4DI_V4DI_V4DI
+#define MULTI_ARG_3_SI2		V8SI_FTYPE_V8SI_V8SI_V8SI
+#define MULTI_ARG_3_HI2		V16HI_FTYPE_V16HI_V16HI_V16HI
+#define MULTI_ARG_3_QI2		V32QI_FTYPE_V32QI_V32QI_V32QI
+#define MULTI_ARG_2_SF		V4SF_FTYPE_V4SF_V4SF
+#define MULTI_ARG_2_DF		V2DF_FTYPE_V2DF_V2DF
+#define MULTI_ARG_2_DI		V2DI_FTYPE_V2DI_V2DI
+#define MULTI_ARG_2_SI		V4SI_FTYPE_V4SI_V4SI
+#define MULTI_ARG_2_HI		V8HI_FTYPE_V8HI_V8HI
+#define MULTI_ARG_2_QI		V16QI_FTYPE_V16QI_V16QI
+#define MULTI_ARG_2_DI_IMM	V2DI_FTYPE_V2DI_SI
+#define MULTI_ARG_2_SI_IMM	V4SI_FTYPE_V4SI_SI
+#define MULTI_ARG_2_HI_IMM	V8HI_FTYPE_V8HI_SI
+#define MULTI_ARG_2_QI_IMM	V16QI_FTYPE_V16QI_SI
+#define MULTI_ARG_2_DI_CMP	V2DI_FTYPE_V2DI_V2DI_CMP
+#define MULTI_ARG_2_SI_CMP	V4SI_FTYPE_V4SI_V4SI_CMP
+#define MULTI_ARG_2_HI_CMP	V8HI_FTYPE_V8HI_V8HI_CMP
+#define MULTI_ARG_2_QI_CMP	V16QI_FTYPE_V16QI_V16QI_CMP
+#define MULTI_ARG_2_SF_TF	V4SF_FTYPE_V4SF_V4SF_TF
+#define MULTI_ARG_2_DF_TF	V2DF_FTYPE_V2DF_V2DF_TF
+#define MULTI_ARG_2_DI_TF	V2DI_FTYPE_V2DI_V2DI_TF
+#define MULTI_ARG_2_SI_TF	V4SI_FTYPE_V4SI_V4SI_TF
+#define MULTI_ARG_2_HI_TF	V8HI_FTYPE_V8HI_V8HI_TF
+#define MULTI_ARG_2_QI_TF	V16QI_FTYPE_V16QI_V16QI_TF
+#define MULTI_ARG_1_SF		V4SF_FTYPE_V4SF
+#define MULTI_ARG_1_DF		V2DF_FTYPE_V2DF
+#define MULTI_ARG_1_SF2		V8SF_FTYPE_V8SF
+#define MULTI_ARG_1_DF2		V4DF_FTYPE_V4DF
+#define MULTI_ARG_1_DI		V2DI_FTYPE_V2DI
+#define MULTI_ARG_1_SI		V4SI_FTYPE_V4SI
+#define MULTI_ARG_1_HI		V8HI_FTYPE_V8HI
+#define MULTI_ARG_1_QI		V16QI_FTYPE_V16QI
+#define MULTI_ARG_1_SI_DI	V2DI_FTYPE_V4SI
+#define MULTI_ARG_1_HI_DI	V2DI_FTYPE_V8HI
+#define MULTI_ARG_1_HI_SI	V4SI_FTYPE_V8HI
+#define MULTI_ARG_1_QI_DI	V2DI_FTYPE_V16QI
+#define MULTI_ARG_1_QI_SI	V4SI_FTYPE_V16QI
+#define MULTI_ARG_1_QI_HI	V8HI_FTYPE_V16QI
+
+static const struct builtin_description bdesc_multi_arg[] =
+{
+  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
+    "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
+    UNKNOWN, (int)MULTI_ARG_3_SF },
+  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
+    "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
+    UNKNOWN, (int)MULTI_ARG_3_DF },
+
+  { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
+    "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
+    UNKNOWN, (int)MULTI_ARG_3_SF },
+  { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
+    "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
+    UNKNOWN, (int)MULTI_ARG_3_DF },
+
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
+    "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
+    UNKNOWN, (int)MULTI_ARG_3_SF },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
+    "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
+    UNKNOWN, (int)MULTI_ARG_3_DF },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
+    "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
+    UNKNOWN, (int)MULTI_ARG_3_SF2 },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
+    "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
+    UNKNOWN, (int)MULTI_ARG_3_DF2 },
+
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
+    "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
+    UNKNOWN, (int)MULTI_ARG_3_SF },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
+    "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
+    UNKNOWN, (int)MULTI_ARG_3_DF },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
+    "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
+    UNKNOWN, (int)MULTI_ARG_3_SF2 },
+  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
+    "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
+    UNKNOWN, (int)MULTI_ARG_3_DF2 },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di,        "__builtin_ia32_vpcmov",      IX86_BUILTIN_VPCMOV,	 UNKNOWN,      (int)MULTI_ARG_3_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di,        "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN,      (int)MULTI_ARG_3_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si,        "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN,      (int)MULTI_ARG_3_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi,        "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN,      (int)MULTI_ARG_3_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi,       "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN,      (int)MULTI_ARG_3_QI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df,        "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN,      (int)MULTI_ARG_3_DF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf,        "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN,      (int)MULTI_ARG_3_SF },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256,        "__builtin_ia32_vpcmov256",       IX86_BUILTIN_VPCMOV256,       UNKNOWN,      (int)MULTI_ARG_3_DI2 },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256,        "__builtin_ia32_vpcmov_v4di256",  IX86_BUILTIN_VPCMOV_V4DI256,  UNKNOWN,      (int)MULTI_ARG_3_DI2 },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256,        "__builtin_ia32_vpcmov_v8si256",  IX86_BUILTIN_VPCMOV_V8SI256,  UNKNOWN,      (int)MULTI_ARG_3_SI2 },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256,       "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN,      (int)MULTI_ARG_3_HI2 },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256,       "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN,      (int)MULTI_ARG_3_QI2 },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256,        "__builtin_ia32_vpcmov_v4df256",  IX86_BUILTIN_VPCMOV_V4DF256,  UNKNOWN,      (int)MULTI_ARG_3_DF2 },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256,        "__builtin_ia32_vpcmov_v8sf256",  IX86_BUILTIN_VPCMOV_V8SF256,  UNKNOWN,      (int)MULTI_ARG_3_SF2 },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm,             "__builtin_ia32_vpperm",      IX86_BUILTIN_VPPERM,      UNKNOWN,      (int)MULTI_ARG_3_QI },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww,          "__builtin_ia32_vpmacssww",   IX86_BUILTIN_VPMACSSWW,   UNKNOWN,      (int)MULTI_ARG_3_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww,           "__builtin_ia32_vpmacsww",    IX86_BUILTIN_VPMACSWW,    UNKNOWN,      (int)MULTI_ARG_3_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd,          "__builtin_ia32_vpmacsswd",   IX86_BUILTIN_VPMACSSWD,   UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd,           "__builtin_ia32_vpmacswd",    IX86_BUILTIN_VPMACSWD,    UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd,          "__builtin_ia32_vpmacssdd",   IX86_BUILTIN_VPMACSSDD,   UNKNOWN,      (int)MULTI_ARG_3_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd,           "__builtin_ia32_vpmacsdd",    IX86_BUILTIN_VPMACSDD,    UNKNOWN,      (int)MULTI_ARG_3_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql,         "__builtin_ia32_vpmacssdql",  IX86_BUILTIN_VPMACSSDQL,  UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh,         "__builtin_ia32_vpmacssdqh",  IX86_BUILTIN_VPMACSSDQH,  UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql,          "__builtin_ia32_vpmacsdql",   IX86_BUILTIN_VPMACSDQL,   UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh,          "__builtin_ia32_vpmacsdqh",   IX86_BUILTIN_VPMACSDQH,   UNKNOWN,      (int)MULTI_ARG_3_SI_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd,         "__builtin_ia32_vpmadcsswd",  IX86_BUILTIN_VPMADCSSWD,  UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd,          "__builtin_ia32_vpmadcswd",   IX86_BUILTIN_VPMADCSWD,   UNKNOWN,      (int)MULTI_ARG_3_HI_SI },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3,        "__builtin_ia32_vprotq",      IX86_BUILTIN_VPROTQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3,        "__builtin_ia32_vprotd",      IX86_BUILTIN_VPROTD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3,        "__builtin_ia32_vprotw",      IX86_BUILTIN_VPROTW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3,       "__builtin_ia32_vprotb",      IX86_BUILTIN_VPROTB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3,         "__builtin_ia32_vprotqi",     IX86_BUILTIN_VPROTQ_IMM,  UNKNOWN,      (int)MULTI_ARG_2_DI_IMM },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3,         "__builtin_ia32_vprotdi",     IX86_BUILTIN_VPROTD_IMM,  UNKNOWN,      (int)MULTI_ARG_2_SI_IMM },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3,         "__builtin_ia32_vprotwi",     IX86_BUILTIN_VPROTW_IMM,  UNKNOWN,      (int)MULTI_ARG_2_HI_IMM },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3,        "__builtin_ia32_vprotbi",     IX86_BUILTIN_VPROTB_IMM,  UNKNOWN,      (int)MULTI_ARG_2_QI_IMM },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3,         "__builtin_ia32_vpshaq",      IX86_BUILTIN_VPSHAQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3,         "__builtin_ia32_vpshad",      IX86_BUILTIN_VPSHAD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3,         "__builtin_ia32_vpshaw",      IX86_BUILTIN_VPSHAW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3,        "__builtin_ia32_vpshab",      IX86_BUILTIN_VPSHAB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3,         "__builtin_ia32_vpshlq",      IX86_BUILTIN_VPSHLQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3,         "__builtin_ia32_vpshld",      IX86_BUILTIN_VPSHLD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3,         "__builtin_ia32_vpshlw",      IX86_BUILTIN_VPSHLW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3,        "__builtin_ia32_vpshlb",      IX86_BUILTIN_VPSHLB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2,       "__builtin_ia32_vfrczss",     IX86_BUILTIN_VFRCZSS,     UNKNOWN,      (int)MULTI_ARG_1_SF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2,       "__builtin_ia32_vfrczsd",     IX86_BUILTIN_VFRCZSD,     UNKNOWN,      (int)MULTI_ARG_1_DF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2,         "__builtin_ia32_vfrczps",     IX86_BUILTIN_VFRCZPS,     UNKNOWN,      (int)MULTI_ARG_1_SF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2,         "__builtin_ia32_vfrczpd",     IX86_BUILTIN_VFRCZPD,     UNKNOWN,      (int)MULTI_ARG_1_DF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2,         "__builtin_ia32_vfrczps256",  IX86_BUILTIN_VFRCZPS256,  UNKNOWN,      (int)MULTI_ARG_1_SF2 },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2,         "__builtin_ia32_vfrczpd256",  IX86_BUILTIN_VFRCZPD256,  UNKNOWN,      (int)MULTI_ARG_1_DF2 },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw,           "__builtin_ia32_vphaddbw",    IX86_BUILTIN_VPHADDBW,    UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd,           "__builtin_ia32_vphaddbd",    IX86_BUILTIN_VPHADDBD,    UNKNOWN,      (int)MULTI_ARG_1_QI_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq,           "__builtin_ia32_vphaddbq",    IX86_BUILTIN_VPHADDBQ,    UNKNOWN,      (int)MULTI_ARG_1_QI_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd,           "__builtin_ia32_vphaddwd",    IX86_BUILTIN_VPHADDWD,    UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq,           "__builtin_ia32_vphaddwq",    IX86_BUILTIN_VPHADDWQ,    UNKNOWN,      (int)MULTI_ARG_1_HI_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq,           "__builtin_ia32_vphadddq",    IX86_BUILTIN_VPHADDDQ,    UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw,          "__builtin_ia32_vphaddubw",   IX86_BUILTIN_VPHADDUBW,   UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd,          "__builtin_ia32_vphaddubd",   IX86_BUILTIN_VPHADDUBD,   UNKNOWN,      (int)MULTI_ARG_1_QI_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq,          "__builtin_ia32_vphaddubq",   IX86_BUILTIN_VPHADDUBQ,   UNKNOWN,      (int)MULTI_ARG_1_QI_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd,          "__builtin_ia32_vphadduwd",   IX86_BUILTIN_VPHADDUWD,   UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq,          "__builtin_ia32_vphadduwq",   IX86_BUILTIN_VPHADDUWQ,   UNKNOWN,      (int)MULTI_ARG_1_HI_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq,          "__builtin_ia32_vphaddudq",   IX86_BUILTIN_VPHADDUDQ,   UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw,           "__builtin_ia32_vphsubbw",    IX86_BUILTIN_VPHSUBBW,    UNKNOWN,      (int)MULTI_ARG_1_QI_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd,           "__builtin_ia32_vphsubwd",    IX86_BUILTIN_VPHSUBWD,    UNKNOWN,      (int)MULTI_ARG_1_HI_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq,           "__builtin_ia32_vphsubdq",    IX86_BUILTIN_VPHSUBDQ,    UNKNOWN,      (int)MULTI_ARG_1_SI_DI },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomeqb",    IX86_BUILTIN_VPCOMEQB,    EQ,           (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomneb",    IX86_BUILTIN_VPCOMNEB,    NE,           (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomneqb",   IX86_BUILTIN_VPCOMNEB,    NE,           (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomltb",    IX86_BUILTIN_VPCOMLTB,    LT,           (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomleb",    IX86_BUILTIN_VPCOMLEB,    LE,           (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomgtb",    IX86_BUILTIN_VPCOMGTB,    GT,           (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3,     "__builtin_ia32_vpcomgeb",    IX86_BUILTIN_VPCOMGEB,    GE,           (int)MULTI_ARG_2_QI_CMP },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomeqw",    IX86_BUILTIN_VPCOMEQW,    EQ,           (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomnew",    IX86_BUILTIN_VPCOMNEW,    NE,           (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomneqw",   IX86_BUILTIN_VPCOMNEW,    NE,           (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomltw",    IX86_BUILTIN_VPCOMLTW,    LT,           (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomlew",    IX86_BUILTIN_VPCOMLEW,    LE,           (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomgtw",    IX86_BUILTIN_VPCOMGTW,    GT,           (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3,      "__builtin_ia32_vpcomgew",    IX86_BUILTIN_VPCOMGEW,    GE,           (int)MULTI_ARG_2_HI_CMP },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomeqd",    IX86_BUILTIN_VPCOMEQD,    EQ,           (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomned",    IX86_BUILTIN_VPCOMNED,    NE,           (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomneqd",   IX86_BUILTIN_VPCOMNED,    NE,           (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomltd",    IX86_BUILTIN_VPCOMLTD,    LT,           (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomled",    IX86_BUILTIN_VPCOMLED,    LE,           (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomgtd",    IX86_BUILTIN_VPCOMGTD,    GT,           (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3,      "__builtin_ia32_vpcomged",    IX86_BUILTIN_VPCOMGED,    GE,           (int)MULTI_ARG_2_SI_CMP },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomeqq",    IX86_BUILTIN_VPCOMEQQ,    EQ,           (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomneq",    IX86_BUILTIN_VPCOMNEQ,    NE,           (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomneqq",   IX86_BUILTIN_VPCOMNEQ,    NE,           (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomltq",    IX86_BUILTIN_VPCOMLTQ,    LT,           (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomleq",    IX86_BUILTIN_VPCOMLEQ,    LE,           (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomgtq",    IX86_BUILTIN_VPCOMGTQ,    GT,           (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3,      "__builtin_ia32_vpcomgeq",    IX86_BUILTIN_VPCOMGEQ,    GE,           (int)MULTI_ARG_2_DI_CMP },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb",   IX86_BUILTIN_VPCOMEQUB,   EQ,           (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub",   IX86_BUILTIN_VPCOMNEUB,   NE,           (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb",  IX86_BUILTIN_VPCOMNEUB,   NE,           (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub",   IX86_BUILTIN_VPCOMLTUB,   LTU,          (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub",   IX86_BUILTIN_VPCOMLEUB,   LEU,          (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub",   IX86_BUILTIN_VPCOMGTUB,   GTU,          (int)MULTI_ARG_2_QI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub",   IX86_BUILTIN_VPCOMGEUB,   GEU,          (int)MULTI_ARG_2_QI_CMP },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw",   IX86_BUILTIN_VPCOMEQUW,   EQ,           (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw",   IX86_BUILTIN_VPCOMNEUW,   NE,           (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw",  IX86_BUILTIN_VPCOMNEUW,   NE,           (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomltuw",   IX86_BUILTIN_VPCOMLTUW,   LTU,          (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomleuw",   IX86_BUILTIN_VPCOMLEUW,   LEU,          (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomgtuw",   IX86_BUILTIN_VPCOMGTUW,   GTU,          (int)MULTI_ARG_2_HI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3,  "__builtin_ia32_vpcomgeuw",   IX86_BUILTIN_VPCOMGEUW,   GEU,          (int)MULTI_ARG_2_HI_CMP },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd",   IX86_BUILTIN_VPCOMEQUD,   EQ,           (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud",   IX86_BUILTIN_VPCOMNEUD,   NE,           (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd",  IX86_BUILTIN_VPCOMNEUD,   NE,           (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomltud",   IX86_BUILTIN_VPCOMLTUD,   LTU,          (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomleud",   IX86_BUILTIN_VPCOMLEUD,   LEU,          (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomgtud",   IX86_BUILTIN_VPCOMGTUD,   GTU,          (int)MULTI_ARG_2_SI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3,  "__builtin_ia32_vpcomgeud",   IX86_BUILTIN_VPCOMGEUD,   GEU,          (int)MULTI_ARG_2_SI_CMP },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq",   IX86_BUILTIN_VPCOMEQUQ,   EQ,           (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq",   IX86_BUILTIN_VPCOMNEUQ,   NE,           (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq",  IX86_BUILTIN_VPCOMNEUQ,   NE,           (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomltuq",   IX86_BUILTIN_VPCOMLTUQ,   LTU,          (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomleuq",   IX86_BUILTIN_VPCOMLEUQ,   LEU,          (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomgtuq",   IX86_BUILTIN_VPCOMGTUQ,   GTU,          (int)MULTI_ARG_2_DI_CMP },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3,  "__builtin_ia32_vpcomgeuq",   IX86_BUILTIN_VPCOMGEUQ,   GEU,          (int)MULTI_ARG_2_DI_CMP },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_QI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_HI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_SI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE,   (int)MULTI_ARG_2_DI_TF },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomtrueb",  IX86_BUILTIN_VPCOMTRUEB,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_QI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomtruew",  IX86_BUILTIN_VPCOMTRUEW,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_HI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomtrued",  IX86_BUILTIN_VPCOMTRUED,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomtrueq",  IX86_BUILTIN_VPCOMTRUEQ,  (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3,     "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_QI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,      "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_HI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
+
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3,     "__builtin_ia32_vpermil2pd",  IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3,     "__builtin_ia32_vpermil2ps",  IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3,     "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3,     "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
+
+};
+
+/* TM vector builtins.  */
+
+/* Reuse the existing x86-specific `struct builtin_description' cause
+   we're lazy.  Add casts to make them fit.  */
+static const struct builtin_description bdesc_tm[] =
+{
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
+};
+
+/* TM callbacks.  */
+
+/* Return the builtin decl needed to load a vector of TYPE.  */
+
+static tree
+ix86_builtin_tm_load (tree type)
+{
+  if (TREE_CODE (type) == VECTOR_TYPE)
+    {
+      switch (tree_to_uhwi (TYPE_SIZE (type)))
+	{
+	case 64:
+	  return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
+	case 128:
+	  return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
+	case 256:
+	  return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
+	}
+    }
+  return NULL_TREE;
+}
+
+/* Return the builtin decl needed to store a vector of TYPE.  */
+
+static tree
+ix86_builtin_tm_store (tree type)
+{
+  if (TREE_CODE (type) == VECTOR_TYPE)
+    {
+      switch (tree_to_uhwi (TYPE_SIZE (type)))
+	{
+	case 64:
+	  return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
+	case 128:
+	  return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
+	case 256:
+	  return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
+	}
+    }
+  return NULL_TREE;
+}
+
+/* Initialize the transactional memory vector load/store builtins.  */
+
+static void
+ix86_init_tm_builtins (void)
+{
+  enum ix86_builtin_func_type ftype;
+  const struct builtin_description *d;
+  size_t i;
+  tree decl;
+  tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
+  tree attrs_log, attrs_type_log;
+
+  if (!flag_tm)
+    return;
+
+  /* If there are no builtins defined, we must be compiling in a
+     language without trans-mem support.  */
+  if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
+    return;
+
+  /* Use whatever attributes a normal TM load has.  */
+  decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
+  attrs_load = DECL_ATTRIBUTES (decl);
+  attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
+  /* Use whatever attributes a normal TM store has.  */
+  decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
+  attrs_store = DECL_ATTRIBUTES (decl);
+  attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
+  /* Use whatever attributes a normal TM log has.  */
+  decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
+  attrs_log = DECL_ATTRIBUTES (decl);
+  attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
+
+  for (i = 0, d = bdesc_tm;
+       i < ARRAY_SIZE (bdesc_tm);
+       i++, d++)
+    {
+      if ((d->mask & ix86_isa_flags) != 0
+	  || (lang_hooks.builtin_function
+	      == lang_hooks.builtin_function_ext_scope))
+	{
+	  tree type, attrs, attrs_type;
+	  enum built_in_function code = (enum built_in_function) d->code;
+
+	  ftype = (enum ix86_builtin_func_type) d->flag;
+	  type = ix86_get_builtin_func_type (ftype);
+
+	  if (BUILTIN_TM_LOAD_P (code))
+	    {
+	      attrs = attrs_load;
+	      attrs_type = attrs_type_load;
+	    }
+	  else if (BUILTIN_TM_STORE_P (code))
+	    {
+	      attrs = attrs_store;
+	      attrs_type = attrs_type_store;
+	    }
+	  else
+	    {
+	      attrs = attrs_log;
+	      attrs_type = attrs_type_log;
+	    }
+	  decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
+				       /* The builtin without the prefix for
+					  calling it directly.  */
+				       d->name + strlen ("__builtin_"),
+				       attrs);
+	  /* add_builtin_function() will set the DECL_ATTRIBUTES, now
+	     set the TYPE_ATTRIBUTES.  */
+	  decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
+
+	  set_builtin_decl (code, decl, false);
+	}
+    }
+}
+
+/* Set up all the MMX/SSE builtins, even builtins for instructions that are not
+   in the current target ISA to allow the user to compile particular modules
+   with different target specific options that differ from the command line
+   options.  */
+static void
+ix86_init_mmx_sse_builtins (void)
+{
+  const struct builtin_description * d;
+  enum ix86_builtin_func_type ftype;
+  size_t i;
+
+  /* Add all special builtins with variable number of operands.  */
+  for (i = 0, d = bdesc_special_args;
+       i < ARRAY_SIZE (bdesc_special_args);
+       i++, d++)
+    {
+      if (d->name == 0)
+	continue;
+
+      ftype = (enum ix86_builtin_func_type) d->flag;
+      def_builtin (d->mask, d->name, ftype, d->code);
+    }
+
+  /* Add all builtins with variable number of operands.  */
+  for (i = 0, d = bdesc_args;
+       i < ARRAY_SIZE (bdesc_args);
+       i++, d++)
+    {
+      if (d->name == 0)
+	continue;
+
+      ftype = (enum ix86_builtin_func_type) d->flag;
+      def_builtin_const (d->mask, d->name, ftype, d->code);
+    }
+
+  /* Add all builtins with rounding.  */
+  for (i = 0, d = bdesc_round_args;
+       i < ARRAY_SIZE (bdesc_round_args);
+       i++, d++)
+    {
+      if (d->name == 0)
+	continue;
+
+      ftype = (enum ix86_builtin_func_type) d->flag;
+      def_builtin_const (d->mask, d->name, ftype, d->code);
+    }
+
+  /* pcmpestr[im] insns.  */
+  for (i = 0, d = bdesc_pcmpestr;
+       i < ARRAY_SIZE (bdesc_pcmpestr);
+       i++, d++)
+    {
+      if (d->code == IX86_BUILTIN_PCMPESTRM128)
+	ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
+      else
+	ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
+      def_builtin_const (d->mask, d->name, ftype, d->code);
+    }
+
+  /* pcmpistr[im] insns.  */
+  for (i = 0, d = bdesc_pcmpistr;
+       i < ARRAY_SIZE (bdesc_pcmpistr);
+       i++, d++)
+    {
+      if (d->code == IX86_BUILTIN_PCMPISTRM128)
+	ftype = V16QI_FTYPE_V16QI_V16QI_INT;
+      else
+	ftype = INT_FTYPE_V16QI_V16QI_INT;
+      def_builtin_const (d->mask, d->name, ftype, d->code);
+    }
+
+  /* comi/ucomi insns.  */
+  for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
+    {
+      if (d->mask == OPTION_MASK_ISA_SSE2)
+	ftype = INT_FTYPE_V2DF_V2DF;
+      else
+	ftype = INT_FTYPE_V4SF_V4SF;
+      def_builtin_const (d->mask, d->name, ftype, d->code);
+    }
+
+  /* SSE */
+  def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
+	       VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
+  def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
+	       UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
+
+  /* SSE or 3DNow!A */
+  def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
+	       "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
+	       IX86_BUILTIN_MASKMOVQ);
+
+  /* SSE2 */
+  def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
+	       VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
+
+  def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
+	       VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
+  x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
+			    VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
+
+  /* SSE3.  */
+  def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
+	       VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
+  def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
+	       VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
+
+  /* AES */
+  def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
+		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
+  def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
+		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
+  def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
+		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
+  def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
+		     V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
+  def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
+		     V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
+  def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
+		     V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
+
+  /* PCLMUL */
+  def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
+		     V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
+
+  /* RDRND */
+  def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
+	       INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
+  def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
+	       INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
+  def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
+	       "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
+	       IX86_BUILTIN_RDRAND64_STEP);
+
+  /* AVX2 */
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
+	       V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
+	       IX86_BUILTIN_GATHERSIV2DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
+	       V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
+	       IX86_BUILTIN_GATHERSIV4DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
+	       V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
+	       IX86_BUILTIN_GATHERDIV2DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
+	       V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
+	       IX86_BUILTIN_GATHERDIV4DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
+	       V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
+	       IX86_BUILTIN_GATHERSIV4SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
+	       V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
+	       IX86_BUILTIN_GATHERSIV8SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
+	       V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
+	       IX86_BUILTIN_GATHERDIV4SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
+	       V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
+	       IX86_BUILTIN_GATHERDIV8SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
+	       V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
+	       IX86_BUILTIN_GATHERSIV2DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
+	       V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
+	       IX86_BUILTIN_GATHERSIV4DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
+	       V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
+	       IX86_BUILTIN_GATHERDIV2DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
+	       V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
+	       IX86_BUILTIN_GATHERDIV4DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
+	       V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
+	       IX86_BUILTIN_GATHERSIV4SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
+	       V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
+	       IX86_BUILTIN_GATHERSIV8SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
+	       V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
+	       IX86_BUILTIN_GATHERDIV4SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
+	       V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
+	       IX86_BUILTIN_GATHERDIV8SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
+	       V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
+	       IX86_BUILTIN_GATHERALTSIV4DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
+	       V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
+	       IX86_BUILTIN_GATHERALTDIV8SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
+	       V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
+	       IX86_BUILTIN_GATHERALTSIV4DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
+	       V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
+	       IX86_BUILTIN_GATHERALTDIV8SI);
+
+  /* AVX512F */
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
+	       V16SF_FTYPE_V16SF_PCFLOAT_V16SI_HI_INT,
+	       IX86_BUILTIN_GATHER3SIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
+	       V8DF_FTYPE_V8DF_PCDOUBLE_V8SI_QI_INT,
+	       IX86_BUILTIN_GATHER3SIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
+	       V8SF_FTYPE_V8SF_PCFLOAT_V8DI_QI_INT,
+	       IX86_BUILTIN_GATHER3DIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
+	       V8DF_FTYPE_V8DF_PCDOUBLE_V8DI_QI_INT,
+	       IX86_BUILTIN_GATHER3DIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
+	       V16SI_FTYPE_V16SI_PCINT_V16SI_HI_INT,
+	       IX86_BUILTIN_GATHER3SIV16SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
+	       V8DI_FTYPE_V8DI_PCINT64_V8SI_QI_INT,
+	       IX86_BUILTIN_GATHER3SIV8DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
+	       V8SI_FTYPE_V8SI_PCINT_V8DI_QI_INT,
+	       IX86_BUILTIN_GATHER3DIV16SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
+	       V8DI_FTYPE_V8DI_PCINT64_V8DI_QI_INT,
+	       IX86_BUILTIN_GATHER3DIV8DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
+	       V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
+	       IX86_BUILTIN_GATHER3ALTSIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
+	       V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
+	       IX86_BUILTIN_GATHER3ALTDIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
+	       V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
+	       IX86_BUILTIN_GATHER3ALTSIV8DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
+	       V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
+	       IX86_BUILTIN_GATHER3ALTDIV16SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
+	       VOID_FTYPE_PFLOAT_HI_V16SI_V16SF_INT,
+	       IX86_BUILTIN_SCATTERSIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
+	       VOID_FTYPE_PDOUBLE_QI_V8SI_V8DF_INT,
+	       IX86_BUILTIN_SCATTERSIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
+	       VOID_FTYPE_PFLOAT_QI_V8DI_V8SF_INT,
+	       IX86_BUILTIN_SCATTERDIV16SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
+	       VOID_FTYPE_PDOUBLE_QI_V8DI_V8DF_INT,
+	       IX86_BUILTIN_SCATTERDIV8DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
+	       VOID_FTYPE_PINT_HI_V16SI_V16SI_INT,
+	       IX86_BUILTIN_SCATTERSIV16SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
+	       VOID_FTYPE_PLONGLONG_QI_V8SI_V8DI_INT,
+	       IX86_BUILTIN_SCATTERSIV8DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
+	       VOID_FTYPE_PINT_QI_V8DI_V8SI_INT,
+	       IX86_BUILTIN_SCATTERDIV16SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
+	       VOID_FTYPE_PLONGLONG_QI_V8DI_V8DI_INT,
+	       IX86_BUILTIN_SCATTERDIV8DI);
+
+  /* AVX512PF */
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
+	       VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
+	       IX86_BUILTIN_GATHERPFDPD);
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
+	       VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
+	       IX86_BUILTIN_GATHERPFDPS);
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
+	       VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
+	       IX86_BUILTIN_GATHERPFQPD);
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
+	       VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
+	       IX86_BUILTIN_GATHERPFQPS);
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
+	       VOID_FTYPE_QI_V8SI_PCINT64_INT_INT,
+	       IX86_BUILTIN_SCATTERPFDPD);
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
+	       VOID_FTYPE_HI_V16SI_PCINT_INT_INT,
+	       IX86_BUILTIN_SCATTERPFDPS);
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
+	       VOID_FTYPE_QI_V8DI_PCINT64_INT_INT,
+	       IX86_BUILTIN_SCATTERPFQPD);
+  def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
+	       VOID_FTYPE_QI_V8DI_PCINT_INT_INT,
+	       IX86_BUILTIN_SCATTERPFQPS);
+
+  /* SHA */
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
+		     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
+		     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
+		     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
+		     V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
+		     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
+		     V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
+  def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
+		     V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
+
+  /* RTM.  */
+  def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
+	       VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
+
+  /* MMX access to the vec_init patterns.  */
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
+		     V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
+
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
+		     V4HI_FTYPE_HI_HI_HI_HI,
+		     IX86_BUILTIN_VEC_INIT_V4HI);
+
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
+		     V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
+		     IX86_BUILTIN_VEC_INIT_V8QI);
+
+  /* Access to the vec_extract patterns.  */
+  def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
+		     DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
+  def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
+		     DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
+  def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
+		     FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
+  def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
+		     SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
+  def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
+		     HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
+
+  def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
+		     "__builtin_ia32_vec_ext_v4hi",
+		     HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
+
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
+		     SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
+
+  def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
+		     QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
+
+  /* Access to the vec_set patterns.  */
+  def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
+		     "__builtin_ia32_vec_set_v2di",
+		     V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
+
+  def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
+		     V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
+
+  def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
+		     V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
+
+  def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
+		     V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
+
+  def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
+		     "__builtin_ia32_vec_set_v4hi",
+		     V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
+
+  def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
+		     V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
+
+  /* RDSEED */
+  def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
+	       INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
+  def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
+	       INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
+  def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
+	       "__builtin_ia32_rdseed_di_step",
+	       INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
+
+  /* ADCX */
+  def_builtin (0, "__builtin_ia32_addcarryx_u32",
+	       UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
+  def_builtin (OPTION_MASK_ISA_64BIT,
+	       "__builtin_ia32_addcarryx_u64",
+	       UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
+	       IX86_BUILTIN_ADDCARRYX64);
+
+  /* Read/write FLAGS.  */
+  def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u32",
+               UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
+  def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
+               UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
+  def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u32",
+               VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
+  def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
+               VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
+
+
+  /* Add FMA4 multi-arg argument instructions */
+  for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
+    {
+      if (d->name == 0)
+	continue;
+
+      ftype = (enum ix86_builtin_func_type) d->flag;
+      def_builtin_const (d->mask, d->name, ftype, d->code);
+    }
+}
+
+/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
+   to return a pointer to VERSION_DECL if the outcome of the expression
+   formed by PREDICATE_CHAIN is true.  This function will be called during
+   version dispatch to decide which function version to execute.  It returns
+   the basic block at the end, to which more conditions can be added.  */
+
+static basic_block
+add_condition_to_bb (tree function_decl, tree version_decl,
+		     tree predicate_chain, basic_block new_bb)
+{
+  gimple return_stmt;
+  tree convert_expr, result_var;
+  gimple convert_stmt;
+  gimple call_cond_stmt;
+  gimple if_else_stmt;
+
+  basic_block bb1, bb2, bb3;
+  edge e12, e23;
+
+  tree cond_var, and_expr_var = NULL_TREE;
+  gimple_seq gseq;
+
+  tree predicate_decl, predicate_arg;
+
+  push_cfun (DECL_STRUCT_FUNCTION (function_decl));
+
+  gcc_assert (new_bb != NULL);
+  gseq = bb_seq (new_bb);
+
+
+  convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
+	     		 build_fold_addr_expr (version_decl));
+  result_var = create_tmp_var (ptr_type_node, NULL);
+  convert_stmt = gimple_build_assign (result_var, convert_expr); 
+  return_stmt = gimple_build_return (result_var);
+
+  if (predicate_chain == NULL_TREE)
+    {
+      gimple_seq_add_stmt (&gseq, convert_stmt);
+      gimple_seq_add_stmt (&gseq, return_stmt);
+      set_bb_seq (new_bb, gseq);
+      gimple_set_bb (convert_stmt, new_bb);
+      gimple_set_bb (return_stmt, new_bb);
+      pop_cfun ();
+      return new_bb;
+    }
+
+  while (predicate_chain != NULL)
+    {
+      cond_var = create_tmp_var (integer_type_node, NULL);
+      predicate_decl = TREE_PURPOSE (predicate_chain);
+      predicate_arg = TREE_VALUE (predicate_chain);
+      call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
+      gimple_call_set_lhs (call_cond_stmt, cond_var);
+
+      gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
+      gimple_set_bb (call_cond_stmt, new_bb);
+      gimple_seq_add_stmt (&gseq, call_cond_stmt);
+
+      predicate_chain = TREE_CHAIN (predicate_chain);
+      
+      if (and_expr_var == NULL)
+        and_expr_var = cond_var;
+      else
+	{
+	  gimple assign_stmt;
+	  /* Use MIN_EXPR to check if any integer is zero?.
+	     and_expr_var = min_expr <cond_var, and_expr_var>  */
+	  assign_stmt = gimple_build_assign (and_expr_var,
+			  build2 (MIN_EXPR, integer_type_node,
+				  cond_var, and_expr_var));
+
+	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
+	  gimple_set_bb (assign_stmt, new_bb);
+	  gimple_seq_add_stmt (&gseq, assign_stmt);
+	}
+    }
+
+  if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
+	  		            integer_zero_node,
+				    NULL_TREE, NULL_TREE);
+  gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
+  gimple_set_bb (if_else_stmt, new_bb);
+  gimple_seq_add_stmt (&gseq, if_else_stmt);
+
+  gimple_seq_add_stmt (&gseq, convert_stmt);
+  gimple_seq_add_stmt (&gseq, return_stmt);
+  set_bb_seq (new_bb, gseq);
+
+  bb1 = new_bb;
+  e12 = split_block (bb1, if_else_stmt);
+  bb2 = e12->dest;
+  e12->flags &= ~EDGE_FALLTHRU;
+  e12->flags |= EDGE_TRUE_VALUE;
+
+  e23 = split_block (bb2, return_stmt);
+
+  gimple_set_bb (convert_stmt, bb2);
+  gimple_set_bb (return_stmt, bb2);
+
+  bb3 = e23->dest;
+  make_edge (bb1, bb3, EDGE_FALSE_VALUE); 
+
+  remove_edge (e23);
+  make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
+
+  pop_cfun ();
+
+  return bb3;
+}
+
+/* This parses the attribute arguments to target in DECL and determines
+   the right builtin to use to match the platform specification.
+   It returns the priority value for this version decl.  If PREDICATE_LIST
+   is not NULL, it stores the list of cpu features that need to be checked
+   before dispatching this function.  */
+
+static unsigned int
+get_builtin_code_for_version (tree decl, tree *predicate_list)
+{
+  tree attrs;
+  struct cl_target_option cur_target;
+  tree target_node;
+  struct cl_target_option *new_target;
+  const char *arg_str = NULL;
+  const char *attrs_str = NULL;
+  char *tok_str = NULL;
+  char *token;
+
+  /* Priority of i386 features, greater value is higher priority.   This is
+     used to decide the order in which function dispatch must happen.  For
+     instance, a version specialized for SSE4.2 should be checked for dispatch
+     before a version for SSE3, as SSE4.2 implies SSE3.  */
+  enum feature_priority
+  {
+    P_ZERO = 0,
+    P_MMX,
+    P_SSE,
+    P_SSE2,
+    P_SSE3,
+    P_SSSE3,
+    P_PROC_SSSE3,
+    P_SSE4_A,
+    P_PROC_SSE4_A,
+    P_SSE4_1,
+    P_SSE4_2,
+    P_PROC_SSE4_2,
+    P_POPCNT,
+    P_AVX,
+    P_PROC_AVX,
+    P_FMA4,
+    P_XOP,
+    P_PROC_XOP,
+    P_FMA,    
+    P_PROC_FMA,
+    P_AVX2,
+    P_PROC_AVX2
+  };
+
+ enum feature_priority priority = P_ZERO;
+
+  /* These are the target attribute strings for which a dispatcher is
+     available, from fold_builtin_cpu.  */
+
+  static struct _feature_list
+    {
+      const char *const name;
+      const enum feature_priority priority;
+    }
+  const feature_list[] =
+    {
+      {"mmx", P_MMX},
+      {"sse", P_SSE},
+      {"sse2", P_SSE2},
+      {"sse3", P_SSE3},
+      {"sse4a", P_SSE4_A},
+      {"ssse3", P_SSSE3},
+      {"sse4.1", P_SSE4_1},
+      {"sse4.2", P_SSE4_2},
+      {"popcnt", P_POPCNT},
+      {"avx", P_AVX},
+      {"fma4", P_FMA4},
+      {"xop", P_XOP},
+      {"fma", P_FMA},
+      {"avx2", P_AVX2}
+    };
+
+
+  static unsigned int NUM_FEATURES
+    = sizeof (feature_list) / sizeof (struct _feature_list);
+
+  unsigned int i;
+
+  tree predicate_chain = NULL_TREE;
+  tree predicate_decl, predicate_arg;
+
+  attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
+  gcc_assert (attrs != NULL);
+
+  attrs = TREE_VALUE (TREE_VALUE (attrs));
+
+  gcc_assert (TREE_CODE (attrs) == STRING_CST);
+  attrs_str = TREE_STRING_POINTER (attrs);
+
+  /* Return priority zero for default function.  */
+  if (strcmp (attrs_str, "default") == 0)
+    return 0;
+
+  /* Handle arch= if specified.  For priority, set it to be 1 more than
+     the best instruction set the processor can handle.  For instance, if
+     there is a version for atom and a version for ssse3 (the highest ISA
+     priority for atom), the atom version must be checked for dispatch
+     before the ssse3 version. */
+  if (strstr (attrs_str, "arch=") != NULL)
+    {
+      cl_target_option_save (&cur_target, &global_options);
+      target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
+						      &global_options_set);
+    
+      gcc_assert (target_node);
+      new_target = TREE_TARGET_OPTION (target_node);
+      gcc_assert (new_target);
+      
+      if (new_target->arch_specified && new_target->arch > 0)
+	{
+	  switch (new_target->arch)
+	    {
+	    case PROCESSOR_CORE2:
+	      arg_str = "core2";
+	      priority = P_PROC_SSSE3;
+	      break;
+	    case PROCESSOR_NEHALEM:
+	      if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
+		arg_str = "westmere";
+	      else
+		/* We translate "arch=corei7" and "arch=nehalem" to
+		   "corei7" so that it will be mapped to M_INTEL_COREI7
+		   as cpu type to cover all M_INTEL_COREI7_XXXs.  */
+		arg_str = "corei7";
+	      priority = P_PROC_SSE4_2;
+	      break;
+	    case PROCESSOR_SANDYBRIDGE:
+	      if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
+		arg_str = "ivybridge";
+	      else
+		arg_str = "sandybridge";
+	      priority = P_PROC_AVX;
+	      break;
+	    case PROCESSOR_HASWELL:
+	      if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
+		arg_str = "broadwell";
+	      else
+		arg_str = "haswell";
+	      priority = P_PROC_AVX2;
+	      break;
+	    case PROCESSOR_BONNELL:
+	      arg_str = "bonnell";
+	      priority = P_PROC_SSSE3;
+	      break;
+	    case PROCESSOR_SILVERMONT:
+	      arg_str = "silvermont";
+	      priority = P_PROC_SSE4_2;
+	      break;
+	    case PROCESSOR_AMDFAM10:
+	      arg_str = "amdfam10h";
+	      priority = P_PROC_SSE4_A;
+	      break;
+	    case PROCESSOR_BTVER1:
+	      arg_str = "btver1";
+	      priority = P_PROC_SSE4_A;
+	      break;
+	    case PROCESSOR_BTVER2:
+	      arg_str = "btver2";
+	      priority = P_PROC_AVX;
+	      break;
+	    case PROCESSOR_BDVER1:
+	      arg_str = "bdver1";
+	      priority = P_PROC_XOP;
+	      break;
+	    case PROCESSOR_BDVER2:
+	      arg_str = "bdver2";
+	      priority = P_PROC_FMA;
+	      break;
+	    case PROCESSOR_BDVER3:
+	      arg_str = "bdver3";
+	      priority = P_PROC_FMA;
+	      break;
+	    case PROCESSOR_BDVER4:
+	      arg_str = "bdver4";
+	      priority = P_PROC_AVX2;
+	      break;
+	    }  
+	}    
+    
+      cl_target_option_restore (&global_options, &cur_target);
+	
+      if (predicate_list && arg_str == NULL)
+	{
+	  error_at (DECL_SOURCE_LOCATION (decl),
+	    	"No dispatcher found for the versioning attributes");
+	  return 0;
+	}
+    
+      if (predicate_list)
+	{
+          predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
+          /* For a C string literal the length includes the trailing NULL.  */
+          predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
+          predicate_chain = tree_cons (predicate_decl, predicate_arg,
+				       predicate_chain);
+	}
+    }
+
+  /* Process feature name.  */
+  tok_str =  (char *) xmalloc (strlen (attrs_str) + 1);
+  strcpy (tok_str, attrs_str);
+  token = strtok (tok_str, ",");
+  predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
+
+  while (token != NULL)
+    {
+      /* Do not process "arch="  */
+      if (strncmp (token, "arch=", 5) == 0)
+	{
+	  token = strtok (NULL, ",");
+	  continue;
+	}
+      for (i = 0; i < NUM_FEATURES; ++i)
+	{
+	  if (strcmp (token, feature_list[i].name) == 0)
+	    {
+	      if (predicate_list)
+		{
+		  predicate_arg = build_string_literal (
+				  strlen (feature_list[i].name) + 1,
+				  feature_list[i].name);
+		  predicate_chain = tree_cons (predicate_decl, predicate_arg,
+					       predicate_chain);
+		}
+	      /* Find the maximum priority feature.  */
+	      if (feature_list[i].priority > priority)
+		priority = feature_list[i].priority;
+
+	      break;
+	    }
+	}
+      if (predicate_list && i == NUM_FEATURES)
+	{
+	  error_at (DECL_SOURCE_LOCATION (decl),
+		    "No dispatcher found for %s", token);
+	  return 0;
+	}
+      token = strtok (NULL, ",");
+    }
+  free (tok_str);
+
+  if (predicate_list && predicate_chain == NULL_TREE)
+    {
+      error_at (DECL_SOURCE_LOCATION (decl),
+	        "No dispatcher found for the versioning attributes : %s",
+	        attrs_str);
+      return 0;
+    }
+  else if (predicate_list)
+    {
+      predicate_chain = nreverse (predicate_chain);
+      *predicate_list = predicate_chain;
+    }
+
+  return priority; 
+}
+
+/* This compares the priority of target features in function DECL1
+   and DECL2.  It returns positive value if DECL1 is higher priority,
+   negative value if DECL2 is higher priority and 0 if they are the
+   same.  */
+
+static int
+ix86_compare_version_priority (tree decl1, tree decl2)
+{
+  unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
+  unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
+
+  return (int)priority1 - (int)priority2;
+}
+
+/* V1 and V2 point to function versions with different priorities
+   based on the target ISA.  This function compares their priorities.  */
+ 
+static int
+feature_compare (const void *v1, const void *v2)
+{
+  typedef struct _function_version_info
+    {
+      tree version_decl;
+      tree predicate_chain;
+      unsigned int dispatch_priority;
+    } function_version_info;
+
+  const function_version_info c1 = *(const function_version_info *)v1;
+  const function_version_info c2 = *(const function_version_info *)v2;
+  return (c2.dispatch_priority - c1.dispatch_priority);
+}
+
+/* This function generates the dispatch function for
+   multi-versioned functions.  DISPATCH_DECL is the function which will
+   contain the dispatch logic.  FNDECLS are the function choices for
+   dispatch, and is a tree chain.  EMPTY_BB is the basic block pointer
+   in DISPATCH_DECL in which the dispatch code is generated.  */
+
+static int
+dispatch_function_versions (tree dispatch_decl,
+			    void *fndecls_p,
+			    basic_block *empty_bb)
+{
+  tree default_decl;
+  gimple ifunc_cpu_init_stmt;
+  gimple_seq gseq;
+  int ix;
+  tree ele;
+  vec<tree> *fndecls;
+  unsigned int num_versions = 0;
+  unsigned int actual_versions = 0;
+  unsigned int i;
+
+  struct _function_version_info
+    {
+      tree version_decl;
+      tree predicate_chain;
+      unsigned int dispatch_priority;
+    }*function_version_info;
+
+  gcc_assert (dispatch_decl != NULL
+	      && fndecls_p != NULL
+	      && empty_bb != NULL);
+
+  /*fndecls_p is actually a vector.  */
+  fndecls = static_cast<vec<tree> *> (fndecls_p);
+
+  /* At least one more version other than the default.  */
+  num_versions = fndecls->length ();
+  gcc_assert (num_versions >= 2);
+
+  function_version_info = (struct _function_version_info *)
+    XNEWVEC (struct _function_version_info, (num_versions - 1));
+
+  /* The first version in the vector is the default decl.  */
+  default_decl = (*fndecls)[0];
+
+  push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
+
+  gseq = bb_seq (*empty_bb);
+  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
+     constructors, so explicity call __builtin_cpu_init here.  */
+  ifunc_cpu_init_stmt = gimple_build_call_vec (
+                     ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
+  gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
+  gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
+  set_bb_seq (*empty_bb, gseq);
+
+  pop_cfun ();
+
+
+  for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
+    {
+      tree version_decl = ele;
+      tree predicate_chain = NULL_TREE;
+      unsigned int priority;
+      /* Get attribute string, parse it and find the right predicate decl.
+         The predicate function could be a lengthy combination of many
+	 features, like arch-type and various isa-variants.  */
+      priority = get_builtin_code_for_version (version_decl,
+	 			               &predicate_chain);
+
+      if (predicate_chain == NULL_TREE)
+	continue;
+
+      function_version_info [actual_versions].version_decl = version_decl;
+      function_version_info [actual_versions].predicate_chain
+	 = predicate_chain;
+      function_version_info [actual_versions].dispatch_priority = priority;
+      actual_versions++;
+    }
+
+  /* Sort the versions according to descending order of dispatch priority.  The
+     priority is based on the ISA.  This is not a perfect solution.  There
+     could still be ambiguity.  If more than one function version is suitable
+     to execute,  which one should be dispatched?  In future, allow the user
+     to specify a dispatch  priority next to the version.  */
+  qsort (function_version_info, actual_versions,
+         sizeof (struct _function_version_info), feature_compare);
+
+  for  (i = 0; i < actual_versions; ++i)
+    *empty_bb = add_condition_to_bb (dispatch_decl,
+				     function_version_info[i].version_decl,
+				     function_version_info[i].predicate_chain,
+				     *empty_bb);
+
+  /* dispatch default version at the end.  */
+  *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
+				   NULL, *empty_bb);
+
+  free (function_version_info);
+  return 0;
+}
+
+/* Comparator function to be used in qsort routine to sort attribute
+   specification strings to "target".  */
+
+static int
+attr_strcmp (const void *v1, const void *v2)
+{
+  const char *c1 = *(char *const*)v1;
+  const char *c2 = *(char *const*)v2;
+  return strcmp (c1, c2);
+}
+
+/* ARGLIST is the argument to target attribute.  This function tokenizes
+   the comma separated arguments, sorts them and returns a string which
+   is a unique identifier for the comma separated arguments.   It also
+   replaces non-identifier characters "=,-" with "_".  */
+
+static char *
+sorted_attr_string (tree arglist)
+{
+  tree arg;
+  size_t str_len_sum = 0;
+  char **args = NULL;
+  char *attr_str, *ret_str;
+  char *attr = NULL;
+  unsigned int argnum = 1;
+  unsigned int i;
+
+  for (arg = arglist; arg; arg = TREE_CHAIN (arg))
+    {
+      const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
+      size_t len = strlen (str);
+      str_len_sum += len + 1;
+      if (arg != arglist)
+	argnum++;
+      for (i = 0; i < strlen (str); i++)
+	if (str[i] == ',')
+	  argnum++;
+    }
+
+  attr_str = XNEWVEC (char, str_len_sum);
+  str_len_sum = 0;
+  for (arg = arglist; arg; arg = TREE_CHAIN (arg))
+    {
+      const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
+      size_t len = strlen (str);
+      memcpy (attr_str + str_len_sum, str, len);
+      attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
+      str_len_sum += len + 1;
+    }
+
+  /* Replace "=,-" with "_".  */
+  for (i = 0; i < strlen (attr_str); i++)
+    if (attr_str[i] == '=' || attr_str[i]== '-')
+      attr_str[i] = '_';
+
+  if (argnum == 1)
+    return attr_str;
+
+  args = XNEWVEC (char *, argnum);
+
+  i = 0;
+  attr = strtok (attr_str, ",");
+  while (attr != NULL)
+    {
+      args[i] = attr;
+      i++;
+      attr = strtok (NULL, ",");
+    }
+
+  qsort (args, argnum, sizeof (char *), attr_strcmp);
+
+  ret_str = XNEWVEC (char, str_len_sum);
+  str_len_sum = 0;
+  for (i = 0; i < argnum; i++)
+    {
+      size_t len = strlen (args[i]);
+      memcpy (ret_str + str_len_sum, args[i], len);
+      ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
+      str_len_sum += len + 1;
+    }
+
+  XDELETEVEC (args);
+  XDELETEVEC (attr_str);
+  return ret_str;
+}
+
+/* This function changes the assembler name for functions that are
+   versions.  If DECL is a function version and has a "target"
+   attribute, it appends the attribute string to its assembler name.  */
+
+static tree
+ix86_mangle_function_version_assembler_name (tree decl, tree id)
+{
+  tree version_attr;
+  const char *orig_name, *version_string;
+  char *attr_str, *assembler_name;
+
+  if (DECL_DECLARED_INLINE_P (decl)
+      && lookup_attribute ("gnu_inline",
+			   DECL_ATTRIBUTES (decl)))
+    error_at (DECL_SOURCE_LOCATION (decl),
+	      "Function versions cannot be marked as gnu_inline,"
+	      " bodies have to be generated");
+
+  if (DECL_VIRTUAL_P (decl)
+      || DECL_VINDEX (decl))
+    sorry ("Virtual function multiversioning not supported");
+
+  version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
+
+  /* target attribute string cannot be NULL.  */
+  gcc_assert (version_attr != NULL_TREE);
+
+  orig_name = IDENTIFIER_POINTER (id);
+  version_string
+    = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
+
+  if (strcmp (version_string, "default") == 0)
+    return id;
+
+  attr_str = sorted_attr_string (TREE_VALUE (version_attr));
+  assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
+
+  sprintf (assembler_name, "%s.%s", orig_name, attr_str);
+
+  /* Allow assembler name to be modified if already set.  */
+  if (DECL_ASSEMBLER_NAME_SET_P (decl))
+    SET_DECL_RTL (decl, NULL);
+
+  tree ret = get_identifier (assembler_name);
+  XDELETEVEC (attr_str);
+  XDELETEVEC (assembler_name);
+  return ret;
+}
+
+/* This function returns true if FN1 and FN2 are versions of the same function,
+   that is, the target strings of the function decls are different.  This assumes
+   that FN1 and FN2 have the same signature.  */
+
+static bool
+ix86_function_versions (tree fn1, tree fn2)
+{
+  tree attr1, attr2;
+  char *target1, *target2;
+  bool result;
+
+  if (TREE_CODE (fn1) != FUNCTION_DECL
+      || TREE_CODE (fn2) != FUNCTION_DECL)
+    return false;
+
+  attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
+  attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
+
+  /* At least one function decl should have the target attribute specified.  */
+  if (attr1 == NULL_TREE && attr2 == NULL_TREE)
+    return false;
+
+  /* Diagnose missing target attribute if one of the decls is already
+     multi-versioned.  */
+  if (attr1 == NULL_TREE || attr2 == NULL_TREE)
+    {
+      if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
+	{
+	  if (attr2 != NULL_TREE)
+	    {
+	      tree tem = fn1;
+	      fn1 = fn2;
+	      fn2 = tem;
+	      attr1 = attr2;
+	    }
+	  error_at (DECL_SOURCE_LOCATION (fn2),
+		    "missing %<target%> attribute for multi-versioned %D",
+		    fn2);
+	  inform (DECL_SOURCE_LOCATION (fn1),
+		  "previous declaration of %D", fn1);
+	  /* Prevent diagnosing of the same error multiple times.  */
+	  DECL_ATTRIBUTES (fn2)
+	    = tree_cons (get_identifier ("target"),
+			 copy_node (TREE_VALUE (attr1)),
+			 DECL_ATTRIBUTES (fn2));
+	}
+      return false;
+    }
+
+  target1 = sorted_attr_string (TREE_VALUE (attr1));
+  target2 = sorted_attr_string (TREE_VALUE (attr2));
+
+  /* The sorted target strings must be different for fn1 and fn2
+     to be versions.  */
+  if (strcmp (target1, target2) == 0)
+    result = false;
+  else
+    result = true;
+
+  XDELETEVEC (target1);
+  XDELETEVEC (target2); 
+  
+  return result;
+}
+
+static tree 
+ix86_mangle_decl_assembler_name (tree decl, tree id)
+{
+  /* For function version, add the target suffix to the assembler name.  */
+  if (TREE_CODE (decl) == FUNCTION_DECL
+      && DECL_FUNCTION_VERSIONED (decl))
+    id = ix86_mangle_function_version_assembler_name (decl, id);
+#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
+  id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
+#endif
+
+  return id;
+}
+
+/* Return a new name by appending SUFFIX to the DECL name.  If make_unique
+   is true, append the full path name of the source file.  */
+
+static char *
+make_name (tree decl, const char *suffix, bool make_unique)
+{
+  char *global_var_name;
+  int name_len;
+  const char *name;
+  const char *unique_name = NULL;
+
+  name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
+
+  /* Get a unique name that can be used globally without any chances
+     of collision at link time.  */
+  if (make_unique)
+    unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
+
+  name_len = strlen (name) + strlen (suffix) + 2;
+
+  if (make_unique)
+    name_len += strlen (unique_name) + 1;
+  global_var_name = XNEWVEC (char, name_len);
+
+  /* Use '.' to concatenate names as it is demangler friendly.  */
+  if (make_unique)
+    snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
+	      suffix);
+  else
+    snprintf (global_var_name, name_len, "%s.%s", name, suffix);
+
+  return global_var_name;
+}
+
+#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
+
+/* Make a dispatcher declaration for the multi-versioned function DECL.
+   Calls to DECL function will be replaced with calls to the dispatcher
+   by the front-end.  Return the decl created.  */
+
+static tree
+make_dispatcher_decl (const tree decl)
+{
+  tree func_decl;
+  char *func_name;
+  tree fn_type, func_type;
+  bool is_uniq = false;
+
+  if (TREE_PUBLIC (decl) == 0)
+    is_uniq = true;
+
+  func_name = make_name (decl, "ifunc", is_uniq);
+
+  fn_type = TREE_TYPE (decl);
+  func_type = build_function_type (TREE_TYPE (fn_type),
+				   TYPE_ARG_TYPES (fn_type));
+  
+  func_decl = build_fn_decl (func_name, func_type);
+  XDELETEVEC (func_name);
+  TREE_USED (func_decl) = 1;
+  DECL_CONTEXT (func_decl) = NULL_TREE;
+  DECL_INITIAL (func_decl) = error_mark_node;
+  DECL_ARTIFICIAL (func_decl) = 1;
+  /* Mark this func as external, the resolver will flip it again if
+     it gets generated.  */
+  DECL_EXTERNAL (func_decl) = 1;
+  /* This will be of type IFUNCs have to be externally visible.  */
+  TREE_PUBLIC (func_decl) = 1;
+
+  return func_decl;  
+}
+
+#endif
+
+/* Returns true if decl is multi-versioned and DECL is the default function,
+   that is it is not tagged with target specific optimization.  */
+
+static bool
+is_function_default_version (const tree decl)
+{
+  if (TREE_CODE (decl) != FUNCTION_DECL
+      || !DECL_FUNCTION_VERSIONED (decl))
+    return false;
+  tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
+  gcc_assert (attr);
+  attr = TREE_VALUE (TREE_VALUE (attr));
+  return (TREE_CODE (attr) == STRING_CST
+	  && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
+}
+
+/* Make a dispatcher declaration for the multi-versioned function DECL.
+   Calls to DECL function will be replaced with calls to the dispatcher
+   by the front-end.  Returns the decl of the dispatcher function.  */
+
+static tree
+ix86_get_function_versions_dispatcher (void *decl)
+{
+  tree fn = (tree) decl;
+  struct cgraph_node *node = NULL;
+  struct cgraph_node *default_node = NULL;
+  struct cgraph_function_version_info *node_v = NULL;
+  struct cgraph_function_version_info *first_v = NULL;
+
+  tree dispatch_decl = NULL;
+
+  struct cgraph_function_version_info *default_version_info = NULL;
+ 
+  gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
+
+  node = cgraph_get_node (fn);
+  gcc_assert (node != NULL);
+
+  node_v = get_cgraph_node_version (node);
+  gcc_assert (node_v != NULL);
+ 
+  if (node_v->dispatcher_resolver != NULL)
+    return node_v->dispatcher_resolver;
+
+  /* Find the default version and make it the first node.  */
+  first_v = node_v;
+  /* Go to the beginning of the chain.  */
+  while (first_v->prev != NULL)
+    first_v = first_v->prev;
+  default_version_info = first_v;
+  while (default_version_info != NULL)
+    {
+      if (is_function_default_version
+	    (default_version_info->this_node->decl))
+        break;
+      default_version_info = default_version_info->next;
+    }
+
+  /* If there is no default node, just return NULL.  */
+  if (default_version_info == NULL)
+    return NULL;
+
+  /* Make default info the first node.  */
+  if (first_v != default_version_info)
+    {
+      default_version_info->prev->next = default_version_info->next;
+      if (default_version_info->next)
+        default_version_info->next->prev = default_version_info->prev;
+      first_v->prev = default_version_info;
+      default_version_info->next = first_v;
+      default_version_info->prev = NULL;
+    }
+
+  default_node = default_version_info->this_node;
+
+#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
+  if (targetm.has_ifunc_p ())
+    {
+      struct cgraph_function_version_info *it_v = NULL;
+      struct cgraph_node *dispatcher_node = NULL;
+      struct cgraph_function_version_info *dispatcher_version_info = NULL;
+
+      /* Right now, the dispatching is done via ifunc.  */
+      dispatch_decl = make_dispatcher_decl (default_node->decl);
+
+      dispatcher_node = cgraph_get_create_node (dispatch_decl);
+      gcc_assert (dispatcher_node != NULL);
+      dispatcher_node->dispatcher_function = 1;
+      dispatcher_version_info
+	= insert_new_cgraph_node_version (dispatcher_node);
+      dispatcher_version_info->next = default_version_info;
+      dispatcher_node->definition = 1;
+
+      /* Set the dispatcher for all the versions.  */
+      it_v = default_version_info;
+      while (it_v != NULL)
+	{
+	  it_v->dispatcher_resolver = dispatch_decl;
+	  it_v = it_v->next;
+	}
+    }
+  else
+#endif
+    {
+      error_at (DECL_SOURCE_LOCATION (default_node->decl),
+		"multiversioning needs ifunc which is not supported "
+		"on this target");
+    }
+
+  return dispatch_decl;
+}
+
+/* Makes a function attribute of the form NAME(ARG_NAME) and chains
+   it to CHAIN.  */
+
+static tree
+make_attribute (const char *name, const char *arg_name, tree chain)
+{
+  tree attr_name;
+  tree attr_arg_name;
+  tree attr_args;
+  tree attr;
+
+  attr_name = get_identifier (name);
+  attr_arg_name = build_string (strlen (arg_name), arg_name);
+  attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
+  attr = tree_cons (attr_name, attr_args, chain);
+  return attr;
+}
+
+/* Make the resolver function decl to dispatch the versions of
+   a multi-versioned function,  DEFAULT_DECL.  Create an
+   empty basic block in the resolver and store the pointer in
+   EMPTY_BB.  Return the decl of the resolver function.  */
+
+static tree
+make_resolver_func (const tree default_decl,
+		    const tree dispatch_decl,
+		    basic_block *empty_bb)
+{
+  char *resolver_name;
+  tree decl, type, decl_name, t;
+  bool is_uniq = false;
+
+  /* IFUNC's have to be globally visible.  So, if the default_decl is
+     not, then the name of the IFUNC should be made unique.  */
+  if (TREE_PUBLIC (default_decl) == 0)
+    is_uniq = true;
+
+  /* Append the filename to the resolver function if the versions are
+     not externally visible.  This is because the resolver function has
+     to be externally visible for the loader to find it.  So, appending
+     the filename will prevent conflicts with a resolver function from
+     another module which is based on the same version name.  */
+  resolver_name = make_name (default_decl, "resolver", is_uniq);
+
+  /* The resolver function should return a (void *). */
+  type = build_function_type_list (ptr_type_node, NULL_TREE);
+
+  decl = build_fn_decl (resolver_name, type);
+  decl_name = get_identifier (resolver_name);
+  SET_DECL_ASSEMBLER_NAME (decl, decl_name);
+
+  DECL_NAME (decl) = decl_name;
+  TREE_USED (decl) = 1;
+  DECL_ARTIFICIAL (decl) = 1;
+  DECL_IGNORED_P (decl) = 0;
+  /* IFUNC resolvers have to be externally visible.  */
+  TREE_PUBLIC (decl) = 1;
+  DECL_UNINLINABLE (decl) = 1;
+
+  /* Resolver is not external, body is generated.  */
+  DECL_EXTERNAL (decl) = 0;
+  DECL_EXTERNAL (dispatch_decl) = 0;
+
+  DECL_CONTEXT (decl) = NULL_TREE;
+  DECL_INITIAL (decl) = make_node (BLOCK);
+  DECL_STATIC_CONSTRUCTOR (decl) = 0;
+
+  if (DECL_COMDAT_GROUP (default_decl)
+      || TREE_PUBLIC (default_decl))
+    {
+      /* In this case, each translation unit with a call to this
+	 versioned function will put out a resolver.  Ensure it
+	 is comdat to keep just one copy.  */
+      DECL_COMDAT (decl) = 1;
+      make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
+    }
+  /* Build result decl and add to function_decl. */
+  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
+  DECL_ARTIFICIAL (t) = 1;
+  DECL_IGNORED_P (t) = 1;
+  DECL_RESULT (decl) = t;
+
+  gimplify_function_tree (decl);
+  push_cfun (DECL_STRUCT_FUNCTION (decl));
+  *empty_bb = init_lowered_empty_function (decl, false);
+
+  cgraph_add_new_function (decl, true);
+  cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
+
+  pop_cfun ();
+
+  gcc_assert (dispatch_decl != NULL);
+  /* Mark dispatch_decl as "ifunc" with resolver as resolver_name.  */
+  DECL_ATTRIBUTES (dispatch_decl) 
+    = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
+
+  /* Create the alias for dispatch to resolver here.  */
+  /*cgraph_create_function_alias (dispatch_decl, decl);*/
+  cgraph_same_body_alias (NULL, dispatch_decl, decl);
+  XDELETEVEC (resolver_name);
+  return decl;
+}
+
+/* Generate the dispatching code body to dispatch multi-versioned function
+   DECL.  The target hook is called to process the "target" attributes and
+   provide the code to dispatch the right function at run-time.  NODE points
+   to the dispatcher decl whose body will be created.  */
+
+static tree 
+ix86_generate_version_dispatcher_body (void *node_p)
+{
+  tree resolver_decl;
+  basic_block empty_bb;
+  tree default_ver_decl;
+  struct cgraph_node *versn;
+  struct cgraph_node *node;
+
+  struct cgraph_function_version_info *node_version_info = NULL;
+  struct cgraph_function_version_info *versn_info = NULL;
+
+  node = (cgraph_node *)node_p;
+
+  node_version_info = get_cgraph_node_version (node);
+  gcc_assert (node->dispatcher_function
+	      && node_version_info != NULL);
+
+  if (node_version_info->dispatcher_resolver)
+    return node_version_info->dispatcher_resolver;
+
+  /* The first version in the chain corresponds to the default version.  */
+  default_ver_decl = node_version_info->next->this_node->decl;
+
+  /* node is going to be an alias, so remove the finalized bit.  */
+  node->definition = false;
+
+  resolver_decl = make_resolver_func (default_ver_decl,
+				      node->decl, &empty_bb);
+
+  node_version_info->dispatcher_resolver = resolver_decl;
+
+  push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
+
+  auto_vec<tree, 2> fn_ver_vec;
+
+  for (versn_info = node_version_info->next; versn_info;
+       versn_info = versn_info->next)
+    {
+      versn = versn_info->this_node;
+      /* Check for virtual functions here again, as by this time it should
+	 have been determined if this function needs a vtable index or
+	 not.  This happens for methods in derived classes that override
+	 virtual methods in base classes but are not explicitly marked as
+	 virtual.  */
+      if (DECL_VINDEX (versn->decl))
+	sorry ("Virtual function multiversioning not supported");
+
+      fn_ver_vec.safe_push (versn->decl);
+    }
+
+  dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
+  rebuild_cgraph_edges (); 
+  pop_cfun ();
+  return resolver_decl;
+}
+/* This builds the processor_model struct type defined in
+   libgcc/config/i386/cpuinfo.c  */
+
+static tree
+build_processor_model_struct (void)
+{
+  const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
+			      "__cpu_features"};
+  tree field = NULL_TREE, field_chain = NULL_TREE;
+  int i;
+  tree type = make_node (RECORD_TYPE);
+
+  /* The first 3 fields are unsigned int.  */
+  for (i = 0; i < 3; ++i)
+    {
+      field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+			  get_identifier (field_name[i]), unsigned_type_node);
+      if (field_chain != NULL_TREE)
+	DECL_CHAIN (field) = field_chain;
+      field_chain = field;
+    }
+
+  /* The last field is an array of unsigned integers of size one.  */
+  field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+		      get_identifier (field_name[3]),
+		      build_array_type (unsigned_type_node,
+					build_index_type (size_one_node)));
+  if (field_chain != NULL_TREE)
+    DECL_CHAIN (field) = field_chain;
+  field_chain = field;
+
+  finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
+  return type;
+}
+
+/* Returns a extern, comdat VAR_DECL of type TYPE and name NAME. */
+
+static tree
+make_var_decl (tree type, const char *name)
+{
+  tree new_decl;
+
+  new_decl = build_decl (UNKNOWN_LOCATION,
+	                 VAR_DECL,
+	  	         get_identifier(name),
+		         type);
+
+  DECL_EXTERNAL (new_decl) = 1;
+  TREE_STATIC (new_decl) = 1;
+  TREE_PUBLIC (new_decl) = 1;
+  DECL_INITIAL (new_decl) = 0;
+  DECL_ARTIFICIAL (new_decl) = 0;
+  DECL_PRESERVE_P (new_decl) = 1;
+
+  make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
+  assemble_variable (new_decl, 0, 0, 0);
+
+  return new_decl;
+}
+
+/* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
+   into an integer defined in libgcc/config/i386/cpuinfo.c */
+
+static tree
+fold_builtin_cpu (tree fndecl, tree *args)
+{
+  unsigned int i;
+  enum ix86_builtins fn_code = (enum ix86_builtins)
+				DECL_FUNCTION_CODE (fndecl);
+  tree param_string_cst = NULL;
+
+  /* This is the order of bit-fields in __processor_features in cpuinfo.c */
+  enum processor_features
+  {
+    F_CMOV = 0,
+    F_MMX,
+    F_POPCNT,
+    F_SSE,
+    F_SSE2,
+    F_SSE3,
+    F_SSSE3,
+    F_SSE4_1,
+    F_SSE4_2,
+    F_AVX,
+    F_AVX2,
+    F_SSE4_A,
+    F_FMA4,
+    F_XOP,
+    F_FMA,
+    F_MAX
+  };
+
+  /* These are the values for vendor types and cpu types  and subtypes
+     in cpuinfo.c.  Cpu types and subtypes should be subtracted by
+     the corresponding start value.  */
+  enum processor_model
+  {
+    M_INTEL = 1,
+    M_AMD,
+    M_CPU_TYPE_START,
+    M_INTEL_BONNELL,
+    M_INTEL_CORE2,
+    M_INTEL_COREI7,
+    M_AMDFAM10H,
+    M_AMDFAM15H,
+    M_INTEL_SILVERMONT,
+    M_AMD_BTVER1,
+    M_AMD_BTVER2,    
+    M_CPU_SUBTYPE_START,
+    M_INTEL_COREI7_NEHALEM,
+    M_INTEL_COREI7_WESTMERE,
+    M_INTEL_COREI7_SANDYBRIDGE,
+    M_AMDFAM10H_BARCELONA,
+    M_AMDFAM10H_SHANGHAI,
+    M_AMDFAM10H_ISTANBUL,
+    M_AMDFAM15H_BDVER1,
+    M_AMDFAM15H_BDVER2,
+    M_AMDFAM15H_BDVER3,
+    M_AMDFAM15H_BDVER4,
+    M_INTEL_COREI7_IVYBRIDGE,
+    M_INTEL_COREI7_HASWELL
+  };
+
+  static struct _arch_names_table
+    {
+      const char *const name;
+      const enum processor_model model;
+    }
+  const arch_names_table[] =
+    {
+      {"amd", M_AMD},
+      {"intel", M_INTEL},
+      {"atom", M_INTEL_BONNELL},
+      {"slm", M_INTEL_SILVERMONT},
+      {"core2", M_INTEL_CORE2},
+      {"corei7", M_INTEL_COREI7},
+      {"nehalem", M_INTEL_COREI7_NEHALEM},
+      {"westmere", M_INTEL_COREI7_WESTMERE},
+      {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
+      {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
+      {"haswell", M_INTEL_COREI7_HASWELL},
+      {"bonnell", M_INTEL_BONNELL},
+      {"silvermont", M_INTEL_SILVERMONT},
+      {"amdfam10h", M_AMDFAM10H},
+      {"barcelona", M_AMDFAM10H_BARCELONA},
+      {"shanghai", M_AMDFAM10H_SHANGHAI},
+      {"istanbul", M_AMDFAM10H_ISTANBUL},
+      {"btver1", M_AMD_BTVER1},      
+      {"amdfam15h", M_AMDFAM15H},
+      {"bdver1", M_AMDFAM15H_BDVER1},
+      {"bdver2", M_AMDFAM15H_BDVER2},
+      {"bdver3", M_AMDFAM15H_BDVER3},
+      {"bdver4", M_AMDFAM15H_BDVER4},
+      {"btver2", M_AMD_BTVER2},      
+    };
+
+  static struct _isa_names_table
+    {
+      const char *const name;
+      const enum processor_features feature;
+    }
+  const isa_names_table[] =
+    {
+      {"cmov",   F_CMOV},
+      {"mmx",    F_MMX},
+      {"popcnt", F_POPCNT},
+      {"sse",    F_SSE},
+      {"sse2",   F_SSE2},
+      {"sse3",   F_SSE3},
+      {"ssse3",  F_SSSE3},
+      {"sse4a",  F_SSE4_A},
+      {"sse4.1", F_SSE4_1},
+      {"sse4.2", F_SSE4_2},
+      {"avx",    F_AVX},
+      {"fma4",   F_FMA4},
+      {"xop",    F_XOP},
+      {"fma",    F_FMA},
+      {"avx2",   F_AVX2}
+    };
+
+  tree __processor_model_type = build_processor_model_struct ();
+  tree __cpu_model_var = make_var_decl (__processor_model_type,
+					"__cpu_model");
+
+
+  varpool_add_new_variable (__cpu_model_var);
+
+  gcc_assert ((args != NULL) && (*args != NULL));
+
+  param_string_cst = *args;
+  while (param_string_cst
+	 && TREE_CODE (param_string_cst) !=  STRING_CST)
+    {
+      /* *args must be a expr that can contain other EXPRS leading to a
+	 STRING_CST.   */
+      if (!EXPR_P (param_string_cst))
+ 	{
+	  error ("Parameter to builtin must be a string constant or literal");
+	  return integer_zero_node;
+	}
+      param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
+    }
+
+  gcc_assert (param_string_cst);
+
+  if (fn_code == IX86_BUILTIN_CPU_IS)
+    {
+      tree ref;
+      tree field;
+      tree final;
+
+      unsigned int field_val = 0;
+      unsigned int NUM_ARCH_NAMES
+	= sizeof (arch_names_table) / sizeof (struct _arch_names_table);
+
+      for (i = 0; i < NUM_ARCH_NAMES; i++)
+	if (strcmp (arch_names_table[i].name,
+	    TREE_STRING_POINTER (param_string_cst)) == 0)
+	  break;
+
+      if (i == NUM_ARCH_NAMES)
+	{
+	  error ("Parameter to builtin not valid: %s",
+	         TREE_STRING_POINTER (param_string_cst));
+	  return integer_zero_node;
+	}
+
+      field = TYPE_FIELDS (__processor_model_type);
+      field_val = arch_names_table[i].model;
+
+      /* CPU types are stored in the next field.  */
+      if (field_val > M_CPU_TYPE_START
+	  && field_val < M_CPU_SUBTYPE_START)
+	{
+	  field = DECL_CHAIN (field);
+	  field_val -= M_CPU_TYPE_START;
+	}
+
+      /* CPU subtypes are stored in the next field.  */
+      if (field_val > M_CPU_SUBTYPE_START)
+	{
+	  field = DECL_CHAIN ( DECL_CHAIN (field));
+	  field_val -= M_CPU_SUBTYPE_START;
+	}
+
+      /* Get the appropriate field in __cpu_model.  */
+      ref =  build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
+		     field, NULL_TREE);
+
+      /* Check the value.  */
+      final = build2 (EQ_EXPR, unsigned_type_node, ref,
+		      build_int_cstu (unsigned_type_node, field_val));
+      return build1 (CONVERT_EXPR, integer_type_node, final);
+    }
+  else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
+    {
+      tree ref;
+      tree array_elt;
+      tree field;
+      tree final;
+
+      unsigned int field_val = 0;
+      unsigned int NUM_ISA_NAMES
+	= sizeof (isa_names_table) / sizeof (struct _isa_names_table);
+
+      for (i = 0; i < NUM_ISA_NAMES; i++)
+	if (strcmp (isa_names_table[i].name,
+	    TREE_STRING_POINTER (param_string_cst)) == 0)
+	  break;
+
+      if (i == NUM_ISA_NAMES)
+	{
+	  error ("Parameter to builtin not valid: %s",
+	       	 TREE_STRING_POINTER (param_string_cst));
+	  return integer_zero_node;
+	}
+
+      field = TYPE_FIELDS (__processor_model_type);
+      /* Get the last field, which is __cpu_features.  */
+      while (DECL_CHAIN (field))
+        field = DECL_CHAIN (field);
+
+      /* Get the appropriate field: __cpu_model.__cpu_features  */
+      ref =  build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
+		     field, NULL_TREE);
+
+      /* Access the 0th element of __cpu_features array.  */
+      array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
+			  integer_zero_node, NULL_TREE, NULL_TREE);
+
+      field_val = (1 << isa_names_table[i].feature);
+      /* Return __cpu_model.__cpu_features[0] & field_val  */
+      final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
+		      build_int_cstu (unsigned_type_node, field_val));
+      return build1 (CONVERT_EXPR, integer_type_node, final);
+    }
+  gcc_unreachable ();
+}
+
+static tree
+ix86_fold_builtin (tree fndecl, int n_args,
+		   tree *args, bool ignore ATTRIBUTE_UNUSED)
+{
+  if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
+    {
+      enum ix86_builtins fn_code = (enum ix86_builtins)
+				   DECL_FUNCTION_CODE (fndecl);
+      if (fn_code ==  IX86_BUILTIN_CPU_IS
+	  || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
+	{
+	  gcc_assert (n_args == 1);
+          return fold_builtin_cpu (fndecl, args);
+	}
+    }
+
+#ifdef SUBTARGET_FOLD_BUILTIN
+  return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
+#endif
+
+  return NULL_TREE;
+}
+
+/* Make builtins to detect cpu type and features supported.  NAME is
+   the builtin name, CODE is the builtin code, and FTYPE is the function
+   type of the builtin.  */
+
+static void
+make_cpu_type_builtin (const char* name, int code,
+		       enum ix86_builtin_func_type ftype, bool is_const)
+{
+  tree decl;
+  tree type;
+
+  type = ix86_get_builtin_func_type (ftype);
+  decl = add_builtin_function (name, type, code, BUILT_IN_MD,
+			       NULL, NULL_TREE);
+  gcc_assert (decl != NULL_TREE);
+  ix86_builtins[(int) code] = decl;
+  TREE_READONLY (decl) = is_const;
+}
+
+/* Make builtins to get CPU type and features supported.  The created
+   builtins are :
+
+   __builtin_cpu_init (), to detect cpu type and features,
+   __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
+   __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
+   */
+
+static void
+ix86_init_platform_type_builtins (void)
+{
+  make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
+			 INT_FTYPE_VOID, false);
+  make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
+			 INT_FTYPE_PCCHAR, true);
+  make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
+			 INT_FTYPE_PCCHAR, true);
+}
+
+/* Internal method for ix86_init_builtins.  */
+
+static void
+ix86_init_builtins_va_builtins_abi (void)
+{
+  tree ms_va_ref, sysv_va_ref;
+  tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
+  tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
+  tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
+  tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
+
+  if (!TARGET_64BIT)
+    return;
+  fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
+  fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
+  ms_va_ref = build_reference_type (ms_va_list_type_node);
+  sysv_va_ref =
+    build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
+
+  fnvoid_va_end_ms =
+    build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
+  fnvoid_va_start_ms =
+    build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
+  fnvoid_va_end_sysv =
+    build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
+  fnvoid_va_start_sysv =
+    build_varargs_function_type_list (void_type_node, sysv_va_ref,
+    				       NULL_TREE);
+  fnvoid_va_copy_ms =
+    build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
+    			      NULL_TREE);
+  fnvoid_va_copy_sysv =
+    build_function_type_list (void_type_node, sysv_va_ref,
+    			      sysv_va_ref, NULL_TREE);
+
+  add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
+  			BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
+  add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
+  			BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
+  add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
+			BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
+  add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
+  			BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
+  add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
+  			BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
+  add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
+			BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
+}
+
+static void
+ix86_init_builtin_types (void)
+{
+  tree float128_type_node, float80_type_node;
+
+  /* The __float80 type.  */
+  float80_type_node = long_double_type_node;
+  if (TYPE_MODE (float80_type_node) != XFmode)
+    {
+      /* The __float80 type.  */
+      float80_type_node = make_node (REAL_TYPE);
+
+      TYPE_PRECISION (float80_type_node) = 80;
+      layout_type (float80_type_node);
+    }
+  lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
+
+  /* The __float128 type.  */
+  float128_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (float128_type_node) = 128;
+  layout_type (float128_type_node);
+  lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
+
+  /* This macro is built by i386-builtin-types.awk.  */
+  DEFINE_BUILTIN_PRIMITIVE_TYPES;
+}
+
+static void
+ix86_init_builtins (void)
+{
+  tree t;
+
+  ix86_init_builtin_types ();
+
+  /* Builtins to get CPU type and features. */
+  ix86_init_platform_type_builtins ();
+
+  /* TFmode support builtins.  */
+  def_builtin_const (0, "__builtin_infq",
+		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
+  def_builtin_const (0, "__builtin_huge_valq",
+		     FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
+
+  /* We will expand them to normal call if SSE isn't available since
+     they are used by libgcc. */
+  t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
+  t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
+			    BUILT_IN_MD, "__fabstf2", NULL_TREE);
+  TREE_READONLY (t) = 1;
+  ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
+
+  t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
+  t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
+			    BUILT_IN_MD, "__copysigntf3", NULL_TREE);
+  TREE_READONLY (t) = 1;
+  ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
+
+  ix86_init_tm_builtins ();
+  ix86_init_mmx_sse_builtins ();
+
+  if (TARGET_LP64)
+    ix86_init_builtins_va_builtins_abi ();
+
+#ifdef SUBTARGET_INIT_BUILTINS
+  SUBTARGET_INIT_BUILTINS;
+#endif
+}
+
+/* Return the ix86 builtin for CODE.  */
+
+static tree
+ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
+{
+  if (code >= IX86_BUILTIN_MAX)
+    return error_mark_node;
+
+  return ix86_builtins[code];
+}
+
+/* Errors in the source file can cause expand_expr to return const0_rtx
+   where we expect a vector.  To avoid crashing, use one of the vector
+   clear instructions.  */
+static rtx
+safe_vector_operand (rtx x, enum machine_mode mode)
+{
+  if (x == const0_rtx)
+    x = CONST0_RTX (mode);
+  return x;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of binop insns.  */
+
+static rtx
+ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  enum machine_mode tmode = insn_data[icode].operand[0].mode;
+  enum machine_mode mode0 = insn_data[icode].operand[1].mode;
+  enum machine_mode mode1 = insn_data[icode].operand[2].mode;
+
+  if (VECTOR_MODE_P (mode0))
+    op0 = safe_vector_operand (op0, mode0);
+  if (VECTOR_MODE_P (mode1))
+    op1 = safe_vector_operand (op1, mode1);
+
+  if (optimize || !target
+      || GET_MODE (target) != tmode
+      || !insn_data[icode].operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  if (GET_MODE (op1) == SImode && mode1 == TImode)
+    {
+      rtx x = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_loadd (x, op1));
+      op1 = gen_lowpart (TImode, x);
+    }
+
+  if (!insn_data[icode].operand[1].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if (!insn_data[icode].operand[2].predicate (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  pat = GEN_FCN (icode) (target, op0, op1);
+  if (! pat)
+    return 0;
+
+  emit_insn (pat);
+
+  return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */
+
+static rtx
+ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
+			       enum ix86_builtin_func_type m_type,
+			       enum rtx_code sub_code)
+{
+  rtx pat;
+  int i;
+  int nargs;
+  bool comparison_p = false;
+  bool tf_p = false;
+  bool last_arg_constant = false;
+  int num_memory = 0;
+  struct {
+    rtx op;
+    enum machine_mode mode;
+  } args[4];
+
+  enum machine_mode tmode = insn_data[icode].operand[0].mode;
+
+  switch (m_type)
+    {
+    case MULTI_ARG_4_DF2_DI_I:
+    case MULTI_ARG_4_DF2_DI_I1:
+    case MULTI_ARG_4_SF2_SI_I:
+    case MULTI_ARG_4_SF2_SI_I1:
+      nargs = 4;
+      last_arg_constant = true;
+      break;
+
+    case MULTI_ARG_3_SF:
+    case MULTI_ARG_3_DF:
+    case MULTI_ARG_3_SF2:
+    case MULTI_ARG_3_DF2:
+    case MULTI_ARG_3_DI:
+    case MULTI_ARG_3_SI:
+    case MULTI_ARG_3_SI_DI:
+    case MULTI_ARG_3_HI:
+    case MULTI_ARG_3_HI_SI:
+    case MULTI_ARG_3_QI:
+    case MULTI_ARG_3_DI2:
+    case MULTI_ARG_3_SI2:
+    case MULTI_ARG_3_HI2:
+    case MULTI_ARG_3_QI2:
+      nargs = 3;
+      break;
+
+    case MULTI_ARG_2_SF:
+    case MULTI_ARG_2_DF:
+    case MULTI_ARG_2_DI:
+    case MULTI_ARG_2_SI:
+    case MULTI_ARG_2_HI:
+    case MULTI_ARG_2_QI:
+      nargs = 2;
+      break;
+
+    case MULTI_ARG_2_DI_IMM:
+    case MULTI_ARG_2_SI_IMM:
+    case MULTI_ARG_2_HI_IMM:
+    case MULTI_ARG_2_QI_IMM:
+      nargs = 2;
+      last_arg_constant = true;
+      break;
+
+    case MULTI_ARG_1_SF:
+    case MULTI_ARG_1_DF:
+    case MULTI_ARG_1_SF2:
+    case MULTI_ARG_1_DF2:
+    case MULTI_ARG_1_DI:
+    case MULTI_ARG_1_SI:
+    case MULTI_ARG_1_HI:
+    case MULTI_ARG_1_QI:
+    case MULTI_ARG_1_SI_DI:
+    case MULTI_ARG_1_HI_DI:
+    case MULTI_ARG_1_HI_SI:
+    case MULTI_ARG_1_QI_DI:
+    case MULTI_ARG_1_QI_SI:
+    case MULTI_ARG_1_QI_HI:
+      nargs = 1;
+      break;
+
+    case MULTI_ARG_2_DI_CMP:
+    case MULTI_ARG_2_SI_CMP:
+    case MULTI_ARG_2_HI_CMP:
+    case MULTI_ARG_2_QI_CMP:
+      nargs = 2;
+      comparison_p = true;
+      break;
+
+    case MULTI_ARG_2_SF_TF:
+    case MULTI_ARG_2_DF_TF:
+    case MULTI_ARG_2_DI_TF:
+    case MULTI_ARG_2_SI_TF:
+    case MULTI_ARG_2_HI_TF:
+    case MULTI_ARG_2_QI_TF:
+      nargs = 2;
+      tf_p = true;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (optimize || !target
+      || GET_MODE (target) != tmode
+      || !insn_data[icode].operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  gcc_assert (nargs <= 4);
+
+  for (i = 0; i < nargs; i++)
+    {
+      tree arg = CALL_EXPR_ARG (exp, i);
+      rtx op = expand_normal (arg);
+      int adjust = (comparison_p) ? 1 : 0;
+      enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
+
+      if (last_arg_constant && i == nargs - 1)
+	{
+	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
+	    {
+	      enum insn_code new_icode = icode;
+	      switch (icode)
+		{
+		case CODE_FOR_xop_vpermil2v2df3:
+		case CODE_FOR_xop_vpermil2v4sf3:
+		case CODE_FOR_xop_vpermil2v4df3:
+		case CODE_FOR_xop_vpermil2v8sf3:
+		  error ("the last argument must be a 2-bit immediate");
+		  return gen_reg_rtx (tmode);
+		case CODE_FOR_xop_rotlv2di3:
+		  new_icode = CODE_FOR_rotlv2di3;
+		  goto xop_rotl;
+		case CODE_FOR_xop_rotlv4si3:
+		  new_icode = CODE_FOR_rotlv4si3;
+		  goto xop_rotl;
+		case CODE_FOR_xop_rotlv8hi3:
+		  new_icode = CODE_FOR_rotlv8hi3;
+		  goto xop_rotl;
+		case CODE_FOR_xop_rotlv16qi3:
+		  new_icode = CODE_FOR_rotlv16qi3;
+		xop_rotl:
+		  if (CONST_INT_P (op))
+		    {
+		      int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
+		      op = GEN_INT (INTVAL (op) & mask);
+		      gcc_checking_assert
+			(insn_data[icode].operand[i + 1].predicate (op, mode));
+		    }
+		  else
+		    {
+		      gcc_checking_assert
+			(nargs == 2
+			 && insn_data[new_icode].operand[0].mode == tmode
+			 && insn_data[new_icode].operand[1].mode == tmode
+			 && insn_data[new_icode].operand[2].mode == mode
+			 && insn_data[new_icode].operand[0].predicate
+			    == insn_data[icode].operand[0].predicate
+			 && insn_data[new_icode].operand[1].predicate
+			    == insn_data[icode].operand[1].predicate);
+		      icode = new_icode;
+		      goto non_constant;
+		    }
+		  break;
+		default:
+		  gcc_unreachable ();
+		}
+	    }
+	}
+      else
+	{
+	non_constant:
+	  if (VECTOR_MODE_P (mode))
+	    op = safe_vector_operand (op, mode);
+
+	  /* If we aren't optimizing, only allow one memory operand to be
+	     generated.  */
+	  if (memory_operand (op, mode))
+	    num_memory++;
+
+	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
+
+	  if (optimize
+	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
+	      || num_memory > 1)
+	    op = force_reg (mode, op);
+	}
+
+      args[i].op = op;
+      args[i].mode = mode;
+    }
+
+  switch (nargs)
+    {
+    case 1:
+      pat = GEN_FCN (icode) (target, args[0].op);
+      break;
+
+    case 2:
+      if (tf_p)
+	pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+			       GEN_INT ((int)sub_code));
+      else if (! comparison_p)
+	pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+      else
+	{
+	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
+				       args[0].op,
+				       args[1].op);
+
+	  pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
+	}
+      break;
+
+    case 3:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
+      break;
+
+    case 4:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  if (! pat)
+    return 0;
+
+  emit_insn (pat);
+  return target;
+}
+
+/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
+   insns with vec_merge.  */
+
+static rtx
+ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
+				    rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op1, op0 = expand_normal (arg0);
+  enum machine_mode tmode = insn_data[icode].operand[0].mode;
+  enum machine_mode mode0 = insn_data[icode].operand[1].mode;
+
+  if (optimize || !target
+      || GET_MODE (target) != tmode
+      || !insn_data[icode].operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  if (VECTOR_MODE_P (mode0))
+    op0 = safe_vector_operand (op0, mode0);
+
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_data[icode].operand[1].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+
+  op1 = op0;
+  if (!insn_data[icode].operand[2].predicate (op1, mode0))
+    op1 = copy_to_mode_reg (mode0, op1);
+
+  pat = GEN_FCN (icode) (target, op0, op1);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
+
+static rtx
+ix86_expand_sse_compare (const struct builtin_description *d,
+			 tree exp, rtx target, bool swap)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  rtx op2;
+  enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
+  enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
+  enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
+  enum rtx_code comparison = d->comparison;
+
+  if (VECTOR_MODE_P (mode0))
+    op0 = safe_vector_operand (op0, mode0);
+  if (VECTOR_MODE_P (mode1))
+    op1 = safe_vector_operand (op1, mode1);
+
+  /* Swap operands if we have a comparison that isn't available in
+     hardware.  */
+  if (swap)
+    {
+      rtx tmp = gen_reg_rtx (mode1);
+      emit_move_insn (tmp, op1);
+      op1 = op0;
+      op0 = tmp;
+    }
+
+  if (optimize || !target
+      || GET_MODE (target) != tmode
+      || !insn_data[d->icode].operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if ((optimize && !register_operand (op1, mode1))
+      || !insn_data[d->icode].operand[2].predicate (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
+  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of comi insns.  */
+
+static rtx
+ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
+		      rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
+  enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
+  enum rtx_code comparison = d->comparison;
+
+  if (VECTOR_MODE_P (mode0))
+    op0 = safe_vector_operand (op0, mode0);
+  if (VECTOR_MODE_P (mode1))
+    op1 = safe_vector_operand (op1, mode1);
+
+  /* Swap operands if we have a comparison that isn't available in
+     hardware.  */
+  if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
+    {
+      rtx tmp = op1;
+      op1 = op0;
+      op0 = tmp;
+    }
+
+  target = gen_reg_rtx (SImode);
+  emit_move_insn (target, const0_rtx);
+  target = gen_rtx_SUBREG (QImode, target, 0);
+
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if ((optimize && !register_operand (op1, mode1))
+      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  pat = GEN_FCN (d->icode) (op0, op1);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  emit_insn (gen_rtx_SET (VOIDmode,
+			  gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+			  gen_rtx_fmt_ee (comparison, QImode,
+					  SET_DEST (pat),
+					  const0_rtx)));
+
+  return SUBREG_REG (target);
+}
+
+/* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
+
+static rtx
+ix86_expand_sse_round (const struct builtin_description *d, tree exp,
+		       rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op1, op0 = expand_normal (arg0);
+  enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
+  enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
+
+  if (optimize || target == 0
+      || GET_MODE (target) != tmode
+      || !insn_data[d->icode].operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  if (VECTOR_MODE_P (mode0))
+    op0 = safe_vector_operand (op0, mode0);
+
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+
+  op1 = GEN_INT (d->comparison);
+
+  pat = GEN_FCN (d->icode) (target, op0, op1);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return target;
+}
+
+static rtx
+ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
+				     tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  rtx op2;
+  enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
+  enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
+  enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
+
+  if (optimize || target == 0
+      || GET_MODE (target) != tmode
+      || !insn_data[d->icode].operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  op0 = safe_vector_operand (op0, mode0);
+  op1 = safe_vector_operand (op1, mode1);
+
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if ((optimize && !register_operand (op1, mode1))
+      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  op2 = GEN_INT (d->comparison);
+
+  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
+
+static rtx
+ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
+		       rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
+  enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
+  enum rtx_code comparison = d->comparison;
+
+  if (VECTOR_MODE_P (mode0))
+    op0 = safe_vector_operand (op0, mode0);
+  if (VECTOR_MODE_P (mode1))
+    op1 = safe_vector_operand (op1, mode1);
+
+  target = gen_reg_rtx (SImode);
+  emit_move_insn (target, const0_rtx);
+  target = gen_rtx_SUBREG (QImode, target, 0);
+
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if ((optimize && !register_operand (op1, mode1))
+      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  pat = GEN_FCN (d->icode) (op0, op1);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  emit_insn (gen_rtx_SET (VOIDmode,
+			  gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+			  gen_rtx_fmt_ee (comparison, QImode,
+					  SET_DEST (pat),
+					  const0_rtx)));
+
+  return SUBREG_REG (target);
+}
+
+/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */
+
+static rtx
+ix86_expand_sse_pcmpestr (const struct builtin_description *d,
+			  tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  tree arg2 = CALL_EXPR_ARG (exp, 2);
+  tree arg3 = CALL_EXPR_ARG (exp, 3);
+  tree arg4 = CALL_EXPR_ARG (exp, 4);
+  rtx scratch0, scratch1;
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  rtx op2 = expand_normal (arg2);
+  rtx op3 = expand_normal (arg3);
+  rtx op4 = expand_normal (arg4);
+  enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
+
+  tmode0 = insn_data[d->icode].operand[0].mode;
+  tmode1 = insn_data[d->icode].operand[1].mode;
+  modev2 = insn_data[d->icode].operand[2].mode;
+  modei3 = insn_data[d->icode].operand[3].mode;
+  modev4 = insn_data[d->icode].operand[4].mode;
+  modei5 = insn_data[d->icode].operand[5].mode;
+  modeimm = insn_data[d->icode].operand[6].mode;
+
+  if (VECTOR_MODE_P (modev2))
+    op0 = safe_vector_operand (op0, modev2);
+  if (VECTOR_MODE_P (modev4))
+    op2 = safe_vector_operand (op2, modev4);
+
+  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
+    op0 = copy_to_mode_reg (modev2, op0);
+  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
+    op1 = copy_to_mode_reg (modei3, op1);
+  if ((optimize && !register_operand (op2, modev4))
+      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
+    op2 = copy_to_mode_reg (modev4, op2);
+  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
+    op3 = copy_to_mode_reg (modei5, op3);
+
+  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
+    {
+      error ("the fifth argument must be an 8-bit immediate");
+      return const0_rtx;
+    }
+
+  if (d->code == IX86_BUILTIN_PCMPESTRI128)
+    {
+      if (optimize || !target
+	  || GET_MODE (target) != tmode0
+	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
+	target = gen_reg_rtx (tmode0);
+
+      scratch1 = gen_reg_rtx (tmode1);
+
+      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
+    }
+  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
+    {
+      if (optimize || !target
+	  || GET_MODE (target) != tmode1
+	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
+	target = gen_reg_rtx (tmode1);
+
+      scratch0 = gen_reg_rtx (tmode0);
+
+      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
+    }
+  else
+    {
+      gcc_assert (d->flag);
+
+      scratch0 = gen_reg_rtx (tmode0);
+      scratch1 = gen_reg_rtx (tmode1);
+
+      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
+    }
+
+  if (! pat)
+    return 0;
+
+  emit_insn (pat);
+
+  if (d->flag)
+    {
+      target = gen_reg_rtx (SImode);
+      emit_move_insn (target, const0_rtx);
+      target = gen_rtx_SUBREG (QImode, target, 0);
+
+      emit_insn
+	(gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+		      gen_rtx_fmt_ee (EQ, QImode,
+				      gen_rtx_REG ((enum machine_mode) d->flag,
+						   FLAGS_REG),
+				      const0_rtx)));
+      return SUBREG_REG (target);
+    }
+  else
+    return target;
+}
+
+
+/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */
+
+static rtx
+ix86_expand_sse_pcmpistr (const struct builtin_description *d,
+			  tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  tree arg2 = CALL_EXPR_ARG (exp, 2);
+  rtx scratch0, scratch1;
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  rtx op2 = expand_normal (arg2);
+  enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
+
+  tmode0 = insn_data[d->icode].operand[0].mode;
+  tmode1 = insn_data[d->icode].operand[1].mode;
+  modev2 = insn_data[d->icode].operand[2].mode;
+  modev3 = insn_data[d->icode].operand[3].mode;
+  modeimm = insn_data[d->icode].operand[4].mode;
+
+  if (VECTOR_MODE_P (modev2))
+    op0 = safe_vector_operand (op0, modev2);
+  if (VECTOR_MODE_P (modev3))
+    op1 = safe_vector_operand (op1, modev3);
+
+  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
+    op0 = copy_to_mode_reg (modev2, op0);
+  if ((optimize && !register_operand (op1, modev3))
+      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
+    op1 = copy_to_mode_reg (modev3, op1);
+
+  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
+    {
+      error ("the third argument must be an 8-bit immediate");
+      return const0_rtx;
+    }
+
+  if (d->code == IX86_BUILTIN_PCMPISTRI128)
+    {
+      if (optimize || !target
+	  || GET_MODE (target) != tmode0
+	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
+	target = gen_reg_rtx (tmode0);
+
+      scratch1 = gen_reg_rtx (tmode1);
+
+      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
+    }
+  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
+    {
+      if (optimize || !target
+	  || GET_MODE (target) != tmode1
+	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
+	target = gen_reg_rtx (tmode1);
+
+      scratch0 = gen_reg_rtx (tmode0);
+
+      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
+    }
+  else
+    {
+      gcc_assert (d->flag);
+
+      scratch0 = gen_reg_rtx (tmode0);
+      scratch1 = gen_reg_rtx (tmode1);
+
+      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
+    }
+
+  if (! pat)
+    return 0;
+
+  emit_insn (pat);
+
+  if (d->flag)
+    {
+      target = gen_reg_rtx (SImode);
+      emit_move_insn (target, const0_rtx);
+      target = gen_rtx_SUBREG (QImode, target, 0);
+
+      emit_insn
+	(gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+		      gen_rtx_fmt_ee (EQ, QImode,
+				      gen_rtx_REG ((enum machine_mode) d->flag,
+						   FLAGS_REG),
+				      const0_rtx)));
+      return SUBREG_REG (target);
+    }
+  else
+    return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of insns with
+   variable number of operands.  */
+
+static rtx
+ix86_expand_args_builtin (const struct builtin_description *d,
+			  tree exp, rtx target)
+{
+  rtx pat, real_target;
+  unsigned int i, nargs;
+  unsigned int nargs_constant = 0;
+  unsigned int mask_pos = 0;
+  int num_memory = 0;
+  struct
+    {
+      rtx op;
+      enum machine_mode mode;
+    } args[6];
+  bool last_arg_count = false;
+  enum insn_code icode = d->icode;
+  const struct insn_data_d *insn_p = &insn_data[icode];
+  enum machine_mode tmode = insn_p->operand[0].mode;
+  enum machine_mode rmode = VOIDmode;
+  bool swap = false;
+  enum rtx_code comparison = d->comparison;
+
+  switch ((enum ix86_builtin_func_type) d->flag)
+    {
+    case V2DF_FTYPE_V2DF_ROUND:
+    case V4DF_FTYPE_V4DF_ROUND:
+    case V4SF_FTYPE_V4SF_ROUND:
+    case V8SF_FTYPE_V8SF_ROUND:
+    case V4SI_FTYPE_V4SF_ROUND:
+    case V8SI_FTYPE_V8SF_ROUND:
+      return ix86_expand_sse_round (d, exp, target);
+    case V4SI_FTYPE_V2DF_V2DF_ROUND:
+    case V8SI_FTYPE_V4DF_V4DF_ROUND:
+    case V16SI_FTYPE_V8DF_V8DF_ROUND:
+      return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
+    case INT_FTYPE_V8SF_V8SF_PTEST:
+    case INT_FTYPE_V4DI_V4DI_PTEST:
+    case INT_FTYPE_V4DF_V4DF_PTEST:
+    case INT_FTYPE_V4SF_V4SF_PTEST:
+    case INT_FTYPE_V2DI_V2DI_PTEST:
+    case INT_FTYPE_V2DF_V2DF_PTEST:
+      return ix86_expand_sse_ptest (d, exp, target);
+    case FLOAT128_FTYPE_FLOAT128:
+    case FLOAT_FTYPE_FLOAT:
+    case INT_FTYPE_INT:
+    case UINT64_FTYPE_INT:
+    case UINT16_FTYPE_UINT16:
+    case INT64_FTYPE_INT64:
+    case INT64_FTYPE_V4SF:
+    case INT64_FTYPE_V2DF:
+    case INT_FTYPE_V16QI:
+    case INT_FTYPE_V8QI:
+    case INT_FTYPE_V8SF:
+    case INT_FTYPE_V4DF:
+    case INT_FTYPE_V4SF:
+    case INT_FTYPE_V2DF:
+    case INT_FTYPE_V32QI:
+    case V16QI_FTYPE_V16QI:
+    case V8SI_FTYPE_V8SF:
+    case V8SI_FTYPE_V4SI:
+    case V8HI_FTYPE_V8HI:
+    case V8HI_FTYPE_V16QI:
+    case V8QI_FTYPE_V8QI:
+    case V8SF_FTYPE_V8SF:
+    case V8SF_FTYPE_V8SI:
+    case V8SF_FTYPE_V4SF:
+    case V8SF_FTYPE_V8HI:
+    case V4SI_FTYPE_V4SI:
+    case V4SI_FTYPE_V16QI:
+    case V4SI_FTYPE_V4SF:
+    case V4SI_FTYPE_V8SI:
+    case V4SI_FTYPE_V8HI:
+    case V4SI_FTYPE_V4DF:
+    case V4SI_FTYPE_V2DF:
+    case V4HI_FTYPE_V4HI:
+    case V4DF_FTYPE_V4DF:
+    case V4DF_FTYPE_V4SI:
+    case V4DF_FTYPE_V4SF:
+    case V4DF_FTYPE_V2DF:
+    case V4SF_FTYPE_V4SF:
+    case V4SF_FTYPE_V4SI:
+    case V4SF_FTYPE_V8SF:
+    case V4SF_FTYPE_V4DF:
+    case V4SF_FTYPE_V8HI:
+    case V4SF_FTYPE_V2DF:
+    case V2DI_FTYPE_V2DI:
+    case V2DI_FTYPE_V16QI:
+    case V2DI_FTYPE_V8HI:
+    case V2DI_FTYPE_V4SI:
+    case V2DF_FTYPE_V2DF:
+    case V2DF_FTYPE_V4SI:
+    case V2DF_FTYPE_V4DF:
+    case V2DF_FTYPE_V4SF:
+    case V2DF_FTYPE_V2SI:
+    case V2SI_FTYPE_V2SI:
+    case V2SI_FTYPE_V4SF:
+    case V2SI_FTYPE_V2SF:
+    case V2SI_FTYPE_V2DF:
+    case V2SF_FTYPE_V2SF:
+    case V2SF_FTYPE_V2SI:
+    case V32QI_FTYPE_V32QI:
+    case V32QI_FTYPE_V16QI:
+    case V16HI_FTYPE_V16HI:
+    case V16HI_FTYPE_V8HI:
+    case V8SI_FTYPE_V8SI:
+    case V16HI_FTYPE_V16QI:
+    case V8SI_FTYPE_V16QI:
+    case V4DI_FTYPE_V16QI:
+    case V8SI_FTYPE_V8HI:
+    case V4DI_FTYPE_V8HI:
+    case V4DI_FTYPE_V4SI:
+    case V4DI_FTYPE_V2DI:
+    case HI_FTYPE_HI:
+    case UINT_FTYPE_V2DF:
+    case UINT_FTYPE_V4SF:
+    case UINT64_FTYPE_V2DF:
+    case UINT64_FTYPE_V4SF:
+    case V16QI_FTYPE_V8DI:
+    case V16HI_FTYPE_V16SI:
+    case V16SI_FTYPE_HI:
+    case V16SI_FTYPE_V16SI:
+    case V16SI_FTYPE_INT:
+    case V16SF_FTYPE_FLOAT:
+    case V16SF_FTYPE_V4SF:
+    case V16SF_FTYPE_V16SF:
+    case V8HI_FTYPE_V8DI:
+    case V8UHI_FTYPE_V8UHI:
+    case V8SI_FTYPE_V8DI:
+    case V8USI_FTYPE_V8USI:
+    case V8SF_FTYPE_V8DF:
+    case V8DI_FTYPE_QI:
+    case V8DI_FTYPE_INT64:
+    case V8DI_FTYPE_V4DI:
+    case V8DI_FTYPE_V8DI:
+    case V8DF_FTYPE_DOUBLE:
+    case V8DF_FTYPE_V4DF:
+    case V8DF_FTYPE_V8DF:
+    case V8DF_FTYPE_V8SI:
+      nargs = 1;
+      break;
+    case V4SF_FTYPE_V4SF_VEC_MERGE:
+    case V2DF_FTYPE_V2DF_VEC_MERGE:
+      return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
+    case FLOAT128_FTYPE_FLOAT128_FLOAT128:
+    case V16QI_FTYPE_V16QI_V16QI:
+    case V16QI_FTYPE_V8HI_V8HI:
+    case V16SI_FTYPE_V16SI_V16SI:
+    case V16SF_FTYPE_V16SF_V16SF:
+    case V16SF_FTYPE_V16SF_V16SI:
+    case V8QI_FTYPE_V8QI_V8QI:
+    case V8QI_FTYPE_V4HI_V4HI:
+    case V8HI_FTYPE_V8HI_V8HI:
+    case V8HI_FTYPE_V16QI_V16QI:
+    case V8HI_FTYPE_V4SI_V4SI:
+    case V8SF_FTYPE_V8SF_V8SF:
+    case V8SF_FTYPE_V8SF_V8SI:
+    case V8DI_FTYPE_V8DI_V8DI:
+    case V8DF_FTYPE_V8DF_V8DF:
+    case V8DF_FTYPE_V8DF_V8DI:
+    case V4SI_FTYPE_V4SI_V4SI:
+    case V4SI_FTYPE_V8HI_V8HI:
+    case V4SI_FTYPE_V4SF_V4SF:
+    case V4SI_FTYPE_V2DF_V2DF:
+    case V4HI_FTYPE_V4HI_V4HI:
+    case V4HI_FTYPE_V8QI_V8QI:
+    case V4HI_FTYPE_V2SI_V2SI:
+    case V4DF_FTYPE_V4DF_V4DF:
+    case V4DF_FTYPE_V4DF_V4DI:
+    case V4SF_FTYPE_V4SF_V4SF:
+    case V4SF_FTYPE_V4SF_V4SI:
+    case V4SF_FTYPE_V4SF_V2SI:
+    case V4SF_FTYPE_V4SF_V2DF:
+    case V4SF_FTYPE_V4SF_UINT:
+    case V4SF_FTYPE_V4SF_UINT64:
+    case V4SF_FTYPE_V4SF_DI:
+    case V4SF_FTYPE_V4SF_SI:
+    case V2DI_FTYPE_V2DI_V2DI:
+    case V2DI_FTYPE_V16QI_V16QI:
+    case V2DI_FTYPE_V4SI_V4SI:
+    case V2UDI_FTYPE_V4USI_V4USI:
+    case V2DI_FTYPE_V2DI_V16QI:
+    case V2DI_FTYPE_V2DF_V2DF:
+    case V2SI_FTYPE_V2SI_V2SI:
+    case V2SI_FTYPE_V4HI_V4HI:
+    case V2SI_FTYPE_V2SF_V2SF:
+    case V2DF_FTYPE_V2DF_V2DF:
+    case V2DF_FTYPE_V2DF_V4SF:
+    case V2DF_FTYPE_V2DF_V2DI:
+    case V2DF_FTYPE_V2DF_DI:
+    case V2DF_FTYPE_V2DF_SI:
+    case V2DF_FTYPE_V2DF_UINT:
+    case V2DF_FTYPE_V2DF_UINT64:
+    case V2SF_FTYPE_V2SF_V2SF:
+    case V1DI_FTYPE_V1DI_V1DI:
+    case V1DI_FTYPE_V8QI_V8QI:
+    case V1DI_FTYPE_V2SI_V2SI:
+    case V32QI_FTYPE_V16HI_V16HI:
+    case V16HI_FTYPE_V8SI_V8SI:
+    case V32QI_FTYPE_V32QI_V32QI:
+    case V16HI_FTYPE_V32QI_V32QI:
+    case V16HI_FTYPE_V16HI_V16HI:
+    case V8SI_FTYPE_V4DF_V4DF:
+    case V8SI_FTYPE_V8SI_V8SI:
+    case V8SI_FTYPE_V16HI_V16HI:
+    case V4DI_FTYPE_V4DI_V4DI:
+    case V4DI_FTYPE_V8SI_V8SI:
+    case V4UDI_FTYPE_V8USI_V8USI:
+    case QI_FTYPE_V8DI_V8DI:
+    case HI_FTYPE_V16SI_V16SI:
+      if (comparison == UNKNOWN)
+	return ix86_expand_binop_builtin (icode, exp, target);
+      nargs = 2;
+      break;
+    case V4SF_FTYPE_V4SF_V4SF_SWAP:
+    case V2DF_FTYPE_V2DF_V2DF_SWAP:
+      gcc_assert (comparison != UNKNOWN);
+      nargs = 2;
+      swap = true;
+      break;
+    case V16HI_FTYPE_V16HI_V8HI_COUNT:
+    case V16HI_FTYPE_V16HI_SI_COUNT:
+    case V8SI_FTYPE_V8SI_V4SI_COUNT:
+    case V8SI_FTYPE_V8SI_SI_COUNT:
+    case V4DI_FTYPE_V4DI_V2DI_COUNT:
+    case V4DI_FTYPE_V4DI_INT_COUNT:
+    case V8HI_FTYPE_V8HI_V8HI_COUNT:
+    case V8HI_FTYPE_V8HI_SI_COUNT:
+    case V4SI_FTYPE_V4SI_V4SI_COUNT:
+    case V4SI_FTYPE_V4SI_SI_COUNT:
+    case V4HI_FTYPE_V4HI_V4HI_COUNT:
+    case V4HI_FTYPE_V4HI_SI_COUNT:
+    case V2DI_FTYPE_V2DI_V2DI_COUNT:
+    case V2DI_FTYPE_V2DI_SI_COUNT:
+    case V2SI_FTYPE_V2SI_V2SI_COUNT:
+    case V2SI_FTYPE_V2SI_SI_COUNT:
+    case V1DI_FTYPE_V1DI_V1DI_COUNT:
+    case V1DI_FTYPE_V1DI_SI_COUNT:
+      nargs = 2;
+      last_arg_count = true;
+      break;
+    case UINT64_FTYPE_UINT64_UINT64:
+    case UINT_FTYPE_UINT_UINT:
+    case UINT_FTYPE_UINT_USHORT:
+    case UINT_FTYPE_UINT_UCHAR:
+    case UINT16_FTYPE_UINT16_INT:
+    case UINT8_FTYPE_UINT8_INT:
+    case HI_FTYPE_HI_HI:
+    case V16SI_FTYPE_V8DF_V8DF:
+      nargs = 2;
+      break;
+    case V2DI_FTYPE_V2DI_INT_CONVERT:
+      nargs = 2;
+      rmode = V1TImode;
+      nargs_constant = 1;
+      break;
+    case V4DI_FTYPE_V4DI_INT_CONVERT:
+      nargs = 2;
+      rmode = V2TImode;
+      nargs_constant = 1;
+      break;
+    case V8HI_FTYPE_V8HI_INT:
+    case V8HI_FTYPE_V8SF_INT:
+    case V16HI_FTYPE_V16SF_INT:
+    case V8HI_FTYPE_V4SF_INT:
+    case V8SF_FTYPE_V8SF_INT:
+    case V4SF_FTYPE_V16SF_INT:
+    case V16SF_FTYPE_V16SF_INT:
+    case V4SI_FTYPE_V4SI_INT:
+    case V4SI_FTYPE_V8SI_INT:
+    case V4HI_FTYPE_V4HI_INT:
+    case V4DF_FTYPE_V4DF_INT:
+    case V4DF_FTYPE_V8DF_INT:
+    case V4SF_FTYPE_V4SF_INT:
+    case V4SF_FTYPE_V8SF_INT:
+    case V2DI_FTYPE_V2DI_INT:
+    case V2DF_FTYPE_V2DF_INT:
+    case V2DF_FTYPE_V4DF_INT:
+    case V16HI_FTYPE_V16HI_INT:
+    case V8SI_FTYPE_V8SI_INT:
+    case V16SI_FTYPE_V16SI_INT:
+    case V4SI_FTYPE_V16SI_INT:
+    case V4DI_FTYPE_V4DI_INT:
+    case V2DI_FTYPE_V4DI_INT:
+    case V4DI_FTYPE_V8DI_INT:
+    case HI_FTYPE_HI_INT:
+      nargs = 2;
+      nargs_constant = 1;
+      break;
+    case V16QI_FTYPE_V16QI_V16QI_V16QI:
+    case V8SF_FTYPE_V8SF_V8SF_V8SF:
+    case V4DF_FTYPE_V4DF_V4DF_V4DF:
+    case V4SF_FTYPE_V4SF_V4SF_V4SF:
+    case V2DF_FTYPE_V2DF_V2DF_V2DF:
+    case V32QI_FTYPE_V32QI_V32QI_V32QI:
+    case HI_FTYPE_V16SI_V16SI_HI:
+    case QI_FTYPE_V8DI_V8DI_QI:
+    case V16HI_FTYPE_V16SI_V16HI_HI:
+    case V16QI_FTYPE_V16SI_V16QI_HI:
+    case V16QI_FTYPE_V8DI_V16QI_QI:
+    case V16SF_FTYPE_V16SF_V16SF_HI:
+    case V16SF_FTYPE_V16SF_V16SF_V16SF:
+    case V16SF_FTYPE_V16SF_V16SI_V16SF:
+    case V16SF_FTYPE_V16SI_V16SF_HI:
+    case V16SF_FTYPE_V16SI_V16SF_V16SF:
+    case V16SF_FTYPE_V4SF_V16SF_HI:
+    case V16SI_FTYPE_SI_V16SI_HI:
+    case V16SI_FTYPE_V16HI_V16SI_HI:
+    case V16SI_FTYPE_V16QI_V16SI_HI:
+    case V16SI_FTYPE_V16SF_V16SI_HI:
+    case V16SI_FTYPE_V16SI_V16SI_HI:
+    case V16SI_FTYPE_V16SI_V16SI_V16SI:
+    case V16SI_FTYPE_V4SI_V16SI_HI:
+    case V2DI_FTYPE_V2DI_V2DI_V2DI:
+    case V4DI_FTYPE_V4DI_V4DI_V4DI:
+    case V8DF_FTYPE_V2DF_V8DF_QI:
+    case V8DF_FTYPE_V4DF_V8DF_QI:
+    case V8DF_FTYPE_V8DF_V8DF_QI:
+    case V8DF_FTYPE_V8DF_V8DF_V8DF:
+    case V8DF_FTYPE_V8DF_V8DI_V8DF:
+    case V8DF_FTYPE_V8DI_V8DF_V8DF:
+    case V8DF_FTYPE_V8SF_V8DF_QI:
+    case V8DF_FTYPE_V8SI_V8DF_QI:
+    case V8DI_FTYPE_DI_V8DI_QI:
+    case V8DI_FTYPE_V16QI_V8DI_QI:
+    case V8DI_FTYPE_V2DI_V8DI_QI:
+    case V8DI_FTYPE_V4DI_V8DI_QI:
+    case V8DI_FTYPE_V8DI_V8DI_QI:
+    case V8DI_FTYPE_V8DI_V8DI_V8DI:
+    case V8DI_FTYPE_V8HI_V8DI_QI:
+    case V8DI_FTYPE_V8SI_V8DI_QI:
+    case V8HI_FTYPE_V8DI_V8HI_QI:
+    case V8SF_FTYPE_V8DF_V8SF_QI:
+    case V8SI_FTYPE_V8DF_V8SI_QI:
+    case V8SI_FTYPE_V8DI_V8SI_QI:
+    case V4SI_FTYPE_V4SI_V4SI_V4SI:
+      nargs = 3;
+      break;
+    case V32QI_FTYPE_V32QI_V32QI_INT:
+    case V16HI_FTYPE_V16HI_V16HI_INT:
+    case V16QI_FTYPE_V16QI_V16QI_INT:
+    case V4DI_FTYPE_V4DI_V4DI_INT:
+    case V8HI_FTYPE_V8HI_V8HI_INT:
+    case V8SI_FTYPE_V8SI_V8SI_INT:
+    case V8SI_FTYPE_V8SI_V4SI_INT:
+    case V8SF_FTYPE_V8SF_V8SF_INT:
+    case V8SF_FTYPE_V8SF_V4SF_INT:
+    case V4SI_FTYPE_V4SI_V4SI_INT:
+    case V4DF_FTYPE_V4DF_V4DF_INT:
+    case V16SF_FTYPE_V16SF_V16SF_INT:
+    case V16SF_FTYPE_V16SF_V4SF_INT:
+    case V16SI_FTYPE_V16SI_V4SI_INT:
+    case V4DF_FTYPE_V4DF_V2DF_INT:
+    case V4SF_FTYPE_V4SF_V4SF_INT:
+    case V2DI_FTYPE_V2DI_V2DI_INT:
+    case V4DI_FTYPE_V4DI_V2DI_INT:
+    case V2DF_FTYPE_V2DF_V2DF_INT:
+    case QI_FTYPE_V8DI_V8DI_INT:
+    case QI_FTYPE_V8DF_V8DF_INT:
+    case QI_FTYPE_V2DF_V2DF_INT:
+    case QI_FTYPE_V4SF_V4SF_INT:
+    case HI_FTYPE_V16SI_V16SI_INT:
+    case HI_FTYPE_V16SF_V16SF_INT:
+      nargs = 3;
+      nargs_constant = 1;
+      break;
+    case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
+      nargs = 3;
+      rmode = V4DImode;
+      nargs_constant = 1;
+      break;
+    case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
+      nargs = 3;
+      rmode = V2DImode;
+      nargs_constant = 1;
+      break;
+    case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
+      nargs = 3;
+      rmode = DImode;
+      nargs_constant = 1;
+      break;
+    case V2DI_FTYPE_V2DI_UINT_UINT:
+      nargs = 3;
+      nargs_constant = 2;
+      break;
+    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI:
+    case V16SF_FTYPE_V16SF_V16SI_V16SF_HI:
+    case V16SF_FTYPE_V16SI_V16SF_V16SF_HI:
+    case V16SI_FTYPE_V16SI_V16SI_V16SI_HI:
+    case V16SI_FTYPE_V16SI_V4SI_V16SI_HI:
+    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI:
+    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI:
+    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI:
+    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI:
+    case V8DF_FTYPE_V8DF_V8DF_V8DF_QI:
+    case V8DF_FTYPE_V8DF_V8DI_V8DF_QI:
+    case V8DF_FTYPE_V8DI_V8DF_V8DF_QI:
+    case V8DI_FTYPE_V16SI_V16SI_V8DI_QI:
+    case V8DI_FTYPE_V8DI_SI_V8DI_V8DI:
+    case V8DI_FTYPE_V8DI_V2DI_V8DI_QI:
+    case V8DI_FTYPE_V8DI_V8DI_V8DI_QI:
+      nargs = 4;
+      break;
+    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
+    case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
+    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
+    case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
+    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
+      nargs = 4;
+      nargs_constant = 1;
+      break;
+    case QI_FTYPE_V2DF_V2DF_INT_QI:
+    case QI_FTYPE_V4SF_V4SF_INT_QI:
+      nargs = 4;
+      mask_pos = 1;
+      nargs_constant = 1;
+      break;
+    case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
+      nargs = 4;
+      nargs_constant = 2;
+      break;
+    case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
+    case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
+      nargs = 4;
+      break;
+    case QI_FTYPE_V8DI_V8DI_INT_QI:
+    case HI_FTYPE_V16SI_V16SI_INT_HI:
+    case QI_FTYPE_V8DF_V8DF_INT_QI:
+    case HI_FTYPE_V16SF_V16SF_INT_HI:
+      mask_pos = 1;
+      nargs = 4;
+      nargs_constant = 1;
+      break;
+    case V8DF_FTYPE_V8DF_INT_V8DF_QI:
+    case V16SF_FTYPE_V16SF_INT_V16SF_HI:
+    case V16HI_FTYPE_V16SF_INT_V16HI_HI:
+    case V16SI_FTYPE_V16SI_INT_V16SI_HI:
+    case V4SI_FTYPE_V16SI_INT_V4SI_QI:
+    case V4DI_FTYPE_V8DI_INT_V4DI_QI:
+    case V4DF_FTYPE_V8DF_INT_V4DF_QI:
+    case V4SF_FTYPE_V16SF_INT_V4SF_QI:
+    case V8DI_FTYPE_V8DI_INT_V8DI_QI:
+      nargs = 4;
+      mask_pos = 2;
+      nargs_constant = 1;
+      break;
+    case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_HI:
+    case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_HI:
+    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI:
+    case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_QI:
+    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI:
+    case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_HI:
+    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI:
+    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI:
+    case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_QI:
+    case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_QI:
+      nargs = 5;
+      mask_pos = 2;
+      nargs_constant = 1;
+      break;
+    case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_QI:
+    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI:
+    case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_HI:
+    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI:
+    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI:
+      nargs = 5;
+      mask_pos = 1;
+      nargs_constant = 1;
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  gcc_assert (nargs <= ARRAY_SIZE (args));
+
+  if (comparison != UNKNOWN)
+    {
+      gcc_assert (nargs == 2);
+      return ix86_expand_sse_compare (d, exp, target, swap);
+    }
+
+  if (rmode == VOIDmode || rmode == tmode)
+    {
+      if (optimize
+	  || target == 0
+	  || GET_MODE (target) != tmode
+	  || !insn_p->operand[0].predicate (target, tmode))
+	target = gen_reg_rtx (tmode);
+      real_target = target;
+    }
+  else
+    {
+      real_target = gen_reg_rtx (tmode);
+      target = simplify_gen_subreg (rmode, real_target, tmode, 0);
+    }
+
+  for (i = 0; i < nargs; i++)
+    {
+      tree arg = CALL_EXPR_ARG (exp, i);
+      rtx op = expand_normal (arg);
+      enum machine_mode mode = insn_p->operand[i + 1].mode;
+      bool match = insn_p->operand[i + 1].predicate (op, mode);
+
+      if (last_arg_count && (i + 1) == nargs)
+	{
+	  /* SIMD shift insns take either an 8-bit immediate or
+	     register as count.  But builtin functions take int as
+	     count.  If count doesn't match, we put it in register.  */
+	  if (!match)
+	    {
+	      op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
+	      if (!insn_p->operand[i + 1].predicate (op, mode))
+		op = copy_to_reg (op);
+	    }
+	}
+      else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
+	       (!mask_pos && (nargs - i) <= nargs_constant))
+	{
+	  if (!match)
+	    switch (icode)
+	      {
+	      case CODE_FOR_avx2_inserti128:
+	      case CODE_FOR_avx2_extracti128:
+		error ("the last argument must be an 1-bit immediate");
+		return const0_rtx;
+
+	      case CODE_FOR_avx512f_cmpv8di3_mask:
+	      case CODE_FOR_avx512f_cmpv16si3_mask:
+	      case CODE_FOR_avx512f_ucmpv8di3_mask:
+	      case CODE_FOR_avx512f_ucmpv16si3_mask:
+		error ("the last argument must be a 3-bit immediate");
+		return const0_rtx;
+
+	      case CODE_FOR_sse4_1_roundsd:
+	      case CODE_FOR_sse4_1_roundss:
+
+	      case CODE_FOR_sse4_1_roundpd:
+	      case CODE_FOR_sse4_1_roundps:
+	      case CODE_FOR_avx_roundpd256:
+	      case CODE_FOR_avx_roundps256:
+
+	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
+	      case CODE_FOR_sse4_1_roundps_sfix:
+	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
+	      case CODE_FOR_avx_roundps_sfix256:
+
+	      case CODE_FOR_sse4_1_blendps:
+	      case CODE_FOR_avx_blendpd256:
+	      case CODE_FOR_avx_vpermilv4df:
+	      case CODE_FOR_avx512f_getmantv8df_mask:
+	      case CODE_FOR_avx512f_getmantv16sf_mask:
+		error ("the last argument must be a 4-bit immediate");
+		return const0_rtx;
+
+	      case CODE_FOR_sha1rnds4:
+	      case CODE_FOR_sse4_1_blendpd:
+	      case CODE_FOR_avx_vpermilv2df:
+	      case CODE_FOR_xop_vpermil2v2df3:
+	      case CODE_FOR_xop_vpermil2v4sf3:
+	      case CODE_FOR_xop_vpermil2v4df3:
+	      case CODE_FOR_xop_vpermil2v8sf3:
+	      case CODE_FOR_avx512f_vinsertf32x4_mask:
+	      case CODE_FOR_avx512f_vinserti32x4_mask:
+	      case CODE_FOR_avx512f_vextractf32x4_mask:
+	      case CODE_FOR_avx512f_vextracti32x4_mask:
+		error ("the last argument must be a 2-bit immediate");
+		return const0_rtx;
+
+	      case CODE_FOR_avx_vextractf128v4df:
+	      case CODE_FOR_avx_vextractf128v8sf:
+	      case CODE_FOR_avx_vextractf128v8si:
+	      case CODE_FOR_avx_vinsertf128v4df:
+	      case CODE_FOR_avx_vinsertf128v8sf:
+	      case CODE_FOR_avx_vinsertf128v8si:
+	      case CODE_FOR_avx512f_vinsertf64x4_mask:
+	      case CODE_FOR_avx512f_vinserti64x4_mask:
+	      case CODE_FOR_avx512f_vextractf64x4_mask:
+	      case CODE_FOR_avx512f_vextracti64x4_mask:
+		error ("the last argument must be a 1-bit immediate");
+		return const0_rtx;
+
+	      case CODE_FOR_avx_vmcmpv2df3:
+	      case CODE_FOR_avx_vmcmpv4sf3:
+	      case CODE_FOR_avx_cmpv2df3:
+	      case CODE_FOR_avx_cmpv4sf3:
+	      case CODE_FOR_avx_cmpv4df3:
+	      case CODE_FOR_avx_cmpv8sf3:
+	      case CODE_FOR_avx512f_cmpv8df3_mask:
+	      case CODE_FOR_avx512f_cmpv16sf3_mask:
+	      case CODE_FOR_avx512f_vmcmpv2df3_mask:
+	      case CODE_FOR_avx512f_vmcmpv4sf3_mask:
+		error ("the last argument must be a 5-bit immediate");
+		return const0_rtx;
+
+	      default:
+		switch (nargs_constant)
+		  {
+		  case 2:
+		    if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
+			(!mask_pos && (nargs - i) == nargs_constant))
+		      {
+			error ("the next to last argument must be an 8-bit immediate");
+			break;
+		      }
+		  case 1:
+		    error ("the last argument must be an 8-bit immediate");
+		    break;
+		  default:
+		    gcc_unreachable ();
+		  }
+		return const0_rtx;
+	      }
+	}
+      else
+	{
+	  if (VECTOR_MODE_P (mode))
+	    op = safe_vector_operand (op, mode);
+
+	  /* If we aren't optimizing, only allow one memory operand to
+	     be generated.  */
+	  if (memory_operand (op, mode))
+	    num_memory++;
+
+	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
+	    {
+	      if (optimize || !match || num_memory > 1)
+		op = copy_to_mode_reg (mode, op);
+	    }
+	  else
+	    {
+	      op = copy_to_reg (op);
+	      op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
+	    }
+	}
+
+      args[i].op = op;
+      args[i].mode = mode;
+    }
+
+  switch (nargs)
+    {
+    case 1:
+      pat = GEN_FCN (icode) (real_target, args[0].op);
+      break;
+    case 2:
+      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
+      break;
+    case 3:
+      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
+			     args[2].op);
+      break;
+    case 4:
+      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
+			     args[2].op, args[3].op);
+      break;
+    case 5:
+      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
+			     args[2].op, args[3].op, args[4].op);
+    case 6:
+      pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
+			     args[2].op, args[3].op, args[4].op,
+			     args[5].op);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (! pat)
+    return 0;
+
+  emit_insn (pat);
+  return target;
+}
+
+/* Transform pattern of following layout:
+     (parallel [
+       set (A B)
+       (unspec [C] UNSPEC_EMBEDDED_ROUNDING)])
+     ])
+   into:
+     (set (A B))
+
+   Or:
+     (parallel [ A B
+     ...
+     (unspec [C] UNSPEC_EMBEDDED_ROUNDING)
+     ...
+     ])
+   into:
+     (parallel [ A B ... ])  */
+
+static rtx
+ix86_erase_embedded_rounding (rtx pat)
+{
+  if (GET_CODE (pat) == INSN)
+    pat = PATTERN (pat);
+
+  gcc_assert (GET_CODE (pat) == PARALLEL);
+
+  if (XVECLEN (pat, 0) == 2)
+    {
+      rtx p0 = XVECEXP (pat, 0, 0);
+      rtx p1 = XVECEXP (pat, 0, 1);
+
+      gcc_assert (GET_CODE (p0) == SET
+		  && GET_CODE (p1) == UNSPEC
+		  && XINT (p1, 1) == UNSPEC_EMBEDDED_ROUNDING);
+
+      return p0;
+    }
+  else
+    {
+      rtx *res = XALLOCAVEC (rtx, XVECLEN (pat, 0));
+      int i = 0;
+      int j = 0;
+
+      for (; i < XVECLEN (pat, 0); ++i)
+	{
+	  rtx elem = XVECEXP (pat, 0, i);
+	  if (GET_CODE (elem) != UNSPEC
+	      || XINT (elem, 1) != UNSPEC_EMBEDDED_ROUNDING)
+	    res [j++] = elem;
+	}
+
+      /*  No more than 1 occurence was removed.  */
+      gcc_assert (j >= XVECLEN (pat, 0) - 1);
+
+      return gen_rtx_PARALLEL (GET_MODE (pat), gen_rtvec_v (j, res));
+    }
+}
+
+/* Subroutine of ix86_expand_round_builtin to take care of comi insns
+   with rounding.  */
+static rtx
+ix86_expand_sse_comi_round (const struct builtin_description *d,
+			    tree exp, rtx target)
+{
+  rtx pat, set_dst;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  tree arg2 = CALL_EXPR_ARG (exp, 2);
+  tree arg3 = CALL_EXPR_ARG (exp, 3);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  rtx op2 = expand_normal (arg2);
+  rtx op3 = expand_normal (arg3);
+  enum insn_code icode = d->icode;
+  const struct insn_data_d *insn_p = &insn_data[icode];
+  enum machine_mode mode0 = insn_p->operand[0].mode;
+  enum machine_mode mode1 = insn_p->operand[1].mode;
+  enum rtx_code comparison = UNEQ;
+  bool need_ucomi = false;
+
+  /* See avxintrin.h for values.  */
+  enum rtx_code comi_comparisons[32] =
+    {
+      UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
+      UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
+      UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
+    };
+  bool need_ucomi_values[32] =
+    {
+      true,  false, false, true,  true,  false, false, true,
+      true,  false, false, true,  true,  false, false, true,
+      false, true,  true,  false, false, true,  true,  false,
+      false, true,  true,  false, false, true,  true,  false
+    };
+
+  if (!CONST_INT_P (op2))
+    {
+      error ("the third argument must be comparison constant");
+      return const0_rtx;
+    }
+  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
+    {
+      error ("incorect comparison mode");
+      return const0_rtx;
+    }
+
+  if (!insn_p->operand[2].predicate (op3, SImode))
+    {
+      error ("incorrect rounding operand");
+      return const0_rtx;
+    }
+
+  comparison = comi_comparisons[INTVAL (op2)];
+  need_ucomi = need_ucomi_values[INTVAL (op2)];
+
+  if (VECTOR_MODE_P (mode0))
+    op0 = safe_vector_operand (op0, mode0);
+  if (VECTOR_MODE_P (mode1))
+    op1 = safe_vector_operand (op1, mode1);
+
+  target = gen_reg_rtx (SImode);
+  emit_move_insn (target, const0_rtx);
+  target = gen_rtx_SUBREG (QImode, target, 0);
+
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_p->operand[0].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if ((optimize && !register_operand (op1, mode1))
+      || !insn_p->operand[1].predicate (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  if (need_ucomi)
+    icode = icode == CODE_FOR_sse_comi_round
+		     ? CODE_FOR_sse_ucomi_round
+		     : CODE_FOR_sse2_ucomi_round;
+
+  pat = GEN_FCN (icode) (op0, op1, op3);
+  if (! pat)
+    return 0;
+
+  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
+  if (INTVAL (op3) == NO_ROUND)
+    {
+      pat = ix86_erase_embedded_rounding (pat);
+      if (! pat)
+	return 0;
+
+      set_dst = SET_DEST (pat);
+    }
+  else
+    {
+      gcc_assert (GET_CODE (XVECEXP (pat, 0, 0)) == SET);
+      set_dst = SET_DEST (XVECEXP (pat, 0, 0));
+    }
+
+  emit_insn (pat);
+  emit_insn (gen_rtx_SET (VOIDmode,
+			  gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+			  gen_rtx_fmt_ee (comparison, QImode,
+					  set_dst,
+					  const0_rtx)));
+
+  return SUBREG_REG (target);
+}
+
+static rtx
+ix86_expand_round_builtin (const struct builtin_description *d,
+			   tree exp, rtx target)
+{
+  rtx pat;
+  unsigned int i, nargs;
+  struct
+    {
+      rtx op;
+      enum machine_mode mode;
+    } args[6];
+  enum insn_code icode = d->icode;
+  const struct insn_data_d *insn_p = &insn_data[icode];
+  enum machine_mode tmode = insn_p->operand[0].mode;
+  unsigned int nargs_constant = 0;
+  unsigned int redundant_embed_rnd = 0;
+
+  switch ((enum ix86_builtin_func_type) d->flag)
+    {
+    case UINT64_FTYPE_V2DF_INT:
+    case UINT64_FTYPE_V4SF_INT:
+    case UINT_FTYPE_V2DF_INT:
+    case UINT_FTYPE_V4SF_INT:
+    case INT64_FTYPE_V2DF_INT:
+    case INT64_FTYPE_V4SF_INT:
+    case INT_FTYPE_V2DF_INT:
+    case INT_FTYPE_V4SF_INT:
+      nargs = 2;
+      break;
+    case V4SF_FTYPE_V4SF_UINT_INT:
+    case V4SF_FTYPE_V4SF_UINT64_INT:
+    case V2DF_FTYPE_V2DF_UINT64_INT:
+    case V4SF_FTYPE_V4SF_INT_INT:
+    case V4SF_FTYPE_V4SF_INT64_INT:
+    case V2DF_FTYPE_V2DF_INT64_INT:
+    case V4SF_FTYPE_V4SF_V4SF_INT:
+    case V2DF_FTYPE_V2DF_V2DF_INT:
+    case V4SF_FTYPE_V4SF_V2DF_INT:
+    case V2DF_FTYPE_V2DF_V4SF_INT:
+      nargs = 3;
+      break;
+    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
+    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
+    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
+    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
+    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
+    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
+    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
+    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
+    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
+    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
+      nargs = 4;
+      break;
+    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
+    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
+      nargs_constant = 2;
+      nargs = 4;
+      break;
+    case INT_FTYPE_V4SF_V4SF_INT_INT:
+    case INT_FTYPE_V2DF_V2DF_INT_INT:
+      return ix86_expand_sse_comi_round (d, exp, target);
+    case V8DF_FTYPE_V8DF_V8DF_V8DF_QI_INT:
+    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
+    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
+    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
+    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
+    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
+      nargs = 5;
+      break;
+    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
+    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
+      nargs_constant = 4;
+      nargs = 5;
+      break;
+    case QI_FTYPE_V8DF_V8DF_INT_QI_INT:
+    case QI_FTYPE_V2DF_V2DF_INT_QI_INT:
+    case HI_FTYPE_V16SF_V16SF_INT_HI_INT:
+    case QI_FTYPE_V4SF_V4SF_INT_QI_INT:
+      nargs_constant = 3;
+      nargs = 5;
+      break;
+    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
+    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
+      nargs = 6;
+      nargs_constant = 4;
+      break;
+    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
+    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
+    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
+    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
+      nargs = 6;
+      nargs_constant = 3;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  gcc_assert (nargs <= ARRAY_SIZE (args));
+
+  if (optimize
+      || target == 0
+      || GET_MODE (target) != tmode
+      || !insn_p->operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  for (i = 0; i < nargs; i++)
+    {
+      tree arg = CALL_EXPR_ARG (exp, i);
+      rtx op = expand_normal (arg);
+      enum machine_mode mode = insn_p->operand[i + 1].mode;
+      bool match = insn_p->operand[i + 1].predicate (op, mode);
+
+      if (i == nargs - nargs_constant)
+	{
+	  if (!match)
+	    {
+	      switch (icode)
+		{
+		case CODE_FOR_avx512f_getmantv8df_mask_round:
+		case CODE_FOR_avx512f_getmantv16sf_mask_round:
+		case CODE_FOR_avx512f_getmantv2df_round:
+		case CODE_FOR_avx512f_getmantv4sf_round:
+		  error ("the immediate argument must be a 4-bit immediate");
+		  return const0_rtx;
+		case CODE_FOR_avx512f_cmpv8df3_mask_round:
+		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
+		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
+		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
+		  error ("the immediate argument must be a 5-bit immediate");
+		  return const0_rtx;
+		default:
+		  error ("the immediate argument must be an 8-bit immediate");
+		  return const0_rtx;
+		}
+	    }
+	}
+      else if (i == nargs-1)
+	{
+	  if (!insn_p->operand[nargs].predicate (op, SImode))
+	    {
+	      error ("incorrect rounding operand");
+	      return const0_rtx;
+	    }
+
+	  /* If there is no rounding use normal version of the pattern.  */
+	  if (INTVAL (op) == NO_ROUND)
+	    redundant_embed_rnd = 1;
+	}
+      else
+	{
+	  if (VECTOR_MODE_P (mode))
+	    op = safe_vector_operand (op, mode);
+
+	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
+	    {
+	      if (optimize || !match)
+		op = copy_to_mode_reg (mode, op);
+	    }
+	  else
+	    {
+	      op = copy_to_reg (op);
+	      op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
+	    }
+	}
+
+      args[i].op = op;
+      args[i].mode = mode;
+    }
+
+  switch (nargs)
+    {
+    case 1:
+      pat = GEN_FCN (icode) (target, args[0].op);
+      break;
+    case 2:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+      break;
+    case 3:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+			     args[2].op);
+      break;
+    case 4:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+			     args[2].op, args[3].op);
+      break;
+    case 5:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+			     args[2].op, args[3].op, args[4].op);
+    case 6:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+			     args[2].op, args[3].op, args[4].op,
+			     args[5].op);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (!pat)
+    return 0;
+
+  if (redundant_embed_rnd)
+    pat = ix86_erase_embedded_rounding (pat);
+
+  emit_insn (pat);
+  return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of special insns
+   with variable number of operands.  */
+
+static rtx
+ix86_expand_special_args_builtin (const struct builtin_description *d,
+				  tree exp, rtx target)
+{
+  tree arg;
+  rtx pat, op;
+  unsigned int i, nargs, arg_adjust, memory;
+  bool aligned_mem = false;
+  struct
+    {
+      rtx op;
+      enum machine_mode mode;
+    } args[3];
+  enum insn_code icode = d->icode;
+  bool last_arg_constant = false;
+  const struct insn_data_d *insn_p = &insn_data[icode];
+  enum machine_mode tmode = insn_p->operand[0].mode;
+  enum { load, store } klass;
+
+  switch ((enum ix86_builtin_func_type) d->flag)
+    {
+    case VOID_FTYPE_VOID:
+      emit_insn (GEN_FCN (icode) (target));
+      return 0;
+    case VOID_FTYPE_UINT64:
+    case VOID_FTYPE_UNSIGNED:
+      nargs = 0;
+      klass = store;
+      memory = 0;
+      break;
+
+    case INT_FTYPE_VOID:
+    case UINT64_FTYPE_VOID:
+    case UNSIGNED_FTYPE_VOID:
+      nargs = 0;
+      klass = load;
+      memory = 0;
+      break;
+    case UINT64_FTYPE_PUNSIGNED:
+    case V2DI_FTYPE_PV2DI:
+    case V4DI_FTYPE_PV4DI:
+    case V32QI_FTYPE_PCCHAR:
+    case V16QI_FTYPE_PCCHAR:
+    case V8SF_FTYPE_PCV4SF:
+    case V8SF_FTYPE_PCFLOAT:
+    case V4SF_FTYPE_PCFLOAT:
+    case V4DF_FTYPE_PCV2DF:
+    case V4DF_FTYPE_PCDOUBLE:
+    case V2DF_FTYPE_PCDOUBLE:
+    case VOID_FTYPE_PVOID:
+    case V16SI_FTYPE_PV4SI:
+    case V16SF_FTYPE_PV4SF:
+    case V8DI_FTYPE_PV4DI:
+    case V8DI_FTYPE_PV8DI:
+    case V8DF_FTYPE_PV4DF:
+      nargs = 1;
+      klass = load;
+      memory = 0;
+      switch (icode)
+	{
+	case CODE_FOR_sse4_1_movntdqa:
+	case CODE_FOR_avx2_movntdqa:
+	case CODE_FOR_avx512f_movntdqa:
+	  aligned_mem = true;
+	  break;
+	default:
+	  break;
+	}
+      break;
+    case VOID_FTYPE_PV2SF_V4SF:
+    case VOID_FTYPE_PV8DI_V8DI:
+    case VOID_FTYPE_PV4DI_V4DI:
+    case VOID_FTYPE_PV2DI_V2DI:
+    case VOID_FTYPE_PCHAR_V32QI:
+    case VOID_FTYPE_PCHAR_V16QI:
+    case VOID_FTYPE_PFLOAT_V16SF:
+    case VOID_FTYPE_PFLOAT_V8SF:
+    case VOID_FTYPE_PFLOAT_V4SF:
+    case VOID_FTYPE_PDOUBLE_V8DF:
+    case VOID_FTYPE_PDOUBLE_V4DF:
+    case VOID_FTYPE_PDOUBLE_V2DF:
+    case VOID_FTYPE_PLONGLONG_LONGLONG:
+    case VOID_FTYPE_PULONGLONG_ULONGLONG:
+    case VOID_FTYPE_PINT_INT:
+      nargs = 1;
+      klass = store;
+      /* Reserve memory operand for target.  */
+      memory = ARRAY_SIZE (args);
+      switch (icode)
+	{
+	/* These builtins and instructions require the memory
+	   to be properly aligned.  */
+	case CODE_FOR_avx_movntv4di:
+	case CODE_FOR_sse2_movntv2di:
+	case CODE_FOR_avx_movntv8sf:
+	case CODE_FOR_sse_movntv4sf:
+	case CODE_FOR_sse4a_vmmovntv4sf:
+	case CODE_FOR_avx_movntv4df:
+	case CODE_FOR_sse2_movntv2df:
+	case CODE_FOR_sse4a_vmmovntv2df:
+	case CODE_FOR_sse2_movntidi:
+	case CODE_FOR_sse_movntq:
+	case CODE_FOR_sse2_movntisi:
+	case CODE_FOR_avx512f_movntv16sf:
+	case CODE_FOR_avx512f_movntv8df:
+	case CODE_FOR_avx512f_movntv8di:
+	  aligned_mem = true;
+	  break;
+	default:
+	  break;
+	}
+      break;
+    case V4SF_FTYPE_V4SF_PCV2SF:
+    case V2DF_FTYPE_V2DF_PCDOUBLE:
+      nargs = 2;
+      klass = load;
+      memory = 1;
+      break;
+    case V8SF_FTYPE_PCV8SF_V8SI:
+    case V4DF_FTYPE_PCV4DF_V4DI:
+    case V4SF_FTYPE_PCV4SF_V4SI:
+    case V2DF_FTYPE_PCV2DF_V2DI:
+    case V8SI_FTYPE_PCV8SI_V8SI:
+    case V4DI_FTYPE_PCV4DI_V4DI:
+    case V4SI_FTYPE_PCV4SI_V4SI:
+    case V2DI_FTYPE_PCV2DI_V2DI:
+      nargs = 2;
+      klass = load;
+      memory = 0;
+      break;
+    case VOID_FTYPE_PV8DF_V8DF_QI:
+    case VOID_FTYPE_PV16SF_V16SF_HI:
+    case VOID_FTYPE_PV8DI_V8DI_QI:
+    case VOID_FTYPE_PV16SI_V16SI_HI:
+      switch (icode)
+	{
+	/* These builtins and instructions require the memory
+	   to be properly aligned.  */
+	case CODE_FOR_avx512f_storev16sf_mask:
+	case CODE_FOR_avx512f_storev16si_mask:
+	case CODE_FOR_avx512f_storev8df_mask:
+	case CODE_FOR_avx512f_storev8di_mask:
+	  aligned_mem = true;
+	  break;
+	default:
+	  break;
+	}
+      /* FALLTHRU */
+    case VOID_FTYPE_PV8SF_V8SI_V8SF:
+    case VOID_FTYPE_PV4DF_V4DI_V4DF:
+    case VOID_FTYPE_PV4SF_V4SI_V4SF:
+    case VOID_FTYPE_PV2DF_V2DI_V2DF:
+    case VOID_FTYPE_PV8SI_V8SI_V8SI:
+    case VOID_FTYPE_PV4DI_V4DI_V4DI:
+    case VOID_FTYPE_PV4SI_V4SI_V4SI:
+    case VOID_FTYPE_PV2DI_V2DI_V2DI:
+    case VOID_FTYPE_PDOUBLE_V2DF_QI:
+    case VOID_FTYPE_PFLOAT_V4SF_QI:
+    case VOID_FTYPE_PV8SI_V8DI_QI:
+    case VOID_FTYPE_PV8HI_V8DI_QI:
+    case VOID_FTYPE_PV16HI_V16SI_HI:
+    case VOID_FTYPE_PV16QI_V8DI_QI:
+    case VOID_FTYPE_PV16QI_V16SI_HI:
+      nargs = 2;
+      klass = store;
+      /* Reserve memory operand for target.  */
+      memory = ARRAY_SIZE (args);
+      break;
+    case V16SF_FTYPE_PCV16SF_V16SF_HI:
+    case V16SI_FTYPE_PCV16SI_V16SI_HI:
+    case V8DF_FTYPE_PCV8DF_V8DF_QI:
+    case V8DI_FTYPE_PCV8DI_V8DI_QI:
+    case V2DF_FTYPE_PCDOUBLE_V2DF_QI:
+    case V4SF_FTYPE_PCFLOAT_V4SF_QI:
+      nargs = 3;
+      klass = load;
+      memory = 0;
+      switch (icode)
+	{
+	/* These builtins and instructions require the memory
+	   to be properly aligned.  */
+	case CODE_FOR_avx512f_loadv16sf_mask:
+	case CODE_FOR_avx512f_loadv16si_mask:
+	case CODE_FOR_avx512f_loadv8df_mask:
+	case CODE_FOR_avx512f_loadv8di_mask:
+	  aligned_mem = true;
+	  break;
+	default:
+	  break;
+	}
+      break;
+    case VOID_FTYPE_UINT_UINT_UINT:
+    case VOID_FTYPE_UINT64_UINT_UINT:
+    case UCHAR_FTYPE_UINT_UINT_UINT:
+    case UCHAR_FTYPE_UINT64_UINT_UINT:
+      nargs = 3;
+      klass = load;
+      memory = ARRAY_SIZE (args);
+      last_arg_constant = true;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  gcc_assert (nargs <= ARRAY_SIZE (args));
+
+  if (klass == store)
+    {
+      arg = CALL_EXPR_ARG (exp, 0);
+      op = expand_normal (arg);
+      gcc_assert (target == 0);
+      if (memory)
+	{
+	  op = ix86_zero_extend_to_Pmode (op);
+	  target = gen_rtx_MEM (tmode, op);
+	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
+	     on it.  Try to improve it using get_pointer_alignment,
+	     and if the special builtin is one that requires strict
+	     mode alignment, also from it's GET_MODE_ALIGNMENT.
+	     Failure to do so could lead to ix86_legitimate_combined_insn
+	     rejecting all changes to such insns.  */
+	  unsigned int align = get_pointer_alignment (arg);
+	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
+	    align = GET_MODE_ALIGNMENT (tmode);
+	  if (MEM_ALIGN (target) < align)
+	    set_mem_align (target, align);
+	}
+      else
+	target = force_reg (tmode, op);
+      arg_adjust = 1;
+    }
+  else
+    {
+      arg_adjust = 0;
+      if (optimize
+	  || target == 0
+	  || !register_operand (target, tmode)
+	  || GET_MODE (target) != tmode)
+	target = gen_reg_rtx (tmode);
+    }
+
+  for (i = 0; i < nargs; i++)
+    {
+      enum machine_mode mode = insn_p->operand[i + 1].mode;
+      bool match;
+
+      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
+      op = expand_normal (arg);
+      match = insn_p->operand[i + 1].predicate (op, mode);
+
+      if (last_arg_constant && (i + 1) == nargs)
+	{
+	  if (!match)
+	    {
+	      if (icode == CODE_FOR_lwp_lwpvalsi3
+		  || icode == CODE_FOR_lwp_lwpinssi3
+		  || icode == CODE_FOR_lwp_lwpvaldi3
+		  || icode == CODE_FOR_lwp_lwpinsdi3)
+		error ("the last argument must be a 32-bit immediate");
+	      else
+		error ("the last argument must be an 8-bit immediate");
+	      return const0_rtx;
+	    }
+	}
+      else
+	{
+	  if (i == memory)
+	    {
+	      /* This must be the memory operand.  */
+	      op = ix86_zero_extend_to_Pmode (op);
+	      op = gen_rtx_MEM (mode, op);
+	      /* op at this point has just BITS_PER_UNIT MEM_ALIGN
+		 on it.  Try to improve it using get_pointer_alignment,
+		 and if the special builtin is one that requires strict
+		 mode alignment, also from it's GET_MODE_ALIGNMENT.
+		 Failure to do so could lead to ix86_legitimate_combined_insn
+		 rejecting all changes to such insns.  */
+	      unsigned int align = get_pointer_alignment (arg);
+	      if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
+		align = GET_MODE_ALIGNMENT (mode);
+	      if (MEM_ALIGN (op) < align)
+		set_mem_align (op, align);
+	    }
+	  else
+	    {
+	      /* This must be register.  */
+	      if (VECTOR_MODE_P (mode))
+		op = safe_vector_operand (op, mode);
+
+	      if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
+		op = copy_to_mode_reg (mode, op);
+	      else
+	        {
+	          op = copy_to_reg (op);
+	          op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
+	        }
+	    }
+	}
+
+      args[i].op = op;
+      args[i].mode = mode;
+    }
+
+  switch (nargs)
+    {
+    case 0:
+      pat = GEN_FCN (icode) (target);
+      break;
+    case 1:
+      pat = GEN_FCN (icode) (target, args[0].op);
+      break;
+    case 2:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+      break;
+    case 3:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return klass == store ? 0 : target;
+}
+
+/* Return the integer constant in ARG.  Constrain it to be in the range
+   of the subparts of VEC_TYPE; issue an error if not.  */
+
+static int
+get_element_number (tree vec_type, tree arg)
+{
+  unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
+
+  if (!tree_fits_uhwi_p (arg)
+      || (elt = tree_to_uhwi (arg), elt > max))
+    {
+      error ("selector must be an integer constant in the range 0..%wi", max);
+      return 0;
+    }
+
+  return elt;
+}
+
+/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
+   ix86_expand_vector_init.  We DO have language-level syntax for this, in
+   the form of  (type){ init-list }.  Except that since we can't place emms
+   instructions from inside the compiler, we can't allow the use of MMX
+   registers unless the user explicitly asks for it.  So we do *not* define
+   vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
+   we have builtins invoked by mmintrin.h that gives us license to emit
+   these sorts of instructions.  */
+
+static rtx
+ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
+{
+  enum machine_mode tmode = TYPE_MODE (type);
+  enum machine_mode inner_mode = GET_MODE_INNER (tmode);
+  int i, n_elt = GET_MODE_NUNITS (tmode);
+  rtvec v = rtvec_alloc (n_elt);
+
+  gcc_assert (VECTOR_MODE_P (tmode));
+  gcc_assert (call_expr_nargs (exp) == n_elt);
+
+  for (i = 0; i < n_elt; ++i)
+    {
+      rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
+      RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
+    }
+
+  if (!target || !register_operand (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
+  return target;
+}
+
+/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
+   ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
+   had a language-level syntax for referencing vector elements.  */
+
+static rtx
+ix86_expand_vec_ext_builtin (tree exp, rtx target)
+{
+  enum machine_mode tmode, mode0;
+  tree arg0, arg1;
+  int elt;
+  rtx op0;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+  arg1 = CALL_EXPR_ARG (exp, 1);
+
+  op0 = expand_normal (arg0);
+  elt = get_element_number (TREE_TYPE (arg0), arg1);
+
+  tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
+  mode0 = TYPE_MODE (TREE_TYPE (arg0));
+  gcc_assert (VECTOR_MODE_P (mode0));
+
+  op0 = force_reg (mode0, op0);
+
+  if (optimize || !target || !register_operand (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  ix86_expand_vector_extract (true, target, op0, elt);
+
+  return target;
+}
+
+/* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
+   ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
+   a language-level syntax for referencing vector elements.  */
+
+static rtx
+ix86_expand_vec_set_builtin (tree exp)
+{
+  enum machine_mode tmode, mode1;
+  tree arg0, arg1, arg2;
+  int elt;
+  rtx op0, op1, target;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+  arg1 = CALL_EXPR_ARG (exp, 1);
+  arg2 = CALL_EXPR_ARG (exp, 2);
+
+  tmode = TYPE_MODE (TREE_TYPE (arg0));
+  mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
+  gcc_assert (VECTOR_MODE_P (tmode));
+
+  op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
+  op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
+  elt = get_element_number (TREE_TYPE (arg0), arg2);
+
+  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
+    op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
+
+  op0 = force_reg (tmode, op0);
+  op1 = force_reg (mode1, op1);
+
+  /* OP0 is the source of these builtin functions and shouldn't be
+     modified.  Create a copy, use it and return it as target.  */
+  target = gen_reg_rtx (tmode);
+  emit_move_insn (target, op0);
+  ix86_expand_vector_set (true, target, op1, elt);
+
+  return target;
+}
+
+/* Expand an expression EXP that calls a built-in function,
+   with result going to TARGET if that's convenient
+   (and in mode MODE if that's convenient).
+   SUBTARGET may be used as the target for computing one of EXP's operands.
+   IGNORE is nonzero if the value is to be ignored.  */
+
+static rtx
+ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
+		     enum machine_mode mode, int ignore)
+{
+  const struct builtin_description *d;
+  size_t i;
+  enum insn_code icode;
+  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
+  tree arg0, arg1, arg2, arg3, arg4;
+  rtx op0, op1, op2, op3, op4, pat, insn;
+  enum machine_mode mode0, mode1, mode2, mode3, mode4;
+  unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
+
+  /* For CPU builtins that can be folded, fold first and expand the fold.  */
+  switch (fcode)
+    {
+    case IX86_BUILTIN_CPU_INIT:
+      {
+	/* Make it call __cpu_indicator_init in libgcc. */
+	tree call_expr, fndecl, type;
+        type = build_function_type_list (integer_type_node, NULL_TREE); 
+	fndecl = build_fn_decl ("__cpu_indicator_init", type);
+	call_expr = build_call_expr (fndecl, 0); 
+	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
+      }
+    case IX86_BUILTIN_CPU_IS:
+    case IX86_BUILTIN_CPU_SUPPORTS:
+      {
+	tree arg0 = CALL_EXPR_ARG (exp, 0);
+	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
+	gcc_assert (fold_expr != NULL_TREE);
+	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
+      }
+    }
+
+  /* Determine whether the builtin function is available under the current ISA.
+     Originally the builtin was not created if it wasn't applicable to the
+     current ISA based on the command line switches.  With function specific
+     options, we need to check in the context of the function making the call
+     whether it is supported.  */
+  if (ix86_builtins_isa[fcode].isa
+      && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
+    {
+      char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
+				       NULL, (enum fpmath_unit) 0, false);
+
+      if (!opts)
+	error ("%qE needs unknown isa option", fndecl);
+      else
+	{
+	  gcc_assert (opts != NULL);
+	  error ("%qE needs isa option %s", fndecl, opts);
+	  free (opts);
+	}
+      return const0_rtx;
+    }
+
+  switch (fcode)
+    {
+    case IX86_BUILTIN_MASKMOVQ:
+    case IX86_BUILTIN_MASKMOVDQU:
+      icode = (fcode == IX86_BUILTIN_MASKMOVQ
+	       ? CODE_FOR_mmx_maskmovq
+	       : CODE_FOR_sse2_maskmovdqu);
+      /* Note the arg order is different from the operand order.  */
+      arg1 = CALL_EXPR_ARG (exp, 0);
+      arg2 = CALL_EXPR_ARG (exp, 1);
+      arg0 = CALL_EXPR_ARG (exp, 2);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+      op2 = expand_normal (arg2);
+      mode0 = insn_data[icode].operand[0].mode;
+      mode1 = insn_data[icode].operand[1].mode;
+      mode2 = insn_data[icode].operand[2].mode;
+
+      op0 = ix86_zero_extend_to_Pmode (op0);
+      op0 = gen_rtx_MEM (mode1, op0);
+
+      if (!insn_data[icode].operand[0].predicate (op0, mode0))
+	op0 = copy_to_mode_reg (mode0, op0);
+      if (!insn_data[icode].operand[1].predicate (op1, mode1))
+	op1 = copy_to_mode_reg (mode1, op1);
+      if (!insn_data[icode].operand[2].predicate (op2, mode2))
+	op2 = copy_to_mode_reg (mode2, op2);
+      pat = GEN_FCN (icode) (op0, op1, op2);
+      if (! pat)
+	return 0;
+      emit_insn (pat);
+      return 0;
+
+    case IX86_BUILTIN_LDMXCSR:
+      op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
+      target = assign_386_stack_local (SImode, SLOT_TEMP);
+      emit_move_insn (target, op0);
+      emit_insn (gen_sse_ldmxcsr (target));
+      return 0;
+
+    case IX86_BUILTIN_STMXCSR:
+      target = assign_386_stack_local (SImode, SLOT_TEMP);
+      emit_insn (gen_sse_stmxcsr (target));
+      return copy_to_mode_reg (SImode, target);
+
+    case IX86_BUILTIN_CLFLUSH:
+	arg0 = CALL_EXPR_ARG (exp, 0);
+	op0 = expand_normal (arg0);
+	icode = CODE_FOR_sse2_clflush;
+	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+	  op0 = ix86_zero_extend_to_Pmode (op0);
+
+	emit_insn (gen_sse2_clflush (op0));
+	return 0;
+
+    case IX86_BUILTIN_MONITOR:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      arg1 = CALL_EXPR_ARG (exp, 1);
+      arg2 = CALL_EXPR_ARG (exp, 2);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+      op2 = expand_normal (arg2);
+      if (!REG_P (op0))
+	op0 = ix86_zero_extend_to_Pmode (op0);
+      if (!REG_P (op1))
+	op1 = copy_to_mode_reg (SImode, op1);
+      if (!REG_P (op2))
+	op2 = copy_to_mode_reg (SImode, op2);
+      emit_insn (ix86_gen_monitor (op0, op1, op2));
+      return 0;
+
+    case IX86_BUILTIN_MWAIT:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      arg1 = CALL_EXPR_ARG (exp, 1);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+      if (!REG_P (op0))
+	op0 = copy_to_mode_reg (SImode, op0);
+      if (!REG_P (op1))
+	op1 = copy_to_mode_reg (SImode, op1);
+      emit_insn (gen_sse3_mwait (op0, op1));
+      return 0;
+
+    case IX86_BUILTIN_VEC_INIT_V2SI:
+    case IX86_BUILTIN_VEC_INIT_V4HI:
+    case IX86_BUILTIN_VEC_INIT_V8QI:
+      return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
+
+    case IX86_BUILTIN_VEC_EXT_V2DF:
+    case IX86_BUILTIN_VEC_EXT_V2DI:
+    case IX86_BUILTIN_VEC_EXT_V4SF:
+    case IX86_BUILTIN_VEC_EXT_V4SI:
+    case IX86_BUILTIN_VEC_EXT_V8HI:
+    case IX86_BUILTIN_VEC_EXT_V2SI:
+    case IX86_BUILTIN_VEC_EXT_V4HI:
+    case IX86_BUILTIN_VEC_EXT_V16QI:
+      return ix86_expand_vec_ext_builtin (exp, target);
+
+    case IX86_BUILTIN_VEC_SET_V2DI:
+    case IX86_BUILTIN_VEC_SET_V4SF:
+    case IX86_BUILTIN_VEC_SET_V4SI:
+    case IX86_BUILTIN_VEC_SET_V8HI:
+    case IX86_BUILTIN_VEC_SET_V4HI:
+    case IX86_BUILTIN_VEC_SET_V16QI:
+      return ix86_expand_vec_set_builtin (exp);
+
+    case IX86_BUILTIN_INFQ:
+    case IX86_BUILTIN_HUGE_VALQ:
+      {
+	REAL_VALUE_TYPE inf;
+	rtx tmp;
+
+	real_inf (&inf);
+	tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
+
+	tmp = validize_mem (force_const_mem (mode, tmp));
+
+	if (target == 0)
+	  target = gen_reg_rtx (mode);
+
+	emit_move_insn (target, tmp);
+	return target;
+      }
+
+    case IX86_BUILTIN_RDPMC:
+    case IX86_BUILTIN_RDTSC:
+    case IX86_BUILTIN_RDTSCP:
+
+      op0 = gen_reg_rtx (DImode);
+      op1 = gen_reg_rtx (DImode);
+
+      if (fcode == IX86_BUILTIN_RDPMC)
+	{
+	  arg0 = CALL_EXPR_ARG (exp, 0);
+	  op2 = expand_normal (arg0);
+	  if (!register_operand (op2, SImode))
+	    op2 = copy_to_mode_reg (SImode, op2);
+
+	  insn = (TARGET_64BIT
+		  ? gen_rdpmc_rex64 (op0, op1, op2)
+		  : gen_rdpmc (op0, op2));
+	  emit_insn (insn);
+	}
+      else if (fcode == IX86_BUILTIN_RDTSC)
+	{
+	  insn = (TARGET_64BIT
+		  ? gen_rdtsc_rex64 (op0, op1)
+		  : gen_rdtsc (op0));
+	  emit_insn (insn);
+	}
+      else
+	{
+	  op2 = gen_reg_rtx (SImode);
+
+	  insn = (TARGET_64BIT
+		  ? gen_rdtscp_rex64 (op0, op1, op2)
+		  : gen_rdtscp (op0, op2));
+	  emit_insn (insn);
+
+	  arg0 = CALL_EXPR_ARG (exp, 0);
+	  op4 = expand_normal (arg0);
+	  if (!address_operand (op4, VOIDmode))
+	    {
+	      op4 = convert_memory_address (Pmode, op4);
+	      op4 = copy_addr_to_reg (op4);
+	    }
+	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
+	}
+
+      if (target == 0)
+	{
+	  /* mode is VOIDmode if __builtin_rd* has been called
+	     without lhs.  */
+	  if (mode == VOIDmode)
+	    return target;
+	  target = gen_reg_rtx (mode);
+	}
+
+      if (TARGET_64BIT)
+	{
+	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
+				     op1, 1, OPTAB_DIRECT);
+	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
+				     op0, 1, OPTAB_DIRECT);
+	}
+
+      emit_move_insn (target, op0);
+      return target;
+
+    case IX86_BUILTIN_FXSAVE:
+    case IX86_BUILTIN_FXRSTOR:
+    case IX86_BUILTIN_FXSAVE64:
+    case IX86_BUILTIN_FXRSTOR64:
+    case IX86_BUILTIN_FNSTENV:
+    case IX86_BUILTIN_FLDENV:
+    case IX86_BUILTIN_FNSTSW:
+      mode0 = BLKmode;
+      switch (fcode)
+	{
+	case IX86_BUILTIN_FXSAVE:
+	  icode = CODE_FOR_fxsave;
+	  break;
+	case IX86_BUILTIN_FXRSTOR:
+	  icode = CODE_FOR_fxrstor;
+	  break;
+	case IX86_BUILTIN_FXSAVE64:
+	  icode = CODE_FOR_fxsave64;
+	  break;
+	case IX86_BUILTIN_FXRSTOR64:
+	  icode = CODE_FOR_fxrstor64;
+	  break;
+	case IX86_BUILTIN_FNSTENV:
+	  icode = CODE_FOR_fnstenv;
+	  break;
+	case IX86_BUILTIN_FLDENV:
+	  icode = CODE_FOR_fldenv;
+	  break;
+	case IX86_BUILTIN_FNSTSW:
+	  icode = CODE_FOR_fnstsw;
+	  mode0 = HImode;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      op0 = expand_normal (arg0);
+
+      if (!address_operand (op0, VOIDmode))
+	{
+	  op0 = convert_memory_address (Pmode, op0);
+	  op0 = copy_addr_to_reg (op0);
+	}
+      op0 = gen_rtx_MEM (mode0, op0);
+
+      pat = GEN_FCN (icode) (op0);
+      if (pat)
+	emit_insn (pat);
+      return 0;
+
+    case IX86_BUILTIN_XSAVE:
+    case IX86_BUILTIN_XRSTOR:
+    case IX86_BUILTIN_XSAVE64:
+    case IX86_BUILTIN_XRSTOR64:
+    case IX86_BUILTIN_XSAVEOPT:
+    case IX86_BUILTIN_XSAVEOPT64:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      arg1 = CALL_EXPR_ARG (exp, 1);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+
+      if (!address_operand (op0, VOIDmode))
+	{
+	  op0 = convert_memory_address (Pmode, op0);
+	  op0 = copy_addr_to_reg (op0);
+	}
+      op0 = gen_rtx_MEM (BLKmode, op0);
+
+      op1 = force_reg (DImode, op1);
+
+      if (TARGET_64BIT)
+	{
+	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
+				     NULL, 1, OPTAB_DIRECT);
+	  switch (fcode)
+	    {
+	    case IX86_BUILTIN_XSAVE:
+	      icode = CODE_FOR_xsave_rex64;
+	      break;
+	    case IX86_BUILTIN_XRSTOR:
+	      icode = CODE_FOR_xrstor_rex64;
+	      break;
+	    case IX86_BUILTIN_XSAVE64:
+	      icode = CODE_FOR_xsave64;
+	      break;
+	    case IX86_BUILTIN_XRSTOR64:
+	      icode = CODE_FOR_xrstor64;
+	      break;
+	    case IX86_BUILTIN_XSAVEOPT:
+	      icode = CODE_FOR_xsaveopt_rex64;
+	      break;
+	    case IX86_BUILTIN_XSAVEOPT64:
+	      icode = CODE_FOR_xsaveopt64;
+	      break;
+	    default:
+	      gcc_unreachable ();
+	    }
+
+	  op2 = gen_lowpart (SImode, op2);
+	  op1 = gen_lowpart (SImode, op1);
+	  pat = GEN_FCN (icode) (op0, op1, op2);
+	}
+      else
+	{
+	  switch (fcode)
+	    {
+	    case IX86_BUILTIN_XSAVE:
+	      icode = CODE_FOR_xsave;
+	      break;
+	    case IX86_BUILTIN_XRSTOR:
+	      icode = CODE_FOR_xrstor;
+	      break;
+	    case IX86_BUILTIN_XSAVEOPT:
+	      icode = CODE_FOR_xsaveopt;
+	      break;
+	    default:
+	      gcc_unreachable ();
+	    }
+	  pat = GEN_FCN (icode) (op0, op1);
+	}
+
+      if (pat)
+	emit_insn (pat);
+      return 0;
+
+    case IX86_BUILTIN_LLWPCB:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      op0 = expand_normal (arg0);
+      icode = CODE_FOR_lwp_llwpcb;
+      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+	op0 = ix86_zero_extend_to_Pmode (op0);
+      emit_insn (gen_lwp_llwpcb (op0));
+      return 0;
+
+    case IX86_BUILTIN_SLWPCB:
+      icode = CODE_FOR_lwp_slwpcb;
+      if (!target
+	  || !insn_data[icode].operand[0].predicate (target, Pmode))
+	target = gen_reg_rtx (Pmode);
+      emit_insn (gen_lwp_slwpcb (target));
+      return target;
+
+    case IX86_BUILTIN_BEXTRI32:
+    case IX86_BUILTIN_BEXTRI64:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      arg1 = CALL_EXPR_ARG (exp, 1);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+      icode = (fcode == IX86_BUILTIN_BEXTRI32
+	  ? CODE_FOR_tbm_bextri_si
+	  : CODE_FOR_tbm_bextri_di);
+      if (!CONST_INT_P (op1))
+        {
+          error ("last argument must be an immediate");
+          return const0_rtx;
+        }
+      else
+        {
+          unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
+          unsigned char lsb_index = INTVAL (op1) & 0xFF;
+          op1 = GEN_INT (length);
+          op2 = GEN_INT (lsb_index);
+          pat = GEN_FCN (icode) (target, op0, op1, op2);
+          if (pat)
+            emit_insn (pat);
+          return target;
+        }
+
+    case IX86_BUILTIN_RDRAND16_STEP:
+      icode = CODE_FOR_rdrandhi_1;
+      mode0 = HImode;
+      goto rdrand_step;
+
+    case IX86_BUILTIN_RDRAND32_STEP:
+      icode = CODE_FOR_rdrandsi_1;
+      mode0 = SImode;
+      goto rdrand_step;
+
+    case IX86_BUILTIN_RDRAND64_STEP:
+      icode = CODE_FOR_rdranddi_1;
+      mode0 = DImode;
+
+rdrand_step:
+      op0 = gen_reg_rtx (mode0);
+      emit_insn (GEN_FCN (icode) (op0));
+
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      op1 = expand_normal (arg0);
+      if (!address_operand (op1, VOIDmode))
+	{
+	  op1 = convert_memory_address (Pmode, op1);
+	  op1 = copy_addr_to_reg (op1);
+	}
+      emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
+
+      op1 = gen_reg_rtx (SImode);
+      emit_move_insn (op1, CONST1_RTX (SImode));
+
+      /* Emit SImode conditional move.  */
+      if (mode0 == HImode)
+	{
+	  op2 = gen_reg_rtx (SImode);
+	  emit_insn (gen_zero_extendhisi2 (op2, op0));
+	}
+      else if (mode0 == SImode)
+	op2 = op0;
+      else
+	op2 = gen_rtx_SUBREG (SImode, op0, 0);
+
+      if (target == 0)
+	target = gen_reg_rtx (SImode);
+
+      pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
+			 const0_rtx);
+      emit_insn (gen_rtx_SET (VOIDmode, target,
+			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
+      return target;
+
+    case IX86_BUILTIN_RDSEED16_STEP:
+      icode = CODE_FOR_rdseedhi_1;
+      mode0 = HImode;
+      goto rdseed_step;
+
+    case IX86_BUILTIN_RDSEED32_STEP:
+      icode = CODE_FOR_rdseedsi_1;
+      mode0 = SImode;
+      goto rdseed_step;
+
+    case IX86_BUILTIN_RDSEED64_STEP:
+      icode = CODE_FOR_rdseeddi_1;
+      mode0 = DImode;
+
+rdseed_step:
+      op0 = gen_reg_rtx (mode0);
+      emit_insn (GEN_FCN (icode) (op0));
+
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      op1 = expand_normal (arg0);
+      if (!address_operand (op1, VOIDmode))
+	{
+	  op1 = convert_memory_address (Pmode, op1);
+	  op1 = copy_addr_to_reg (op1);
+	}
+      emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
+
+      op2 = gen_reg_rtx (QImode);
+
+      pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
+                         const0_rtx);
+      emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
+
+      if (target == 0)
+        target = gen_reg_rtx (SImode);
+
+      emit_insn (gen_zero_extendqisi2 (target, op2));
+      return target;
+
+    case IX86_BUILTIN_ADDCARRYX32:
+      icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
+      mode0 = SImode;
+      goto addcarryx;
+
+    case IX86_BUILTIN_ADDCARRYX64:
+      icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
+      mode0 = DImode;
+
+addcarryx:
+      arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
+      arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
+      arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
+      arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
+
+      op0 = gen_reg_rtx (QImode);
+
+      /* Generate CF from input operand.  */
+      op1 = expand_normal (arg0);
+      op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
+      emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
+
+      /* Gen ADCX instruction to compute X+Y+CF.  */
+      op2 = expand_normal (arg1);
+      op3 = expand_normal (arg2);
+
+      if (!REG_P (op2))
+	op2 = copy_to_mode_reg (mode0, op2);
+      if (!REG_P (op3))
+	op3 = copy_to_mode_reg (mode0, op3);
+
+      op0 = gen_reg_rtx (mode0);
+
+      op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
+      pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
+      emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
+
+      /* Store the result.  */
+      op4 = expand_normal (arg3);
+      if (!address_operand (op4, VOIDmode))
+	{
+	  op4 = convert_memory_address (Pmode, op4);
+	  op4 = copy_addr_to_reg (op4);
+	}
+      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
+
+      /* Return current CF value.  */
+      if (target == 0)
+        target = gen_reg_rtx (QImode);
+
+      PUT_MODE (pat, QImode);
+      emit_insn (gen_rtx_SET (VOIDmode, target, pat));
+      return target;
+
+    case IX86_BUILTIN_READ_FLAGS:
+      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
+
+      if (optimize
+	  || target == NULL_RTX
+	  || !nonimmediate_operand (target, word_mode)
+	  || GET_MODE (target) != word_mode)
+	target = gen_reg_rtx (word_mode);
+
+      emit_insn (gen_pop (target));
+      return target;
+
+    case IX86_BUILTIN_WRITE_FLAGS:
+
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      op0 = expand_normal (arg0);
+      if (!general_no_elim_operand (op0, word_mode))
+	op0 = copy_to_mode_reg (word_mode, op0);
+
+      emit_insn (gen_push (op0));
+      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
+      return 0;
+
+    case IX86_BUILTIN_KORTESTC16:
+      icode = CODE_FOR_kortestchi;
+      mode0 = HImode;
+      mode1 = CCCmode;
+      goto kortest;
+
+    case IX86_BUILTIN_KORTESTZ16:
+      icode = CODE_FOR_kortestzhi;
+      mode0 = HImode;
+      mode1 = CCZmode;
+
+    kortest:
+      arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
+      arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+
+      op0 = copy_to_reg (op0);
+      op0 = simplify_gen_subreg (mode0, op0, GET_MODE (op0), 0);
+      op1 = copy_to_reg (op1);
+      op1 = simplify_gen_subreg (mode0, op1, GET_MODE (op1), 0);
+
+      target = gen_reg_rtx (QImode);
+      emit_insn (gen_rtx_SET (mode0, target, const0_rtx));
+
+      /* Emit kortest.  */
+      emit_insn (GEN_FCN (icode) (op0, op1));
+      /* And use setcc to return result from flags.  */
+      ix86_expand_setcc (target, EQ,
+			 gen_rtx_REG (mode1, FLAGS_REG), const0_rtx);
+      return target;
+
+    case IX86_BUILTIN_GATHERSIV2DF:
+      icode = CODE_FOR_avx2_gathersiv2df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV4DF:
+      icode = CODE_FOR_avx2_gathersiv4df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV2DF:
+      icode = CODE_FOR_avx2_gatherdiv2df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV4DF:
+      icode = CODE_FOR_avx2_gatherdiv4df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV4SF:
+      icode = CODE_FOR_avx2_gathersiv4sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV8SF:
+      icode = CODE_FOR_avx2_gathersiv8sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV4SF:
+      icode = CODE_FOR_avx2_gatherdiv4sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV8SF:
+      icode = CODE_FOR_avx2_gatherdiv8sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV2DI:
+      icode = CODE_FOR_avx2_gathersiv2di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV4DI:
+      icode = CODE_FOR_avx2_gathersiv4di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV2DI:
+      icode = CODE_FOR_avx2_gatherdiv2di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV4DI:
+      icode = CODE_FOR_avx2_gatherdiv4di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV4SI:
+      icode = CODE_FOR_avx2_gathersiv4si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV8SI:
+      icode = CODE_FOR_avx2_gathersiv8si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV4SI:
+      icode = CODE_FOR_avx2_gatherdiv4si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV8SI:
+      icode = CODE_FOR_avx2_gatherdiv8si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERALTSIV4DF:
+      icode = CODE_FOR_avx2_gathersiv4df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERALTDIV8SF:
+      icode = CODE_FOR_avx2_gatherdiv8sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERALTSIV4DI:
+      icode = CODE_FOR_avx2_gathersiv4di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERALTDIV8SI:
+      icode = CODE_FOR_avx2_gatherdiv8si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3SIV16SF:
+      icode = CODE_FOR_avx512f_gathersiv16sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3SIV8DF:
+      icode = CODE_FOR_avx512f_gathersiv8df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3DIV16SF:
+      icode = CODE_FOR_avx512f_gatherdiv16sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3DIV8DF:
+      icode = CODE_FOR_avx512f_gatherdiv8df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3SIV16SI:
+      icode = CODE_FOR_avx512f_gathersiv16si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3SIV8DI:
+      icode = CODE_FOR_avx512f_gathersiv8di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3DIV16SI:
+      icode = CODE_FOR_avx512f_gatherdiv16si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3DIV8DI:
+      icode = CODE_FOR_avx512f_gatherdiv8di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3ALTSIV8DF:
+      icode = CODE_FOR_avx512f_gathersiv8df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3ALTDIV16SF:
+      icode = CODE_FOR_avx512f_gatherdiv16sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3ALTSIV8DI:
+      icode = CODE_FOR_avx512f_gathersiv8di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHER3ALTDIV16SI:
+      icode = CODE_FOR_avx512f_gatherdiv16si;
+      goto gather_gen;
+    case IX86_BUILTIN_SCATTERSIV16SF:
+      icode = CODE_FOR_avx512f_scattersiv16sf;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERSIV8DF:
+      icode = CODE_FOR_avx512f_scattersiv8df;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERDIV16SF:
+      icode = CODE_FOR_avx512f_scatterdiv16sf;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERDIV8DF:
+      icode = CODE_FOR_avx512f_scatterdiv8df;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERSIV16SI:
+      icode = CODE_FOR_avx512f_scattersiv16si;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERSIV8DI:
+      icode = CODE_FOR_avx512f_scattersiv8di;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERDIV16SI:
+      icode = CODE_FOR_avx512f_scatterdiv16si;
+      goto scatter_gen;
+    case IX86_BUILTIN_SCATTERDIV8DI:
+      icode = CODE_FOR_avx512f_scatterdiv8di;
+      goto scatter_gen;
+
+    case IX86_BUILTIN_GATHERPFDPD:
+      icode = CODE_FOR_avx512pf_gatherpfv8sidf;
+      goto vec_prefetch_gen;
+    case IX86_BUILTIN_GATHERPFDPS:
+      icode = CODE_FOR_avx512pf_gatherpfv16sisf;
+      goto vec_prefetch_gen;
+    case IX86_BUILTIN_GATHERPFQPD:
+      icode = CODE_FOR_avx512pf_gatherpfv8didf;
+      goto vec_prefetch_gen;
+    case IX86_BUILTIN_GATHERPFQPS:
+      icode = CODE_FOR_avx512pf_gatherpfv8disf;
+      goto vec_prefetch_gen;
+    case IX86_BUILTIN_SCATTERPFDPD:
+      icode = CODE_FOR_avx512pf_scatterpfv8sidf;
+      goto vec_prefetch_gen;
+    case IX86_BUILTIN_SCATTERPFDPS:
+      icode = CODE_FOR_avx512pf_scatterpfv16sisf;
+      goto vec_prefetch_gen;
+    case IX86_BUILTIN_SCATTERPFQPD:
+      icode = CODE_FOR_avx512pf_scatterpfv8didf;
+      goto vec_prefetch_gen;
+    case IX86_BUILTIN_SCATTERPFQPS:
+      icode = CODE_FOR_avx512pf_scatterpfv8disf;
+      goto vec_prefetch_gen;
+
+    gather_gen:
+      rtx half;
+      rtx (*gen) (rtx, rtx);
+
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      arg1 = CALL_EXPR_ARG (exp, 1);
+      arg2 = CALL_EXPR_ARG (exp, 2);
+      arg3 = CALL_EXPR_ARG (exp, 3);
+      arg4 = CALL_EXPR_ARG (exp, 4);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+      op2 = expand_normal (arg2);
+      op3 = expand_normal (arg3);
+      op4 = expand_normal (arg4);
+      /* Note the arg order is different from the operand order.  */
+      mode0 = insn_data[icode].operand[1].mode;
+      mode2 = insn_data[icode].operand[3].mode;
+      mode3 = insn_data[icode].operand[4].mode;
+      mode4 = insn_data[icode].operand[5].mode;
+
+      if (target == NULL_RTX
+	  || GET_MODE (target) != insn_data[icode].operand[0].mode
+	  || !insn_data[icode].operand[0].predicate (target,
+						     GET_MODE (target)))
+	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
+      else
+	subtarget = target;
+
+      switch (fcode)
+	{
+	case IX86_BUILTIN_GATHER3ALTSIV8DF:
+	case IX86_BUILTIN_GATHER3ALTSIV8DI:
+	  half = gen_reg_rtx (V8SImode);
+	  if (!nonimmediate_operand (op2, V16SImode))
+	    op2 = copy_to_mode_reg (V16SImode, op2);
+	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
+	  op2 = half;
+	  break;
+	case IX86_BUILTIN_GATHERALTSIV4DF:
+	case IX86_BUILTIN_GATHERALTSIV4DI:
+	  half = gen_reg_rtx (V4SImode);
+	  if (!nonimmediate_operand (op2, V8SImode))
+	    op2 = copy_to_mode_reg (V8SImode, op2);
+	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
+	  op2 = half;
+	  break;
+	case IX86_BUILTIN_GATHER3ALTDIV16SF:
+	case IX86_BUILTIN_GATHER3ALTDIV16SI:
+	  half = gen_reg_rtx (mode0);
+	  if (mode0 == V8SFmode)
+	    gen = gen_vec_extract_lo_v16sf;
+	  else
+	    gen = gen_vec_extract_lo_v16si;
+	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
+	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
+	  emit_insn (gen (half, op0));
+	  op0 = half;
+	  if (GET_MODE (op3) != VOIDmode)
+	    {
+	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
+		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+	      emit_insn (gen (half, op3));
+	      op3 = half;
+	    }
+	  break;
+	case IX86_BUILTIN_GATHERALTDIV8SF:
+	case IX86_BUILTIN_GATHERALTDIV8SI:
+	  half = gen_reg_rtx (mode0);
+	  if (mode0 == V4SFmode)
+	    gen = gen_vec_extract_lo_v8sf;
+	  else
+	    gen = gen_vec_extract_lo_v8si;
+	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
+	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
+	  emit_insn (gen (half, op0));
+	  op0 = half;
+	  if (GET_MODE (op3) != VOIDmode)
+	    {
+	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
+		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+	      emit_insn (gen (half, op3));
+	      op3 = half;
+	    }
+	  break;
+	default:
+	  break;
+	}
+
+      /* Force memory operand only with base register here.  But we
+	 don't want to do it on memory operand for other builtin
+	 functions.  */
+      op1 = ix86_zero_extend_to_Pmode (op1);
+
+      if (!insn_data[icode].operand[1].predicate (op0, mode0))
+	op0 = copy_to_mode_reg (mode0, op0);
+      if (!insn_data[icode].operand[2].predicate (op1, Pmode))
+	op1 = copy_to_mode_reg (Pmode, op1);
+      if (!insn_data[icode].operand[3].predicate (op2, mode2))
+	op2 = copy_to_mode_reg (mode2, op2);
+      if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
+	{
+	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
+	    op3 = copy_to_mode_reg (mode3, op3);
+	}
+      else
+	{
+	  op3 = copy_to_reg (op3);
+	  op3 = simplify_gen_subreg (mode3, op3, GET_MODE (op3), 0);
+	}
+      if (!insn_data[icode].operand[5].predicate (op4, mode4))
+	{
+          error ("the last argument must be scale 1, 2, 4, 8");
+          return const0_rtx;
+	}
+
+      /* Optimize.  If mask is known to have all high bits set,
+	 replace op0 with pc_rtx to signal that the instruction
+	 overwrites the whole destination and doesn't use its
+	 previous contents.  */
+      if (optimize)
+	{
+	  if (TREE_CODE (arg3) == INTEGER_CST)
+	    {
+	      if (integer_all_onesp (arg3))
+		op0 = pc_rtx;
+	    }
+	  else if (TREE_CODE (arg3) == VECTOR_CST)
+	    {
+	      unsigned int negative = 0;
+	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
+		{
+		  tree cst = VECTOR_CST_ELT (arg3, i);
+		  if (TREE_CODE (cst) == INTEGER_CST
+		      && tree_int_cst_sign_bit (cst))
+		    negative++;
+		  else if (TREE_CODE (cst) == REAL_CST
+			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
+		    negative++;
+		}
+	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
+		op0 = pc_rtx;
+	    }
+	  else if (TREE_CODE (arg3) == SSA_NAME
+		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
+	    {
+	      /* Recognize also when mask is like:
+		 __v2df src = _mm_setzero_pd ();
+		 __v2df mask = _mm_cmpeq_pd (src, src);
+		 or
+		 __v8sf src = _mm256_setzero_ps ();
+		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
+		 as that is a cheaper way to load all ones into
+		 a register than having to load a constant from
+		 memory.  */
+	      gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
+	      if (is_gimple_call (def_stmt))
+		{
+		  tree fndecl = gimple_call_fndecl (def_stmt);
+		  if (fndecl
+		      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
+		    switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
+		      {
+		      case IX86_BUILTIN_CMPPD:
+		      case IX86_BUILTIN_CMPPS:
+		      case IX86_BUILTIN_CMPPD256:
+		      case IX86_BUILTIN_CMPPS256:
+			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
+			  break;
+			/* FALLTHRU */
+		      case IX86_BUILTIN_CMPEQPD:
+		      case IX86_BUILTIN_CMPEQPS:
+			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
+			    && initializer_zerop (gimple_call_arg (def_stmt,
+								   1)))
+			  op0 = pc_rtx;
+			break;
+		      default:
+			break;
+		      }
+		}
+	    }
+	}
+
+      pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
+      if (! pat)
+	return const0_rtx;
+      emit_insn (pat);
+
+      switch (fcode)
+	{
+	case IX86_BUILTIN_GATHER3DIV16SF:
+	  if (target == NULL_RTX)
+	    target = gen_reg_rtx (V8SFmode);
+	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
+	  break;
+	case IX86_BUILTIN_GATHER3DIV16SI:
+	  if (target == NULL_RTX)
+	    target = gen_reg_rtx (V8SImode);
+	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
+	  break;
+	case IX86_BUILTIN_GATHERDIV8SF:
+	  if (target == NULL_RTX)
+	    target = gen_reg_rtx (V4SFmode);
+	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
+	  break;
+	case IX86_BUILTIN_GATHERDIV8SI:
+	  if (target == NULL_RTX)
+	    target = gen_reg_rtx (V4SImode);
+	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
+	  break;
+	default:
+	  target = subtarget;
+	  break;
+	}
+      return target;
+
+    scatter_gen:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      arg1 = CALL_EXPR_ARG (exp, 1);
+      arg2 = CALL_EXPR_ARG (exp, 2);
+      arg3 = CALL_EXPR_ARG (exp, 3);
+      arg4 = CALL_EXPR_ARG (exp, 4);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+      op2 = expand_normal (arg2);
+      op3 = expand_normal (arg3);
+      op4 = expand_normal (arg4);
+      mode1 = insn_data[icode].operand[1].mode;
+      mode2 = insn_data[icode].operand[2].mode;
+      mode3 = insn_data[icode].operand[3].mode;
+      mode4 = insn_data[icode].operand[4].mode;
+
+      /* Force memory operand only with base register here.  But we
+	 don't want to do it on memory operand for other builtin
+	 functions.  */
+      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
+
+      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+	op0 = copy_to_mode_reg (Pmode, op0);
+
+      if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
+	{
+	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
+	    op1 = copy_to_mode_reg (mode1, op1);
+	}
+      else
+	{
+	  op1 = copy_to_reg (op1);
+	  op1 = simplify_gen_subreg (mode1, op1, GET_MODE (op1), 0);
+	}
+
+      if (!insn_data[icode].operand[2].predicate (op2, mode2))
+	op2 = copy_to_mode_reg (mode2, op2);
+
+      if (!insn_data[icode].operand[3].predicate (op3, mode3))
+	op3 = copy_to_mode_reg (mode3, op3);
+
+      if (!insn_data[icode].operand[4].predicate (op4, mode4))
+	{
+	  error ("the last argument must be scale 1, 2, 4, 8");
+	  return const0_rtx;
+	}
+
+      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
+      if (! pat)
+	return const0_rtx;
+
+      emit_insn (pat);
+      return 0;
+
+    vec_prefetch_gen:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      arg1 = CALL_EXPR_ARG (exp, 1);
+      arg2 = CALL_EXPR_ARG (exp, 2);
+      arg3 = CALL_EXPR_ARG (exp, 3);
+      arg4 = CALL_EXPR_ARG (exp, 4);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+      op2 = expand_normal (arg2);
+      op3 = expand_normal (arg3);
+      op4 = expand_normal (arg4);
+      mode0 = insn_data[icode].operand[0].mode;
+      mode1 = insn_data[icode].operand[1].mode;
+      mode3 = insn_data[icode].operand[3].mode;
+      mode4 = insn_data[icode].operand[4].mode;
+
+      if (GET_MODE (op0) == mode0
+	  || (GET_MODE (op0) == VOIDmode && op0 != constm1_rtx))
+	{
+	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
+	    op0 = copy_to_mode_reg (mode0, op0);
+	}
+      else if (op0 != constm1_rtx)
+	{
+	  op0 = copy_to_reg (op0);
+	  op0 = simplify_gen_subreg (mode0, op0, GET_MODE (op0), 0);
+	}
+
+      if (!insn_data[icode].operand[1].predicate (op1, mode1))
+	op1 = copy_to_mode_reg (mode1, op1);
+
+      /* Force memory operand only with base register here.  But we
+	 don't want to do it on memory operand for other builtin
+	 functions.  */
+      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
+
+      if (!insn_data[icode].operand[2].predicate (op2, Pmode))
+	op2 = copy_to_mode_reg (Pmode, op2);
+
+      if (!insn_data[icode].operand[3].predicate (op3, mode3))
+	{
+	  error ("the forth argument must be scale 1, 2, 4, 8");
+	  return const0_rtx;
+	}
+
+      if (!insn_data[icode].operand[4].predicate (op4, mode4))
+	{
+	  error ("incorrect hint operand");
+	  return const0_rtx;
+	}
+
+      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
+      if (! pat)
+	return const0_rtx;
+
+      emit_insn (pat);
+
+      return 0;
+
+    case IX86_BUILTIN_XABORT:
+      icode = CODE_FOR_xabort;
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      op0 = expand_normal (arg0);
+      mode0 = insn_data[icode].operand[0].mode;
+      if (!insn_data[icode].operand[0].predicate (op0, mode0))
+	{
+	  error ("the xabort's argument must be an 8-bit immediate");
+	  return const0_rtx;
+	}
+      emit_insn (gen_xabort (op0));
+      return 0;
+
+    default:
+      break;
+    }
+
+  for (i = 0, d = bdesc_special_args;
+       i < ARRAY_SIZE (bdesc_special_args);
+       i++, d++)
+    if (d->code == fcode)
+      return ix86_expand_special_args_builtin (d, exp, target);
+
+  for (i = 0, d = bdesc_args;
+       i < ARRAY_SIZE (bdesc_args);
+       i++, d++)
+    if (d->code == fcode)
+      switch (fcode)
+	{
+	case IX86_BUILTIN_FABSQ:
+	case IX86_BUILTIN_COPYSIGNQ:
+	  if (!TARGET_SSE)
+	    /* Emit a normal call if SSE isn't available.  */
+	    return expand_call (exp, target, ignore);
+	default:
+	  return ix86_expand_args_builtin (d, exp, target);
+	}
+
+  for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
+    if (d->code == fcode)
+      return ix86_expand_sse_comi (d, exp, target);
+
+  for (i = 0, d = bdesc_round_args; i < ARRAY_SIZE (bdesc_round_args); i++, d++)
+    if (d->code == fcode)
+      return ix86_expand_round_builtin (d, exp, target);
+
+  for (i = 0, d = bdesc_pcmpestr;
+       i < ARRAY_SIZE (bdesc_pcmpestr);
+       i++, d++)
+    if (d->code == fcode)
+      return ix86_expand_sse_pcmpestr (d, exp, target);
+
+  for (i = 0, d = bdesc_pcmpistr;
+       i < ARRAY_SIZE (bdesc_pcmpistr);
+       i++, d++)
+    if (d->code == fcode)
+      return ix86_expand_sse_pcmpistr (d, exp, target);
+
+  for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
+    if (d->code == fcode)
+      return ix86_expand_multi_arg_builtin (d->icode, exp, target,
+					    (enum ix86_builtin_func_type)
+					    d->flag, d->comparison);
+
+  gcc_unreachable ();
+}
+
+/* This returns the target-specific builtin with code CODE if
+   current_function_decl has visibility on this builtin, which is checked
+   using isa flags.  Returns NULL_TREE otherwise.  */
+
+static tree ix86_get_builtin (enum ix86_builtins code)
+{
+  struct cl_target_option *opts;
+  tree target_tree = NULL_TREE;
+
+  /* Determine the isa flags of current_function_decl.  */
+
+  if (current_function_decl)
+    target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
+
+  if (target_tree == NULL)
+    target_tree = target_option_default_node;
+
+  opts = TREE_TARGET_OPTION (target_tree);
+
+  if (ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
+    return ix86_builtin_decl (code, true);
+  else
+    return NULL_TREE;
+}
+
+/* Returns a function decl for a vectorized version of the builtin function
+   with builtin function code FN and the result vector type TYPE, or NULL_TREE
+   if it is not available.  */
+
+static tree
+ix86_builtin_vectorized_function (tree fndecl, tree type_out,
+				  tree type_in)
+{
+  enum machine_mode in_mode, out_mode;
+  int in_n, out_n;
+  enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
+
+  if (TREE_CODE (type_out) != VECTOR_TYPE
+      || TREE_CODE (type_in) != VECTOR_TYPE
+      || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
+    return NULL_TREE;
+
+  out_mode = TYPE_MODE (TREE_TYPE (type_out));
+  out_n = TYPE_VECTOR_SUBPARTS (type_out);
+  in_mode = TYPE_MODE (TREE_TYPE (type_in));
+  in_n = TYPE_VECTOR_SUBPARTS (type_in);
+
+  switch (fn)
+    {
+    case BUILT_IN_SQRT:
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_SQRTPD);
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_SQRTPD256);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_SQRTPD512);
+	}
+      break;
+
+    case BUILT_IN_EXP2F:
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
+	}
+      break;
+
+    case BUILT_IN_SQRTF:
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR256);
+	  else if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_SQRTPS_NR512);
+	}
+      break;
+
+    case BUILT_IN_IFLOOR:
+    case BUILT_IN_LFLOOR:
+    case BUILT_IN_LLFLOOR:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == DFmode)
+	{
+	  if (out_n == 4 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
+	  else if (out_n == 8 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
+	  else if (out_n == 16 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
+	}
+      break;
+
+    case BUILT_IN_IFLOORF:
+    case BUILT_IN_LFLOORF:
+    case BUILT_IN_LLFLOORF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
+	}
+      break;
+
+    case BUILT_IN_ICEIL:
+    case BUILT_IN_LCEIL:
+    case BUILT_IN_LLCEIL:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == DFmode)
+	{
+	  if (out_n == 4 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
+	  else if (out_n == 8 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
+	  else if (out_n == 16 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
+	}
+      break;
+
+    case BUILT_IN_ICEILF:
+    case BUILT_IN_LCEILF:
+    case BUILT_IN_LLCEILF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
+	}
+      break;
+
+    case BUILT_IN_IRINT:
+    case BUILT_IN_LRINT:
+    case BUILT_IN_LLRINT:
+      if (out_mode == SImode && in_mode == DFmode)
+	{
+	  if (out_n == 4 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
+	  else if (out_n == 8 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
+	}
+      break;
+
+    case BUILT_IN_IRINTF:
+    case BUILT_IN_LRINTF:
+    case BUILT_IN_LLRINTF:
+      if (out_mode == SImode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
+	}
+      break;
+
+    case BUILT_IN_IROUND:
+    case BUILT_IN_LROUND:
+    case BUILT_IN_LLROUND:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == DFmode)
+	{
+	  if (out_n == 4 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
+	  else if (out_n == 8 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
+	  else if (out_n == 16 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
+	}
+      break;
+
+    case BUILT_IN_IROUNDF:
+    case BUILT_IN_LROUNDF:
+    case BUILT_IN_LLROUNDF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SImode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
+	}
+      break;
+
+    case BUILT_IN_COPYSIGN:
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD);
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD256);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPD512);
+	}
+      break;
+
+    case BUILT_IN_COPYSIGNF:
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS256);
+	  else if (out_n == 16 && in_n == 16)
+	    return ix86_get_builtin (IX86_BUILTIN_CPYSGNPS512);
+	}
+      break;
+
+    case BUILT_IN_FLOOR:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
+	}
+      break;
+
+    case BUILT_IN_FLOORF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
+	}
+      break;
+
+    case BUILT_IN_CEIL:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPD);
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
+	}
+      break;
+
+    case BUILT_IN_CEILF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPS);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
+	}
+      break;
+
+    case BUILT_IN_TRUNC:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
+	}
+      break;
+
+    case BUILT_IN_TRUNCF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
+	}
+      break;
+
+    case BUILT_IN_RINT:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_RINTPD);
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
+	}
+      break;
+
+    case BUILT_IN_RINTF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_RINTPS);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
+	}
+      break;
+
+    case BUILT_IN_ROUND:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ);
+	  else if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ256);
+	}
+      break;
+
+    case BUILT_IN_ROUNDF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+	break;
+
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ);
+	  else if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ256);
+	}
+      break;
+
+    case BUILT_IN_FMA:
+      if (out_mode == DFmode && in_mode == DFmode)
+	{
+	  if (out_n == 2 && in_n == 2)
+	    return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
+	}
+      break;
+
+    case BUILT_IN_FMAF:
+      if (out_mode == SFmode && in_mode == SFmode)
+	{
+	  if (out_n == 4 && in_n == 4)
+	    return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
+	  if (out_n == 8 && in_n == 8)
+	    return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
+	}
+      break;
+
+    default:
+      break;
+    }
+
+  /* Dispatch to a handler for a vectorization library.  */
+  if (ix86_veclib_handler)
+    return ix86_veclib_handler ((enum built_in_function) fn, type_out,
+				type_in);
+
+  return NULL_TREE;
+}
+
+/* Handler for an SVML-style interface to
+   a library with vectorized intrinsics.  */
+
+static tree
+ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
+{
+  char name[20];
+  tree fntype, new_fndecl, args;
+  unsigned arity;
+  const char *bname;
+  enum machine_mode el_mode, in_mode;
+  int n, in_n;
+
+  /* The SVML is suitable for unsafe math only.  */
+  if (!flag_unsafe_math_optimizations)
+    return NULL_TREE;
+
+  el_mode = TYPE_MODE (TREE_TYPE (type_out));
+  n = TYPE_VECTOR_SUBPARTS (type_out);
+  in_mode = TYPE_MODE (TREE_TYPE (type_in));
+  in_n = TYPE_VECTOR_SUBPARTS (type_in);
+  if (el_mode != in_mode
+      || n != in_n)
+    return NULL_TREE;
+
+  switch (fn)
+    {
+    case BUILT_IN_EXP:
+    case BUILT_IN_LOG:
+    case BUILT_IN_LOG10:
+    case BUILT_IN_POW:
+    case BUILT_IN_TANH:
+    case BUILT_IN_TAN:
+    case BUILT_IN_ATAN:
+    case BUILT_IN_ATAN2:
+    case BUILT_IN_ATANH:
+    case BUILT_IN_CBRT:
+    case BUILT_IN_SINH:
+    case BUILT_IN_SIN:
+    case BUILT_IN_ASINH:
+    case BUILT_IN_ASIN:
+    case BUILT_IN_COSH:
+    case BUILT_IN_COS:
+    case BUILT_IN_ACOSH:
+    case BUILT_IN_ACOS:
+      if (el_mode != DFmode || n != 2)
+	return NULL_TREE;
+      break;
+
+    case BUILT_IN_EXPF:
+    case BUILT_IN_LOGF:
+    case BUILT_IN_LOG10F:
+    case BUILT_IN_POWF:
+    case BUILT_IN_TANHF:
+    case BUILT_IN_TANF:
+    case BUILT_IN_ATANF:
+    case BUILT_IN_ATAN2F:
+    case BUILT_IN_ATANHF:
+    case BUILT_IN_CBRTF:
+    case BUILT_IN_SINHF:
+    case BUILT_IN_SINF:
+    case BUILT_IN_ASINHF:
+    case BUILT_IN_ASINF:
+    case BUILT_IN_COSHF:
+    case BUILT_IN_COSF:
+    case BUILT_IN_ACOSHF:
+    case BUILT_IN_ACOSF:
+      if (el_mode != SFmode || n != 4)
+	return NULL_TREE;
+      break;
+
+    default:
+      return NULL_TREE;
+    }
+
+  bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
+
+  if (fn == BUILT_IN_LOGF)
+    strcpy (name, "vmlsLn4");
+  else if (fn == BUILT_IN_LOG)
+    strcpy (name, "vmldLn2");
+  else if (n == 4)
+    {
+      sprintf (name, "vmls%s", bname+10);
+      name[strlen (name)-1] = '4';
+    }
+  else
+    sprintf (name, "vmld%s2", bname+10);
+
+  /* Convert to uppercase. */
+  name[4] &= ~0x20;
+
+  arity = 0;
+  for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
+       args;
+       args = TREE_CHAIN (args))
+    arity++;
+
+  if (arity == 1)
+    fntype = build_function_type_list (type_out, type_in, NULL);
+  else
+    fntype = build_function_type_list (type_out, type_in, type_in, NULL);
+
+  /* Build a function declaration for the vectorized function.  */
+  new_fndecl = build_decl (BUILTINS_LOCATION,
+			   FUNCTION_DECL, get_identifier (name), fntype);
+  TREE_PUBLIC (new_fndecl) = 1;
+  DECL_EXTERNAL (new_fndecl) = 1;
+  DECL_IS_NOVOPS (new_fndecl) = 1;
+  TREE_READONLY (new_fndecl) = 1;
+
+  return new_fndecl;
+}
+
+/* Handler for an ACML-style interface to
+   a library with vectorized intrinsics.  */
+
+static tree
+ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
+{
+  char name[20] = "__vr.._";
+  tree fntype, new_fndecl, args;
+  unsigned arity;
+  const char *bname;
+  enum machine_mode el_mode, in_mode;
+  int n, in_n;
+
+  /* The ACML is 64bits only and suitable for unsafe math only as
+     it does not correctly support parts of IEEE with the required
+     precision such as denormals.  */
+  if (!TARGET_64BIT
+      || !flag_unsafe_math_optimizations)
+    return NULL_TREE;
+
+  el_mode = TYPE_MODE (TREE_TYPE (type_out));
+  n = TYPE_VECTOR_SUBPARTS (type_out);
+  in_mode = TYPE_MODE (TREE_TYPE (type_in));
+  in_n = TYPE_VECTOR_SUBPARTS (type_in);
+  if (el_mode != in_mode
+      || n != in_n)
+    return NULL_TREE;
+
+  switch (fn)
+    {
+    case BUILT_IN_SIN:
+    case BUILT_IN_COS:
+    case BUILT_IN_EXP:
+    case BUILT_IN_LOG:
+    case BUILT_IN_LOG2:
+    case BUILT_IN_LOG10:
+      name[4] = 'd';
+      name[5] = '2';
+      if (el_mode != DFmode
+	  || n != 2)
+	return NULL_TREE;
+      break;
+
+    case BUILT_IN_SINF:
+    case BUILT_IN_COSF:
+    case BUILT_IN_EXPF:
+    case BUILT_IN_POWF:
+    case BUILT_IN_LOGF:
+    case BUILT_IN_LOG2F:
+    case BUILT_IN_LOG10F:
+      name[4] = 's';
+      name[5] = '4';
+      if (el_mode != SFmode
+	  || n != 4)
+	return NULL_TREE;
+      break;
+
+    default:
+      return NULL_TREE;
+    }
+
+  bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
+  sprintf (name + 7, "%s", bname+10);
+
+  arity = 0;
+  for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
+       args;
+       args = TREE_CHAIN (args))
+    arity++;
+
+  if (arity == 1)
+    fntype = build_function_type_list (type_out, type_in, NULL);
+  else
+    fntype = build_function_type_list (type_out, type_in, type_in, NULL);
+
+  /* Build a function declaration for the vectorized function.  */
+  new_fndecl = build_decl (BUILTINS_LOCATION,
+			   FUNCTION_DECL, get_identifier (name), fntype);
+  TREE_PUBLIC (new_fndecl) = 1;
+  DECL_EXTERNAL (new_fndecl) = 1;
+  DECL_IS_NOVOPS (new_fndecl) = 1;
+  TREE_READONLY (new_fndecl) = 1;
+
+  return new_fndecl;
+}
+
+/* Returns a decl of a function that implements gather load with
+   memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
+   Return NULL_TREE if it is not available.  */
+
+static tree
+ix86_vectorize_builtin_gather (const_tree mem_vectype,
+			       const_tree index_type, int scale)
+{
+  bool si;
+  enum ix86_builtins code;
+
+  if (! TARGET_AVX2)
+    return NULL_TREE;
+
+  if ((TREE_CODE (index_type) != INTEGER_TYPE
+       && !POINTER_TYPE_P (index_type))
+      || (TYPE_MODE (index_type) != SImode
+	  && TYPE_MODE (index_type) != DImode))
+    return NULL_TREE;
+
+  if (TYPE_PRECISION (index_type) > POINTER_SIZE)
+    return NULL_TREE;
+
+  /* v*gather* insn sign extends index to pointer mode.  */
+  if (TYPE_PRECISION (index_type) < POINTER_SIZE
+      && TYPE_UNSIGNED (index_type))
+    return NULL_TREE;
+
+  if (scale <= 0
+      || scale > 8
+      || (scale & (scale - 1)) != 0)
+    return NULL_TREE;
+
+  si = TYPE_MODE (index_type) == SImode;
+  switch (TYPE_MODE (mem_vectype))
+    {
+    case V2DFmode:
+      code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
+      break;
+    case V4DFmode:
+      code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
+      break;
+    case V2DImode:
+      code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
+      break;
+    case V4DImode:
+      code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
+      break;
+    case V4SFmode:
+      code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
+      break;
+    case V8SFmode:
+      code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
+      break;
+    case V4SImode:
+      code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
+      break;
+    case V8SImode:
+      code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
+      break;
+    case V8DFmode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
+      else
+	return NULL_TREE;
+      break;
+    case V8DImode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
+      else
+	return NULL_TREE;
+      break;
+    case V16SFmode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
+      else
+	return NULL_TREE;
+      break;
+    case V16SImode:
+      if (TARGET_AVX512F)
+	code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
+      else
+	return NULL_TREE;
+      break;
+    default:
+      return NULL_TREE;
+    }
+
+  return ix86_get_builtin (code);
+}
+
+/* Returns a code for a target-specific builtin that implements
+   reciprocal of the function, or NULL_TREE if not available.  */
+
+static tree
+ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
+			 bool sqrt ATTRIBUTE_UNUSED)
+{
+  if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
+	 && flag_finite_math_only && !flag_trapping_math
+	 && flag_unsafe_math_optimizations))
+    return NULL_TREE;
+
+  if (md_fn)
+    /* Machine dependent builtins.  */
+    switch (fn)
+      {
+	/* Vectorized version of sqrt to rsqrt conversion.  */
+      case IX86_BUILTIN_SQRTPS_NR:
+	return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
+
+      case IX86_BUILTIN_SQRTPS_NR256:
+	return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
+
+      default:
+	return NULL_TREE;
+      }
+  else
+    /* Normal builtins.  */
+    switch (fn)
+      {
+	/* Sqrt to rsqrt conversion.  */
+      case BUILT_IN_SQRTF:
+	return ix86_get_builtin (IX86_BUILTIN_RSQRTF);
+
+      default:
+	return NULL_TREE;
+      }
+}
+
+/* Helper for avx_vpermilps256_operand et al.  This is also used by
+   the expansion functions to turn the parallel back into a mask.
+   The return value is 0 for no match and the imm8+1 for a match.  */
+
+int
+avx_vpermilp_parallel (rtx par, enum machine_mode mode)
+{
+  unsigned i, nelt = GET_MODE_NUNITS (mode);
+  unsigned mask = 0;
+  unsigned char ipar[16] = {};  /* Silence -Wuninitialized warning.  */
+
+  if (XVECLEN (par, 0) != (int) nelt)
+    return 0;
+
+  /* Validate that all of the elements are constants, and not totally
+     out of range.  Copy the data into an integral array to make the
+     subsequent checks easier.  */
+  for (i = 0; i < nelt; ++i)
+    {
+      rtx er = XVECEXP (par, 0, i);
+      unsigned HOST_WIDE_INT ei;
+
+      if (!CONST_INT_P (er))
+	return 0;
+      ei = INTVAL (er);
+      if (ei >= nelt)
+	return 0;
+      ipar[i] = ei;
+    }
+
+  switch (mode)
+    {
+    case V8DFmode:
+      /* In the 512-bit DFmode case, we can only move elements within
+         a 128-bit lane.  First fill the second part of the mask,
+	 then fallthru.  */
+      for (i = 4; i < 6; ++i)
+	{
+	  if (ipar[i] < 4 || ipar[i] >= 6)
+	    return 0;
+	  mask |= (ipar[i] - 4) << i;
+	}
+      for (i = 6; i < 8; ++i)
+	{
+	  if (ipar[i] < 6)
+	    return 0;
+	  mask |= (ipar[i] - 6) << i;
+	}
+      /* FALLTHRU */
+
+    case V4DFmode:
+      /* In the 256-bit DFmode case, we can only move elements within
+         a 128-bit lane.  */
+      for (i = 0; i < 2; ++i)
+	{
+	  if (ipar[i] >= 2)
+	    return 0;
+	  mask |= ipar[i] << i;
+	}
+      for (i = 2; i < 4; ++i)
+	{
+	  if (ipar[i] < 2)
+	    return 0;
+	  mask |= (ipar[i] - 2) << i;
+	}
+      break;
+
+    case V16SFmode:
+      /* In 512 bit SFmode case, permutation in the upper 256 bits
+	 must mirror the permutation in the lower 256-bits.  */
+      for (i = 0; i < 8; ++i)
+	if (ipar[i] + 8 != ipar[i + 8])
+	  return 0;
+      /* FALLTHRU */
+
+    case V8SFmode:
+      /* In 256 bit SFmode case, we have full freedom of
+         movement within the low 128-bit lane, but the high 128-bit
+         lane must mirror the exact same pattern.  */
+      for (i = 0; i < 4; ++i)
+	if (ipar[i] + 4 != ipar[i + 4])
+	  return 0;
+      nelt = 4;
+      /* FALLTHRU */
+
+    case V2DFmode:
+    case V4SFmode:
+      /* In the 128-bit case, we've full freedom in the placement of
+	 the elements from the source operand.  */
+      for (i = 0; i < nelt; ++i)
+	mask |= ipar[i] << (i * (nelt / 2));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Make sure success has a non-zero value by adding one.  */
+  return mask + 1;
+}
+
+/* Helper for avx_vperm2f128_v4df_operand et al.  This is also used by
+   the expansion functions to turn the parallel back into a mask.
+   The return value is 0 for no match and the imm8+1 for a match.  */
+
+int
+avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
+{
+  unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
+  unsigned mask = 0;
+  unsigned char ipar[8] = {};  /* Silence -Wuninitialized warning.  */
+
+  if (XVECLEN (par, 0) != (int) nelt)
+    return 0;
+
+  /* Validate that all of the elements are constants, and not totally
+     out of range.  Copy the data into an integral array to make the
+     subsequent checks easier.  */
+  for (i = 0; i < nelt; ++i)
+    {
+      rtx er = XVECEXP (par, 0, i);
+      unsigned HOST_WIDE_INT ei;
+
+      if (!CONST_INT_P (er))
+	return 0;
+      ei = INTVAL (er);
+      if (ei >= 2 * nelt)
+	return 0;
+      ipar[i] = ei;
+    }
+
+  /* Validate that the halves of the permute are halves.  */
+  for (i = 0; i < nelt2 - 1; ++i)
+    if (ipar[i] + 1 != ipar[i + 1])
+      return 0;
+  for (i = nelt2; i < nelt - 1; ++i)
+    if (ipar[i] + 1 != ipar[i + 1])
+      return 0;
+
+  /* Reconstruct the mask.  */
+  for (i = 0; i < 2; ++i)
+    {
+      unsigned e = ipar[i * nelt2];
+      if (e % nelt2)
+	return 0;
+      e /= nelt2;
+      mask |= e << (i * 4);
+    }
+
+  /* Make sure success has a non-zero value by adding one.  */
+  return mask + 1;
+}
+
+/* Return a register priority for hard reg REGNO.  */
+static int
+ix86_register_priority (int hard_regno)
+{
+  /* ebp and r13 as the base always wants a displacement, r12 as the
+     base always wants an index.  So discourage their usage in an
+     address.  */
+  if (hard_regno == R12_REG || hard_regno == R13_REG)
+    return 0;
+  if (hard_regno == BP_REG)
+    return 1;
+  /* New x86-64 int registers result in bigger code size.  Discourage
+     them.  */
+  if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
+    return 2;
+  /* New x86-64 SSE registers result in bigger code size.  Discourage
+     them.  */
+  if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
+    return 2;
+  /* Usage of AX register results in smaller code.  Prefer it.  */
+  if (hard_regno == 0)
+    return 4;
+  return 3;
+}
+
+/* Implement TARGET_PREFERRED_RELOAD_CLASS.
+
+   Put float CONST_DOUBLE in the constant pool instead of fp regs.
+   QImode must go into class Q_REGS.
+   Narrow ALL_REGS to GENERAL_REGS.  This supports allowing movsf and
+   movdf to do mem-to-mem moves through integer regs.  */
+
+static reg_class_t
+ix86_preferred_reload_class (rtx x, reg_class_t regclass)
+{
+  enum machine_mode mode = GET_MODE (x);
+
+  /* We're only allowed to return a subclass of CLASS.  Many of the
+     following checks fail for NO_REGS, so eliminate that early.  */
+  if (regclass == NO_REGS)
+    return NO_REGS;
+
+  /* All classes can load zeros.  */
+  if (x == CONST0_RTX (mode))
+    return regclass;
+
+  /* Force constants into memory if we are loading a (nonzero) constant into
+     an MMX, SSE or MASK register.  This is because there are no MMX/SSE/MASK
+     instructions to load from a constant.  */
+  if (CONSTANT_P (x)
+      && (MAYBE_MMX_CLASS_P (regclass)
+	  || MAYBE_SSE_CLASS_P (regclass)
+	  || MAYBE_MASK_CLASS_P (regclass)))
+    return NO_REGS;
+
+  /* Prefer SSE regs only, if we can use them for math.  */
+  if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
+    return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
+
+  /* Floating-point constants need more complex checks.  */
+  if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
+    {
+      /* General regs can load everything.  */
+      if (reg_class_subset_p (regclass, GENERAL_REGS))
+        return regclass;
+
+      /* Floats can load 0 and 1 plus some others.  Note that we eliminated
+	 zero above.  We only want to wind up preferring 80387 registers if
+	 we plan on doing computation with them.  */
+      if (TARGET_80387
+	  && standard_80387_constant_p (x) > 0)
+	{
+	  /* Limit class to non-sse.  */
+	  if (regclass == FLOAT_SSE_REGS)
+	    return FLOAT_REGS;
+	  if (regclass == FP_TOP_SSE_REGS)
+	    return FP_TOP_REG;
+	  if (regclass == FP_SECOND_SSE_REGS)
+	    return FP_SECOND_REG;
+	  if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
+	    return regclass;
+	}
+
+      return NO_REGS;
+    }
+
+  /* Generally when we see PLUS here, it's the function invariant
+     (plus soft-fp const_int).  Which can only be computed into general
+     regs.  */
+  if (GET_CODE (x) == PLUS)
+    return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
+
+  /* QImode constants are easy to load, but non-constant QImode data
+     must go into Q_REGS.  */
+  if (GET_MODE (x) == QImode && !CONSTANT_P (x))
+    {
+      if (reg_class_subset_p (regclass, Q_REGS))
+	return regclass;
+      if (reg_class_subset_p (Q_REGS, regclass))
+	return Q_REGS;
+      return NO_REGS;
+    }
+
+  return regclass;
+}
+
+/* Discourage putting floating-point values in SSE registers unless
+   SSE math is being used, and likewise for the 387 registers.  */
+static reg_class_t
+ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
+{
+  enum machine_mode mode = GET_MODE (x);
+
+  /* Restrict the output reload class to the register bank that we are doing
+     math on.  If we would like not to return a subset of CLASS, reject this
+     alternative: if reload cannot do this, it will still use its choice.  */
+  mode = GET_MODE (x);
+  if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
+    return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
+
+  if (X87_FLOAT_MODE_P (mode))
+    {
+      if (regclass == FP_TOP_SSE_REGS)
+	return FP_TOP_REG;
+      else if (regclass == FP_SECOND_SSE_REGS)
+	return FP_SECOND_REG;
+      else
+	return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
+    }
+
+  return regclass;
+}
+
+static reg_class_t
+ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
+		       enum machine_mode mode, secondary_reload_info *sri)
+{
+  /* Double-word spills from general registers to non-offsettable memory
+     references (zero-extended addresses) require special handling.  */
+  if (TARGET_64BIT
+      && MEM_P (x)
+      && GET_MODE_SIZE (mode) > UNITS_PER_WORD
+      && INTEGER_CLASS_P (rclass)
+      && !offsettable_memref_p (x))
+    {
+      sri->icode = (in_p
+		    ? CODE_FOR_reload_noff_load
+		    : CODE_FOR_reload_noff_store);
+      /* Add the cost of moving address to a temporary.  */
+      sri->extra_cost = 1;
+
+      return NO_REGS;
+    }
+
+  /* QImode spills from non-QI registers require
+     intermediate register on 32bit targets.  */
+  if (mode == QImode
+      && (MAYBE_MASK_CLASS_P (rclass)
+	  || (!TARGET_64BIT && !in_p
+	      && INTEGER_CLASS_P (rclass)
+	      && MAYBE_NON_Q_CLASS_P (rclass))))
+    {
+      int regno;
+
+      if (REG_P (x))
+	regno = REGNO (x);
+      else
+	regno = -1;
+
+      if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
+	regno = true_regnum (x);
+
+      /* Return Q_REGS if the operand is in memory.  */
+      if (regno == -1)
+	return Q_REGS;
+    }
+
+  /* This condition handles corner case where an expression involving
+     pointers gets vectorized.  We're trying to use the address of a
+     stack slot as a vector initializer.
+
+     (set (reg:V2DI 74 [ vect_cst_.2 ])
+          (vec_duplicate:V2DI (reg/f:DI 20 frame)))
+
+     Eventually frame gets turned into sp+offset like this:
+
+     (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
+          (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
+	                               (const_int 392 [0x188]))))
+
+     That later gets turned into:
+
+     (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
+          (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
+	    (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
+
+     We'll have the following reload recorded:
+
+     Reload 0: reload_in (DI) =
+           (plus:DI (reg/f:DI 7 sp)
+            (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
+     reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
+     SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
+     reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
+     reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
+     reload_reg_rtx: (reg:V2DI 22 xmm1)
+
+     Which isn't going to work since SSE instructions can't handle scalar
+     additions.  Returning GENERAL_REGS forces the addition into integer
+     register and reload can handle subsequent reloads without problems.  */
+
+  if (in_p && GET_CODE (x) == PLUS
+      && SSE_CLASS_P (rclass)
+      && SCALAR_INT_MODE_P (mode))
+    return GENERAL_REGS;
+
+  return NO_REGS;
+}
+
+/* Implement TARGET_CLASS_LIKELY_SPILLED_P.  */
+
+static bool
+ix86_class_likely_spilled_p (reg_class_t rclass)
+{
+  switch (rclass)
+    {
+      case AREG:
+      case DREG:
+      case CREG:
+      case BREG:
+      case AD_REGS:
+      case SIREG:
+      case DIREG:
+      case SSE_FIRST_REG:
+      case FP_TOP_REG:
+      case FP_SECOND_REG:
+	return true;
+
+      default:
+	break;
+    }
+
+  return false;
+}
+
+/* If we are copying between general and FP registers, we need a memory
+   location. The same is true for SSE and MMX registers.
+
+   To optimize register_move_cost performance, allow inline variant.
+
+   The macro can't work reliably when one of the CLASSES is class containing
+   registers from multiple units (SSE, MMX, integer).  We avoid this by never
+   combining those units in single alternative in the machine description.
+   Ensure that this constraint holds to avoid unexpected surprises.
+
+   When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
+   enforce these sanity checks.  */
+
+static inline bool
+inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
+				enum machine_mode mode, int strict)
+{
+  if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
+    return false;
+  if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
+      || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
+      || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
+      || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
+      || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
+      || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
+    {
+      gcc_assert (!strict || lra_in_progress);
+      return true;
+    }
+
+  if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
+    return true;
+
+  /* ??? This is a lie.  We do have moves between mmx/general, and for
+     mmx/sse2.  But by saying we need secondary memory we discourage the
+     register allocator from using the mmx registers unless needed.  */
+  if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
+    return true;
+
+  if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
+    {
+      /* SSE1 doesn't have any direct moves from other classes.  */
+      if (!TARGET_SSE2)
+	return true;
+
+      /* If the target says that inter-unit moves are more expensive
+	 than moving through memory, then don't generate them.  */
+      if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
+	  || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
+	return true;
+
+      /* Between SSE and general, we have moves no larger than word size.  */
+      if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+	return true;
+    }
+
+  return false;
+}
+
+bool
+ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
+			      enum machine_mode mode, int strict)
+{
+  return inline_secondary_memory_needed (class1, class2, mode, strict);
+}
+
+/* Implement the TARGET_CLASS_MAX_NREGS hook.
+
+   On the 80386, this is the size of MODE in words,
+   except in the FP regs, where a single reg is always enough.  */
+
+static unsigned char
+ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
+{
+  if (MAYBE_INTEGER_CLASS_P (rclass))
+    {
+      if (mode == XFmode)
+	return (TARGET_64BIT ? 2 : 3);
+      else if (mode == XCmode)
+	return (TARGET_64BIT ? 4 : 6);
+      else
+	return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
+    }
+  else
+    {
+      if (COMPLEX_MODE_P (mode))
+	return 2;
+      else
+	return 1;
+    }
+}
+
+/* Return true if the registers in CLASS cannot represent the change from
+   modes FROM to TO.  */
+
+bool
+ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
+			       enum reg_class regclass)
+{
+  if (from == to)
+    return false;
+
+  /* x87 registers can't do subreg at all, as all values are reformatted
+     to extended precision.  */
+  if (MAYBE_FLOAT_CLASS_P (regclass))
+    return true;
+
+  if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
+    {
+      /* Vector registers do not support QI or HImode loads.  If we don't
+	 disallow a change to these modes, reload will assume it's ok to
+	 drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
+	 the vec_dupv4hi pattern.  */
+      if (GET_MODE_SIZE (from) < 4)
+	return true;
+
+      /* Vector registers do not support subreg with nonzero offsets, which
+	 are otherwise valid for integer registers.  Since we can't see
+	 whether we have a nonzero offset from here, prohibit all
+         nonparadoxical subregs changing size.  */
+      if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
+	return true;
+    }
+
+  return false;
+}
+
+/* Return the cost of moving data of mode M between a
+   register and memory.  A value of 2 is the default; this cost is
+   relative to those in `REGISTER_MOVE_COST'.
+
+   This function is used extensively by register_move_cost that is used to
+   build tables at startup.  Make it inline in this case.
+   When IN is 2, return maximum of in and out move cost.
+
+   If moving between registers and memory is more expensive than
+   between two registers, you should define this macro to express the
+   relative cost.
+
+   Model also increased moving costs of QImode registers in non
+   Q_REGS classes.
+ */
+static inline int
+inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
+			 int in)
+{
+  int cost;
+  if (FLOAT_CLASS_P (regclass))
+    {
+      int index;
+      switch (mode)
+	{
+	  case SFmode:
+	    index = 0;
+	    break;
+	  case DFmode:
+	    index = 1;
+	    break;
+	  case XFmode:
+	    index = 2;
+	    break;
+	  default:
+	    return 100;
+	}
+      if (in == 2)
+        return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
+      return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
+    }
+  if (SSE_CLASS_P (regclass))
+    {
+      int index;
+      switch (GET_MODE_SIZE (mode))
+	{
+	  case 4:
+	    index = 0;
+	    break;
+	  case 8:
+	    index = 1;
+	    break;
+	  case 16:
+	    index = 2;
+	    break;
+	  default:
+	    return 100;
+	}
+      if (in == 2)
+        return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
+      return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
+    }
+  if (MMX_CLASS_P (regclass))
+    {
+      int index;
+      switch (GET_MODE_SIZE (mode))
+	{
+	  case 4:
+	    index = 0;
+	    break;
+	  case 8:
+	    index = 1;
+	    break;
+	  default:
+	    return 100;
+	}
+      if (in)
+        return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
+      return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
+    }
+  switch (GET_MODE_SIZE (mode))
+    {
+      case 1:
+	if (Q_CLASS_P (regclass) || TARGET_64BIT)
+	  {
+	    if (!in)
+	      return ix86_cost->int_store[0];
+	    if (TARGET_PARTIAL_REG_DEPENDENCY
+	        && optimize_function_for_speed_p (cfun))
+	      cost = ix86_cost->movzbl_load;
+	    else
+	      cost = ix86_cost->int_load[0];
+	    if (in == 2)
+	      return MAX (cost, ix86_cost->int_store[0]);
+	    return cost;
+	  }
+	else
+	  {
+	   if (in == 2)
+	     return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
+	   if (in)
+	     return ix86_cost->movzbl_load;
+	   else
+	     return ix86_cost->int_store[0] + 4;
+	  }
+	break;
+      case 2:
+	if (in == 2)
+	  return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
+	return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
+      default:
+	/* Compute number of 32bit moves needed.  TFmode is moved as XFmode.  */
+	if (mode == TFmode)
+	  mode = XFmode;
+	if (in == 2)
+	  cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
+	else if (in)
+	  cost = ix86_cost->int_load[2];
+	else
+	  cost = ix86_cost->int_store[2];
+	return (cost * (((int) GET_MODE_SIZE (mode)
+		        + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
+    }
+}
+
+static int
+ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
+		       bool in)
+{
+  return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
+}
+
+
+/* Return the cost of moving data from a register in class CLASS1 to
+   one in class CLASS2.
+
+   It is not required that the cost always equal 2 when FROM is the same as TO;
+   on some machines it is expensive to move between registers if they are not
+   general registers.  */
+
+static int
+ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
+			 reg_class_t class2_i)
+{
+  enum reg_class class1 = (enum reg_class) class1_i;
+  enum reg_class class2 = (enum reg_class) class2_i;
+
+  /* In case we require secondary memory, compute cost of the store followed
+     by load.  In order to avoid bad register allocation choices, we need
+     for this to be *at least* as high as the symmetric MEMORY_MOVE_COST.  */
+
+  if (inline_secondary_memory_needed (class1, class2, mode, 0))
+    {
+      int cost = 1;
+
+      cost += inline_memory_move_cost (mode, class1, 2);
+      cost += inline_memory_move_cost (mode, class2, 2);
+
+      /* In case of copying from general_purpose_register we may emit multiple
+         stores followed by single load causing memory size mismatch stall.
+         Count this as arbitrarily high cost of 20.  */
+      if (targetm.class_max_nregs (class1, mode)
+	  > targetm.class_max_nregs (class2, mode))
+	cost += 20;
+
+      /* In the case of FP/MMX moves, the registers actually overlap, and we
+	 have to switch modes in order to treat them differently.  */
+      if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
+          || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
+	cost += 20;
+
+      return cost;
+    }
+
+  /* Moves between SSE/MMX and integer unit are expensive.  */
+  if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
+      || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
+
+    /* ??? By keeping returned value relatively high, we limit the number
+       of moves between integer and MMX/SSE registers for all targets.
+       Additionally, high value prevents problem with x86_modes_tieable_p(),
+       where integer modes in MMX/SSE registers are not tieable
+       because of missing QImode and HImode moves to, from or between
+       MMX/SSE registers.  */
+    return MAX (8, ix86_cost->mmxsse_to_integer);
+
+  if (MAYBE_FLOAT_CLASS_P (class1))
+    return ix86_cost->fp_move;
+  if (MAYBE_SSE_CLASS_P (class1))
+    return ix86_cost->sse_move;
+  if (MAYBE_MMX_CLASS_P (class1))
+    return ix86_cost->mmx_move;
+  return 2;
+}
+
+/* Return TRUE if hard register REGNO can hold a value of machine-mode
+   MODE.  */
+
+bool
+ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
+{
+  /* Flags and only flags can only hold CCmode values.  */
+  if (CC_REGNO_P (regno))
+    return GET_MODE_CLASS (mode) == MODE_CC;
+  if (GET_MODE_CLASS (mode) == MODE_CC
+      || GET_MODE_CLASS (mode) == MODE_RANDOM
+      || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
+    return false;
+  if (STACK_REGNO_P (regno))
+    return VALID_FP_MODE_P (mode);
+  if (MASK_REGNO_P (regno))
+    return VALID_MASK_REG_MODE (mode);
+  if (SSE_REGNO_P (regno))
+    {
+      /* We implement the move patterns for all vector modes into and
+	 out of SSE registers, even when no operation instructions
+	 are available.  */
+
+      /* For AVX-512 we allow, regardless of regno:
+	  - XI mode
+	  - any of 512-bit wide vector mode
+	  - any scalar mode.  */
+      if (TARGET_AVX512F
+	  && (mode == XImode
+	      || VALID_AVX512F_REG_MODE (mode)
+	      || VALID_AVX512F_SCALAR_MODE (mode)))
+	return true;
+
+      /* xmm16-xmm31 are only available for AVX-512.  */
+      if (EXT_REX_SSE_REGNO_P (regno))
+	return false;
+
+      /* OImode and AVX modes are available only when AVX is enabled.  */
+      return ((TARGET_AVX
+	       && VALID_AVX256_REG_OR_OI_MODE (mode))
+	      || VALID_SSE_REG_MODE (mode)
+	      || VALID_SSE2_REG_MODE (mode)
+	      || VALID_MMX_REG_MODE (mode)
+	      || VALID_MMX_REG_MODE_3DNOW (mode));
+    }
+  if (MMX_REGNO_P (regno))
+    {
+      /* We implement the move patterns for 3DNOW modes even in MMX mode,
+	 so if the register is available at all, then we can move data of
+	 the given mode into or out of it.  */
+      return (VALID_MMX_REG_MODE (mode)
+	      || VALID_MMX_REG_MODE_3DNOW (mode));
+    }
+
+  if (mode == QImode)
+    {
+      /* Take care for QImode values - they can be in non-QI regs,
+	 but then they do cause partial register stalls.  */
+      if (ANY_QI_REGNO_P (regno))
+	return true;
+      if (!TARGET_PARTIAL_REG_STALL)
+	return true;
+      /* LRA checks if the hard register is OK for the given mode.
+	 QImode values can live in non-QI regs, so we allow all
+	 registers here.  */
+      if (lra_in_progress)
+       return true;
+      return !can_create_pseudo_p ();
+    }
+  /* We handle both integer and floats in the general purpose registers.  */
+  else if (VALID_INT_MODE_P (mode))
+    return true;
+  else if (VALID_FP_MODE_P (mode))
+    return true;
+  else if (VALID_DFP_MODE_P (mode))
+    return true;
+  /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go
+     on to use that value in smaller contexts, this can easily force a
+     pseudo to be allocated to GENERAL_REGS.  Since this is no worse than
+     supporting DImode, allow it.  */
+  else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
+    return true;
+
+  return false;
+}
+
+/* A subroutine of ix86_modes_tieable_p.  Return true if MODE is a
+   tieable integer mode.  */
+
+static bool
+ix86_tieable_integer_mode_p (enum machine_mode mode)
+{
+  switch (mode)
+    {
+    case HImode:
+    case SImode:
+      return true;
+
+    case QImode:
+      return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
+
+    case DImode:
+      return TARGET_64BIT;
+
+    default:
+      return false;
+    }
+}
+
+/* Return true if MODE1 is accessible in a register that can hold MODE2
+   without copying.  That is, all register classes that can hold MODE2
+   can also hold MODE1.  */
+
+bool
+ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
+{
+  if (mode1 == mode2)
+    return true;
+
+  if (ix86_tieable_integer_mode_p (mode1)
+      && ix86_tieable_integer_mode_p (mode2))
+    return true;
+
+  /* MODE2 being XFmode implies fp stack or general regs, which means we
+     can tie any smaller floating point modes to it.  Note that we do not
+     tie this with TFmode.  */
+  if (mode2 == XFmode)
+    return mode1 == SFmode || mode1 == DFmode;
+
+  /* MODE2 being DFmode implies fp stack, general or sse regs, which means
+     that we can tie it with SFmode.  */
+  if (mode2 == DFmode)
+    return mode1 == SFmode;
+
+  /* If MODE2 is only appropriate for an SSE register, then tie with
+     any other mode acceptable to SSE registers.  */
+  if (GET_MODE_SIZE (mode2) == 32
+      && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
+    return (GET_MODE_SIZE (mode1) == 32
+	    && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
+  if (GET_MODE_SIZE (mode2) == 16
+      && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
+    return (GET_MODE_SIZE (mode1) == 16
+	    && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
+
+  /* If MODE2 is appropriate for an MMX register, then tie
+     with any other mode acceptable to MMX registers.  */
+  if (GET_MODE_SIZE (mode2) == 8
+      && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
+    return (GET_MODE_SIZE (mode1) == 8
+	    && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
+
+  return false;
+}
+
+/* Return the cost of moving between two registers of mode MODE.  */
+
+static int
+ix86_set_reg_reg_cost (enum machine_mode mode)
+{
+  unsigned int units = UNITS_PER_WORD;
+
+  switch (GET_MODE_CLASS (mode))
+    {
+    default:
+      break;
+
+    case MODE_CC:
+      units = GET_MODE_SIZE (CCmode);
+      break;
+
+    case MODE_FLOAT:
+      if ((TARGET_SSE && mode == TFmode)
+	  || (TARGET_80387 && mode == XFmode)
+	  || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
+	  || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
+	units = GET_MODE_SIZE (mode);
+      break;
+
+    case MODE_COMPLEX_FLOAT:
+      if ((TARGET_SSE && mode == TCmode)
+	  || (TARGET_80387 && mode == XCmode)
+	  || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
+	  || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
+	units = GET_MODE_SIZE (mode);
+      break;
+
+    case MODE_VECTOR_INT:
+    case MODE_VECTOR_FLOAT:
+      if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
+	  || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
+	  || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
+	  || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
+	  || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
+	units = GET_MODE_SIZE (mode);
+    }
+
+  /* Return the cost of moving between two registers of mode MODE,
+     assuming that the move will be in pieces of at most UNITS bytes.  */
+  return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
+}
+
+/* Compute a (partial) cost for rtx X.  Return true if the complete
+   cost has been computed, and false if subexpressions should be
+   scanned.  In either case, *TOTAL contains the cost result.  */
+
+static bool
+ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
+		bool speed)
+{
+  rtx mask;
+  enum rtx_code code = (enum rtx_code) code_i;
+  enum rtx_code outer_code = (enum rtx_code) outer_code_i;
+  enum machine_mode mode = GET_MODE (x);
+  const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
+
+  switch (code)
+    {
+    case SET:
+      if (register_operand (SET_DEST (x), VOIDmode)
+	  && reg_or_0_operand (SET_SRC (x), VOIDmode))
+	{
+	  *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
+	  return true;
+	}
+      return false;
+
+    case CONST_INT:
+    case CONST:
+    case LABEL_REF:
+    case SYMBOL_REF:
+      if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
+	*total = 3;
+      else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
+	*total = 2;
+      else if (flag_pic && SYMBOLIC_CONST (x)
+	       && (!TARGET_64BIT
+		   || (!GET_CODE (x) != LABEL_REF
+		       && (GET_CODE (x) != SYMBOL_REF
+		           || !SYMBOL_REF_LOCAL_P (x)))))
+	*total = 1;
+      else
+	*total = 0;
+      return true;
+
+    case CONST_DOUBLE:
+      if (mode == VOIDmode)
+	{
+	  *total = 0;
+	  return true;
+	}
+      switch (standard_80387_constant_p (x))
+	{
+	case 1: /* 0.0 */
+	  *total = 1;
+	  return true;
+	default: /* Other constants */
+	  *total = 2;
+	  return true;
+	case 0:
+	case -1:
+	  break;
+	}
+      if (SSE_FLOAT_MODE_P (mode))
+	{
+    case CONST_VECTOR:
+	  switch (standard_sse_constant_p (x))
+	    {
+	    case 0:
+	      break;
+	    case 1:  /* 0: xor eliminates false dependency */
+	      *total = 0;
+	      return true;
+	    default: /* -1: cmp contains false dependency */
+	      *total = 1;
+	      return true;
+	    }
+	}
+      /* Fall back to (MEM (SYMBOL_REF)), since that's where
+	 it'll probably end up.  Add a penalty for size.  */
+      *total = (COSTS_N_INSNS (1)
+		+ (flag_pic != 0 && !TARGET_64BIT)
+		+ (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
+      return true;
+
+    case ZERO_EXTEND:
+      /* The zero extensions is often completely free on x86_64, so make
+	 it as cheap as possible.  */
+      if (TARGET_64BIT && mode == DImode
+	  && GET_MODE (XEXP (x, 0)) == SImode)
+	*total = 1;
+      else if (TARGET_ZERO_EXTEND_WITH_AND)
+	*total = cost->add;
+      else
+	*total = cost->movzx;
+      return false;
+
+    case SIGN_EXTEND:
+      *total = cost->movsx;
+      return false;
+
+    case ASHIFT:
+      if (SCALAR_INT_MODE_P (mode)
+	  && GET_MODE_SIZE (mode) < UNITS_PER_WORD
+	  && CONST_INT_P (XEXP (x, 1)))
+	{
+	  HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
+	  if (value == 1)
+	    {
+	      *total = cost->add;
+	      return false;
+	    }
+	  if ((value == 2 || value == 3)
+	      && cost->lea <= cost->shift_const)
+	    {
+	      *total = cost->lea;
+	      return false;
+	    }
+	}
+      /* FALLTHRU */
+
+    case ROTATE:
+    case ASHIFTRT:
+    case LSHIFTRT:
+    case ROTATERT:
+      if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+	{
+	  /* ??? Should be SSE vector operation cost.  */
+	  /* At least for published AMD latencies, this really is the same
+	     as the latency for a simple fpu operation like fabs.  */
+	  /* V*QImode is emulated with 1-11 insns.  */
+	  if (mode == V16QImode || mode == V32QImode)
+	    {
+	      int count = 11;
+	      if (TARGET_XOP && mode == V16QImode)
+		{
+		  /* For XOP we use vpshab, which requires a broadcast of the
+		     value to the variable shift insn.  For constants this
+		     means a V16Q const in mem; even when we can perform the
+		     shift with one insn set the cost to prefer paddb.  */
+		  if (CONSTANT_P (XEXP (x, 1)))
+		    {
+		      *total = (cost->fabs
+				+ rtx_cost (XEXP (x, 0), code, 0, speed)
+				+ (speed ? 2 : COSTS_N_BYTES (16)));
+		      return true;
+		    }
+		  count = 3;
+		}
+	      else if (TARGET_SSSE3)
+		count = 7;
+	      *total = cost->fabs * count;
+	    }
+	  else
+	    *total = cost->fabs;
+	}
+      else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+	{
+	  if (CONST_INT_P (XEXP (x, 1)))
+	    {
+	      if (INTVAL (XEXP (x, 1)) > 32)
+		*total = cost->shift_const + COSTS_N_INSNS (2);
+	      else
+		*total = cost->shift_const * 2;
+	    }
+	  else
+	    {
+	      if (GET_CODE (XEXP (x, 1)) == AND)
+		*total = cost->shift_var * 2;
+	      else
+		*total = cost->shift_var * 6 + COSTS_N_INSNS (2);
+	    }
+	}
+      else
+	{
+	  if (CONST_INT_P (XEXP (x, 1)))
+	    *total = cost->shift_const;
+	  else if (GET_CODE (XEXP (x, 1)) == SUBREG
+		   && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
+	    {
+	      /* Return the cost after shift-and truncation.  */
+	      *total = cost->shift_var;
+	      return true;
+	    }
+	  else
+	    *total = cost->shift_var;
+	}
+      return false;
+
+    case FMA:
+      {
+	rtx sub;
+
+        gcc_assert (FLOAT_MODE_P (mode));
+        gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
+
+        /* ??? SSE scalar/vector cost should be used here.  */
+        /* ??? Bald assumption that fma has the same cost as fmul.  */
+        *total = cost->fmul;
+	*total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
+
+        /* Negate in op0 or op2 is free: FMS, FNMA, FNMS.  */
+	sub = XEXP (x, 0);
+	if (GET_CODE (sub) == NEG)
+	  sub = XEXP (sub, 0);
+	*total += rtx_cost (sub, FMA, 0, speed);
+
+	sub = XEXP (x, 2);
+	if (GET_CODE (sub) == NEG)
+	  sub = XEXP (sub, 0);
+	*total += rtx_cost (sub, FMA, 2, speed);
+	return true;
+      }
+
+    case MULT:
+      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+	{
+	  /* ??? SSE scalar cost should be used here.  */
+	  *total = cost->fmul;
+	  return false;
+	}
+      else if (X87_FLOAT_MODE_P (mode))
+	{
+	  *total = cost->fmul;
+	  return false;
+	}
+      else if (FLOAT_MODE_P (mode))
+	{
+	  /* ??? SSE vector cost should be used here.  */
+	  *total = cost->fmul;
+	  return false;
+	}
+      else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+	{
+	  /* V*QImode is emulated with 7-13 insns.  */
+	  if (mode == V16QImode || mode == V32QImode)
+	    {
+	      int extra = 11;
+	      if (TARGET_XOP && mode == V16QImode)
+		extra = 5;
+	      else if (TARGET_SSSE3)
+		extra = 6;
+	      *total = cost->fmul * 2 + cost->fabs * extra;
+	    }
+	  /* V*DImode is emulated with 5-8 insns.  */
+	  else if (mode == V2DImode || mode == V4DImode)
+	    {
+	      if (TARGET_XOP && mode == V2DImode)
+		*total = cost->fmul * 2 + cost->fabs * 3;
+	      else
+		*total = cost->fmul * 3 + cost->fabs * 5;
+	    }
+	  /* Without sse4.1, we don't have PMULLD; it's emulated with 7
+	     insns, including two PMULUDQ.  */
+	  else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+	    *total = cost->fmul * 2 + cost->fabs * 5;
+	  else
+	    *total = cost->fmul;
+	  return false;
+	}
+      else
+	{
+	  rtx op0 = XEXP (x, 0);
+	  rtx op1 = XEXP (x, 1);
+	  int nbits;
+	  if (CONST_INT_P (XEXP (x, 1)))
+	    {
+	      unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
+	      for (nbits = 0; value != 0; value &= value - 1)
+	        nbits++;
+	    }
+	  else
+	    /* This is arbitrary.  */
+	    nbits = 7;
+
+	  /* Compute costs correctly for widening multiplication.  */
+	  if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
+	      && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
+	         == GET_MODE_SIZE (mode))
+	    {
+	      int is_mulwiden = 0;
+	      enum machine_mode inner_mode = GET_MODE (op0);
+
+	      if (GET_CODE (op0) == GET_CODE (op1))
+		is_mulwiden = 1, op1 = XEXP (op1, 0);
+	      else if (CONST_INT_P (op1))
+		{
+		  if (GET_CODE (op0) == SIGN_EXTEND)
+		    is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
+			          == INTVAL (op1);
+		  else
+		    is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
+	        }
+
+	      if (is_mulwiden)
+	        op0 = XEXP (op0, 0), mode = GET_MODE (op0);
+	    }
+
+  	  *total = (cost->mult_init[MODE_INDEX (mode)]
+		    + nbits * cost->mult_bit
+	            + rtx_cost (op0, outer_code, opno, speed)
+		    + rtx_cost (op1, outer_code, opno, speed));
+
+          return true;
+	}
+
+    case DIV:
+    case UDIV:
+    case MOD:
+    case UMOD:
+      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+	/* ??? SSE cost should be used here.  */
+	*total = cost->fdiv;
+      else if (X87_FLOAT_MODE_P (mode))
+	*total = cost->fdiv;
+      else if (FLOAT_MODE_P (mode))
+	/* ??? SSE vector cost should be used here.  */
+	*total = cost->fdiv;
+      else
+	*total = cost->divide[MODE_INDEX (mode)];
+      return false;
+
+    case PLUS:
+      if (GET_MODE_CLASS (mode) == MODE_INT
+	  && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
+	{
+	  if (GET_CODE (XEXP (x, 0)) == PLUS
+	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
+	      && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
+	      && CONSTANT_P (XEXP (x, 1)))
+	    {
+	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
+	      if (val == 2 || val == 4 || val == 8)
+		{
+		  *total = cost->lea;
+		  *total += rtx_cost (XEXP (XEXP (x, 0), 1),
+				      outer_code, opno, speed);
+		  *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
+				      outer_code, opno, speed);
+		  *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
+		  return true;
+		}
+	    }
+	  else if (GET_CODE (XEXP (x, 0)) == MULT
+		   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
+	    {
+	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
+	      if (val == 2 || val == 4 || val == 8)
+		{
+		  *total = cost->lea;
+		  *total += rtx_cost (XEXP (XEXP (x, 0), 0),
+				      outer_code, opno, speed);
+		  *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
+		  return true;
+		}
+	    }
+	  else if (GET_CODE (XEXP (x, 0)) == PLUS)
+	    {
+	      *total = cost->lea;
+	      *total += rtx_cost (XEXP (XEXP (x, 0), 0),
+				  outer_code, opno, speed);
+	      *total += rtx_cost (XEXP (XEXP (x, 0), 1),
+				  outer_code, opno, speed);
+	      *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
+	      return true;
+	    }
+	}
+      /* FALLTHRU */
+
+    case MINUS:
+      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+	{
+	  /* ??? SSE cost should be used here.  */
+	  *total = cost->fadd;
+	  return false;
+	}
+      else if (X87_FLOAT_MODE_P (mode))
+	{
+	  *total = cost->fadd;
+	  return false;
+	}
+      else if (FLOAT_MODE_P (mode))
+	{
+	  /* ??? SSE vector cost should be used here.  */
+	  *total = cost->fadd;
+	  return false;
+	}
+      /* FALLTHRU */
+
+    case AND:
+    case IOR:
+    case XOR:
+      if (GET_MODE_CLASS (mode) == MODE_INT
+	  && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+	{
+	  *total = (cost->add * 2
+		    + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
+		       << (GET_MODE (XEXP (x, 0)) != DImode))
+		    + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
+	               << (GET_MODE (XEXP (x, 1)) != DImode)));
+	  return true;
+	}
+      /* FALLTHRU */
+
+    case NEG:
+      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+	{
+	  /* ??? SSE cost should be used here.  */
+	  *total = cost->fchs;
+	  return false;
+	}
+      else if (X87_FLOAT_MODE_P (mode))
+	{
+	  *total = cost->fchs;
+	  return false;
+	}
+      else if (FLOAT_MODE_P (mode))
+	{
+	  /* ??? SSE vector cost should be used here.  */
+	  *total = cost->fchs;
+	  return false;
+	}
+      /* FALLTHRU */
+
+    case NOT:
+      if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+	{
+	  /* ??? Should be SSE vector operation cost.  */
+	  /* At least for published AMD latencies, this really is the same
+	     as the latency for a simple fpu operation like fabs.  */
+	  *total = cost->fabs;
+	}
+      else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+	*total = cost->add * 2;
+      else
+	*total = cost->add;
+      return false;
+
+    case COMPARE:
+      if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
+	  && XEXP (XEXP (x, 0), 1) == const1_rtx
+	  && CONST_INT_P (XEXP (XEXP (x, 0), 2))
+	  && XEXP (x, 1) == const0_rtx)
+	{
+	  /* This kind of construct is implemented using test[bwl].
+	     Treat it as if we had an AND.  */
+	  *total = (cost->add
+		    + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
+		    + rtx_cost (const1_rtx, outer_code, opno, speed));
+	  return true;
+	}
+      return false;
+
+    case FLOAT_EXTEND:
+      if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
+	*total = 0;
+      return false;
+
+    case ABS:
+      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+	/* ??? SSE cost should be used here.  */
+	*total = cost->fabs;
+      else if (X87_FLOAT_MODE_P (mode))
+	*total = cost->fabs;
+      else if (FLOAT_MODE_P (mode))
+	/* ??? SSE vector cost should be used here.  */
+	*total = cost->fabs;
+      return false;
+
+    case SQRT:
+      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+	/* ??? SSE cost should be used here.  */
+	*total = cost->fsqrt;
+      else if (X87_FLOAT_MODE_P (mode))
+	*total = cost->fsqrt;
+      else if (FLOAT_MODE_P (mode))
+	/* ??? SSE vector cost should be used here.  */
+	*total = cost->fsqrt;
+      return false;
+
+    case UNSPEC:
+      if (XINT (x, 1) == UNSPEC_TP)
+	*total = 0;
+      return false;
+
+    case VEC_SELECT:
+    case VEC_CONCAT:
+    case VEC_DUPLICATE:
+      /* ??? Assume all of these vector manipulation patterns are
+	 recognizable.  In which case they all pretty much have the
+	 same cost.  */
+     *total = cost->fabs;
+     return true;
+    case VEC_MERGE:
+      mask = XEXP (x, 2);
+      /* This is masked instruction, assume the same cost,
+	 as nonmasked variant.  */
+      if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
+	*total = rtx_cost (XEXP (x, 0), outer_code, opno, speed);
+      else
+	*total = cost->fabs;
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+#if TARGET_MACHO
+
+static int current_machopic_label_num;
+
+/* Given a symbol name and its associated stub, write out the
+   definition of the stub.  */
+
+void
+machopic_output_stub (FILE *file, const char *symb, const char *stub)
+{
+  unsigned int length;
+  char *binder_name, *symbol_name, lazy_ptr_name[32];
+  int label = ++current_machopic_label_num;
+
+  /* For 64-bit we shouldn't get here.  */
+  gcc_assert (!TARGET_64BIT);
+
+  /* Lose our funky encoding stuff so it doesn't contaminate the stub.  */
+  symb = targetm.strip_name_encoding (symb);
+
+  length = strlen (stub);
+  binder_name = XALLOCAVEC (char, length + 32);
+  GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
+
+  length = strlen (symb);
+  symbol_name = XALLOCAVEC (char, length + 32);
+  GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
+
+  sprintf (lazy_ptr_name, "L%d$lz", label);
+
+  if (MACHOPIC_ATT_STUB)
+    switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
+  else if (MACHOPIC_PURE)
+    switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
+  else
+    switch_to_section (darwin_sections[machopic_symbol_stub_section]);
+
+  fprintf (file, "%s:\n", stub);
+  fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
+
+  if (MACHOPIC_ATT_STUB)
+    {
+      fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
+    }
+  else if (MACHOPIC_PURE)
+    {
+      /* PIC stub.  */
+      /* 25-byte PIC stub using "CALL get_pc_thunk".  */
+      rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
+      output_set_got (tmp, NULL_RTX);	/* "CALL ___<cpu>.get_pc_thunk.cx".  */
+      fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
+	       label, lazy_ptr_name, label);
+      fprintf (file, "\tjmp\t*%%ecx\n");
+    }
+  else
+    fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
+
+  /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
+     it needs no stub-binding-helper.  */
+  if (MACHOPIC_ATT_STUB)
+    return;
+
+  fprintf (file, "%s:\n", binder_name);
+
+  if (MACHOPIC_PURE)
+    {
+      fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
+      fprintf (file, "\tpushl\t%%ecx\n");
+    }
+  else
+    fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
+
+  fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
+
+  /* N.B. Keep the correspondence of these
+     'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
+     old-pic/new-pic/non-pic stubs; altering this will break
+     compatibility with existing dylibs.  */
+  if (MACHOPIC_PURE)
+    {
+      /* 25-byte PIC stub using "CALL get_pc_thunk".  */
+      switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
+    }
+  else
+    /* 16-byte -mdynamic-no-pic stub.  */
+    switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
+
+  fprintf (file, "%s:\n", lazy_ptr_name);
+  fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
+  fprintf (file, ASM_LONG "%s\n", binder_name);
+}
+#endif /* TARGET_MACHO */
+
+/* Order the registers for register allocator.  */
+
+void
+x86_order_regs_for_local_alloc (void)
+{
+   int pos = 0;
+   int i;
+
+   /* First allocate the local general purpose registers.  */
+   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+     if (GENERAL_REGNO_P (i) && call_used_regs[i])
+	reg_alloc_order [pos++] = i;
+
+   /* Global general purpose registers.  */
+   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+     if (GENERAL_REGNO_P (i) && !call_used_regs[i])
+	reg_alloc_order [pos++] = i;
+
+   /* x87 registers come first in case we are doing FP math
+      using them.  */
+   if (!TARGET_SSE_MATH)
+     for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
+       reg_alloc_order [pos++] = i;
+
+   /* SSE registers.  */
+   for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
+     reg_alloc_order [pos++] = i;
+   for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
+     reg_alloc_order [pos++] = i;
+
+   /* Extended REX SSE registers.  */
+   for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
+     reg_alloc_order [pos++] = i;
+
+   /* Mask register.  */
+   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
+     reg_alloc_order [pos++] = i;
+
+   /* x87 registers.  */
+   if (TARGET_SSE_MATH)
+     for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
+       reg_alloc_order [pos++] = i;
+
+   for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
+     reg_alloc_order [pos++] = i;
+
+   /* Initialize the rest of array as we do not allocate some registers
+      at all.  */
+   while (pos < FIRST_PSEUDO_REGISTER)
+     reg_alloc_order [pos++] = 0;
+}
+
+/* Handle a "callee_pop_aggregate_return" attribute; arguments as
+   in struct attribute_spec handler.  */
+static tree
+ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
+					      tree args,
+					      int flags ATTRIBUTE_UNUSED,
+					      bool *no_add_attrs)
+{
+  if (TREE_CODE (*node) != FUNCTION_TYPE
+      && TREE_CODE (*node) != METHOD_TYPE
+      && TREE_CODE (*node) != FIELD_DECL
+      && TREE_CODE (*node) != TYPE_DECL)
+    {
+      warning (OPT_Wattributes, "%qE attribute only applies to functions",
+	       name);
+      *no_add_attrs = true;
+      return NULL_TREE;
+    }
+  if (TARGET_64BIT)
+    {
+      warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
+	       name);
+      *no_add_attrs = true;
+      return NULL_TREE;
+    }
+  if (is_attribute_p ("callee_pop_aggregate_return", name))
+    {
+      tree cst;
+
+      cst = TREE_VALUE (args);
+      if (TREE_CODE (cst) != INTEGER_CST)
+	{
+	  warning (OPT_Wattributes,
+		   "%qE attribute requires an integer constant argument",
+		   name);
+	  *no_add_attrs = true;
+	}
+      else if (compare_tree_int (cst, 0) != 0
+	       && compare_tree_int (cst, 1) != 0)
+	{
+	  warning (OPT_Wattributes,
+		   "argument to %qE attribute is neither zero, nor one",
+		   name);
+	  *no_add_attrs = true;
+	}
+
+      return NULL_TREE;
+    }
+
+  return NULL_TREE;
+}
+
+/* Handle a "ms_abi" or "sysv" attribute; arguments as in
+   struct attribute_spec.handler.  */
+static tree
+ix86_handle_abi_attribute (tree *node, tree name,
+			      tree args ATTRIBUTE_UNUSED,
+			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
+{
+  if (TREE_CODE (*node) != FUNCTION_TYPE
+      && TREE_CODE (*node) != METHOD_TYPE
+      && TREE_CODE (*node) != FIELD_DECL
+      && TREE_CODE (*node) != TYPE_DECL)
+    {
+      warning (OPT_Wattributes, "%qE attribute only applies to functions",
+	       name);
+      *no_add_attrs = true;
+      return NULL_TREE;
+    }
+
+  /* Can combine regparm with all attributes but fastcall.  */
+  if (is_attribute_p ("ms_abi", name))
+    {
+      if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
+        {
+	  error ("ms_abi and sysv_abi attributes are not compatible");
+	}
+
+      return NULL_TREE;
+    }
+  else if (is_attribute_p ("sysv_abi", name))
+    {
+      if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
+        {
+	  error ("ms_abi and sysv_abi attributes are not compatible");
+	}
+
+      return NULL_TREE;
+    }
+
+  return NULL_TREE;
+}
+
+/* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
+   struct attribute_spec.handler.  */
+static tree
+ix86_handle_struct_attribute (tree *node, tree name,
+			      tree args ATTRIBUTE_UNUSED,
+			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
+{
+  tree *type = NULL;
+  if (DECL_P (*node))
+    {
+      if (TREE_CODE (*node) == TYPE_DECL)
+	type = &TREE_TYPE (*node);
+    }
+  else
+    type = node;
+
+  if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
+    {
+      warning (OPT_Wattributes, "%qE attribute ignored",
+	       name);
+      *no_add_attrs = true;
+    }
+
+  else if ((is_attribute_p ("ms_struct", name)
+	    && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
+	   || ((is_attribute_p ("gcc_struct", name)
+		&& lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
+    {
+      warning (OPT_Wattributes, "%qE incompatible attribute ignored",
+               name);
+      *no_add_attrs = true;
+    }
+
+  return NULL_TREE;
+}
+
+static tree
+ix86_handle_fndecl_attribute (tree *node, tree name,
+                              tree args ATTRIBUTE_UNUSED,
+                              int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
+{
+  if (TREE_CODE (*node) != FUNCTION_DECL)
+    {
+      warning (OPT_Wattributes, "%qE attribute only applies to functions",
+               name);
+      *no_add_attrs = true;
+    }
+  return NULL_TREE;
+}
+
+static bool
+ix86_ms_bitfield_layout_p (const_tree record_type)
+{
+  return ((TARGET_MS_BITFIELD_LAYOUT
+	   && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
+          || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
+}
+
+/* Returns an expression indicating where the this parameter is
+   located on entry to the FUNCTION.  */
+
+static rtx
+x86_this_parameter (tree function)
+{
+  tree type = TREE_TYPE (function);
+  bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
+  int nregs;
+
+  if (TARGET_64BIT)
+    {
+      const int *parm_regs;
+
+      if (ix86_function_type_abi (type) == MS_ABI)
+        parm_regs = x86_64_ms_abi_int_parameter_registers;
+      else
+        parm_regs = x86_64_int_parameter_registers;
+      return gen_rtx_REG (Pmode, parm_regs[aggr]);
+    }
+
+  nregs = ix86_function_regparm (type, function);
+
+  if (nregs > 0 && !stdarg_p (type))
+    {
+      int regno;
+      unsigned int ccvt = ix86_get_callcvt (type);
+
+      if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
+	regno = aggr ? DX_REG : CX_REG;
+      else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
+        {
+	  regno = CX_REG;
+	  if (aggr)
+	    return gen_rtx_MEM (SImode,
+				plus_constant (Pmode, stack_pointer_rtx, 4));
+	}
+      else
+        {
+	  regno = AX_REG;
+	  if (aggr)
+	    {
+	      regno = DX_REG;
+	      if (nregs == 1)
+		return gen_rtx_MEM (SImode,
+				    plus_constant (Pmode,
+						   stack_pointer_rtx, 4));
+	    }
+	}
+      return gen_rtx_REG (SImode, regno);
+    }
+
+  return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
+					     aggr ? 8 : 4));
+}
+
+/* Determine whether x86_output_mi_thunk can succeed.  */
+
+static bool
+x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
+			 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
+			 HOST_WIDE_INT vcall_offset, const_tree function)
+{
+  /* 64-bit can handle anything.  */
+  if (TARGET_64BIT)
+    return true;
+
+  /* For 32-bit, everything's fine if we have one free register.  */
+  if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
+    return true;
+
+  /* Need a free register for vcall_offset.  */
+  if (vcall_offset)
+    return false;
+
+  /* Need a free register for GOT references.  */
+  if (flag_pic && !targetm.binds_local_p (function))
+    return false;
+
+  /* Otherwise ok.  */
+  return true;
+}
+
+/* Output the assembler code for a thunk function.  THUNK_DECL is the
+   declaration for the thunk function itself, FUNCTION is the decl for
+   the target function.  DELTA is an immediate constant offset to be
+   added to THIS.  If VCALL_OFFSET is nonzero, the word at
+   *(*this + vcall_offset) should be added to THIS.  */
+
+static void
+x86_output_mi_thunk (FILE *file,
+		     tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
+		     HOST_WIDE_INT vcall_offset, tree function)
+{
+  rtx this_param = x86_this_parameter (function);
+  rtx this_reg, tmp, fnaddr;
+  unsigned int tmp_regno;
+
+  if (TARGET_64BIT)
+    tmp_regno = R10_REG;
+  else
+    {
+      unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
+      if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
+	tmp_regno = AX_REG;
+      else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
+	tmp_regno = DX_REG;
+      else
+	tmp_regno = CX_REG;
+    }
+
+  emit_note (NOTE_INSN_PROLOGUE_END);
+
+  /* If VCALL_OFFSET, we'll need THIS in a register.  Might as well
+     pull it in now and let DELTA benefit.  */
+  if (REG_P (this_param))
+    this_reg = this_param;
+  else if (vcall_offset)
+    {
+      /* Put the this parameter into %eax.  */
+      this_reg = gen_rtx_REG (Pmode, AX_REG);
+      emit_move_insn (this_reg, this_param);
+    }
+  else
+    this_reg = NULL_RTX;
+
+  /* Adjust the this parameter by a fixed constant.  */
+  if (delta)
+    {
+      rtx delta_rtx = GEN_INT (delta);
+      rtx delta_dst = this_reg ? this_reg : this_param;
+
+      if (TARGET_64BIT)
+	{
+	  if (!x86_64_general_operand (delta_rtx, Pmode))
+	    {
+	      tmp = gen_rtx_REG (Pmode, tmp_regno);
+	      emit_move_insn (tmp, delta_rtx);
+	      delta_rtx = tmp;
+	    }
+	}
+
+      ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
+    }
+
+  /* Adjust the this parameter by a value stored in the vtable.  */
+  if (vcall_offset)
+    {
+      rtx vcall_addr, vcall_mem, this_mem;
+
+      tmp = gen_rtx_REG (Pmode, tmp_regno);
+
+      this_mem = gen_rtx_MEM (ptr_mode, this_reg);
+      if (Pmode != ptr_mode)
+	this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
+      emit_move_insn (tmp, this_mem);
+
+      /* Adjust the this parameter.  */
+      vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
+      if (TARGET_64BIT
+	  && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
+	{
+	  rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
+	  emit_move_insn (tmp2, GEN_INT (vcall_offset));
+	  vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
+	}
+
+      vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
+      if (Pmode != ptr_mode)
+	emit_insn (gen_addsi_1_zext (this_reg,
+				     gen_rtx_REG (ptr_mode,
+						  REGNO (this_reg)),
+				     vcall_mem));
+      else
+	ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
+    }
+
+  /* If necessary, drop THIS back to its stack slot.  */
+  if (this_reg && this_reg != this_param)
+    emit_move_insn (this_param, this_reg);
+
+  fnaddr = XEXP (DECL_RTL (function), 0);
+  if (TARGET_64BIT)
+    {
+      if (!flag_pic || targetm.binds_local_p (function)
+	  || TARGET_PECOFF)
+	;
+      else
+	{
+	  tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
+	  tmp = gen_rtx_CONST (Pmode, tmp);
+	  fnaddr = gen_const_mem (Pmode, tmp);
+	}
+    }
+  else
+    {
+      if (!flag_pic || targetm.binds_local_p (function))
+	;
+#if TARGET_MACHO
+      else if (TARGET_MACHO)
+	{
+	  fnaddr = machopic_indirect_call_target (DECL_RTL (function));
+	  fnaddr = XEXP (fnaddr, 0);
+	}
+#endif /* TARGET_MACHO */
+      else
+	{
+	  tmp = gen_rtx_REG (Pmode, CX_REG);
+	  output_set_got (tmp, NULL_RTX);
+
+	  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
+	  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
+	  fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
+	  fnaddr = gen_const_mem (Pmode, fnaddr);
+	}
+    }
+
+  /* Our sibling call patterns do not allow memories, because we have no
+     predicate that can distinguish between frame and non-frame memory.
+     For our purposes here, we can get away with (ab)using a jump pattern,
+     because we're going to do no optimization.  */
+  if (MEM_P (fnaddr))
+    emit_jump_insn (gen_indirect_jump (fnaddr));
+  else
+    {
+      if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
+	fnaddr = legitimize_pic_address (fnaddr,
+					 gen_rtx_REG (Pmode, tmp_regno));
+
+      if (!sibcall_insn_operand (fnaddr, word_mode))
+	{
+	  tmp = gen_rtx_REG (word_mode, tmp_regno);
+	  if (GET_MODE (fnaddr) != word_mode)
+	    fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
+	  emit_move_insn (tmp, fnaddr);
+	  fnaddr = tmp;
+	}
+
+      tmp = gen_rtx_MEM (QImode, fnaddr);
+      tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
+      tmp = emit_call_insn (tmp);
+      SIBLING_CALL_P (tmp) = 1;
+    }
+  emit_barrier ();
+
+  /* Emit just enough of rest_of_compilation to get the insns emitted.
+     Note that use_thunk calls assemble_start_function et al.  */
+  tmp = get_insns ();
+  shorten_branches (tmp);
+  final_start_function (tmp, file, 1);
+  final (tmp, file, 1);
+  final_end_function ();
+}
+
+static void
+x86_file_start (void)
+{
+  default_file_start ();
+  if (TARGET_16BIT)
+    fputs ("\t.code16gcc\n", asm_out_file);
+#if TARGET_MACHO
+  darwin_file_start ();
+#endif
+  if (X86_FILE_START_VERSION_DIRECTIVE)
+    fputs ("\t.version\t\"01.01\"\n", asm_out_file);
+  if (X86_FILE_START_FLTUSED)
+    fputs ("\t.global\t__fltused\n", asm_out_file);
+  if (ix86_asm_dialect == ASM_INTEL)
+    fputs ("\t.intel_syntax noprefix\n", asm_out_file);
+}
+
+int
+x86_field_alignment (tree field, int computed)
+{
+  enum machine_mode mode;
+  tree type = TREE_TYPE (field);
+
+  if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
+    return computed;
+  mode = TYPE_MODE (strip_array_types (type));
+  if (mode == DFmode || mode == DCmode
+      || GET_MODE_CLASS (mode) == MODE_INT
+      || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
+    return MIN (32, computed);
+  return computed;
+}
+
+/* Output assembler code to FILE to increment profiler label # LABELNO
+   for profiling a function entry.  */
+void
+x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
+{
+  const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
+					 : MCOUNT_NAME);
+
+  if (TARGET_64BIT)
+    {
+#ifndef NO_PROFILE_COUNTERS
+      fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
+#endif
+
+      if (!TARGET_PECOFF && flag_pic)
+	fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
+      else
+	fprintf (file, "\tcall\t%s\n", mcount_name);
+    }
+  else if (flag_pic)
+    {
+#ifndef NO_PROFILE_COUNTERS
+      fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
+	       LPREFIX, labelno);
+#endif
+      fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
+    }
+  else
+    {
+#ifndef NO_PROFILE_COUNTERS
+      fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
+	       LPREFIX, labelno);
+#endif
+      fprintf (file, "\tcall\t%s\n", mcount_name);
+    }
+}
+
+/* We don't have exact information about the insn sizes, but we may assume
+   quite safely that we are informed about all 1 byte insns and memory
+   address sizes.  This is enough to eliminate unnecessary padding in
+   99% of cases.  */
+
+static int
+min_insn_size (rtx insn)
+{
+  int l = 0, len;
+
+  if (!INSN_P (insn) || !active_insn_p (insn))
+    return 0;
+
+  /* Discard alignments we've emit and jump instructions.  */
+  if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
+      && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
+    return 0;
+
+  /* Important case - calls are always 5 bytes.
+     It is common to have many calls in the row.  */
+  if (CALL_P (insn)
+      && symbolic_reference_mentioned_p (PATTERN (insn))
+      && !SIBLING_CALL_P (insn))
+    return 5;
+  len = get_attr_length (insn);
+  if (len <= 1)
+    return 1;
+
+  /* For normal instructions we rely on get_attr_length being exact,
+     with a few exceptions.  */
+  if (!JUMP_P (insn))
+    {
+      enum attr_type type = get_attr_type (insn);
+
+      switch (type)
+	{
+	case TYPE_MULTI:
+	  if (GET_CODE (PATTERN (insn)) == ASM_INPUT
+	      || asm_noperands (PATTERN (insn)) >= 0)
+	    return 0;
+	  break;
+	case TYPE_OTHER:
+	case TYPE_FCMP:
+	  break;
+	default:
+	  /* Otherwise trust get_attr_length.  */
+	  return len;
+	}
+
+      l = get_attr_length_address (insn);
+      if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
+	l = 4;
+    }
+  if (l)
+    return 1+l;
+  else
+    return 2;
+}
+
+#ifdef ASM_OUTPUT_MAX_SKIP_PAD
+
+/* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
+   window.  */
+
+static void
+ix86_avoid_jump_mispredicts (void)
+{
+  rtx insn, start = get_insns ();
+  int nbytes = 0, njumps = 0;
+  int isjump = 0;
+
+  /* Look for all minimal intervals of instructions containing 4 jumps.
+     The intervals are bounded by START and INSN.  NBYTES is the total
+     size of instructions in the interval including INSN and not including
+     START.  When the NBYTES is smaller than 16 bytes, it is possible
+     that the end of START and INSN ends up in the same 16byte page.
+
+     The smallest offset in the page INSN can start is the case where START
+     ends on the offset 0.  Offset of INSN is then NBYTES - sizeof (INSN).
+     We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
+
+     Don't consider asm goto as jump, while it can contain a jump, it doesn't
+     have to, control transfer to label(s) can be performed through other
+     means, and also we estimate minimum length of all asm stmts as 0.  */
+  for (insn = start; insn; insn = NEXT_INSN (insn))
+    {
+      int min_size;
+
+      if (LABEL_P (insn))
+	{
+	  int align = label_to_alignment (insn);
+	  int max_skip = label_to_max_skip (insn);
+
+	  if (max_skip > 15)
+	    max_skip = 15;
+	  /* If align > 3, only up to 16 - max_skip - 1 bytes can be
+	     already in the current 16 byte page, because otherwise
+	     ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
+	     bytes to reach 16 byte boundary.  */
+	  if (align <= 0
+	      || (align <= 3 && max_skip != (1 << align) - 1))
+	    max_skip = 0;
+	  if (dump_file)
+	    fprintf (dump_file, "Label %i with max_skip %i\n",
+		     INSN_UID (insn), max_skip);
+	  if (max_skip)
+	    {
+	      while (nbytes + max_skip >= 16)
+		{
+		  start = NEXT_INSN (start);
+		  if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
+		      || CALL_P (start))
+		    njumps--, isjump = 1;
+		  else
+		    isjump = 0;
+		  nbytes -= min_insn_size (start);
+		}
+	    }
+	  continue;
+	}
+
+      min_size = min_insn_size (insn);
+      nbytes += min_size;
+      if (dump_file)
+	fprintf (dump_file, "Insn %i estimated to %i bytes\n",
+		 INSN_UID (insn), min_size);
+      if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
+	  || CALL_P (insn))
+	njumps++;
+      else
+	continue;
+
+      while (njumps > 3)
+	{
+	  start = NEXT_INSN (start);
+	  if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
+	      || CALL_P (start))
+	    njumps--, isjump = 1;
+	  else
+	    isjump = 0;
+	  nbytes -= min_insn_size (start);
+	}
+      gcc_assert (njumps >= 0);
+      if (dump_file)
+        fprintf (dump_file, "Interval %i to %i has %i bytes\n",
+		 INSN_UID (start), INSN_UID (insn), nbytes);
+
+      if (njumps == 3 && isjump && nbytes < 16)
+	{
+	  int padsize = 15 - nbytes + min_insn_size (insn);
+
+	  if (dump_file)
+	    fprintf (dump_file, "Padding insn %i by %i bytes!\n",
+		     INSN_UID (insn), padsize);
+          emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
+	}
+    }
+}
+#endif
+
+/* AMD Athlon works faster
+   when RET is not destination of conditional jump or directly preceded
+   by other jump instruction.  We avoid the penalty by inserting NOP just
+   before the RET instructions in such cases.  */
+static void
+ix86_pad_returns (void)
+{
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+    {
+      basic_block bb = e->src;
+      rtx ret = BB_END (bb);
+      rtx prev;
+      bool replace = false;
+
+      if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
+	  || optimize_bb_for_size_p (bb))
+	continue;
+      for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
+	if (active_insn_p (prev) || LABEL_P (prev))
+	  break;
+      if (prev && LABEL_P (prev))
+	{
+	  edge e;
+	  edge_iterator ei;
+
+	  FOR_EACH_EDGE (e, ei, bb->preds)
+	    if (EDGE_FREQUENCY (e) && e->src->index >= 0
+		&& !(e->flags & EDGE_FALLTHRU))
+	      {
+		replace = true;
+		break;
+	      }
+	}
+      if (!replace)
+	{
+	  prev = prev_active_insn (ret);
+	  if (prev
+	      && ((JUMP_P (prev) && any_condjump_p (prev))
+		  || CALL_P (prev)))
+	    replace = true;
+	  /* Empty functions get branch mispredict even when
+	     the jump destination is not visible to us.  */
+	  if (!prev && !optimize_function_for_size_p (cfun))
+	    replace = true;
+	}
+      if (replace)
+	{
+	  emit_jump_insn_before (gen_simple_return_internal_long (), ret);
+	  delete_insn (ret);
+	}
+    }
+}
+
+/* Count the minimum number of instructions in BB.  Return 4 if the
+   number of instructions >= 4.  */
+
+static int
+ix86_count_insn_bb (basic_block bb)
+{
+  rtx insn;
+  int insn_count = 0;
+
+  /* Count number of instructions in this block.  Return 4 if the number
+     of instructions >= 4.  */
+  FOR_BB_INSNS (bb, insn)
+    {
+      /* Only happen in exit blocks.  */
+      if (JUMP_P (insn)
+	  && ANY_RETURN_P (PATTERN (insn)))
+	break;
+
+      if (NONDEBUG_INSN_P (insn)
+	  && GET_CODE (PATTERN (insn)) != USE
+	  && GET_CODE (PATTERN (insn)) != CLOBBER)
+	{
+	  insn_count++;
+	  if (insn_count >= 4)
+	    return insn_count;
+	}
+    }
+
+  return insn_count;
+}
+
+
+/* Count the minimum number of instructions in code path in BB.
+   Return 4 if the number of instructions >= 4.  */
+
+static int
+ix86_count_insn (basic_block bb)
+{
+  edge e;
+  edge_iterator ei;
+  int min_prev_count;
+
+  /* Only bother counting instructions along paths with no
+     more than 2 basic blocks between entry and exit.  Given
+     that BB has an edge to exit, determine if a predecessor
+     of BB has an edge from entry.  If so, compute the number
+     of instructions in the predecessor block.  If there
+     happen to be multiple such blocks, compute the minimum.  */
+  min_prev_count = 4;
+  FOR_EACH_EDGE (e, ei, bb->preds)
+    {
+      edge prev_e;
+      edge_iterator prev_ei;
+
+      if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
+	{
+	  min_prev_count = 0;
+	  break;
+	}
+      FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
+	{
+	  if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
+	    {
+	      int count = ix86_count_insn_bb (e->src);
+	      if (count < min_prev_count)
+		min_prev_count = count;
+	      break;
+	    }
+	}
+    }
+
+  if (min_prev_count < 4)
+    min_prev_count += ix86_count_insn_bb (bb);
+
+  return min_prev_count;
+}
+
+/* Pad short function to 4 instructions.   */
+
+static void
+ix86_pad_short_function (void)
+{
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+    {
+      rtx ret = BB_END (e->src);
+      if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
+	{
+	  int insn_count = ix86_count_insn (e->src);
+
+	  /* Pad short function.  */
+	  if (insn_count < 4)
+	    {
+	      rtx insn = ret;
+
+	      /* Find epilogue.  */
+	      while (insn
+		     && (!NOTE_P (insn)
+			 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
+		insn = PREV_INSN (insn);
+
+	      if (!insn)
+		insn = ret;
+
+	      /* Two NOPs count as one instruction.  */
+	      insn_count = 2 * (4 - insn_count);
+	      emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
+	    }
+	}
+    }
+}
+
+/* Fix up a Windows system unwinder issue.  If an EH region falls through into
+   the epilogue, the Windows system unwinder will apply epilogue logic and
+   produce incorrect offsets.  This can be avoided by adding a nop between
+   the last insn that can throw and the first insn of the epilogue.  */
+
+static void
+ix86_seh_fixup_eh_fallthru (void)
+{
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+    {
+      rtx insn, next;
+
+      /* Find the beginning of the epilogue.  */
+      for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
+	if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
+	  break;
+      if (insn == NULL)
+	continue;
+
+      /* We only care about preceding insns that can throw.  */
+      insn = prev_active_insn (insn);
+      if (insn == NULL || !can_throw_internal (insn))
+	continue;
+
+      /* Do not separate calls from their debug information.  */
+      for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
+	if (NOTE_P (next)
+            && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
+                || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
+	  insn = next;
+	else
+	  break;
+
+      emit_insn_after (gen_nops (const1_rtx), insn);
+    }
+}
+
+/* Implement machine specific optimizations.  We implement padding of returns
+   for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
+static void
+ix86_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  if (TARGET_SEH && current_function_has_exception_handlers ())
+    ix86_seh_fixup_eh_fallthru ();
+
+  if (optimize && optimize_function_for_speed_p (cfun))
+    {
+      if (TARGET_PAD_SHORT_FUNCTION)
+	ix86_pad_short_function ();
+      else if (TARGET_PAD_RETURNS)
+	ix86_pad_returns ();
+#ifdef ASM_OUTPUT_MAX_SKIP_PAD
+      if (TARGET_FOUR_JUMP_LIMIT)
+	ix86_avoid_jump_mispredicts ();
+#endif
+    }
+}
+
+/* Return nonzero when QImode register that must be represented via REX prefix
+   is used.  */
+bool
+x86_extended_QIreg_mentioned_p (rtx insn)
+{
+  int i;
+  extract_insn_cached (insn);
+  for (i = 0; i < recog_data.n_operands; i++)
+    if (GENERAL_REG_P (recog_data.operand[i])
+	&& !QI_REGNO_P (REGNO (recog_data.operand[i])))
+       return true;
+  return false;
+}
+
+/* Return nonzero when P points to register encoded via REX prefix.
+   Called via for_each_rtx.  */
+static int
+extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
+{
+   unsigned int regno;
+   if (!REG_P (*p))
+     return 0;
+   regno = REGNO (*p);
+   return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
+}
+
+/* Return true when INSN mentions register that must be encoded using REX
+   prefix.  */
+bool
+x86_extended_reg_mentioned_p (rtx insn)
+{
+  return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
+		       extended_reg_mentioned_1, NULL);
+}
+
+/* If profitable, negate (without causing overflow) integer constant
+   of mode MODE at location LOC.  Return true in this case.  */
+bool
+x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
+{
+  HOST_WIDE_INT val;
+
+  if (!CONST_INT_P (*loc))
+    return false;
+
+  switch (mode)
+    {
+    case DImode:
+      /* DImode x86_64 constants must fit in 32 bits.  */
+      gcc_assert (x86_64_immediate_operand (*loc, mode));
+
+      mode = SImode;
+      break;
+
+    case SImode:
+    case HImode:
+    case QImode:
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Avoid overflows.  */
+  if (mode_signbit_p (mode, *loc))
+    return false;
+
+  val = INTVAL (*loc);
+
+  /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
+     Exceptions: -128 encodes smaller than 128, so swap sign and op.  */
+  if ((val < 0 && val != -128)
+      || val == 128)
+    {
+      *loc = GEN_INT (-val);
+      return true;
+    }
+
+  return false;
+}
+
+/* Generate an unsigned DImode/SImode to FP conversion.  This is the same code
+   optabs would emit if we didn't have TFmode patterns.  */
+
+void
+x86_emit_floatuns (rtx operands[2])
+{
+  rtx neglab, donelab, i0, i1, f0, in, out;
+  enum machine_mode mode, inmode;
+
+  inmode = GET_MODE (operands[1]);
+  gcc_assert (inmode == SImode || inmode == DImode);
+
+  out = operands[0];
+  in = force_reg (inmode, operands[1]);
+  mode = GET_MODE (out);
+  neglab = gen_label_rtx ();
+  donelab = gen_label_rtx ();
+  f0 = gen_reg_rtx (mode);
+
+  emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
+
+  expand_float (out, in, 0);
+
+  emit_jump_insn (gen_jump (donelab));
+  emit_barrier ();
+
+  emit_label (neglab);
+
+  i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
+			    1, OPTAB_DIRECT);
+  i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
+			    1, OPTAB_DIRECT);
+  i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
+
+  expand_float (f0, i0, 0);
+
+  emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
+
+  emit_label (donelab);
+}
+
+/* AVX512F does support 64-byte integer vector operations,
+   thus the longest vector we are faced with is V64QImode.  */
+#define MAX_VECT_LEN	64
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_operand_p;
+  bool testing_p;
+};
+
+static bool canonicalize_perm (struct expand_vec_perm_d *d);
+static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
+static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
+
+/* Get a vector mode of the same size as the original but with elements
+   twice as wide.  This is only guaranteed to apply to integral vectors.  */
+
+static inline enum machine_mode
+get_mode_wider_vector (enum machine_mode o)
+{
+  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
+  enum machine_mode n = GET_MODE_WIDER_MODE (o);
+  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
+  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
+  return n;
+}
+
+/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
+   fill target with val via vec_duplicate.  */
+
+static bool
+ix86_vector_duplicate_value (enum machine_mode mode, rtx target, rtx val)
+{
+  bool ok;
+  rtx insn, dup;
+
+  /* First attempt to recognize VAL as-is.  */
+  dup = gen_rtx_VEC_DUPLICATE (mode, val);
+  insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
+  if (recog_memoized (insn) < 0)
+    {
+      rtx seq;
+      /* If that fails, force VAL into a register.  */
+
+      start_sequence ();
+      XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
+      seq = get_insns ();
+      end_sequence ();
+      if (seq)
+	emit_insn_before (seq, insn);
+
+      ok = recog_memoized (insn) >= 0;
+      gcc_assert (ok);
+    }
+  return true;
+}
+
+/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
+   with all elements equal to VAR.  Return true if successful.  */
+
+static bool
+ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
+				   rtx target, rtx val)
+{
+  bool ok;
+
+  switch (mode)
+    {
+    case V2SImode:
+    case V2SFmode:
+      if (!mmx_ok)
+	return false;
+      /* FALLTHRU */
+
+    case V4DFmode:
+    case V4DImode:
+    case V8SFmode:
+    case V8SImode:
+    case V2DFmode:
+    case V2DImode:
+    case V4SFmode:
+    case V4SImode:
+    case V16SImode:
+    case V8DImode:
+    case V16SFmode:
+    case V8DFmode:
+      return ix86_vector_duplicate_value (mode, target, val);
+
+    case V4HImode:
+      if (!mmx_ok)
+	return false;
+      if (TARGET_SSE || TARGET_3DNOW_A)
+	{
+	  rtx x;
+
+	  val = gen_lowpart (SImode, val);
+	  x = gen_rtx_TRUNCATE (HImode, val);
+	  x = gen_rtx_VEC_DUPLICATE (mode, x);
+	  emit_insn (gen_rtx_SET (VOIDmode, target, x));
+	  return true;
+	}
+      goto widen;
+
+    case V8QImode:
+      if (!mmx_ok)
+	return false;
+      goto widen;
+
+    case V8HImode:
+      if (TARGET_SSE2)
+	{
+	  struct expand_vec_perm_d dperm;
+	  rtx tmp1, tmp2;
+
+	permute:
+	  memset (&dperm, 0, sizeof (dperm));
+	  dperm.target = target;
+	  dperm.vmode = mode;
+	  dperm.nelt = GET_MODE_NUNITS (mode);
+	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
+	  dperm.one_operand_p = true;
+
+	  /* Extend to SImode using a paradoxical SUBREG.  */
+	  tmp1 = gen_reg_rtx (SImode);
+	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
+
+	  /* Insert the SImode value as low element of a V4SImode vector. */
+	  tmp2 = gen_reg_rtx (V4SImode);
+	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
+	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
+
+	  ok = (expand_vec_perm_1 (&dperm)
+		|| expand_vec_perm_broadcast_1 (&dperm));
+	  gcc_assert (ok);
+	  return ok;
+	}
+      goto widen;
+
+    case V16QImode:
+      if (TARGET_SSE2)
+	goto permute;
+      goto widen;
+
+    widen:
+      /* Replicate the value once into the next wider mode and recurse.  */
+      {
+	enum machine_mode smode, wsmode, wvmode;
+	rtx x;
+
+	smode = GET_MODE_INNER (mode);
+	wvmode = get_mode_wider_vector (mode);
+	wsmode = GET_MODE_INNER (wvmode);
+
+	val = convert_modes (wsmode, smode, val, true);
+	x = expand_simple_binop (wsmode, ASHIFT, val,
+				 GEN_INT (GET_MODE_BITSIZE (smode)),
+				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
+	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
+
+	x = gen_reg_rtx (wvmode);
+	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
+	gcc_assert (ok);
+	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
+	return ok;
+      }
+
+    case V16HImode:
+    case V32QImode:
+      {
+	enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
+	rtx x = gen_reg_rtx (hvmode);
+
+	ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
+	gcc_assert (ok);
+
+	x = gen_rtx_VEC_CONCAT (mode, x, x);
+	emit_insn (gen_rtx_SET (VOIDmode, target, x));
+      }
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
+   whose ONE_VAR element is VAR, and other elements are zero.  Return true
+   if successful.  */
+
+static bool
+ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
+				     rtx target, rtx var, int one_var)
+{
+  enum machine_mode vsimode;
+  rtx new_target;
+  rtx x, tmp;
+  bool use_vector_set = false;
+
+  switch (mode)
+    {
+    case V2DImode:
+      /* For SSE4.1, we normally use vector set.  But if the second
+	 element is zero and inter-unit moves are OK, we use movq
+	 instead.  */
+      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
+			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
+			     && one_var == 0));
+      break;
+    case V16QImode:
+    case V4SImode:
+    case V4SFmode:
+      use_vector_set = TARGET_SSE4_1;
+      break;
+    case V8HImode:
+      use_vector_set = TARGET_SSE2;
+      break;
+    case V4HImode:
+      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
+      break;
+    case V32QImode:
+    case V16HImode:
+    case V8SImode:
+    case V8SFmode:
+    case V4DFmode:
+      use_vector_set = TARGET_AVX;
+      break;
+    case V4DImode:
+      /* Use ix86_expand_vector_set in 64bit mode only.  */
+      use_vector_set = TARGET_AVX && TARGET_64BIT;
+      break;
+    default:
+      break;
+    }
+
+  if (use_vector_set)
+    {
+      emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
+      var = force_reg (GET_MODE_INNER (mode), var);
+      ix86_expand_vector_set (mmx_ok, target, var, one_var);
+      return true;
+    }
+
+  switch (mode)
+    {
+    case V2SFmode:
+    case V2SImode:
+      if (!mmx_ok)
+	return false;
+      /* FALLTHRU */
+
+    case V2DFmode:
+    case V2DImode:
+      if (one_var != 0)
+	return false;
+      var = force_reg (GET_MODE_INNER (mode), var);
+      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
+      emit_insn (gen_rtx_SET (VOIDmode, target, x));
+      return true;
+
+    case V4SFmode:
+    case V4SImode:
+      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+	new_target = gen_reg_rtx (mode);
+      else
+	new_target = target;
+      var = force_reg (GET_MODE_INNER (mode), var);
+      x = gen_rtx_VEC_DUPLICATE (mode, var);
+      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
+      emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
+      if (one_var != 0)
+	{
+	  /* We need to shuffle the value to the correct position, so
+	     create a new pseudo to store the intermediate result.  */
+
+	  /* With SSE2, we can use the integer shuffle insns.  */
+	  if (mode != V4SFmode && TARGET_SSE2)
+	    {
+	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
+					    const1_rtx,
+					    GEN_INT (one_var == 1 ? 0 : 1),
+					    GEN_INT (one_var == 2 ? 0 : 1),
+					    GEN_INT (one_var == 3 ? 0 : 1)));
+	      if (target != new_target)
+		emit_move_insn (target, new_target);
+	      return true;
+	    }
+
+	  /* Otherwise convert the intermediate result to V4SFmode and
+	     use the SSE1 shuffle instructions.  */
+	  if (mode != V4SFmode)
+	    {
+	      tmp = gen_reg_rtx (V4SFmode);
+	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
+	    }
+	  else
+	    tmp = new_target;
+
+	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
+				       const1_rtx,
+				       GEN_INT (one_var == 1 ? 0 : 1),
+				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
+				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
+
+	  if (mode != V4SFmode)
+	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
+	  else if (tmp != target)
+	    emit_move_insn (target, tmp);
+	}
+      else if (target != new_target)
+	emit_move_insn (target, new_target);
+      return true;
+
+    case V8HImode:
+    case V16QImode:
+      vsimode = V4SImode;
+      goto widen;
+    case V4HImode:
+    case V8QImode:
+      if (!mmx_ok)
+	return false;
+      vsimode = V2SImode;
+      goto widen;
+    widen:
+      if (one_var != 0)
+	return false;
+
+      /* Zero extend the variable element to SImode and recurse.  */
+      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
+
+      x = gen_reg_rtx (vsimode);
+      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
+						var, one_var))
+	gcc_unreachable ();
+
+      emit_move_insn (target, gen_lowpart (mode, x));
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
+   consisting of the values in VALS.  It is known that all elements
+   except ONE_VAR are constants.  Return true if successful.  */
+
+static bool
+ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
+				 rtx target, rtx vals, int one_var)
+{
+  rtx var = XVECEXP (vals, 0, one_var);
+  enum machine_mode wmode;
+  rtx const_vec, x;
+
+  const_vec = copy_rtx (vals);
+  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
+  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
+
+  switch (mode)
+    {
+    case V2DFmode:
+    case V2DImode:
+    case V2SFmode:
+    case V2SImode:
+      /* For the two element vectors, it's just as easy to use
+	 the general case.  */
+      return false;
+
+    case V4DImode:
+      /* Use ix86_expand_vector_set in 64bit mode only.  */
+      if (!TARGET_64BIT)
+	return false;
+    case V4DFmode:
+    case V8SFmode:
+    case V8SImode:
+    case V16HImode:
+    case V32QImode:
+    case V4SFmode:
+    case V4SImode:
+    case V8HImode:
+    case V4HImode:
+      break;
+
+    case V16QImode:
+      if (TARGET_SSE4_1)
+	break;
+      wmode = V8HImode;
+      goto widen;
+    case V8QImode:
+      wmode = V4HImode;
+      goto widen;
+    widen:
+      /* There's no way to set one QImode entry easily.  Combine
+	 the variable value with its adjacent constant value, and
+	 promote to an HImode set.  */
+      x = XVECEXP (vals, 0, one_var ^ 1);
+      if (one_var & 1)
+	{
+	  var = convert_modes (HImode, QImode, var, true);
+	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
+				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
+	  x = GEN_INT (INTVAL (x) & 0xff);
+	}
+      else
+	{
+	  var = convert_modes (HImode, QImode, var, true);
+	  x = gen_int_mode (INTVAL (x) << 8, HImode);
+	}
+      if (x != const0_rtx)
+	var = expand_simple_binop (HImode, IOR, var, x, var,
+				   1, OPTAB_LIB_WIDEN);
+
+      x = gen_reg_rtx (wmode);
+      emit_move_insn (x, gen_lowpart (wmode, const_vec));
+      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
+
+      emit_move_insn (target, gen_lowpart (mode, x));
+      return true;
+
+    default:
+      return false;
+    }
+
+  emit_move_insn (target, const_vec);
+  ix86_expand_vector_set (mmx_ok, target, var, one_var);
+  return true;
+}
+
+/* A subroutine of ix86_expand_vector_init_general.  Use vector
+   concatenate to handle the most general case: all values variable,
+   and none identical.  */
+
+static void
+ix86_expand_vector_init_concat (enum machine_mode mode,
+				rtx target, rtx *ops, int n)
+{
+  enum machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
+  rtx first[16], second[8], third[4];
+  rtvec v;
+  int i, j;
+
+  switch (n)
+    {
+    case 2:
+      switch (mode)
+	{
+	case V16SImode:
+	  cmode = V8SImode;
+	  break;
+	case V16SFmode:
+	  cmode = V8SFmode;
+	  break;
+	case V8DImode:
+	  cmode = V4DImode;
+	  break;
+	case V8DFmode:
+	  cmode = V4DFmode;
+	  break;
+	case V8SImode:
+	  cmode = V4SImode;
+	  break;
+	case V8SFmode:
+	  cmode = V4SFmode;
+	  break;
+	case V4DImode:
+	  cmode = V2DImode;
+	  break;
+	case V4DFmode:
+	  cmode = V2DFmode;
+	  break;
+	case V4SImode:
+	  cmode = V2SImode;
+	  break;
+	case V4SFmode:
+	  cmode = V2SFmode;
+	  break;
+	case V2DImode:
+	  cmode = DImode;
+	  break;
+	case V2SImode:
+	  cmode = SImode;
+	  break;
+	case V2DFmode:
+	  cmode = DFmode;
+	  break;
+	case V2SFmode:
+	  cmode = SFmode;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      if (!register_operand (ops[1], cmode))
+	ops[1] = force_reg (cmode, ops[1]);
+      if (!register_operand (ops[0], cmode))
+	ops[0] = force_reg (cmode, ops[0]);
+      emit_insn (gen_rtx_SET (VOIDmode, target,
+			      gen_rtx_VEC_CONCAT (mode, ops[0],
+						  ops[1])));
+      break;
+
+    case 4:
+      switch (mode)
+	{
+	case V4DImode:
+	  cmode = V2DImode;
+	  break;
+	case V4DFmode:
+	  cmode = V2DFmode;
+	  break;
+	case V4SImode:
+	  cmode = V2SImode;
+	  break;
+	case V4SFmode:
+	  cmode = V2SFmode;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      goto half;
+
+    case 8:
+      switch (mode)
+	{
+	case V8DImode:
+	  cmode = V2DImode;
+	  hmode = V4DImode;
+	  break;
+	case V8DFmode:
+	  cmode = V2DFmode;
+	  hmode = V4DFmode;
+	  break;
+	case V8SImode:
+	  cmode = V2SImode;
+	  hmode = V4SImode;
+	  break;
+	case V8SFmode:
+	  cmode = V2SFmode;
+	  hmode = V4SFmode;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      goto half;
+
+    case 16:
+      switch (mode)
+	{
+	case V16SImode:
+	  cmode = V2SImode;
+	  hmode = V4SImode;
+	  gmode = V8SImode;
+	  break;
+	case V16SFmode:
+	  cmode = V2SFmode;
+	  hmode = V4SFmode;
+	  gmode = V8SFmode;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      goto half;
+
+half:
+      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
+      i = n - 1;
+      j = (n >> 1) - 1;
+      for (; i > 0; i -= 2, j--)
+	{
+	  first[j] = gen_reg_rtx (cmode);
+	  v = gen_rtvec (2, ops[i - 1], ops[i]);
+	  ix86_expand_vector_init (false, first[j],
+				   gen_rtx_PARALLEL (cmode, v));
+	}
+
+      n >>= 1;
+      if (n > 4)
+	{
+	  gcc_assert (hmode != VOIDmode);
+	  gcc_assert (gmode != VOIDmode);
+	  for (i = j = 0; i < n; i += 2, j++)
+	    {
+	      second[j] = gen_reg_rtx (hmode);
+	      ix86_expand_vector_init_concat (hmode, second [j],
+					      &first [i], 2);
+	    }
+	  n >>= 1;
+	  for (i = j = 0; i < n; i += 2, j++)
+	    {
+	      third[j] = gen_reg_rtx (gmode);
+	      ix86_expand_vector_init_concat (gmode, third[j],
+					      &second[i], 2);
+	    }
+	  n >>= 1;
+	  ix86_expand_vector_init_concat (mode, target, third, n);
+	}
+      else if (n > 2)
+	{
+	  gcc_assert (hmode != VOIDmode);
+	  for (i = j = 0; i < n; i += 2, j++)
+	    {
+	      second[j] = gen_reg_rtx (hmode);
+	      ix86_expand_vector_init_concat (hmode, second [j],
+					      &first [i], 2);
+	    }
+	  n >>= 1;
+	  ix86_expand_vector_init_concat (mode, target, second, n);
+	}
+      else
+	ix86_expand_vector_init_concat (mode, target, first, n);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* A subroutine of ix86_expand_vector_init_general.  Use vector
+   interleave to handle the most general case: all values variable,
+   and none identical.  */
+
+static void
+ix86_expand_vector_init_interleave (enum machine_mode mode,
+				    rtx target, rtx *ops, int n)
+{
+  enum machine_mode first_imode, second_imode, third_imode, inner_mode;
+  int i, j;
+  rtx op0, op1;
+  rtx (*gen_load_even) (rtx, rtx, rtx);
+  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
+  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
+
+  switch (mode)
+    {
+    case V8HImode:
+      gen_load_even = gen_vec_setv8hi;
+      gen_interleave_first_low = gen_vec_interleave_lowv4si;
+      gen_interleave_second_low = gen_vec_interleave_lowv2di;
+      inner_mode = HImode;
+      first_imode = V4SImode;
+      second_imode = V2DImode;
+      third_imode = VOIDmode;
+      break;
+    case V16QImode:
+      gen_load_even = gen_vec_setv16qi;
+      gen_interleave_first_low = gen_vec_interleave_lowv8hi;
+      gen_interleave_second_low = gen_vec_interleave_lowv4si;
+      inner_mode = QImode;
+      first_imode = V8HImode;
+      second_imode = V4SImode;
+      third_imode = V2DImode;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  for (i = 0; i < n; i++)
+    {
+      /* Extend the odd elment to SImode using a paradoxical SUBREG.  */
+      op0 = gen_reg_rtx (SImode);
+      emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
+
+      /* Insert the SImode value as low element of V4SImode vector. */
+      op1 = gen_reg_rtx (V4SImode);
+      op0 = gen_rtx_VEC_MERGE (V4SImode,
+			       gen_rtx_VEC_DUPLICATE (V4SImode,
+						      op0),
+			       CONST0_RTX (V4SImode),
+			       const1_rtx);
+      emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
+
+      /* Cast the V4SImode vector back to a vector in orignal mode.  */
+      op0 = gen_reg_rtx (mode);
+      emit_move_insn (op0, gen_lowpart (mode, op1));
+
+      /* Load even elements into the second position.  */
+      emit_insn (gen_load_even (op0,
+				force_reg (inner_mode,
+					   ops [i + i + 1]),
+				const1_rtx));
+
+      /* Cast vector to FIRST_IMODE vector.  */
+      ops[i] = gen_reg_rtx (first_imode);
+      emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
+    }
+
+  /* Interleave low FIRST_IMODE vectors.  */
+  for (i = j = 0; i < n; i += 2, j++)
+    {
+      op0 = gen_reg_rtx (first_imode);
+      emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
+
+      /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
+      ops[j] = gen_reg_rtx (second_imode);
+      emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
+    }
+
+  /* Interleave low SECOND_IMODE vectors.  */
+  switch (second_imode)
+    {
+    case V4SImode:
+      for (i = j = 0; i < n / 2; i += 2, j++)
+	{
+	  op0 = gen_reg_rtx (second_imode);
+	  emit_insn (gen_interleave_second_low (op0, ops[i],
+						ops[i + 1]));
+
+	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
+	     vector.  */
+	  ops[j] = gen_reg_rtx (third_imode);
+	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
+	}
+      second_imode = V2DImode;
+      gen_interleave_second_low = gen_vec_interleave_lowv2di;
+      /* FALLTHRU */
+
+    case V2DImode:
+      op0 = gen_reg_rtx (second_imode);
+      emit_insn (gen_interleave_second_low (op0, ops[0],
+					    ops[1]));
+
+      /* Cast the SECOND_IMODE vector back to a vector on original
+	 mode.  */
+      emit_insn (gen_rtx_SET (VOIDmode, target,
+			      gen_lowpart (mode, op0)));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
+   all values variable, and none identical.  */
+
+static void
+ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
+				 rtx target, rtx vals)
+{
+  rtx ops[64], op0, op1;
+  enum machine_mode half_mode = VOIDmode;
+  int n, i;
+
+  switch (mode)
+    {
+    case V2SFmode:
+    case V2SImode:
+      if (!mmx_ok && !TARGET_SSE)
+	break;
+      /* FALLTHRU */
+
+    case V16SImode:
+    case V16SFmode:
+    case V8DFmode:
+    case V8DImode:
+    case V8SFmode:
+    case V8SImode:
+    case V4DFmode:
+    case V4DImode:
+    case V4SFmode:
+    case V4SImode:
+    case V2DFmode:
+    case V2DImode:
+      n = GET_MODE_NUNITS (mode);
+      for (i = 0; i < n; i++)
+	ops[i] = XVECEXP (vals, 0, i);
+      ix86_expand_vector_init_concat (mode, target, ops, n);
+      return;
+
+    case V32QImode:
+      half_mode = V16QImode;
+      goto half;
+
+    case V16HImode:
+      half_mode = V8HImode;
+      goto half;
+
+half:
+      n = GET_MODE_NUNITS (mode);
+      for (i = 0; i < n; i++)
+	ops[i] = XVECEXP (vals, 0, i);
+      op0 = gen_reg_rtx (half_mode);
+      op1 = gen_reg_rtx (half_mode);
+      ix86_expand_vector_init_interleave (half_mode, op0, ops,
+					  n >> 2);
+      ix86_expand_vector_init_interleave (half_mode, op1,
+					  &ops [n >> 1], n >> 2);
+      emit_insn (gen_rtx_SET (VOIDmode, target,
+			      gen_rtx_VEC_CONCAT (mode, op0, op1)));
+      return;
+
+    case V16QImode:
+      if (!TARGET_SSE4_1)
+	break;
+      /* FALLTHRU */
+
+    case V8HImode:
+      if (!TARGET_SSE2)
+	break;
+
+      /* Don't use ix86_expand_vector_init_interleave if we can't
+	 move from GPR to SSE register directly.  */
+      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
+	break;
+
+      n = GET_MODE_NUNITS (mode);
+      for (i = 0; i < n; i++)
+	ops[i] = XVECEXP (vals, 0, i);
+      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
+      return;
+
+    case V4HImode:
+    case V8QImode:
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+    {
+      int i, j, n_elts, n_words, n_elt_per_word;
+      enum machine_mode inner_mode;
+      rtx words[4], shift;
+
+      inner_mode = GET_MODE_INNER (mode);
+      n_elts = GET_MODE_NUNITS (mode);
+      n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+      n_elt_per_word = n_elts / n_words;
+      shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
+
+      for (i = 0; i < n_words; ++i)
+	{
+	  rtx word = NULL_RTX;
+
+	  for (j = 0; j < n_elt_per_word; ++j)
+	    {
+	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
+	      elt = convert_modes (word_mode, inner_mode, elt, true);
+
+	      if (j == 0)
+		word = elt;
+	      else
+		{
+		  word = expand_simple_binop (word_mode, ASHIFT, word, shift,
+					      word, 1, OPTAB_LIB_WIDEN);
+		  word = expand_simple_binop (word_mode, IOR, word, elt,
+					      word, 1, OPTAB_LIB_WIDEN);
+		}
+	    }
+
+	  words[i] = word;
+	}
+
+      if (n_words == 1)
+	emit_move_insn (target, gen_lowpart (mode, words[0]));
+      else if (n_words == 2)
+	{
+	  rtx tmp = gen_reg_rtx (mode);
+	  emit_clobber (tmp);
+	  emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
+	  emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
+	  emit_move_insn (target, tmp);
+	}
+      else if (n_words == 4)
+	{
+	  rtx tmp = gen_reg_rtx (V4SImode);
+	  gcc_assert (word_mode == SImode);
+	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
+	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
+	  emit_move_insn (target, gen_lowpart (mode, tmp));
+	}
+      else
+	gcc_unreachable ();
+    }
+}
+
+/* Initialize vector TARGET via VALS.  Suppress the use of MMX
+   instructions unless MMX_OK is true.  */
+
+void
+ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
+{
+  enum machine_mode mode = GET_MODE (target);
+  enum machine_mode inner_mode = GET_MODE_INNER (mode);
+  int n_elts = GET_MODE_NUNITS (mode);
+  int n_var = 0, one_var = -1;
+  bool all_same = true, all_const_zero = true;
+  int i;
+  rtx x;
+
+  for (i = 0; i < n_elts; ++i)
+    {
+      x = XVECEXP (vals, 0, i);
+      if (!(CONST_INT_P (x)
+	    || GET_CODE (x) == CONST_DOUBLE
+	    || GET_CODE (x) == CONST_FIXED))
+	n_var++, one_var = i;
+      else if (x != CONST0_RTX (inner_mode))
+	all_const_zero = false;
+      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
+	all_same = false;
+    }
+
+  /* Constants are best loaded from the constant pool.  */
+  if (n_var == 0)
+    {
+      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
+      return;
+    }
+
+  /* If all values are identical, broadcast the value.  */
+  if (all_same
+      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
+					    XVECEXP (vals, 0, 0)))
+    return;
+
+  /* Values where only one field is non-constant are best loaded from
+     the pool and overwritten via move later.  */
+  if (n_var == 1)
+    {
+      if (all_const_zero
+	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
+						  XVECEXP (vals, 0, one_var),
+						  one_var))
+	return;
+
+      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
+	return;
+    }
+
+  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
+}
+
+void
+ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
+{
+  enum machine_mode mode = GET_MODE (target);
+  enum machine_mode inner_mode = GET_MODE_INNER (mode);
+  enum machine_mode half_mode;
+  bool use_vec_merge = false;
+  rtx tmp;
+  static rtx (*gen_extract[6][2]) (rtx, rtx)
+    = {
+	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
+	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
+	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
+	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
+	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
+	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
+      };
+  static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
+    = {
+	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
+	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
+	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
+	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
+	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
+	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
+      };
+  int i, j, n;
+
+  switch (mode)
+    {
+    case V2SFmode:
+    case V2SImode:
+      if (mmx_ok)
+	{
+	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
+	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
+	  if (elt == 0)
+	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
+	  else
+	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
+	  emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
+	  return;
+	}
+      break;
+
+    case V2DImode:
+      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
+      if (use_vec_merge)
+	break;
+
+      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
+      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
+      if (elt == 0)
+	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
+      else
+	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
+      emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
+      return;
+
+    case V2DFmode:
+      {
+	rtx op0, op1;
+
+	/* For the two element vectors, we implement a VEC_CONCAT with
+	   the extraction of the other element.  */
+
+	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
+	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
+
+	if (elt == 0)
+	  op0 = val, op1 = tmp;
+	else
+	  op0 = tmp, op1 = val;
+
+	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
+	emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
+      }
+      return;
+
+    case V4SFmode:
+      use_vec_merge = TARGET_SSE4_1;
+      if (use_vec_merge)
+	break;
+
+      switch (elt)
+	{
+	case 0:
+	  use_vec_merge = true;
+	  break;
+
+	case 1:
+	  /* tmp = target = A B C D */
+	  tmp = copy_to_reg (target);
+	  /* target = A A B B */
+	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
+	  /* target = X A B B */
+	  ix86_expand_vector_set (false, target, val, 0);
+	  /* target = A X C D  */
+	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
+					  const1_rtx, const0_rtx,
+					  GEN_INT (2+4), GEN_INT (3+4)));
+	  return;
+
+	case 2:
+	  /* tmp = target = A B C D */
+	  tmp = copy_to_reg (target);
+	  /* tmp = X B C D */
+	  ix86_expand_vector_set (false, tmp, val, 0);
+	  /* target = A B X D */
+	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
+					  const0_rtx, const1_rtx,
+					  GEN_INT (0+4), GEN_INT (3+4)));
+	  return;
+
+	case 3:
+	  /* tmp = target = A B C D */
+	  tmp = copy_to_reg (target);
+	  /* tmp = X B C D */
+	  ix86_expand_vector_set (false, tmp, val, 0);
+	  /* target = A B X D */
+	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
+					  const0_rtx, const1_rtx,
+					  GEN_INT (2+4), GEN_INT (0+4)));
+	  return;
+
+	default:
+	  gcc_unreachable ();
+	}
+      break;
+
+    case V4SImode:
+      use_vec_merge = TARGET_SSE4_1;
+      if (use_vec_merge)
+	break;
+
+      /* Element 0 handled by vec_merge below.  */
+      if (elt == 0)
+	{
+	  use_vec_merge = true;
+	  break;
+	}
+
+      if (TARGET_SSE2)
+	{
+	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
+	     store into element 0, then shuffle them back.  */
+
+	  rtx order[4];
+
+	  order[0] = GEN_INT (elt);
+	  order[1] = const1_rtx;
+	  order[2] = const2_rtx;
+	  order[3] = GEN_INT (3);
+	  order[elt] = const0_rtx;
+
+	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
+					order[1], order[2], order[3]));
+
+	  ix86_expand_vector_set (false, target, val, 0);
+
+	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
+					order[1], order[2], order[3]));
+	}
+      else
+	{
+	  /* For SSE1, we have to reuse the V4SF code.  */
+	  rtx t = gen_reg_rtx (V4SFmode);
+	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
+	  emit_move_insn (target, gen_lowpart (mode, t));
+	}
+      return;
+
+    case V8HImode:
+      use_vec_merge = TARGET_SSE2;
+      break;
+    case V4HImode:
+      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
+      break;
+
+    case V16QImode:
+      use_vec_merge = TARGET_SSE4_1;
+      break;
+
+    case V8QImode:
+      break;
+
+    case V32QImode:
+      half_mode = V16QImode;
+      j = 0;
+      n = 16;
+      goto half;
+
+    case V16HImode:
+      half_mode = V8HImode;
+      j = 1;
+      n = 8;
+      goto half;
+
+    case V8SImode:
+      half_mode = V4SImode;
+      j = 2;
+      n = 4;
+      goto half;
+
+    case V4DImode:
+      half_mode = V2DImode;
+      j = 3;
+      n = 2;
+      goto half;
+
+    case V8SFmode:
+      half_mode = V4SFmode;
+      j = 4;
+      n = 4;
+      goto half;
+
+    case V4DFmode:
+      half_mode = V2DFmode;
+      j = 5;
+      n = 2;
+      goto half;
+
+half:
+      /* Compute offset.  */
+      i = elt / n;
+      elt %= n;
+
+      gcc_assert (i <= 1);
+
+      /* Extract the half.  */
+      tmp = gen_reg_rtx (half_mode);
+      emit_insn (gen_extract[j][i] (tmp, target));
+
+      /* Put val in tmp at elt.  */
+      ix86_expand_vector_set (false, tmp, val, elt);
+
+      /* Put it back.  */
+      emit_insn (gen_insert[j][i] (target, target, tmp));
+      return;
+
+    default:
+      break;
+    }
+
+  if (use_vec_merge)
+    {
+      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
+      tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
+      emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
+    }
+  else
+    {
+      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
+
+      emit_move_insn (mem, target);
+
+      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
+      emit_move_insn (tmp, val);
+
+      emit_move_insn (target, mem);
+    }
+}
+
+void
+ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
+{
+  enum machine_mode mode = GET_MODE (vec);
+  enum machine_mode inner_mode = GET_MODE_INNER (mode);
+  bool use_vec_extr = false;
+  rtx tmp;
+
+  switch (mode)
+    {
+    case V2SImode:
+    case V2SFmode:
+      if (!mmx_ok)
+	break;
+      /* FALLTHRU */
+
+    case V2DFmode:
+    case V2DImode:
+      use_vec_extr = true;
+      break;
+
+    case V4SFmode:
+      use_vec_extr = TARGET_SSE4_1;
+      if (use_vec_extr)
+	break;
+
+      switch (elt)
+	{
+	case 0:
+	  tmp = vec;
+	  break;
+
+	case 1:
+	case 3:
+	  tmp = gen_reg_rtx (mode);
+	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
+				       GEN_INT (elt), GEN_INT (elt),
+				       GEN_INT (elt+4), GEN_INT (elt+4)));
+	  break;
+
+	case 2:
+	  tmp = gen_reg_rtx (mode);
+	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+      vec = tmp;
+      use_vec_extr = true;
+      elt = 0;
+      break;
+
+    case V4SImode:
+      use_vec_extr = TARGET_SSE4_1;
+      if (use_vec_extr)
+	break;
+
+      if (TARGET_SSE2)
+	{
+	  switch (elt)
+	    {
+	    case 0:
+	      tmp = vec;
+	      break;
+
+	    case 1:
+	    case 3:
+	      tmp = gen_reg_rtx (mode);
+	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
+					    GEN_INT (elt), GEN_INT (elt),
+					    GEN_INT (elt), GEN_INT (elt)));
+	      break;
+
+	    case 2:
+	      tmp = gen_reg_rtx (mode);
+	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
+	      break;
+
+	    default:
+	      gcc_unreachable ();
+	    }
+	  vec = tmp;
+	  use_vec_extr = true;
+	  elt = 0;
+	}
+      else
+	{
+	  /* For SSE1, we have to reuse the V4SF code.  */
+	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
+				      gen_lowpart (V4SFmode, vec), elt);
+	  return;
+	}
+      break;
+
+    case V8HImode:
+      use_vec_extr = TARGET_SSE2;
+      break;
+    case V4HImode:
+      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
+      break;
+
+    case V16QImode:
+      use_vec_extr = TARGET_SSE4_1;
+      break;
+
+    case V8SFmode:
+      if (TARGET_AVX)
+	{
+	  tmp = gen_reg_rtx (V4SFmode);
+	  if (elt < 4)
+	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
+	  else
+	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
+	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
+	  return;
+	}
+      break;
+
+    case V4DFmode:
+      if (TARGET_AVX)
+	{
+	  tmp = gen_reg_rtx (V2DFmode);
+	  if (elt < 2)
+	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
+	  else
+	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
+	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
+	  return;
+	}
+      break;
+
+    case V32QImode:
+      if (TARGET_AVX)
+	{
+	  tmp = gen_reg_rtx (V16QImode);
+	  if (elt < 16)
+	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
+	  else
+	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
+	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
+	  return;
+	}
+      break;
+
+    case V16HImode:
+      if (TARGET_AVX)
+	{
+	  tmp = gen_reg_rtx (V8HImode);
+	  if (elt < 8)
+	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
+	  else
+	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
+	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
+	  return;
+	}
+      break;
+
+    case V8SImode:
+      if (TARGET_AVX)
+	{
+	  tmp = gen_reg_rtx (V4SImode);
+	  if (elt < 4)
+	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
+	  else
+	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
+	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
+	  return;
+	}
+      break;
+
+    case V4DImode:
+      if (TARGET_AVX)
+	{
+	  tmp = gen_reg_rtx (V2DImode);
+	  if (elt < 2)
+	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
+	  else
+	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
+	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
+	  return;
+	}
+      break;
+
+    case V16SFmode:
+      tmp = gen_reg_rtx (V8SFmode);
+      if (elt < 8)
+	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 7);
+      return;
+
+    case V8DFmode:
+      tmp = gen_reg_rtx (V4DFmode);
+      if (elt < 4)
+	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 3);
+      return;
+
+    case V16SImode:
+      tmp = gen_reg_rtx (V8SImode);
+      if (elt < 8)
+	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 7);
+      return;
+
+    case V8DImode:
+      tmp = gen_reg_rtx (V4DImode);
+      if (elt < 4)
+	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
+      else
+	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
+      ix86_expand_vector_extract (false, target, tmp, elt & 3);
+      return;
+
+    case V8QImode:
+      /* ??? Could extract the appropriate HImode element and shift.  */
+    default:
+      break;
+    }
+
+  if (use_vec_extr)
+    {
+      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
+      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
+
+      /* Let the rtl optimizers know about the zero extension performed.  */
+      if (inner_mode == QImode || inner_mode == HImode)
+	{
+	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
+	  target = gen_lowpart (SImode, target);
+	}
+
+      emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
+    }
+  else
+    {
+      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
+
+      emit_move_insn (mem, vec);
+
+      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
+      emit_move_insn (target, tmp);
+    }
+}
+
+/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
+   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
+   The upper bits of DEST are undefined, though they shouldn't cause
+   exceptions (some bits from src or all zeros are ok).  */
+
+static void
+emit_reduc_half (rtx dest, rtx src, int i)
+{
+  rtx tem, d = dest;
+  switch (GET_MODE (src))
+    {
+    case V4SFmode:
+      if (i == 128)
+	tem = gen_sse_movhlps (dest, src, src);
+      else
+	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
+				   GEN_INT (1 + 4), GEN_INT (1 + 4));
+      break;
+    case V2DFmode:
+      tem = gen_vec_interleave_highv2df (dest, src, src);
+      break;
+    case V16QImode:
+    case V8HImode:
+    case V4SImode:
+    case V2DImode:
+      d = gen_reg_rtx (V1TImode);
+      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
+				GEN_INT (i / 2));
+      break;
+    case V8SFmode:
+      if (i == 256)
+	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
+      else
+	tem = gen_avx_shufps256 (dest, src, src,
+				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
+      break;
+    case V4DFmode:
+      if (i == 256)
+	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
+      else
+	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
+      break;
+    case V32QImode:
+    case V16HImode:
+    case V8SImode:
+    case V4DImode:
+      if (i == 256)
+	{
+	  if (GET_MODE (dest) != V4DImode)
+	    d = gen_reg_rtx (V4DImode);
+	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
+				   gen_lowpart (V4DImode, src),
+				   const1_rtx);
+	}
+      else
+	{
+	  d = gen_reg_rtx (V2TImode);
+	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
+				    GEN_INT (i / 2));
+	}
+      break;
+    case V16SImode:
+    case V16SFmode:
+    case V8DImode:
+    case V8DFmode:
+      if (i > 128)
+	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
+				      gen_lowpart (V16SImode, src),
+				      gen_lowpart (V16SImode, src),
+				      GEN_INT (0x4 + (i == 512 ? 4 : 0)),
+				      GEN_INT (0x5 + (i == 512 ? 4 : 0)),
+				      GEN_INT (0x6 + (i == 512 ? 4 : 0)),
+				      GEN_INT (0x7 + (i == 512 ? 4 : 0)),
+				      GEN_INT (0xC), GEN_INT (0xD),
+				      GEN_INT (0xE), GEN_INT (0xF),
+				      GEN_INT (0x10), GEN_INT (0x11),
+				      GEN_INT (0x12), GEN_INT (0x13),
+				      GEN_INT (0x14), GEN_INT (0x15),
+				      GEN_INT (0x16), GEN_INT (0x17));
+      else
+	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
+				   gen_lowpart (V16SImode, src),
+				   GEN_INT (i == 128 ? 0x2 : 0x1),
+				   GEN_INT (0x3),
+				   GEN_INT (0x3),
+				   GEN_INT (0x3),
+				   GEN_INT (i == 128 ? 0x6 : 0x5),
+				   GEN_INT (0x7),
+				   GEN_INT (0x7),
+				   GEN_INT (0x7),
+				   GEN_INT (i == 128 ? 0xA : 0x9),
+				   GEN_INT (0xB),
+				   GEN_INT (0xB),
+				   GEN_INT (0xB),
+				   GEN_INT (i == 128 ? 0xE : 0xD),
+				   GEN_INT (0xF),
+				   GEN_INT (0xF),
+				   GEN_INT (0xF));
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  emit_insn (tem);
+  if (d != dest)
+    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+}
+
+/* Expand a vector reduction.  FN is the binary pattern to reduce;
+   DEST is the destination; IN is the input vector.  */
+
+void
+ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
+{
+  rtx half, dst, vec = in;
+  enum machine_mode mode = GET_MODE (in);
+  int i;
+
+  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
+  if (TARGET_SSE4_1
+      && mode == V8HImode
+      && fn == gen_uminv8hi3)
+    {
+      emit_insn (gen_sse4_1_phminposuw (dest, in));
+      return;
+    }
+
+  for (i = GET_MODE_BITSIZE (mode);
+       i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
+       i >>= 1)
+    {
+      half = gen_reg_rtx (mode);
+      emit_reduc_half (half, vec, i);
+      if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
+	dst = dest;
+      else
+	dst = gen_reg_rtx (mode);
+      emit_insn (fn (dst, half, vec));
+      vec = dst;
+    }
+}
+
+/* Target hook for scalar_mode_supported_p.  */
+static bool
+ix86_scalar_mode_supported_p (enum machine_mode mode)
+{
+  if (DECIMAL_FLOAT_MODE_P (mode))
+    return default_decimal_float_supported_p ();
+  else if (mode == TFmode)
+    return true;
+  else
+    return default_scalar_mode_supported_p (mode);
+}
+
+/* Implements target hook vector_mode_supported_p.  */
+static bool
+ix86_vector_mode_supported_p (enum machine_mode mode)
+{
+  if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
+    return true;
+  if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
+    return true;
+  if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
+    return true;
+  if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
+    return true;
+  if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
+    return true;
+  if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
+    return true;
+  return false;
+}
+
+/* Target hook for c_mode_for_suffix.  */
+static enum machine_mode
+ix86_c_mode_for_suffix (char suffix)
+{
+  if (suffix == 'q')
+    return TFmode;
+  if (suffix == 'w')
+    return XFmode;
+
+  return VOIDmode;
+}
+
+/* Worker function for TARGET_MD_ASM_CLOBBERS.
+
+   We do this in the new i386 backend to maintain source compatibility
+   with the old cc0-based compiler.  */
+
+static tree
+ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
+		      tree inputs ATTRIBUTE_UNUSED,
+		      tree clobbers)
+{
+  clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
+			clobbers);
+  clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
+			clobbers);
+  return clobbers;
+}
+
+/* Implements target vector targetm.asm.encode_section_info.  */
+
+static void ATTRIBUTE_UNUSED
+ix86_encode_section_info (tree decl, rtx rtl, int first)
+{
+  default_encode_section_info (decl, rtl, first);
+
+  if (TREE_CODE (decl) == VAR_DECL
+      && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
+      && ix86_in_large_data_p (decl))
+    SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
+}
+
+/* Worker function for REVERSE_CONDITION.  */
+
+enum rtx_code
+ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
+{
+  return (mode != CCFPmode && mode != CCFPUmode
+	  ? reverse_condition (code)
+	  : reverse_condition_maybe_unordered (code));
+}
+
+/* Output code to perform an x87 FP register move, from OPERANDS[1]
+   to OPERANDS[0].  */
+
+const char *
+output_387_reg_move (rtx insn, rtx *operands)
+{
+  if (REG_P (operands[0]))
+    {
+      if (REG_P (operands[1])
+	  && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
+	{
+	  if (REGNO (operands[0]) == FIRST_STACK_REG)
+	    return output_387_ffreep (operands, 0);
+	  return "fstp\t%y0";
+	}
+      if (STACK_TOP_P (operands[0]))
+	return "fld%Z1\t%y1";
+      return "fst\t%y0";
+    }
+  else if (MEM_P (operands[0]))
+    {
+      gcc_assert (REG_P (operands[1]));
+      if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
+	return "fstp%Z0\t%y0";
+      else
+	{
+	  /* There is no non-popping store to memory for XFmode.
+	     So if we need one, follow the store with a load.  */
+	  if (GET_MODE (operands[0]) == XFmode)
+	    return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
+	  else
+	    return "fst%Z0\t%y0";
+	}
+    }
+  else
+    gcc_unreachable();
+}
+
+/* Output code to perform a conditional jump to LABEL, if C2 flag in
+   FP status register is set.  */
+
+void
+ix86_emit_fp_unordered_jump (rtx label)
+{
+  rtx reg = gen_reg_rtx (HImode);
+  rtx temp;
+
+  emit_insn (gen_x86_fnstsw_1 (reg));
+
+  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
+    {
+      emit_insn (gen_x86_sahf_1 (reg));
+
+      temp = gen_rtx_REG (CCmode, FLAGS_REG);
+      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
+    }
+  else
+    {
+      emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
+
+      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
+    }
+
+  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
+			      gen_rtx_LABEL_REF (VOIDmode, label),
+			      pc_rtx);
+  temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
+
+  emit_jump_insn (temp);
+  predict_jump (REG_BR_PROB_BASE * 10 / 100);
+}
+
+/* Output code to perform a log1p XFmode calculation.  */
+
+void ix86_emit_i387_log1p (rtx op0, rtx op1)
+{
+  rtx label1 = gen_label_rtx ();
+  rtx label2 = gen_label_rtx ();
+
+  rtx tmp = gen_reg_rtx (XFmode);
+  rtx tmp2 = gen_reg_rtx (XFmode);
+  rtx test;
+
+  emit_insn (gen_absxf2 (tmp, op1));
+  test = gen_rtx_GE (VOIDmode, tmp,
+    CONST_DOUBLE_FROM_REAL_VALUE (
+       REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
+       XFmode));
+  emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
+
+  emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
+  emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
+  emit_jump (label2);
+
+  emit_label (label1);
+  emit_move_insn (tmp, CONST1_RTX (XFmode));
+  emit_insn (gen_addxf3 (tmp, op1, tmp));
+  emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
+  emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
+
+  emit_label (label2);
+}
+
+/* Emit code for round calculation.  */
+void ix86_emit_i387_round (rtx op0, rtx op1)
+{
+  enum machine_mode inmode = GET_MODE (op1);
+  enum machine_mode outmode = GET_MODE (op0);
+  rtx e1, e2, res, tmp, tmp1, half;
+  rtx scratch = gen_reg_rtx (HImode);
+  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+  rtx jump_label = gen_label_rtx ();
+  rtx insn;
+  rtx (*gen_abs) (rtx, rtx);
+  rtx (*gen_neg) (rtx, rtx);
+
+  switch (inmode)
+    {
+    case SFmode:
+      gen_abs = gen_abssf2;
+      break;
+    case DFmode:
+      gen_abs = gen_absdf2;
+      break;
+    case XFmode:
+      gen_abs = gen_absxf2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  switch (outmode)
+    {
+    case SFmode:
+      gen_neg = gen_negsf2;
+      break;
+    case DFmode:
+      gen_neg = gen_negdf2;
+      break;
+    case XFmode:
+      gen_neg = gen_negxf2;
+      break;
+    case HImode:
+      gen_neg = gen_neghi2;
+      break;
+    case SImode:
+      gen_neg = gen_negsi2;
+      break;
+    case DImode:
+      gen_neg = gen_negdi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  e1 = gen_reg_rtx (inmode);
+  e2 = gen_reg_rtx (inmode);
+  res = gen_reg_rtx (outmode);
+
+  half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
+
+  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
+
+  /* scratch = fxam(op1) */
+  emit_insn (gen_rtx_SET (VOIDmode, scratch,
+			  gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
+					  UNSPEC_FXAM)));
+  /* e1 = fabs(op1) */
+  emit_insn (gen_abs (e1, op1));
+
+  /* e2 = e1 + 0.5 */
+  half = force_reg (inmode, half);
+  emit_insn (gen_rtx_SET (VOIDmode, e2,
+			  gen_rtx_PLUS (inmode, e1, half)));
+
+  /* res = floor(e2) */
+  if (inmode != XFmode)
+    {
+      tmp1 = gen_reg_rtx (XFmode);
+
+      emit_insn (gen_rtx_SET (VOIDmode, tmp1,
+			      gen_rtx_FLOAT_EXTEND (XFmode, e2)));
+    }
+  else
+    tmp1 = e2;
+
+  switch (outmode)
+    {
+    case SFmode:
+    case DFmode:
+      {
+	rtx tmp0 = gen_reg_rtx (XFmode);
+
+	emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
+
+	emit_insn (gen_rtx_SET (VOIDmode, res,
+				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
+						UNSPEC_TRUNC_NOOP)));
+      }
+      break;
+    case XFmode:
+      emit_insn (gen_frndintxf2_floor (res, tmp1));
+      break;
+    case HImode:
+      emit_insn (gen_lfloorxfhi2 (res, tmp1));
+      break;
+    case SImode:
+      emit_insn (gen_lfloorxfsi2 (res, tmp1));
+      break;
+    case DImode:
+      emit_insn (gen_lfloorxfdi2 (res, tmp1));
+	break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* flags = signbit(a) */
+  emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
+
+  /* if (flags) then res = -res */
+  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
+			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
+			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
+			      pc_rtx);
+  insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+  predict_jump (REG_BR_PROB_BASE * 50 / 100);
+  JUMP_LABEL (insn) = jump_label;
+
+  emit_insn (gen_neg (res, res));
+
+  emit_label (jump_label);
+  LABEL_NUSES (jump_label) = 1;
+
+  emit_move_insn (op0, res);
+}
+
+/* Output code to perform a Newton-Rhapson approximation of a single precision
+   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
+
+void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
+{
+  rtx x0, x1, e0, e1;
+
+  x0 = gen_reg_rtx (mode);
+  e0 = gen_reg_rtx (mode);
+  e1 = gen_reg_rtx (mode);
+  x1 = gen_reg_rtx (mode);
+
+  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
+
+  b = force_reg (mode, b);
+
+  /* x0 = rcp(b) estimate */
+  if (mode == V16SFmode || mode == V8DFmode)
+    emit_insn (gen_rtx_SET (VOIDmode, x0,
+			    gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+					    UNSPEC_RCP14)));
+  else
+    emit_insn (gen_rtx_SET (VOIDmode, x0,
+			    gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+					    UNSPEC_RCP)));
+
+  /* e0 = x0 * b */
+  emit_insn (gen_rtx_SET (VOIDmode, e0,
+			  gen_rtx_MULT (mode, x0, b)));
+
+  /* e0 = x0 * e0 */
+  emit_insn (gen_rtx_SET (VOIDmode, e0,
+			  gen_rtx_MULT (mode, x0, e0)));
+
+  /* e1 = x0 + x0 */
+  emit_insn (gen_rtx_SET (VOIDmode, e1,
+			  gen_rtx_PLUS (mode, x0, x0)));
+
+  /* x1 = e1 - e0 */
+  emit_insn (gen_rtx_SET (VOIDmode, x1,
+			  gen_rtx_MINUS (mode, e1, e0)));
+
+  /* res = a * x1 */
+  emit_insn (gen_rtx_SET (VOIDmode, res,
+			  gen_rtx_MULT (mode, a, x1)));
+}
+
+/* Output code to perform a Newton-Rhapson approximation of a
+   single precision floating point [reciprocal] square root.  */
+
+void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
+			 bool recip)
+{
+  rtx x0, e0, e1, e2, e3, mthree, mhalf;
+  REAL_VALUE_TYPE r;
+  int unspec;
+
+  x0 = gen_reg_rtx (mode);
+  e0 = gen_reg_rtx (mode);
+  e1 = gen_reg_rtx (mode);
+  e2 = gen_reg_rtx (mode);
+  e3 = gen_reg_rtx (mode);
+
+  real_from_integer (&r, VOIDmode, -3, -1, 0);
+  mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
+
+  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
+  mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
+  unspec = UNSPEC_RSQRT;
+
+  if (VECTOR_MODE_P (mode))
+    {
+      mthree = ix86_build_const_vector (mode, true, mthree);
+      mhalf = ix86_build_const_vector (mode, true, mhalf);
+      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
+      if (GET_MODE_SIZE (mode) == 64)
+	unspec = UNSPEC_RSQRT14;
+    }
+
+  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
+     rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
+
+  a = force_reg (mode, a);
+
+  /* x0 = rsqrt(a) estimate */
+  emit_insn (gen_rtx_SET (VOIDmode, x0,
+			  gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+					  unspec)));
+
+  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
+  if (!recip)
+    {
+      rtx zero, mask;
+
+      zero = gen_reg_rtx (mode);
+      mask = gen_reg_rtx (mode);
+
+      zero = force_reg (mode, CONST0_RTX(mode));
+
+      /* Handle masked compare.  */
+      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
+	{
+	  mask = gen_reg_rtx (HImode);
+	  /* Imm value 0x4 corresponds to not-equal comparison.  */
+	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
+	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
+	}
+      else
+	{
+	  emit_insn (gen_rtx_SET (VOIDmode, mask,
+				  gen_rtx_NE (mode, zero, a)));
+
+	  emit_insn (gen_rtx_SET (VOIDmode, x0,
+				  gen_rtx_AND (mode, x0, mask)));
+	}
+    }
+
+  /* e0 = x0 * a */
+  emit_insn (gen_rtx_SET (VOIDmode, e0,
+			  gen_rtx_MULT (mode, x0, a)));
+  /* e1 = e0 * x0 */
+  emit_insn (gen_rtx_SET (VOIDmode, e1,
+			  gen_rtx_MULT (mode, e0, x0)));
+
+  /* e2 = e1 - 3. */
+  mthree = force_reg (mode, mthree);
+  emit_insn (gen_rtx_SET (VOIDmode, e2,
+			  gen_rtx_PLUS (mode, e1, mthree)));
+
+  mhalf = force_reg (mode, mhalf);
+  if (recip)
+    /* e3 = -.5 * x0 */
+    emit_insn (gen_rtx_SET (VOIDmode, e3,
+			    gen_rtx_MULT (mode, x0, mhalf)));
+  else
+    /* e3 = -.5 * e0 */
+    emit_insn (gen_rtx_SET (VOIDmode, e3,
+			    gen_rtx_MULT (mode, e0, mhalf)));
+  /* ret = e2 * e3 */
+  emit_insn (gen_rtx_SET (VOIDmode, res,
+			  gen_rtx_MULT (mode, e2, e3)));
+}
+
+#ifdef TARGET_SOLARIS
+/* Solaris implementation of TARGET_ASM_NAMED_SECTION.  */
+
+static void
+i386_solaris_elf_named_section (const char *name, unsigned int flags,
+				tree decl)
+{
+  /* With Binutils 2.15, the "@unwind" marker must be specified on
+     every occurrence of the ".eh_frame" section, not just the first
+     one.  */
+  if (TARGET_64BIT
+      && strcmp (name, ".eh_frame") == 0)
+    {
+      fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
+	       flags & SECTION_WRITE ? "aw" : "a");
+      return;
+    }
+
+#ifndef USE_GAS
+  if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
+    {
+      solaris_elf_asm_comdat_section (name, flags, decl);
+      return;
+    }
+#endif
+
+  default_elf_asm_named_section (name, flags, decl);
+}
+#endif /* TARGET_SOLARIS */
+
+/* Return the mangling of TYPE if it is an extended fundamental type.  */
+
+static const char *
+ix86_mangle_type (const_tree type)
+{
+  type = TYPE_MAIN_VARIANT (type);
+
+  if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
+      && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
+    return NULL;
+
+  switch (TYPE_MODE (type))
+    {
+    case TFmode:
+      /* __float128 is "g".  */
+      return "g";
+    case XFmode:
+      /* "long double" or __float80 is "e".  */
+      return "e";
+    default:
+      return NULL;
+    }
+}
+
+/* For 32-bit code we can save PIC register setup by using
+   __stack_chk_fail_local hidden function instead of calling
+   __stack_chk_fail directly.  64-bit code doesn't need to setup any PIC
+   register, so it is better to call __stack_chk_fail directly.  */
+
+static tree ATTRIBUTE_UNUSED
+ix86_stack_protect_fail (void)
+{
+  return TARGET_64BIT
+	 ? default_external_stack_protect_fail ()
+	 : default_hidden_stack_protect_fail ();
+}
+
+/* Select a format to encode pointers in exception handling data.  CODE
+   is 0 for data, 1 for code labels, 2 for function pointers.  GLOBAL is
+   true if the symbol may be affected by dynamic relocations.
+
+   ??? All x86 object file formats are capable of representing this.
+   After all, the relocation needed is the same as for the call insn.
+   Whether or not a particular assembler allows us to enter such, I
+   guess we'll have to see.  */
+int
+asm_preferred_eh_data_format (int code, int global)
+{
+  if (flag_pic)
+    {
+      int type = DW_EH_PE_sdata8;
+      if (!TARGET_64BIT
+	  || ix86_cmodel == CM_SMALL_PIC
+	  || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
+	type = DW_EH_PE_sdata4;
+      return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
+    }
+  if (ix86_cmodel == CM_SMALL
+      || (ix86_cmodel == CM_MEDIUM && code))
+    return DW_EH_PE_udata4;
+  return DW_EH_PE_absptr;
+}
+
+/* Expand copysign from SIGN to the positive value ABS_VALUE
+   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
+   the sign-bit.  */
+static void
+ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
+{
+  enum machine_mode mode = GET_MODE (sign);
+  rtx sgn = gen_reg_rtx (mode);
+  if (mask == NULL_RTX)
+    {
+      enum machine_mode vmode;
+
+      if (mode == SFmode)
+	vmode = V4SFmode;
+      else if (mode == DFmode)
+	vmode = V2DFmode;
+      else
+	vmode = mode;
+
+      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
+      if (!VECTOR_MODE_P (mode))
+	{
+	  /* We need to generate a scalar mode mask in this case.  */
+	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
+	  mask = gen_reg_rtx (mode);
+	  emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
+	}
+    }
+  else
+    mask = gen_rtx_NOT (mode, mask);
+  emit_insn (gen_rtx_SET (VOIDmode, sgn,
+			  gen_rtx_AND (mode, mask, sign)));
+  emit_insn (gen_rtx_SET (VOIDmode, result,
+			  gen_rtx_IOR (mode, abs_value, sgn)));
+}
+
+/* Expand fabs (OP0) and return a new rtx that holds the result.  The
+   mask for masking out the sign-bit is stored in *SMASK, if that is
+   non-null.  */
+static rtx
+ix86_expand_sse_fabs (rtx op0, rtx *smask)
+{
+  enum machine_mode vmode, mode = GET_MODE (op0);
+  rtx xa, mask;
+
+  xa = gen_reg_rtx (mode);
+  if (mode == SFmode)
+    vmode = V4SFmode;
+  else if (mode == DFmode)
+    vmode = V2DFmode;
+  else
+    vmode = mode;
+  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
+  if (!VECTOR_MODE_P (mode))
+    {
+      /* We need to generate a scalar mode mask in this case.  */
+      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
+      mask = gen_reg_rtx (mode);
+      emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
+    }
+  emit_insn (gen_rtx_SET (VOIDmode, xa,
+			  gen_rtx_AND (mode, op0, mask)));
+
+  if (smask)
+    *smask = mask;
+
+  return xa;
+}
+
+/* Expands a comparison of OP0 with OP1 using comparison code CODE,
+   swapping the operands if SWAP_OPERANDS is true.  The expanded
+   code is a forward jump to a newly created label in case the
+   comparison is true.  The generated label rtx is returned.  */
+static rtx
+ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
+                                  bool swap_operands)
+{
+  enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
+  rtx label, tmp;
+
+  if (swap_operands)
+    {
+      tmp = op0;
+      op0 = op1;
+      op1 = tmp;
+    }
+
+  label = gen_label_rtx ();
+  tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+			  gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
+  tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
+  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
+  tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+  JUMP_LABEL (tmp) = label;
+
+  return label;
+}
+
+/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
+   using comparison code CODE.  Operands are swapped for the comparison if
+   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
+static rtx
+ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
+			      bool swap_operands)
+{
+  rtx (*insn)(rtx, rtx, rtx, rtx);
+  enum machine_mode mode = GET_MODE (op0);
+  rtx mask = gen_reg_rtx (mode);
+
+  if (swap_operands)
+    {
+      rtx tmp = op0;
+      op0 = op1;
+      op1 = tmp;
+    }
+
+  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
+
+  emit_insn (insn (mask, op0, op1,
+		   gen_rtx_fmt_ee (code, mode, op0, op1)));
+  return mask;
+}
+
+/* Generate and return a rtx of mode MODE for 2**n where n is the number
+   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
+static rtx
+ix86_gen_TWO52 (enum machine_mode mode)
+{
+  REAL_VALUE_TYPE TWO52r;
+  rtx TWO52;
+
+  real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
+  TWO52 = const_double_from_real_value (TWO52r, mode);
+  TWO52 = force_reg (mode, TWO52);
+
+  return TWO52;
+}
+
+/* Expand SSE sequence for computing lround from OP1 storing
+   into OP0.  */
+void
+ix86_expand_lround (rtx op0, rtx op1)
+{
+  /* C code for the stuff we're doing below:
+       tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
+       return (long)tmp;
+   */
+  enum machine_mode mode = GET_MODE (op1);
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+  rtx adj;
+
+  /* load nextafter (0.5, 0.0) */
+  fmt = REAL_MODE_FORMAT (mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+
+  /* adj = copysign (0.5, op1) */
+  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
+  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
+
+  /* adj = op1 + adj */
+  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* op0 = (imode)adj */
+  expand_fix (op0, adj, 0);
+}
+
+/* Expand SSE2 sequence for computing lround from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
+{
+  /* C code for the stuff we're doing below (for do_floor):
+	xi = (long)op1;
+        xi -= (double)xi > op1 ? 1 : 0;
+        return xi;
+   */
+  enum machine_mode fmode = GET_MODE (op1);
+  enum machine_mode imode = GET_MODE (op0);
+  rtx ireg, freg, label, tmp;
+
+  /* reg = (long)op1 */
+  ireg = gen_reg_rtx (imode);
+  expand_fix (ireg, op1, 0);
+
+  /* freg = (double)reg */
+  freg = gen_reg_rtx (fmode);
+  expand_float (freg, ireg, 0);
+
+  /* ireg = (freg > op1) ? ireg - 1 : ireg */
+  label = ix86_expand_sse_compare_and_jump (UNLE,
+					    freg, op1, !do_floor);
+  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
+			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
+  emit_move_insn (ireg, tmp);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (op0, ireg);
+}
+
+/* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
+   result in OPERAND0.  */
+void
+ix86_expand_rint (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we're doing below:
+	xa = fabs (operand1);
+        if (!isless (xa, 2**52))
+	  return operand1;
+        xa = xa + 2**52 - 2**52;
+        return copysign (xa, operand1);
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx res, xa, label, TWO52, mask;
+
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  TWO52 = ix86_gen_TWO52 (mode);
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+
+  ix86_sse_copysign_to_positive (res, xa, res, mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
+{
+  /* C code for the stuff we expand below.
+        double xa = fabs (x), x2;
+        if (!isless (xa, TWO52))
+          return x;
+        xa = xa + TWO52 - TWO52;
+        x2 = copysign (xa, x);
+     Compensate.  Floor:
+        if (x2 > x)
+          x2 -= 1;
+     Compensate.  Ceil:
+        if (x2 < x)
+          x2 -= -1;
+        return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, TWO52, tmp, label, one, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* xa = xa + TWO52 - TWO52; */
+  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+
+  /* xa = copysign (xa, operand1) */
+  ix86_sse_copysign_to_positive (xa, xa, res, mask);
+
+  /* generate 1.0 or -1.0 */
+  one = force_reg (mode,
+	           const_double_from_real_value (do_floor
+						 ? dconst1 : dconstm1, mode));
+
+  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  /* We always need to subtract here to preserve signed zero.  */
+  tmp = expand_simple_binop (mode, MINUS,
+			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+  emit_move_insn (res, tmp);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
+{
+  /* C code for the stuff we expand below.
+	double xa = fabs (x), x2;
+        if (!isless (xa, TWO52))
+          return x;
+	x2 = (double)(long)x;
+     Compensate.  Floor:
+	if (x2 > x)
+	  x2 -= 1;
+     Compensate.  Ceil:
+	if (x2 < x)
+	  x2 += 1;
+	if (HONOR_SIGNED_ZEROS (mode))
+	  return copysign (x2, x);
+	return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, xi, TWO52, tmp, label, one, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* xa = (double)(long)x */
+  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+  expand_fix (xi, res, 0);
+  expand_float (xa, xi, 0);
+
+  /* generate 1.0 */
+  one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+
+  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
+			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+  emit_move_insn (res, tmp);
+
+  if (HONOR_SIGNED_ZEROS (mode))
+    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing round from OPERAND1 storing
+   into OPERAND0.  Sequence that works without relying on DImode truncation
+   via cvttsd2siq that is only available on 64bit targets.  */
+void
+ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we expand below.
+        double xa = fabs (x), xa2, x2;
+        if (!isless (xa, TWO52))
+          return x;
+     Using the absolute value and copying back sign makes
+     -0.0 -> -0.0 correct.
+        xa2 = xa + TWO52 - TWO52;
+     Compensate.
+	dxa = xa2 - xa;
+        if (dxa <= -0.5)
+          xa2 += 1;
+        else if (dxa > 0.5)
+          xa2 -= 1;
+        x2 = copysign (xa2, x);
+        return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* xa2 = xa + TWO52 - TWO52; */
+  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
+
+  /* dxa = xa2 - xa; */
+  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* generate 0.5, 1.0 and -0.5 */
+  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
+  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
+  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
+			       0, OPTAB_DIRECT);
+
+  /* Compensate.  */
+  tmp = gen_reg_rtx (mode);
+  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* res = copysign (xa2, operand1) */
+  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing trunc from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_trunc (rtx operand0, rtx operand1)
+{
+  /* C code for SSE variant we expand below.
+        double xa = fabs (x), x2;
+        if (!isless (xa, TWO52))
+          return x;
+        x2 = (double)(long)x;
+	if (HONOR_SIGNED_ZEROS (mode))
+	  return copysign (x2, x);
+	return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, xi, TWO52, label, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* x = (double)(long)x */
+  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+  expand_fix (xi, res, 0);
+  expand_float (res, xi, 0);
+
+  if (HONOR_SIGNED_ZEROS (mode))
+    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing trunc from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
+{
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, mask, TWO52, label, one, res, smask, tmp;
+
+  /* C code for SSE variant we expand below.
+        double xa = fabs (x), x2;
+        if (!isless (xa, TWO52))
+          return x;
+        xa2 = xa + TWO52 - TWO52;
+     Compensate:
+        if (xa2 > xa)
+          xa2 -= 1.0;
+        x2 = copysign (xa2, x);
+        return x2;
+   */
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &smask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* res = xa + TWO52 - TWO52; */
+  tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+  tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
+  emit_move_insn (res, tmp);
+
+  /* generate 1.0 */
+  one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+
+  /* Compensate: res = xa2 - (res > xa ? 1 : 0)  */
+  mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
+  emit_insn (gen_rtx_SET (VOIDmode, mask,
+                          gen_rtx_AND (mode, mask, one)));
+  tmp = expand_simple_binop (mode, MINUS,
+			     res, mask, NULL_RTX, 0, OPTAB_DIRECT);
+  emit_move_insn (res, tmp);
+
+  /* res = copysign (res, operand1) */
+  ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing round from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_round (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we're doing below:
+        double xa = fabs (x);
+        if (!isless (xa, TWO52))
+          return x;
+        xa = (double)(long)(xa + nextafter (0.5, 0.0));
+        return copysign (xa, x);
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx res, TWO52, xa, label, xi, half, mask;
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  TWO52 = ix86_gen_TWO52 (mode);
+  xa = ix86_expand_sse_fabs (res, &mask);
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* load nextafter (0.5, 0.0) */
+  fmt = REAL_MODE_FORMAT (mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+
+  /* xa = xa + 0.5 */
+  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
+  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* xa = (double)(int64_t)xa */
+  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+  expand_fix (xi, xa, 0);
+  expand_float (xa, xi, 0);
+
+  /* res = copysign (xa, operand1) */
+  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing round
+   from OP1 storing into OP0 using sse4 round insn.  */
+void
+ix86_expand_round_sse4 (rtx op0, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx e1, e2, res, half;
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+  rtx (*gen_copysign) (rtx, rtx, rtx);
+  rtx (*gen_round) (rtx, rtx, rtx);
+
+  switch (mode)
+    {
+    case SFmode:
+      gen_copysign = gen_copysignsf3;
+      gen_round = gen_sse4_1_roundsf2;
+      break;
+    case DFmode:
+      gen_copysign = gen_copysigndf3;
+      gen_round = gen_sse4_1_rounddf2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* round (a) = trunc (a + copysign (0.5, a)) */
+
+  /* load nextafter (0.5, 0.0) */
+  fmt = REAL_MODE_FORMAT (mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+  half = const_double_from_real_value (pred_half, mode);
+
+  /* e1 = copysign (0.5, op1) */
+  e1 = gen_reg_rtx (mode);
+  emit_insn (gen_copysign (e1, half, op1));
+
+  /* e2 = op1 + e1 */
+  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* res = trunc (e2) */
+  res = gen_reg_rtx (mode);
+  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
+
+  emit_move_insn (op0, res);
+}
+
+
+/* Table of valid machine attributes.  */
+static const struct attribute_spec ix86_attribute_table[] =
+{
+  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
+       affects_type_identity } */
+  /* Stdcall attribute says callee is responsible for popping arguments
+     if they are not variable.  */
+  { "stdcall",   0, 0, false, true,  true,  ix86_handle_cconv_attribute,
+    true },
+  /* Fastcall attribute says callee is responsible for popping arguments
+     if they are not variable.  */
+  { "fastcall",  0, 0, false, true,  true,  ix86_handle_cconv_attribute,
+    true },
+  /* Thiscall attribute says callee is responsible for popping arguments
+     if they are not variable.  */
+  { "thiscall",  0, 0, false, true,  true,  ix86_handle_cconv_attribute,
+    true },
+  /* Cdecl attribute says the callee is a normal C declaration */
+  { "cdecl",     0, 0, false, true,  true,  ix86_handle_cconv_attribute,
+    true },
+  /* Regparm attribute specifies how many integer arguments are to be
+     passed in registers.  */
+  { "regparm",   1, 1, false, true,  true,  ix86_handle_cconv_attribute,
+    true },
+  /* Sseregparm attribute says we are using x86_64 calling conventions
+     for FP arguments.  */
+  { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
+    true },
+  /* The transactional memory builtins are implicitly regparm or fastcall
+     depending on the ABI.  Override the generic do-nothing attribute that
+     these builtins were declared with.  */
+  { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
+    true },
+  /* force_align_arg_pointer says this function realigns the stack at entry.  */
+  { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
+    false, true,  true, ix86_handle_cconv_attribute, false },
+#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
+  { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
+  { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
+  { "shared",    0, 0, true,  false, false, ix86_handle_shared_attribute,
+    false },
+#endif
+  { "ms_struct", 0, 0, false, false,  false, ix86_handle_struct_attribute,
+    false },
+  { "gcc_struct", 0, 0, false, false,  false, ix86_handle_struct_attribute,
+    false },
+#ifdef SUBTARGET_ATTRIBUTE_TABLE
+  SUBTARGET_ATTRIBUTE_TABLE,
+#endif
+  /* ms_abi and sysv_abi calling convention function attributes.  */
+  { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
+  { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
+  { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
+    false },
+  { "callee_pop_aggregate_return", 1, 1, false, true, true,
+    ix86_handle_callee_pop_aggregate_return, true },
+  /* End element.  */
+  { NULL,        0, 0, false, false, false, NULL, false }
+};
+
+/* Implement targetm.vectorize.builtin_vectorization_cost.  */
+static int
+ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+                                 tree vectype,
+                                 int misalign ATTRIBUTE_UNUSED)
+{
+  unsigned elements;
+
+  switch (type_of_cost)
+    {
+      case scalar_stmt:
+        return ix86_cost->scalar_stmt_cost;
+
+      case scalar_load:
+        return ix86_cost->scalar_load_cost;
+
+      case scalar_store:
+        return ix86_cost->scalar_store_cost;
+
+      case vector_stmt:
+        return ix86_cost->vec_stmt_cost;
+
+      case vector_load:
+        return ix86_cost->vec_align_load_cost;
+
+      case vector_store:
+        return ix86_cost->vec_store_cost;
+
+      case vec_to_scalar:
+        return ix86_cost->vec_to_scalar_cost;
+
+      case scalar_to_vec:
+        return ix86_cost->scalar_to_vec_cost;
+
+      case unaligned_load:
+      case unaligned_store:
+        return ix86_cost->vec_unalign_load_cost;
+
+      case cond_branch_taken:
+        return ix86_cost->cond_taken_branch_cost;
+
+      case cond_branch_not_taken:
+        return ix86_cost->cond_not_taken_branch_cost;
+
+      case vec_perm:
+      case vec_promote_demote:
+        return ix86_cost->vec_stmt_cost;
+
+      case vec_construct:
+	elements = TYPE_VECTOR_SUBPARTS (vectype);
+	return elements / 2 + 1;
+
+      default:
+        gcc_unreachable ();
+    }
+}
+
+/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
+   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
+   insn every time.  */
+
+static GTY(()) rtx vselect_insn;
+
+/* Initialize vselect_insn.  */
+
+static void
+init_vselect_insn (void)
+{
+  unsigned i;
+  rtx x;
+
+  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
+  for (i = 0; i < MAX_VECT_LEN; ++i)
+    XVECEXP (x, 0, i) = const0_rtx;
+  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
+							const0_rtx), x);
+  x = gen_rtx_SET (VOIDmode, const0_rtx, x);
+  start_sequence ();
+  vselect_insn = emit_insn (x);
+  end_sequence ();
+}
+
+/* Construct (set target (vec_select op0 (parallel perm))) and
+   return true if that's a valid instruction in the active ISA.  */
+
+static bool
+expand_vselect (rtx target, rtx op0, const unsigned char *perm,
+		unsigned nelt, bool testing_p)
+{
+  unsigned int i;
+  rtx x, save_vconcat;
+  int icode;
+
+  if (vselect_insn == NULL_RTX)
+    init_vselect_insn ();
+
+  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
+  PUT_NUM_ELEM (XVEC (x, 0), nelt);
+  for (i = 0; i < nelt; ++i)
+    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
+  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
+  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
+  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
+  SET_DEST (PATTERN (vselect_insn)) = target;
+  icode = recog_memoized (vselect_insn);
+
+  if (icode >= 0 && !testing_p)
+    emit_insn (copy_rtx (PATTERN (vselect_insn)));
+
+  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
+  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
+  INSN_CODE (vselect_insn) = -1;
+
+  return icode >= 0;
+}
+
+/* Similar, but generate a vec_concat from op0 and op1 as well.  */
+
+static bool
+expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
+			const unsigned char *perm, unsigned nelt,
+			bool testing_p)
+{
+  enum machine_mode v2mode;
+  rtx x;
+  bool ok;
+
+  if (vselect_insn == NULL_RTX)
+    init_vselect_insn ();
+
+  v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
+  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
+  PUT_MODE (x, v2mode);
+  XEXP (x, 0) = op0;
+  XEXP (x, 1) = op1;
+  ok = expand_vselect (target, x, perm, nelt, testing_p);
+  XEXP (x, 0) = const0_rtx;
+  XEXP (x, 1) = const0_rtx;
+  return ok;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
+
+static bool
+expand_vec_perm_blend (struct expand_vec_perm_d *d)
+{
+  enum machine_mode vmode = d->vmode;
+  unsigned i, mask, nelt = d->nelt;
+  rtx target, op0, op1, x;
+  rtx rperm[32], vperm;
+
+  if (d->one_operand_p)
+    return false;
+  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+    ;
+  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
+    ;
+  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+    ;
+  else
+    return false;
+
+  /* This is a blend, not a permute.  Elements must stay in their
+     respective lanes.  */
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned e = d->perm[i];
+      if (!(e == i || e == i + nelt))
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
+     decision should be extracted elsewhere, so that we only try that
+     sequence once all budget==3 options have been tried.  */
+  target = d->target;
+  op0 = d->op0;
+  op1 = d->op1;
+  mask = 0;
+
+  switch (vmode)
+    {
+    case V4DFmode:
+    case V8SFmode:
+    case V2DFmode:
+    case V4SFmode:
+    case V8HImode:
+    case V8SImode:
+      for (i = 0; i < nelt; ++i)
+	mask |= (d->perm[i] >= nelt) << i;
+      break;
+
+    case V2DImode:
+      for (i = 0; i < 2; ++i)
+	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
+      vmode = V8HImode;
+      goto do_subreg;
+
+    case V4SImode:
+      for (i = 0; i < 4; ++i)
+	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+      vmode = V8HImode;
+      goto do_subreg;
+
+    case V16QImode:
+      /* See if bytes move in pairs so we can use pblendw with
+	 an immediate argument, rather than pblendvb with a vector
+	 argument.  */
+      for (i = 0; i < 16; i += 2)
+	if (d->perm[i] + 1 != d->perm[i + 1])
+	  {
+	  use_pblendvb:
+	    for (i = 0; i < nelt; ++i)
+	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
+
+	  finish_pblendvb:
+	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+	    vperm = force_reg (vmode, vperm);
+
+	    if (GET_MODE_SIZE (vmode) == 16)
+	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
+	    else
+	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
+	    if (target != d->target)
+	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+	    return true;
+	  }
+
+      for (i = 0; i < 8; ++i)
+	mask |= (d->perm[i * 2] >= 16) << i;
+      vmode = V8HImode;
+      /* FALLTHRU */
+
+    do_subreg:
+      target = gen_reg_rtx (vmode);
+      op0 = gen_lowpart (vmode, op0);
+      op1 = gen_lowpart (vmode, op1);
+      break;
+
+    case V32QImode:
+      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
+      for (i = 0; i < 32; i += 2)
+	if (d->perm[i] + 1 != d->perm[i + 1])
+	  goto use_pblendvb;
+      /* See if bytes move in quadruplets.  If yes, vpblendd
+	 with immediate can be used.  */
+      for (i = 0; i < 32; i += 4)
+	if (d->perm[i] + 2 != d->perm[i + 2])
+	  break;
+      if (i < 32)
+	{
+	  /* See if bytes move the same in both lanes.  If yes,
+	     vpblendw with immediate can be used.  */
+	  for (i = 0; i < 16; i += 2)
+	    if (d->perm[i] + 16 != d->perm[i + 16])
+	      goto use_pblendvb;
+
+	  /* Use vpblendw.  */
+	  for (i = 0; i < 16; ++i)
+	    mask |= (d->perm[i * 2] >= 32) << i;
+	  vmode = V16HImode;
+	  goto do_subreg;
+	}
+
+      /* Use vpblendd.  */
+      for (i = 0; i < 8; ++i)
+	mask |= (d->perm[i * 4] >= 32) << i;
+      vmode = V8SImode;
+      goto do_subreg;
+
+    case V16HImode:
+      /* See if words move in pairs.  If yes, vpblendd can be used.  */
+      for (i = 0; i < 16; i += 2)
+	if (d->perm[i] + 1 != d->perm[i + 1])
+	  break;
+      if (i < 16)
+	{
+	  /* See if words move the same in both lanes.  If not,
+	     vpblendvb must be used.  */
+	  for (i = 0; i < 8; i++)
+	    if (d->perm[i] + 8 != d->perm[i + 8])
+	      {
+		/* Use vpblendvb.  */
+		for (i = 0; i < 32; ++i)
+		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
+
+		vmode = V32QImode;
+		nelt = 32;
+		target = gen_reg_rtx (vmode);
+		op0 = gen_lowpart (vmode, op0);
+		op1 = gen_lowpart (vmode, op1);
+		goto finish_pblendvb;
+	      }
+
+	  /* Use vpblendw.  */
+	  for (i = 0; i < 16; ++i)
+	    mask |= (d->perm[i] >= 16) << i;
+	  break;
+	}
+
+      /* Use vpblendd.  */
+      for (i = 0; i < 8; ++i)
+	mask |= (d->perm[i * 2] >= 16) << i;
+      vmode = V8SImode;
+      goto do_subreg;
+
+    case V4DImode:
+      /* Use vpblendd.  */
+      for (i = 0; i < 4; ++i)
+	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+      vmode = V8SImode;
+      goto do_subreg;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* This matches five different patterns with the different modes.  */
+  x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
+  x = gen_rtx_SET (VOIDmode, target, x);
+  emit_insn (x);
+  if (target != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   in terms of the variable form of vpermilps.
+
+   Note that we will have already failed the immediate input vpermilps,
+   which requires that the high and low part shuffle be identical; the
+   variable form doesn't require that.  */
+
+static bool
+expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
+{
+  rtx rperm[8], vperm;
+  unsigned i;
+
+  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
+    return false;
+
+  /* We can only permute within the 128-bit lane.  */
+  for (i = 0; i < 8; ++i)
+    {
+      unsigned e = d->perm[i];
+      if (i < 4 ? e >= 4 : e < 4)
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  for (i = 0; i < 8; ++i)
+    {
+      unsigned e = d->perm[i];
+
+      /* Within each 128-bit lane, the elements of op0 are numbered
+	 from 0 and the elements of op1 are numbered from 4.  */
+      if (e >= 8 + 4)
+	e -= 8;
+      else if (e >= 4)
+	e -= 4;
+
+      rperm[i] = GEN_INT (e);
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
+  vperm = force_reg (V8SImode, vperm);
+  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
+
+  return true;
+}
+
+/* Return true if permutation D can be performed as VMODE permutation
+   instead.  */
+
+static bool
+valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
+{
+  unsigned int i, j, chunk;
+
+  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
+      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
+      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
+    return false;
+
+  if (GET_MODE_NUNITS (vmode) >= d->nelt)
+    return true;
+
+  chunk = d->nelt / GET_MODE_NUNITS (vmode);
+  for (i = 0; i < d->nelt; i += chunk)
+    if (d->perm[i] & (chunk - 1))
+      return false;
+    else
+      for (j = 1; j < chunk; ++j)
+	if (d->perm[i] + j != d->perm[i + j])
+	  return false;
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
+
+static bool
+expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt, eltsz, mask;
+  unsigned char perm[32];
+  enum machine_mode vmode = V16QImode;
+  rtx rperm[32], vperm, target, op0, op1;
+
+  nelt = d->nelt;
+
+  if (!d->one_operand_p)
+    {
+      if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
+	{
+	  if (TARGET_AVX2
+	      && valid_perm_using_mode_p (V2TImode, d))
+	    {
+	      if (d->testing_p)
+		return true;
+
+	      /* Use vperm2i128 insn.  The pattern uses
+		 V4DImode instead of V2TImode.  */
+	      target = d->target;
+	      if (d->vmode != V4DImode)
+		target = gen_reg_rtx (V4DImode);
+	      op0 = gen_lowpart (V4DImode, d->op0);
+	      op1 = gen_lowpart (V4DImode, d->op1);
+	      rperm[0]
+		= GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
+			   || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
+	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
+	      if (target != d->target)
+		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+	      return true;
+	    }
+	  return false;
+	}
+    }
+  else
+    {
+      if (GET_MODE_SIZE (d->vmode) == 16)
+	{
+	  if (!TARGET_SSSE3)
+	    return false;
+	}
+      else if (GET_MODE_SIZE (d->vmode) == 32)
+	{
+	  if (!TARGET_AVX2)
+	    return false;
+
+	  /* V4DImode should be already handled through
+	     expand_vselect by vpermq instruction.  */
+	  gcc_assert (d->vmode != V4DImode);
+
+	  vmode = V32QImode;
+	  if (d->vmode == V8SImode
+	      || d->vmode == V16HImode
+	      || d->vmode == V32QImode)
+	    {
+	      /* First see if vpermq can be used for
+		 V8SImode/V16HImode/V32QImode.  */
+	      if (valid_perm_using_mode_p (V4DImode, d))
+		{
+		  for (i = 0; i < 4; i++)
+		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
+		  if (d->testing_p)
+		    return true;
+		  target = gen_reg_rtx (V4DImode);
+		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
+				      perm, 4, false))
+		    {
+		      emit_move_insn (d->target,
+				      gen_lowpart (d->vmode, target));
+		      return true;
+		    }
+		  return false;
+		}
+
+	      /* Next see if vpermd can be used.  */
+	      if (valid_perm_using_mode_p (V8SImode, d))
+		vmode = V8SImode;
+	    }
+	  /* Or if vpermps can be used.  */
+	  else if (d->vmode == V8SFmode)
+	    vmode = V8SImode;
+
+	  if (vmode == V32QImode)
+	    {
+	      /* vpshufb only works intra lanes, it is not
+		 possible to shuffle bytes in between the lanes.  */
+	      for (i = 0; i < nelt; ++i)
+		if ((d->perm[i] ^ i) & (nelt / 2))
+		  return false;
+	    }
+	}
+      else
+	return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  if (vmode == V8SImode)
+    for (i = 0; i < 8; ++i)
+      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
+  else
+    {
+      eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+      if (!d->one_operand_p)
+	mask = 2 * nelt - 1;
+      else if (vmode == V16QImode)
+	mask = nelt - 1;
+      else
+	mask = nelt / 2 - 1;
+
+      for (i = 0; i < nelt; ++i)
+	{
+	  unsigned j, e = d->perm[i] & mask;
+	  for (j = 0; j < eltsz; ++j)
+	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
+	}
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (vmode,
+				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
+  vperm = force_reg (vmode, vperm);
+
+  target = d->target;
+  if (d->vmode != vmode)
+    target = gen_reg_rtx (vmode);
+  op0 = gen_lowpart (vmode, d->op0);
+  if (d->one_operand_p)
+    {
+      if (vmode == V16QImode)
+	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+      else if (vmode == V32QImode)
+	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+      else if (vmode == V8SFmode)
+	emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
+      else
+	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
+    }
+  else
+    {
+      op1 = gen_lowpart (vmode, d->op1);
+      emit_insn (gen_xop_pperm (target, op0, op1, vperm));
+    }
+  if (target != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
+   in a single instruction.  */
+
+static bool
+expand_vec_perm_1 (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt = d->nelt;
+  unsigned char perm2[MAX_VECT_LEN];
+
+  /* Check plain VEC_SELECT first, because AVX has instructions that could
+     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
+     input where SEL+CONCAT may not.  */
+  if (d->one_operand_p)
+    {
+      int mask = nelt - 1;
+      bool identity_perm = true;
+      bool broadcast_perm = true;
+
+      for (i = 0; i < nelt; i++)
+	{
+	  perm2[i] = d->perm[i] & mask;
+	  if (perm2[i] != i)
+	    identity_perm = false;
+	  if (perm2[i])
+	    broadcast_perm = false;
+	}
+
+      if (identity_perm)
+	{
+	  if (!d->testing_p)
+	    emit_move_insn (d->target, d->op0);
+	  return true;
+	}
+      else if (broadcast_perm && TARGET_AVX2)
+	{
+	  /* Use vpbroadcast{b,w,d}.  */
+	  rtx (*gen) (rtx, rtx) = NULL;
+	  switch (d->vmode)
+	    {
+	    case V32QImode:
+	      gen = gen_avx2_pbroadcastv32qi_1;
+	      break;
+	    case V16HImode:
+	      gen = gen_avx2_pbroadcastv16hi_1;
+	      break;
+	    case V8SImode:
+	      gen = gen_avx2_pbroadcastv8si_1;
+	      break;
+	    case V16QImode:
+	      gen = gen_avx2_pbroadcastv16qi;
+	      break;
+	    case V8HImode:
+	      gen = gen_avx2_pbroadcastv8hi;
+	      break;
+	    case V8SFmode:
+	      gen = gen_avx2_vec_dupv8sf_1;
+	      break;
+	    /* For other modes prefer other shuffles this function creates.  */
+	    default: break;
+	    }
+	  if (gen != NULL)
+	    {
+	      if (!d->testing_p)
+		emit_insn (gen (d->target, d->op0));
+	      return true;
+	    }
+	}
+
+      if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
+	return true;
+
+      /* There are plenty of patterns in sse.md that are written for
+	 SEL+CONCAT and are not replicated for a single op.  Perhaps
+	 that should be changed, to avoid the nastiness here.  */
+
+      /* Recognize interleave style patterns, which means incrementing
+	 every other permutation operand.  */
+      for (i = 0; i < nelt; i += 2)
+	{
+	  perm2[i] = d->perm[i] & mask;
+	  perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
+	}
+      if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
+				  d->testing_p))
+	return true;
+
+      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
+      if (nelt >= 4)
+	{
+	  for (i = 0; i < nelt; i += 4)
+	    {
+	      perm2[i + 0] = d->perm[i + 0] & mask;
+	      perm2[i + 1] = d->perm[i + 1] & mask;
+	      perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
+	      perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
+	    }
+
+	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
+				      d->testing_p))
+	    return true;
+	}
+    }
+
+  /* Finally, try the fully general two operand permute.  */
+  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
+			      d->testing_p))
+    return true;
+
+  /* Recognize interleave style patterns with reversed operands.  */
+  if (!d->one_operand_p)
+    {
+      for (i = 0; i < nelt; ++i)
+	{
+	  unsigned e = d->perm[i];
+	  if (e >= nelt)
+	    e -= nelt;
+	  else
+	    e += nelt;
+	  perm2[i] = e;
+	}
+
+      if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
+				  d->testing_p))
+	return true;
+    }
+
+  /* Try the SSE4.1 blend variable merge instructions.  */
+  if (expand_vec_perm_blend (d))
+    return true;
+
+  /* Try one of the AVX vpermil variable permutations.  */
+  if (expand_vec_perm_vpermil (d))
+    return true;
+
+  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
+     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
+  if (expand_vec_perm_pshufb (d))
+    return true;
+
+  /* Try the AVX512F vpermi2 instructions.  */
+  rtx vec[64];
+  enum machine_mode mode = d->vmode;
+  if (mode == V8DFmode)
+    mode = V8DImode;
+  else if (mode == V16SFmode)
+    mode = V16SImode;
+  for (i = 0; i < nelt; ++i)
+    vec[i] = GEN_INT (d->perm[i]);
+  rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
+  if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1))
+    return true;
+
+  return false;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   in terms of a pair of pshuflw + pshufhw instructions.  */
+
+static bool
+expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
+{
+  unsigned char perm2[MAX_VECT_LEN];
+  unsigned i;
+  bool ok;
+
+  if (d->vmode != V8HImode || !d->one_operand_p)
+    return false;
+
+  /* The two permutations only operate in 64-bit lanes.  */
+  for (i = 0; i < 4; ++i)
+    if (d->perm[i] >= 4)
+      return false;
+  for (i = 4; i < 8; ++i)
+    if (d->perm[i] < 4)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Emit the pshuflw.  */
+  memcpy (perm2, d->perm, 4);
+  for (i = 4; i < 8; ++i)
+    perm2[i] = i;
+  ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
+  gcc_assert (ok);
+
+  /* Emit the pshufhw.  */
+  memcpy (perm2 + 4, d->perm + 4, 4);
+  for (i = 0; i < 4; ++i)
+    perm2[i] = i;
+  ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
+  gcc_assert (ok);
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   the permutation using the SSSE3 palignr instruction.  This succeeds
+   when all of the elements in PERM fit within one vector and we merely
+   need to shift them down so that a single vector permutation has a
+   chance to succeed.  */
+
+static bool
+expand_vec_perm_palignr (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt = d->nelt;
+  unsigned min, max;
+  bool in_order, ok;
+  rtx shift, target;
+  struct expand_vec_perm_d dcopy;
+
+  /* Even with AVX, palignr only operates on 128-bit vectors.  */
+  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+    return false;
+
+  min = nelt, max = 0;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned e = d->perm[i];
+      if (e < min)
+	min = e;
+      if (e > max)
+	max = e;
+    }
+  if (min == 0 || max - min >= nelt)
+    return false;
+
+  /* Given that we have SSSE3, we know we'll be able to implement the
+     single operand permutation after the palignr with pshufb.  */
+  if (d->testing_p)
+    return true;
+
+  dcopy = *d;
+  shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+  target = gen_reg_rtx (TImode);
+  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+				  gen_lowpart (TImode, d->op0), shift));
+
+  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
+  dcopy.one_operand_p = true;
+
+  in_order = true;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned e = dcopy.perm[i] - min;
+      if (e != i)
+	in_order = false;
+      dcopy.perm[i] = e;
+    }
+
+  /* Test for the degenerate case where the alignment by itself
+     produces the desired permutation.  */
+  if (in_order)
+    {
+      emit_move_insn (d->target, dcopy.op0);
+      return true;
+    }
+
+  ok = expand_vec_perm_1 (&dcopy);
+  gcc_assert (ok);
+
+  return ok;
+}
+
+static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   a two vector permutation into a single vector permutation by using
+   an interleave operation to merge the vectors.  */
+
+static bool
+expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dremap, dfinal;
+  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
+  unsigned HOST_WIDE_INT contents;
+  unsigned char remap[2 * MAX_VECT_LEN];
+  rtx seq;
+  bool ok, same_halves = false;
+
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (d->one_operand_p)
+	return false;
+    }
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX)
+	return false;
+      /* For 32-byte modes allow even d->one_operand_p.
+	 The lack of cross-lane shuffling in some instructions
+	 might prevent a single insn shuffle.  */
+      dfinal = *d;
+      dfinal.testing_p = true;
+      /* If expand_vec_perm_interleave3 can expand this into
+	 a 3 insn sequence, give up and let it be expanded as
+	 3 insn sequence.  While that is one insn longer,
+	 it doesn't need a memory operand and in the common
+	 case that both interleave low and high permutations
+	 with the same operands are adjacent needs 4 insns
+	 for both after CSE.  */
+      if (expand_vec_perm_interleave3 (&dfinal))
+	return false;
+    }
+  else
+    return false;
+
+  /* Examine from whence the elements come.  */
+  contents = 0;
+  for (i = 0; i < nelt; ++i)
+    contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
+
+  memset (remap, 0xff, sizeof (remap));
+  dremap = *d;
+
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+      /* Split the two input vectors into 4 halves.  */
+      h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
+      h2 = h1 << nelt2;
+      h3 = h2 << nelt2;
+      h4 = h3 << nelt2;
+
+      /* If the elements from the low halves use interleave low, and similarly
+	 for interleave high.  If the elements are from mis-matched halves, we
+	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
+      if ((contents & (h1 | h3)) == contents)
+	{
+	  /* punpckl* */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i] = i * 2;
+	      remap[i + nelt] = i * 2 + 1;
+	      dremap.perm[i * 2] = i;
+	      dremap.perm[i * 2 + 1] = i + nelt;
+	    }
+	  if (!TARGET_SSE2 && d->vmode == V4SImode)
+	    dremap.vmode = V4SFmode;
+	}
+      else if ((contents & (h2 | h4)) == contents)
+	{
+	  /* punpckh* */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i + nelt2] = i * 2;
+	      remap[i + nelt + nelt2] = i * 2 + 1;
+	      dremap.perm[i * 2] = i + nelt2;
+	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+	    }
+	  if (!TARGET_SSE2 && d->vmode == V4SImode)
+	    dremap.vmode = V4SFmode;
+	}
+      else if ((contents & (h1 | h4)) == contents)
+	{
+	  /* shufps */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i] = i;
+	      remap[i + nelt + nelt2] = i + nelt2;
+	      dremap.perm[i] = i;
+	      dremap.perm[i + nelt2] = i + nelt + nelt2;
+	    }
+	  if (nelt != 4)
+	    {
+	      /* shufpd */
+	      dremap.vmode = V2DImode;
+	      dremap.nelt = 2;
+	      dremap.perm[0] = 0;
+	      dremap.perm[1] = 3;
+	    }
+	}
+      else if ((contents & (h2 | h3)) == contents)
+	{
+	  /* shufps */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i + nelt2] = i;
+	      remap[i + nelt] = i + nelt2;
+	      dremap.perm[i] = i + nelt2;
+	      dremap.perm[i + nelt2] = i + nelt;
+	    }
+	  if (nelt != 4)
+	    {
+	      /* shufpd */
+	      dremap.vmode = V2DImode;
+	      dremap.nelt = 2;
+	      dremap.perm[0] = 1;
+	      dremap.perm[1] = 2;
+	    }
+	}
+      else
+	return false;
+    }
+  else
+    {
+      unsigned int nelt4 = nelt / 4, nzcnt = 0;
+      unsigned HOST_WIDE_INT q[8];
+      unsigned int nonzero_halves[4];
+
+      /* Split the two input vectors into 8 quarters.  */
+      q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
+      for (i = 1; i < 8; ++i)
+	q[i] = q[0] << (nelt4 * i);
+      for (i = 0; i < 4; ++i)
+	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
+	  {
+	    nonzero_halves[nzcnt] = i;
+	    ++nzcnt;
+	  }
+
+      if (nzcnt == 1)
+	{
+	  gcc_assert (d->one_operand_p);
+	  nonzero_halves[1] = nonzero_halves[0];
+	  same_halves = true;
+	}
+      else if (d->one_operand_p)
+	{
+	  gcc_assert (nonzero_halves[0] == 0);
+	  gcc_assert (nonzero_halves[1] == 1);
+	}
+
+      if (nzcnt <= 2)
+	{
+	  if (d->perm[0] / nelt2 == nonzero_halves[1])
+	    {
+	      /* Attempt to increase the likelihood that dfinal
+		 shuffle will be intra-lane.  */
+	      char tmph = nonzero_halves[0];
+	      nonzero_halves[0] = nonzero_halves[1];
+	      nonzero_halves[1] = tmph;
+	    }
+
+	  /* vperm2f128 or vperm2i128.  */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
+	      remap[i + nonzero_halves[0] * nelt2] = i;
+	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
+	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
+	    }
+
+	  if (d->vmode != V8SFmode
+	      && d->vmode != V4DFmode
+	      && d->vmode != V8SImode)
+	    {
+	      dremap.vmode = V8SImode;
+	      dremap.nelt = 8;
+	      for (i = 0; i < 4; ++i)
+		{
+		  dremap.perm[i] = i + nonzero_halves[0] * 4;
+		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
+		}
+	    }
+	}
+      else if (d->one_operand_p)
+	return false;
+      else if (TARGET_AVX2
+	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
+	{
+	  /* vpunpckl* */
+	  for (i = 0; i < nelt4; ++i)
+	    {
+	      remap[i] = i * 2;
+	      remap[i + nelt] = i * 2 + 1;
+	      remap[i + nelt2] = i * 2 + nelt2;
+	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
+	      dremap.perm[i * 2] = i;
+	      dremap.perm[i * 2 + 1] = i + nelt;
+	      dremap.perm[i * 2 + nelt2] = i + nelt2;
+	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
+	    }
+	}
+      else if (TARGET_AVX2
+	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
+	{
+	  /* vpunpckh* */
+	  for (i = 0; i < nelt4; ++i)
+	    {
+	      remap[i + nelt4] = i * 2;
+	      remap[i + nelt + nelt4] = i * 2 + 1;
+	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
+	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
+	      dremap.perm[i * 2] = i + nelt4;
+	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
+	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
+	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
+	    }
+	}
+      else
+	return false;
+    }
+
+  /* Use the remapping array set up above to move the elements from their
+     swizzled locations into their final destinations.  */
+  dfinal = *d;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned e = remap[d->perm[i]];
+      gcc_assert (e < nelt);
+      /* If same_halves is true, both halves of the remapped vector are the
+	 same.  Avoid cross-lane accesses if possible.  */
+      if (same_halves && i >= nelt2)
+	{
+	  gcc_assert (e < nelt2);
+	  dfinal.perm[i] = e + nelt2;
+	}
+      else
+	dfinal.perm[i] = e;
+    }
+  if (!d->testing_p)
+    {
+      dremap.target = gen_reg_rtx (dremap.vmode);
+      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
+    }
+  dfinal.op1 = dfinal.op0;
+  dfinal.one_operand_p = true;
+
+  /* Test if the final remap can be done with a single insn.  For V4SFmode or
+     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
+  start_sequence ();
+  ok = expand_vec_perm_1 (&dfinal);
+  seq = get_insns ();
+  end_sequence ();
+
+  if (!ok)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  if (dremap.vmode != dfinal.vmode)
+    {
+      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
+      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
+    }
+
+  ok = expand_vec_perm_1 (&dremap);
+  gcc_assert (ok);
+
+  emit_insn (seq);
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   a single vector cross-lane permutation into vpermq followed
+   by any of the single insn permutations.  */
+
+static bool
+expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dremap, dfinal;
+  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
+  unsigned contents[2];
+  bool ok;
+
+  if (!(TARGET_AVX2
+	&& (d->vmode == V32QImode || d->vmode == V16HImode)
+	&& d->one_operand_p))
+    return false;
+
+  contents[0] = 0;
+  contents[1] = 0;
+  for (i = 0; i < nelt2; ++i)
+    {
+      contents[0] |= 1u << (d->perm[i] / nelt4);
+      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      unsigned int cnt = 0;
+      for (j = 0; j < 4; ++j)
+	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
+	  return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  dremap = *d;
+  dremap.vmode = V4DImode;
+  dremap.nelt = 4;
+  dremap.target = gen_reg_rtx (V4DImode);
+  dremap.op0 = gen_lowpart (V4DImode, d->op0);
+  dremap.op1 = dremap.op0;
+  dremap.one_operand_p = true;
+  for (i = 0; i < 2; ++i)
+    {
+      unsigned int cnt = 0;
+      for (j = 0; j < 4; ++j)
+	if ((contents[i] & (1u << j)) != 0)
+	  dremap.perm[2 * i + cnt++] = j;
+      for (; cnt < 2; ++cnt)
+	dremap.perm[2 * i + cnt] = 0;
+    }
+
+  dfinal = *d;
+  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
+  dfinal.op1 = dfinal.op0;
+  dfinal.one_operand_p = true;
+  for (i = 0, j = 0; i < nelt; ++i)
+    {
+      if (i == nelt2)
+	j = 2;
+      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
+      if ((d->perm[i] / nelt4) == dremap.perm[j])
+	;
+      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
+	dfinal.perm[i] |= nelt4;
+      else
+	gcc_unreachable ();
+    }
+
+  ok = expand_vec_perm_1 (&dremap);
+  gcc_assert (ok);
+
+  ok = expand_vec_perm_1 (&dfinal);
+  gcc_assert (ok);
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to expand
+   a vector permutation using two instructions, vperm2f128 resp.
+   vperm2i128 followed by any single in-lane permutation.  */
+
+static bool
+expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dfirst, dsecond;
+  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
+  bool ok;
+
+  if (!TARGET_AVX
+      || GET_MODE_SIZE (d->vmode) != 32
+      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
+    return false;
+
+  dsecond = *d;
+  dsecond.one_operand_p = false;
+  dsecond.testing_p = true;
+
+  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
+     immediate.  For perm < 16 the second permutation uses
+     d->op0 as first operand, for perm >= 16 it uses d->op1
+     as first operand.  The second operand is the result of
+     vperm2[fi]128.  */
+  for (perm = 0; perm < 32; perm++)
+    {
+      /* Ignore permutations which do not move anything cross-lane.  */
+      if (perm < 16)
+	{
+	  /* The second shuffle for e.g. V4DFmode has
+	     0123 and ABCD operands.
+	     Ignore AB23, as 23 is already in the second lane
+	     of the first operand.  */
+	  if ((perm & 0xc) == (1 << 2)) continue;
+	  /* And 01CD, as 01 is in the first lane of the first
+	     operand.  */
+	  if ((perm & 3) == 0) continue;
+	  /* And 4567, as then the vperm2[fi]128 doesn't change
+	     anything on the original 4567 second operand.  */
+	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
+	}
+      else
+	{
+	  /* The second shuffle for e.g. V4DFmode has
+	     4567 and ABCD operands.
+	     Ignore AB67, as 67 is already in the second lane
+	     of the first operand.  */
+	  if ((perm & 0xc) == (3 << 2)) continue;
+	  /* And 45CD, as 45 is in the first lane of the first
+	     operand.  */
+	  if ((perm & 3) == 2) continue;
+	  /* And 0123, as then the vperm2[fi]128 doesn't change
+	     anything on the original 0123 first operand.  */
+	  if ((perm & 0xf) == (1 << 2)) continue;
+	}
+
+      for (i = 0; i < nelt; i++)
+	{
+	  j = d->perm[i] / nelt2;
+	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
+	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
+	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
+	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
+	  else
+	    break;
+	}
+
+      if (i == nelt)
+	{
+	  start_sequence ();
+	  ok = expand_vec_perm_1 (&dsecond);
+	  end_sequence ();
+	}
+      else
+	ok = false;
+
+      if (ok)
+	{
+	  if (d->testing_p)
+	    return true;
+
+	  /* Found a usable second shuffle.  dfirst will be
+	     vperm2f128 on d->op0 and d->op1.  */
+	  dsecond.testing_p = false;
+	  dfirst = *d;
+	  dfirst.target = gen_reg_rtx (d->vmode);
+	  for (i = 0; i < nelt; i++)
+	    dfirst.perm[i] = (i & (nelt2 - 1))
+			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
+
+	  ok = expand_vec_perm_1 (&dfirst);
+	  gcc_assert (ok);
+
+	  /* And dsecond is some single insn shuffle, taking
+	     d->op0 and result of vperm2f128 (if perm < 16) or
+	     d->op1 and result of vperm2f128 (otherwise).  */
+	  dsecond.op1 = dfirst.target;
+	  if (perm >= 16)
+	    dsecond.op0 = dfirst.op1;
+
+	  ok = expand_vec_perm_1 (&dsecond);
+	  gcc_assert (ok);
+
+	  return true;
+	}
+
+      /* For one operand, the only useful vperm2f128 permutation is 0x10.  */
+      if (d->one_operand_p)
+	return false;
+    }
+
+  return false;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   a two vector permutation using 2 intra-lane interleave insns
+   and cross-lane shuffle for 32-byte vectors.  */
+
+static bool
+expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt;
+  rtx (*gen) (rtx, rtx, rtx);
+
+  if (d->one_operand_p)
+    return false;
+  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
+    ;
+  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
+    ;
+  else
+    return false;
+
+  nelt = d->nelt;
+  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
+    return false;
+  for (i = 0; i < nelt; i += 2)
+    if (d->perm[i] != d->perm[0] + i / 2
+	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V32QImode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv32qi;
+      else
+	gen = gen_vec_interleave_lowv32qi;
+      break;
+    case V16HImode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv16hi;
+      else
+	gen = gen_vec_interleave_lowv16hi;
+      break;
+    case V8SImode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv8si;
+      else
+	gen = gen_vec_interleave_lowv8si;
+      break;
+    case V4DImode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv4di;
+      else
+	gen = gen_vec_interleave_lowv4di;
+      break;
+    case V8SFmode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv8sf;
+      else
+	gen = gen_vec_interleave_lowv8sf;
+      break;
+    case V4DFmode:
+      if (d->perm[0])
+	gen = gen_vec_interleave_highv4df;
+      else
+	gen = gen_vec_interleave_lowv4df;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (gen (d->target, d->op0, d->op1));
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement
+   a single vector permutation using a single intra-lane vector
+   permutation, vperm2f128 swapping the lanes and vblend* insn blending
+   the non-swapped and swapped vectors together.  */
+
+static bool
+expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dfirst, dsecond;
+  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
+  rtx seq;
+  bool ok;
+  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
+
+  if (!TARGET_AVX
+      || TARGET_AVX2
+      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
+      || !d->one_operand_p)
+    return false;
+
+  dfirst = *d;
+  for (i = 0; i < nelt; i++)
+    dfirst.perm[i] = 0xff;
+  for (i = 0, msk = 0; i < nelt; i++)
+    {
+      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
+      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
+	return false;
+      dfirst.perm[j] = d->perm[i];
+      if (j != i)
+	msk |= (1 << i);
+    }
+  for (i = 0; i < nelt; i++)
+    if (dfirst.perm[i] == 0xff)
+      dfirst.perm[i] = i;
+
+  if (!d->testing_p)
+    dfirst.target = gen_reg_rtx (dfirst.vmode);
+
+  start_sequence ();
+  ok = expand_vec_perm_1 (&dfirst);
+  seq = get_insns ();
+  end_sequence ();
+
+  if (!ok)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  emit_insn (seq);
+
+  dsecond = *d;
+  dsecond.op0 = dfirst.target;
+  dsecond.op1 = dfirst.target;
+  dsecond.one_operand_p = true;
+  dsecond.target = gen_reg_rtx (dsecond.vmode);
+  for (i = 0; i < nelt; i++)
+    dsecond.perm[i] = i ^ nelt2;
+
+  ok = expand_vec_perm_1 (&dsecond);
+  gcc_assert (ok);
+
+  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
+  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement a V4DF
+   permutation using two vperm2f128, followed by a vshufpd insn blending
+   the two vectors together.  */
+
+static bool
+expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dfirst, dsecond, dthird;
+  bool ok;
+
+  if (!TARGET_AVX || (d->vmode != V4DFmode))
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  dfirst = *d;
+  dsecond = *d;
+  dthird = *d;
+
+  dfirst.perm[0] = (d->perm[0] & ~1);
+  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
+  dfirst.perm[2] = (d->perm[2] & ~1);
+  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
+  dsecond.perm[0] = (d->perm[1] & ~1);
+  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
+  dsecond.perm[2] = (d->perm[3] & ~1);
+  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
+  dthird.perm[0] = (d->perm[0] % 2);
+  dthird.perm[1] = (d->perm[1] % 2) + 4;
+  dthird.perm[2] = (d->perm[2] % 2) + 2;
+  dthird.perm[3] = (d->perm[3] % 2) + 6;
+
+  dfirst.target = gen_reg_rtx (dfirst.vmode);
+  dsecond.target = gen_reg_rtx (dsecond.vmode);
+  dthird.op0 = dfirst.target;
+  dthird.op1 = dsecond.target;
+  dthird.one_operand_p = false;
+
+  canonicalize_perm (&dfirst);
+  canonicalize_perm (&dsecond);
+
+  ok = expand_vec_perm_1 (&dfirst)
+       && expand_vec_perm_1 (&dsecond)
+       && expand_vec_perm_1 (&dthird);
+
+  gcc_assert (ok);
+
+  return true;
+}
+
+/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
+   permutation with two pshufb insns and an ior.  We should have already
+   failed all two instruction sequences.  */
+
+static bool
+expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
+{
+  rtx rperm[2][16], vperm, l, h, op, m128;
+  unsigned int i, nelt, eltsz;
+
+  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+    return false;
+  gcc_assert (!d->one_operand_p);
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate two permutation masks.  If the required element is within
+     the given vector it is shuffled into the proper lane.  If the required
+     element is in the other vector, force a zero into the lane by setting
+     bit 7 in the permutation mask.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i];
+      unsigned which = (e >= nelt);
+      if (e >= nelt)
+	e -= nelt;
+
+      for (j = 0; j < eltsz; ++j)
+	{
+	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
+	  rperm[1-which][i*eltsz + j] = m128;
+	}
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
+  vperm = force_reg (V16QImode, vperm);
+
+  l = gen_reg_rtx (V16QImode);
+  op = gen_lowpart (V16QImode, d->op0);
+  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
+
+  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
+  vperm = force_reg (V16QImode, vperm);
+
+  h = gen_reg_rtx (V16QImode);
+  op = gen_lowpart (V16QImode, d->op1);
+  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
+
+  op = d->target;
+  if (d->vmode != V16QImode)
+    op = gen_reg_rtx (V16QImode);
+  emit_insn (gen_iorv16qi3 (op, l, h));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
+
+  return true;
+}
+
+/* Implement arbitrary permutation of one V32QImode and V16QImode operand
+   with two vpshufb insns, vpermq and vpor.  We should have already failed
+   all two or three instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
+{
+  rtx rperm[2][32], vperm, l, h, hp, op, m128;
+  unsigned int i, nelt, eltsz;
+
+  if (!TARGET_AVX2
+      || !d->one_operand_p
+      || (d->vmode != V32QImode && d->vmode != V16HImode))
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate two permutation masks.  If the required element is within
+     the same lane, it is shuffled in.  If the required element from the
+     other lane, force a zero by setting bit 7 in the permutation mask.
+     In the other mask the mask has non-negative elements if element
+     is requested from the other lane, but also moved to the other lane,
+     so that the result of vpshufb can have the two V2TImode halves
+     swapped.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+
+      for (j = 0; j < eltsz; ++j)
+	{
+	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
+	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
+	}
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
+  vperm = force_reg (V32QImode, vperm);
+
+  h = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op0);
+  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+
+  /* Swap the 128-byte lanes of h into hp.  */
+  hp = gen_reg_rtx (V4DImode);
+  op = gen_lowpart (V4DImode, h);
+  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
+				  const1_rtx));
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
+  vperm = force_reg (V32QImode, vperm);
+
+  l = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op0);
+  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+
+  op = d->target;
+  if (d->vmode != V32QImode)
+    op = gen_reg_rtx (V32QImode);
+  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
+
+  return true;
+}
+
+/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
+   and extract-odd permutations of two V32QImode and V16QImode operand
+   with two vpshufb insns, vpor and vpermq.  We should have already
+   failed all two or three instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
+{
+  rtx rperm[2][32], vperm, l, h, ior, op, m128;
+  unsigned int i, nelt, eltsz;
+
+  if (!TARGET_AVX2
+      || d->one_operand_p
+      || (d->vmode != V32QImode && d->vmode != V16HImode))
+    return false;
+
+  for (i = 0; i < d->nelt; ++i)
+    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate two permutation masks.  In the first permutation mask
+     the first quarter will contain indexes for the first half
+     of the op0, the second quarter will contain bit 7 set, third quarter
+     will contain indexes for the second half of the op0 and the
+     last quarter bit 7 set.  In the second permutation mask
+     the first quarter will contain bit 7 set, the second quarter
+     indexes for the first half of the op1, the third quarter bit 7 set
+     and last quarter indexes for the second half of the op1.
+     I.e. the first mask e.g. for V32QImode extract even will be:
+     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
+     (all values masked with 0xf except for -128) and second mask
+     for extract even will be
+     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+      unsigned which = d->perm[i] >= nelt;
+      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
+
+      for (j = 0; j < eltsz; ++j)
+	{
+	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
+	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
+	}
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
+  vperm = force_reg (V32QImode, vperm);
+
+  l = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op0);
+  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
+  vperm = force_reg (V32QImode, vperm);
+
+  h = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op1);
+  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+
+  ior = gen_reg_rtx (V32QImode);
+  emit_insn (gen_iorv32qi3 (ior, l, h));
+
+  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
+  op = gen_reg_rtx (V4DImode);
+  ior = gen_lowpart (V4DImode, ior);
+  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
+				  const1_rtx, GEN_INT (3)));
+  emit_move_insn (d->target, gen_lowpart (d->vmode, op));
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
+   and extract-odd permutations.  */
+
+static bool
+expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
+{
+  rtx t1, t2, t3, t4, t5;
+
+  switch (d->vmode)
+    {
+    case V4DFmode:
+      if (d->testing_p)
+	break;
+      t1 = gen_reg_rtx (V4DFmode);
+      t2 = gen_reg_rtx (V4DFmode);
+
+      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
+      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
+      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
+
+      /* Now an unpck[lh]pd will produce the result required.  */
+      if (odd)
+	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
+      else
+	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
+      emit_insn (t3);
+      break;
+
+    case V8SFmode:
+      {
+	int mask = odd ? 0xdd : 0x88;
+
+	if (d->testing_p)
+	  break;
+	t1 = gen_reg_rtx (V8SFmode);
+	t2 = gen_reg_rtx (V8SFmode);
+	t3 = gen_reg_rtx (V8SFmode);
+
+	/* Shuffle within the 128-bit lanes to produce:
+	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
+	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
+				      GEN_INT (mask)));
+
+	/* Shuffle the lanes around to produce:
+	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
+	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
+					    GEN_INT (0x3)));
+
+	/* Shuffle within the 128-bit lanes to produce:
+	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
+	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
+
+	/* Shuffle within the 128-bit lanes to produce:
+	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
+	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
+
+	/* Shuffle the lanes around to produce:
+	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
+	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
+					    GEN_INT (0x20)));
+      }
+      break;
+
+    case V2DFmode:
+    case V4SFmode:
+    case V2DImode:
+    case V4SImode:
+      /* These are always directly implementable by expand_vec_perm_1.  */
+      gcc_unreachable ();
+
+    case V8HImode:
+      if (TARGET_SSSE3)
+	return expand_vec_perm_pshufb2 (d);
+      else
+	{
+	  if (d->testing_p)
+	    break;
+	  /* We need 2*log2(N)-1 operations to achieve odd/even
+	     with interleave. */
+	  t1 = gen_reg_rtx (V8HImode);
+	  t2 = gen_reg_rtx (V8HImode);
+	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
+	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
+	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
+	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
+	  if (odd)
+	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
+	  else
+	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
+	  emit_insn (t3);
+	}
+      break;
+
+    case V16QImode:
+      if (TARGET_SSSE3)
+	return expand_vec_perm_pshufb2 (d);
+      else
+	{
+	  if (d->testing_p)
+	    break;
+	  t1 = gen_reg_rtx (V16QImode);
+	  t2 = gen_reg_rtx (V16QImode);
+	  t3 = gen_reg_rtx (V16QImode);
+	  emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
+	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
+	  emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
+	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
+	  emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
+	  emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
+	  if (odd)
+	    t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
+	  else
+	    t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
+	  emit_insn (t3);
+	}
+      break;
+
+    case V16HImode:
+    case V32QImode:
+      return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+
+    case V4DImode:
+      if (!TARGET_AVX2)
+	{
+	  struct expand_vec_perm_d d_copy = *d;
+	  d_copy.vmode = V4DFmode;
+	  if (d->testing_p)
+	    d_copy.target = gen_lowpart (V4DFmode, d->target);
+	  else
+	    d_copy.target = gen_reg_rtx (V4DFmode);
+	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
+	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
+	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+	    {
+	      if (!d->testing_p)
+		emit_move_insn (d->target,
+				gen_lowpart (V4DImode, d_copy.target));
+	      return true;
+	    }
+	  return false;
+	}
+
+      if (d->testing_p)
+	break;
+
+      t1 = gen_reg_rtx (V4DImode);
+      t2 = gen_reg_rtx (V4DImode);
+
+      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
+      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
+      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
+
+      /* Now an vpunpck[lh]qdq will produce the result required.  */
+      if (odd)
+	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
+      else
+	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
+      emit_insn (t3);
+      break;
+
+    case V8SImode:
+      if (!TARGET_AVX2)
+	{
+	  struct expand_vec_perm_d d_copy = *d;
+	  d_copy.vmode = V8SFmode;
+	  if (d->testing_p)
+	    d_copy.target = gen_lowpart (V8SFmode, d->target);
+	  else
+	    d_copy.target = gen_reg_rtx (V8SFmode);
+	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
+	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
+	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+	    {
+	      if (!d->testing_p)
+		emit_move_insn (d->target,
+				gen_lowpart (V8SImode, d_copy.target));
+	      return true;
+	    }
+	  return false;
+	}
+
+      if (d->testing_p)
+	break;
+
+      t1 = gen_reg_rtx (V8SImode);
+      t2 = gen_reg_rtx (V8SImode);
+      t3 = gen_reg_rtx (V4DImode);
+      t4 = gen_reg_rtx (V4DImode);
+      t5 = gen_reg_rtx (V4DImode);
+
+      /* Shuffle the lanes around into
+	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
+      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
+				    gen_lowpart (V4DImode, d->op1),
+				    GEN_INT (0x20)));
+      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
+				    gen_lowpart (V4DImode, d->op1),
+				    GEN_INT (0x31)));
+
+      /* Swap the 2nd and 3rd position in each lane into
+	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
+      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
+				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
+      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
+				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
+
+      /* Now an vpunpck[lh]qdq will produce
+	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
+      if (odd)
+	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
+					   gen_lowpart (V4DImode, t2));
+      else
+	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
+					  gen_lowpart (V4DImode, t2));
+      emit_insn (t3);
+      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
+   extract-even and extract-odd permutations.  */
+
+static bool
+expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
+{
+  unsigned i, odd, nelt = d->nelt;
+
+  odd = d->perm[0];
+  if (odd != 0 && odd != 1)
+    return false;
+
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != 2 * i + odd)
+      return false;
+
+  return expand_vec_perm_even_odd_1 (d, odd);
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
+   permutations.  We assume that expand_vec_perm_1 has already failed.  */
+
+static bool
+expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
+{
+  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
+  enum machine_mode vmode = d->vmode;
+  unsigned char perm2[4];
+  rtx op0 = d->op0, dest;
+  bool ok;
+
+  switch (vmode)
+    {
+    case V4DFmode:
+    case V8SFmode:
+      /* These are special-cased in sse.md so that we can optionally
+	 use the vbroadcast instruction.  They expand to two insns
+	 if the input happens to be in a register.  */
+      gcc_unreachable ();
+
+    case V2DFmode:
+    case V2DImode:
+    case V4SFmode:
+    case V4SImode:
+      /* These are always implementable using standard shuffle patterns.  */
+      gcc_unreachable ();
+
+    case V8HImode:
+    case V16QImode:
+      /* These can be implemented via interleave.  We save one insn by
+	 stopping once we have promoted to V4SImode and then use pshufd.  */
+      if (d->testing_p)
+	return true;
+      do
+	{
+	  rtx dest;
+	  rtx (*gen) (rtx, rtx, rtx)
+	    = vmode == V16QImode ? gen_vec_interleave_lowv16qi
+				 : gen_vec_interleave_lowv8hi;
+
+	  if (elt >= nelt2)
+	    {
+	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
+				       : gen_vec_interleave_highv8hi;
+	      elt -= nelt2;
+	    }
+	  nelt2 /= 2;
+
+	  dest = gen_reg_rtx (vmode);
+	  emit_insn (gen (dest, op0, op0));
+	  vmode = get_mode_wider_vector (vmode);
+	  op0 = gen_lowpart (vmode, dest);
+	}
+      while (vmode != V4SImode);
+
+      memset (perm2, elt, 4);
+      dest = gen_reg_rtx (V4SImode);
+      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
+      gcc_assert (ok);
+      if (!d->testing_p)
+	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+      return true;
+
+    case V32QImode:
+    case V16HImode:
+    case V8SImode:
+    case V4DImode:
+      /* For AVX2 broadcasts of the first element vpbroadcast* or
+	 vpermq should be used by expand_vec_perm_1.  */
+      gcc_assert (!TARGET_AVX2 || d->perm[0]);
+      return false;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
+   broadcast permutations.  */
+
+static bool
+expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
+{
+  unsigned i, elt, nelt = d->nelt;
+
+  if (!d->one_operand_p)
+    return false;
+
+  elt = d->perm[0];
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != elt)
+      return false;
+
+  return expand_vec_perm_broadcast_1 (d);
+}
+
+/* Implement arbitrary permutation of two V32QImode and V16QImode operands
+   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
+   all the shorter instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
+{
+  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
+  unsigned int i, nelt, eltsz;
+  bool used[4];
+
+  if (!TARGET_AVX2
+      || d->one_operand_p
+      || (d->vmode != V32QImode && d->vmode != V16HImode))
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate 4 permutation masks.  If the required element is within
+     the same lane, it is shuffled in.  If the required element from the
+     other lane, force a zero by setting bit 7 in the permutation mask.
+     In the other mask the mask has non-negative elements if element
+     is requested from the other lane, but also moved to the other lane,
+     so that the result of vpshufb can have the two V2TImode halves
+     swapped.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < 32; ++i)
+    {
+      rperm[0][i] = m128;
+      rperm[1][i] = m128;
+      rperm[2][i] = m128;
+      rperm[3][i] = m128;
+    }
+  used[0] = false;
+  used[1] = false;
+  used[2] = false;
+  used[3] = false;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
+
+      for (j = 0; j < eltsz; ++j)
+	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
+      used[which] = true;
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (!used[2 * i + 1])
+	{
+	  h[i] = NULL_RTX;
+	  continue;
+	}
+      vperm = gen_rtx_CONST_VECTOR (V32QImode,
+				    gen_rtvec_v (32, rperm[2 * i + 1]));
+      vperm = force_reg (V32QImode, vperm);
+      h[i] = gen_reg_rtx (V32QImode);
+      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
+    }
+
+  /* Swap the 128-byte lanes of h[X].  */
+  for (i = 0; i < 2; ++i)
+   {
+     if (h[i] == NULL_RTX)
+       continue;
+     op = gen_reg_rtx (V4DImode);
+     emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
+				     const2_rtx, GEN_INT (3), const0_rtx,
+				     const1_rtx));
+     h[i] = gen_lowpart (V32QImode, op);
+   }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (!used[2 * i])
+	{
+	  l[i] = NULL_RTX;
+	  continue;
+	}
+      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
+      vperm = force_reg (V32QImode, vperm);
+      l[i] = gen_reg_rtx (V32QImode);
+      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (h[i] && l[i])
+	{
+	  op = gen_reg_rtx (V32QImode);
+	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
+	  l[i] = op;
+	}
+      else if (h[i])
+	l[i] = h[i];
+    }
+
+  gcc_assert (l[0] && l[1]);
+  op = d->target;
+  if (d->vmode != V32QImode)
+    op = gen_reg_rtx (V32QImode);
+  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
+  return true;
+}
+
+/* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
+   With all of the interface bits taken care of, perform the expansion
+   in D and return true on success.  */
+
+static bool
+ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  /* Try a single instruction expansion.  */
+  if (expand_vec_perm_1 (d))
+    return true;
+
+  /* Try sequences of two instructions.  */
+
+  if (expand_vec_perm_pshuflw_pshufhw (d))
+    return true;
+
+  if (expand_vec_perm_palignr (d))
+    return true;
+
+  if (expand_vec_perm_interleave2 (d))
+    return true;
+
+  if (expand_vec_perm_broadcast (d))
+    return true;
+
+  if (expand_vec_perm_vpermq_perm_1 (d))
+    return true;
+
+  if (expand_vec_perm_vperm2f128 (d))
+    return true;
+
+  /* Try sequences of three instructions.  */
+
+  if (expand_vec_perm_2vperm2f128_vshuf (d))
+    return true;
+
+  if (expand_vec_perm_pshufb2 (d))
+    return true;
+
+  if (expand_vec_perm_interleave3 (d))
+    return true;
+
+  if (expand_vec_perm_vperm2f128_vblend (d))
+    return true;
+
+  /* Try sequences of four instructions.  */
+
+  if (expand_vec_perm_vpshufb2_vpermq (d))
+    return true;
+
+  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
+    return true;
+
+  /* ??? Look for narrow permutations whose element orderings would
+     allow the promotion to a wider mode.  */
+
+  /* ??? Look for sequences of interleave or a wider permute that place
+     the data into the correct lanes for a half-vector shuffle like
+     pshuf[lh]w or vpermilps.  */
+
+  /* ??? Look for sequences of interleave that produce the desired results.
+     The combinatorics of punpck[lh] get pretty ugly... */
+
+  if (expand_vec_perm_even_odd (d))
+    return true;
+
+  /* Even longer sequences.  */
+  if (expand_vec_perm_vpshufb4_vpermq2 (d))
+    return true;
+
+  return false;
+}
+
+/* If a permutation only uses one operand, make it clear. Returns true
+   if the permutation references both operands.  */
+
+static bool
+canonicalize_perm (struct expand_vec_perm_d *d)
+{
+  int i, which, nelt = d->nelt;
+
+  for (i = which = 0; i < nelt; ++i)
+      which |= (d->perm[i] < nelt ? 1 : 2);
+
+  d->one_operand_p = true;
+  switch (which)
+    {
+    default:
+      gcc_unreachable();
+
+    case 3:
+      if (!rtx_equal_p (d->op0, d->op1))
+        {
+	  d->one_operand_p = false;
+	  break;
+        }
+      /* The elements of PERM do not suggest that only the first operand
+	 is used, but both operands are identical.  Allow easier matching
+	 of the permutation by folding the permutation into the single
+	 input vector.  */
+      /* FALLTHRU */
+
+    case 2:
+      for (i = 0; i < nelt; ++i)
+        d->perm[i] &= nelt - 1;
+      d->op0 = d->op1;
+      break;
+
+    case 1:
+      d->op1 = d->op0;
+      break;
+    }
+
+  return (which == 3);
+}
+
+bool
+ix86_expand_vec_perm_const (rtx operands[4])
+{
+  struct expand_vec_perm_d d;
+  unsigned char perm[MAX_VECT_LEN];
+  int i, nelt;
+  bool two_args;
+  rtx sel;
+
+  d.target = operands[0];
+  d.op0 = operands[1];
+  d.op1 = operands[2];
+  sel = operands[3];
+
+  d.vmode = GET_MODE (d.target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  gcc_assert (GET_CODE (sel) == CONST_VECTOR);
+  gcc_assert (XVECLEN (sel, 0) == nelt);
+  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
+
+  for (i = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+      d.perm[i] = ei;
+      perm[i] = ei;
+    }
+
+  two_args = canonicalize_perm (&d);
+
+  if (ix86_expand_vec_perm_const_1 (&d))
+    return true;
+
+  /* If the selector says both arguments are needed, but the operands are the
+     same, the above tried to expand with one_operand_p and flattened selector.
+     If that didn't work, retry without one_operand_p; we succeeded with that
+     during testing.  */
+  if (two_args && d.one_operand_p)
+    {
+      d.one_operand_p = false;
+      memcpy (d.perm, perm, sizeof (perm));
+      return ix86_expand_vec_perm_const_1 (&d);
+    }
+
+  return false;
+}
+
+/* Implement targetm.vectorize.vec_perm_const_ok.  */
+
+static bool
+ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+				  const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int i, nelt, which;
+  bool ret;
+
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+
+  /* Given sufficient ISA support we can just return true here
+     for selected vector modes.  */
+  if (d.vmode == V16SImode || d.vmode == V16SFmode
+      || d.vmode == V8DFmode || d.vmode == V8DImode)
+    /* All implementable with a single vpermi2 insn.  */
+    return true;
+  if (GET_MODE_SIZE (d.vmode) == 16)
+    {
+      /* All implementable with a single vpperm insn.  */
+      if (TARGET_XOP)
+	return true;
+      /* All implementable with 2 pshufb + 1 ior.  */
+      if (TARGET_SSSE3)
+	return true;
+      /* All implementable with shufpd or unpck[lh]pd.  */
+      if (d.nelt == 2)
+	return true;
+    }
+
+  /* Extract the values from the vector CST into the permutation
+     array in D.  */
+  memcpy (d.perm, sel, nelt);
+  for (i = which = 0; i < nelt; ++i)
+    {
+      unsigned char e = d.perm[i];
+      gcc_assert (e < 2 * nelt);
+      which |= (e < nelt ? 1 : 2);
+    }
+
+  /* For all elements from second vector, fold the elements to first.  */
+  if (which == 2)
+    for (i = 0; i < nelt; ++i)
+      d.perm[i] -= nelt;
+
+  /* Check whether the mask can be applied to the vector type.  */
+  d.one_operand_p = (which != 3);
+
+  /* Implementable with shufps or pshufd.  */
+  if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
+    return true;
+
+  /* Otherwise we have to go through the motions and see if we can
+     figure out how to generate the requested permutation.  */
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  if (!d.one_operand_p)
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  start_sequence ();
+  ret = ix86_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return ret;
+}
+
+void
+ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
+{
+  struct expand_vec_perm_d d;
+  unsigned i, nelt;
+
+  d.target = targ;
+  d.op0 = op0;
+  d.op1 = op1;
+  d.vmode = GET_MODE (targ);
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.one_operand_p = false;
+  d.testing_p = false;
+
+  for (i = 0; i < nelt; ++i)
+    d.perm[i] = i * 2 + odd;
+
+  /* We'll either be able to implement the permutation directly...  */
+  if (expand_vec_perm_1 (&d))
+    return;
+
+  /* ... or we use the special-case patterns.  */
+  expand_vec_perm_even_odd_1 (&d, odd);
+}
+
+static void
+ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
+{
+  struct expand_vec_perm_d d;
+  unsigned i, nelt, base;
+  bool ok;
+
+  d.target = targ;
+  d.op0 = op0;
+  d.op1 = op1;
+  d.vmode = GET_MODE (targ);
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.one_operand_p = false;
+  d.testing_p = false;
+
+  base = high_p ? nelt / 2 : 0;
+  for (i = 0; i < nelt / 2; ++i)
+    {
+      d.perm[i * 2] = i + base;
+      d.perm[i * 2 + 1] = i + base + nelt;
+    }
+
+  /* Note that for AVX this isn't one instruction.  */
+  ok = ix86_expand_vec_perm_const_1 (&d);
+  gcc_assert (ok);
+}
+
+
+/* Expand a vector operation CODE for a V*QImode in terms of the
+   same operation on V*HImode.  */
+
+void
+ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  enum machine_mode qimode = GET_MODE (dest);
+  enum machine_mode himode;
+  rtx (*gen_il) (rtx, rtx, rtx);
+  rtx (*gen_ih) (rtx, rtx, rtx);
+  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+  struct expand_vec_perm_d d;
+  bool ok, full_interleave;
+  bool uns_p = false;
+  int i;
+
+  switch (qimode)
+    {
+    case V16QImode:
+      himode = V8HImode;
+      gen_il = gen_vec_interleave_lowv16qi;
+      gen_ih = gen_vec_interleave_highv16qi;
+      break;
+    case V32QImode:
+      himode = V16HImode;
+      gen_il = gen_avx2_interleave_lowv32qi;
+      gen_ih = gen_avx2_interleave_highv32qi;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  op2_l = op2_h = op2;
+  switch (code)
+    {
+    case MULT:
+      /* Unpack data such that we've got a source byte in each low byte of
+	 each word.  We don't care what goes into the high byte of each word.
+	 Rather than trying to get zero in there, most convenient is to let
+	 it be a copy of the low byte.  */
+      op2_l = gen_reg_rtx (qimode);
+      op2_h = gen_reg_rtx (qimode);
+      emit_insn (gen_il (op2_l, op2, op2));
+      emit_insn (gen_ih (op2_h, op2, op2));
+      /* FALLTHRU */
+
+      op1_l = gen_reg_rtx (qimode);
+      op1_h = gen_reg_rtx (qimode);
+      emit_insn (gen_il (op1_l, op1, op1));
+      emit_insn (gen_ih (op1_h, op1, op1));
+      full_interleave = qimode == V16QImode;
+      break;
+
+    case ASHIFT:
+    case LSHIFTRT:
+      uns_p = true;
+      /* FALLTHRU */
+    case ASHIFTRT:
+      op1_l = gen_reg_rtx (himode);
+      op1_h = gen_reg_rtx (himode);
+      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
+      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+      full_interleave = true;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Perform the operation.  */
+  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
+			       1, OPTAB_DIRECT);
+  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
+			       1, OPTAB_DIRECT);
+  gcc_assert (res_l && res_h);
+
+  /* Merge the data back into the right place.  */
+  d.target = dest;
+  d.op0 = gen_lowpart (qimode, res_l);
+  d.op1 = gen_lowpart (qimode, res_h);
+  d.vmode = qimode;
+  d.nelt = GET_MODE_NUNITS (qimode);
+  d.one_operand_p = false;
+  d.testing_p = false;
+
+  if (full_interleave)
+    {
+      /* For SSE2, we used an full interleave, so the desired
+	 results are in the even elements.  */
+      for (i = 0; i < 32; ++i)
+	d.perm[i] = i * 2;
+    }
+  else
+    {
+      /* For AVX, the interleave used above was not cross-lane.  So the
+	 extraction is evens but with the second and third quarter swapped.
+	 Happily, that is even one insn shorter than even extraction.  */
+      for (i = 0; i < 32; ++i)
+	d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
+    }
+
+  ok = ix86_expand_vec_perm_const_1 (&d);
+  gcc_assert (ok);
+
+  set_unique_reg_note (get_last_insn (), REG_EQUAL,
+		       gen_rtx_fmt_ee (code, qimode, op1, op2));
+}
+
+/* Helper function of ix86_expand_mul_widen_evenodd.  Return true
+   if op is CONST_VECTOR with all odd elements equal to their
+   preceding element.  */
+
+static bool
+const_vector_equal_evenodd_p (rtx op)
+{
+  enum machine_mode mode = GET_MODE (op);
+  int i, nunits = GET_MODE_NUNITS (mode);
+  if (GET_CODE (op) != CONST_VECTOR
+      || nunits != CONST_VECTOR_NUNITS (op))
+    return false;
+  for (i = 0; i < nunits; i += 2)
+    if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
+      return false;
+  return true;
+}
+
+void
+ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
+			       bool uns_p, bool odd_p)
+{
+  enum machine_mode mode = GET_MODE (op1);
+  enum machine_mode wmode = GET_MODE (dest);
+  rtx x;
+  rtx orig_op1 = op1, orig_op2 = op2;
+
+  if (!nonimmediate_operand (op1, mode))
+    op1 = force_reg (mode, op1);
+  if (!nonimmediate_operand (op2, mode))
+    op2 = force_reg (mode, op2);
+
+  /* We only play even/odd games with vectors of SImode.  */
+  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
+
+  /* If we're looking for the odd results, shift those members down to
+     the even slots.  For some cpus this is faster than a PSHUFD.  */
+  if (odd_p)
+    {
+      /* For XOP use vpmacsdqh, but only for smult, as it is only
+	 signed.  */
+      if (TARGET_XOP && mode == V4SImode && !uns_p)
+	{
+	  x = force_reg (wmode, CONST0_RTX (wmode));
+	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
+	  return;
+	}
+
+      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
+      if (!const_vector_equal_evenodd_p (orig_op1))
+	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
+			    x, NULL, 1, OPTAB_DIRECT);
+      if (!const_vector_equal_evenodd_p (orig_op2))
+	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
+			    x, NULL, 1, OPTAB_DIRECT);
+      op1 = gen_lowpart (mode, op1);
+      op2 = gen_lowpart (mode, op2);
+    }
+
+  if (mode == V16SImode)
+    {
+      if (uns_p)
+	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
+      else
+	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
+    }
+  else if (mode == V8SImode)
+    {
+      if (uns_p)
+	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
+      else
+	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
+    }
+  else if (uns_p)
+    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
+  else if (TARGET_SSE4_1)
+    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
+  else
+    {
+      rtx s1, s2, t0, t1, t2;
+
+      /* The easiest way to implement this without PMULDQ is to go through
+	 the motions as if we are performing a full 64-bit multiply.  With
+	 the exception that we need to do less shuffling of the elements.  */
+
+      /* Compute the sign-extension, aka highparts, of the two operands.  */
+      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+				op1, pc_rtx, pc_rtx);
+      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+				op2, pc_rtx, pc_rtx);
+
+      /* Multiply LO(A) * HI(B), and vice-versa.  */
+      t1 = gen_reg_rtx (wmode);
+      t2 = gen_reg_rtx (wmode);
+      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
+      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
+
+      /* Multiply LO(A) * LO(B).  */
+      t0 = gen_reg_rtx (wmode);
+      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
+
+      /* Combine and shift the highparts into place.  */
+      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
+      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
+			 1, OPTAB_DIRECT);
+
+      /* Combine high and low parts.  */
+      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
+      return;
+    }
+  emit_insn (x);
+}
+
+void
+ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
+			    bool uns_p, bool high_p)
+{
+  enum machine_mode wmode = GET_MODE (dest);
+  enum machine_mode mode = GET_MODE (op1);
+  rtx t1, t2, t3, t4, mask;
+
+  switch (mode)
+    {
+    case V4SImode:
+      t1 = gen_reg_rtx (mode);
+      t2 = gen_reg_rtx (mode);
+      if (TARGET_XOP && !uns_p)
+	{
+	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
+	     shuffle the elements once so that all elements are in the right
+	     place for immediate use: { A C B D }.  */
+	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
+					const1_rtx, GEN_INT (3)));
+	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
+					const1_rtx, GEN_INT (3)));
+	}
+      else
+	{
+	  /* Put the elements into place for the multiply.  */
+	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
+	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
+	  high_p = false;
+	}
+      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
+      break;
+
+    case V8SImode:
+      /* Shuffle the elements between the lanes.  After this we
+	 have { A B E F | C D G H } for each operand.  */
+      t1 = gen_reg_rtx (V4DImode);
+      t2 = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
+				      const0_rtx, const2_rtx,
+				      const1_rtx, GEN_INT (3)));
+      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
+				      const0_rtx, const2_rtx,
+				      const1_rtx, GEN_INT (3)));
+
+      /* Shuffle the elements within the lanes.  After this we
+	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
+      t3 = gen_reg_rtx (V8SImode);
+      t4 = gen_reg_rtx (V8SImode);
+      mask = GEN_INT (high_p
+		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
+		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
+      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
+      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
+
+      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
+      break;
+
+    case V8HImode:
+    case V16HImode:
+      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
+			 uns_p, OPTAB_DIRECT);
+      t2 = expand_binop (mode,
+			 uns_p ? umul_highpart_optab : smul_highpart_optab,
+			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
+      gcc_assert (t1 && t2);
+
+      t3 = gen_reg_rtx (mode);
+      ix86_expand_vec_interleave (t3, t1, t2, high_p);
+      emit_move_insn (dest, gen_lowpart (wmode, t3));
+      break;
+
+    case V16QImode:
+    case V32QImode:
+      t1 = gen_reg_rtx (wmode);
+      t2 = gen_reg_rtx (wmode);
+      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
+      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
+
+      emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+  rtx res_1, res_2, res_3, res_4;
+
+  res_1 = gen_reg_rtx (V4SImode);
+  res_2 = gen_reg_rtx (V4SImode);
+  res_3 = gen_reg_rtx (V2DImode);
+  res_4 = gen_reg_rtx (V2DImode);
+  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
+  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
+
+  /* Move the results in element 2 down to element 1; we don't care
+     what goes in elements 2 and 3.  Then we can merge the parts
+     back together with an interleave.
+
+     Note that two other sequences were tried:
+     (1) Use interleaves at the start instead of psrldq, which allows
+     us to use a single shufps to merge things back at the end.
+     (2) Use shufps here to combine the two vectors, then pshufd to
+     put the elements in the correct order.
+     In both cases the cost of the reformatting stall was too high
+     and the overall sequence slower.  */
+
+  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
+				const0_rtx, const2_rtx,
+				const0_rtx, const0_rtx));
+  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
+				const0_rtx, const2_rtx,
+				const0_rtx, const0_rtx));
+  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
+
+  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
+}
+
+void
+ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx t1, t2, t3, t4, t5, t6;
+
+  if (TARGET_XOP && mode == V2DImode)
+    {
+      /* op1: A,B,C,D, op2: E,F,G,H */
+      op1 = gen_lowpart (V4SImode, op1);
+      op2 = gen_lowpart (V4SImode, op2);
+
+      t1 = gen_reg_rtx (V4SImode);
+      t2 = gen_reg_rtx (V4SImode);
+      t3 = gen_reg_rtx (V2DImode);
+      t4 = gen_reg_rtx (V2DImode);
+
+      /* t1: B,A,D,C */
+      emit_insn (gen_sse2_pshufd_1 (t1, op1,
+				    GEN_INT (1),
+				    GEN_INT (0),
+				    GEN_INT (3),
+				    GEN_INT (2)));
+
+      /* t2: (B*E),(A*F),(D*G),(C*H) */
+      emit_insn (gen_mulv4si3 (t2, t1, op2));
+
+      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
+      emit_insn (gen_xop_phadddq (t3, t2));
+
+      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
+      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
+
+      /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
+      emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
+    }
+  else
+    {
+      enum machine_mode nmode;
+      rtx (*umul) (rtx, rtx, rtx);
+
+      if (mode == V2DImode)
+	{
+	  umul = gen_vec_widen_umult_even_v4si;
+	  nmode = V4SImode;
+	}
+      else if (mode == V4DImode)
+	{
+	  umul = gen_vec_widen_umult_even_v8si;
+	  nmode = V8SImode;
+	}
+      else if (mode == V8DImode)
+	{
+	  umul = gen_vec_widen_umult_even_v16si;
+	  nmode = V16SImode;
+	}
+      else
+	gcc_unreachable ();
+
+
+      /* Multiply low parts.  */
+      t1 = gen_reg_rtx (mode);
+      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
+
+      /* Shift input vectors right 32 bits so we can multiply high parts.  */
+      t6 = GEN_INT (32);
+      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
+      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
+
+      /* Multiply high parts by low parts.  */
+      t4 = gen_reg_rtx (mode);
+      t5 = gen_reg_rtx (mode);
+      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
+      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
+
+      /* Combine and shift the highparts back.  */
+      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
+      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
+
+      /* Combine high and low parts.  */
+      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
+    }
+
+  set_unique_reg_note (get_last_insn (), REG_EQUAL,
+		       gen_rtx_MULT (mode, op1, op2));
+}
+
+/* Calculate integer abs() using only SSE2 instructions.  */
+
+void
+ix86_expand_sse2_abs (rtx target, rtx input)
+{
+  enum machine_mode mode = GET_MODE (target);
+  rtx tmp0, tmp1, x;
+
+  switch (mode)
+    {
+      /* For 32-bit signed integer X, the best way to calculate the absolute
+	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
+      case V4SImode:
+	tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
+				    GEN_INT (GET_MODE_BITSIZE
+					     (GET_MODE_INNER (mode)) - 1),
+				    NULL, 0, OPTAB_DIRECT);
+	tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
+				    NULL, 0, OPTAB_DIRECT);
+	x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
+				 target, 0, OPTAB_DIRECT);
+	break;
+
+      /* For 16-bit signed integer X, the best way to calculate the absolute
+	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
+      case V8HImode:
+	tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
+
+	x = expand_simple_binop (mode, SMAX, tmp0, input,
+				 target, 0, OPTAB_DIRECT);
+	break;
+
+      /* For 8-bit signed integer X, the best way to calculate the absolute
+	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
+	 as SSE2 provides the PMINUB insn.  */
+      case V16QImode:
+	tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
+
+	x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
+				 target, 0, OPTAB_DIRECT);
+	break;
+
+      default:
+	gcc_unreachable ();
+    }
+
+  if (x != target)
+    emit_move_insn (target, x);
+}
+
+/* Expand an insert into a vector register through pinsr insn.
+   Return true if successful.  */
+
+bool
+ix86_expand_pinsr (rtx *operands)
+{
+  rtx dst = operands[0];
+  rtx src = operands[3];
+
+  unsigned int size = INTVAL (operands[1]);
+  unsigned int pos = INTVAL (operands[2]);
+
+  if (GET_CODE (dst) == SUBREG)
+    {
+      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
+      dst = SUBREG_REG (dst);
+    }
+
+  if (GET_CODE (src) == SUBREG)
+    src = SUBREG_REG (src);
+
+  switch (GET_MODE (dst))
+    {
+    case V16QImode:
+    case V8HImode:
+    case V4SImode:
+    case V2DImode:
+      {
+	enum machine_mode srcmode, dstmode;
+	rtx (*pinsr)(rtx, rtx, rtx, rtx);
+
+	srcmode = mode_for_size (size, MODE_INT, 0);
+
+	switch (srcmode)
+	  {
+	  case QImode:
+	    if (!TARGET_SSE4_1)
+	      return false;
+	    dstmode = V16QImode;
+	    pinsr = gen_sse4_1_pinsrb;
+	    break;
+
+	  case HImode:
+	    if (!TARGET_SSE2)
+	      return false;
+	    dstmode = V8HImode;
+	    pinsr = gen_sse2_pinsrw;
+	    break;
+
+	  case SImode:
+	    if (!TARGET_SSE4_1)
+	      return false;
+	    dstmode = V4SImode;
+	    pinsr = gen_sse4_1_pinsrd;
+	    break;
+
+	  case DImode:
+	    gcc_assert (TARGET_64BIT);
+	    if (!TARGET_SSE4_1)
+	      return false;
+	    dstmode = V2DImode;
+	    pinsr = gen_sse4_1_pinsrq;
+	    break;
+
+	  default:
+	    return false;
+	  }
+
+	rtx d = dst;
+	if (GET_MODE (dst) != dstmode)
+	  d = gen_reg_rtx (dstmode);
+	src = gen_lowpart (srcmode, src);
+
+	pos /= size;
+
+	emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
+			  GEN_INT (1 << pos)));
+	if (d != dst)
+	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
+	return true;
+      }
+
+    default:
+      return false;
+    }
+}
+
+/* This function returns the calling abi specific va_list type node.
+   It returns  the FNDECL specific va_list type.  */
+
+static tree
+ix86_fn_abi_va_list (tree fndecl)
+{
+  if (!TARGET_64BIT)
+    return va_list_type_node;
+  gcc_assert (fndecl != NULL_TREE);
+
+  if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
+    return ms_va_list_type_node;
+  else
+    return sysv_va_list_type_node;
+}
+
+/* Returns the canonical va_list type specified by TYPE. If there
+   is no valid TYPE provided, it return NULL_TREE.  */
+
+static tree
+ix86_canonical_va_list_type (tree type)
+{
+  tree wtype, htype;
+
+  /* Resolve references and pointers to va_list type.  */
+  if (TREE_CODE (type) == MEM_REF)
+    type = TREE_TYPE (type);
+  else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
+    type = TREE_TYPE (type);
+  else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
+    type = TREE_TYPE (type);
+
+  if (TARGET_64BIT && va_list_type_node != NULL_TREE)
+    {
+      wtype = va_list_type_node;
+	  gcc_assert (wtype != NULL_TREE);
+      htype = type;
+      if (TREE_CODE (wtype) == ARRAY_TYPE)
+	{
+	  /* If va_list is an array type, the argument may have decayed
+	     to a pointer type, e.g. by being passed to another function.
+	     In that case, unwrap both types so that we can compare the
+	     underlying records.  */
+	  if (TREE_CODE (htype) == ARRAY_TYPE
+	      || POINTER_TYPE_P (htype))
+	    {
+	      wtype = TREE_TYPE (wtype);
+	      htype = TREE_TYPE (htype);
+	    }
+	}
+      if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
+	return va_list_type_node;
+      wtype = sysv_va_list_type_node;
+	  gcc_assert (wtype != NULL_TREE);
+      htype = type;
+      if (TREE_CODE (wtype) == ARRAY_TYPE)
+	{
+	  /* If va_list is an array type, the argument may have decayed
+	     to a pointer type, e.g. by being passed to another function.
+	     In that case, unwrap both types so that we can compare the
+	     underlying records.  */
+	  if (TREE_CODE (htype) == ARRAY_TYPE
+	      || POINTER_TYPE_P (htype))
+	    {
+	      wtype = TREE_TYPE (wtype);
+	      htype = TREE_TYPE (htype);
+	    }
+	}
+      if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
+	return sysv_va_list_type_node;
+      wtype = ms_va_list_type_node;
+	  gcc_assert (wtype != NULL_TREE);
+      htype = type;
+      if (TREE_CODE (wtype) == ARRAY_TYPE)
+	{
+	  /* If va_list is an array type, the argument may have decayed
+	     to a pointer type, e.g. by being passed to another function.
+	     In that case, unwrap both types so that we can compare the
+	     underlying records.  */
+	  if (TREE_CODE (htype) == ARRAY_TYPE
+	      || POINTER_TYPE_P (htype))
+	    {
+	      wtype = TREE_TYPE (wtype);
+	      htype = TREE_TYPE (htype);
+	    }
+	}
+      if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
+	return ms_va_list_type_node;
+      return NULL_TREE;
+    }
+  return std_canonical_va_list_type (type);
+}
+
+/* Iterate through the target-specific builtin types for va_list.
+   IDX denotes the iterator, *PTREE is set to the result type of
+   the va_list builtin, and *PNAME to its internal type.
+   Returns zero if there is no element for this index, otherwise
+   IDX should be increased upon the next call.
+   Note, do not iterate a base builtin's name like __builtin_va_list.
+   Used from c_common_nodes_and_builtins.  */
+
+static int
+ix86_enum_va_list (int idx, const char **pname, tree *ptree)
+{
+  if (TARGET_64BIT)
+    {
+      switch (idx)
+	{
+	default:
+	  break;
+
+	case 0:
+	  *ptree = ms_va_list_type_node;
+	  *pname = "__builtin_ms_va_list";
+	  return 1;
+
+	case 1:
+	  *ptree = sysv_va_list_type_node;
+	  *pname = "__builtin_sysv_va_list";
+	  return 1;
+	}
+    }
+
+  return 0;
+}
+
+#undef TARGET_SCHED_DISPATCH
+#define TARGET_SCHED_DISPATCH has_dispatch
+#undef TARGET_SCHED_DISPATCH_DO
+#define TARGET_SCHED_DISPATCH_DO do_dispatch
+#undef TARGET_SCHED_REASSOCIATION_WIDTH
+#define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER ix86_sched_reorder
+#undef TARGET_SCHED_ADJUST_PRIORITY
+#define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
+#undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
+#define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
+  ix86_dependencies_evaluation_hook
+
+/* The size of the dispatch window is the total number of bytes of
+   object code allowed in a window.  */
+#define DISPATCH_WINDOW_SIZE 16
+
+/* Number of dispatch windows considered for scheduling.  */
+#define MAX_DISPATCH_WINDOWS 3
+
+/* Maximum number of instructions in a window.  */
+#define MAX_INSN 4
+
+/* Maximum number of immediate operands in a window.  */
+#define MAX_IMM 4
+
+/* Maximum number of immediate bits allowed in a window.  */
+#define MAX_IMM_SIZE 128
+
+/* Maximum number of 32 bit immediates allowed in a window.  */
+#define MAX_IMM_32 4
+
+/* Maximum number of 64 bit immediates allowed in a window.  */
+#define MAX_IMM_64 2
+
+/* Maximum total of loads or prefetches allowed in a window.  */
+#define MAX_LOAD 2
+
+/* Maximum total of stores allowed in a window.  */
+#define MAX_STORE 1
+
+#undef BIG
+#define BIG 100
+
+
+/* Dispatch groups.  Istructions that affect the mix in a dispatch window.  */
+enum dispatch_group {
+  disp_no_group = 0,
+  disp_load,
+  disp_store,
+  disp_load_store,
+  disp_prefetch,
+  disp_imm,
+  disp_imm_32,
+  disp_imm_64,
+  disp_branch,
+  disp_cmp,
+  disp_jcc,
+  disp_last
+};
+
+/* Number of allowable groups in a dispatch window.  It is an array
+   indexed by dispatch_group enum.  100 is used as a big number,
+   because the number of these kind of operations does not have any
+   effect in dispatch window, but we need them for other reasons in
+   the table.  */
+static unsigned int num_allowable_groups[disp_last] = {
+  0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
+};
+
+char group_name[disp_last + 1][16] = {
+  "disp_no_group", "disp_load", "disp_store", "disp_load_store",
+  "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
+  "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
+};
+
+/* Instruction path.  */
+enum insn_path {
+  no_path = 0,
+  path_single, /* Single micro op.  */
+  path_double, /* Double micro op.  */
+  path_multi,  /* Instructions with more than 2 micro op..  */
+  last_path
+};
+
+/* sched_insn_info defines a window to the instructions scheduled in
+   the basic block.  It contains a pointer to the insn_info table and
+   the instruction scheduled.
+
+   Windows are allocated for each basic block and are linked
+   together.  */
+typedef struct sched_insn_info_s {
+  rtx insn;
+  enum dispatch_group group;
+  enum insn_path path;
+  int byte_len;
+  int imm_bytes;
+} sched_insn_info;
+
+/* Linked list of dispatch windows.  This is a two way list of
+   dispatch windows of a basic block.  It contains information about
+   the number of uops in the window and the total number of
+   instructions and of bytes in the object code for this dispatch
+   window.  */
+typedef struct dispatch_windows_s {
+  int num_insn;            /* Number of insn in the window.  */
+  int num_uops;            /* Number of uops in the window.  */
+  int window_size;         /* Number of bytes in the window.  */
+  int window_num;          /* Window number between 0 or 1.  */
+  int num_imm;             /* Number of immediates in an insn.  */
+  int num_imm_32;          /* Number of 32 bit immediates in an insn.  */
+  int num_imm_64;          /* Number of 64 bit immediates in an insn.  */
+  int imm_size;            /* Total immediates in the window.  */
+  int num_loads;           /* Total memory loads in the window.  */
+  int num_stores;          /* Total memory stores in the window.  */
+  int violation;          /* Violation exists in window.  */
+  sched_insn_info *window; /* Pointer to the window.  */
+  struct dispatch_windows_s *next;
+  struct dispatch_windows_s *prev;
+} dispatch_windows;
+
+/* Immediate valuse used in an insn.  */
+typedef struct imm_info_s
+  {
+    int imm;
+    int imm32;
+    int imm64;
+  } imm_info;
+
+static dispatch_windows *dispatch_window_list;
+static dispatch_windows *dispatch_window_list1;
+
+/* Get dispatch group of insn.  */
+
+static enum dispatch_group
+get_mem_group (rtx insn)
+{
+  enum attr_memory memory;
+
+  if (INSN_CODE (insn) < 0)
+    return disp_no_group;
+  memory = get_attr_memory (insn);
+  if (memory == MEMORY_STORE)
+    return disp_store;
+
+  if (memory == MEMORY_LOAD)
+    return disp_load;
+
+  if (memory == MEMORY_BOTH)
+    return disp_load_store;
+
+  return disp_no_group;
+}
+
+/* Return true if insn is a compare instruction.  */
+
+static bool
+is_cmp (rtx insn)
+{
+  enum attr_type type;
+
+  type = get_attr_type (insn);
+  return (type == TYPE_TEST
+	  || type == TYPE_ICMP
+	  || type == TYPE_FCMP
+	  || GET_CODE (PATTERN (insn)) == COMPARE);
+}
+
+/* Return true if a dispatch violation encountered.  */
+
+static bool
+dispatch_violation (void)
+{
+  if (dispatch_window_list->next)
+    return dispatch_window_list->next->violation;
+  return dispatch_window_list->violation;
+}
+
+/* Return true if insn is a branch instruction.  */
+
+static bool
+is_branch (rtx insn)
+{
+  return (CALL_P (insn) || JUMP_P (insn));
+}
+
+/* Return true if insn is a prefetch instruction.  */
+
+static bool
+is_prefetch (rtx insn)
+{
+  return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
+}
+
+/* This function initializes a dispatch window and the list container holding a
+   pointer to the window.  */
+
+static void
+init_window (int window_num)
+{
+  int i;
+  dispatch_windows *new_list;
+
+  if (window_num == 0)
+    new_list = dispatch_window_list;
+  else
+    new_list = dispatch_window_list1;
+
+  new_list->num_insn = 0;
+  new_list->num_uops = 0;
+  new_list->window_size = 0;
+  new_list->next = NULL;
+  new_list->prev = NULL;
+  new_list->window_num = window_num;
+  new_list->num_imm = 0;
+  new_list->num_imm_32 = 0;
+  new_list->num_imm_64 = 0;
+  new_list->imm_size = 0;
+  new_list->num_loads = 0;
+  new_list->num_stores = 0;
+  new_list->violation = false;
+
+  for (i = 0; i < MAX_INSN; i++)
+    {
+      new_list->window[i].insn = NULL;
+      new_list->window[i].group = disp_no_group;
+      new_list->window[i].path = no_path;
+      new_list->window[i].byte_len = 0;
+      new_list->window[i].imm_bytes = 0;
+    }
+  return;
+}
+
+/* This function allocates and initializes a dispatch window and the
+   list container holding a pointer to the window.  */
+
+static dispatch_windows *
+allocate_window (void)
+{
+  dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
+  new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
+
+  return new_list;
+}
+
+/* This routine initializes the dispatch scheduling information.  It
+   initiates building dispatch scheduler tables and constructs the
+   first dispatch window.  */
+
+static void
+init_dispatch_sched (void)
+{
+  /* Allocate a dispatch list and a window.  */
+  dispatch_window_list = allocate_window ();
+  dispatch_window_list1 = allocate_window ();
+  init_window (0);
+  init_window (1);
+}
+
+/* This function returns true if a branch is detected.  End of a basic block
+   does not have to be a branch, but here we assume only branches end a
+   window.  */
+
+static bool
+is_end_basic_block (enum dispatch_group group)
+{
+  return group == disp_branch;
+}
+
+/* This function is called when the end of a window processing is reached.  */
+
+static void
+process_end_window (void)
+{
+  gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
+  if (dispatch_window_list->next)
+    {
+      gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
+      gcc_assert (dispatch_window_list->window_size
+		  + dispatch_window_list1->window_size <= 48);
+      init_window (1);
+    }
+  init_window (0);
+}
+
+/* Allocates a new dispatch window and adds it to WINDOW_LIST.
+   WINDOW_NUM is either 0 or 1.  A maximum of two windows are generated
+   for 48 bytes of instructions.  Note that these windows are not dispatch
+   windows that their sizes are DISPATCH_WINDOW_SIZE.  */
+
+static dispatch_windows *
+allocate_next_window (int window_num)
+{
+  if (window_num == 0)
+    {
+      if (dispatch_window_list->next)
+	  init_window (1);
+      init_window (0);
+      return dispatch_window_list;
+    }
+
+  dispatch_window_list->next = dispatch_window_list1;
+  dispatch_window_list1->prev = dispatch_window_list;
+
+  return dispatch_window_list1;
+}
+
+/* Increment the number of immediate operands of an instruction.  */
+
+static int
+find_constant_1 (rtx *in_rtx, imm_info *imm_values)
+{
+  if (*in_rtx == 0)
+    return 0;
+
+    switch ( GET_CODE (*in_rtx))
+    {
+    case CONST:
+    case SYMBOL_REF:
+    case CONST_INT:
+      (imm_values->imm)++;
+      if (x86_64_immediate_operand (*in_rtx, SImode))
+	(imm_values->imm32)++;
+      else
+	(imm_values->imm64)++;
+      break;
+
+    case CONST_DOUBLE:
+      (imm_values->imm)++;
+      (imm_values->imm64)++;
+      break;
+
+    case CODE_LABEL:
+      if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
+	{
+	  (imm_values->imm)++;
+	  (imm_values->imm32)++;
+	}
+      break;
+
+    default:
+      break;
+    }
+
+  return 0;
+}
+
+/* Compute number of immediate operands of an instruction.  */
+
+static void
+find_constant (rtx in_rtx, imm_info *imm_values)
+{
+  for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
+		(rtx_function) find_constant_1, (void *) imm_values);
+}
+
+/* Return total size of immediate operands of an instruction along with number
+   of corresponding immediate-operands.  It initializes its parameters to zero
+   befor calling FIND_CONSTANT.
+   INSN is the input instruction.  IMM is the total of immediates.
+   IMM32 is the number of 32 bit immediates.  IMM64 is the number of 64
+   bit immediates.  */
+
+static int
+get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
+{
+  imm_info imm_values = {0, 0, 0};
+
+  find_constant (insn, &imm_values);
+  *imm = imm_values.imm;
+  *imm32 = imm_values.imm32;
+  *imm64 = imm_values.imm64;
+  return imm_values.imm32 * 4 + imm_values.imm64 * 8;
+}
+
+/* This function indicates if an operand of an instruction is an
+   immediate.  */
+
+static bool
+has_immediate (rtx insn)
+{
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (insn)
+    return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+			       &num_imm64_operand);
+  return false;
+}
+
+/* Return single or double path for instructions.  */
+
+static enum insn_path
+get_insn_path (rtx insn)
+{
+  enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
+
+  if ((int)path == 0)
+    return path_single;
+
+  if ((int)path == 1)
+    return path_double;
+
+  return path_multi;
+}
+
+/* Return insn dispatch group.  */
+
+static enum dispatch_group
+get_insn_group (rtx insn)
+{
+  enum dispatch_group group = get_mem_group (insn);
+  if (group)
+    return group;
+
+  if (is_branch (insn))
+    return disp_branch;
+
+  if (is_cmp (insn))
+    return disp_cmp;
+
+  if (has_immediate (insn))
+    return disp_imm;
+
+  if (is_prefetch (insn))
+    return disp_prefetch;
+
+  return disp_no_group;
+}
+
+/* Count number of GROUP restricted instructions in a dispatch
+   window WINDOW_LIST.  */
+
+static int
+count_num_restricted (rtx insn, dispatch_windows *window_list)
+{
+  enum dispatch_group group = get_insn_group (insn);
+  int imm_size;
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (group == disp_no_group)
+    return 0;
+
+  if (group == disp_imm)
+    {
+      imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+			      &num_imm64_operand);
+      if (window_list->imm_size + imm_size > MAX_IMM_SIZE
+	  || num_imm_operand + window_list->num_imm > MAX_IMM
+	  || (num_imm32_operand > 0
+	      && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
+		  || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
+	  || (num_imm64_operand > 0
+	      && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
+		  || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
+	  || (window_list->imm_size + imm_size == MAX_IMM_SIZE
+	      && num_imm64_operand > 0
+	      && ((window_list->num_imm_64 > 0
+		   && window_list->num_insn >= 2)
+		  || window_list->num_insn >= 3)))
+	return BIG;
+
+      return 1;
+    }
+
+  if ((group == disp_load_store
+       && (window_list->num_loads >= MAX_LOAD
+	   || window_list->num_stores >= MAX_STORE))
+      || ((group == disp_load
+	   || group == disp_prefetch)
+	  && window_list->num_loads >= MAX_LOAD)
+      || (group == disp_store
+	  && window_list->num_stores >= MAX_STORE))
+    return BIG;
+
+  return 1;
+}
+
+/* This function returns true if insn satisfies dispatch rules on the
+   last window scheduled.  */
+
+static bool
+fits_dispatch_window (rtx insn)
+{
+  dispatch_windows *window_list = dispatch_window_list;
+  dispatch_windows *window_list_next = dispatch_window_list->next;
+  unsigned int num_restrict;
+  enum dispatch_group group = get_insn_group (insn);
+  enum insn_path path = get_insn_path (insn);
+  int sum;
+
+  /* Make disp_cmp and disp_jcc get scheduled at the latest.  These
+     instructions should be given the lowest priority in the
+     scheduling process in Haifa scheduler to make sure they will be
+     scheduled in the same dispatch window as the reference to them.  */
+  if (group == disp_jcc || group == disp_cmp)
+    return false;
+
+  /* Check nonrestricted.  */
+  if (group == disp_no_group || group == disp_branch)
+    return true;
+
+  /* Get last dispatch window.  */
+  if (window_list_next)
+    window_list = window_list_next;
+
+  if (window_list->window_num == 1)
+    {
+      sum = window_list->prev->window_size + window_list->window_size;
+
+      if (sum == 32
+	  || (min_insn_size (insn) + sum) >= 48)
+	/* Window 1 is full.  Go for next window.  */
+	return true;
+    }
+
+  num_restrict = count_num_restricted (insn, window_list);
+
+  if (num_restrict > num_allowable_groups[group])
+    return false;
+
+  /* See if it fits in the first window.  */
+  if (window_list->window_num == 0)
+    {
+      /* The first widow should have only single and double path
+	 uops.  */
+      if (path == path_double
+	  && (window_list->num_uops + 2) > MAX_INSN)
+	return false;
+      else if (path != path_single)
+        return false;
+    }
+  return true;
+}
+
+/* Add an instruction INSN with NUM_UOPS micro-operations to the
+   dispatch window WINDOW_LIST.  */
+
+static void
+add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
+{
+  int byte_len = min_insn_size (insn);
+  int num_insn = window_list->num_insn;
+  int imm_size;
+  sched_insn_info *window = window_list->window;
+  enum dispatch_group group = get_insn_group (insn);
+  enum insn_path path = get_insn_path (insn);
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (!window_list->violation && group != disp_cmp
+      && !fits_dispatch_window (insn))
+    window_list->violation = true;
+
+  imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+				 &num_imm64_operand);
+
+  /* Initialize window with new instruction.  */
+  window[num_insn].insn = insn;
+  window[num_insn].byte_len = byte_len;
+  window[num_insn].group = group;
+  window[num_insn].path = path;
+  window[num_insn].imm_bytes = imm_size;
+
+  window_list->window_size += byte_len;
+  window_list->num_insn = num_insn + 1;
+  window_list->num_uops = window_list->num_uops + num_uops;
+  window_list->imm_size += imm_size;
+  window_list->num_imm += num_imm_operand;
+  window_list->num_imm_32 += num_imm32_operand;
+  window_list->num_imm_64 += num_imm64_operand;
+
+  if (group == disp_store)
+    window_list->num_stores += 1;
+  else if (group == disp_load
+	   || group == disp_prefetch)
+    window_list->num_loads += 1;
+  else if (group == disp_load_store)
+    {
+      window_list->num_stores += 1;
+      window_list->num_loads += 1;
+    }
+}
+
+/* Adds a scheduled instruction, INSN, to the current dispatch window.
+   If the total bytes of instructions or the number of instructions in
+   the window exceed allowable, it allocates a new window.  */
+
+static void
+add_to_dispatch_window (rtx insn)
+{
+  int byte_len;
+  dispatch_windows *window_list;
+  dispatch_windows *next_list;
+  dispatch_windows *window0_list;
+  enum insn_path path;
+  enum dispatch_group insn_group;
+  bool insn_fits;
+  int num_insn;
+  int num_uops;
+  int window_num;
+  int insn_num_uops;
+  int sum;
+
+  if (INSN_CODE (insn) < 0)
+    return;
+
+  byte_len = min_insn_size (insn);
+  window_list = dispatch_window_list;
+  next_list = window_list->next;
+  path = get_insn_path (insn);
+  insn_group = get_insn_group (insn);
+
+  /* Get the last dispatch window.  */
+  if (next_list)
+      window_list = dispatch_window_list->next;
+
+  if (path == path_single)
+    insn_num_uops = 1;
+  else if (path == path_double)
+    insn_num_uops = 2;
+  else
+    insn_num_uops = (int) path;
+
+  /* If current window is full, get a new window.
+     Window number zero is full, if MAX_INSN uops are scheduled in it.
+     Window number one is full, if window zero's bytes plus window
+     one's bytes is 32, or if the bytes of the new instruction added
+     to the total makes it greater than 48, or it has already MAX_INSN
+     instructions in it.  */
+  num_insn = window_list->num_insn;
+  num_uops = window_list->num_uops;
+  window_num = window_list->window_num;
+  insn_fits = fits_dispatch_window (insn);
+
+  if (num_insn >= MAX_INSN
+      || num_uops + insn_num_uops > MAX_INSN
+      || !(insn_fits))
+    {
+      window_num = ~window_num & 1;
+      window_list = allocate_next_window (window_num);
+    }
+
+  if (window_num == 0)
+    {
+      add_insn_window (insn, window_list, insn_num_uops);
+      if (window_list->num_insn >= MAX_INSN
+	  && insn_group == disp_branch)
+	{
+	  process_end_window ();
+	  return;
+	}
+    }
+  else if (window_num == 1)
+    {
+      window0_list = window_list->prev;
+      sum = window0_list->window_size + window_list->window_size;
+      if (sum == 32
+	  || (byte_len + sum) >= 48)
+	{
+	  process_end_window ();
+	  window_list = dispatch_window_list;
+	}
+
+      add_insn_window (insn, window_list, insn_num_uops);
+    }
+  else
+    gcc_unreachable ();
+
+  if (is_end_basic_block (insn_group))
+    {
+      /* End of basic block is reached do end-basic-block process.  */
+      process_end_window ();
+      return;
+    }
+}
+
+/* Print the dispatch window, WINDOW_NUM, to FILE.  */
+
+DEBUG_FUNCTION static void
+debug_dispatch_window_file (FILE *file, int window_num)
+{
+  dispatch_windows *list;
+  int i;
+
+  if (window_num == 0)
+    list = dispatch_window_list;
+  else
+    list = dispatch_window_list1;
+
+  fprintf (file, "Window #%d:\n", list->window_num);
+  fprintf (file, "  num_insn = %d, num_uops = %d, window_size = %d\n",
+	  list->num_insn, list->num_uops, list->window_size);
+  fprintf (file, "  num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
+	   list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
+
+  fprintf (file, "  num_loads = %d, num_stores = %d\n", list->num_loads,
+	  list->num_stores);
+  fprintf (file, " insn info:\n");
+
+  for (i = 0; i < MAX_INSN; i++)
+    {
+      if (!list->window[i].insn)
+	break;
+      fprintf (file, "    group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
+	      i, group_name[list->window[i].group],
+	      i, (void *)list->window[i].insn,
+	      i, list->window[i].path,
+	      i, list->window[i].byte_len,
+	      i, list->window[i].imm_bytes);
+    }
+}
+
+/* Print to stdout a dispatch window.  */
+
+DEBUG_FUNCTION void
+debug_dispatch_window (int window_num)
+{
+  debug_dispatch_window_file (stdout, window_num);
+}
+
+/* Print INSN dispatch information to FILE.  */
+
+DEBUG_FUNCTION static void
+debug_insn_dispatch_info_file (FILE *file, rtx insn)
+{
+  int byte_len;
+  enum insn_path path;
+  enum dispatch_group group;
+  int imm_size;
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (INSN_CODE (insn) < 0)
+    return;
+
+  byte_len = min_insn_size (insn);
+  path = get_insn_path (insn);
+  group = get_insn_group (insn);
+  imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
+				 &num_imm64_operand);
+
+  fprintf (file, " insn info:\n");
+  fprintf (file, "  group = %s, path = %d, byte_len = %d\n",
+	   group_name[group], path, byte_len);
+  fprintf (file, "  num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
+	   num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
+}
+
+/* Print to STDERR the status of the ready list with respect to
+   dispatch windows.  */
+
+DEBUG_FUNCTION void
+debug_ready_dispatch (void)
+{
+  int i;
+  int no_ready = number_in_ready ();
+
+  fprintf (stdout, "Number of ready: %d\n", no_ready);
+
+  for (i = 0; i < no_ready; i++)
+    debug_insn_dispatch_info_file (stdout, get_ready_element (i));
+}
+
+/* This routine is the driver of the dispatch scheduler.  */
+
+static void
+do_dispatch (rtx insn, int mode)
+{
+  if (mode == DISPATCH_INIT)
+    init_dispatch_sched ();
+  else if (mode == ADD_TO_DISPATCH_WINDOW)
+    add_to_dispatch_window (insn);
+}
+
+/* Return TRUE if Dispatch Scheduling is supported.  */
+
+static bool
+has_dispatch (rtx insn, int action)
+{
+  if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3 || TARGET_BDVER4)
+      && flag_dispatch_scheduler)
+    switch (action)
+      {
+      default:
+	return false;
+
+      case IS_DISPATCH_ON:
+	return true;
+	break;
+
+      case IS_CMP:
+	return is_cmp (insn);
+
+      case DISPATCH_VIOLATION:
+	return dispatch_violation ();
+
+      case FITS_DISPATCH_WINDOW:
+	return fits_dispatch_window (insn);
+      }
+
+  return false;
+}
+
+/* Implementation of reassociation_width target hook used by
+   reassoc phase to identify parallelism level in reassociated
+   tree.  Statements tree_code is passed in OPC.  Arguments type
+   is passed in MODE.
+
+   Currently parallel reassociation is enabled for Atom
+   processors only and we set reassociation width to be 2
+   because Atom may issue up to 2 instructions per cycle.
+
+   Return value should be fixed if parallel reassociation is
+   enabled for other processors.  */
+
+static int
+ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
+			  enum machine_mode mode)
+{
+  int res = 1;
+
+  if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
+    res = 2;
+  else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
+    res = 2;
+
+  return res;
+}
+
+/* ??? No autovectorization into MMX or 3DNOW until we can reliably
+   place emms and femms instructions.  */
+
+static enum machine_mode
+ix86_preferred_simd_mode (enum machine_mode mode)
+{
+  if (!TARGET_SSE)
+    return word_mode;
+
+  switch (mode)
+    {
+    case QImode:
+      return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
+    case HImode:
+      return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
+    case SImode:
+      return TARGET_AVX512F ? V16SImode :
+	(TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
+    case DImode:
+      return TARGET_AVX512F ? V8DImode :
+	(TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
+
+    case SFmode:
+      if (TARGET_AVX512F)
+	return V16SFmode;
+      else if (TARGET_AVX && !TARGET_PREFER_AVX128)
+	return V8SFmode;
+      else
+	return V4SFmode;
+
+    case DFmode:
+      if (!TARGET_VECTORIZE_DOUBLE)
+	return word_mode;
+      else if (TARGET_AVX512F)
+	return V8DFmode;
+      else if (TARGET_AVX && !TARGET_PREFER_AVX128)
+	return V4DFmode;
+      else if (TARGET_SSE2)
+	return V2DFmode;
+      /* FALLTHRU */
+
+    default:
+      return word_mode;
+    }
+}
+
+/* If AVX is enabled then try vectorizing with both 256bit and 128bit
+   vectors.  If AVX512F is enabled then try vectorizing with 512bit,
+   256bit and 128bit vectors.  */
+
+static unsigned int
+ix86_autovectorize_vector_sizes (void)
+{
+  return TARGET_AVX512F ? 64 | 32 | 16 :
+    (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
+}
+
+
+
+/* Return class of registers which could be used for pseudo of MODE
+   and of class RCLASS for spilling instead of memory.  Return NO_REGS
+   if it is not possible or non-profitable.  */
+static reg_class_t
+ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
+{
+  if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
+      && (mode == SImode || (TARGET_64BIT && mode == DImode))
+      && INTEGER_CLASS_P (rclass))
+    return ALL_SSE_REGS;
+  return NO_REGS;
+}
+
+/* Implement targetm.vectorize.init_cost.  */
+
+static void *
+ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
+{
+  unsigned *cost = XNEWVEC (unsigned, 3);
+  cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
+  return cost;
+}
+
+/* Implement targetm.vectorize.add_stmt_cost.  */
+
+static unsigned
+ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
+		    struct _stmt_vec_info *stmt_info, int misalign,
+		    enum vect_cost_model_location where)
+{
+  unsigned *cost = (unsigned *) data;
+  unsigned retval = 0;
+
+  tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
+  int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+
+  /* Statements in an inner loop relative to the loop being
+     vectorized are weighted more heavily.  The value here is
+      arbitrary and could potentially be improved with analysis.  */
+  if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
+    count *= 50;  /* FIXME.  */
+
+  retval = (unsigned) (count * stmt_cost);
+  cost[where] += retval;
+
+  return retval;
+}
+
+/* Implement targetm.vectorize.finish_cost.  */
+
+static void
+ix86_finish_cost (void *data, unsigned *prologue_cost,
+		  unsigned *body_cost, unsigned *epilogue_cost)
+{
+  unsigned *cost = (unsigned *) data;
+  *prologue_cost = cost[vect_prologue];
+  *body_cost     = cost[vect_body];
+  *epilogue_cost = cost[vect_epilogue];
+}
+
+/* Implement targetm.vectorize.destroy_cost_data.  */
+
+static void
+ix86_destroy_cost_data (void *data)
+{
+  free (data);
+}
+
+/* Validate target specific memory model bits in VAL. */
+
+static unsigned HOST_WIDE_INT
+ix86_memmodel_check (unsigned HOST_WIDE_INT val)
+{
+  unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
+  bool strong;
+
+  if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
+				      |MEMMODEL_MASK)
+      || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
+    {
+      warning (OPT_Winvalid_memory_model,
+	       "Unknown architecture specific memory model");
+      return MEMMODEL_SEQ_CST;
+    }
+  strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
+  if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
+    {
+      warning (OPT_Winvalid_memory_model,
+              "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
+      return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
+    }
+   if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
+    {
+      warning (OPT_Winvalid_memory_model,
+              "HLE_RELEASE not used with RELEASE or stronger memory model");
+      return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
+    }
+  return val;
+}
+
+/* Set CLONEI->vecsize_mangle, CLONEI->vecsize_int,
+   CLONEI->vecsize_float and if CLONEI->simdlen is 0, also
+   CLONEI->simdlen.  Return 0 if SIMD clones shouldn't be emitted,
+   or number of vecsize_mangle variants that should be emitted.  */
+
+static int
+ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
+					     struct cgraph_simd_clone *clonei,
+					     tree base_type, int num)
+{
+  int ret = 1;
+
+  if (clonei->simdlen
+      && (clonei->simdlen < 2
+	  || clonei->simdlen > 16
+	  || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
+    {
+      warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+		  "unsupported simdlen %d", clonei->simdlen);
+      return 0;
+    }
+
+  tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
+  if (TREE_CODE (ret_type) != VOID_TYPE)
+    switch (TYPE_MODE (ret_type))
+      {
+      case QImode:
+      case HImode:
+      case SImode:
+      case DImode:
+      case SFmode:
+      case DFmode:
+      /* case SCmode: */
+      /* case DCmode: */
+	break;
+      default:
+	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+		    "unsupported return type %qT for simd\n", ret_type);
+	return 0;
+      }
+
+  tree t;
+  int i;
+
+  for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
+    /* FIXME: Shouldn't we allow such arguments if they are uniform?  */
+    switch (TYPE_MODE (TREE_TYPE (t)))
+      {
+      case QImode:
+      case HImode:
+      case SImode:
+      case DImode:
+      case SFmode:
+      case DFmode:
+      /* case SCmode: */
+      /* case DCmode: */
+	break;
+      default:
+	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
+		    "unsupported argument type %qT for simd\n", TREE_TYPE (t));
+	return 0;
+      }
+
+  if (clonei->cilk_elemental)
+    {
+      /* Parse here processor clause.  If not present, default to 'b'.  */
+      clonei->vecsize_mangle = 'b';
+    }
+  else if (!TREE_PUBLIC (node->decl))
+    {
+      /* If the function isn't exported, we can pick up just one ISA
+	 for the clones.  */
+      if (TARGET_AVX2)
+	clonei->vecsize_mangle = 'd';
+      else if (TARGET_AVX)
+	clonei->vecsize_mangle = 'c';
+      else
+	clonei->vecsize_mangle = 'b';
+      ret = 1;
+    }
+  else
+    {
+      clonei->vecsize_mangle = "bcd"[num];
+      ret = 3;
+    }
+  switch (clonei->vecsize_mangle)
+    {
+    case 'b':
+      clonei->vecsize_int = 128;
+      clonei->vecsize_float = 128;
+      break;
+    case 'c':
+      clonei->vecsize_int = 128;
+      clonei->vecsize_float = 256;
+      break;
+    case 'd':
+      clonei->vecsize_int = 256;
+      clonei->vecsize_float = 256;
+      break;
+    }
+  if (clonei->simdlen == 0)
+    {
+      if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
+	clonei->simdlen = clonei->vecsize_int;
+      else
+	clonei->simdlen = clonei->vecsize_float;
+      clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
+      if (clonei->simdlen > 16)
+	clonei->simdlen = 16;
+    }
+  return ret;
+}
+
+/* Add target attribute to SIMD clone NODE if needed.  */
+
+static void
+ix86_simd_clone_adjust (struct cgraph_node *node)
+{
+  const char *str = NULL;
+  gcc_assert (node->decl == cfun->decl);
+  switch (node->simdclone->vecsize_mangle)
+    {
+    case 'b':
+      if (!TARGET_SSE2)
+	str = "sse2";
+      break;
+    case 'c':
+      if (!TARGET_AVX)
+	str = "avx";
+      break;
+    case 'd':
+      if (!TARGET_AVX2)
+	str = "avx2";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  if (str == NULL)
+    return;
+  push_cfun (NULL);
+  tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
+  bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
+  gcc_assert (ok);
+  pop_cfun ();
+  ix86_previous_fndecl = NULL_TREE;
+  ix86_set_current_function (node->decl);
+}
+
+/* If SIMD clone NODE can't be used in a vectorized loop
+   in current function, return -1, otherwise return a badness of using it
+   (0 if it is most desirable from vecsize_mangle point of view, 1
+   slightly less desirable, etc.).  */
+
+static int
+ix86_simd_clone_usable (struct cgraph_node *node)
+{
+  switch (node->simdclone->vecsize_mangle)
+    {
+    case 'b':
+      if (!TARGET_SSE2)
+	return -1;
+      if (!TARGET_AVX)
+	return 0;
+      return TARGET_AVX2 ? 2 : 1;
+    case 'c':
+      if (!TARGET_AVX)
+	return -1;
+      return TARGET_AVX2 ? 1 : 0;
+      break;
+    case 'd':
+      if (!TARGET_AVX2)
+	return -1;
+      return 0;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* This function gives out the number of memory references.
+   This value determines the unrolling factor for
+   bdver3 and bdver4 architectures. */
+
+static int
+ix86_loop_memcount (rtx *x, unsigned *mem_count)
+{
+  if (*x != NULL_RTX && MEM_P (*x))
+   {
+     enum machine_mode mode;
+     unsigned int n_words;
+
+     mode = GET_MODE (*x);
+     n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+
+    if (n_words > 4)
+       (*mem_count)+=2;
+    else
+       (*mem_count)+=1;
+   }
+  return 0;
+}
+
+/* This function adjusts the unroll factor based on
+   the hardware capabilities. For ex, bdver3 has
+   a loop buffer which makes unrolling of smaller
+   loops less important. This function decides the
+   unroll factor using number of memory references
+   (value 32 is used) as a heuristic. */
+
+static unsigned
+ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  basic_block *bbs;
+  rtx insn;
+  unsigned i;
+  unsigned mem_count = 0;
+
+  if (!TARGET_ADJUST_UNROLL)
+     return nunroll;
+
+  /* Count the number of memory references within the loop body.  */
+  bbs = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      for (insn = BB_HEAD (bbs[i]); insn != BB_END (bbs[i]); insn = NEXT_INSN (insn))
+        if (NONDEBUG_INSN_P (insn))
+            for_each_rtx (&insn, (rtx_function) ix86_loop_memcount, &mem_count);
+    }
+  free (bbs);
+
+  if (mem_count && mem_count <=32)
+    return 32/mem_count;
+
+  return nunroll;
+}
+
+
+/* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P.  */
+
+static bool
+ix86_float_exceptions_rounding_supported_p (void)
+{
+  /* For x87 floating point with standard excess precision handling,
+     there is no adddf3 pattern (since x87 floating point only has
+     XFmode operations) so the default hook implementation gets this
+     wrong.  */
+  return TARGET_80387 || TARGET_SSE_MATH;
+}
+
+/* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV.  */
+
+static void
+ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
+{
+  if (!TARGET_80387 && !TARGET_SSE_MATH)
+    return;
+  tree exceptions_var = create_tmp_var (integer_type_node, NULL);
+  if (TARGET_80387)
+    {
+      tree fenv_index_type = build_index_type (size_int (6));
+      tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
+      tree fenv_var = create_tmp_var (fenv_type, NULL);
+      mark_addressable (fenv_var);
+      tree fenv_ptr = build_pointer_type (fenv_type);
+      tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
+      fenv_addr = fold_convert (ptr_type_node, fenv_addr);
+      tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
+      tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
+      tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
+      tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
+      tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
+      tree hold_fnclex = build_call_expr (fnclex, 0);
+      *hold = build2 (COMPOUND_EXPR, void_type_node, hold_fnstenv,
+		      hold_fnclex);
+      *clear = build_call_expr (fnclex, 0);
+      tree sw_var = create_tmp_var (short_unsigned_type_node, NULL);
+      mark_addressable (sw_var);
+      tree su_ptr = build_pointer_type (short_unsigned_type_node);
+      tree sw_addr = build1 (ADDR_EXPR, su_ptr, sw_var);
+      tree fnstsw_call = build_call_expr (fnstsw, 1, sw_addr);
+      tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
+      tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
+				exceptions_var, exceptions_x87);
+      *update = build2 (COMPOUND_EXPR, integer_type_node,
+			fnstsw_call, update_mod);
+      tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
+      *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
+    }
+  if (TARGET_SSE_MATH)
+    {
+      tree mxcsr_orig_var = create_tmp_var (unsigned_type_node, NULL);
+      tree mxcsr_mod_var = create_tmp_var (unsigned_type_node, NULL);
+      tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
+      tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
+      tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
+      tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
+				      mxcsr_orig_var, stmxcsr_hold_call);
+      tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
+				  mxcsr_orig_var,
+				  build_int_cst (unsigned_type_node, 0x1f80));
+      hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
+			     build_int_cst (unsigned_type_node, 0xffffffc0));
+      tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
+				     mxcsr_mod_var, hold_mod_val);
+      tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
+      tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
+			      hold_assign_orig, hold_assign_mod);
+      hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
+			 ldmxcsr_hold_call);
+      if (*hold)
+	*hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
+      else
+	*hold = hold_all;
+      tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
+      if (*clear)
+	*clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
+			 ldmxcsr_clear_call);
+      else
+	*clear = ldmxcsr_clear_call;
+      tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
+      tree exceptions_sse = fold_convert (integer_type_node,
+					  stxmcsr_update_call);
+      if (*update)
+	{
+	  tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
+					exceptions_var, exceptions_sse);
+	  tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
+					   exceptions_var, exceptions_mod);
+	  *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
+			    exceptions_assign);
+	}
+      else
+	*update = build2 (MODIFY_EXPR, integer_type_node,
+			  exceptions_var, exceptions_sse);
+      tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
+      *update = build2 (COMPOUND_EXPR, void_type_node, *update,
+			ldmxcsr_update_call);
+    }
+  tree atomic_feraiseexcept
+    = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
+  tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
+						    1, exceptions_var);
+  *update = build2 (COMPOUND_EXPR, void_type_node, *update,
+		    atomic_feraiseexcept_call);
+}
+
+/* Initialize the GCC target structure.  */
+#undef TARGET_RETURN_IN_MEMORY
+#define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
+
+#undef TARGET_LEGITIMIZE_ADDRESS
+#define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
+
+#undef TARGET_ATTRIBUTE_TABLE
+#define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
+#undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
+#define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
+#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
+#  undef TARGET_MERGE_DECL_ATTRIBUTES
+#  define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
+#endif
+
+#undef TARGET_COMP_TYPE_ATTRIBUTES
+#define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
+
+#undef TARGET_INIT_BUILTINS
+#define TARGET_INIT_BUILTINS ix86_init_builtins
+#undef TARGET_BUILTIN_DECL
+#define TARGET_BUILTIN_DECL ix86_builtin_decl
+#undef TARGET_EXPAND_BUILTIN
+#define TARGET_EXPAND_BUILTIN ix86_expand_builtin
+
+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
+  ix86_builtin_vectorized_function
+
+#undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
+#define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
+
+#undef TARGET_VECTORIZE_BUILTIN_TM_STORE
+#define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
+
+#undef TARGET_VECTORIZE_BUILTIN_GATHER
+#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
+
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
+
+#undef TARGET_ASM_FUNCTION_EPILOGUE
+#define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
+
+#undef TARGET_ENCODE_SECTION_INFO
+#ifndef SUBTARGET_ENCODE_SECTION_INFO
+#define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
+#else
+#define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
+#endif
+
+#undef TARGET_ASM_OPEN_PAREN
+#define TARGET_ASM_OPEN_PAREN ""
+#undef TARGET_ASM_CLOSE_PAREN
+#define TARGET_ASM_CLOSE_PAREN ""
+
+#undef TARGET_ASM_BYTE_OP
+#define TARGET_ASM_BYTE_OP ASM_BYTE
+
+#undef TARGET_ASM_ALIGNED_HI_OP
+#define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
+#undef TARGET_ASM_ALIGNED_SI_OP
+#define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
+#ifdef ASM_QUAD
+#undef TARGET_ASM_ALIGNED_DI_OP
+#define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
+#endif
+
+#undef TARGET_PROFILE_BEFORE_PROLOGUE
+#define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
+
+#undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
+#define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
+
+#undef TARGET_ASM_UNALIGNED_HI_OP
+#define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
+#undef TARGET_ASM_UNALIGNED_SI_OP
+#define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
+#undef TARGET_ASM_UNALIGNED_DI_OP
+#define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
+
+#undef TARGET_PRINT_OPERAND
+#define TARGET_PRINT_OPERAND ix86_print_operand
+#undef TARGET_PRINT_OPERAND_ADDRESS
+#define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
+#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
+#define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
+#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
+#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
+
+#undef TARGET_SCHED_INIT_GLOBAL
+#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
+#undef TARGET_SCHED_ADJUST_COST
+#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
+#undef TARGET_SCHED_ISSUE_RATE
+#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
+#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
+#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
+  ia32_multipass_dfa_lookahead
+#undef TARGET_SCHED_MACRO_FUSION_P
+#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
+#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
+#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
+
+#undef TARGET_FUNCTION_OK_FOR_SIBCALL
+#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
+
+#undef TARGET_MEMMODEL_CHECK
+#define TARGET_MEMMODEL_CHECK ix86_memmodel_check
+
+#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
+#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
+
+#ifdef HAVE_AS_TLS
+#undef TARGET_HAVE_TLS
+#define TARGET_HAVE_TLS true
+#endif
+#undef TARGET_CANNOT_FORCE_CONST_MEM
+#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
+#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
+#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
+
+#undef TARGET_DELEGITIMIZE_ADDRESS
+#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
+
+#undef TARGET_MS_BITFIELD_LAYOUT_P
+#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
+
+#if TARGET_MACHO
+#undef TARGET_BINDS_LOCAL_P
+#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
+#endif
+#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
+#undef TARGET_BINDS_LOCAL_P
+#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
+#endif
+
+#undef TARGET_ASM_OUTPUT_MI_THUNK
+#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
+#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
+#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
+
+#undef TARGET_ASM_FILE_START
+#define TARGET_ASM_FILE_START x86_file_start
+
+#undef TARGET_OPTION_OVERRIDE
+#define TARGET_OPTION_OVERRIDE ix86_option_override
+
+#undef TARGET_REGISTER_MOVE_COST
+#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
+#undef TARGET_MEMORY_MOVE_COST
+#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
+#undef TARGET_RTX_COSTS
+#define TARGET_RTX_COSTS ix86_rtx_costs
+#undef TARGET_ADDRESS_COST
+#define TARGET_ADDRESS_COST ix86_address_cost
+
+#undef TARGET_FIXED_CONDITION_CODE_REGS
+#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
+#undef TARGET_CC_MODES_COMPATIBLE
+#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
+
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
+
+#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
+#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
+
+#undef TARGET_BUILD_BUILTIN_VA_LIST
+#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
+
+#undef TARGET_FOLD_BUILTIN
+#define TARGET_FOLD_BUILTIN ix86_fold_builtin
+
+#undef TARGET_COMPARE_VERSION_PRIORITY
+#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
+
+#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
+#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
+  ix86_generate_version_dispatcher_body
+
+#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
+#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
+  ix86_get_function_versions_dispatcher
+
+#undef TARGET_ENUM_VA_LIST_P
+#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
+
+#undef TARGET_FN_ABI_VA_LIST
+#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
+
+#undef TARGET_CANONICAL_VA_LIST_TYPE
+#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
+
+#undef TARGET_EXPAND_BUILTIN_VA_START
+#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
+
+#undef TARGET_MD_ASM_CLOBBERS
+#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
+
+#undef TARGET_PROMOTE_PROTOTYPES
+#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
+#undef TARGET_SETUP_INCOMING_VARARGS
+#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
+#undef TARGET_MUST_PASS_IN_STACK
+#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
+#undef TARGET_FUNCTION_ARG_ADVANCE
+#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
+#undef TARGET_FUNCTION_ARG
+#define TARGET_FUNCTION_ARG ix86_function_arg
+#undef TARGET_FUNCTION_ARG_BOUNDARY
+#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
+#undef TARGET_PASS_BY_REFERENCE
+#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
+#undef TARGET_INTERNAL_ARG_POINTER
+#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
+#undef TARGET_UPDATE_STACK_BOUNDARY
+#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
+#undef TARGET_GET_DRAP_RTX
+#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
+#undef TARGET_STRICT_ARGUMENT_NAMING
+#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
+#undef TARGET_STATIC_CHAIN
+#define TARGET_STATIC_CHAIN ix86_static_chain
+#undef TARGET_TRAMPOLINE_INIT
+#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
+#undef TARGET_RETURN_POPS_ARGS
+#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
+
+#undef TARGET_LEGITIMATE_COMBINED_INSN
+#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
+
+#undef TARGET_ASAN_SHADOW_OFFSET
+#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
+
+#undef TARGET_GIMPLIFY_VA_ARG_EXPR
+#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
+
+#undef TARGET_SCALAR_MODE_SUPPORTED_P
+#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
+
+#undef TARGET_VECTOR_MODE_SUPPORTED_P
+#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
+
+#undef TARGET_C_MODE_FOR_SUFFIX
+#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
+
+#ifdef HAVE_AS_TLS
+#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
+#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
+#endif
+
+#ifdef SUBTARGET_INSERT_ATTRIBUTES
+#undef TARGET_INSERT_ATTRIBUTES
+#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
+#endif
+
+#undef TARGET_MANGLE_TYPE
+#define TARGET_MANGLE_TYPE ix86_mangle_type
+
+#if !TARGET_MACHO
+#undef TARGET_STACK_PROTECT_FAIL
+#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
+#endif
+
+#undef TARGET_FUNCTION_VALUE
+#define TARGET_FUNCTION_VALUE ix86_function_value
+
+#undef TARGET_FUNCTION_VALUE_REGNO_P
+#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
+
+#undef TARGET_PROMOTE_FUNCTION_MODE
+#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
+
+#undef TARGET_MEMBER_TYPE_FORCES_BLK
+#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
+
+#undef TARGET_INSTANTIATE_DECLS
+#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
+
+#undef TARGET_SECONDARY_RELOAD
+#define TARGET_SECONDARY_RELOAD ix86_secondary_reload
+
+#undef TARGET_CLASS_MAX_NREGS
+#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
+
+#undef TARGET_PREFERRED_RELOAD_CLASS
+#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
+#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
+#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
+#undef TARGET_CLASS_LIKELY_SPILLED_P
+#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
+
+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
+  ix86_builtin_vectorization_cost
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
+  ix86_vectorize_vec_perm_const_ok
+#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
+  ix86_preferred_simd_mode
+#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
+#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
+  ix86_autovectorize_vector_sizes
+#undef TARGET_VECTORIZE_INIT_COST
+#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
+#undef TARGET_VECTORIZE_ADD_STMT_COST
+#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
+#undef TARGET_VECTORIZE_FINISH_COST
+#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
+#undef TARGET_VECTORIZE_DESTROY_COST_DATA
+#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
+
+#undef TARGET_SET_CURRENT_FUNCTION
+#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
+
+#undef TARGET_OPTION_VALID_ATTRIBUTE_P
+#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
+
+#undef TARGET_OPTION_SAVE
+#define TARGET_OPTION_SAVE ix86_function_specific_save
+
+#undef TARGET_OPTION_RESTORE
+#define TARGET_OPTION_RESTORE ix86_function_specific_restore
+
+#undef TARGET_OPTION_PRINT
+#define TARGET_OPTION_PRINT ix86_function_specific_print
+
+#undef TARGET_OPTION_FUNCTION_VERSIONS
+#define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions
+
+#undef TARGET_CAN_INLINE_P
+#define TARGET_CAN_INLINE_P ix86_can_inline_p
+
+#undef TARGET_EXPAND_TO_RTL_HOOK
+#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
+
+#undef TARGET_LEGITIMATE_ADDRESS_P
+#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
+
+#undef TARGET_LRA_P
+#define TARGET_LRA_P hook_bool_void_true
+
+#undef TARGET_REGISTER_PRIORITY
+#define TARGET_REGISTER_PRIORITY ix86_register_priority
+
+#undef TARGET_REGISTER_USAGE_LEVELING_P
+#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
+
+#undef TARGET_LEGITIMATE_CONSTANT_P
+#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
+
+#undef TARGET_FRAME_POINTER_REQUIRED
+#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
+
+#undef TARGET_CAN_ELIMINATE
+#define TARGET_CAN_ELIMINATE ix86_can_eliminate
+
+#undef TARGET_EXTRA_LIVE_ON_ENTRY
+#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
+
+#undef TARGET_ASM_CODE_END
+#define TARGET_ASM_CODE_END ix86_code_end
+
+#undef TARGET_CONDITIONAL_REGISTER_USAGE
+#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
+
+#if TARGET_MACHO
+#undef TARGET_INIT_LIBFUNCS
+#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
+#endif
+
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
+
+#undef TARGET_SPILL_CLASS
+#define TARGET_SPILL_CLASS ix86_spill_class
+
+#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
+#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
+  ix86_simd_clone_compute_vecsize_and_simdlen
+
+#undef TARGET_SIMD_CLONE_ADJUST
+#define TARGET_SIMD_CLONE_ADJUST \
+  ix86_simd_clone_adjust
+
+#undef TARGET_SIMD_CLONE_USABLE
+#define TARGET_SIMD_CLONE_USABLE \
+  ix86_simd_clone_usable
+
+#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
+#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
+  ix86_float_exceptions_rounding_supported_p
+
+struct gcc_target targetm = TARGET_INITIALIZER;
+
+#include "gt-i386.h"
diff --git a/gcc-4.9/gcc/config/i386/i386.h b/gcc-4.9/gcc/config/i386/i386.h
new file mode 100644
index 000000000..51659deb3
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386.h
@@ -0,0 +1,2552 @@
+/* Definitions of target machine for GCC for IA-32.
+   Copyright (C) 1988-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* The purpose of this file is to define the characteristics of the i386,
+   independent of assembler syntax or operating system.
+
+   Three other files build on this one to describe a specific assembler syntax:
+   bsd386.h, att386.h, and sun386.h.
+
+   The actual tm.h file for a particular system should include
+   this file, and then the file for the appropriate assembler syntax.
+
+   Many macros that specify assembler syntax are omitted entirely from
+   this file because they really belong in the files for particular
+   assemblers.  These include RP, IP, LPREFIX, PUT_OP_SIZE, USE_STAR,
+   ADDR_BEG, ADDR_END, PRINT_IREG, PRINT_SCALE, PRINT_B_I_S, and many
+   that start with ASM_ or end in ASM_OP.  */
+
+/* Redefines for option macros.  */
+
+#define TARGET_64BIT	TARGET_ISA_64BIT
+#define TARGET_64BIT_P(x)	TARGET_ISA_64BIT_P(x)
+#define TARGET_MMX	TARGET_ISA_MMX
+#define TARGET_MMX_P(x)	TARGET_ISA_MMX_P(x)
+#define TARGET_3DNOW	TARGET_ISA_3DNOW
+#define TARGET_3DNOW_P(x)	TARGET_ISA_3DNOW_P(x)
+#define TARGET_3DNOW_A	TARGET_ISA_3DNOW_A
+#define TARGET_3DNOW_A_P(x)	TARGET_ISA_3DNOW_A_P(x)
+#define TARGET_SSE	TARGET_ISA_SSE
+#define TARGET_SSE_P(x)	TARGET_ISA_SSE_P(x)
+#define TARGET_SSE2	TARGET_ISA_SSE2
+#define TARGET_SSE2_P(x)	TARGET_ISA_SSE2_P(x)
+#define TARGET_SSE3	TARGET_ISA_SSE3
+#define TARGET_SSE3_P(x)	TARGET_ISA_SSE3_P(x)
+#define TARGET_SSSE3	TARGET_ISA_SSSE3
+#define TARGET_SSSE3_P(x)	TARGET_ISA_SSSE3_P(x)
+#define TARGET_SSE4_1	TARGET_ISA_SSE4_1
+#define TARGET_SSE4_1_P(x)	TARGET_ISA_SSE4_1_P(x)
+#define TARGET_SSE4_2	TARGET_ISA_SSE4_2
+#define TARGET_SSE4_2_P(x)	TARGET_ISA_SSE4_2_P(x)
+#define TARGET_AVX	TARGET_ISA_AVX
+#define TARGET_AVX_P(x)	TARGET_ISA_AVX_P(x)
+#define TARGET_AVX2	TARGET_ISA_AVX2
+#define TARGET_AVX2_P(x)	TARGET_ISA_AVX2_P(x)
+#define TARGET_AVX512F	TARGET_ISA_AVX512F
+#define TARGET_AVX512F_P(x)	TARGET_ISA_AVX512F_P(x)
+#define TARGET_AVX512PF	TARGET_ISA_AVX512PF
+#define TARGET_AVX512PF_P(x)	TARGET_ISA_AVX512PF_P(x)
+#define TARGET_AVX512ER	TARGET_ISA_AVX512ER
+#define TARGET_AVX512ER_P(x)	TARGET_ISA_AVX512ER_P(x)
+#define TARGET_AVX512CD	TARGET_ISA_AVX512CD
+#define TARGET_AVX512CD_P(x)	TARGET_ISA_AVX512CD_P(x)
+#define TARGET_FMA	TARGET_ISA_FMA
+#define TARGET_FMA_P(x)	TARGET_ISA_FMA_P(x)
+#define TARGET_SSE4A	TARGET_ISA_SSE4A
+#define TARGET_SSE4A_P(x)	TARGET_ISA_SSE4A_P(x)
+#define TARGET_FMA4	TARGET_ISA_FMA4
+#define TARGET_FMA4_P(x)	TARGET_ISA_FMA4_P(x)
+#define TARGET_XOP	TARGET_ISA_XOP
+#define TARGET_XOP_P(x)	TARGET_ISA_XOP_P(x)
+#define TARGET_LWP	TARGET_ISA_LWP
+#define TARGET_LWP_P(x)	TARGET_ISA_LWP_P(x)
+#define TARGET_ROUND	TARGET_ISA_ROUND
+#define TARGET_ABM	TARGET_ISA_ABM
+#define TARGET_ABM_P(x)	TARGET_ISA_ABM_P(x)
+#define TARGET_BMI	TARGET_ISA_BMI
+#define TARGET_BMI_P(x)	TARGET_ISA_BMI_P(x)
+#define TARGET_BMI2	TARGET_ISA_BMI2
+#define TARGET_BMI2_P(x)	TARGET_ISA_BMI2_P(x)
+#define TARGET_LZCNT	TARGET_ISA_LZCNT
+#define TARGET_LZCNT_P(x)	TARGET_ISA_LZCNT_P(x)
+#define TARGET_TBM	TARGET_ISA_TBM
+#define TARGET_TBM_P(x)	TARGET_ISA_TBM_P(x)
+#define TARGET_POPCNT	TARGET_ISA_POPCNT
+#define TARGET_POPCNT_P(x)	TARGET_ISA_POPCNT_P(x)
+#define TARGET_SAHF	TARGET_ISA_SAHF
+#define TARGET_SAHF_P(x)	TARGET_ISA_SAHF_P(x)
+#define TARGET_MOVBE	TARGET_ISA_MOVBE
+#define TARGET_MOVBE_P(x)	TARGET_ISA_MOVBE_P(x)
+#define TARGET_CRC32	TARGET_ISA_CRC32
+#define TARGET_CRC32_P(x)	TARGET_ISA_CRC32_P(x)
+#define TARGET_AES	TARGET_ISA_AES
+#define TARGET_AES_P(x)	TARGET_ISA_AES_P(x)
+#define TARGET_SHA	TARGET_ISA_SHA
+#define TARGET_SHA_P(x)	TARGET_ISA_SHA_P(x)
+#define TARGET_PCLMUL	TARGET_ISA_PCLMUL
+#define TARGET_PCLMUL_P(x)	TARGET_ISA_PCLMUL_P(x)
+#define TARGET_CMPXCHG16B	TARGET_ISA_CX16
+#define TARGET_CMPXCHG16B_P(x)	TARGET_ISA_CX16_P(x)
+#define TARGET_FSGSBASE	TARGET_ISA_FSGSBASE
+#define TARGET_FSGSBASE_P(x)	TARGET_ISA_FSGSBASE_P(x)
+#define TARGET_RDRND	TARGET_ISA_RDRND
+#define TARGET_RDRND_P(x)	TARGET_ISA_RDRND_P(x)
+#define TARGET_F16C	TARGET_ISA_F16C
+#define TARGET_F16C_P(x)	TARGET_ISA_F16C_P(x)
+#define TARGET_RTM	TARGET_ISA_RTM
+#define TARGET_RTM_P(x)	TARGET_ISA_RTM_P(x)
+#define TARGET_HLE	TARGET_ISA_HLE
+#define TARGET_HLE_P(x)	TARGET_ISA_HLE_P(x)
+#define TARGET_RDSEED	TARGET_ISA_RDSEED
+#define TARGET_RDSEED_P(x)	TARGET_ISA_RDSEED_P(x)
+#define TARGET_PRFCHW	TARGET_ISA_PRFCHW
+#define TARGET_PRFCHW_P(x)	TARGET_ISA_PRFCHW_P(x)
+#define TARGET_ADX	TARGET_ISA_ADX
+#define TARGET_ADX_P(x)	TARGET_ISA_ADX_P(x)
+#define TARGET_FXSR	TARGET_ISA_FXSR
+#define TARGET_FXSR_P(x)	TARGET_ISA_FXSR_P(x)
+#define TARGET_XSAVE	TARGET_ISA_XSAVE
+#define TARGET_XSAVE_P(x)	TARGET_ISA_XSAVE_P(x)
+#define TARGET_XSAVEOPT	TARGET_ISA_XSAVEOPT
+#define TARGET_XSAVEOPT_P(x)	TARGET_ISA_XSAVEOPT_P(x)
+#define TARGET_PREFETCHWT1	TARGET_ISA_PREFETCHWT1
+#define TARGET_PREFETCHWT1_P(x)	TARGET_ISA_PREFETCHWT1_P(x)
+
+#define TARGET_LP64	TARGET_ABI_64
+#define TARGET_LP64_P(x)	TARGET_ABI_64_P(x)
+#define TARGET_X32	TARGET_ABI_X32
+#define TARGET_X32_P(x)	TARGET_ABI_X32_P(x)
+#define TARGET_16BIT	TARGET_CODE16
+#define TARGET_16BIT_P(x)	TARGET_CODE16_P(x)
+
+/* SSE4.1 defines round instructions */
+#define	OPTION_MASK_ISA_ROUND	OPTION_MASK_ISA_SSE4_1
+#define	TARGET_ISA_ROUND	((ix86_isa_flags & OPTION_MASK_ISA_ROUND) != 0)
+
+#include "config/vxworks-dummy.h"
+
+#include "config/i386/i386-opts.h"
+
+#define MAX_STRINGOP_ALGS 4
+
+/* Specify what algorithm to use for stringops on known size.
+   When size is unknown, the UNKNOWN_SIZE alg is used.  When size is
+   known at compile time or estimated via feedback, the SIZE array
+   is walked in order until MAX is greater then the estimate (or -1
+   means infinity).  Corresponding ALG is used then.
+   When NOALIGN is true the code guaranting the alignment of the memory
+   block is skipped.
+
+   For example initializer:
+    {{256, loop}, {-1, rep_prefix_4_byte}}
+   will use loop for blocks smaller or equal to 256 bytes, rep prefix will
+   be used otherwise.  */
+struct stringop_algs
+{
+  const enum stringop_alg unknown_size;
+  const struct stringop_strategy {
+    const int max;
+    const enum stringop_alg alg;
+    int noalign;
+  } size [MAX_STRINGOP_ALGS];
+};
+
+/* Define the specific costs for a given cpu */
+
+struct processor_costs {
+  const int add;		/* cost of an add instruction */
+  const int lea;		/* cost of a lea instruction */
+  const int shift_var;		/* variable shift costs */
+  const int shift_const;	/* constant shift costs */
+  const int mult_init[5];	/* cost of starting a multiply
+				   in QImode, HImode, SImode, DImode, TImode*/
+  const int mult_bit;		/* cost of multiply per each bit set */
+  const int divide[5];		/* cost of a divide/mod
+				   in QImode, HImode, SImode, DImode, TImode*/
+  int movsx;			/* The cost of movsx operation.  */
+  int movzx;			/* The cost of movzx operation.  */
+  const int large_insn;		/* insns larger than this cost more */
+  const int move_ratio;		/* The threshold of number of scalar
+				   memory-to-memory move insns.  */
+  const int movzbl_load;	/* cost of loading using movzbl */
+  const int int_load[3];	/* cost of loading integer registers
+				   in QImode, HImode and SImode relative
+				   to reg-reg move (2).  */
+  const int int_store[3];	/* cost of storing integer register
+				   in QImode, HImode and SImode */
+  const int fp_move;		/* cost of reg,reg fld/fst */
+  const int fp_load[3];		/* cost of loading FP register
+				   in SFmode, DFmode and XFmode */
+  const int fp_store[3];	/* cost of storing FP register
+				   in SFmode, DFmode and XFmode */
+  const int mmx_move;		/* cost of moving MMX register.  */
+  const int mmx_load[2];	/* cost of loading MMX register
+				   in SImode and DImode */
+  const int mmx_store[2];	/* cost of storing MMX register
+				   in SImode and DImode */
+  const int sse_move;		/* cost of moving SSE register.  */
+  const int sse_load[3];	/* cost of loading SSE register
+				   in SImode, DImode and TImode*/
+  const int sse_store[3];	/* cost of storing SSE register
+				   in SImode, DImode and TImode*/
+  const int mmxsse_to_integer;	/* cost of moving mmxsse register to
+				   integer and vice versa.  */
+  const int l1_cache_size;	/* size of l1 cache, in kilobytes.  */
+  const int l2_cache_size;	/* size of l2 cache, in kilobytes.  */
+  const int prefetch_block;	/* bytes moved to cache for prefetch.  */
+  const int simultaneous_prefetches; /* number of parallel prefetch
+				   operations.  */
+  const int branch_cost;	/* Default value for BRANCH_COST.  */
+  const int fadd;		/* cost of FADD and FSUB instructions.  */
+  const int fmul;		/* cost of FMUL instruction.  */
+  const int fdiv;		/* cost of FDIV instruction.  */
+  const int fabs;		/* cost of FABS instruction.  */
+  const int fchs;		/* cost of FCHS instruction.  */
+  const int fsqrt;		/* cost of FSQRT instruction.  */
+				/* Specify what algorithm
+				   to use for stringops on unknown size.  */
+  struct stringop_algs *memcpy, *memset;
+  const int scalar_stmt_cost;   /* Cost of any scalar operation, excluding
+				   load and store.  */
+  const int scalar_load_cost;   /* Cost of scalar load.  */
+  const int scalar_store_cost;  /* Cost of scalar store.  */
+  const int vec_stmt_cost;      /* Cost of any vector operation, excluding
+                                   load, store, vector-to-scalar and
+                                   scalar-to-vector operation.  */
+  const int vec_to_scalar_cost;    /* Cost of vect-to-scalar operation.  */
+  const int scalar_to_vec_cost;    /* Cost of scalar-to-vector operation.  */
+  const int vec_align_load_cost;   /* Cost of aligned vector load.  */
+  const int vec_unalign_load_cost; /* Cost of unaligned vector load.  */
+  const int vec_store_cost;        /* Cost of vector store.  */
+  const int cond_taken_branch_cost;    /* Cost of taken branch for vectorizer
+					  cost model.  */
+  const int cond_not_taken_branch_cost;/* Cost of not taken branch for
+					  vectorizer cost model.  */
+};
+
+extern const struct processor_costs *ix86_cost;
+extern const struct processor_costs ix86_size_cost;
+
+#define ix86_cur_cost() \
+  (optimize_insn_for_size_p () ? &ix86_size_cost: ix86_cost)
+
+/* Macros used in the machine description to test the flags.  */
+
+/* configure can arrange to change it.  */
+
+#ifndef TARGET_CPU_DEFAULT
+#define TARGET_CPU_DEFAULT PROCESSOR_GENERIC
+#endif
+
+#ifndef TARGET_FPMATH_DEFAULT
+#define TARGET_FPMATH_DEFAULT \
+  (TARGET_64BIT && TARGET_SSE ? FPMATH_SSE : FPMATH_387)
+#endif
+
+#ifndef TARGET_FPMATH_DEFAULT_P
+#define TARGET_FPMATH_DEFAULT_P(x) \
+  (TARGET_64BIT_P(x) && TARGET_SSE_P(x) ? FPMATH_SSE : FPMATH_387)
+#endif
+
+#define TARGET_FLOAT_RETURNS_IN_80387 TARGET_FLOAT_RETURNS
+#define TARGET_FLOAT_RETURNS_IN_80387_P(x) TARGET_FLOAT_RETURNS_P(x)
+
+/* 64bit Sledgehammer mode.  For libgcc2 we make sure this is a
+   compile-time constant.  */
+#ifdef IN_LIBGCC2
+#undef TARGET_64BIT
+#ifdef __x86_64__
+#define TARGET_64BIT 1
+#else
+#define TARGET_64BIT 0
+#endif
+#else
+#ifndef TARGET_BI_ARCH
+#undef TARGET_64BIT
+#undef TARGET_64BIT_P
+#if TARGET_64BIT_DEFAULT
+#define TARGET_64BIT 1
+#define TARGET_64BIT_P(x) 1
+#else
+#define TARGET_64BIT 0
+#define TARGET_64BIT_P(x) 0
+#endif
+#endif
+#endif
+
+#define HAS_LONG_COND_BRANCH 1
+#define HAS_LONG_UNCOND_BRANCH 1
+
+#define TARGET_386 (ix86_tune == PROCESSOR_I386)
+#define TARGET_486 (ix86_tune == PROCESSOR_I486)
+#define TARGET_PENTIUM (ix86_tune == PROCESSOR_PENTIUM)
+#define TARGET_PENTIUMPRO (ix86_tune == PROCESSOR_PENTIUMPRO)
+#define TARGET_GEODE (ix86_tune == PROCESSOR_GEODE)
+#define TARGET_K6 (ix86_tune == PROCESSOR_K6)
+#define TARGET_ATHLON (ix86_tune == PROCESSOR_ATHLON)
+#define TARGET_PENTIUM4 (ix86_tune == PROCESSOR_PENTIUM4)
+#define TARGET_K8 (ix86_tune == PROCESSOR_K8)
+#define TARGET_ATHLON_K8 (TARGET_K8 || TARGET_ATHLON)
+#define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA)
+#define TARGET_CORE2 (ix86_tune == PROCESSOR_CORE2)
+#define TARGET_NEHALEM (ix86_tune == PROCESSOR_NEHALEM)
+#define TARGET_SANDYBRIDGE (ix86_tune == PROCESSOR_SANDYBRIDGE)
+#define TARGET_HASWELL (ix86_tune == PROCESSOR_HASWELL)
+#define TARGET_BONNELL (ix86_tune == PROCESSOR_BONNELL)
+#define TARGET_SILVERMONT (ix86_tune == PROCESSOR_SILVERMONT)
+#define TARGET_INTEL (ix86_tune == PROCESSOR_INTEL)
+#define TARGET_GENERIC (ix86_tune == PROCESSOR_GENERIC)
+#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
+#define TARGET_BDVER1 (ix86_tune == PROCESSOR_BDVER1)
+#define TARGET_BDVER2 (ix86_tune == PROCESSOR_BDVER2)
+#define TARGET_BDVER3 (ix86_tune == PROCESSOR_BDVER3)
+#define TARGET_BDVER4 (ix86_tune == PROCESSOR_BDVER4)
+#define TARGET_BTVER1 (ix86_tune == PROCESSOR_BTVER1)
+#define TARGET_BTVER2 (ix86_tune == PROCESSOR_BTVER2)
+
+/* Feature tests against the various tunings.  */
+enum ix86_tune_indices {
+#undef DEF_TUNE
+#define DEF_TUNE(tune, name, selector) tune,
+#include "x86-tune.def"
+#undef DEF_TUNE
+X86_TUNE_LAST
+};
+
+extern unsigned char ix86_tune_features[X86_TUNE_LAST];
+
+#define TARGET_USE_LEAVE	ix86_tune_features[X86_TUNE_USE_LEAVE]
+#define TARGET_PUSH_MEMORY	ix86_tune_features[X86_TUNE_PUSH_MEMORY]
+#define TARGET_ZERO_EXTEND_WITH_AND \
+	ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
+#define TARGET_UNROLL_STRLEN	ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
+#define TARGET_BRANCH_PREDICTION_HINTS \
+	ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
+#define TARGET_DOUBLE_WITH_ADD	ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD]
+#define TARGET_USE_SAHF		ix86_tune_features[X86_TUNE_USE_SAHF]
+#define TARGET_MOVX		ix86_tune_features[X86_TUNE_MOVX]
+#define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL]
+#define TARGET_PARTIAL_FLAG_REG_STALL \
+	ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL]
+#define TARGET_LCP_STALL \
+	ix86_tune_features[X86_TUNE_LCP_STALL]
+#define TARGET_USE_HIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP]
+#define TARGET_USE_SIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP]
+#define TARGET_USE_MOV0		ix86_tune_features[X86_TUNE_USE_MOV0]
+#define TARGET_USE_CLTD		ix86_tune_features[X86_TUNE_USE_CLTD]
+#define TARGET_USE_XCHGB	ix86_tune_features[X86_TUNE_USE_XCHGB]
+#define TARGET_SPLIT_LONG_MOVES	ix86_tune_features[X86_TUNE_SPLIT_LONG_MOVES]
+#define TARGET_READ_MODIFY_WRITE ix86_tune_features[X86_TUNE_READ_MODIFY_WRITE]
+#define TARGET_READ_MODIFY	ix86_tune_features[X86_TUNE_READ_MODIFY]
+#define TARGET_PROMOTE_QImode	ix86_tune_features[X86_TUNE_PROMOTE_QIMODE]
+#define TARGET_FAST_PREFIX	ix86_tune_features[X86_TUNE_FAST_PREFIX]
+#define TARGET_SINGLE_STRINGOP	ix86_tune_features[X86_TUNE_SINGLE_STRINGOP]
+#define TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES \
+	ix86_tune_features[X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES]
+#define TARGET_QIMODE_MATH	ix86_tune_features[X86_TUNE_QIMODE_MATH]
+#define TARGET_HIMODE_MATH	ix86_tune_features[X86_TUNE_HIMODE_MATH]
+#define TARGET_PROMOTE_QI_REGS	ix86_tune_features[X86_TUNE_PROMOTE_QI_REGS]
+#define TARGET_PROMOTE_HI_REGS	ix86_tune_features[X86_TUNE_PROMOTE_HI_REGS]
+#define TARGET_SINGLE_POP	ix86_tune_features[X86_TUNE_SINGLE_POP]
+#define TARGET_DOUBLE_POP	ix86_tune_features[X86_TUNE_DOUBLE_POP]
+#define TARGET_SINGLE_PUSH	ix86_tune_features[X86_TUNE_SINGLE_PUSH]
+#define TARGET_DOUBLE_PUSH	ix86_tune_features[X86_TUNE_DOUBLE_PUSH]
+#define TARGET_INTEGER_DFMODE_MOVES \
+	ix86_tune_features[X86_TUNE_INTEGER_DFMODE_MOVES]
+#define TARGET_PARTIAL_REG_DEPENDENCY \
+	ix86_tune_features[X86_TUNE_PARTIAL_REG_DEPENDENCY]
+#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
+	ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY]
+#define TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
+	ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
+#define TARGET_SSE_UNALIGNED_STORE_OPTIMAL \
+	ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL]
+#define TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL \
+	ix86_tune_features[X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL]
+#define TARGET_SSE_SPLIT_REGS	ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
+#define TARGET_SSE_TYPELESS_STORES \
+	ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
+#define TARGET_SSE_LOAD0_BY_PXOR ix86_tune_features[X86_TUNE_SSE_LOAD0_BY_PXOR]
+#define TARGET_MEMORY_MISMATCH_STALL \
+	ix86_tune_features[X86_TUNE_MEMORY_MISMATCH_STALL]
+#define TARGET_PROLOGUE_USING_MOVE \
+	ix86_tune_features[X86_TUNE_PROLOGUE_USING_MOVE]
+#define TARGET_EPILOGUE_USING_MOVE \
+	ix86_tune_features[X86_TUNE_EPILOGUE_USING_MOVE]
+#define TARGET_SHIFT1		ix86_tune_features[X86_TUNE_SHIFT1]
+#define TARGET_USE_FFREEP	ix86_tune_features[X86_TUNE_USE_FFREEP]
+#define TARGET_INTER_UNIT_MOVES_TO_VEC \
+	ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES_TO_VEC]
+#define TARGET_INTER_UNIT_MOVES_FROM_VEC \
+	ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES_FROM_VEC]
+#define TARGET_INTER_UNIT_CONVERSIONS \
+	ix86_tune_features[X86_TUNE_INTER_UNIT_CONVERSIONS]
+#define TARGET_FOUR_JUMP_LIMIT	ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT]
+#define TARGET_SCHEDULE		ix86_tune_features[X86_TUNE_SCHEDULE]
+#define TARGET_USE_BT		ix86_tune_features[X86_TUNE_USE_BT]
+#define TARGET_USE_INCDEC	ix86_tune_features[X86_TUNE_USE_INCDEC]
+#define TARGET_PAD_RETURNS	ix86_tune_features[X86_TUNE_PAD_RETURNS]
+#define TARGET_PAD_SHORT_FUNCTION \
+	ix86_tune_features[X86_TUNE_PAD_SHORT_FUNCTION]
+#define TARGET_EXT_80387_CONSTANTS \
+	ix86_tune_features[X86_TUNE_EXT_80387_CONSTANTS]
+#define TARGET_AVOID_VECTOR_DECODE \
+	ix86_tune_features[X86_TUNE_AVOID_VECTOR_DECODE]
+#define TARGET_TUNE_PROMOTE_HIMODE_IMUL \
+	ix86_tune_features[X86_TUNE_PROMOTE_HIMODE_IMUL]
+#define TARGET_SLOW_IMUL_IMM32_MEM \
+	ix86_tune_features[X86_TUNE_SLOW_IMUL_IMM32_MEM]
+#define TARGET_SLOW_IMUL_IMM8	ix86_tune_features[X86_TUNE_SLOW_IMUL_IMM8]
+#define	TARGET_MOVE_M1_VIA_OR	ix86_tune_features[X86_TUNE_MOVE_M1_VIA_OR]
+#define TARGET_NOT_UNPAIRABLE	ix86_tune_features[X86_TUNE_NOT_UNPAIRABLE]
+#define TARGET_NOT_VECTORMODE	ix86_tune_features[X86_TUNE_NOT_VECTORMODE]
+#define TARGET_USE_VECTOR_FP_CONVERTS \
+	ix86_tune_features[X86_TUNE_USE_VECTOR_FP_CONVERTS]
+#define TARGET_USE_VECTOR_CONVERTS \
+	ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
+#define TARGET_FUSE_CMP_AND_BRANCH_32 \
+	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_32]
+#define TARGET_FUSE_CMP_AND_BRANCH_64 \
+	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_64]
+#define TARGET_FUSE_CMP_AND_BRANCH \
+	(TARGET_64BIT ? TARGET_FUSE_CMP_AND_BRANCH_64 \
+	 : TARGET_FUSE_CMP_AND_BRANCH_32)
+#define TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS \
+	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
+#define TARGET_FUSE_ALU_AND_BRANCH \
+	ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
+#define TARGET_AVOID_LEA_FOR_ADDR \
+	ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
+#define TARGET_VECTORIZE_DOUBLE \
+	ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE]
+#define TARGET_SOFTWARE_PREFETCHING_BENEFICIAL \
+	ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
+#define TARGET_AVX128_OPTIMAL \
+	ix86_tune_features[X86_TUNE_AVX128_OPTIMAL]
+#define TARGET_REASSOC_INT_TO_PARALLEL \
+	ix86_tune_features[X86_TUNE_REASSOC_INT_TO_PARALLEL]
+#define TARGET_REASSOC_FP_TO_PARALLEL \
+	ix86_tune_features[X86_TUNE_REASSOC_FP_TO_PARALLEL]
+#define TARGET_GENERAL_REGS_SSE_SPILL \
+	ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL]
+#define TARGET_AVOID_MEM_OPND_FOR_CMOVE \
+	ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE]
+#define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \
+	ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS]
+#define TARGET_ADJUST_UNROLL \
+    ix86_tune_features[X86_TUNE_ADJUST_UNROLL]
+
+/* Feature tests against the various architecture variations.  */
+enum ix86_arch_indices {
+  X86_ARCH_CMOV,
+  X86_ARCH_CMPXCHG,
+  X86_ARCH_CMPXCHG8B,
+  X86_ARCH_XADD,
+  X86_ARCH_BSWAP,
+
+  X86_ARCH_LAST
+};
+
+extern unsigned char ix86_arch_features[X86_ARCH_LAST];
+
+#define TARGET_CMOV		ix86_arch_features[X86_ARCH_CMOV]
+#define TARGET_CMPXCHG		ix86_arch_features[X86_ARCH_CMPXCHG]
+#define TARGET_CMPXCHG8B	ix86_arch_features[X86_ARCH_CMPXCHG8B]
+#define TARGET_XADD		ix86_arch_features[X86_ARCH_XADD]
+#define TARGET_BSWAP		ix86_arch_features[X86_ARCH_BSWAP]
+
+/* For sane SSE instruction set generation we need fcomi instruction.
+   It is safe to enable all CMOVE instructions.  Also, RDRAND intrinsic
+   expands to a sequence that includes conditional move. */
+#define TARGET_CMOVE		(TARGET_CMOV || TARGET_SSE || TARGET_RDRND)
+
+#define TARGET_FISTTP		(TARGET_SSE3 && TARGET_80387)
+
+extern unsigned char x86_prefetch_sse;
+#define TARGET_PREFETCH_SSE	x86_prefetch_sse
+
+#define ASSEMBLER_DIALECT	(ix86_asm_dialect)
+
+#define TARGET_SSE_MATH		((ix86_fpmath & FPMATH_SSE) != 0)
+#define TARGET_MIX_SSE_I387 \
+ ((ix86_fpmath & (FPMATH_SSE | FPMATH_387)) == (FPMATH_SSE | FPMATH_387))
+
+#define TARGET_GNU_TLS		(ix86_tls_dialect == TLS_DIALECT_GNU)
+#define TARGET_GNU2_TLS		(ix86_tls_dialect == TLS_DIALECT_GNU2)
+#define TARGET_ANY_GNU_TLS	(TARGET_GNU_TLS || TARGET_GNU2_TLS)
+#define TARGET_SUN_TLS		0
+
+#ifndef TARGET_64BIT_DEFAULT
+#define TARGET_64BIT_DEFAULT 0
+#endif
+#ifndef TARGET_TLS_DIRECT_SEG_REFS_DEFAULT
+#define TARGET_TLS_DIRECT_SEG_REFS_DEFAULT 0
+#endif
+
+#define TARGET_SSP_GLOBAL_GUARD (ix86_stack_protector_guard == SSP_GLOBAL)
+#define TARGET_SSP_TLS_GUARD    (ix86_stack_protector_guard == SSP_TLS)
+
+/* Fence to use after loop using storent.  */
+
+extern tree x86_mfence;
+#define FENCE_FOLLOWING_MOVNT x86_mfence
+
+/* Once GDB has been enhanced to deal with functions without frame
+   pointers, we can change this to allow for elimination of
+   the frame pointer in leaf functions.  */
+#define TARGET_DEFAULT 0
+
+/* Extra bits to force.  */
+#define TARGET_SUBTARGET_DEFAULT 0
+#define TARGET_SUBTARGET_ISA_DEFAULT 0
+
+/* Extra bits to force on w/ 32-bit mode.  */
+#define TARGET_SUBTARGET32_DEFAULT 0
+#define TARGET_SUBTARGET32_ISA_DEFAULT 0
+
+/* Extra bits to force on w/ 64-bit mode.  */
+#define TARGET_SUBTARGET64_DEFAULT 0
+#define TARGET_SUBTARGET64_ISA_DEFAULT 0
+
+/* Replace MACH-O, ifdefs by in-line tests, where possible. 
+   (a) Macros defined in config/i386/darwin.h  */
+#define TARGET_MACHO 0
+#define TARGET_MACHO_BRANCH_ISLANDS 0
+#define MACHOPIC_ATT_STUB 0
+/* (b) Macros defined in config/darwin.h  */
+#define MACHO_DYNAMIC_NO_PIC_P 0
+#define MACHOPIC_INDIRECT 0
+#define MACHOPIC_PURE 0
+
+/* For the RDOS  */
+#define TARGET_RDOS 0
+
+/* For the Windows 64-bit ABI.  */
+#define TARGET_64BIT_MS_ABI (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
+
+/* For the Windows 32-bit ABI.  */
+#define TARGET_32BIT_MS_ABI (!TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
+
+/* This is re-defined by cygming.h.  */
+#define TARGET_SEH 0
+
+/* This is re-defined by cygming.h.  */
+#define TARGET_PECOFF 0
+
+/* The default abi used by target.  */
+#define DEFAULT_ABI SYSV_ABI
+
+/* The default TLS segment register used by target.  */
+#define DEFAULT_TLS_SEG_REG (TARGET_64BIT ? SEG_FS : SEG_GS)
+
+/* Subtargets may reset this to 1 in order to enable 96-bit long double
+   with the rounding mode forced to 53 bits.  */
+#define TARGET_96_ROUND_53_LONG_DOUBLE 0
+
+/* -march=native handling only makes sense with compiler running on
+   an x86 or x86_64 chip.  If changing this condition, also change
+   the condition in driver-i386.c.  */
+#if defined(__i386__) || defined(__x86_64__)
+/* In driver-i386.c.  */
+extern const char *host_detect_local_cpu (int argc, const char **argv);
+#define EXTRA_SPEC_FUNCTIONS \
+  { "local_cpu_detect", host_detect_local_cpu },
+#define HAVE_LOCAL_CPU_DETECT
+#endif
+
+#if TARGET_64BIT_DEFAULT
+#define OPT_ARCH64 "!m32"
+#define OPT_ARCH32 "m32"
+#else
+#define OPT_ARCH64 "m64|mx32"
+#define OPT_ARCH32 "m64|mx32:;"
+#endif
+
+/* Support for configure-time defaults of some command line options.
+   The order here is important so that -march doesn't squash the
+   tune or cpu values.  */
+#define OPTION_DEFAULT_SPECS					   \
+  {"tune", "%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}" }, \
+  {"tune_32", "%{" OPT_ARCH32 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \
+  {"tune_64", "%{" OPT_ARCH64 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \
+  {"cpu", "%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}" },  \
+  {"cpu_32", "%{" OPT_ARCH32 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \
+  {"cpu_64", "%{" OPT_ARCH64 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \
+  {"arch", "%{!march=*:-march=%(VALUE)}"},			   \
+  {"arch_32", "%{" OPT_ARCH32 ":%{!march=*:-march=%(VALUE)}}"},	   \
+  {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"},
+
+/* Specs for the compiler proper */
+
+#ifndef CC1_CPU_SPEC
+#define CC1_CPU_SPEC_1 ""
+
+#ifndef HAVE_LOCAL_CPU_DETECT
+#define CC1_CPU_SPEC CC1_CPU_SPEC_1
+#else
+#define CC1_CPU_SPEC CC1_CPU_SPEC_1 \
+"%{march=native:%>march=native %:local_cpu_detect(arch) \
+  %{!mtune=*:%>mtune=native %:local_cpu_detect(tune)}} \
+%{mtune=native:%>mtune=native %:local_cpu_detect(tune)}"
+#endif
+#endif
+
+/* Target CPU builtins.  */
+#define TARGET_CPU_CPP_BUILTINS() ix86_target_macros ()
+
+/* Target Pragmas.  */
+#define REGISTER_TARGET_PRAGMAS() ix86_register_pragmas ()
+
+#ifndef CC1_SPEC
+#define CC1_SPEC "%(cc1_cpu) "
+#endif
+
+/* This macro defines names of additional specifications to put in the
+   specs that can be used in various specifications like CC1_SPEC.  Its
+   definition is an initializer with a subgrouping for each command option.
+
+   Each subgrouping contains a string constant, that defines the
+   specification name, and a string constant that used by the GCC driver
+   program.
+
+   Do not define this macro if it does not need to do anything.  */
+
+#ifndef SUBTARGET_EXTRA_SPECS
+#define SUBTARGET_EXTRA_SPECS
+#endif
+
+#define EXTRA_SPECS							\
+  { "cc1_cpu",  CC1_CPU_SPEC },						\
+  SUBTARGET_EXTRA_SPECS
+
+
+/* Set the value of FLT_EVAL_METHOD in float.h.  When using only the
+   FPU, assume that the fpcw is set to extended precision; when using
+   only SSE, rounding is correct; when using both SSE and the FPU,
+   the rounding precision is indeterminate, since either may be chosen
+   apparently at random.  */
+#define TARGET_FLT_EVAL_METHOD \
+  (TARGET_MIX_SSE_I387 ? -1 : TARGET_SSE_MATH ? 0 : 2)
+
+/* Whether to allow x87 floating-point arithmetic on MODE (one of
+   SFmode, DFmode and XFmode) in the current excess precision
+   configuration.  */
+#define X87_ENABLE_ARITH(MODE) \
+  (flag_excess_precision == EXCESS_PRECISION_FAST || (MODE) == XFmode)
+
+/* Likewise, whether to allow direct conversions from integer mode
+   IMODE (HImode, SImode or DImode) to MODE.  */
+#define X87_ENABLE_FLOAT(MODE, IMODE)			\
+  (flag_excess_precision == EXCESS_PRECISION_FAST	\
+   || (MODE) == XFmode					\
+   || ((MODE) == DFmode && (IMODE) == SImode)		\
+   || (IMODE) == HImode)
+
+/* target machine storage layout */
+
+#define SHORT_TYPE_SIZE 16
+#define INT_TYPE_SIZE 32
+#define LONG_TYPE_SIZE (TARGET_X32 ? 32 : BITS_PER_WORD)
+#define POINTER_SIZE (TARGET_X32 ? 32 : BITS_PER_WORD)
+#define LONG_LONG_TYPE_SIZE 64
+#define FLOAT_TYPE_SIZE 32
+#define DOUBLE_TYPE_SIZE 64
+#define LONG_DOUBLE_TYPE_SIZE \
+  (TARGET_LONG_DOUBLE_64 ? 64 : (TARGET_LONG_DOUBLE_128 ? 128 : 80))
+
+/* Define this to set long double type size to use in libgcc2.c, which can
+   not depend on target_flags.  */
+#ifdef __LONG_DOUBLE_64__
+#define LIBGCC2_LONG_DOUBLE_TYPE_SIZE 64
+#elif defined (__LONG_DOUBLE_128__)
+#define LIBGCC2_LONG_DOUBLE_TYPE_SIZE 128
+#else
+#define LIBGCC2_LONG_DOUBLE_TYPE_SIZE 80
+#endif
+
+#define WIDEST_HARDWARE_FP_SIZE 80
+
+#if defined (TARGET_BI_ARCH) || TARGET_64BIT_DEFAULT
+#define MAX_BITS_PER_WORD 64
+#else
+#define MAX_BITS_PER_WORD 32
+#endif
+
+/* Define this if most significant byte of a word is the lowest numbered.  */
+/* That is true on the 80386.  */
+
+#define BITS_BIG_ENDIAN 0
+
+/* Define this if most significant byte of a word is the lowest numbered.  */
+/* That is not true on the 80386.  */
+#define BYTES_BIG_ENDIAN 0
+
+/* Define this if most significant word of a multiword number is the lowest
+   numbered.  */
+/* Not true for 80386 */
+#define WORDS_BIG_ENDIAN 0
+
+/* Width of a word, in units (bytes).  */
+#define UNITS_PER_WORD		(TARGET_64BIT ? 8 : 4)
+
+#ifndef IN_LIBGCC2
+#define MIN_UNITS_PER_WORD	4
+#endif
+
+/* Allocation boundary (in *bits*) for storing arguments in argument list.  */
+#define PARM_BOUNDARY BITS_PER_WORD
+
+/* Boundary (in *bits*) on which stack pointer should be aligned.  */
+#define STACK_BOUNDARY \
+ (TARGET_64BIT && ix86_abi == MS_ABI ? 128 : BITS_PER_WORD)
+
+/* Stack boundary of the main function guaranteed by OS.  */
+#define MAIN_STACK_BOUNDARY (TARGET_64BIT ? 128 : 32)
+
+/* Minimum stack boundary.  */
+#define MIN_STACK_BOUNDARY (TARGET_64BIT ? (TARGET_SSE ? 128 : 64) : 32)
+
+/* Boundary (in *bits*) on which the stack pointer prefers to be
+   aligned; the compiler cannot rely on having this alignment.  */
+#define PREFERRED_STACK_BOUNDARY ix86_preferred_stack_boundary
+
+/* It should be MIN_STACK_BOUNDARY.  But we set it to 128 bits for
+   both 32bit and 64bit, to support codes that need 128 bit stack
+   alignment for SSE instructions, but can't realign the stack.  */
+#define PREFERRED_STACK_BOUNDARY_DEFAULT 128
+
+/* 1 if -mstackrealign should be turned on by default.  It will
+   generate an alternate prologue and epilogue that realigns the
+   runtime stack if nessary.  This supports mixing codes that keep a
+   4-byte aligned stack, as specified by i386 psABI, with codes that
+   need a 16-byte aligned stack, as required by SSE instructions.  */
+#define STACK_REALIGN_DEFAULT 0
+
+/* Boundary (in *bits*) on which the incoming stack is aligned.  */
+#define INCOMING_STACK_BOUNDARY ix86_incoming_stack_boundary
+
+/* According to Windows x64 software convention, the maximum stack allocatable
+   in the prologue is 4G - 8 bytes.  Furthermore, there is a limited set of
+   instructions allowed to adjust the stack pointer in the epilog, forcing the
+   use of frame pointer for frames larger than 2 GB.  This theorical limit
+   is reduced by 256, an over-estimated upper bound for the stack use by the
+   prologue.
+   We define only one threshold for both the prolog and the epilog.  When the
+   frame size is larger than this threshold, we allocate the area to save SSE
+   regs, then save them, and then allocate the remaining.  There is no SEH
+   unwind info for this later allocation.  */
+#define SEH_MAX_FRAME_SIZE ((2U << 30) - 256)
+
+/* Target OS keeps a vector-aligned (128-bit, 16-byte) stack.  This is
+   mandatory for the 64-bit ABI, and may or may not be true for other
+   operating systems.  */
+#define TARGET_KEEPS_VECTOR_ALIGNED_STACK TARGET_64BIT
+
+/* Minimum allocation boundary for the code of a function.  */
+#define FUNCTION_BOUNDARY 8
+
+/* C++ stores the virtual bit in the lowest bit of function pointers.  */
+#define TARGET_PTRMEMFUNC_VBIT_LOCATION ptrmemfunc_vbit_in_pfn
+
+/* Minimum size in bits of the largest boundary to which any
+   and all fundamental data types supported by the hardware
+   might need to be aligned. No data type wants to be aligned
+   rounder than this.
+
+   Pentium+ prefers DFmode values to be aligned to 64 bit boundary
+   and Pentium Pro XFmode values at 128 bit boundaries.  */
+
+#define BIGGEST_ALIGNMENT \
+  (TARGET_AVX512F ? 512 : (TARGET_AVX ? 256 : 128))
+
+/* Maximum stack alignment.  */
+#define MAX_STACK_ALIGNMENT MAX_OFILE_ALIGNMENT
+
+/* Alignment value for attribute ((aligned)).  It is a constant since
+   it is the part of the ABI.  We shouldn't change it with -mavx.  */
+#define ATTRIBUTE_ALIGNED_VALUE 128
+
+/* Decide whether a variable of mode MODE should be 128 bit aligned.  */
+#define ALIGN_MODE_128(MODE) \
+ ((MODE) == XFmode || SSE_REG_MODE_P (MODE))
+
+/* The published ABIs say that doubles should be aligned on word
+   boundaries, so lower the alignment for structure fields unless
+   -malign-double is set.  */
+
+/* ??? Blah -- this macro is used directly by libobjc.  Since it
+   supports no vector modes, cut out the complexity and fall back
+   on BIGGEST_FIELD_ALIGNMENT.  */
+#ifdef IN_TARGET_LIBS
+#ifdef __x86_64__
+#define BIGGEST_FIELD_ALIGNMENT 128
+#else
+#define BIGGEST_FIELD_ALIGNMENT 32
+#endif
+#else
+#define ADJUST_FIELD_ALIGN(FIELD, COMPUTED) \
+   x86_field_alignment (FIELD, COMPUTED)
+#endif
+
+/* If defined, a C expression to compute the alignment given to a
+   constant that is being placed in memory.  EXP is the constant
+   and ALIGN is the alignment that the object would ordinarily have.
+   The value of this macro is used instead of that alignment to align
+   the object.
+
+   If this macro is not defined, then ALIGN is used.
+
+   The typical use of this macro is to increase alignment for string
+   constants to be word aligned so that `strcpy' calls that copy
+   constants can be done inline.  */
+
+#define CONSTANT_ALIGNMENT(EXP, ALIGN) ix86_constant_alignment ((EXP), (ALIGN))
+
+/* If defined, a C expression to compute the alignment for a static
+   variable.  TYPE is the data type, and ALIGN is the alignment that
+   the object would ordinarily have.  The value of this macro is used
+   instead of that alignment to align the object.
+
+   If this macro is not defined, then ALIGN is used.
+
+   One use of this macro is to increase alignment of medium-size
+   data to make it all fit in fewer cache lines.  Another is to
+   cause character arrays to be word-aligned so that `strcpy' calls
+   that copy constants to character arrays can be done inline.  */
+
+#define DATA_ALIGNMENT(TYPE, ALIGN) \
+  ix86_data_alignment ((TYPE), (ALIGN), true)
+
+/* Similar to DATA_ALIGNMENT, but for the cases where the ABI mandates
+   some alignment increase, instead of optimization only purposes.  E.g.
+   AMD x86-64 psABI says that variables with array type larger than 15 bytes
+   must be aligned to 16 byte boundaries.
+
+   If this macro is not defined, then ALIGN is used.  */
+
+#define DATA_ABI_ALIGNMENT(TYPE, ALIGN) \
+  ix86_data_alignment ((TYPE), (ALIGN), false)
+
+/* If defined, a C expression to compute the alignment for a local
+   variable.  TYPE is the data type, and ALIGN is the alignment that
+   the object would ordinarily have.  The value of this macro is used
+   instead of that alignment to align the object.
+
+   If this macro is not defined, then ALIGN is used.
+
+   One use of this macro is to increase alignment of medium-size
+   data to make it all fit in fewer cache lines.  */
+
+#define LOCAL_ALIGNMENT(TYPE, ALIGN) \
+  ix86_local_alignment ((TYPE), VOIDmode, (ALIGN))
+
+/* If defined, a C expression to compute the alignment for stack slot.
+   TYPE is the data type, MODE is the widest mode available, and ALIGN
+   is the alignment that the slot would ordinarily have.  The value of
+   this macro is used instead of that alignment to align the slot.
+
+   If this macro is not defined, then ALIGN is used when TYPE is NULL,
+   Otherwise, LOCAL_ALIGNMENT will be used.
+
+   One use of this macro is to set alignment of stack slot to the
+   maximum alignment of all possible modes which the slot may have.  */
+
+#define STACK_SLOT_ALIGNMENT(TYPE, MODE, ALIGN) \
+  ix86_local_alignment ((TYPE), (MODE), (ALIGN))
+
+/* If defined, a C expression to compute the alignment for a local
+   variable DECL.
+
+   If this macro is not defined, then
+   LOCAL_ALIGNMENT (TREE_TYPE (DECL), DECL_ALIGN (DECL)) will be used.
+
+   One use of this macro is to increase alignment of medium-size
+   data to make it all fit in fewer cache lines.  */
+
+#define LOCAL_DECL_ALIGNMENT(DECL) \
+  ix86_local_alignment ((DECL), VOIDmode, DECL_ALIGN (DECL))
+
+/* If defined, a C expression to compute the minimum required alignment
+   for dynamic stack realignment purposes for EXP (a TYPE or DECL),
+   MODE, assuming normal alignment ALIGN.
+
+   If this macro is not defined, then (ALIGN) will be used.  */
+
+#define MINIMUM_ALIGNMENT(EXP, MODE, ALIGN) \
+  ix86_minimum_alignment (EXP, MODE, ALIGN)
+
+
+/* Set this nonzero if move instructions will actually fail to work
+   when given unaligned data.  */
+#define STRICT_ALIGNMENT 0
+
+/* If bit field type is int, don't let it cross an int,
+   and give entire struct the alignment of an int.  */
+/* Required on the 386 since it doesn't have bit-field insns.  */
+#define PCC_BITFIELD_TYPE_MATTERS 1
+
+/* Standard register usage.  */
+
+/* This processor has special stack-like registers.  See reg-stack.c
+   for details.  */
+
+#define STACK_REGS
+
+#define IS_STACK_MODE(MODE)					\
+  (((MODE) == SFmode && !(TARGET_SSE && TARGET_SSE_MATH))	\
+   || ((MODE) == DFmode && !(TARGET_SSE2 && TARGET_SSE_MATH))	\
+   || (MODE) == XFmode)
+
+/* Number of actual hardware registers.
+   The hardware registers are assigned numbers for the compiler
+   from 0 to just below FIRST_PSEUDO_REGISTER.
+   All registers that the compiler knows about must be given numbers,
+   even those that are not normally considered general registers.
+
+   In the 80386 we give the 8 general purpose registers the numbers 0-7.
+   We number the floating point registers 8-15.
+   Note that registers 0-7 can be accessed as a  short or int,
+   while only 0-3 may be used with byte `mov' instructions.
+
+   Reg 16 does not correspond to any hardware register, but instead
+   appears in the RTL as an argument pointer prior to reload, and is
+   eliminated during reloading in favor of either the stack or frame
+   pointer.  */
+
+#define FIRST_PSEUDO_REGISTER 77
+
+/* Number of hardware registers that go into the DWARF-2 unwind info.
+   If not defined, equals FIRST_PSEUDO_REGISTER.  */
+
+#define DWARF_FRAME_REGISTERS 17
+
+/* 1 for registers that have pervasive standard uses
+   and are not available for the register allocator.
+   On the 80386, the stack pointer is such, as is the arg pointer.
+
+   REX registers are disabled for 32bit targets in
+   TARGET_CONDITIONAL_REGISTER_USAGE.  */
+
+#define FIXED_REGISTERS						\
+/*ax,dx,cx,bx,si,di,bp,sp,st,st1,st2,st3,st4,st5,st6,st7*/	\
+{  0, 0, 0, 0, 0, 0, 0, 1, 0,  0,  0,  0,  0,  0,  0,  0,	\
+/*arg,flags,fpsr,fpcr,frame*/					\
+    1,    1,   1,   1,    1,					\
+/*xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7*/			\
+     0,   0,   0,   0,   0,   0,   0,   0,			\
+/* mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7*/			\
+     0,   0,   0,   0,   0,   0,   0,   0,			\
+/*  r8,  r9, r10, r11, r12, r13, r14, r15*/			\
+     0,   0,   0,   0,   0,   0,   0,   0,			\
+/*xmm8,xmm9,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15*/		\
+     0,   0,    0,    0,    0,    0,    0,    0,		\
+/*xmm16,xmm17,xmm18,xmm19,xmm20,xmm21,xmm22,xmm23*/		\
+     0,   0,    0,    0,    0,    0,    0,    0,		\
+/*xmm24,xmm25,xmm26,xmm27,xmm28,xmm29,xmm30,xmm31*/		\
+     0,   0,    0,    0,    0,    0,    0,    0,		\
+/*  k0,  k1, k2, k3, k4, k5, k6, k7*/				\
+     0,  0,   0,  0,  0,  0,  0,  0 }
+
+/* 1 for registers not available across function calls.
+   These must include the FIXED_REGISTERS and also any
+   registers that can be used without being saved.
+   The latter must include the registers where values are returned
+   and the register where structure-value addresses are passed.
+   Aside from that, you can include as many other registers as you like.
+
+   Value is set to 1 if the register is call used unconditionally.
+   Bit one is set if the register is call used on TARGET_32BIT ABI.
+   Bit two is set if the register is call used on TARGET_64BIT ABI.
+   Bit three is set if the register is call used on TARGET_64BIT_MS_ABI.
+
+   Proper values are computed in TARGET_CONDITIONAL_REGISTER_USAGE.  */
+
+#define CALL_USED_REGISTERS					\
+/*ax,dx,cx,bx,si,di,bp,sp,st,st1,st2,st3,st4,st5,st6,st7*/	\
+{  1, 1, 1, 0, 4, 4, 0, 1, 1,  1,  1,  1,  1,  1,  1,  1,	\
+/*arg,flags,fpsr,fpcr,frame*/					\
+    1,   1,    1,   1,    1,					\
+/*xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7*/			\
+     1,   1,   1,   1,   1,   1,   6,   6,			\
+/* mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7*/			\
+     1,   1,   1,   1,   1,   1,   1,   1,			\
+/*  r8,  r9, r10, r11, r12, r13, r14, r15*/			\
+     1,   1,   1,   1,   2,   2,   2,   2,			\
+/*xmm8,xmm9,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15*/		\
+     6,   6,    6,    6,    6,    6,    6,    6,		\
+/*xmm16,xmm17,xmm18,xmm19,xmm20,xmm21,xmm22,xmm23*/		\
+     6,    6,     6,    6,    6,    6,    6,    6,		\
+/*xmm24,xmm25,xmm26,xmm27,xmm28,xmm29,xmm30,xmm31*/		\
+     6,    6,     6,    6,    6,    6,    6,    6,		\
+ /* k0,  k1,  k2,  k3,  k4,  k5,  k6,  k7*/			\
+     1,   1,   1,   1,   1,   1,   1,   1 }
+
+/* Order in which to allocate registers.  Each register must be
+   listed once, even those in FIXED_REGISTERS.  List frame pointer
+   late and fixed registers last.  Note that, in general, we prefer
+   registers listed in CALL_USED_REGISTERS, keeping the others
+   available for storage of persistent values.
+
+   The ADJUST_REG_ALLOC_ORDER actually overwrite the order,
+   so this is just empty initializer for array.  */
+
+#define REG_ALLOC_ORDER 					\
+{  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\
+   18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,	\
+   33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,  \
+   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,	\
+   63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76 }
+
+/* ADJUST_REG_ALLOC_ORDER is a macro which permits reg_alloc_order
+   to be rearranged based on a particular function.  When using sse math,
+   we want to allocate SSE before x87 registers and vice versa.  */
+
+#define ADJUST_REG_ALLOC_ORDER x86_order_regs_for_local_alloc ()
+
+
+#define OVERRIDE_ABI_FORMAT(FNDECL) ix86_call_abi_override (FNDECL)
+
+/* Return number of consecutive hard regs needed starting at reg REGNO
+   to hold something of mode MODE.
+   This is ordinarily the length in words of a value of mode MODE
+   but can be less for certain modes in special long registers.
+
+   Actually there are no two word move instructions for consecutive
+   registers.  And only registers 0-3 may have mov byte instructions
+   applied to them.  */
+
+#define HARD_REGNO_NREGS(REGNO, MODE)					\
+  (STACK_REGNO_P (REGNO) || SSE_REGNO_P (REGNO) || MMX_REGNO_P (REGNO)	\
+   ? (COMPLEX_MODE_P (MODE) ? 2 : 1)					\
+   : ((MODE) == XFmode							\
+      ? (TARGET_64BIT ? 2 : 3)						\
+      : (MODE) == XCmode						\
+      ? (TARGET_64BIT ? 4 : 6)						\
+      : ((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)))
+
+#define HARD_REGNO_NREGS_HAS_PADDING(REGNO, MODE)			\
+  ((TARGET_128BIT_LONG_DOUBLE && !TARGET_64BIT)				\
+   ? (STACK_REGNO_P (REGNO) || SSE_REGNO_P (REGNO) || MMX_REGNO_P (REGNO) \
+      ? 0								\
+      : ((MODE) == XFmode || (MODE) == XCmode))				\
+   : 0)
+
+#define HARD_REGNO_NREGS_WITH_PADDING(REGNO, MODE) ((MODE) == XFmode ? 4 : 8)
+
+#define VALID_AVX256_REG_MODE(MODE)					\
+  ((MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode	\
+   || (MODE) == V4DImode || (MODE) == V2TImode || (MODE) == V8SFmode	\
+   || (MODE) == V4DFmode)
+
+#define VALID_AVX256_REG_OR_OI_MODE(MODE)		\
+  (VALID_AVX256_REG_MODE (MODE) || (MODE) == OImode)
+
+#define VALID_AVX512F_SCALAR_MODE(MODE)					\
+  ((MODE) == DImode || (MODE) == DFmode || (MODE) == SImode		\
+   || (MODE) == SFmode)
+
+#define VALID_AVX512F_REG_MODE(MODE)					\
+  ((MODE) == V8DImode || (MODE) == V8DFmode || (MODE) == V64QImode	\
+   || (MODE) == V16SImode || (MODE) == V16SFmode || (MODE) == V32HImode)
+
+#define VALID_SSE2_REG_MODE(MODE)					\
+  ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode	\
+   || (MODE) == V2DImode || (MODE) == DFmode)
+
+#define VALID_SSE_REG_MODE(MODE)					\
+  ((MODE) == V1TImode || (MODE) == TImode				\
+   || (MODE) == V4SFmode || (MODE) == V4SImode				\
+   || (MODE) == SFmode || (MODE) == TFmode)
+
+#define VALID_MMX_REG_MODE_3DNOW(MODE) \
+  ((MODE) == V2SFmode || (MODE) == SFmode)
+
+#define VALID_MMX_REG_MODE(MODE)					\
+  ((MODE == V1DImode) || (MODE) == DImode				\
+   || (MODE) == V2SImode || (MODE) == SImode				\
+   || (MODE) == V4HImode || (MODE) == V8QImode)
+
+#define VALID_DFP_MODE_P(MODE) \
+  ((MODE) == SDmode || (MODE) == DDmode || (MODE) == TDmode)
+
+#define VALID_FP_MODE_P(MODE)						\
+  ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode		\
+   || (MODE) == SCmode || (MODE) == DCmode || (MODE) == XCmode)		\
+
+#define VALID_INT_MODE_P(MODE)						\
+  ((MODE) == QImode || (MODE) == HImode || (MODE) == SImode		\
+   || (MODE) == DImode							\
+   || (MODE) == CQImode || (MODE) == CHImode || (MODE) == CSImode	\
+   || (MODE) == CDImode							\
+   || (TARGET_64BIT && ((MODE) == TImode || (MODE) == CTImode		\
+			|| (MODE) == TFmode || (MODE) == TCmode)))
+
+/* Return true for modes passed in SSE registers.  */
+#define SSE_REG_MODE_P(MODE)						\
+  ((MODE) == V1TImode || (MODE) == TImode || (MODE) == V16QImode	\
+   || (MODE) == TFmode || (MODE) == V8HImode || (MODE) == V2DFmode	\
+   || (MODE) == V2DImode || (MODE) == V4SFmode || (MODE) == V4SImode	\
+   || (MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode	\
+   || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode	\
+   || (MODE) == V2TImode || (MODE) == V8DImode || (MODE) == V64QImode	\
+   || (MODE) == V16SImode || (MODE) == V32HImode || (MODE) == V8DFmode	\
+   || (MODE) == V16SFmode)
+
+#define VALID_MASK_REG_MODE(MODE) ((MODE) == HImode || (MODE) == QImode)
+
+/* Value is 1 if hard register REGNO can hold a value of machine-mode MODE.  */
+
+#define HARD_REGNO_MODE_OK(REGNO, MODE)	\
+   ix86_hard_regno_mode_ok ((REGNO), (MODE))
+
+/* Value is 1 if it is a good idea to tie two pseudo registers
+   when one has mode MODE1 and one has mode MODE2.
+   If HARD_REGNO_MODE_OK could produce different values for MODE1 and MODE2,
+   for any hard reg, then this must be 0 for correct output.  */
+
+#define MODES_TIEABLE_P(MODE1, MODE2)  ix86_modes_tieable_p (MODE1, MODE2)
+
+/* It is possible to write patterns to move flags; but until someone
+   does it,  */
+#define AVOID_CCMODE_COPIES
+
+/* Specify the modes required to caller save a given hard regno.
+   We do this on i386 to prevent flags from being saved at all.
+
+   Kill any attempts to combine saving of modes.  */
+
+#define HARD_REGNO_CALLER_SAVE_MODE(REGNO, NREGS, MODE)			\
+  (CC_REGNO_P (REGNO) ? VOIDmode					\
+   : (MODE) == VOIDmode && (NREGS) != 1 ? VOIDmode			\
+   : (MODE) == VOIDmode ? choose_hard_reg_mode ((REGNO), (NREGS), false) \
+   : (MODE) == HImode && !(TARGET_PARTIAL_REG_STALL			\
+			   || MASK_REGNO_P (REGNO)) ? SImode		\
+   : (MODE) == QImode && !(TARGET_64BIT || QI_REGNO_P (REGNO)		\
+			   || MASK_REGNO_P (REGNO)) ? SImode		\
+   : (MODE))
+
+/* The only ABI that saves SSE registers across calls is Win64 (thus no
+   need to check the current ABI here), and with AVX enabled Win64 only
+   guarantees that the low 16 bytes are saved.  */
+#define HARD_REGNO_CALL_PART_CLOBBERED(REGNO, MODE)             \
+  (SSE_REGNO_P (REGNO) && GET_MODE_SIZE (MODE) > 16)
+
+/* Specify the registers used for certain standard purposes.
+   The values of these macros are register numbers.  */
+
+/* on the 386 the pc register is %eip, and is not usable as a general
+   register.  The ordinary mov instructions won't work */
+/* #define PC_REGNUM  */
+
+/* Register to use for pushing function arguments.  */
+#define STACK_POINTER_REGNUM 7
+
+/* Base register for access to local variables of the function.  */
+#define HARD_FRAME_POINTER_REGNUM 6
+
+/* Base register for access to local variables of the function.  */
+#define FRAME_POINTER_REGNUM 20
+
+/* First floating point reg */
+#define FIRST_FLOAT_REG 8
+
+/* First & last stack-like regs */
+#define FIRST_STACK_REG FIRST_FLOAT_REG
+#define LAST_STACK_REG (FIRST_FLOAT_REG + 7)
+
+#define FIRST_SSE_REG (FRAME_POINTER_REGNUM + 1)
+#define LAST_SSE_REG  (FIRST_SSE_REG + 7)
+
+#define FIRST_MMX_REG  (LAST_SSE_REG + 1)   /*29*/
+#define LAST_MMX_REG   (FIRST_MMX_REG + 7)
+
+#define FIRST_REX_INT_REG  (LAST_MMX_REG + 1) /*37*/
+#define LAST_REX_INT_REG   (FIRST_REX_INT_REG + 7)
+
+#define FIRST_REX_SSE_REG  (LAST_REX_INT_REG + 1) /*45*/
+#define LAST_REX_SSE_REG   (FIRST_REX_SSE_REG + 7)
+
+#define FIRST_EXT_REX_SSE_REG  (LAST_REX_SSE_REG + 1) /*53*/
+#define LAST_EXT_REX_SSE_REG   (FIRST_EXT_REX_SSE_REG + 15) /*68*/
+
+#define FIRST_MASK_REG  (LAST_EXT_REX_SSE_REG + 1) /*69*/
+#define LAST_MASK_REG   (FIRST_MASK_REG + 7) /*76*/
+
+/* Override this in other tm.h files to cope with various OS lossage
+   requiring a frame pointer.  */
+#ifndef SUBTARGET_FRAME_POINTER_REQUIRED
+#define SUBTARGET_FRAME_POINTER_REQUIRED 0
+#endif
+
+/* Make sure we can access arbitrary call frames.  */
+#define SETUP_FRAME_ADDRESSES()  ix86_setup_frame_addresses ()
+
+/* Base register for access to arguments of the function.  */
+#define ARG_POINTER_REGNUM 16
+
+/* Register to hold the addressing base for position independent
+   code access to data items.  We don't use PIC pointer for 64bit
+   mode.  Define the regnum to dummy value to prevent gcc from
+   pessimizing code dealing with EBX.
+
+   To avoid clobbering a call-saved register unnecessarily, we renumber
+   the pic register when possible.  The change is visible after the
+   prologue has been emitted.  */
+
+#define REAL_PIC_OFFSET_TABLE_REGNUM  BX_REG
+
+#define PIC_OFFSET_TABLE_REGNUM				\
+  ((TARGET_64BIT && (ix86_cmodel == CM_SMALL_PIC	\
+                     || TARGET_PECOFF))		\
+   || !flag_pic ? INVALID_REGNUM			\
+   : reload_completed ? REGNO (pic_offset_table_rtx)	\
+   : REAL_PIC_OFFSET_TABLE_REGNUM)
+
+#define GOT_SYMBOL_NAME "_GLOBAL_OFFSET_TABLE_"
+
+/* This is overridden by <cygwin.h>.  */
+#define MS_AGGREGATE_RETURN 0
+
+#define KEEP_AGGREGATE_RETURN_POINTER 0
+
+/* Define the classes of registers for register constraints in the
+   machine description.  Also define ranges of constants.
+
+   One of the classes must always be named ALL_REGS and include all hard regs.
+   If there is more than one class, another class must be named NO_REGS
+   and contain no registers.
+
+   The name GENERAL_REGS must be the name of a class (or an alias for
+   another name such as ALL_REGS).  This is the class of registers
+   that is allowed by "g" or "r" in a register constraint.
+   Also, registers outside this class are allocated only when
+   instructions express preferences for them.
+
+   The classes must be numbered in nondecreasing order; that is,
+   a larger-numbered class must never be contained completely
+   in a smaller-numbered class.
+
+   For any two classes, it is very desirable that there be another
+   class that represents their union.
+
+   It might seem that class BREG is unnecessary, since no useful 386
+   opcode needs reg %ebx.  But some systems pass args to the OS in ebx,
+   and the "b" register constraint is useful in asms for syscalls.
+
+   The flags, fpsr and fpcr registers are in no class.  */
+
+enum reg_class
+{
+  NO_REGS,
+  AREG, DREG, CREG, BREG, SIREG, DIREG,
+  AD_REGS,			/* %eax/%edx for DImode */
+  Q_REGS,			/* %eax %ebx %ecx %edx */
+  NON_Q_REGS,			/* %esi %edi %ebp %esp */
+  INDEX_REGS,			/* %eax %ebx %ecx %edx %esi %edi %ebp */
+  LEGACY_REGS,			/* %eax %ebx %ecx %edx %esi %edi %ebp %esp */
+  CLOBBERED_REGS,		/* call-clobbered integer registers */
+  GENERAL_REGS,			/* %eax %ebx %ecx %edx %esi %edi %ebp %esp
+				   %r8 %r9 %r10 %r11 %r12 %r13 %r14 %r15 */
+  FP_TOP_REG, FP_SECOND_REG,	/* %st(0) %st(1) */
+  FLOAT_REGS,
+  SSE_FIRST_REG,
+  SSE_REGS,
+  EVEX_SSE_REGS,
+  ALL_SSE_REGS,
+  MMX_REGS,
+  FP_TOP_SSE_REGS,
+  FP_SECOND_SSE_REGS,
+  FLOAT_SSE_REGS,
+  FLOAT_INT_REGS,
+  INT_SSE_REGS,
+  FLOAT_INT_SSE_REGS,
+  MASK_EVEX_REGS,
+  MASK_REGS,
+  ALL_REGS, LIM_REG_CLASSES
+};
+
+#define N_REG_CLASSES ((int) LIM_REG_CLASSES)
+
+#define INTEGER_CLASS_P(CLASS) \
+  reg_class_subset_p ((CLASS), GENERAL_REGS)
+#define FLOAT_CLASS_P(CLASS) \
+  reg_class_subset_p ((CLASS), FLOAT_REGS)
+#define SSE_CLASS_P(CLASS) \
+  reg_class_subset_p ((CLASS), ALL_SSE_REGS)
+#define MMX_CLASS_P(CLASS) \
+  ((CLASS) == MMX_REGS)
+#define MAYBE_INTEGER_CLASS_P(CLASS) \
+  reg_classes_intersect_p ((CLASS), GENERAL_REGS)
+#define MAYBE_FLOAT_CLASS_P(CLASS) \
+  reg_classes_intersect_p ((CLASS), FLOAT_REGS)
+#define MAYBE_SSE_CLASS_P(CLASS) \
+  reg_classes_intersect_p ((CLASS), ALL_SSE_REGS)
+#define MAYBE_MMX_CLASS_P(CLASS) \
+  reg_classes_intersect_p ((CLASS), MMX_REGS)
+#define MAYBE_MASK_CLASS_P(CLASS) \
+  reg_classes_intersect_p ((CLASS), MASK_REGS)
+
+#define Q_CLASS_P(CLASS) \
+  reg_class_subset_p ((CLASS), Q_REGS)
+
+#define MAYBE_NON_Q_CLASS_P(CLASS) \
+  reg_classes_intersect_p ((CLASS), NON_Q_REGS)
+
+/* Give names of register classes as strings for dump file.  */
+
+#define REG_CLASS_NAMES \
+{  "NO_REGS",				\
+   "AREG", "DREG", "CREG", "BREG",	\
+   "SIREG", "DIREG",			\
+   "AD_REGS",				\
+   "Q_REGS", "NON_Q_REGS",		\
+   "INDEX_REGS",			\
+   "LEGACY_REGS",			\
+   "CLOBBERED_REGS",			\
+   "GENERAL_REGS",			\
+   "FP_TOP_REG", "FP_SECOND_REG",	\
+   "FLOAT_REGS",			\
+   "SSE_FIRST_REG",			\
+   "SSE_REGS",				\
+   "EVEX_SSE_REGS",			\
+   "ALL_SSE_REGS",			\
+   "MMX_REGS",				\
+   "FP_TOP_SSE_REGS",			\
+   "FP_SECOND_SSE_REGS",		\
+   "FLOAT_SSE_REGS",			\
+   "FLOAT_INT_REGS",			\
+   "INT_SSE_REGS",			\
+   "FLOAT_INT_SSE_REGS",		\
+   "MASK_EVEX_REGS",			\
+   "MASK_REGS",				\
+   "ALL_REGS" }
+
+/* Define which registers fit in which classes.  This is an initializer
+   for a vector of HARD_REG_SET of length N_REG_CLASSES.
+
+   Note that CLOBBERED_REGS are calculated by
+   TARGET_CONDITIONAL_REGISTER_USAGE.  */
+
+#define REG_CLASS_CONTENTS                                              \
+{     { 0x00,       0x0,   0x0 },                                       \
+      { 0x01,       0x0,   0x0 },       /* AREG */                      \
+      { 0x02,       0x0,   0x0 },       /* DREG */                      \
+      { 0x04,       0x0,   0x0 },       /* CREG */                      \
+      { 0x08,       0x0,   0x0 },       /* BREG */                      \
+      { 0x10,       0x0,   0x0 },       /* SIREG */                     \
+      { 0x20,       0x0,   0x0 },       /* DIREG */                     \
+      { 0x03,       0x0,   0x0 },       /* AD_REGS */                   \
+      { 0x0f,       0x0,   0x0 },       /* Q_REGS */                    \
+  { 0x1100f0,    0x1fe0,   0x0 },       /* NON_Q_REGS */                \
+      { 0x7f,    0x1fe0,   0x0 },       /* INDEX_REGS */                \
+  { 0x1100ff,       0x0,   0x0 },       /* LEGACY_REGS */               \
+      { 0x07,       0x0,   0x0 },       /* CLOBBERED_REGS */            \
+  { 0x1100ff,    0x1fe0,   0x0 },       /* GENERAL_REGS */              \
+     { 0x100,       0x0,   0x0 },       /* FP_TOP_REG */                \
+    { 0x0200,       0x0,   0x0 },       /* FP_SECOND_REG */             \
+    { 0xff00,       0x0,   0x0 },       /* FLOAT_REGS */                \
+  { 0x200000,       0x0,   0x0 },       /* SSE_FIRST_REG */             \
+{ 0x1fe00000,  0x1fe000,   0x0 },       /* SSE_REGS */                  \
+       { 0x0,0xffe00000,  0x1f },       /* EVEX_SSE_REGS */             \
+{ 0x1fe00000,0xffffe000,  0x1f },       /* ALL_SSE_REGS */              \
+{ 0xe0000000,      0x1f,   0x0 },       /* MMX_REGS */                  \
+{ 0x1fe00100,0xffffe000,  0x1f },       /* FP_TOP_SSE_REG */            \
+{ 0x1fe00200,0xffffe000,  0x1f },       /* FP_SECOND_SSE_REG */         \
+{ 0x1fe0ff00,0xffffe000,  0x1f },       /* FLOAT_SSE_REGS */            \
+{   0x11ffff,    0x1fe0,   0x0 },       /* FLOAT_INT_REGS */            \
+{ 0x1ff100ff,0xffffffe0,  0x1f },       /* INT_SSE_REGS */              \
+{ 0x1ff1ffff,0xffffffe0,  0x1f },       /* FLOAT_INT_SSE_REGS */        \
+       { 0x0,       0x0,0x1fc0 },       /* MASK_EVEX_REGS */           \
+       { 0x0,       0x0,0x1fe0 },       /* MASK_REGS */                 \
+{ 0xffffffff,0xffffffff,0x1fff }                                        \
+}
+
+/* The same information, inverted:
+   Return the class number of the smallest class containing
+   reg number REGNO.  This could be a conditional expression
+   or could index an array.  */
+
+#define REGNO_REG_CLASS(REGNO) (regclass_map[REGNO])
+
+/* When this hook returns true for MODE, the compiler allows
+   registers explicitly used in the rtl to be used as spill registers
+   but prevents the compiler from extending the lifetime of these
+   registers.  */
+#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P hook_bool_mode_true
+
+#define QI_REG_P(X) (REG_P (X) && QI_REGNO_P (REGNO (X)))
+#define QI_REGNO_P(N) IN_RANGE ((N), AX_REG, BX_REG)
+
+#define GENERAL_REG_P(X) \
+  (REG_P (X) && GENERAL_REGNO_P (REGNO (X)))
+#define GENERAL_REGNO_P(N) \
+  (IN_RANGE ((N), AX_REG, SP_REG) || REX_INT_REGNO_P (N))
+
+#define ANY_QI_REG_P(X) (REG_P (X) && ANY_QI_REGNO_P (REGNO (X)))
+#define ANY_QI_REGNO_P(N) \
+  (TARGET_64BIT ? GENERAL_REGNO_P (N) : QI_REGNO_P (N))
+
+#define REX_INT_REG_P(X) (REG_P (X) && REX_INT_REGNO_P (REGNO (X)))
+#define REX_INT_REGNO_P(N) \
+  IN_RANGE ((N), FIRST_REX_INT_REG, LAST_REX_INT_REG)
+
+#define STACK_REG_P(X) (REG_P (X) && STACK_REGNO_P (REGNO (X)))
+#define STACK_REGNO_P(N) IN_RANGE ((N), FIRST_STACK_REG, LAST_STACK_REG)
+
+#define ANY_FP_REG_P(X) (REG_P (X) && ANY_FP_REGNO_P (REGNO (X)))
+#define ANY_FP_REGNO_P(N) (STACK_REGNO_P (N) || SSE_REGNO_P (N))
+
+#define X87_FLOAT_MODE_P(MODE)	\
+  (TARGET_80387 && ((MODE) == SFmode || (MODE) == DFmode || (MODE) == XFmode))
+
+#define SSE_REG_P(X) (REG_P (X) && SSE_REGNO_P (REGNO (X)))
+#define SSE_REGNO_P(N)						\
+  (IN_RANGE ((N), FIRST_SSE_REG, LAST_SSE_REG)			\
+   || REX_SSE_REGNO_P (N)					\
+   || EXT_REX_SSE_REGNO_P (N))
+
+#define REX_SSE_REGNO_P(N) \
+  IN_RANGE ((N), FIRST_REX_SSE_REG, LAST_REX_SSE_REG)
+
+#define EXT_REX_SSE_REGNO_P(N) \
+  IN_RANGE ((N), FIRST_EXT_REX_SSE_REG, LAST_EXT_REX_SSE_REG)
+
+#define SSE_REGNO(N) \
+  ((N) < 8 ? FIRST_SSE_REG + (N) \
+         : (N) <= LAST_REX_SSE_REG ? (FIRST_REX_SSE_REG + (N) - 8) \
+                                   : (FIRST_EXT_REX_SSE_REG + (N) - 16))
+
+#define MASK_REGNO_P(N) IN_RANGE ((N), FIRST_MASK_REG, LAST_MASK_REG)
+#define ANY_MASK_REG_P(X) (REG_P (X) && MASK_REGNO_P (REGNO (X)))
+
+#define SSE_FLOAT_MODE_P(MODE) \
+  ((TARGET_SSE && (MODE) == SFmode) || (TARGET_SSE2 && (MODE) == DFmode))
+
+#define FMA4_VEC_FLOAT_MODE_P(MODE) \
+  (TARGET_FMA4 && ((MODE) == V4SFmode || (MODE) == V2DFmode \
+		  || (MODE) == V8SFmode || (MODE) == V4DFmode))
+
+#define MMX_REG_P(X) (REG_P (X) && MMX_REGNO_P (REGNO (X)))
+#define MMX_REGNO_P(N) IN_RANGE ((N), FIRST_MMX_REG, LAST_MMX_REG)
+
+#define STACK_TOP_P(X) (REG_P (X) && REGNO (X) == FIRST_STACK_REG)
+
+#define CC_REG_P(X) (REG_P (X) && CC_REGNO_P (REGNO (X)))
+#define CC_REGNO_P(X) ((X) == FLAGS_REG || (X) == FPSR_REG)
+
+/* The class value for index registers, and the one for base regs.  */
+
+#define INDEX_REG_CLASS INDEX_REGS
+#define BASE_REG_CLASS GENERAL_REGS
+
+/* Place additional restrictions on the register class to use when it
+   is necessary to be able to hold a value of mode MODE in a reload
+   register for which class CLASS would ordinarily be used.
+
+   We avoid classes containing registers from multiple units due to
+   the limitation in ix86_secondary_memory_needed.  We limit these
+   classes to their "natural mode" single unit register class, depending
+   on the unit availability.
+
+   Please note that reg_class_subset_p is not commutative, so these
+   conditions mean "... if (CLASS) includes ALL registers from the
+   register set."  */
+
+#define LIMIT_RELOAD_CLASS(MODE, CLASS)					\
+  (((MODE) == QImode && !TARGET_64BIT					\
+    && reg_class_subset_p (Q_REGS, (CLASS))) ? Q_REGS			\
+   : (((MODE) == SImode || (MODE) == DImode)				\
+      && reg_class_subset_p (GENERAL_REGS, (CLASS))) ? GENERAL_REGS	\
+   : (SSE_FLOAT_MODE_P (MODE) && TARGET_SSE_MATH			\
+      && reg_class_subset_p (SSE_REGS, (CLASS))) ? SSE_REGS		\
+   : (X87_FLOAT_MODE_P (MODE)						\
+      && reg_class_subset_p (FLOAT_REGS, (CLASS))) ? FLOAT_REGS		\
+   : (CLASS))
+
+/* If we are copying between general and FP registers, we need a memory
+   location. The same is true for SSE and MMX registers.  */
+#define SECONDARY_MEMORY_NEEDED(CLASS1, CLASS2, MODE) \
+  ix86_secondary_memory_needed ((CLASS1), (CLASS2), (MODE), 1)
+
+/* Get_secondary_mem widens integral modes to BITS_PER_WORD.
+   There is no need to emit full 64 bit move on 64 bit targets
+   for integral modes that can be moved using 32 bit move.  */
+#define SECONDARY_MEMORY_NEEDED_MODE(MODE)			\
+  (GET_MODE_BITSIZE (MODE) < 32 && INTEGRAL_MODE_P (MODE)	\
+   ? mode_for_size (32, GET_MODE_CLASS (MODE), 0)		\
+   : MODE)
+
+/* Return a class of registers that cannot change FROM mode to TO mode.  */
+
+#define CANNOT_CHANGE_MODE_CLASS(FROM, TO, CLASS) \
+  ix86_cannot_change_mode_class (FROM, TO, CLASS)
+
+/* Stack layout; function entry, exit and calling.  */
+
+/* Define this if pushing a word on the stack
+   makes the stack pointer a smaller address.  */
+#define STACK_GROWS_DOWNWARD
+
+/* Define this to nonzero if the nominal address of the stack frame
+   is at the high-address end of the local variables;
+   that is, each additional local variable allocated
+   goes at a more negative offset in the frame.  */
+#define FRAME_GROWS_DOWNWARD 1
+
+/* Offset within stack frame to start allocating local variables at.
+   If FRAME_GROWS_DOWNWARD, this is the offset to the END of the
+   first local allocated.  Otherwise, it is the offset to the BEGINNING
+   of the first local allocated.  */
+#define STARTING_FRAME_OFFSET 0
+
+/* If we generate an insn to push BYTES bytes, this says how many the stack
+   pointer really advances by.  On 386, we have pushw instruction that
+   decrements by exactly 2 no matter what the position was, there is no pushb.
+
+   But as CIE data alignment factor on this arch is -4 for 32bit targets
+   and -8 for 64bit targets, we need to make sure all stack pointer adjustments
+   are in multiple of 4 for 32bit targets and 8 for 64bit targets.  */
+
+#define PUSH_ROUNDING(BYTES) \
+  (((BYTES) + UNITS_PER_WORD - 1) & -UNITS_PER_WORD)
+
+/* If defined, the maximum amount of space required for outgoing arguments
+   will be computed and placed into the variable `crtl->outgoing_args_size'.
+   No space will be pushed onto the stack for each call; instead, the
+   function prologue should increase the stack frame size by this amount.  
+
+   In 32bit mode enabling argument accumulation results in about 5% code size
+   growth becuase move instructions are less compact than push.  In 64bit
+   mode the difference is less drastic but visible.  
+
+   FIXME: Unlike earlier implementations, the size of unwind info seems to
+   actually grow with accumulation.  Is that because accumulated args
+   unwind info became unnecesarily bloated?
+
+   With the 64-bit MS ABI, we can generate correct code with or without
+   accumulated args, but because of OUTGOING_REG_PARM_STACK_SPACE the code
+   generated without accumulated args is terrible.
+
+   If stack probes are required, the space used for large function
+   arguments on the stack must also be probed, so enable
+   -maccumulate-outgoing-args so this happens in the prologue.  */
+
+#define ACCUMULATE_OUTGOING_ARGS \
+  ((TARGET_ACCUMULATE_OUTGOING_ARGS && optimize_function_for_speed_p (cfun)) \
+   || TARGET_STACK_PROBE || TARGET_64BIT_MS_ABI)
+
+/* If defined, a C expression whose value is nonzero when we want to use PUSH
+   instructions to pass outgoing arguments.  */
+
+#define PUSH_ARGS (TARGET_PUSH_ARGS && !ACCUMULATE_OUTGOING_ARGS)
+
+/* We want the stack and args grow in opposite directions, even if
+   PUSH_ARGS is 0.  */
+#define PUSH_ARGS_REVERSED 1
+
+/* Offset of first parameter from the argument pointer register value.  */
+#define FIRST_PARM_OFFSET(FNDECL) 0
+
+/* Define this macro if functions should assume that stack space has been
+   allocated for arguments even when their values are passed in registers.
+
+   The value of this macro is the size, in bytes, of the area reserved for
+   arguments passed in registers for the function represented by FNDECL.
+
+   This space can be allocated by the caller, or be a part of the
+   machine-dependent stack frame: `OUTGOING_REG_PARM_STACK_SPACE' says
+   which.  */
+#define REG_PARM_STACK_SPACE(FNDECL) ix86_reg_parm_stack_space (FNDECL)
+
+#define OUTGOING_REG_PARM_STACK_SPACE(FNTYPE) \
+  (TARGET_64BIT && ix86_function_type_abi (FNTYPE) == MS_ABI)
+
+/* Define how to find the value returned by a library function
+   assuming the value has mode MODE.  */
+
+#define LIBCALL_VALUE(MODE) ix86_libcall_value (MODE)
+
+/* Define the size of the result block used for communication between
+   untyped_call and untyped_return.  The block contains a DImode value
+   followed by the block used by fnsave and frstor.  */
+
+#define APPLY_RESULT_SIZE (8+108)
+
+/* 1 if N is a possible register number for function argument passing.  */
+#define FUNCTION_ARG_REGNO_P(N) ix86_function_arg_regno_p (N)
+
+/* Define a data type for recording info about an argument list
+   during the scan of that argument list.  This data type should
+   hold all necessary information about the function itself
+   and about the args processed so far, enough to enable macros
+   such as FUNCTION_ARG to determine where the next arg should go.  */
+
+typedef struct ix86_args {
+  int words;			/* # words passed so far */
+  int nregs;			/* # registers available for passing */
+  int regno;			/* next available register number */
+  int fastcall;			/* fastcall or thiscall calling convention
+				   is used */
+  int sse_words;		/* # sse words passed so far */
+  int sse_nregs;		/* # sse registers available for passing */
+  int warn_avx512f;		/* True when we want to warn
+				   about AVX512F ABI.  */
+  int warn_avx;			/* True when we want to warn about AVX ABI.  */
+  int warn_sse;			/* True when we want to warn about SSE ABI.  */
+  int warn_mmx;			/* True when we want to warn about MMX ABI.  */
+  int sse_regno;		/* next available sse register number */
+  int mmx_words;		/* # mmx words passed so far */
+  int mmx_nregs;		/* # mmx registers available for passing */
+  int mmx_regno;		/* next available mmx register number */
+  int maybe_vaarg;		/* true for calls to possibly vardic fncts.  */
+  int caller;			/* true if it is caller.  */
+  int float_in_sse;		/* Set to 1 or 2 for 32bit targets if
+				   SFmode/DFmode arguments should be passed
+				   in SSE registers.  Otherwise 0.  */
+  enum calling_abi call_abi;	/* Set to SYSV_ABI for sysv abi. Otherwise
+ 				   MS_ABI for ms abi.  */
+} CUMULATIVE_ARGS;
+
+/* Initialize a variable CUM of type CUMULATIVE_ARGS
+   for a call to a function whose data type is FNTYPE.
+   For a library call, FNTYPE is 0.  */
+
+#define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, FNDECL, N_NAMED_ARGS) \
+  init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL), \
+			(N_NAMED_ARGS) != -1)
+
+/* Output assembler code to FILE to increment profiler label # LABELNO
+   for profiling a function entry.  */
+
+#define FUNCTION_PROFILER(FILE, LABELNO) x86_function_profiler (FILE, LABELNO)
+
+#define MCOUNT_NAME "_mcount"
+
+#define MCOUNT_NAME_BEFORE_PROLOGUE "__fentry__"
+
+#define PROFILE_COUNT_REGISTER "edx"
+
+/* EXIT_IGNORE_STACK should be nonzero if, when returning from a function,
+   the stack pointer does not matter.  The value is tested only in
+   functions that have frame pointers.
+   No definition is equivalent to always zero.  */
+/* Note on the 386 it might be more efficient not to define this since
+   we have to restore it ourselves from the frame pointer, in order to
+   use pop */
+
+#define EXIT_IGNORE_STACK 1
+
+/* Output assembler code for a block containing the constant parts
+   of a trampoline, leaving space for the variable parts.  */
+
+/* On the 386, the trampoline contains two instructions:
+     mov #STATIC,ecx
+     jmp FUNCTION
+   The trampoline is generated entirely at runtime.  The operand of JMP
+   is the address of FUNCTION relative to the instruction following the
+   JMP (which is 5 bytes long).  */
+
+/* Length in units of the trampoline for entering a nested function.  */
+
+#define TRAMPOLINE_SIZE (TARGET_64BIT ? 24 : 10)
+
+/* Definitions for register eliminations.
+
+   This is an array of structures.  Each structure initializes one pair
+   of eliminable registers.  The "from" register number is given first,
+   followed by "to".  Eliminations of the same "from" register are listed
+   in order of preference.
+
+   There are two registers that can always be eliminated on the i386.
+   The frame pointer and the arg pointer can be replaced by either the
+   hard frame pointer or to the stack pointer, depending upon the
+   circumstances.  The hard frame pointer is not used before reload and
+   so it is not eligible for elimination.  */
+
+#define ELIMINABLE_REGS					\
+{{ ARG_POINTER_REGNUM, STACK_POINTER_REGNUM},		\
+ { ARG_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM},	\
+ { FRAME_POINTER_REGNUM, STACK_POINTER_REGNUM},		\
+ { FRAME_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM}}	\
+
+/* Define the offset between two registers, one to be eliminated, and the other
+   its replacement, at the start of a routine.  */
+
+#define INITIAL_ELIMINATION_OFFSET(FROM, TO, OFFSET) \
+  ((OFFSET) = ix86_initial_elimination_offset ((FROM), (TO)))
+
+/* Addressing modes, and classification of registers for them.  */
+
+/* Macros to check register numbers against specific register classes.  */
+
+/* These assume that REGNO is a hard or pseudo reg number.
+   They give nonzero only if REGNO is a hard reg of the suitable class
+   or a pseudo reg currently allocated to a suitable hard reg.
+   Since they use reg_renumber, they are safe only once reg_renumber
+   has been allocated, which happens in reginfo.c during register
+   allocation.  */
+
+#define REGNO_OK_FOR_INDEX_P(REGNO) 					\
+  ((REGNO) < STACK_POINTER_REGNUM 					\
+   || REX_INT_REGNO_P (REGNO)						\
+   || (unsigned) reg_renumber[(REGNO)] < STACK_POINTER_REGNUM		\
+   || REX_INT_REGNO_P ((unsigned) reg_renumber[(REGNO)]))
+
+#define REGNO_OK_FOR_BASE_P(REGNO) 					\
+  (GENERAL_REGNO_P (REGNO)						\
+   || (REGNO) == ARG_POINTER_REGNUM 					\
+   || (REGNO) == FRAME_POINTER_REGNUM 					\
+   || GENERAL_REGNO_P ((unsigned) reg_renumber[(REGNO)]))
+
+/* The macros REG_OK_FOR..._P assume that the arg is a REG rtx
+   and check its validity for a certain class.
+   We have two alternate definitions for each of them.
+   The usual definition accepts all pseudo regs; the other rejects
+   them unless they have been allocated suitable hard regs.
+   The symbol REG_OK_STRICT causes the latter definition to be used.
+
+   Most source files want to accept pseudo regs in the hope that
+   they will get allocated to the class that the insn wants them to be in.
+   Source files for reload pass need to be strict.
+   After reload, it makes no difference, since pseudo regs have
+   been eliminated by then.  */
+
+
+/* Non strict versions, pseudos are ok.  */
+#define REG_OK_FOR_INDEX_NONSTRICT_P(X)					\
+  (REGNO (X) < STACK_POINTER_REGNUM					\
+   || REX_INT_REGNO_P (REGNO (X))					\
+   || REGNO (X) >= FIRST_PSEUDO_REGISTER)
+
+#define REG_OK_FOR_BASE_NONSTRICT_P(X)					\
+  (GENERAL_REGNO_P (REGNO (X))						\
+   || REGNO (X) == ARG_POINTER_REGNUM					\
+   || REGNO (X) == FRAME_POINTER_REGNUM 				\
+   || REGNO (X) >= FIRST_PSEUDO_REGISTER)
+
+/* Strict versions, hard registers only */
+#define REG_OK_FOR_INDEX_STRICT_P(X) REGNO_OK_FOR_INDEX_P (REGNO (X))
+#define REG_OK_FOR_BASE_STRICT_P(X)  REGNO_OK_FOR_BASE_P (REGNO (X))
+
+#ifndef REG_OK_STRICT
+#define REG_OK_FOR_INDEX_P(X)  REG_OK_FOR_INDEX_NONSTRICT_P (X)
+#define REG_OK_FOR_BASE_P(X)   REG_OK_FOR_BASE_NONSTRICT_P (X)
+
+#else
+#define REG_OK_FOR_INDEX_P(X)  REG_OK_FOR_INDEX_STRICT_P (X)
+#define REG_OK_FOR_BASE_P(X)   REG_OK_FOR_BASE_STRICT_P (X)
+#endif
+
+/* TARGET_LEGITIMATE_ADDRESS_P recognizes an RTL expression
+   that is a valid memory address for an instruction.
+   The MODE argument is the machine mode for the MEM expression
+   that wants to use this address.
+
+   The other macros defined here are used only in TARGET_LEGITIMATE_ADDRESS_P,
+   except for CONSTANT_ADDRESS_P which is usually machine-independent.
+
+   See legitimize_pic_address in i386.c for details as to what
+   constitutes a legitimate address when -fpic is used.  */
+
+#define MAX_REGS_PER_ADDRESS 2
+
+#define CONSTANT_ADDRESS_P(X)  constant_address_p (X)
+
+/* Try a machine-dependent way of reloading an illegitimate address
+   operand.  If we find one, push the reload and jump to WIN.  This
+   macro is used in only one place: `find_reloads_address' in reload.c.  */
+
+#define LEGITIMIZE_RELOAD_ADDRESS(X, MODE, OPNUM, TYPE, INDL, WIN)	\
+do {									\
+  if (ix86_legitimize_reload_address ((X), (MODE), (OPNUM),		\
+				      (int)(TYPE), (INDL)))		\
+    goto WIN;								\
+} while (0)
+
+/* If defined, a C expression to determine the base term of address X.
+   This macro is used in only one place: `find_base_term' in alias.c.
+
+   It is always safe for this macro to not be defined.  It exists so
+   that alias analysis can understand machine-dependent addresses.
+
+   The typical use of this macro is to handle addresses containing
+   a label_ref or symbol_ref within an UNSPEC.  */
+
+#define FIND_BASE_TERM(X) ix86_find_base_term (X)
+
+/* Nonzero if the constant value X is a legitimate general operand
+   when generating PIC code.  It is given that flag_pic is on and
+   that X satisfies CONSTANT_P or is a CONST_DOUBLE.  */
+
+#define LEGITIMATE_PIC_OPERAND_P(X) legitimate_pic_operand_p (X)
+
+#define SYMBOLIC_CONST(X)	\
+  (GET_CODE (X) == SYMBOL_REF						\
+   || GET_CODE (X) == LABEL_REF						\
+   || (GET_CODE (X) == CONST && symbolic_reference_mentioned_p (X)))
+
+/* Max number of args passed in registers.  If this is more than 3, we will
+   have problems with ebx (register #4), since it is a caller save register and
+   is also used as the pic register in ELF.  So for now, don't allow more than
+   3 registers to be passed in registers.  */
+
+/* Abi specific values for REGPARM_MAX and SSE_REGPARM_MAX */
+#define X86_64_REGPARM_MAX 6
+#define X86_64_MS_REGPARM_MAX 4
+
+#define X86_32_REGPARM_MAX 3
+
+#define REGPARM_MAX							\
+  (TARGET_64BIT								\
+   ? (TARGET_64BIT_MS_ABI						\
+      ? X86_64_MS_REGPARM_MAX						\
+      : X86_64_REGPARM_MAX)						\
+   : X86_32_REGPARM_MAX)
+
+#define X86_64_SSE_REGPARM_MAX 8
+#define X86_64_MS_SSE_REGPARM_MAX 4
+
+#define X86_32_SSE_REGPARM_MAX (TARGET_SSE ? (TARGET_MACHO ? 4 : 3) : 0)
+
+#define SSE_REGPARM_MAX							\
+  (TARGET_64BIT								\
+   ? (TARGET_64BIT_MS_ABI						\
+      ? X86_64_MS_SSE_REGPARM_MAX					\
+      : X86_64_SSE_REGPARM_MAX)						\
+   : X86_32_SSE_REGPARM_MAX)
+
+#define MMX_REGPARM_MAX (TARGET_64BIT ? 0 : (TARGET_MMX ? 3 : 0))
+
+/* Specify the machine mode that this machine uses
+   for the index in the tablejump instruction.  */
+#define CASE_VECTOR_MODE \
+ (!TARGET_LP64 || (flag_pic && ix86_cmodel != CM_LARGE_PIC) ? SImode : DImode)
+
+/* Define this as 1 if `char' should by default be signed; else as 0.  */
+#define DEFAULT_SIGNED_CHAR 1
+
+/* Max number of bytes we can move from memory to memory
+   in one reasonably fast instruction.  */
+#define MOVE_MAX 16
+
+/* MOVE_MAX_PIECES is the number of bytes at a time which we can
+   move efficiently, as opposed to  MOVE_MAX which is the maximum
+   number of bytes we can move with a single instruction.  */
+#define MOVE_MAX_PIECES UNITS_PER_WORD
+
+/* If a memory-to-memory move would take MOVE_RATIO or more simple
+   move-instruction pairs, we will do a movmem or libcall instead.
+   Increasing the value will always make code faster, but eventually
+   incurs high cost in increased code size.
+
+   If you don't define this, a reasonable default is used.  */
+
+#define MOVE_RATIO(speed) ((speed) ? ix86_cost->move_ratio : 3)
+
+/* If a clear memory operation would take CLEAR_RATIO or more simple
+   move-instruction sequences, we will do a clrmem or libcall instead.  */
+
+#define CLEAR_RATIO(speed) ((speed) ? MIN (6, ix86_cost->move_ratio) : 2)
+
+/* Define if shifts truncate the shift count which implies one can
+   omit a sign-extension or zero-extension of a shift count.
+
+   On i386, shifts do truncate the count.  But bit test instructions
+   take the modulo of the bit offset operand.  */
+
+/* #define SHIFT_COUNT_TRUNCATED */
+
+/* Value is 1 if truncating an integer of INPREC bits to OUTPREC bits
+   is done just by pretending it is already truncated.  */
+#define TRULY_NOOP_TRUNCATION(OUTPREC, INPREC) 1
+
+/* A macro to update M and UNSIGNEDP when an object whose type is
+   TYPE and which has the specified mode and signedness is to be
+   stored in a register.  This macro is only called when TYPE is a
+   scalar type.
+
+   On i386 it is sometimes useful to promote HImode and QImode
+   quantities to SImode.  The choice depends on target type.  */
+
+#define PROMOTE_MODE(MODE, UNSIGNEDP, TYPE) 		\
+do {							\
+  if (((MODE) == HImode && TARGET_PROMOTE_HI_REGS)	\
+      || ((MODE) == QImode && TARGET_PROMOTE_QI_REGS))	\
+    (MODE) = SImode;					\
+} while (0)
+
+/* Specify the machine mode that pointers have.
+   After generation of rtl, the compiler makes no further distinction
+   between pointers and any other objects of this machine mode.  */
+#define Pmode (ix86_pmode == PMODE_DI ? DImode : SImode)
+
+/* A C expression whose value is zero if pointers that need to be extended
+   from being `POINTER_SIZE' bits wide to `Pmode' are sign-extended and
+   greater then zero if they are zero-extended and less then zero if the
+   ptr_extend instruction should be used.  */
+
+#define POINTERS_EXTEND_UNSIGNED 1
+
+/* A function address in a call instruction
+   is a byte address (for indexing purposes)
+   so give the MEM rtx a byte's mode.  */
+#define FUNCTION_MODE QImode
+
+
+/* A C expression for the cost of a branch instruction.  A value of 1
+   is the default; other values are interpreted relative to that.  */
+
+#define BRANCH_COST(speed_p, predictable_p) \
+  (!(speed_p) ? 2 : (predictable_p) ? 0 : ix86_branch_cost)
+
+/* An integer expression for the size in bits of the largest integer machine
+   mode that should actually be used.  We allow pairs of registers.  */
+#define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TARGET_64BIT ? TImode : DImode)
+
+/* Define this macro as a C expression which is nonzero if accessing
+   less than a word of memory (i.e. a `char' or a `short') is no
+   faster than accessing a word of memory, i.e., if such access
+   require more than one instruction or if there is no difference in
+   cost between byte and (aligned) word loads.
+
+   When this macro is not defined, the compiler will access a field by
+   finding the smallest containing object; when it is defined, a
+   fullword load will be used if alignment permits.  Unless bytes
+   accesses are faster than word accesses, using word accesses is
+   preferable since it may eliminate subsequent memory access if
+   subsequent accesses occur to other fields in the same word of the
+   structure, but to different bytes.  */
+
+#define SLOW_BYTE_ACCESS 0
+
+/* Nonzero if access to memory by shorts is slow and undesirable.  */
+#define SLOW_SHORT_ACCESS 0
+
+/* Define this macro to be the value 1 if unaligned accesses have a
+   cost many times greater than aligned accesses, for example if they
+   are emulated in a trap handler.
+
+   When this macro is nonzero, the compiler will act as if
+   `STRICT_ALIGNMENT' were nonzero when generating code for block
+   moves.  This can cause significantly more instructions to be
+   produced.  Therefore, do not set this macro nonzero if unaligned
+   accesses only add a cycle or two to the time for a memory access.
+
+   If the value of this macro is always zero, it need not be defined.  */
+
+/* #define SLOW_UNALIGNED_ACCESS(MODE, ALIGN) 0 */
+
+/* Define this macro if it is as good or better to call a constant
+   function address than to call an address kept in a register.
+
+   Desirable on the 386 because a CALL with a constant address is
+   faster than one with a register address.  */
+
+#define NO_FUNCTION_CSE
+
+/* Given a comparison code (EQ, NE, etc.) and the first operand of a COMPARE,
+   return the mode to be used for the comparison.
+
+   For floating-point equality comparisons, CCFPEQmode should be used.
+   VOIDmode should be used in all other cases.
+
+   For integer comparisons against zero, reduce to CCNOmode or CCZmode if
+   possible, to allow for more combinations.  */
+
+#define SELECT_CC_MODE(OP, X, Y) ix86_cc_mode ((OP), (X), (Y))
+
+/* Return nonzero if MODE implies a floating point inequality can be
+   reversed.  */
+
+#define REVERSIBLE_CC_MODE(MODE) 1
+
+/* A C expression whose value is reversed condition code of the CODE for
+   comparison done in CC_MODE mode.  */
+#define REVERSE_CONDITION(CODE, MODE) ix86_reverse_condition ((CODE), (MODE))
+
+
+/* Control the assembler format that we output, to the extent
+   this does not vary between assemblers.  */
+
+/* How to refer to registers in assembler output.
+   This sequence is indexed by compiler's hard-register-number (see above).  */
+
+/* In order to refer to the first 8 regs as 32-bit regs, prefix an "e".
+   For non floating point regs, the following are the HImode names.
+
+   For float regs, the stack top is sometimes referred to as "%st(0)"
+   instead of just "%st".  TARGET_PRINT_OPERAND handles this with the
+   "y" code.  */
+
+#define HI_REGISTER_NAMES						\
+{"ax","dx","cx","bx","si","di","bp","sp",				\
+ "st","st(1)","st(2)","st(3)","st(4)","st(5)","st(6)","st(7)",		\
+ "argp", "flags", "fpsr", "fpcr", "frame",				\
+ "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7",		\
+ "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",		\
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",			\
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",	\
+ "xmm16", "xmm17", "xmm18", "xmm19",					\
+ "xmm20", "xmm21", "xmm22", "xmm23",					\
+ "xmm24", "xmm25", "xmm26", "xmm27",					\
+ "xmm28", "xmm29", "xmm30", "xmm31",					\
+ "k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7" }
+
+#define REGISTER_NAMES HI_REGISTER_NAMES
+
+/* Table of additional register names to use in user input.  */
+
+#define ADDITIONAL_REGISTER_NAMES \
+{ { "eax", 0 }, { "edx", 1 }, { "ecx", 2 }, { "ebx", 3 },		\
+  { "esi", 4 }, { "edi", 5 }, { "ebp", 6 }, { "esp", 7 },		\
+  { "rax", 0 }, { "rdx", 1 }, { "rcx", 2 }, { "rbx", 3 },		\
+  { "rsi", 4 }, { "rdi", 5 }, { "rbp", 6 }, { "rsp", 7 },		\
+  { "al", 0 }, { "dl", 1 }, { "cl", 2 }, { "bl", 3 },			\
+  { "ah", 0 }, { "dh", 1 }, { "ch", 2 }, { "bh", 3 },			\
+  { "ymm0", 21}, { "ymm1", 22}, { "ymm2", 23}, { "ymm3", 24},		\
+  { "ymm4", 25}, { "ymm5", 26}, { "ymm6", 27}, { "ymm7", 28},		\
+  { "ymm8", 45}, { "ymm9", 46}, { "ymm10", 47}, { "ymm11", 48},		\
+  { "ymm12", 49}, { "ymm13", 50}, { "ymm14", 51}, { "ymm15", 52},	\
+  { "ymm16", 53}, { "ymm17", 54}, { "ymm18", 55}, { "ymm19", 56},	\
+  { "ymm20", 57}, { "ymm21", 58}, { "ymm22", 59}, { "ymm23", 60},	\
+  { "ymm24", 61}, { "ymm25", 62}, { "ymm26", 63}, { "ymm27", 64},	\
+  { "ymm28", 65}, { "ymm29", 66}, { "ymm30", 67}, { "ymm31", 68},	\
+  { "zmm0", 21}, { "zmm1", 22}, { "zmm2", 23}, { "zmm3", 24},		\
+  { "zmm4", 25}, { "zmm5", 26}, { "zmm6", 27}, { "zmm7", 28},		\
+  { "zmm8", 45}, { "zmm9", 46}, { "zmm10", 47}, { "zmm11", 48},		\
+  { "zmm12", 49}, { "zmm13", 50}, { "zmm14", 51}, { "zmm15", 52},	\
+  { "zmm16", 53}, { "zmm17", 54}, { "zmm18", 55}, { "zmm19", 56},	\
+  { "zmm20", 57}, { "zmm21", 58}, { "zmm22", 59}, { "zmm23", 60},	\
+  { "zmm24", 61}, { "zmm25", 62}, { "zmm26", 63}, { "zmm27", 64},	\
+  { "zmm28", 65}, { "zmm29", 66}, { "zmm30", 67}, { "zmm31", 68} }
+
+/* Note we are omitting these since currently I don't know how
+to get gcc to use these, since they want the same but different
+number as al, and ax.
+*/
+
+#define QI_REGISTER_NAMES \
+{"al", "dl", "cl", "bl", "sil", "dil", "bpl", "spl",}
+
+/* These parallel the array above, and can be used to access bits 8:15
+   of regs 0 through 3.  */
+
+#define QI_HIGH_REGISTER_NAMES \
+{"ah", "dh", "ch", "bh", }
+
+/* How to renumber registers for dbx and gdb.  */
+
+#define DBX_REGISTER_NUMBER(N) \
+  (TARGET_64BIT ? dbx64_register_map[(N)] : dbx_register_map[(N)])
+
+extern int const dbx_register_map[FIRST_PSEUDO_REGISTER];
+extern int const dbx64_register_map[FIRST_PSEUDO_REGISTER];
+extern int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER];
+
+extern int const x86_64_ms_sysv_extra_clobbered_registers[12];
+
+/* Before the prologue, RA is at 0(%esp).  */
+#define INCOMING_RETURN_ADDR_RTX \
+  gen_rtx_MEM (VOIDmode, gen_rtx_REG (VOIDmode, STACK_POINTER_REGNUM))
+
+/* After the prologue, RA is at -4(AP) in the current frame.  */
+#define RETURN_ADDR_RTX(COUNT, FRAME)					   \
+  ((COUNT) == 0								   \
+   ? gen_rtx_MEM (Pmode, plus_constant (Pmode, arg_pointer_rtx,	   \
+					-UNITS_PER_WORD))		   \
+   : gen_rtx_MEM (Pmode, plus_constant (Pmode, FRAME, UNITS_PER_WORD)))
+
+/* PC is dbx register 8; let's use that column for RA.  */
+#define DWARF_FRAME_RETURN_COLUMN 	(TARGET_64BIT ? 16 : 8)
+
+/* Before the prologue, the top of the frame is at 4(%esp).  */
+#define INCOMING_FRAME_SP_OFFSET UNITS_PER_WORD
+
+/* Describe how we implement __builtin_eh_return.  */
+#define EH_RETURN_DATA_REGNO(N)	((N) <= DX_REG ? (N) : INVALID_REGNUM)
+#define EH_RETURN_STACKADJ_RTX	gen_rtx_REG (Pmode, CX_REG)
+
+
+/* Select a format to encode pointers in exception handling data.  CODE
+   is 0 for data, 1 for code labels, 2 for function pointers.  GLOBAL is
+   true if the symbol may be affected by dynamic relocations.
+
+   ??? All x86 object file formats are capable of representing this.
+   After all, the relocation needed is the same as for the call insn.
+   Whether or not a particular assembler allows us to enter such, I
+   guess we'll have to see.  */
+#define ASM_PREFERRED_EH_DATA_FORMAT(CODE, GLOBAL)       		\
+  asm_preferred_eh_data_format ((CODE), (GLOBAL))
+
+/* This is how to output an insn to push a register on the stack.
+   It need not be very fast code.  */
+
+#define ASM_OUTPUT_REG_PUSH(FILE, REGNO)  \
+do {									\
+  if (TARGET_64BIT)							\
+    asm_fprintf ((FILE), "\tpush{q}\t%%r%s\n",				\
+		 reg_names[(REGNO)] + (REX_INT_REGNO_P (REGNO) != 0));	\
+  else									\
+    asm_fprintf ((FILE), "\tpush{l}\t%%e%s\n", reg_names[(REGNO)]);	\
+} while (0)
+
+/* This is how to output an insn to pop a register from the stack.
+   It need not be very fast code.  */
+
+#define ASM_OUTPUT_REG_POP(FILE, REGNO)  \
+do {									\
+  if (TARGET_64BIT)							\
+    asm_fprintf ((FILE), "\tpop{q}\t%%r%s\n",				\
+		 reg_names[(REGNO)] + (REX_INT_REGNO_P (REGNO) != 0));	\
+  else									\
+    asm_fprintf ((FILE), "\tpop{l}\t%%e%s\n", reg_names[(REGNO)]);	\
+} while (0)
+
+/* This is how to output an element of a case-vector that is absolute.  */
+
+#define ASM_OUTPUT_ADDR_VEC_ELT(FILE, VALUE)  \
+  ix86_output_addr_vec_elt ((FILE), (VALUE))
+
+/* This is how to output an element of a case-vector that is relative.  */
+
+#define ASM_OUTPUT_ADDR_DIFF_ELT(FILE, BODY, VALUE, REL) \
+  ix86_output_addr_diff_elt ((FILE), (VALUE), (REL))
+
+/* When we see %v, we will print the 'v' prefix if TARGET_AVX is true.  */
+
+#define ASM_OUTPUT_AVX_PREFIX(STREAM, PTR)	\
+{						\
+  if ((PTR)[0] == '%' && (PTR)[1] == 'v')	\
+    (PTR) += TARGET_AVX ? 1 : 2;		\
+}
+
+/* A C statement or statements which output an assembler instruction
+   opcode to the stdio stream STREAM.  The macro-operand PTR is a
+   variable of type `char *' which points to the opcode name in
+   its "internal" form--the form that is written in the machine
+   description.  */
+
+#define ASM_OUTPUT_OPCODE(STREAM, PTR) \
+  ASM_OUTPUT_AVX_PREFIX ((STREAM), (PTR))
+
+/* A C statement to output to the stdio stream FILE an assembler
+   command to pad the location counter to a multiple of 1<<LOG
+   bytes if it is within MAX_SKIP bytes.  */
+
+#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
+#undef  ASM_OUTPUT_MAX_SKIP_PAD
+#define ASM_OUTPUT_MAX_SKIP_PAD(FILE, LOG, MAX_SKIP)			\
+  if ((LOG) != 0)							\
+    {									\
+      if ((MAX_SKIP) == 0)						\
+        fprintf ((FILE), "\t.p2align %d\n", (LOG));			\
+      else								\
+        fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP));	\
+    }
+#endif
+
+/* Write the extra assembler code needed to declare a function
+   properly.  */
+
+#undef ASM_OUTPUT_FUNCTION_LABEL
+#define ASM_OUTPUT_FUNCTION_LABEL(FILE, NAME, DECL) \
+  ix86_asm_output_function_label (FILE, NAME, DECL)
+
+/* Under some conditions we need jump tables in the text section,
+   because the assembler cannot handle label differences between
+   sections.  This is the case for x86_64 on Mach-O for example.  */
+
+#define JUMP_TABLES_IN_TEXT_SECTION \
+  (flag_pic && ((TARGET_MACHO && TARGET_64BIT) \
+   || (!TARGET_64BIT && !HAVE_AS_GOTOFF_IN_DATA)))
+
+/* Switch to init or fini section via SECTION_OP, emit a call to FUNC,
+   and switch back.  For x86 we do this only to save a few bytes that
+   would otherwise be unused in the text section.  */
+#define CRT_MKSTR2(VAL) #VAL
+#define CRT_MKSTR(x) CRT_MKSTR2(x)
+
+#define CRT_CALL_STATIC_FUNCTION(SECTION_OP, FUNC)		\
+   asm (SECTION_OP "\n\t"					\
+	"call " CRT_MKSTR(__USER_LABEL_PREFIX__) #FUNC "\n"	\
+	TEXT_SECTION_ASM_OP);
+
+/* Default threshold for putting data in large sections
+   with x86-64 medium memory model */
+#define DEFAULT_LARGE_SECTION_THRESHOLD 65536
+
+/* Which processor to tune code generation for.  These must be in sync
+   with processor_target_table in i386.c.  */ 
+
+enum processor_type
+{
+  PROCESSOR_GENERIC = 0,
+  PROCESSOR_I386,			/* 80386 */
+  PROCESSOR_I486,			/* 80486DX, 80486SX, 80486DX[24] */
+  PROCESSOR_PENTIUM,
+  PROCESSOR_PENTIUMPRO,
+  PROCESSOR_PENTIUM4,
+  PROCESSOR_NOCONA,
+  PROCESSOR_CORE2,
+  PROCESSOR_NEHALEM,
+  PROCESSOR_SANDYBRIDGE,
+  PROCESSOR_HASWELL,
+  PROCESSOR_BONNELL,
+  PROCESSOR_SILVERMONT,
+  PROCESSOR_INTEL,
+  PROCESSOR_GEODE,
+  PROCESSOR_K6,
+  PROCESSOR_ATHLON,
+  PROCESSOR_K8,
+  PROCESSOR_AMDFAM10,
+  PROCESSOR_BDVER1,
+  PROCESSOR_BDVER2,
+  PROCESSOR_BDVER3,
+  PROCESSOR_BDVER4,
+  PROCESSOR_BTVER1,
+  PROCESSOR_BTVER2,
+  PROCESSOR_max
+};
+
+extern enum processor_type ix86_tune;
+extern enum processor_type ix86_arch;
+
+/* Size of the RED_ZONE area.  */
+#define RED_ZONE_SIZE 128
+/* Reserved area of the red zone for temporaries.  */
+#define RED_ZONE_RESERVE 8
+
+extern unsigned int ix86_preferred_stack_boundary;
+extern unsigned int ix86_incoming_stack_boundary;
+
+/* Smallest class containing REGNO.  */
+extern enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER];
+
+enum ix86_fpcmp_strategy {
+  IX86_FPCMP_SAHF,
+  IX86_FPCMP_COMI,
+  IX86_FPCMP_ARITH
+};
+
+/* To properly truncate FP values into integers, we need to set i387 control
+   word.  We can't emit proper mode switching code before reload, as spills
+   generated by reload may truncate values incorrectly, but we still can avoid
+   redundant computation of new control word by the mode switching pass.
+   The fldcw instructions are still emitted redundantly, but this is probably
+   not going to be noticeable problem, as most CPUs do have fast path for
+   the sequence.
+
+   The machinery is to emit simple truncation instructions and split them
+   before reload to instructions having USEs of two memory locations that
+   are filled by this code to old and new control word.
+
+   Post-reload pass may be later used to eliminate the redundant fildcw if
+   needed.  */
+
+enum ix86_entity
+{
+  AVX_U128 = 0,
+  I387_TRUNC,
+  I387_FLOOR,
+  I387_CEIL,
+  I387_MASK_PM,
+  MAX_386_ENTITIES
+};
+
+enum ix86_stack_slot
+{
+  SLOT_TEMP = 0,
+  SLOT_CW_STORED,
+  SLOT_CW_TRUNC,
+  SLOT_CW_FLOOR,
+  SLOT_CW_CEIL,
+  SLOT_CW_MASK_PM,
+  MAX_386_STACK_LOCALS
+};
+
+enum avx_u128_state
+{
+  AVX_U128_CLEAN,
+  AVX_U128_DIRTY,
+  AVX_U128_ANY
+};
+
+/* Define this macro if the port needs extra instructions inserted
+   for mode switching in an optimizing compilation.  */
+
+#define OPTIMIZE_MODE_SWITCHING(ENTITY) \
+   ix86_optimize_mode_switching[(ENTITY)]
+
+/* If you define `OPTIMIZE_MODE_SWITCHING', you have to define this as
+   initializer for an array of integers.  Each initializer element N
+   refers to an entity that needs mode switching, and specifies the
+   number of different modes that might need to be set for this
+   entity.  The position of the initializer in the initializer -
+   starting counting at zero - determines the integer that is used to
+   refer to the mode-switched entity in question.  */
+
+#define NUM_MODES_FOR_MODE_SWITCHING \
+  { AVX_U128_ANY, I387_CW_ANY, I387_CW_ANY, I387_CW_ANY, I387_CW_ANY }
+
+/* ENTITY is an integer specifying a mode-switched entity.  If
+   `OPTIMIZE_MODE_SWITCHING' is defined, you must define this macro to
+   return an integer value not larger than the corresponding element
+   in `NUM_MODES_FOR_MODE_SWITCHING', to denote the mode that ENTITY
+   must be switched into prior to the execution of INSN.  */
+
+#define MODE_NEEDED(ENTITY, I) ix86_mode_needed ((ENTITY), (I))
+
+/* If this macro is defined, it is evaluated for every INSN during
+   mode switching.  It determines the mode that an insn results in (if
+   different from the incoming mode).  */
+
+#define MODE_AFTER(ENTITY, MODE, I) ix86_mode_after ((ENTITY), (MODE), (I))
+
+/* If this macro is defined, it is evaluated for every ENTITY that
+   needs mode switching.  It should evaluate to an integer, which is
+   a mode that ENTITY is assumed to be switched to at function entry.  */
+
+#define MODE_ENTRY(ENTITY) ix86_mode_entry (ENTITY)
+
+/* If this macro is defined, it is evaluated for every ENTITY that
+   needs mode switching.  It should evaluate to an integer, which is
+   a mode that ENTITY is assumed to be switched to at function exit.  */
+
+#define MODE_EXIT(ENTITY) ix86_mode_exit (ENTITY)
+
+/* This macro specifies the order in which modes for ENTITY are
+   processed.  0 is the highest priority.  */
+
+#define MODE_PRIORITY_TO_MODE(ENTITY, N) (N)
+
+/* Generate one or more insns to set ENTITY to MODE.  HARD_REG_LIVE
+   is the set of hard registers live at the point where the insn(s)
+   are to be inserted.  */
+
+#define EMIT_MODE_SET(ENTITY, MODE, HARD_REGS_LIVE) \
+  ix86_emit_mode_set ((ENTITY), (MODE), (HARD_REGS_LIVE))
+
+/* Avoid renaming of stack registers, as doing so in combination with
+   scheduling just increases amount of live registers at time and in
+   the turn amount of fxch instructions needed.
+
+   ??? Maybe Pentium chips benefits from renaming, someone can try....
+
+   Don't rename evex to non-evex sse registers.  */
+
+#define HARD_REGNO_RENAME_OK(SRC, TARGET) (!STACK_REGNO_P (SRC) &&	 \
+					   (EXT_REX_SSE_REGNO_P (SRC) == \
+					    EXT_REX_SSE_REGNO_P (TARGET)))
+
+
+#define FASTCALL_PREFIX '@'
+
+/* Machine specific frame tracking during prologue/epilogue generation.  */
+
+#ifndef USED_FOR_TARGET
+struct GTY(()) machine_frame_state
+{
+  /* This pair tracks the currently active CFA as reg+offset.  When reg
+     is drap_reg, we don't bother trying to record here the real CFA when
+     it might really be a DW_CFA_def_cfa_expression.  */
+  rtx cfa_reg;
+  HOST_WIDE_INT cfa_offset;
+
+  /* The current offset (canonically from the CFA) of ESP and EBP.
+     When stack frame re-alignment is active, these may not be relative
+     to the CFA.  However, in all cases they are relative to the offsets
+     of the saved registers stored in ix86_frame.  */
+  HOST_WIDE_INT sp_offset;
+  HOST_WIDE_INT fp_offset;
+
+  /* The size of the red-zone that may be assumed for the purposes of
+     eliding register restore notes in the epilogue.  This may be zero
+     if no red-zone is in effect, or may be reduced from the real
+     red-zone value by a maximum runtime stack re-alignment value.  */
+  int red_zone_offset;
+
+  /* Indicate whether each of ESP, EBP or DRAP currently holds a valid
+     value within the frame.  If false then the offset above should be
+     ignored.  Note that DRAP, if valid, *always* points to the CFA and
+     thus has an offset of zero.  */
+  BOOL_BITFIELD sp_valid : 1;
+  BOOL_BITFIELD fp_valid : 1;
+  BOOL_BITFIELD drap_valid : 1;
+
+  /* Indicate whether the local stack frame has been re-aligned.  When
+     set, the SP/FP offsets above are relative to the aligned frame
+     and not the CFA.  */
+  BOOL_BITFIELD realigned : 1;
+};
+
+/* Private to winnt.c.  */
+struct seh_frame_state;
+
+struct GTY(()) machine_function {
+  struct stack_local_entry *stack_locals;
+  const char *some_ld_name;
+  int varargs_gpr_size;
+  int varargs_fpr_size;
+  int optimize_mode_switching[MAX_386_ENTITIES];
+
+  /* Number of saved registers USE_FAST_PROLOGUE_EPILOGUE
+     has been computed for.  */
+  int use_fast_prologue_epilogue_nregs;
+
+  /* For -fsplit-stack support: A stack local which holds a pointer to
+     the stack arguments for a function with a variable number of
+     arguments.  This is set at the start of the function and is used
+     to initialize the overflow_arg_area field of the va_list
+     structure.  */
+  rtx split_stack_varargs_pointer;
+
+  /* This value is used for amd64 targets and specifies the current abi
+     to be used. MS_ABI means ms abi. Otherwise SYSV_ABI means sysv abi.  */
+  ENUM_BITFIELD(calling_abi) call_abi : 8;
+
+  /* Nonzero if the function accesses a previous frame.  */
+  BOOL_BITFIELD accesses_prev_frame : 1;
+
+  /* Nonzero if the function requires a CLD in the prologue.  */
+  BOOL_BITFIELD needs_cld : 1;
+
+  /* Set by ix86_compute_frame_layout and used by prologue/epilogue
+     expander to determine the style used.  */
+  BOOL_BITFIELD use_fast_prologue_epilogue : 1;
+
+  /* If true, the current function needs the default PIC register, not
+     an alternate register (on x86) and must not use the red zone (on
+     x86_64), even if it's a leaf function.  We don't want the
+     function to be regarded as non-leaf because TLS calls need not
+     affect register allocation.  This flag is set when a TLS call
+     instruction is expanded within a function, and never reset, even
+     if all such instructions are optimized away.  Use the
+     ix86_current_function_calls_tls_descriptor macro for a better
+     approximation.  */
+  BOOL_BITFIELD tls_descriptor_call_expanded_p : 1;
+
+  /* If true, the current function has a STATIC_CHAIN is placed on the
+     stack below the return address.  */
+  BOOL_BITFIELD static_chain_on_stack : 1;
+
+  /* If true, it is safe to not save/restore DRAP register.  */
+  BOOL_BITFIELD no_drap_save_restore : 1;
+
+  /* During prologue/epilogue generation, the current frame state.
+     Otherwise, the frame state at the end of the prologue.  */
+  struct machine_frame_state fs;
+
+  /* During SEH output, this is non-null.  */
+  struct seh_frame_state * GTY((skip(""))) seh;
+};
+#endif
+
+#define ix86_stack_locals (cfun->machine->stack_locals)
+#define ix86_varargs_gpr_size (cfun->machine->varargs_gpr_size)
+#define ix86_varargs_fpr_size (cfun->machine->varargs_fpr_size)
+#define ix86_optimize_mode_switching (cfun->machine->optimize_mode_switching)
+#define ix86_current_function_needs_cld (cfun->machine->needs_cld)
+#define ix86_tls_descriptor_calls_expanded_in_cfun \
+  (cfun->machine->tls_descriptor_call_expanded_p)
+/* Since tls_descriptor_call_expanded is not cleared, even if all TLS
+   calls are optimized away, we try to detect cases in which it was
+   optimized away.  Since such instructions (use (reg REG_SP)), we can
+   verify whether there's any such instruction live by testing that
+   REG_SP is live.  */
+#define ix86_current_function_calls_tls_descriptor \
+  (ix86_tls_descriptor_calls_expanded_in_cfun && df_regs_ever_live_p (SP_REG))
+#define ix86_static_chain_on_stack (cfun->machine->static_chain_on_stack)
+
+/* Control behavior of x86_file_start.  */
+#define X86_FILE_START_VERSION_DIRECTIVE false
+#define X86_FILE_START_FLTUSED false
+
+/* Flag to mark data that is in the large address area.  */
+#define SYMBOL_FLAG_FAR_ADDR		(SYMBOL_FLAG_MACH_DEP << 0)
+#define SYMBOL_REF_FAR_ADDR_P(X)	\
+	((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_FAR_ADDR) != 0)
+
+/* Flags to mark dllimport/dllexport.  Used by PE ports, but handy to
+   have defined always, to avoid ifdefing.  */
+#define SYMBOL_FLAG_DLLIMPORT		(SYMBOL_FLAG_MACH_DEP << 1)
+#define SYMBOL_REF_DLLIMPORT_P(X) \
+	((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_DLLIMPORT) != 0)
+
+#define SYMBOL_FLAG_DLLEXPORT		(SYMBOL_FLAG_MACH_DEP << 2)
+#define SYMBOL_REF_DLLEXPORT_P(X) \
+	((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_DLLEXPORT) != 0)
+
+#define SYMBOL_FLAG_STUBVAR	(SYMBOL_FLAG_MACH_DEP << 4)
+#define SYMBOL_REF_STUBVAR_P(X) \
+	((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_STUBVAR) != 0)
+
+extern void debug_ready_dispatch (void);
+extern void debug_dispatch_window (int);
+
+/* The value at zero is only defined for the BMI instructions
+   LZCNT and TZCNT, not the BSR/BSF insns in the original isa.  */
+#define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
+	((VALUE) = GET_MODE_BITSIZE (MODE), TARGET_BMI)
+#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
+	((VALUE) = GET_MODE_BITSIZE (MODE), TARGET_LZCNT)
+
+
+/* Flags returned by ix86_get_callcvt ().  */
+#define IX86_CALLCVT_CDECL	0x1
+#define IX86_CALLCVT_STDCALL	0x2
+#define IX86_CALLCVT_FASTCALL	0x4
+#define IX86_CALLCVT_THISCALL	0x8
+#define IX86_CALLCVT_REGPARM	0x10
+#define IX86_CALLCVT_SSEREGPARM	0x20
+
+#define IX86_BASE_CALLCVT(FLAGS) \
+	((FLAGS) & (IX86_CALLCVT_CDECL | IX86_CALLCVT_STDCALL \
+		    | IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL))
+
+#define RECIP_MASK_NONE		0x00
+#define RECIP_MASK_DIV		0x01
+#define RECIP_MASK_SQRT		0x02
+#define RECIP_MASK_VEC_DIV	0x04
+#define RECIP_MASK_VEC_SQRT	0x08
+#define RECIP_MASK_ALL	(RECIP_MASK_DIV | RECIP_MASK_SQRT \
+			 | RECIP_MASK_VEC_DIV | RECIP_MASK_VEC_SQRT)
+#define RECIP_MASK_DEFAULT (RECIP_MASK_VEC_DIV | RECIP_MASK_VEC_SQRT)
+
+#define TARGET_RECIP_DIV	((recip_mask & RECIP_MASK_DIV) != 0)
+#define TARGET_RECIP_SQRT	((recip_mask & RECIP_MASK_SQRT) != 0)
+#define TARGET_RECIP_VEC_DIV	((recip_mask & RECIP_MASK_VEC_DIV) != 0)
+#define TARGET_RECIP_VEC_SQRT	((recip_mask & RECIP_MASK_VEC_SQRT) != 0)
+
+#define IX86_HLE_ACQUIRE (1 << 16)
+#define IX86_HLE_RELEASE (1 << 17)
+
+/* For switching between functions with different target attributes.  */
+#define SWITCHABLE_TARGET 1
+
+/*
+Local variables:
+version-control: t
+End:
+*/
diff --git a/gcc-4.9/gcc/config/i386/i386.md b/gcc-4.9/gcc/config/i386/i386.md
new file mode 100644
index 000000000..4a8b46388
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386.md
@@ -0,0 +1,18044 @@
+;; GCC machine description for IA-32 and x86-64.
+;; Copyright (C) 1988-2014 Free Software Foundation, Inc.
+;; Mostly by William Schelter.
+;; x86_64 support added by Jan Hubicka
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+;;
+;; The original PO technology requires these to be ordered by speed,
+;; so that assigner will pick the fastest.
+;;
+;; See file "rtl.def" for documentation on define_insn, match_*, et. al.
+;;
+;; The special asm out single letter directives following a '%' are:
+;; L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
+;; C -- print opcode suffix for set/cmov insn.
+;; c -- like C, but print reversed condition
+;; F,f -- likewise, but for floating-point.
+;; O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
+;;      otherwise nothing
+;; R -- print the prefix for register names.
+;; z -- print the opcode suffix for the size of the current operand.
+;; Z -- likewise, with special suffixes for x87 instructions.
+;; * -- print a star (in certain assembler syntax)
+;; A -- print an absolute memory reference.
+;; E -- print address with DImode register names if TARGET_64BIT.
+;; w -- print the operand as if it's a "word" (HImode) even if it isn't.
+;; s -- print a shift double count, followed by the assemblers argument
+;;	delimiter.
+;; b -- print the QImode name of the register for the indicated operand.
+;;	%b0 would print %al if operands[0] is reg 0.
+;; w --  likewise, print the HImode name of the register.
+;; k --  likewise, print the SImode name of the register.
+;; q --  likewise, print the DImode name of the register.
+;; x --  likewise, print the V4SFmode name of the register.
+;; t --  likewise, print the V8SFmode name of the register.
+;; h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
+;; y -- print "st(0)" instead of "st" as a register.
+;; d -- print duplicated register operand for AVX instruction.
+;; D -- print condition for SSE cmp instruction.
+;; P -- if PIC, print an @PLT suffix.
+;; p -- print raw symbol name.
+;; X -- don't print any sort of PIC '@' suffix for a symbol.
+;; & -- print some in-use local-dynamic symbol name.
+;; H -- print a memory address offset by 8; used for sse high-parts
+;; K -- print HLE lock prefix
+;; Y -- print condition for XOP pcom* instruction.
+;; + -- print a branch hint as 'cs' or 'ds' prefix
+;; ; -- print a semicolon (after prefixes due to bug in older gas).
+;; ~ -- print "i" if TARGET_AVX2, "f" otherwise.
+;; @ -- print a segment register of thread base pointer load
+;; ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
+
+(define_c_enum "unspec" [
+  ;; Relocation specifiers
+  UNSPEC_GOT
+  UNSPEC_GOTOFF
+  UNSPEC_GOTPCREL
+  UNSPEC_GOTTPOFF
+  UNSPEC_TPOFF
+  UNSPEC_NTPOFF
+  UNSPEC_DTPOFF
+  UNSPEC_GOTNTPOFF
+  UNSPEC_INDNTPOFF
+  UNSPEC_PLTOFF
+  UNSPEC_MACHOPIC_OFFSET
+  UNSPEC_PCREL
+
+  ;; Prologue support
+  UNSPEC_STACK_ALLOC
+  UNSPEC_SET_GOT
+  UNSPEC_SET_RIP
+  UNSPEC_SET_GOT_OFFSET
+  UNSPEC_MEMORY_BLOCKAGE
+  UNSPEC_STACK_CHECK
+
+  ;; TLS support
+  UNSPEC_TP
+  UNSPEC_TLS_GD
+  UNSPEC_TLS_LD_BASE
+  UNSPEC_TLSDESC
+  UNSPEC_TLS_IE_SUN
+
+  ;; Other random patterns
+  UNSPEC_SCAS
+  UNSPEC_FNSTSW
+  UNSPEC_SAHF
+  UNSPEC_PARITY
+  UNSPEC_FSTCW
+  UNSPEC_ADD_CARRY
+  UNSPEC_FLDCW
+  UNSPEC_REP
+  UNSPEC_LD_MPIC	; load_macho_picbase
+  UNSPEC_TRUNC_NOOP
+  UNSPEC_DIV_ALREADY_SPLIT
+  UNSPEC_MS_TO_SYSV_CALL
+  UNSPEC_PAUSE
+  UNSPEC_LEA_ADDR
+  UNSPEC_XBEGIN_ABORT
+  UNSPEC_STOS
+
+  ;; For SSE/MMX support:
+  UNSPEC_FIX_NOTRUNC
+  UNSPEC_MASKMOV
+  UNSPEC_MOVMSK
+  UNSPEC_RCP
+  UNSPEC_RSQRT
+  UNSPEC_PSADBW
+
+  ;; Generic math support
+  UNSPEC_COPYSIGN
+  UNSPEC_IEEE_MIN	; not commutative
+  UNSPEC_IEEE_MAX	; not commutative
+
+  ;; x87 Floating point
+  UNSPEC_SIN
+  UNSPEC_COS
+  UNSPEC_FPATAN
+  UNSPEC_FYL2X
+  UNSPEC_FYL2XP1
+  UNSPEC_FRNDINT
+  UNSPEC_FIST
+  UNSPEC_F2XM1
+  UNSPEC_TAN
+  UNSPEC_FXAM
+
+  ;; x87 Rounding
+  UNSPEC_FRNDINT_FLOOR
+  UNSPEC_FRNDINT_CEIL
+  UNSPEC_FRNDINT_TRUNC
+  UNSPEC_FRNDINT_MASK_PM
+  UNSPEC_FIST_FLOOR
+  UNSPEC_FIST_CEIL
+
+  ;; x87 Double output FP
+  UNSPEC_SINCOS_COS
+  UNSPEC_SINCOS_SIN
+  UNSPEC_XTRACT_FRACT
+  UNSPEC_XTRACT_EXP
+  UNSPEC_FSCALE_FRACT
+  UNSPEC_FSCALE_EXP
+  UNSPEC_FPREM_F
+  UNSPEC_FPREM_U
+  UNSPEC_FPREM1_F
+  UNSPEC_FPREM1_U
+
+  UNSPEC_C2_FLAG
+  UNSPEC_FXAM_MEM
+
+  ;; SSP patterns
+  UNSPEC_SP_SET
+  UNSPEC_SP_TEST
+  UNSPEC_SP_TLS_SET
+  UNSPEC_SP_TLS_TEST
+
+  ;; For ROUND support
+  UNSPEC_ROUND
+
+  ;; For CRC32 support
+  UNSPEC_CRC32
+
+  ;; For BMI support
+  UNSPEC_BEXTR
+
+  ;; For BMI2 support
+  UNSPEC_PDEP
+  UNSPEC_PEXT
+
+  ;; For AVX512F support
+  UNSPEC_KMOV
+])
+
+(define_c_enum "unspecv" [
+  UNSPECV_BLOCKAGE
+  UNSPECV_STACK_PROBE
+  UNSPECV_PROBE_STACK_RANGE
+  UNSPECV_ALIGN
+  UNSPECV_PROLOGUE_USE
+  UNSPECV_SPLIT_STACK_RETURN
+  UNSPECV_CLD
+  UNSPECV_NOPS
+  UNSPECV_RDTSC
+  UNSPECV_RDTSCP
+  UNSPECV_RDPMC
+  UNSPECV_LLWP_INTRINSIC
+  UNSPECV_SLWP_INTRINSIC
+  UNSPECV_LWPVAL_INTRINSIC
+  UNSPECV_LWPINS_INTRINSIC
+  UNSPECV_RDFSBASE
+  UNSPECV_RDGSBASE
+  UNSPECV_WRFSBASE
+  UNSPECV_WRGSBASE
+  UNSPECV_FXSAVE
+  UNSPECV_FXRSTOR
+  UNSPECV_FXSAVE64
+  UNSPECV_FXRSTOR64
+  UNSPECV_XSAVE
+  UNSPECV_XRSTOR
+  UNSPECV_XSAVE64
+  UNSPECV_XRSTOR64
+  UNSPECV_XSAVEOPT
+  UNSPECV_XSAVEOPT64
+
+  ;; For atomic compound assignments.
+  UNSPECV_FNSTENV
+  UNSPECV_FLDENV
+  UNSPECV_FNSTSW
+  UNSPECV_FNCLEX
+
+  ;; For RDRAND support
+  UNSPECV_RDRAND
+
+  ;; For RDSEED support
+  UNSPECV_RDSEED
+
+  ;; For RTM support
+  UNSPECV_XBEGIN
+  UNSPECV_XEND
+  UNSPECV_XABORT
+  UNSPECV_XTEST
+
+  UNSPECV_NLGR
+])
+
+;; Constants to represent rounding modes in the ROUND instruction
+(define_constants
+  [(ROUND_FLOOR			0x1)
+   (ROUND_CEIL			0x2)
+   (ROUND_TRUNC			0x3)
+   (ROUND_MXCSR			0x4)
+   (ROUND_NO_EXC		0x8)
+  ])
+
+;; Constants to represent AVX512F embeded rounding
+(define_constants
+  [(ROUND_NEAREST_INT			0)
+   (ROUND_NEG_INF			1)
+   (ROUND_POS_INF			2)
+   (ROUND_ZERO				3)
+   (NO_ROUND				4)
+   (ROUND_SAE				8)
+  ])
+
+;; Constants to represent pcomtrue/pcomfalse variants
+(define_constants
+  [(PCOM_FALSE			0)
+   (PCOM_TRUE			1)
+   (COM_FALSE_S			2)
+   (COM_FALSE_P			3)
+   (COM_TRUE_S			4)
+   (COM_TRUE_P			5)
+  ])
+
+;; Constants used in the XOP pperm instruction
+(define_constants
+  [(PPERM_SRC			0x00)	/* copy source */
+   (PPERM_INVERT		0x20)	/* invert source */
+   (PPERM_REVERSE		0x40)	/* bit reverse source */
+   (PPERM_REV_INV		0x60)	/* bit reverse & invert src */
+   (PPERM_ZERO			0x80)	/* all 0's */
+   (PPERM_ONES			0xa0)	/* all 1's */
+   (PPERM_SIGN			0xc0)	/* propagate sign bit */
+   (PPERM_INV_SIGN		0xe0)	/* invert & propagate sign */
+   (PPERM_SRC1			0x00)	/* use first source byte */
+   (PPERM_SRC2			0x10)	/* use second source byte */
+   ])
+
+;; Registers by name.
+(define_constants
+  [(AX_REG			 0)
+   (DX_REG			 1)
+   (CX_REG			 2)
+   (BX_REG			 3)
+   (SI_REG			 4)
+   (DI_REG			 5)
+   (BP_REG			 6)
+   (SP_REG			 7)
+   (ST0_REG			 8)
+   (ST1_REG			 9)
+   (ST2_REG			10)
+   (ST3_REG			11)
+   (ST4_REG			12)
+   (ST5_REG			13)
+   (ST6_REG			14)
+   (ST7_REG			15)
+   (FLAGS_REG			17)
+   (FPSR_REG			18)
+   (FPCR_REG			19)
+   (XMM0_REG			21)
+   (XMM1_REG			22)
+   (XMM2_REG			23)
+   (XMM3_REG			24)
+   (XMM4_REG			25)
+   (XMM5_REG			26)
+   (XMM6_REG			27)
+   (XMM7_REG			28)
+   (MM0_REG			29)
+   (MM1_REG			30)
+   (MM2_REG			31)
+   (MM3_REG			32)
+   (MM4_REG			33)
+   (MM5_REG			34)
+   (MM6_REG			35)
+   (MM7_REG			36)
+   (R8_REG			37)
+   (R9_REG			38)
+   (R10_REG			39)
+   (R11_REG			40)
+   (R12_REG			41)
+   (R13_REG			42)
+   (R14_REG			43)
+   (R15_REG			44)
+   (XMM8_REG			45)
+   (XMM9_REG			46)
+   (XMM10_REG			47)
+   (XMM11_REG			48)
+   (XMM12_REG			49)
+   (XMM13_REG			50)
+   (XMM14_REG			51)
+   (XMM15_REG			52)
+   (XMM16_REG			53)
+   (XMM17_REG			54)
+   (XMM18_REG			55)
+   (XMM19_REG			56)
+   (XMM20_REG			57)
+   (XMM21_REG			58)
+   (XMM22_REG			59)
+   (XMM23_REG			60)
+   (XMM24_REG			61)
+   (XMM25_REG			62)
+   (XMM26_REG			63)
+   (XMM27_REG			64)
+   (XMM28_REG			65)
+   (XMM29_REG			66)
+   (XMM30_REG			67)
+   (XMM31_REG			68)
+   (MASK0_REG			69)
+   (MASK1_REG			70)
+   (MASK2_REG			71)
+   (MASK3_REG			72)
+   (MASK4_REG			73)
+   (MASK5_REG			74)
+   (MASK6_REG			75)
+   (MASK7_REG			76)
+  ])
+
+;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
+;; from i386.c.
+
+;; In C guard expressions, put expressions which may be compile-time
+;; constants first.  This allows for better optimization.  For
+;; example, write "TARGET_64BIT && reload_completed", not
+;; "reload_completed && TARGET_64BIT".
+
+
+;; Processor type.
+(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,nehalem,
+		    atom,slm,generic,amdfam10,bdver1,bdver2,bdver3,bdver4,
+		    btver2"
+  (const (symbol_ref "ix86_schedule")))
+
+;; A basic instruction type.  Refinements due to arguments to be
+;; provided in other attributes.
+(define_attr "type"
+  "other,multi,
+   alu,alu1,negnot,imov,imovx,lea,
+   incdec,ishift,ishiftx,ishift1,rotate,rotatex,rotate1,
+   imul,imulx,idiv,icmp,test,ibr,setcc,icmov,
+   push,pop,call,callv,leave,
+   str,bitmanip,
+   fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,
+   fxch,fistp,fisttp,frndint,
+   sse,ssemov,sseadd,sseadd1,sseiadd,sseiadd1,
+   ssemul,sseimul,ssediv,sselog,sselog1,
+   sseishft,sseishft1,ssecmp,ssecomi,
+   ssecvt,ssecvt1,sseicvt,sseins,
+   sseshuf,sseshuf1,ssemuladd,sse4arg,
+   lwp,mskmov,msklog,
+   mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft"
+  (const_string "other"))
+
+;; Main data type used by the insn
+(define_attr "mode"
+  "unknown,none,QI,HI,SI,DI,TI,OI,XI,SF,DF,XF,TF,V16SF,V8SF,V4DF,V4SF,
+  V2DF,V2SF,V1DF,V8DF"
+  (const_string "unknown"))
+
+;; The CPU unit operations uses.
+(define_attr "unit" "integer,i387,sse,mmx,unknown"
+  (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,
+			  fxch,fistp,fisttp,frndint")
+	   (const_string "i387")
+	 (eq_attr "type" "sse,ssemov,sseadd,sseadd1,sseiadd,sseiadd1,
+			  ssemul,sseimul,ssediv,sselog,sselog1,
+			  sseishft,sseishft1,ssecmp,ssecomi,
+			  ssecvt,ssecvt1,sseicvt,sseins,
+			  sseshuf,sseshuf1,ssemuladd,sse4arg,mskmov")
+	   (const_string "sse")
+	 (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
+	   (const_string "mmx")
+	 (eq_attr "type" "other")
+	   (const_string "unknown")]
+	 (const_string "integer")))
+
+;; The minimum required alignment of vector mode memory operands of the SSE
+;; (non-VEX/EVEX) instruction in bits, if it is different from
+;; GET_MODE_ALIGNMENT of the operand, otherwise 0.  If an instruction has
+;; multiple alternatives, this should be conservative maximum of those minimum
+;; required alignments.
+(define_attr "ssememalign" "" (const_int 0))
+
+;; The (bounding maximum) length of an instruction immediate.
+(define_attr "length_immediate" ""
+  (cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave,
+			  bitmanip,imulx,msklog,mskmov")
+	   (const_int 0)
+	 (eq_attr "unit" "i387,sse,mmx")
+	   (const_int 0)
+	 (eq_attr "type" "alu,alu1,negnot,imovx,ishift,ishiftx,ishift1,
+			  rotate,rotatex,rotate1,imul,icmp,push,pop")
+	   (symbol_ref "ix86_attr_length_immediate_default (insn, true)")
+	 (eq_attr "type" "imov,test")
+	   (symbol_ref "ix86_attr_length_immediate_default (insn, false)")
+	 (eq_attr "type" "call")
+	   (if_then_else (match_operand 0 "constant_call_address_operand")
+	     (const_int 4)
+	     (const_int 0))
+	 (eq_attr "type" "callv")
+	   (if_then_else (match_operand 1 "constant_call_address_operand")
+	     (const_int 4)
+	     (const_int 0))
+	 ;; We don't know the size before shorten_branches.  Expect
+	 ;; the instruction to fit for better scheduling.
+	 (eq_attr "type" "ibr")
+	   (const_int 1)
+	 ]
+	 (symbol_ref "/* Update immediate_length and other attributes! */
+		      gcc_unreachable (),1")))
+
+;; The (bounding maximum) length of an instruction address.
+(define_attr "length_address" ""
+  (cond [(eq_attr "type" "str,other,multi,fxch")
+	   (const_int 0)
+	 (and (eq_attr "type" "call")
+	      (match_operand 0 "constant_call_address_operand"))
+	     (const_int 0)
+	 (and (eq_attr "type" "callv")
+	      (match_operand 1 "constant_call_address_operand"))
+	     (const_int 0)
+	 ]
+	 (symbol_ref "ix86_attr_length_address_default (insn)")))
+
+;; Set when length prefix is used.
+(define_attr "prefix_data16" ""
+  (cond [(eq_attr "type" "ssemuladd,sse4arg,sseiadd1,ssecvt1")
+	   (const_int 0)
+	 (eq_attr "mode" "HI")
+	   (const_int 1)
+	 (and (eq_attr "unit" "sse") (eq_attr "mode" "V2DF,TI"))
+	   (const_int 1)
+	]
+	(const_int 0)))
+
+;; Set when string REP prefix is used.
+(define_attr "prefix_rep" ""
+  (cond [(eq_attr "type" "ssemuladd,sse4arg,sseiadd1,ssecvt1")
+	   (const_int 0)
+	 (and (eq_attr "unit" "sse") (eq_attr "mode" "SF,DF"))
+	   (const_int 1)
+	]
+	(const_int 0)))
+
+;; Set when 0f opcode prefix is used.
+(define_attr "prefix_0f" ""
+  (if_then_else
+    (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip,msklog,mskmov")
+	 (eq_attr "unit" "sse,mmx"))
+    (const_int 1)
+    (const_int 0)))
+
+;; Set when REX opcode prefix is used.
+(define_attr "prefix_rex" ""
+  (cond [(not (match_test "TARGET_64BIT"))
+	   (const_int 0)
+	 (and (eq_attr "mode" "DI")
+	      (and (eq_attr "type" "!push,pop,call,callv,leave,ibr")
+		   (eq_attr "unit" "!mmx")))
+	   (const_int 1)
+	 (and (eq_attr "mode" "QI")
+	      (match_test "x86_extended_QIreg_mentioned_p (insn)"))
+	   (const_int 1)
+	 (match_test "x86_extended_reg_mentioned_p (insn)")
+	   (const_int 1)
+	 (and (eq_attr "type" "imovx")
+	      (match_operand:QI 1 "ext_QIreg_operand"))
+	   (const_int 1)
+	]
+	(const_int 0)))
+
+;; There are also additional prefixes in 3DNOW, SSSE3.
+;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte,
+;; sseiadd1,ssecvt1 to 0f7a with no DREX byte.
+;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a.
+(define_attr "prefix_extra" ""
+  (cond [(eq_attr "type" "ssemuladd,sse4arg")
+	   (const_int 2)
+	 (eq_attr "type" "sseiadd1,ssecvt1")
+	   (const_int 1)
+	]
+	(const_int 0)))
+
+;; Prefix used: original, VEX or maybe VEX.
+(define_attr "prefix" "orig,vex,maybe_vex,evex,maybe_evex"
+  (cond [(eq_attr "mode" "OI,V8SF,V4DF")
+           (const_string "vex")
+         (eq_attr "mode" "XI,V16SF,V8DF")
+           (const_string "evex")
+        ]
+        (const_string "orig")))
+
+;; VEX W bit is used.
+(define_attr "prefix_vex_w" "" (const_int 0))
+
+;; The length of VEX prefix
+;; Only instructions with 0f prefix can have 2 byte VEX prefix,
+;; 0f38/0f3a prefixes can't.  In i386.md 0f3[8a] is
+;; still prefix_0f 1, with prefix_extra 1.
+(define_attr "length_vex" ""
+  (if_then_else (and (eq_attr "prefix_0f" "1")
+		     (eq_attr "prefix_extra" "0"))
+    (if_then_else (eq_attr "prefix_vex_w" "1")
+      (symbol_ref "ix86_attr_length_vex_default (insn, true, true)")
+      (symbol_ref "ix86_attr_length_vex_default (insn, true, false)"))
+    (if_then_else (eq_attr "prefix_vex_w" "1")
+      (symbol_ref "ix86_attr_length_vex_default (insn, false, true)")
+      (symbol_ref "ix86_attr_length_vex_default (insn, false, false)"))))
+
+;; 4-bytes evex prefix and 1 byte opcode.
+(define_attr "length_evex" "" (const_int 5))
+
+;; Set when modrm byte is used.
+(define_attr "modrm" ""
+  (cond [(eq_attr "type" "str,leave")
+	   (const_int 0)
+	 (eq_attr "unit" "i387")
+	   (const_int 0)
+         (and (eq_attr "type" "incdec")
+	      (and (not (match_test "TARGET_64BIT"))
+		   (ior (match_operand:SI 1 "register_operand")
+			(match_operand:HI 1 "register_operand"))))
+	   (const_int 0)
+	 (and (eq_attr "type" "push")
+	      (not (match_operand 1 "memory_operand")))
+	   (const_int 0)
+	 (and (eq_attr "type" "pop")
+	      (not (match_operand 0 "memory_operand")))
+	   (const_int 0)
+	 (and (eq_attr "type" "imov")
+	      (and (not (eq_attr "mode" "DI"))
+		   (ior (and (match_operand 0 "register_operand")
+			     (match_operand 1 "immediate_operand"))
+		        (ior (and (match_operand 0 "ax_reg_operand")
+				  (match_operand 1 "memory_displacement_only_operand"))
+			     (and (match_operand 0 "memory_displacement_only_operand")
+				  (match_operand 1 "ax_reg_operand"))))))
+	   (const_int 0)
+	 (and (eq_attr "type" "call")
+	      (match_operand 0 "constant_call_address_operand"))
+	     (const_int 0)
+	 (and (eq_attr "type" "callv")
+	      (match_operand 1 "constant_call_address_operand"))
+	     (const_int 0)
+	 (and (eq_attr "type" "alu,alu1,icmp,test")
+	      (match_operand 0 "ax_reg_operand"))
+	     (symbol_ref "(get_attr_length_immediate (insn) <= (get_attr_mode (insn) != MODE_QI))")
+	 ]
+	 (const_int 1)))
+
+;; The (bounding maximum) length of an instruction in bytes.
+;; ??? fistp and frndint are in fact fldcw/{fistp,frndint}/fldcw sequences.
+;; Later we may want to split them and compute proper length as for
+;; other insns.
+(define_attr "length" ""
+  (cond [(eq_attr "type" "other,multi,fistp,frndint")
+	   (const_int 16)
+	 (eq_attr "type" "fcmp")
+	   (const_int 4)
+	 (eq_attr "unit" "i387")
+	   (plus (const_int 2)
+		 (plus (attr "prefix_data16")
+		       (attr "length_address")))
+	 (ior (eq_attr "prefix" "evex")
+	      (and (ior (eq_attr "prefix" "maybe_evex")
+			(eq_attr "prefix" "maybe_vex"))
+		   (match_test "TARGET_AVX512F")))
+	   (plus (attr "length_evex")
+		 (plus (attr "length_immediate")
+		       (plus (attr "modrm")
+			     (attr "length_address"))))
+	 (ior (eq_attr "prefix" "vex")
+	      (and (ior (eq_attr "prefix" "maybe_vex")
+			(eq_attr "prefix" "maybe_evex"))
+		   (match_test "TARGET_AVX")))
+	   (plus (attr "length_vex")
+		 (plus (attr "length_immediate")
+		       (plus (attr "modrm")
+			     (attr "length_address"))))]
+	 (plus (plus (attr "modrm")
+		     (plus (attr "prefix_0f")
+			   (plus (attr "prefix_rex")
+				 (plus (attr "prefix_extra")
+				       (const_int 1)))))
+	       (plus (attr "prefix_rep")
+		     (plus (attr "prefix_data16")
+			   (plus (attr "length_immediate")
+				 (attr "length_address")))))))
+
+;; The `memory' attribute is `none' if no memory is referenced, `load' or
+;; `store' if there is a simple memory reference therein, or `unknown'
+;; if the instruction is complex.
+
+(define_attr "memory" "none,load,store,both,unknown"
+  (cond [(eq_attr "type" "other,multi,str,lwp")
+	   (const_string "unknown")
+	 (eq_attr "type" "lea,fcmov,fpspc")
+	   (const_string "none")
+	 (eq_attr "type" "fistp,leave")
+	   (const_string "both")
+	 (eq_attr "type" "frndint")
+	   (const_string "load")
+	 (eq_attr "type" "push")
+	   (if_then_else (match_operand 1 "memory_operand")
+	     (const_string "both")
+	     (const_string "store"))
+	 (eq_attr "type" "pop")
+	   (if_then_else (match_operand 0 "memory_operand")
+	     (const_string "both")
+	     (const_string "load"))
+	 (eq_attr "type" "setcc")
+	   (if_then_else (match_operand 0 "memory_operand")
+	     (const_string "store")
+	     (const_string "none"))
+	 (eq_attr "type" "icmp,test,ssecmp,ssecomi,mmxcmp,fcmp")
+	   (if_then_else (ior (match_operand 0 "memory_operand")
+			      (match_operand 1 "memory_operand"))
+	     (const_string "load")
+	     (const_string "none"))
+	 (eq_attr "type" "ibr")
+	   (if_then_else (match_operand 0 "memory_operand")
+	     (const_string "load")
+	     (const_string "none"))
+	 (eq_attr "type" "call")
+	   (if_then_else (match_operand 0 "constant_call_address_operand")
+	     (const_string "none")
+	     (const_string "load"))
+	 (eq_attr "type" "callv")
+	   (if_then_else (match_operand 1 "constant_call_address_operand")
+	     (const_string "none")
+	     (const_string "load"))
+	 (and (eq_attr "type" "alu1,negnot,ishift1,sselog1,sseshuf1")
+	      (match_operand 1 "memory_operand"))
+	   (const_string "both")
+	 (and (match_operand 0 "memory_operand")
+	      (match_operand 1 "memory_operand"))
+	   (const_string "both")
+	 (match_operand 0 "memory_operand")
+	   (const_string "store")
+	 (match_operand 1 "memory_operand")
+	   (const_string "load")
+	 (and (eq_attr "type"
+		 "!alu1,negnot,ishift1,
+		   imov,imovx,icmp,test,bitmanip,
+		   fmov,fcmp,fsgn,
+		   sse,ssemov,ssecmp,ssecomi,ssecvt,ssecvt1,sseicvt,
+		   sselog1,sseshuf1,sseadd1,sseiadd1,sseishft1,
+		   mmx,mmxmov,mmxcmp,mmxcvt,mskmov,msklog")
+	      (match_operand 2 "memory_operand"))
+	   (const_string "load")
+	 (and (eq_attr "type" "icmov,ssemuladd,sse4arg")
+	      (match_operand 3 "memory_operand"))
+	   (const_string "load")
+	]
+	(const_string "none")))
+
+;; Indicates if an instruction has both an immediate and a displacement.
+
+(define_attr "imm_disp" "false,true,unknown"
+  (cond [(eq_attr "type" "other,multi")
+	   (const_string "unknown")
+	 (and (eq_attr "type" "icmp,test,imov,alu1,ishift1,rotate1")
+	      (and (match_operand 0 "memory_displacement_operand")
+		   (match_operand 1 "immediate_operand")))
+	   (const_string "true")
+	 (and (eq_attr "type" "alu,ishift,ishiftx,rotate,rotatex,imul,idiv")
+	      (and (match_operand 0 "memory_displacement_operand")
+		   (match_operand 2 "immediate_operand")))
+	   (const_string "true")
+	]
+	(const_string "false")))
+
+;; Indicates if an FP operation has an integer source.
+
+(define_attr "fp_int_src" "false,true"
+  (const_string "false"))
+
+;; Defines rounding mode of an FP operation.
+
+(define_attr "i387_cw" "trunc,floor,ceil,mask_pm,uninitialized,any"
+  (const_string "any"))
+
+;; Define attribute to classify add/sub insns that consumes carry flag (CF)
+(define_attr "use_carry" "0,1" (const_string "0"))
+
+;; Define attribute to indicate unaligned ssemov insns
+(define_attr "movu" "0,1" (const_string "0"))
+
+;; Used to control the "enabled" attribute on a per-instruction basis.
+(define_attr "isa" "base,x64,x64_sse4,x64_sse4_noavx,x64_avx,nox64,
+		    sse2,sse2_noavx,sse3,sse4,sse4_noavx,avx,noavx,
+		    avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f,fma_avx512f"
+  (const_string "base"))
+
+(define_attr "enabled" ""
+  (cond [(eq_attr "isa" "x64") (symbol_ref "TARGET_64BIT")
+	 (eq_attr "isa" "x64_sse4")
+	   (symbol_ref "TARGET_64BIT && TARGET_SSE4_1")
+	 (eq_attr "isa" "x64_sse4_noavx")
+	   (symbol_ref "TARGET_64BIT && TARGET_SSE4_1 && !TARGET_AVX")
+	 (eq_attr "isa" "x64_avx")
+	   (symbol_ref "TARGET_64BIT && TARGET_AVX")
+	 (eq_attr "isa" "nox64") (symbol_ref "!TARGET_64BIT")
+	 (eq_attr "isa" "sse2") (symbol_ref "TARGET_SSE2")
+	 (eq_attr "isa" "sse2_noavx")
+	   (symbol_ref "TARGET_SSE2 && !TARGET_AVX")
+	 (eq_attr "isa" "sse3") (symbol_ref "TARGET_SSE3")
+	 (eq_attr "isa" "sse4") (symbol_ref "TARGET_SSE4_1")
+	 (eq_attr "isa" "sse4_noavx")
+	   (symbol_ref "TARGET_SSE4_1 && !TARGET_AVX")
+	 (eq_attr "isa" "avx") (symbol_ref "TARGET_AVX")
+	 (eq_attr "isa" "noavx") (symbol_ref "!TARGET_AVX")
+	 (eq_attr "isa" "avx2") (symbol_ref "TARGET_AVX2")
+	 (eq_attr "isa" "noavx2") (symbol_ref "!TARGET_AVX2")
+	 (eq_attr "isa" "bmi") (symbol_ref "TARGET_BMI")
+	 (eq_attr "isa" "bmi2") (symbol_ref "TARGET_BMI2")
+	 (eq_attr "isa" "fma4") (symbol_ref "TARGET_FMA4")
+	 (eq_attr "isa" "fma") (symbol_ref "TARGET_FMA")
+	 (eq_attr "isa" "avx512f") (symbol_ref "TARGET_AVX512F")
+	 (eq_attr "isa" "noavx512f") (symbol_ref "!TARGET_AVX512F")
+	 (eq_attr "isa" "fma_avx512f")
+	   (symbol_ref "TARGET_FMA || TARGET_AVX512F")
+	]
+	(const_int 1)))
+
+;; Describe a user's asm statement.
+(define_asm_attributes
+  [(set_attr "length" "128")
+   (set_attr "type" "multi")])
+
+(define_code_iterator plusminus [plus minus])
+
+(define_code_iterator sat_plusminus [ss_plus us_plus ss_minus us_minus])
+
+(define_code_iterator multdiv [mult div])
+
+;; Base name for define_insn
+(define_code_attr plusminus_insn
+  [(plus "add") (ss_plus "ssadd") (us_plus "usadd")
+   (minus "sub") (ss_minus "sssub") (us_minus "ussub")])
+
+;; Base name for insn mnemonic.
+(define_code_attr plusminus_mnemonic
+  [(plus "add") (ss_plus "adds") (us_plus "addus")
+   (minus "sub") (ss_minus "subs") (us_minus "subus")])
+(define_code_attr plusminus_carry_mnemonic
+  [(plus "adc") (minus "sbb")])
+(define_code_attr multdiv_mnemonic
+  [(mult "mul") (div "div")])
+
+;; Mark commutative operators as such in constraints.
+(define_code_attr comm [(plus "%") (ss_plus "%") (us_plus "%")
+			(minus "") (ss_minus "") (us_minus "")])
+
+;; Mapping of max and min
+(define_code_iterator maxmin [smax smin umax umin])
+
+;; Mapping of signed max and min
+(define_code_iterator smaxmin [smax smin])
+
+;; Mapping of unsigned max and min
+(define_code_iterator umaxmin [umax umin])
+
+;; Base name for integer and FP insn mnemonic
+(define_code_attr maxmin_int [(smax "maxs") (smin "mins")
+			      (umax "maxu") (umin "minu")])
+(define_code_attr maxmin_float [(smax "max") (smin "min")])
+
+;; Mapping of logic operators
+(define_code_iterator any_logic [and ior xor])
+(define_code_iterator any_or [ior xor])
+(define_code_iterator fpint_logic [and xor])
+
+;; Base name for insn mnemonic.
+(define_code_attr logic [(and "and") (ior "or") (xor "xor")])
+
+;; Mapping of logic-shift operators
+(define_code_iterator any_lshift [ashift lshiftrt])
+
+;; Mapping of shift-right operators
+(define_code_iterator any_shiftrt [lshiftrt ashiftrt])
+
+;; Mapping of all shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
+;; Base name for define_insn
+(define_code_attr shift_insn
+  [(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")])
+
+;; Base name for insn mnemonic.
+(define_code_attr shift [(ashift "sll") (lshiftrt "shr") (ashiftrt "sar")])
+(define_code_attr vshift [(ashift "sll") (lshiftrt "srl") (ashiftrt "sra")])
+
+;; Mapping of rotate operators
+(define_code_iterator any_rotate [rotate rotatert])
+
+;; Base name for define_insn
+(define_code_attr rotate_insn [(rotate "rotl") (rotatert "rotr")])
+
+;; Base name for insn mnemonic.
+(define_code_attr rotate [(rotate "rol") (rotatert "ror")])
+
+;; Mapping of abs neg operators
+(define_code_iterator absneg [abs neg])
+
+;; Base name for x87 insn mnemonic.
+(define_code_attr absneg_mnemonic [(abs "abs") (neg "chs")])
+
+;; Used in signed and unsigned widening multiplications.
+(define_code_iterator any_extend [sign_extend zero_extend])
+
+;; Prefix for insn menmonic.
+(define_code_attr sgnprefix [(sign_extend "i") (zero_extend "")])
+
+;; Prefix for define_insn
+(define_code_attr u [(sign_extend "") (zero_extend "u")])
+(define_code_attr s [(sign_extend "s") (zero_extend "u")])
+(define_code_attr u_bool [(sign_extend "false") (zero_extend "true")])
+
+;; Used in signed and unsigned truncations.
+(define_code_iterator any_truncate [ss_truncate truncate us_truncate])
+;; Instruction suffix for truncations.
+(define_code_attr trunsuffix [(ss_truncate "s") (truncate "") (us_truncate "us")])
+
+;; Used in signed and unsigned fix.
+(define_code_iterator any_fix [fix unsigned_fix])
+(define_code_attr fixsuffix [(fix "") (unsigned_fix "u")])
+
+;; All integer modes.
+(define_mode_iterator SWI1248x [QI HI SI DI])
+
+;; All integer modes without QImode.
+(define_mode_iterator SWI248x [HI SI DI])
+
+;; All integer modes without QImode and HImode.
+(define_mode_iterator SWI48x [SI DI])
+
+;; All integer modes without SImode and DImode.
+(define_mode_iterator SWI12 [QI HI])
+
+;; All integer modes without DImode.
+(define_mode_iterator SWI124 [QI HI SI])
+
+;; All integer modes without QImode and DImode.
+(define_mode_iterator SWI24 [HI SI])
+
+;; Single word integer modes.
+(define_mode_iterator SWI [QI HI SI (DI "TARGET_64BIT")])
+
+;; Single word integer modes without QImode.
+(define_mode_iterator SWI248 [HI SI (DI "TARGET_64BIT")])
+
+;; Single word integer modes without QImode and HImode.
+(define_mode_iterator SWI48 [SI (DI "TARGET_64BIT")])
+
+;; All math-dependant single and double word integer modes.
+(define_mode_iterator SDWIM [(QI "TARGET_QIMODE_MATH")
+			     (HI "TARGET_HIMODE_MATH")
+			     SI DI (TI "TARGET_64BIT")])
+
+;; Math-dependant single word integer modes.
+(define_mode_iterator SWIM [(QI "TARGET_QIMODE_MATH")
+			    (HI "TARGET_HIMODE_MATH")
+			    SI (DI "TARGET_64BIT")])
+
+;; Math-dependant integer modes without DImode.
+(define_mode_iterator SWIM124 [(QI "TARGET_QIMODE_MATH")
+			       (HI "TARGET_HIMODE_MATH")
+			       SI])
+
+;; Math-dependant single word integer modes without QImode.
+(define_mode_iterator SWIM248 [(HI "TARGET_HIMODE_MATH")
+		      	       SI (DI "TARGET_64BIT")])
+
+;; Double word integer modes.
+(define_mode_iterator DWI [(DI "!TARGET_64BIT")
+			   (TI "TARGET_64BIT")])
+
+;; GET_MODE_SIZE for selected modes.  As GET_MODE_SIZE is not
+;; compile time constant, it is faster to use <MODE_SIZE> than
+;; GET_MODE_SIZE (<MODE>mode).  For XFmode which depends on
+;; command line options just use GET_MODE_SIZE macro.
+(define_mode_attr MODE_SIZE [(QI "1") (HI "2") (SI "4") (DI "8") (TI "16")
+			     (SF "4") (DF "8") (XF "GET_MODE_SIZE (XFmode)")
+			     (V16QI "16") (V32QI "32") (V64QI "64")
+			     (V8HI "16") (V16HI "32") (V32HI "64")
+			     (V4SI "16") (V8SI "32") (V16SI "64")
+			     (V2DI "16") (V4DI "32") (V8DI "64")
+			     (V1TI "16") (V2TI "32") (V4TI "64")
+			     (V2DF "16") (V4DF "32") (V8DF "64")
+			     (V4SF "16") (V8SF "32") (V16SF "64")])
+
+;; Double word integer modes as mode attribute.
+(define_mode_attr DWI [(QI "HI") (HI "SI") (SI "DI") (DI "TI")])
+(define_mode_attr dwi [(QI "hi") (HI "si") (SI "di") (DI "ti")])
+
+;; Half mode for double word integer modes.
+(define_mode_iterator DWIH [(SI "!TARGET_64BIT")
+			    (DI "TARGET_64BIT")])
+
+;; Instruction suffix for integer modes.
+(define_mode_attr imodesuffix [(QI "b") (HI "w") (SI "l") (DI "q")])
+
+;; Pointer size prefix for integer modes (Intel asm dialect)
+(define_mode_attr iptrsize [(QI "BYTE")
+			    (HI "WORD")
+			    (SI "DWORD")
+			    (DI "QWORD")])
+
+;; Register class for integer modes.
+(define_mode_attr r [(QI "q") (HI "r") (SI "r") (DI "r")])
+
+;; Immediate operand constraint for integer modes.
+(define_mode_attr i [(QI "n") (HI "n") (SI "e") (DI "e")])
+
+;; General operand constraint for word modes.
+(define_mode_attr g [(QI "qmn") (HI "rmn") (SI "rme") (DI "rme")])
+
+;; Immediate operand constraint for double integer modes.
+(define_mode_attr di [(SI "nF") (DI "e")])
+
+;; Immediate operand constraint for shifts.
+(define_mode_attr S [(QI "I") (HI "I") (SI "I") (DI "J") (TI "O")])
+
+;; General operand predicate for integer modes.
+(define_mode_attr general_operand
+	[(QI "general_operand")
+	 (HI "general_operand")
+	 (SI "x86_64_general_operand")
+	 (DI "x86_64_general_operand")
+	 (TI "x86_64_general_operand")])
+
+;; General sign/zero extend operand predicate for integer modes.
+(define_mode_attr general_szext_operand
+	[(QI "general_operand")
+	 (HI "general_operand")
+	 (SI "x86_64_szext_general_operand")
+	 (DI "x86_64_szext_general_operand")])
+
+;; Immediate operand predicate for integer modes.
+(define_mode_attr immediate_operand
+	[(QI "immediate_operand")
+	 (HI "immediate_operand")
+	 (SI "x86_64_immediate_operand")
+	 (DI "x86_64_immediate_operand")])
+
+;; Nonmemory operand predicate for integer modes.
+(define_mode_attr nonmemory_operand
+	[(QI "nonmemory_operand")
+	 (HI "nonmemory_operand")
+	 (SI "x86_64_nonmemory_operand")
+	 (DI "x86_64_nonmemory_operand")])
+
+;; Operand predicate for shifts.
+(define_mode_attr shift_operand
+	[(QI "nonimmediate_operand")
+	 (HI "nonimmediate_operand")
+	 (SI "nonimmediate_operand")
+	 (DI "shiftdi_operand")
+	 (TI "register_operand")])
+
+;; Operand predicate for shift argument.
+(define_mode_attr shift_immediate_operand
+	[(QI "const_1_to_31_operand")
+	 (HI "const_1_to_31_operand")
+	 (SI "const_1_to_31_operand")
+	 (DI "const_1_to_63_operand")])
+
+;; Input operand predicate for arithmetic left shifts.
+(define_mode_attr ashl_input_operand
+	[(QI "nonimmediate_operand")
+	 (HI "nonimmediate_operand")
+	 (SI "nonimmediate_operand")
+	 (DI "ashldi_input_operand")
+	 (TI "reg_or_pm1_operand")])
+
+;; SSE and x87 SFmode and DFmode floating point modes
+(define_mode_iterator MODEF [SF DF])
+
+;; All x87 floating point modes
+(define_mode_iterator X87MODEF [SF DF XF])
+
+;; SSE instruction suffix for various modes
+(define_mode_attr ssemodesuffix
+  [(SF "ss") (DF "sd")
+   (V16SF "ps") (V8DF "pd")
+   (V8SF "ps") (V4DF "pd")
+   (V4SF "ps") (V2DF "pd")
+   (V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")
+   (V32QI "b") (V16HI "w") (V8SI "d") (V4DI "q")
+   (V64QI "b") (V16SI "d") (V8DI "q")])
+
+;; SSE vector suffix for floating point modes
+(define_mode_attr ssevecmodesuffix [(SF "ps") (DF "pd")])
+
+;; SSE vector mode corresponding to a scalar mode
+(define_mode_attr ssevecmode
+  [(QI "V16QI") (HI "V8HI") (SI "V4SI") (DI "V2DI") (SF "V4SF") (DF "V2DF")])
+(define_mode_attr ssevecmodelower
+  [(QI "v16qi") (HI "v8hi") (SI "v4si") (DI "v2di") (SF "v4sf") (DF "v2df")])
+
+;; Instruction suffix for REX 64bit operators.
+(define_mode_attr rex64suffix [(SI "") (DI "{q}")])
+
+;; This mode iterator allows :P to be used for patterns that operate on
+;; pointer-sized quantities.  Exactly one of the two alternatives will match.
+(define_mode_iterator P [(SI "Pmode == SImode") (DI "Pmode == DImode")])
+
+;; This mode iterator allows :W to be used for patterns that operate on
+;; word_mode sized quantities.
+(define_mode_iterator W
+  [(SI "word_mode == SImode") (DI "word_mode == DImode")])
+
+;; This mode iterator allows :PTR to be used for patterns that operate on
+;; ptr_mode sized quantities.
+(define_mode_iterator PTR
+  [(SI "ptr_mode == SImode") (DI "ptr_mode == DImode")])
+
+;; Scheduling descriptions
+
+(include "pentium.md")
+(include "ppro.md")
+(include "k6.md")
+(include "athlon.md")
+(include "bdver1.md")
+(include "bdver3.md")
+(include "btver2.md")
+(include "geode.md")
+(include "atom.md")
+(include "slm.md")
+(include "core2.md")
+
+
+;; Operand and operator predicates and constraints
+
+(include "predicates.md")
+(include "constraints.md")
+
+
+;; Compare and branch/compare and store instructions.
+
+(define_expand "cbranch<mode>4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:SDWIM 1 "nonimmediate_operand")
+		    (match_operand:SDWIM 2 "<general_operand>")))
+   (set (pc) (if_then_else
+	       (match_operator 0 "ordered_comparison_operator"
+		[(reg:CC FLAGS_REG) (const_int 0)])
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  ""
+{
+  if (MEM_P (operands[1]) && MEM_P (operands[2]))
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
+(define_expand "cstore<mode>4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:SWIM 2 "nonimmediate_operand")
+		    (match_operand:SWIM 3 "<general_operand>")))
+   (set (match_operand:QI 0 "register_operand")
+	(match_operator 1 "ordered_comparison_operator"
+	  [(reg:CC FLAGS_REG) (const_int 0)]))]
+  ""
+{
+  if (MEM_P (operands[2]) && MEM_P (operands[3]))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+  ix86_expand_setcc (operands[0], GET_CODE (operands[1]),
+		     operands[2], operands[3]);
+  DONE;
+})
+
+(define_expand "cmp<mode>_1"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:SWI48 0 "nonimmediate_operand")
+		    (match_operand:SWI48 1 "<general_operand>")))])
+
+(define_insn "*cmp<mode>_ccno_1"
+  [(set (reg FLAGS_REG)
+	(compare (match_operand:SWI 0 "nonimmediate_operand" "<r>,?m<r>")
+		 (match_operand:SWI 1 "const0_operand")))]
+  "ix86_match_ccmode (insn, CCNOmode)"
+  "@
+   test{<imodesuffix>}\t%0, %0
+   cmp{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "test,icmp")
+   (set_attr "length_immediate" "0,1")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*cmp<mode>_1"
+  [(set (reg FLAGS_REG)
+	(compare (match_operand:SWI 0 "nonimmediate_operand" "<r>m,<r>")
+		 (match_operand:SWI 1 "<general_operand>" "<r><i>,<r>m")))]
+  "ix86_match_ccmode (insn, CCmode)"
+  "cmp{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "icmp")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*cmp<mode>_minus_1"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (minus:SWI (match_operand:SWI 0 "nonimmediate_operand" "<r>m,<r>")
+		     (match_operand:SWI 1 "<general_operand>" "<r><i>,<r>m"))
+	  (const_int 0)))]
+  "ix86_match_ccmode (insn, CCGOCmode)"
+  "cmp{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "icmp")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*cmpqi_ext_1"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (match_operand:QI 0 "nonimmediate_x64nomem_operand" "Q,m")
+	  (subreg:QI
+	    (zero_extract:SI
+	      (match_operand 1 "ext_register_operand" "Q,Q")
+	      (const_int 8)
+	      (const_int 8)) 0)))]
+  "ix86_match_ccmode (insn, CCmode)"
+  "cmp{b}\t{%h1, %0|%0, %h1}"
+  [(set_attr "isa" "*,nox64")
+   (set_attr "type" "icmp")
+   (set_attr "mode" "QI")])
+
+(define_insn "*cmpqi_ext_2"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (subreg:QI
+	    (zero_extract:SI
+	      (match_operand 0 "ext_register_operand" "Q")
+	      (const_int 8)
+	      (const_int 8)) 0)
+	  (match_operand:QI 1 "const0_operand")))]
+  "ix86_match_ccmode (insn, CCNOmode)"
+  "test{b}\t%h0, %h0"
+  [(set_attr "type" "test")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "QI")])
+
+(define_expand "cmpqi_ext_3"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC
+	  (subreg:QI
+	    (zero_extract:SI
+	      (match_operand 0 "ext_register_operand")
+	      (const_int 8)
+	      (const_int 8)) 0)
+	  (match_operand:QI 1 "const_int_operand")))])
+
+(define_insn "*cmpqi_ext_3"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (subreg:QI
+	    (zero_extract:SI
+	      (match_operand 0 "ext_register_operand" "Q,Q")
+	      (const_int 8)
+	      (const_int 8)) 0)
+	  (match_operand:QI 1 "general_x64nomem_operand" "Qn,m")))]
+  "ix86_match_ccmode (insn, CCmode)"
+  "cmp{b}\t{%1, %h0|%h0, %1}"
+  [(set_attr "isa" "*,nox64")
+   (set_attr "type" "icmp")
+   (set_attr "modrm" "1")
+   (set_attr "mode" "QI")])
+
+(define_insn "*cmpqi_ext_4"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (subreg:QI
+	    (zero_extract:SI
+	      (match_operand 0 "ext_register_operand" "Q")
+	      (const_int 8)
+	      (const_int 8)) 0)
+	  (subreg:QI
+	    (zero_extract:SI
+	      (match_operand 1 "ext_register_operand" "Q")
+	      (const_int 8)
+	      (const_int 8)) 0)))]
+  "ix86_match_ccmode (insn, CCmode)"
+  "cmp{b}\t{%h1, %h0|%h0, %h1}"
+  [(set_attr "type" "icmp")
+   (set_attr "mode" "QI")])
+
+;; These implement float point compares.
+;; %%% See if we can get away with VOIDmode operands on the actual insns,
+;; which would allow mix and match FP modes on the compares.  Which is what
+;; the old patterns did, but with many more of them.
+
+(define_expand "cbranchxf4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:XF 1 "nonmemory_operand")
+		    (match_operand:XF 2 "nonmemory_operand")))
+   (set (pc) (if_then_else
+              (match_operator 0 "ix86_fp_comparison_operator"
+               [(reg:CC FLAGS_REG)
+                (const_int 0)])
+              (label_ref (match_operand 3))
+              (pc)))]
+  "TARGET_80387"
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
+(define_expand "cstorexf4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:XF 2 "nonmemory_operand")
+		    (match_operand:XF 3 "nonmemory_operand")))
+   (set (match_operand:QI 0 "register_operand")
+              (match_operator 1 "ix86_fp_comparison_operator"
+               [(reg:CC FLAGS_REG)
+                (const_int 0)]))]
+  "TARGET_80387"
+{
+  ix86_expand_setcc (operands[0], GET_CODE (operands[1]),
+		     operands[2], operands[3]);
+  DONE;
+})
+
+(define_expand "cbranch<mode>4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:MODEF 1 "cmp_fp_expander_operand")
+		    (match_operand:MODEF 2 "cmp_fp_expander_operand")))
+   (set (pc) (if_then_else
+              (match_operator 0 "ix86_fp_comparison_operator"
+               [(reg:CC FLAGS_REG)
+                (const_int 0)])
+              (label_ref (match_operand 3))
+              (pc)))]
+  "TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
+(define_expand "cstore<mode>4"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (match_operand:MODEF 2 "cmp_fp_expander_operand")
+		    (match_operand:MODEF 3 "cmp_fp_expander_operand")))
+   (set (match_operand:QI 0 "register_operand")
+              (match_operator 1 "ix86_fp_comparison_operator"
+               [(reg:CC FLAGS_REG)
+                (const_int 0)]))]
+  "TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
+{
+  ix86_expand_setcc (operands[0], GET_CODE (operands[1]),
+		     operands[2], operands[3]);
+  DONE;
+})
+
+(define_expand "cbranchcc4"
+  [(set (pc) (if_then_else
+              (match_operator 0 "comparison_operator"
+               [(match_operand 1 "flags_reg_operand")
+                (match_operand 2 "const0_operand")])
+              (label_ref (match_operand 3))
+              (pc)))]
+  ""
+{
+  ix86_expand_branch (GET_CODE (operands[0]),
+		      operands[1], operands[2], operands[3]);
+  DONE;
+})
+
+(define_expand "cstorecc4"
+  [(set (match_operand:QI 0 "register_operand")
+              (match_operator 1 "comparison_operator"
+               [(match_operand 2 "flags_reg_operand")
+                (match_operand 3 "const0_operand")]))]
+  ""
+{
+  ix86_expand_setcc (operands[0], GET_CODE (operands[1]),
+		     operands[2], operands[3]);
+  DONE;
+})
+
+
+;; FP compares, step 1:
+;; Set the FP condition codes.
+;;
+;; CCFPmode	compare with exceptions
+;; CCFPUmode	compare with no exceptions
+
+;; We may not use "#" to split and emit these, since the REG_DEAD notes
+;; used to manage the reg stack popping would not be preserved.
+
+(define_insn "*cmp<mode>_0_i387"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+	(unspec:HI
+	  [(compare:CCFP
+	     (match_operand:X87MODEF 1 "register_operand" "f")
+	     (match_operand:X87MODEF 2 "const0_operand"))]
+	UNSPEC_FNSTSW))]
+  "TARGET_80387"
+  "* return output_fp_compare (insn, operands, false, false);"
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*cmp<mode>_0_cc_i387"
+  [(set (reg:CCFP FLAGS_REG)
+	(compare:CCFP
+	  (match_operand:X87MODEF 1 "register_operand" "f")
+	  (match_operand:X87MODEF 2 "const0_operand")))
+   (clobber (match_operand:HI 0 "register_operand" "=a"))]
+  "TARGET_80387 && TARGET_SAHF && !TARGET_CMOVE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(unspec:HI
+	  [(compare:CCFP (match_dup 1)(match_dup 2))]
+	UNSPEC_FNSTSW))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_dup 0)] UNSPEC_SAHF))]
+  ""
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*cmpxf_i387"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+	(unspec:HI
+	  [(compare:CCFP
+	     (match_operand:XF 1 "register_operand" "f")
+	     (match_operand:XF 2 "register_operand" "f"))]
+	  UNSPEC_FNSTSW))]
+  "TARGET_80387"
+  "* return output_fp_compare (insn, operands, false, false);"
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "mode" "XF")])
+
+(define_insn_and_split "*cmpxf_cc_i387"
+  [(set (reg:CCFP FLAGS_REG)
+	(compare:CCFP
+	  (match_operand:XF 1 "register_operand" "f")
+	  (match_operand:XF 2 "register_operand" "f")))
+   (clobber (match_operand:HI 0 "register_operand" "=a"))]
+  "TARGET_80387 && TARGET_SAHF && !TARGET_CMOVE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(unspec:HI
+	  [(compare:CCFP (match_dup 1)(match_dup 2))]
+	UNSPEC_FNSTSW))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_dup 0)] UNSPEC_SAHF))]
+  ""
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "mode" "XF")])
+
+(define_insn "*cmp<mode>_i387"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+	(unspec:HI
+	  [(compare:CCFP
+	     (match_operand:MODEF 1 "register_operand" "f")
+	     (match_operand:MODEF 2 "nonimmediate_operand" "fm"))]
+	  UNSPEC_FNSTSW))]
+  "TARGET_80387"
+  "* return output_fp_compare (insn, operands, false, false);"
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*cmp<mode>_cc_i387"
+  [(set (reg:CCFP FLAGS_REG)
+	(compare:CCFP
+	  (match_operand:MODEF 1 "register_operand" "f")
+	  (match_operand:MODEF 2 "nonimmediate_operand" "fm")))
+   (clobber (match_operand:HI 0 "register_operand" "=a"))]
+  "TARGET_80387 && TARGET_SAHF && !TARGET_CMOVE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(unspec:HI
+	  [(compare:CCFP (match_dup 1)(match_dup 2))]
+	UNSPEC_FNSTSW))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_dup 0)] UNSPEC_SAHF))]
+  ""
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*cmpu<mode>_i387"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+	(unspec:HI
+	  [(compare:CCFPU
+	     (match_operand:X87MODEF 1 "register_operand" "f")
+	     (match_operand:X87MODEF 2 "register_operand" "f"))]
+	  UNSPEC_FNSTSW))]
+  "TARGET_80387"
+  "* return output_fp_compare (insn, operands, false, true);"
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*cmpu<mode>_cc_i387"
+  [(set (reg:CCFPU FLAGS_REG)
+	(compare:CCFPU
+	  (match_operand:X87MODEF 1 "register_operand" "f")
+	  (match_operand:X87MODEF 2 "register_operand" "f")))
+   (clobber (match_operand:HI 0 "register_operand" "=a"))]
+  "TARGET_80387 && TARGET_SAHF && !TARGET_CMOVE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(unspec:HI
+	  [(compare:CCFPU (match_dup 1)(match_dup 2))]
+	UNSPEC_FNSTSW))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_dup 0)] UNSPEC_SAHF))]
+  ""
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*cmp<X87MODEF:mode>_<SWI24:mode>_i387"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+	(unspec:HI
+	  [(compare:CCFP
+	     (match_operand:X87MODEF 1 "register_operand" "f")
+	     (match_operator:X87MODEF 3 "float_operator"
+	       [(match_operand:SWI24 2 "memory_operand" "m")]))]
+	  UNSPEC_FNSTSW))]
+  "TARGET_80387
+   && (TARGET_USE_<SWI24:MODE>MODE_FIOP
+       || optimize_function_for_size_p (cfun))"
+  "* return output_fp_compare (insn, operands, false, false);"
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "fp_int_src" "true")
+   (set_attr "mode" "<SWI24:MODE>")])
+
+(define_insn_and_split "*cmp<X87MODEF:mode>_<SWI24:mode>_cc_i387"
+  [(set (reg:CCFP FLAGS_REG)
+	(compare:CCFP
+	  (match_operand:X87MODEF 1 "register_operand" "f")
+	  (match_operator:X87MODEF 3 "float_operator"
+	    [(match_operand:SWI24 2 "memory_operand" "m")])))
+   (clobber (match_operand:HI 0 "register_operand" "=a"))]
+  "TARGET_80387 && TARGET_SAHF && !TARGET_CMOVE
+   && (TARGET_USE_<SWI24:MODE>MODE_FIOP
+       || optimize_function_for_size_p (cfun))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+	(unspec:HI
+	  [(compare:CCFP
+	     (match_dup 1)
+	     (match_op_dup 3 [(match_dup 2)]))]
+	UNSPEC_FNSTSW))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_dup 0)] UNSPEC_SAHF))]
+  ""
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "fp_int_src" "true")
+   (set_attr "mode" "<SWI24:MODE>")])
+
+;; FP compares, step 2
+;; Move the fpsw to ax.
+
+(define_insn "x86_fnstsw_1"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+	(unspec:HI [(reg:CCFP FPSR_REG)] UNSPEC_FNSTSW))]
+  "TARGET_80387"
+  "fnstsw\t%0"
+  [(set (attr "length")
+	(symbol_ref "ix86_attr_length_address_default (insn) + 2"))
+   (set_attr "mode" "SI")
+   (set_attr "unit" "i387")])
+
+;; FP compares, step 3
+;; Get ax into flags, general case.
+
+(define_insn "x86_sahf_1"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_operand:HI 0 "register_operand" "a")]
+		   UNSPEC_SAHF))]
+  "TARGET_SAHF"
+{
+#ifndef HAVE_AS_IX86_SAHF
+  if (TARGET_64BIT)
+    return ASM_BYTE "0x9e";
+  else
+#endif
+  return "sahf";
+}
+  [(set_attr "length" "1")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "SI")])
+
+;; Pentium Pro can do steps 1 through 3 in one go.
+;; comi*, ucomi*, fcomi*, ficomi*, fucomi*
+;; (these i387 instructions set flags directly)
+
+(define_mode_iterator FPCMP [CCFP CCFPU])
+(define_mode_attr unord [(CCFP "") (CCFPU "u")])
+
+(define_insn "*cmpi<FPCMP:unord><MODEF:mode>_mixed"
+  [(set (reg:FPCMP FLAGS_REG)
+	(compare:FPCMP
+	  (match_operand:MODEF 0 "register_operand" "f,x")
+	  (match_operand:MODEF 1 "nonimmediate_operand" "f,xm")))]
+  "TARGET_MIX_SSE_I387
+   && SSE_FLOAT_MODE_P (<MODEF:MODE>mode)"
+  "* return output_fp_compare (insn, operands, true,
+			       <FPCMP:MODE>mode == CCFPUmode);"
+  [(set_attr "type" "fcmp,ssecomi")
+   (set_attr "prefix" "orig,maybe_vex")
+   (set_attr "mode" "<MODEF:MODE>")
+   (set (attr "prefix_rep")
+	(if_then_else (eq_attr "type" "ssecomi")
+		      (const_string "0")
+		      (const_string "*")))
+   (set (attr "prefix_data16")
+	(cond [(eq_attr "type" "fcmp")
+		 (const_string "*")
+	       (eq_attr "mode" "DF")
+		 (const_string "1")
+	      ]
+	      (const_string "0")))
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "bdver1_decode" "double")])
+
+(define_insn "*cmpi<FPCMP:unord><MODEF:mode>_sse"
+  [(set (reg:FPCMP FLAGS_REG)
+	(compare:FPCMP
+	  (match_operand:MODEF 0 "register_operand" "x")
+	  (match_operand:MODEF 1 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE_MATH
+   && SSE_FLOAT_MODE_P (<MODEF:MODE>mode)"
+  "* return output_fp_compare (insn, operands, true,
+			       <FPCMP:MODE>mode == CCFPUmode);"
+  [(set_attr "type" "ssecomi")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODEF:MODE>")
+   (set_attr "prefix_rep" "0")
+   (set (attr "prefix_data16")
+	(if_then_else (eq_attr "mode" "DF")
+		      (const_string "1")
+		      (const_string "0")))
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "bdver1_decode" "double")])
+
+(define_insn "*cmpi<FPCMP:unord><X87MODEF:mode>_i387"
+  [(set (reg:FPCMP FLAGS_REG)
+	(compare:FPCMP
+	  (match_operand:X87MODEF 0 "register_operand" "f")
+	  (match_operand:X87MODEF 1 "register_operand" "f")))]
+  "TARGET_80387 && TARGET_CMOVE
+   && !(SSE_FLOAT_MODE_P (<X87MODEF:MODE>mode) && TARGET_SSE_MATH)"
+  "* return output_fp_compare (insn, operands, true,
+			       <FPCMP:MODE>mode == CCFPUmode);"
+  [(set_attr "type" "fcmp")
+   (set_attr "mode" "<X87MODEF:MODE>")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "bdver1_decode" "double")])
+
+;; Push/pop instructions.
+
+(define_insn "*push<mode>2"
+  [(set (match_operand:DWI 0 "push_operand" "=<")
+	(match_operand:DWI 1 "general_no_elim_operand" "riF*o"))]
+  ""
+  "#"
+  [(set_attr "type" "multi")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:TI 0 "push_operand")
+        (match_operand:TI 1 "general_operand"))]
+  "TARGET_64BIT && reload_completed
+   && !SSE_REG_P (operands[1])"
+  [(const_int 0)]
+  "ix86_split_long_move (operands); DONE;")
+
+(define_insn "*pushdi2_rex64"
+  [(set (match_operand:DI 0 "push_operand" "=<,!<")
+	(match_operand:DI 1 "general_no_elim_operand" "re*m,n"))]
+  "TARGET_64BIT"
+  "@
+   push{q}\t%1
+   #"
+  [(set_attr "type" "push,multi")
+   (set_attr "mode" "DI")])
+
+;; Convert impossible pushes of immediate to existing instructions.
+;; First try to get scratch register and go through it.  In case this
+;; fails, push sign extended lower part first and then overwrite
+;; upper part by 32bit move.
+(define_peephole2
+  [(match_scratch:DI 2 "r")
+   (set (match_operand:DI 0 "push_operand")
+        (match_operand:DI 1 "immediate_operand"))]
+  "TARGET_64BIT && !symbolic_operand (operands[1], DImode)
+   && !x86_64_immediate_operand (operands[1], DImode)"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (match_dup 2))])
+
+;; We need to define this as both peepholer and splitter for case
+;; peephole2 pass is not run.
+;; "&& 1" is needed to keep it from matching the previous pattern.
+(define_peephole2
+  [(set (match_operand:DI 0 "push_operand")
+        (match_operand:DI 1 "immediate_operand"))]
+  "TARGET_64BIT && !symbolic_operand (operands[1], DImode)
+   && !x86_64_immediate_operand (operands[1], DImode) && 1"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (match_dup 3))]
+{
+  split_double_mode (DImode, &operands[1], 1, &operands[2], &operands[3]);
+
+  operands[1] = gen_lowpart (DImode, operands[2]);
+  operands[2] = gen_rtx_MEM (SImode, gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+						   GEN_INT (4)));
+})
+
+(define_split
+  [(set (match_operand:DI 0 "push_operand")
+        (match_operand:DI 1 "immediate_operand"))]
+  "TARGET_64BIT && ((optimize > 0 && flag_peephole2)
+		    ? epilogue_completed : reload_completed)
+   && !symbolic_operand (operands[1], DImode)
+   && !x86_64_immediate_operand (operands[1], DImode)"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 2) (match_dup 3))]
+{
+  split_double_mode (DImode, &operands[1], 1, &operands[2], &operands[3]);
+
+  operands[1] = gen_lowpart (DImode, operands[2]);
+  operands[2] = gen_rtx_MEM (SImode, gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+						   GEN_INT (4)));
+})
+
+(define_split
+  [(set (match_operand:DI 0 "push_operand")
+        (match_operand:DI 1 "general_operand"))]
+  "!TARGET_64BIT && reload_completed
+   && !(MMX_REG_P (operands[1]) || SSE_REG_P (operands[1]))"
+  [(const_int 0)]
+  "ix86_split_long_move (operands); DONE;")
+
+(define_insn "*pushsi2"
+  [(set (match_operand:SI 0 "push_operand" "=<")
+	(match_operand:SI 1 "general_no_elim_operand" "ri*m"))]
+  "!TARGET_64BIT"
+  "push{l}\t%1"
+  [(set_attr "type" "push")
+   (set_attr "mode" "SI")])
+
+;; emit_push_insn when it calls move_by_pieces requires an insn to
+;; "push a byte/word".  But actually we use pushl, which has the effect
+;; of rounding the amount pushed up to a word.
+
+;; For TARGET_64BIT we always round up to 8 bytes.
+(define_insn "*push<mode>2_rex64"
+  [(set (match_operand:SWI124 0 "push_operand" "=X")
+	(match_operand:SWI124 1 "nonmemory_no_elim_operand" "r<i>"))]
+  "TARGET_64BIT"
+  "push{q}\t%q1"
+  [(set_attr "type" "push")
+   (set_attr "mode" "DI")])
+
+(define_insn "*push<mode>2"
+  [(set (match_operand:SWI12 0 "push_operand" "=X")
+	(match_operand:SWI12 1 "nonmemory_no_elim_operand" "rn"))]
+  "!TARGET_64BIT"
+  "push{l}\t%k1"
+  [(set_attr "type" "push")
+   (set_attr "mode" "SI")])
+
+(define_insn "*push<mode>2_prologue"
+  [(set (match_operand:W 0 "push_operand" "=<")
+	(match_operand:W 1 "general_no_elim_operand" "r<i>*m"))
+   (clobber (mem:BLK (scratch)))]
+  ""
+  "push{<imodesuffix>}\t%1"
+  [(set_attr "type" "push")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*pop<mode>1"
+  [(set (match_operand:W 0 "nonimmediate_operand" "=r*m")
+	(match_operand:W 1 "pop_operand" ">"))]
+  ""
+  "pop{<imodesuffix>}\t%0"
+  [(set_attr "type" "pop")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*pop<mode>1_epilogue"
+  [(set (match_operand:W 0 "nonimmediate_operand" "=r*m")
+	(match_operand:W 1 "pop_operand" ">"))
+   (clobber (mem:BLK (scratch)))]
+  ""
+  "pop{<imodesuffix>}\t%0"
+  [(set_attr "type" "pop")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*pushfl<mode>2"
+  [(set (match_operand:W 0 "push_operand" "=<")
+	(match_operand:W 1 "flags_reg_operand"))]
+  ""
+  "pushf{<imodesuffix>}"
+  [(set_attr "type" "push")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*popfl<mode>1"
+  [(set (match_operand:W 0 "flags_reg_operand")
+	(match_operand:W 1 "pop_operand" ">"))]
+  ""
+  "popf{<imodesuffix>}"
+  [(set_attr "type" "pop")
+   (set_attr "mode" "<MODE>")])
+
+
+;; Move instructions.
+
+(define_expand "movxi"
+  [(set (match_operand:XI 0 "nonimmediate_operand")
+	(match_operand:XI 1 "general_operand"))]
+  "TARGET_AVX512F"
+  "ix86_expand_move (XImode, operands); DONE;")
+
+;; Reload patterns to support multi-word load/store
+;; with non-offsetable address.
+(define_expand "reload_noff_store"
+  [(parallel [(match_operand 0 "memory_operand" "=m")
+              (match_operand 1 "register_operand" "r")
+              (match_operand:DI 2 "register_operand" "=&r")])]
+  "TARGET_64BIT"
+{
+  rtx mem = operands[0];
+  rtx addr = XEXP (mem, 0);
+
+  emit_move_insn (operands[2], addr);
+  mem = replace_equiv_address_nv (mem, operands[2]);
+
+  emit_insn (gen_rtx_SET (VOIDmode, mem, operands[1]));
+  DONE;
+})
+
+(define_expand "reload_noff_load"
+  [(parallel [(match_operand 0 "register_operand" "=r")
+              (match_operand 1 "memory_operand" "m")
+              (match_operand:DI 2 "register_operand" "=r")])]
+  "TARGET_64BIT"
+{
+  rtx mem = operands[1];
+  rtx addr = XEXP (mem, 0);
+
+  emit_move_insn (operands[2], addr);
+  mem = replace_equiv_address_nv (mem, operands[2]);
+
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], mem));
+  DONE;
+})
+
+(define_expand "movoi"
+  [(set (match_operand:OI 0 "nonimmediate_operand")
+	(match_operand:OI 1 "general_operand"))]
+  "TARGET_AVX"
+  "ix86_expand_move (OImode, operands); DONE;")
+
+(define_expand "movti"
+  [(set (match_operand:TI 0 "nonimmediate_operand")
+	(match_operand:TI 1 "nonimmediate_operand"))]
+  "TARGET_64BIT || TARGET_SSE"
+{
+  if (TARGET_64BIT)
+    ix86_expand_move (TImode, operands);
+  else
+    ix86_expand_vector_move (TImode, operands);
+  DONE;
+})
+
+;; This expands to what emit_move_complex would generate if we didn't
+;; have a movti pattern.  Having this avoids problems with reload on
+;; 32-bit targets when SSE is present, but doesn't seem to be harmful
+;; to have around all the time.
+(define_expand "movcdi"
+  [(set (match_operand:CDI 0 "nonimmediate_operand")
+	(match_operand:CDI 1 "general_operand"))]
+  ""
+{
+  if (push_operand (operands[0], CDImode))
+    emit_move_complex_push (CDImode, operands[0], operands[1]);
+  else
+    emit_move_complex_parts (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "mov<mode>"
+  [(set (match_operand:SWI1248x 0 "nonimmediate_operand")
+	(match_operand:SWI1248x 1 "general_operand"))]
+  ""
+  "ix86_expand_move (<MODE>mode, operands); DONE;")
+
+(define_insn "*mov<mode>_xor"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(match_operand:SWI48 1 "const0_operand"))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed"
+  "xor{l}\t%k0, %k0"
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "SI")
+   (set_attr "length_immediate" "0")])
+
+(define_insn "*mov<mode>_or"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(match_operand:SWI48 1 "const_int_operand"))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed
+   && operands[1] == constm1_rtx"
+  "or{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "<MODE>")
+   (set_attr "length_immediate" "1")])
+
+(define_insn "*movxi_internal_avx512f"
+  [(set (match_operand:XI 0 "nonimmediate_operand" "=x,x ,m")
+	(match_operand:XI 1 "vector_move_operand"  "C ,xm,x"))]
+  "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return standard_sse_constant_opcode (insn, operands[1]);
+    case 1:
+    case 2:
+      if (misaligned_operand (operands[0], XImode)
+	  || misaligned_operand (operands[1], XImode))
+	return "vmovdqu32\t{%1, %0|%0, %1}";
+      else
+	return "vmovdqa32\t{%1, %0|%0, %1}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sselog1,ssemov,ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "*movoi_internal_avx"
+  [(set (match_operand:OI 0 "nonimmediate_operand" "=x,x ,m")
+	(match_operand:OI 1 "vector_move_operand"  "C ,xm,x"))]
+  "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_SSELOG1:
+      return standard_sse_constant_opcode (insn, operands[1]);
+
+    case TYPE_SSEMOV:
+      if (misaligned_operand (operands[0], OImode)
+	  || misaligned_operand (operands[1], OImode))
+	{
+	  if (get_attr_mode (insn) == MODE_V8SF)
+	    return "vmovups\t{%1, %0|%0, %1}";
+	  else
+	    return "vmovdqu\t{%1, %0|%0, %1}";
+	}
+      else
+	{
+	  if (get_attr_mode (insn) == MODE_V8SF)
+	    return "vmovaps\t{%1, %0|%0, %1}";
+	  else
+	    return "vmovdqa\t{%1, %0|%0, %1}";
+	}
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sselog1,ssemov,ssemov")
+   (set_attr "prefix" "vex")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "V8SF")
+	       (and (eq_attr "alternative" "2")
+		    (match_test "TARGET_SSE_TYPELESS_STORES"))
+		 (const_string "V8SF")
+	      ]
+	      (const_string "OI")))])
+
+(define_insn "*movti_internal"
+  [(set (match_operand:TI 0 "nonimmediate_operand" "=!r ,o ,x,x ,m")
+	(match_operand:TI 1 "general_operand"      "riFo,re,C,xm,x"))]
+  "(TARGET_64BIT || TARGET_SSE)
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_MULTI:
+      return "#";
+
+    case TYPE_SSELOG1:
+      return standard_sse_constant_opcode (insn, operands[1]);
+
+    case TYPE_SSEMOV:
+      /* TDmode values are passed as TImode on the stack.  Moving them
+	 to stack may result in unaligned memory access.  */
+      if (misaligned_operand (operands[0], TImode)
+	  || misaligned_operand (operands[1], TImode))
+	{
+	  if (get_attr_mode (insn) == MODE_V4SF)
+	    return "%vmovups\t{%1, %0|%0, %1}";
+	  else
+	    return "%vmovdqu\t{%1, %0|%0, %1}";
+	}
+      else
+	{
+	  if (get_attr_mode (insn) == MODE_V4SF)
+	    return "%vmovaps\t{%1, %0|%0, %1}";
+	  else
+	    return "%vmovdqa\t{%1, %0|%0, %1}";
+	}
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "x64,x64,*,*,*")
+   (set_attr "type" "multi,multi,sselog1,ssemov,ssemov")
+   (set (attr "prefix")
+     (if_then_else (eq_attr "type" "sselog1,ssemov")
+       (const_string "maybe_vex")
+       (const_string "orig")))
+   (set (attr "mode")
+   	(cond [(eq_attr "alternative" "0,1")
+		 (const_string "DI")
+	       (ior (not (match_test "TARGET_SSE2"))
+		    (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+		 (const_string "V4SF")
+	       (and (eq_attr "alternative" "4")
+		    (match_test "TARGET_SSE_TYPELESS_STORES"))
+		 (const_string "V4SF")
+	       (match_test "TARGET_AVX")
+		 (const_string "TI")
+	       (match_test "optimize_function_for_size_p (cfun)")
+		 (const_string "V4SF")
+	       ]
+	       (const_string "TI")))])
+
+(define_split
+  [(set (match_operand:TI 0 "nonimmediate_operand")
+	(match_operand:TI 1 "general_operand"))]
+  "reload_completed
+   && !SSE_REG_P (operands[0]) && !SSE_REG_P (operands[1])"
+  [(const_int 0)]
+  "ix86_split_long_move (operands); DONE;")
+
+(define_insn "*movdi_internal"
+  [(set (match_operand:DI 0 "nonimmediate_operand"
+    "=r  ,o  ,r,r  ,r,m ,*y,*y,?*y,?m,?r ,?*Ym,*v,*v,*v,m ,?r ,?r,?*Yi,?*Ym,?*Yi")
+	(match_operand:DI 1 "general_operand"
+    "riFo,riF,Z,rem,i,re,C ,*y,m  ,*y,*Yn,r   ,C ,*v,m ,*v,*Yj,*v,r   ,*Yj ,*Yn"))]
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_MULTI:
+      return "#";
+
+    case TYPE_MMX:
+      return "pxor\t%0, %0";
+
+    case TYPE_MMXMOV:
+      /* Handle broken assemblers that require movd instead of movq.  */
+      if (!HAVE_AS_IX86_INTERUNIT_MOVQ
+	  && (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1])))
+	return "movd\t{%1, %0|%0, %1}";
+      return "movq\t{%1, %0|%0, %1}";
+
+    case TYPE_SSELOG1:
+      if (GENERAL_REG_P (operands[0]))
+	return "%vpextrq\t{$0, %1, %0|%0, %1, 0}";
+
+      return standard_sse_constant_opcode (insn, operands[1]);
+
+    case TYPE_SSEMOV:
+      switch (get_attr_mode (insn))
+	{
+	case MODE_DI:
+	  /* Handle broken assemblers that require movd instead of movq.  */
+	  if (!HAVE_AS_IX86_INTERUNIT_MOVQ
+	      && (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1])))
+	    return "%vmovd\t{%1, %0|%0, %1}";
+	  return "%vmovq\t{%1, %0|%0, %1}";
+	case MODE_TI:
+	  return "%vmovdqa\t{%1, %0|%0, %1}";
+	case MODE_XI:
+	  return "vmovdqa64\t{%g1, %g0|%g0, %g1}";
+
+	case MODE_V2SF:
+	  gcc_assert (!TARGET_AVX);
+	  return "movlps\t{%1, %0|%0, %1}";
+	case MODE_V4SF:
+	  return "%vmovaps\t{%1, %0|%0, %1}";
+
+	default:
+	  gcc_unreachable ();
+	}
+
+    case TYPE_SSECVT:
+      if (SSE_REG_P (operands[0]))
+	return "movq2dq\t{%1, %0|%0, %1}";
+      else
+	return "movdq2q\t{%1, %0|%0, %1}";
+
+    case TYPE_LEA:
+      return "lea{q}\t{%E1, %0|%0, %E1}";
+
+    case TYPE_IMOV:
+      gcc_assert (!flag_pic || LEGITIMATE_PIC_OPERAND_P (operands[1]));
+      if (get_attr_mode (insn) == MODE_SI)
+	return "mov{l}\t{%k1, %k0|%k0, %k1}";
+      else if (which_alternative == 4)
+	return "movabs{q}\t{%1, %0|%0, %1}";
+      else if (ix86_use_lea_for_mov (insn, operands))
+	return "lea{q}\t{%E1, %0|%0, %E1}";
+      else
+	return "mov{q}\t{%1, %0|%0, %1}";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set (attr "isa")
+     (cond [(eq_attr "alternative" "0,1")
+	      (const_string "nox64")
+	    (eq_attr "alternative" "2,3,4,5,10,11,16,18")
+	      (const_string "x64")
+	    (eq_attr "alternative" "17")
+	      (const_string "x64_sse4")
+	   ]
+	   (const_string "*")))
+   (set (attr "type")
+     (cond [(eq_attr "alternative" "0,1")
+	      (const_string "multi")
+	    (eq_attr "alternative" "6")
+	      (const_string "mmx")
+	    (eq_attr "alternative" "7,8,9,10,11")
+	      (const_string "mmxmov")
+	    (eq_attr "alternative" "12,17")
+	      (const_string "sselog1")
+	    (eq_attr "alternative" "13,14,15,16,18")
+	      (const_string "ssemov")
+	    (eq_attr "alternative" "19,20")
+	      (const_string "ssecvt")
+	    (match_operand 1 "pic_32bit_operand")
+	      (const_string "lea")
+	   ]
+	   (const_string "imov")))
+   (set (attr "modrm")
+     (if_then_else
+       (and (eq_attr "alternative" "4") (eq_attr "type" "imov"))
+	 (const_string "0")
+	 (const_string "*")))
+   (set (attr "length_immediate")
+     (cond [(and (eq_attr "alternative" "4") (eq_attr "type" "imov"))
+	      (const_string "8")
+	    (eq_attr "alternative" "17")
+	      (const_string "1")
+	   ]
+	   (const_string "*")))
+   (set (attr "prefix_rex")
+     (if_then_else (eq_attr "alternative" "10,11,16,17,18")
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "prefix_extra")
+     (if_then_else (eq_attr "alternative" "17")
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "prefix")
+     (if_then_else (eq_attr "type" "sselog1,ssemov")
+       (const_string "maybe_vex")
+       (const_string "orig")))
+   (set (attr "prefix_data16")
+     (if_then_else (and (eq_attr "type" "ssemov") (eq_attr "mode" "DI"))
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "mode")
+     (cond [(eq_attr "alternative" "2")
+	      (const_string "SI")
+	    (eq_attr "alternative" "12,13")
+	      (cond [(ior (match_operand 0 "ext_sse_reg_operand")
+			  (match_operand 1 "ext_sse_reg_operand"))
+		       (const_string "XI")
+		     (ior (not (match_test "TARGET_SSE2"))
+			  (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+		       (const_string "V4SF")
+		     (match_test "TARGET_AVX")
+		       (const_string "TI")
+		     (match_test "optimize_function_for_size_p (cfun)")
+		       (const_string "V4SF")
+		    ]
+		    (const_string "TI"))
+
+	    (and (eq_attr "alternative" "14,15")
+		 (not (match_test "TARGET_SSE2")))
+	      (const_string "V2SF")
+	    (eq_attr "alternative" "17")
+	      (const_string "TI")
+	   ]
+	   (const_string "DI")))])
+
+(define_split
+  [(set (match_operand:DI 0 "nonimmediate_operand")
+        (match_operand:DI 1 "general_operand"))]
+  "!TARGET_64BIT && reload_completed
+   && !(MMX_REG_P (operands[0]) || SSE_REG_P (operands[0]))
+   && !(MMX_REG_P (operands[1]) || SSE_REG_P (operands[1]))"
+  [(const_int 0)]
+  "ix86_split_long_move (operands); DONE;")
+
+(define_insn "*movsi_internal"
+  [(set (match_operand:SI 0 "nonimmediate_operand"
+			"=r,m ,*y,*y,?rm,?*y,*v,*v,*v,m ,?r ,?r,?*Yi")
+	(match_operand:SI 1 "general_operand"
+			"g ,re,C ,*y,*y ,rm ,C ,*v,m ,*v,*Yj,*v,r"))]
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_SSELOG1:
+      if (GENERAL_REG_P (operands[0]))
+	return "%vpextrd\t{$0, %1, %0|%0, %1, 0}";
+
+      return standard_sse_constant_opcode (insn, operands[1]);
+
+    case TYPE_SSEMOV:
+      switch (get_attr_mode (insn))
+	{
+	case MODE_SI:
+          return "%vmovd\t{%1, %0|%0, %1}";
+	case MODE_TI:
+	  return "%vmovdqa\t{%1, %0|%0, %1}";
+	case MODE_XI:
+	  return "vmovdqa32\t{%g1, %g0|%g0, %g1}";
+
+	case MODE_V4SF:
+	  return "%vmovaps\t{%1, %0|%0, %1}";
+
+	case MODE_SF:
+	  gcc_assert (!TARGET_AVX);
+          return "movss\t{%1, %0|%0, %1}";
+
+	default:
+	  gcc_unreachable ();
+	}
+
+    case TYPE_MMX:
+      return "pxor\t%0, %0";
+
+    case TYPE_MMXMOV:
+      switch (get_attr_mode (insn))
+	{
+	case MODE_DI:
+	  return "movq\t{%1, %0|%0, %1}";
+	case MODE_SI:
+	  return "movd\t{%1, %0|%0, %1}";
+
+	default:
+	  gcc_unreachable ();
+	}
+
+    case TYPE_LEA:
+      return "lea{l}\t{%E1, %0|%0, %E1}";
+
+    case TYPE_IMOV:
+      gcc_assert (!flag_pic || LEGITIMATE_PIC_OPERAND_P (operands[1]));
+      if (ix86_use_lea_for_mov (insn, operands))
+	return "lea{l}\t{%E1, %0|%0, %E1}";
+      else
+	return "mov{l}\t{%1, %0|%0, %1}";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set (attr "isa")
+     (if_then_else (eq_attr "alternative" "11")
+       (const_string "sse4")
+       (const_string "*")))
+   (set (attr "type")
+     (cond [(eq_attr "alternative" "2")
+	      (const_string "mmx")
+	    (eq_attr "alternative" "3,4,5")
+	      (const_string "mmxmov")
+	    (eq_attr "alternative" "6,11")
+	      (const_string "sselog1")
+	    (eq_attr "alternative" "7,8,9,10,12")
+	      (const_string "ssemov")
+ 	    (match_operand 1 "pic_32bit_operand")
+	      (const_string "lea")
+	   ]
+	   (const_string "imov")))
+   (set (attr "length_immediate")
+     (if_then_else (eq_attr "alternative" "11")
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "prefix_extra")
+     (if_then_else (eq_attr "alternative" "11")
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "prefix")
+     (if_then_else (eq_attr "type" "sselog1,ssemov")
+       (const_string "maybe_vex")
+       (const_string "orig")))
+   (set (attr "prefix_data16")
+     (if_then_else (and (eq_attr "type" "ssemov") (eq_attr "mode" "SI"))
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "mode")
+     (cond [(eq_attr "alternative" "2,3")
+	      (const_string "DI")
+	    (eq_attr "alternative" "6,7")
+	      (cond [(ior (match_operand 0 "ext_sse_reg_operand")
+			  (match_operand 1 "ext_sse_reg_operand"))
+		       (const_string "XI")
+		     (ior (not (match_test "TARGET_SSE2"))
+			  (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+		       (const_string "V4SF")
+		     (match_test "TARGET_AVX")
+		       (const_string "TI")
+		     (match_test "optimize_function_for_size_p (cfun)")
+		       (const_string "V4SF")
+		    ]
+		    (const_string "TI"))
+
+	    (and (eq_attr "alternative" "8,9")
+	         (not (match_test "TARGET_SSE2")))
+	      (const_string "SF")
+	    (eq_attr "alternative" "11")
+	      (const_string "TI")
+	   ]
+	   (const_string "SI")))])
+
+(define_insn "kmovw"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=k,k")
+	(unspec:HI
+	  [(match_operand:HI 1 "nonimmediate_operand" "rm,k")]
+	  UNSPEC_KMOV))]
+  "!(MEM_P (operands[0]) && MEM_P (operands[1])) && TARGET_AVX512F"
+  "@
+   kmovw\t{%k1, %0|%0, %k1}
+   kmovw\t{%1, %0|%0, %1}";
+  [(set_attr "mode" "HI")
+   (set_attr "type" "mskmov")
+   (set_attr "prefix" "vex")])
+
+
+(define_insn "*movhi_internal"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r ,r ,m ,k,k,rm")
+	(match_operand:HI 1 "general_operand"      "r ,rn,rm,rn,rm,k,k"))]
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_IMOVX:
+      /* movzwl is faster than movw on p2 due to partial word stalls,
+	 though not as fast as an aligned movl.  */
+      return "movz{wl|x}\t{%1, %k0|%k0, %1}";
+
+    case TYPE_MSKMOV:
+      switch (which_alternative)
+        {
+	case 4: return "kmovw\t{%k1, %0|%0, %k1}";
+	case 5: return "kmovw\t{%1, %0|%0, %1}";
+	case 6: return "kmovw\t{%1, %k0|%k0, %1}";
+	default: gcc_unreachable ();
+	}
+
+    default:
+      if (get_attr_mode (insn) == MODE_SI)
+        return "mov{l}\t{%k1, %k0|%k0, %k1}";
+      else
+        return "mov{w}\t{%1, %0|%0, %1}";
+    }
+}
+  [(set (attr "type")
+     (cond [(match_test "optimize_function_for_size_p (cfun)")
+	      (const_string "imov")
+	    (and (eq_attr "alternative" "0")
+		 (ior (not (match_test "TARGET_PARTIAL_REG_STALL"))
+		      (not (match_test "TARGET_HIMODE_MATH"))))
+	      (const_string "imov")
+	    (and (eq_attr "alternative" "1,2")
+		 (match_operand:HI 1 "aligned_operand"))
+	      (const_string "imov")
+	    (eq_attr "alternative" "4,5,6")
+	      (const_string "mskmov")
+	    (and (match_test "TARGET_MOVX")
+		 (eq_attr "alternative" "0,2"))
+	      (const_string "imovx")
+	   ]
+	   (const_string "imov")))
+    (set (attr "prefix")
+      (if_then_else (eq_attr "alternative" "4,5,6")
+	(const_string "vex")
+	(const_string "orig")))
+    (set (attr "mode")
+      (cond [(eq_attr "type" "imovx")
+	       (const_string "SI")
+	     (and (eq_attr "alternative" "1,2")
+		  (match_operand:HI 1 "aligned_operand"))
+	       (const_string "SI")
+	     (and (eq_attr "alternative" "0")
+		  (ior (not (match_test "TARGET_PARTIAL_REG_STALL"))
+		       (not (match_test "TARGET_HIMODE_MATH"))))
+	       (const_string "SI")
+	    ]
+	    (const_string "HI")))])
+
+;; Situation is quite tricky about when to choose full sized (SImode) move
+;; over QImode moves.  For Q_REG -> Q_REG move we use full size only for
+;; partial register dependency machines (such as AMD Athlon), where QImode
+;; moves issue extra dependency and for partial register stalls machines
+;; that don't use QImode patterns (and QImode move cause stall on the next
+;; instruction).
+;;
+;; For loads of Q_REG to NONQ_REG we use full sized moves except for partial
+;; register stall machines with, where we use QImode instructions, since
+;; partial register stall can be caused there.  Then we use movzx.
+
+(define_insn "*movqi_internal"
+  [(set (match_operand:QI 0 "nonimmediate_operand"
+			"=q,q ,q ,r,r ,?r,m ,k,k,r")
+	(match_operand:QI 1 "general_operand"
+			"q ,qn,qm,q,rn,qm,qn,r ,k,k"))]
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_IMOVX:
+      gcc_assert (ANY_QI_REG_P (operands[1]) || MEM_P (operands[1]));
+      return "movz{bl|x}\t{%1, %k0|%k0, %1}";
+
+    case TYPE_MSKMOV:
+      switch (which_alternative)
+        {
+	case 7: return "kmovw\t{%k1, %0|%0, %k1}";
+	case 8: return "kmovw\t{%1, %0|%0, %1}";
+	case 9: return "kmovw\t{%1, %k0|%k0, %1}";
+	default: gcc_unreachable ();
+	}
+
+    default:
+      if (get_attr_mode (insn) == MODE_SI)
+        return "mov{l}\t{%k1, %k0|%k0, %k1}";
+      else
+        return "mov{b}\t{%1, %0|%0, %1}";
+    }
+}
+  [(set (attr "type")
+     (cond [(and (eq_attr "alternative" "5")
+		 (not (match_operand:QI 1 "aligned_operand")))
+	      (const_string "imovx")
+	    (match_test "optimize_function_for_size_p (cfun)")
+	      (const_string "imov")
+	    (and (eq_attr "alternative" "3")
+		 (ior (not (match_test "TARGET_PARTIAL_REG_STALL"))
+		      (not (match_test "TARGET_QIMODE_MATH"))))
+	      (const_string "imov")
+	    (eq_attr "alternative" "3,5")
+	      (const_string "imovx")
+	    (eq_attr "alternative" "7,8,9")
+	      (const_string "mskmov")
+	    (and (match_test "TARGET_MOVX")
+		 (eq_attr "alternative" "2"))
+	      (const_string "imovx")
+	   ]
+	   (const_string "imov")))
+   (set (attr "prefix")
+     (if_then_else (eq_attr "alternative" "7,8,9")
+       (const_string "vex")
+       (const_string "orig")))
+   (set (attr "mode")
+      (cond [(eq_attr "alternative" "3,4,5")
+	       (const_string "SI")
+	     (eq_attr "alternative" "6")
+	       (const_string "QI")
+	     (eq_attr "type" "imovx")
+	       (const_string "SI")
+	     (and (eq_attr "type" "imov")
+		  (and (eq_attr "alternative" "0,1")
+		       (and (match_test "TARGET_PARTIAL_REG_DEPENDENCY")
+			    (and (not (match_test "optimize_function_for_size_p (cfun)"))
+				 (not (match_test "TARGET_PARTIAL_REG_STALL"))))))
+	       (const_string "SI")
+	     ;; Avoid partial register stalls when not using QImode arithmetic
+	     (and (eq_attr "type" "imov")
+		  (and (eq_attr "alternative" "0,1")
+		       (and (match_test "TARGET_PARTIAL_REG_STALL")
+			    (not (match_test "TARGET_QIMODE_MATH")))))
+	       (const_string "SI")
+	   ]
+	   (const_string "QI")))])
+
+;; Stores and loads of ax to arbitrary constant address.
+;; We fake an second form of instruction to force reload to load address
+;; into register when rax is not available
+(define_insn "*movabs<mode>_1"
+  [(set (mem:SWI1248x (match_operand:DI 0 "x86_64_movabs_operand" "i,r"))
+	(match_operand:SWI1248x 1 "nonmemory_operand" "a,r<i>"))]
+  "TARGET_LP64 && ix86_check_movabs (insn, 0)"
+  "@
+   movabs{<imodesuffix>}\t{%1, %P0|[%P0], %1}
+   mov{<imodesuffix>}\t{%1, %a0|<iptrsize> PTR %a0, %1}"
+  [(set_attr "type" "imov")
+   (set_attr "modrm" "0,*")
+   (set_attr "length_address" "8,0")
+   (set_attr "length_immediate" "0,*")
+   (set_attr "memory" "store")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*movabs<mode>_2"
+  [(set (match_operand:SWI1248x 0 "register_operand" "=a,r")
+        (mem:SWI1248x (match_operand:DI 1 "x86_64_movabs_operand" "i,r")))]
+  "TARGET_LP64 && ix86_check_movabs (insn, 1)"
+  "@
+   movabs{<imodesuffix>}\t{%P1, %0|%0, [%P1]}
+   mov{<imodesuffix>}\t{%a1, %0|%0, <iptrsize> PTR %a1}"
+  [(set_attr "type" "imov")
+   (set_attr "modrm" "0,*")
+   (set_attr "length_address" "8,0")
+   (set_attr "length_immediate" "0")
+   (set_attr "memory" "load")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*swap<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "+r")
+	(match_operand:SWI48 1 "register_operand" "+r"))
+   (set (match_dup 1)
+	(match_dup 0))]
+  ""
+  "xchg{<imodesuffix>}\t%1, %0"
+  [(set_attr "type" "imov")
+   (set_attr "mode" "<MODE>")
+   (set_attr "pent_pair" "np")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "bdver1_decode" "double")])
+
+(define_insn "*swap<mode>_1"
+  [(set (match_operand:SWI12 0 "register_operand" "+r")
+	(match_operand:SWI12 1 "register_operand" "+r"))
+   (set (match_dup 1)
+	(match_dup 0))]
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "xchg{l}\t%k1, %k0"
+  [(set_attr "type" "imov")
+   (set_attr "mode" "SI")
+   (set_attr "pent_pair" "np")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "bdver1_decode" "double")])
+
+;; Not added amdfam10_decode since TARGET_PARTIAL_REG_STALL
+;; is disabled for AMDFAM10
+(define_insn "*swap<mode>_2"
+  [(set (match_operand:SWI12 0 "register_operand" "+<r>")
+	(match_operand:SWI12 1 "register_operand" "+<r>"))
+   (set (match_dup 1)
+	(match_dup 0))]
+  "TARGET_PARTIAL_REG_STALL"
+  "xchg{<imodesuffix>}\t%1, %0"
+  [(set_attr "type" "imov")
+   (set_attr "mode" "<MODE>")
+   (set_attr "pent_pair" "np")
+   (set_attr "athlon_decode" "vector")])
+
+(define_expand "movstrict<mode>"
+  [(set (strict_low_part (match_operand:SWI12 0 "nonimmediate_operand"))
+	(match_operand:SWI12 1 "general_operand"))]
+  ""
+{
+  if (TARGET_PARTIAL_REG_STALL && optimize_function_for_speed_p (cfun))
+    FAIL;
+  if (GET_CODE (operands[0]) == SUBREG
+      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (operands[0]))) != MODE_INT)
+    FAIL;
+  /* Don't generate memory->memory moves, go through a register */
+  if (MEM_P (operands[0]) && MEM_P (operands[1]))
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*movstrict<mode>_1"
+  [(set (strict_low_part
+	  (match_operand:SWI12 0 "nonimmediate_operand" "+<r>m,<r>"))
+	(match_operand:SWI12 1 "general_operand" "<r>n,m"))]
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "mov{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "imov")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*movstrict<mode>_xor"
+  [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>"))
+	(match_operand:SWI12 1 "const0_operand"))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed"
+  "xor{<imodesuffix>}\t%0, %0"
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "<MODE>")
+   (set_attr "length_immediate" "0")])
+
+(define_insn "*mov<mode>_extv_1"
+  [(set (match_operand:SWI24 0 "register_operand" "=R")
+	(sign_extract:SWI24 (match_operand 1 "ext_register_operand" "Q")
+			    (const_int 8)
+			    (const_int 8)))]
+  ""
+  "movs{bl|x}\t{%h1, %k0|%k0, %h1}"
+  [(set_attr "type" "imovx")
+   (set_attr "mode" "SI")])
+
+(define_insn "*movqi_extv_1"
+  [(set (match_operand:QI 0 "nonimmediate_x64nomem_operand" "=Q,?R,m")
+        (sign_extract:QI (match_operand 1 "ext_register_operand" "Q,Q,Q")
+                         (const_int 8)
+                         (const_int 8)))]
+  ""
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_IMOVX:
+      return "movs{bl|x}\t{%h1, %k0|%k0, %h1}";
+    default:
+      return "mov{b}\t{%h1, %0|%0, %h1}";
+    }
+}
+  [(set_attr "isa" "*,*,nox64")
+   (set (attr "type")
+     (if_then_else (and (match_operand:QI 0 "register_operand")
+			(ior (not (match_operand:QI 0 "QIreg_operand"))
+			     (match_test "TARGET_MOVX")))
+	(const_string "imovx")
+	(const_string "imov")))
+   (set (attr "mode")
+     (if_then_else (eq_attr "type" "imovx")
+	(const_string "SI")
+	(const_string "QI")))])
+
+(define_insn "*mov<mode>_extzv_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=R")
+	(zero_extract:SWI48 (match_operand 1 "ext_register_operand" "Q")
+			    (const_int 8)
+			    (const_int 8)))]
+  ""
+  "movz{bl|x}\t{%h1, %k0|%k0, %h1}"
+  [(set_attr "type" "imovx")
+   (set_attr "mode" "SI")])
+
+(define_insn "*movqi_extzv_2"
+  [(set (match_operand:QI 0 "nonimmediate_x64nomem_operand" "=Q,?R,m")
+        (subreg:QI
+	  (zero_extract:SI (match_operand 1 "ext_register_operand" "Q,Q,Q")
+			   (const_int 8)
+			   (const_int 8)) 0))]
+  ""
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_IMOVX:
+      return "movz{bl|x}\t{%h1, %k0|%k0, %h1}";
+    default:
+      return "mov{b}\t{%h1, %0|%0, %h1}";
+    }
+}
+  [(set_attr "isa" "*,*,nox64")
+   (set (attr "type")
+     (if_then_else (and (match_operand:QI 0 "register_operand")
+			(ior (not (match_operand:QI 0 "QIreg_operand"))
+			     (match_test "TARGET_MOVX")))
+	(const_string "imovx")
+	(const_string "imov")))
+   (set (attr "mode")
+     (if_then_else (eq_attr "type" "imovx")
+	(const_string "SI")
+	(const_string "QI")))])
+
+(define_insn "mov<mode>_insv_1"
+  [(set (zero_extract:SWI48 (match_operand 0 "ext_register_operand" "+Q,Q")
+			     (const_int 8)
+			     (const_int 8))
+	(match_operand:SWI48 1 "general_x64nomem_operand" "Qn,m"))]
+  ""
+{
+  if (CONST_INT_P (operands[1]))
+    operands[1] = simplify_gen_subreg (QImode, operands[1], <MODE>mode, 0);
+  return "mov{b}\t{%b1, %h0|%h0, %b1}";
+}
+  [(set_attr "isa" "*,nox64")
+   (set_attr "type" "imov")
+   (set_attr "mode" "QI")])
+
+(define_insn "*movqi_insv_2"
+  [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "+Q")
+			 (const_int 8)
+			 (const_int 8))
+	(lshiftrt:SI (match_operand:SI 1 "register_operand" "Q")
+		     (const_int 8)))]
+  ""
+  "mov{b}\t{%h1, %h0|%h0, %h1}"
+  [(set_attr "type" "imov")
+   (set_attr "mode" "QI")])
+
+;; Floating point push instructions.
+
+(define_insn "*pushtf"
+  [(set (match_operand:TF 0 "push_operand" "=<,<")
+	(match_operand:TF 1 "general_no_elim_operand" "x,*roF"))]
+  "TARGET_64BIT || TARGET_SSE"
+{
+  /* This insn should be already split before reg-stack.  */
+  gcc_unreachable ();
+}
+  [(set_attr "isa" "*,x64")
+   (set_attr "type" "multi")
+   (set_attr "unit" "sse,*")
+   (set_attr "mode" "TF,DI")])
+
+;; %%% Kill this when call knows how to work this out.
+(define_split
+  [(set (match_operand:TF 0 "push_operand")
+	(match_operand:TF 1 "sse_reg_operand"))]
+  "TARGET_SSE && reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (const_int -16)))
+   (set (match_dup 0) (match_dup 1))]
+{
+  /* Preserve memory attributes. */
+  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
+})
+
+(define_insn "*pushxf"
+  [(set (match_operand:XF 0 "push_operand" "=<,<")
+	(match_operand:XF 1 "general_no_elim_operand" "f,Yx*roF"))]
+  ""
+{
+  /* This insn should be already split before reg-stack.  */
+  gcc_unreachable ();
+}
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387,*")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "1")
+		 (if_then_else (match_test "TARGET_64BIT")
+		   (const_string "DI")
+		   (const_string "SI"))
+	      ]
+	      (const_string "XF")))])
+
+;; %%% Kill this when call knows how to work this out.
+(define_split
+  [(set (match_operand:XF 0 "push_operand")
+	(match_operand:XF 1 "fp_register_operand"))]
+  "reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
+   (set (match_dup 0) (match_dup 1))]
+{
+  operands[2] = GEN_INT (-GET_MODE_SIZE (XFmode));
+  /* Preserve memory attributes. */
+  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
+})
+
+(define_insn "*pushdf"
+  [(set (match_operand:DF 0 "push_operand" "=<,<,<,<")
+	(match_operand:DF 1 "general_no_elim_operand" "f,Yd*roF,rmF,x"))]
+  ""
+{
+  /* This insn should be already split before reg-stack.  */
+  gcc_unreachable ();
+}
+  [(set_attr "isa" "*,nox64,x64,sse2")
+   (set_attr "type" "multi")
+   (set_attr "unit" "i387,*,*,sse")
+   (set_attr "mode" "DF,SI,DI,DF")])
+
+;; %%% Kill this when call knows how to work this out.
+(define_split
+  [(set (match_operand:DF 0 "push_operand")
+	(match_operand:DF 1 "any_fp_register_operand"))]
+  "reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (const_int -8)))
+   (set (match_dup 0) (match_dup 1))]
+{
+  /* Preserve memory attributes. */
+  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
+})
+
+(define_insn "*pushsf_rex64"
+  [(set (match_operand:SF 0 "push_operand" "=X,X,X")
+	(match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,x"))]
+  "TARGET_64BIT"
+{
+  /* Anything else should be already split before reg-stack.  */
+  gcc_assert (which_alternative == 1);
+  return "push{q}\t%q1";
+}
+  [(set_attr "type" "multi,push,multi")
+   (set_attr "unit" "i387,*,*")
+   (set_attr "mode" "SF,DI,SF")])
+
+(define_insn "*pushsf"
+  [(set (match_operand:SF 0 "push_operand" "=<,<,<")
+	(match_operand:SF 1 "general_no_elim_operand" "f,rmF,x"))]
+  "!TARGET_64BIT"
+{
+  /* Anything else should be already split before reg-stack.  */
+  gcc_assert (which_alternative == 1);
+  return "push{l}\t%1";
+}
+  [(set_attr "type" "multi,push,multi")
+   (set_attr "unit" "i387,*,*")
+   (set_attr "mode" "SF,SI,SF")])
+
+;; %%% Kill this when call knows how to work this out.
+(define_split
+  [(set (match_operand:SF 0 "push_operand")
+	(match_operand:SF 1 "any_fp_register_operand"))]
+  "reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
+   (set (match_dup 0) (match_dup 1))]
+{
+  rtx op = XEXP (operands[0], 0);
+  if (GET_CODE (op) == PRE_DEC)
+    {
+      gcc_assert (!TARGET_64BIT);
+      op = GEN_INT (-4);
+    }
+  else
+    {
+      op = XEXP (XEXP (op, 1), 1);
+      gcc_assert (CONST_INT_P (op));
+    }
+  operands[2] = op;
+  /* Preserve memory attributes. */
+  operands[0] = replace_equiv_address (operands[0], stack_pointer_rtx);
+})
+
+(define_split
+  [(set (match_operand:SF 0 "push_operand")
+	(match_operand:SF 1 "memory_operand"))]
+  "reload_completed
+   && (operands[2] = find_constant_src (insn))"
+  [(set (match_dup 0) (match_dup 2))])
+
+(define_split
+  [(set (match_operand 0 "push_operand")
+	(match_operand 1 "general_operand"))]
+  "reload_completed
+   && (GET_MODE (operands[0]) == TFmode
+       || GET_MODE (operands[0]) == XFmode
+       || GET_MODE (operands[0]) == DFmode)
+   && !ANY_FP_REG_P (operands[1])"
+  [(const_int 0)]
+  "ix86_split_long_move (operands); DONE;")
+
+;; Floating point move instructions.
+
+(define_expand "movtf"
+  [(set (match_operand:TF 0 "nonimmediate_operand")
+	(match_operand:TF 1 "nonimmediate_operand"))]
+  "TARGET_64BIT || TARGET_SSE"
+  "ix86_expand_move (TFmode, operands); DONE;")
+
+(define_expand "mov<mode>"
+  [(set (match_operand:X87MODEF 0 "nonimmediate_operand")
+	(match_operand:X87MODEF 1 "general_operand"))]
+  ""
+  "ix86_expand_move (<MODE>mode, operands); DONE;")
+
+(define_insn "*movtf_internal"
+  [(set (match_operand:TF 0 "nonimmediate_operand" "=x,x ,m,?*r ,!o")
+	(match_operand:TF 1 "general_operand"	   "C ,xm,x,*roF,*rC"))]
+  "(TARGET_64BIT || TARGET_SSE)
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && (!can_create_pseudo_p ()
+       || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
+       || GET_CODE (operands[1]) != CONST_DOUBLE
+       || (optimize_function_for_size_p (cfun)
+	   && standard_sse_constant_p (operands[1])
+	   && !memory_operand (operands[0], TFmode))
+       || (!TARGET_MEMORY_MISMATCH_STALL
+	   && memory_operand (operands[0], TFmode)))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_SSELOG1:
+      return standard_sse_constant_opcode (insn, operands[1]);
+
+    case TYPE_SSEMOV:
+      /* Handle misaligned load/store since we
+         don't have movmisaligntf pattern. */
+      if (misaligned_operand (operands[0], TFmode)
+	  || misaligned_operand (operands[1], TFmode))
+	{
+	  if (get_attr_mode (insn) == MODE_V4SF)
+	    return "%vmovups\t{%1, %0|%0, %1}";
+	  else
+	    return "%vmovdqu\t{%1, %0|%0, %1}";
+	}
+      else
+	{
+	  if (get_attr_mode (insn) == MODE_V4SF)
+	    return "%vmovaps\t{%1, %0|%0, %1}";
+	  else
+	    return "%vmovdqa\t{%1, %0|%0, %1}";
+	}
+
+    case TYPE_MULTI:
+	return "#";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,*,*,x64,x64")
+   (set_attr "type" "sselog1,ssemov,ssemov,multi,multi")
+   (set (attr "prefix")
+     (if_then_else (eq_attr "type" "sselog1,ssemov")
+       (const_string "maybe_vex")
+       (const_string "orig")))
+   (set (attr "mode")
+        (cond [(eq_attr "alternative" "3,4")
+		 (const_string "DI")
+	       (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "V4SF")
+	       (and (eq_attr "alternative" "2")
+		    (match_test "TARGET_SSE_TYPELESS_STORES"))
+		 (const_string "V4SF")
+	       (match_test "TARGET_AVX")
+		 (const_string "TI")
+	       (ior (not (match_test "TARGET_SSE2"))
+		    (match_test "optimize_function_for_size_p (cfun)"))
+		 (const_string "V4SF")
+	       ]
+	       (const_string "TI")))])
+
+;; Possible store forwarding (partial memory) stall in alternatives 4 and 5.
+(define_insn "*movxf_internal"
+  [(set (match_operand:XF 0 "nonimmediate_operand"
+	 "=f,m,f,?Yx*r ,!o   ,!o")
+	(match_operand:XF 1 "general_operand"
+	 "fm,f,G,Yx*roF,Yx*rF,Yx*rC"))]
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && (!can_create_pseudo_p ()
+       || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
+       || GET_CODE (operands[1]) != CONST_DOUBLE
+       || (optimize_function_for_size_p (cfun)
+	   && standard_80387_constant_p (operands[1]) > 0
+	   && !memory_operand (operands[0], XFmode))
+       || (!TARGET_MEMORY_MISMATCH_STALL
+	   && memory_operand (operands[0], XFmode)))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_FMOV:
+      if (which_alternative == 2)
+        return standard_80387_constant_opcode (operands[1]);
+      return output_387_reg_move (insn, operands);
+
+    case TYPE_MULTI:
+      return "#";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,*,*,*,nox64,x64")
+   (set_attr "type" "fmov,fmov,fmov,multi,multi,multi")
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "3,4,5")
+		 (if_then_else (match_test "TARGET_64BIT")
+		   (const_string "DI")
+		   (const_string "SI"))
+	      ]
+	      (const_string "XF")))])
+
+;; Possible store forwarding (partial memory) stall in alternative 4.
+(define_insn "*movdf_internal"
+  [(set (match_operand:DF 0 "nonimmediate_operand"
+    "=Yf*f,m   ,Yf*f,?Yd*r ,!o   ,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,Yi")
+	(match_operand:DF 1 "general_operand"
+    "Yf*fm,Yf*f,G   ,Yd*roF,Yd*rF,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,Yj,r"))]
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && (!can_create_pseudo_p ()
+       || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
+       || GET_CODE (operands[1]) != CONST_DOUBLE
+       || (optimize_function_for_size_p (cfun)
+	   && ((!(TARGET_SSE2 && TARGET_SSE_MATH)
+		&& standard_80387_constant_p (operands[1]) > 0)
+	       || (TARGET_SSE2 && TARGET_SSE_MATH
+		   && standard_sse_constant_p (operands[1])))
+	   && !memory_operand (operands[0], DFmode))
+       || ((TARGET_64BIT || !TARGET_MEMORY_MISMATCH_STALL)
+	   && memory_operand (operands[0], DFmode)))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_FMOV:
+      if (which_alternative == 2)
+        return standard_80387_constant_opcode (operands[1]);
+      return output_387_reg_move (insn, operands);
+
+    case TYPE_MULTI:
+      return "#";
+
+    case TYPE_IMOV:
+      if (get_attr_mode (insn) == MODE_SI)
+	return "mov{l}\t{%1, %k0|%k0, %1}";
+      else if (which_alternative == 8)
+	return "movabs{q}\t{%1, %0|%0, %1}";
+      else
+	return "mov{q}\t{%1, %0|%0, %1}";
+
+    case TYPE_SSELOG1:
+      return standard_sse_constant_opcode (insn, operands[1]);
+
+    case TYPE_SSEMOV:
+      switch (get_attr_mode (insn))
+	{
+	case MODE_DF:
+	  if (TARGET_AVX && REG_P (operands[0]) && REG_P (operands[1]))
+	    return "vmovsd\t{%1, %0, %0|%0, %0, %1}";
+	  return "%vmovsd\t{%1, %0|%0, %1}";
+
+	case MODE_V4SF:
+	  return "%vmovaps\t{%1, %0|%0, %1}";
+	case MODE_V8DF:
+	  return "vmovapd\t{%g1, %g0|%g0, %g1}";
+	case MODE_V2DF:
+	  return "%vmovapd\t{%1, %0|%0, %1}";
+
+	case MODE_V2SF:
+	  gcc_assert (!TARGET_AVX);
+	  return "movlps\t{%1, %0|%0, %1}";
+	case MODE_V1DF:
+	  gcc_assert (!TARGET_AVX);
+	  return "movlpd\t{%1, %0|%0, %1}";
+
+	case MODE_DI:
+	  /* Handle broken assemblers that require movd instead of movq.  */
+	  if (!HAVE_AS_IX86_INTERUNIT_MOVQ
+	      && (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1])))
+	    return "%vmovd\t{%1, %0|%0, %1}";
+	  return "%vmovq\t{%1, %0|%0, %1}";
+
+	default:
+	  gcc_unreachable ();
+	}
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set (attr "isa")
+	(cond [(eq_attr "alternative" "3,4")
+		 (const_string "nox64")
+	       (eq_attr "alternative" "5,6,7,8,17,18")
+		 (const_string "x64")
+	       (eq_attr "alternative" "9,10,11,12")
+		 (const_string "sse2")
+	      ]
+	      (const_string "*")))
+   (set (attr "type")
+	(cond [(eq_attr "alternative" "0,1,2")
+		 (const_string "fmov")
+	       (eq_attr "alternative" "3,4")
+		 (const_string "multi")
+	       (eq_attr "alternative" "5,6,7,8")
+		 (const_string "imov")
+	       (eq_attr "alternative" "9,13")
+		 (const_string "sselog1")
+	      ]
+	      (const_string "ssemov")))
+   (set (attr "modrm")
+     (if_then_else (eq_attr "alternative" "8")
+       (const_string "0")
+       (const_string "*")))
+   (set (attr "length_immediate")
+     (if_then_else (eq_attr "alternative" "8")
+       (const_string "8")
+       (const_string "*")))
+   (set (attr "prefix")
+     (if_then_else (eq_attr "type" "sselog1,ssemov")
+       (const_string "maybe_vex")
+       (const_string "orig")))
+   (set (attr "prefix_data16")
+     (if_then_else
+       (ior (and (eq_attr "type" "ssemov") (eq_attr "mode" "DI"))
+	    (eq_attr "mode" "V1DF"))
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "mode")
+	(cond [(eq_attr "alternative" "3,4,7")
+		 (const_string "SI")
+	       (eq_attr "alternative" "5,6,8,17,18")
+		 (const_string "DI")
+
+	       /* xorps is one byte shorter for non-AVX targets.  */
+	       (eq_attr "alternative" "9,13")
+		 (cond [(not (match_test "TARGET_SSE2"))
+		 	  (const_string "V4SF")
+			(match_test "TARGET_AVX512F")
+			  (const_string "XI")
+			(match_test "TARGET_AVX")
+			  (const_string "V2DF")
+			(match_test "optimize_function_for_size_p (cfun)")
+			  (const_string "V4SF")
+			(match_test "TARGET_SSE_LOAD0_BY_PXOR")
+			  (const_string "TI")
+		       ]
+		       (const_string "V2DF"))
+
+	       /* For architectures resolving dependencies on
+		  whole SSE registers use movapd to break dependency
+		  chains, otherwise use short move to avoid extra work.  */
+
+	       /* movaps is one byte shorter for non-AVX targets.  */
+	       (eq_attr "alternative" "10,14")
+		 (cond [(ior (match_operand 0 "ext_sse_reg_operand")
+			     (match_operand 1 "ext_sse_reg_operand"))
+			  (const_string "V8DF")
+			(ior (not (match_test "TARGET_SSE2"))
+			     (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+			  (const_string "V4SF")
+			(match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+			  (const_string "V2DF")
+			(match_test "TARGET_AVX")
+			  (const_string "DF")
+			(match_test "optimize_function_for_size_p (cfun)")
+			  (const_string "V4SF")
+		       ]
+		       (const_string "DF"))
+
+	       /* For architectures resolving dependencies on register
+		  parts we may avoid extra work to zero out upper part
+		  of register.  */
+	       (eq_attr "alternative" "11,15")
+		 (cond [(not (match_test "TARGET_SSE2"))
+			  (const_string "V2SF")
+			(match_test "TARGET_AVX")
+			  (const_string "DF")
+			(match_test "TARGET_SSE_SPLIT_REGS")
+			  (const_string "V1DF")
+		       ]
+		       (const_string "DF"))
+
+	       (and (eq_attr "alternative" "12,16")
+		    (not (match_test "TARGET_SSE2")))
+		 (const_string "V2SF")
+	      ]
+	      (const_string "DF")))])
+
+(define_insn "*movsf_internal"
+  [(set (match_operand:SF 0 "nonimmediate_operand"
+	  "=Yf*f,m   ,Yf*f,?r ,?m,v,v,v,m,?r,?Yi,!*y,!*y,!m,!r ,!*Ym")
+	(match_operand:SF 1 "general_operand"
+	  "Yf*fm,Yf*f,G   ,rmF,rF,C,v,m,v,Yj,r  ,*y ,m  ,*y,*Yn,r"))]
+  "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && (!can_create_pseudo_p ()
+       || (ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_LARGE)
+       || GET_CODE (operands[1]) != CONST_DOUBLE
+       || (optimize_function_for_size_p (cfun)
+	   && ((!TARGET_SSE_MATH
+		&& standard_80387_constant_p (operands[1]) > 0)
+	       || (TARGET_SSE_MATH
+		   && standard_sse_constant_p (operands[1]))))
+       || memory_operand (operands[0], SFmode))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_FMOV:
+      if (which_alternative == 2)
+        return standard_80387_constant_opcode (operands[1]);
+      return output_387_reg_move (insn, operands);
+
+    case TYPE_IMOV:
+      return "mov{l}\t{%1, %0|%0, %1}";
+
+    case TYPE_SSELOG1:
+      return standard_sse_constant_opcode (insn, operands[1]);
+
+    case TYPE_SSEMOV:
+      switch (get_attr_mode (insn))
+	{
+	case MODE_SF:
+	  if (TARGET_AVX && REG_P (operands[0]) && REG_P (operands[1]))
+	    return "vmovss\t{%1, %0, %0|%0, %0, %1}";
+	  return "%vmovss\t{%1, %0|%0, %1}";
+
+	case MODE_V16SF:
+	  return "vmovaps\t{%g1, %g0|%g0, %g1}";
+	case MODE_V4SF:
+	  return "%vmovaps\t{%1, %0|%0, %1}";
+
+	case MODE_SI:
+	  return "%vmovd\t{%1, %0|%0, %1}";
+
+	default:
+	  gcc_unreachable ();
+	}
+
+    case TYPE_MMXMOV:
+      switch (get_attr_mode (insn))
+	{
+	case MODE_DI:
+	  return "movq\t{%1, %0|%0, %1}";
+	case MODE_SI:
+	  return "movd\t{%1, %0|%0, %1}";
+
+	default:
+	  gcc_unreachable ();
+	}
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set (attr "type")
+	(cond [(eq_attr "alternative" "0,1,2")
+		 (const_string "fmov")
+	       (eq_attr "alternative" "3,4")
+		 (const_string "imov")
+	       (eq_attr "alternative" "5")
+		 (const_string "sselog1")
+	       (eq_attr "alternative" "11,12,13,14,15")
+		 (const_string "mmxmov")
+	      ]
+	      (const_string "ssemov")))
+   (set (attr "prefix")
+     (if_then_else (eq_attr "type" "sselog1,ssemov")
+       (const_string "maybe_vex")
+       (const_string "orig")))
+   (set (attr "prefix_data16")
+     (if_then_else (and (eq_attr "type" "ssemov") (eq_attr "mode" "SI"))
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "mode")
+        (cond [(eq_attr "alternative" "3,4,9,10,13,14,15")
+		 (const_string "SI")
+	       (eq_attr "alternative" "11")
+		 (const_string "DI")
+	       (eq_attr "alternative" "5")
+		 (cond [(not (match_test "TARGET_SSE2"))
+ 		 	  (const_string "V4SF")
+			(match_test "TARGET_AVX512F")
+			  (const_string "V16SF")
+			(match_test "TARGET_AVX")
+			  (const_string "V4SF")
+ 			(match_test "optimize_function_for_size_p (cfun)")
+			  (const_string "V4SF")
+ 			(match_test "TARGET_SSE_LOAD0_BY_PXOR")
+			  (const_string "TI")
+		       ]
+		       (const_string "V4SF"))
+
+	       /* For architectures resolving dependencies on
+		  whole SSE registers use APS move to break dependency
+		  chains, otherwise use short move to avoid extra work.
+
+		  Do the same for architectures resolving dependencies on
+		  the parts.  While in DF mode it is better to always handle
+		  just register parts, the SF mode is different due to lack
+		  of instructions to load just part of the register.  It is
+		  better to maintain the whole registers in single format
+		  to avoid problems on using packed logical operations.  */
+	       (eq_attr "alternative" "6")
+		 (cond [(ior  (match_operand 0 "ext_sse_reg_operand")
+			      (match_operand 1 "ext_sse_reg_operand"))
+			  (const_string "V16SF")
+			(ior (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
+			     (match_test "TARGET_SSE_SPLIT_REGS"))
+			  (const_string "V4SF")
+		       ]
+		       (const_string "SF"))
+	      ]
+	      (const_string "SF")))])
+
+(define_split
+  [(set (match_operand 0 "any_fp_register_operand")
+	(match_operand 1 "memory_operand"))]
+  "reload_completed
+   && (GET_MODE (operands[0]) == TFmode
+       || GET_MODE (operands[0]) == XFmode
+       || GET_MODE (operands[0]) == DFmode
+       || GET_MODE (operands[0]) == SFmode)
+   && (operands[2] = find_constant_src (insn))"
+  [(set (match_dup 0) (match_dup 2))]
+{
+  rtx c = operands[2];
+  int r = REGNO (operands[0]);
+
+  if ((SSE_REGNO_P (r) && !standard_sse_constant_p (c))
+      || (STACK_REGNO_P (r) && standard_80387_constant_p (c) < 1))
+    FAIL;
+})
+
+(define_split
+  [(set (match_operand 0 "any_fp_register_operand")
+	(float_extend (match_operand 1 "memory_operand")))]
+  "reload_completed
+   && (GET_MODE (operands[0]) == TFmode
+       || GET_MODE (operands[0]) == XFmode
+       || GET_MODE (operands[0]) == DFmode)
+   && (operands[2] = find_constant_src (insn))"
+  [(set (match_dup 0) (match_dup 2))]
+{
+  rtx c = operands[2];
+  int r = REGNO (operands[0]);
+
+  if ((SSE_REGNO_P (r) && !standard_sse_constant_p (c))
+      || (STACK_REGNO_P (r) && standard_80387_constant_p (c) < 1))
+    FAIL;
+})
+
+;; Split the load of -0.0 or -1.0 into fldz;fchs or fld1;fchs sequence
+(define_split
+  [(set (match_operand:X87MODEF 0 "fp_register_operand")
+	(match_operand:X87MODEF 1 "immediate_operand"))]
+  "reload_completed
+   && (standard_80387_constant_p (operands[1]) == 8
+       || standard_80387_constant_p (operands[1]) == 9)"
+  [(set (match_dup 0)(match_dup 1))
+   (set (match_dup 0)
+	(neg:X87MODEF (match_dup 0)))]
+{
+  REAL_VALUE_TYPE r;
+
+  REAL_VALUE_FROM_CONST_DOUBLE (r, operands[1]);
+  if (real_isnegzero (&r))
+    operands[1] = CONST0_RTX (<MODE>mode);
+  else
+    operands[1] = CONST1_RTX (<MODE>mode);
+})
+
+(define_split
+  [(set (match_operand 0 "nonimmediate_operand")
+        (match_operand 1 "general_operand"))]
+  "reload_completed
+   && (GET_MODE (operands[0]) == TFmode
+       || GET_MODE (operands[0]) == XFmode
+       || GET_MODE (operands[0]) == DFmode)
+   && !(ANY_FP_REG_P (operands[0]) || ANY_FP_REG_P (operands[1]))"
+  [(const_int 0)]
+  "ix86_split_long_move (operands); DONE;")
+
+(define_insn "swapxf"
+  [(set (match_operand:XF 0 "register_operand" "+f")
+	(match_operand:XF 1 "register_operand" "+f"))
+   (set (match_dup 1)
+	(match_dup 0))]
+  "TARGET_80387"
+{
+  if (STACK_TOP_P (operands[0]))
+    return "fxch\t%1";
+  else
+    return "fxch\t%0";
+}
+  [(set_attr "type" "fxch")
+   (set_attr "mode" "XF")])
+
+(define_insn "*swap<mode>"
+  [(set (match_operand:MODEF 0 "fp_register_operand" "+f")
+	(match_operand:MODEF 1 "fp_register_operand" "+f"))
+   (set (match_dup 1)
+	(match_dup 0))]
+  "TARGET_80387 || reload_completed"
+{
+  if (STACK_TOP_P (operands[0]))
+    return "fxch\t%1";
+  else
+    return "fxch\t%0";
+}
+  [(set_attr "type" "fxch")
+   (set_attr "mode" "<MODE>")])
+
+;; Zero extension instructions
+
+(define_expand "zero_extendsidi2"
+  [(set (match_operand:DI 0 "nonimmediate_operand")
+	(zero_extend:DI (match_operand:SI 1 "nonimmediate_operand")))])
+
+(define_insn "*zero_extendsidi2"
+  [(set (match_operand:DI 0 "nonimmediate_operand"
+			"=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x")
+	(zero_extend:DI
+	 (match_operand:SI 1 "x86_64_zext_operand"
+	        	"0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m")))]
+  ""
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_IMOVX:
+      if (ix86_use_lea_for_mov (insn, operands))
+	return "lea{l}\t{%E1, %k0|%k0, %E1}";
+      else
+	return "mov{l}\t{%1, %k0|%k0, %1}";
+
+    case TYPE_MULTI:
+      return "#";
+
+    case TYPE_MMXMOV:
+      return "movd\t{%1, %0|%0, %1}";
+
+    case TYPE_SSELOG1:
+      return "%vpextrd\t{$0, %1, %k0|%k0, %1, 0}";
+
+    case TYPE_SSEMOV:
+      if (GENERAL_REG_P (operands[0]))
+	return "%vmovd\t{%1, %k0|%k0, %1}";
+
+      return "%vmovd\t{%1, %0|%0, %1}";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set (attr "isa")
+     (cond [(eq_attr "alternative" "0,1,2")
+	      (const_string "nox64")
+	    (eq_attr "alternative" "3,7")
+	      (const_string "x64")
+	    (eq_attr "alternative" "8")
+	      (const_string "x64_sse4")
+	    (eq_attr "alternative" "10")
+	      (const_string "sse2")
+	   ]
+	   (const_string "*")))
+   (set (attr "type")
+     (cond [(eq_attr "alternative" "0,1,2,4")
+	      (const_string "multi")
+	    (eq_attr "alternative" "5,6")
+	      (const_string "mmxmov")
+	    (eq_attr "alternative" "7,9,10")
+	      (const_string "ssemov")
+	    (eq_attr "alternative" "8")
+	      (const_string "sselog1")
+	   ]
+	   (const_string "imovx")))
+   (set (attr "prefix_extra")
+     (if_then_else (eq_attr "alternative" "8")
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "length_immediate")
+     (if_then_else (eq_attr "alternative" "8")
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "prefix")
+     (if_then_else (eq_attr "type" "ssemov,sselog1")
+       (const_string "maybe_vex")
+       (const_string "orig")))
+   (set (attr "prefix_0f")
+     (if_then_else (eq_attr "type" "imovx")
+       (const_string "0")
+       (const_string "*")))
+   (set (attr "mode")
+     (cond [(eq_attr "alternative" "5,6")
+	      (const_string "DI")
+	    (eq_attr "alternative" "7,8,9")
+	      (const_string "TI")
+	   ]
+	   (const_string "SI")))])
+
+(define_split
+  [(set (match_operand:DI 0 "memory_operand")
+     	(zero_extend:DI (match_operand:SI 1 "memory_operand")))]
+  "reload_completed"
+  [(set (match_dup 4) (const_int 0))]
+  "split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);")
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(zero_extend:DI (match_operand:SI 1 "register_operand")))]
+  "!TARGET_64BIT && reload_completed
+   && !(MMX_REG_P (operands[0]) || SSE_REG_P (operands[0]))
+   && true_regnum (operands[0]) == true_regnum (operands[1])"
+  [(set (match_dup 4) (const_int 0))]
+  "split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);")
+
+(define_split
+  [(set (match_operand:DI 0 "nonimmediate_operand")
+	(zero_extend:DI (match_operand:SI 1 "nonimmediate_operand")))]
+  "!TARGET_64BIT && reload_completed
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))
+   && !(MMX_REG_P (operands[0]) || SSE_REG_P (operands[0]))"
+  [(set (match_dup 3) (match_dup 1))
+   (set (match_dup 4) (const_int 0))]
+  "split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);")
+
+(define_insn "zero_extend<mode>di2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	 (match_operand:SWI12 1 "nonimmediate_operand" "<r>m")))]
+  "TARGET_64BIT"
+  "movz{<imodesuffix>l|x}\t{%1, %k0|%k0, %1}"
+  [(set_attr "type" "imovx")
+   (set_attr "mode" "SI")])
+
+(define_expand "zero_extend<mode>si2"
+  [(set (match_operand:SI 0 "register_operand")
+	(zero_extend:SI (match_operand:SWI12 1 "nonimmediate_operand")))]
+  ""
+{
+  if (TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))
+    {
+      operands[1] = force_reg (<MODE>mode, operands[1]);
+      emit_insn (gen_zero_extend<mode>si2_and (operands[0], operands[1]));
+      DONE;
+    }
+})
+
+(define_insn_and_split "zero_extend<mode>si2_and"
+  [(set (match_operand:SI 0 "register_operand" "=r,?&<r>")
+	(zero_extend:SI
+	  (match_operand:SWI12 1 "nonimmediate_operand" "0,<r>m")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0) (and:SI (match_dup 0) (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  if (true_regnum (operands[0]) != true_regnum (operands[1]))
+    {
+      ix86_expand_clear (operands[0]);
+
+      gcc_assert (!TARGET_PARTIAL_REG_STALL);
+      emit_insn (gen_movstrict<mode>
+		  (gen_lowpart (<MODE>mode, operands[0]), operands[1]));
+      DONE;
+    }
+
+  operands[2] = GEN_INT (GET_MODE_MASK (<MODE>mode));
+}
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "SI")])
+
+(define_insn "*zero_extend<mode>si2"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(zero_extend:SI
+	  (match_operand:SWI12 1 "nonimmediate_operand" "<r>m")))]
+  "!(TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))"
+  "movz{<imodesuffix>l|x}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "imovx")
+   (set_attr "mode" "SI")])
+
+(define_expand "zero_extendqihi2"
+  [(set (match_operand:HI 0 "register_operand")
+	(zero_extend:HI (match_operand:QI 1 "nonimmediate_operand")))]
+  ""
+{
+  if (TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))
+    {
+      operands[1] = force_reg (QImode, operands[1]);
+      emit_insn (gen_zero_extendqihi2_and (operands[0], operands[1]));
+      DONE;
+    }
+})
+
+(define_insn_and_split "zero_extendqihi2_and"
+  [(set (match_operand:HI 0 "register_operand" "=r,?&q")
+	(zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" "0,qm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 0) (and:SI (match_dup 0) (const_int 255)))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  if (true_regnum (operands[0]) != true_regnum (operands[1]))
+    {
+      ix86_expand_clear (operands[0]);
+
+      gcc_assert (!TARGET_PARTIAL_REG_STALL);
+      emit_insn (gen_movstrictqi
+		  (gen_lowpart (QImode, operands[0]), operands[1]));
+      DONE;
+    }
+
+  operands[0] = gen_lowpart (SImode, operands[0]);
+}
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "SI")])
+
+; zero extend to SImode to avoid partial register stalls
+(define_insn "*zero_extendqihi2"
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" "qm")))]
+  "!(TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))"
+  "movz{bl|x}\t{%1, %k0|%k0, %1}"
+  [(set_attr "type" "imovx")
+   (set_attr "mode" "SI")])
+
+;; Sign extension instructions
+
+(define_expand "extendsidi2"
+  [(set (match_operand:DI 0 "register_operand")
+	(sign_extend:DI (match_operand:SI 1 "register_operand")))]
+  ""
+{
+  if (!TARGET_64BIT)
+    {
+      emit_insn (gen_extendsidi2_1 (operands[0], operands[1]));
+      DONE;
+    }
+})
+
+(define_insn "*extendsidi2_rex64"
+  [(set (match_operand:DI 0 "register_operand" "=*a,r")
+	(sign_extend:DI (match_operand:SI 1 "nonimmediate_operand" "*0,rm")))]
+  "TARGET_64BIT"
+  "@
+   {cltq|cdqe}
+   movs{lq|x}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "imovx")
+   (set_attr "mode" "DI")
+   (set_attr "prefix_0f" "0")
+   (set_attr "modrm" "0,1")])
+
+(define_insn "extendsidi2_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=*A,r,?r,?*o")
+	(sign_extend:DI (match_operand:SI 1 "register_operand" "0,0,r,r")))
+   (clobber (reg:CC FLAGS_REG))
+   (clobber (match_scratch:SI 2 "=X,X,X,&r"))]
+  "!TARGET_64BIT"
+  "#")
+
+;; Split the memory case.  If the source register doesn't die, it will stay
+;; this way, if it does die, following peephole2s take care of it.
+(define_split
+  [(set (match_operand:DI 0 "memory_operand")
+	(sign_extend:DI (match_operand:SI 1 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))
+   (clobber (match_operand:SI 2 "register_operand"))]
+  "reload_completed"
+  [(const_int 0)]
+{
+  split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);
+
+  emit_move_insn (operands[3], operands[1]);
+
+  /* Generate a cltd if possible and doing so it profitable.  */
+  if ((optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)
+      && true_regnum (operands[1]) == AX_REG
+      && true_regnum (operands[2]) == DX_REG)
+    {
+      emit_insn (gen_ashrsi3_cvt (operands[2], operands[1], GEN_INT (31)));
+    }
+  else
+    {
+      emit_move_insn (operands[2], operands[1]);
+      emit_insn (gen_ashrsi3_cvt (operands[2], operands[2], GEN_INT (31)));
+    }
+  emit_move_insn (operands[4], operands[2]);
+  DONE;
+})
+
+;; Peepholes for the case where the source register does die, after
+;; being split with the above splitter.
+(define_peephole2
+  [(set (match_operand:SI 0 "memory_operand")
+	(match_operand:SI 1 "register_operand"))
+   (set (match_operand:SI 2 "register_operand") (match_dup 1))
+   (parallel [(set (match_dup 2)
+		   (ashiftrt:SI (match_dup 2) (const_int 31)))
+	       (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:SI 3 "memory_operand") (match_dup 2))]
+  "REGNO (operands[1]) != REGNO (operands[2])
+   && peep2_reg_dead_p (2, operands[1])
+   && peep2_reg_dead_p (4, operands[2])
+   && !reg_mentioned_p (operands[2], operands[3])"
+  [(set (match_dup 0) (match_dup 1))
+   (parallel [(set (match_dup 1) (ashiftrt:SI (match_dup 1) (const_int 31)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 3) (match_dup 1))])
+
+(define_peephole2
+  [(set (match_operand:SI 0 "memory_operand")
+	(match_operand:SI 1 "register_operand"))
+   (parallel [(set (match_operand:SI 2 "register_operand")
+		   (ashiftrt:SI (match_dup 1) (const_int 31)))
+	       (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:SI 3 "memory_operand") (match_dup 2))]
+  "/* cltd is shorter than sarl $31, %eax */
+   !optimize_function_for_size_p (cfun)
+   && true_regnum (operands[1]) == AX_REG
+   && true_regnum (operands[2]) == DX_REG
+   && peep2_reg_dead_p (2, operands[1])
+   && peep2_reg_dead_p (3, operands[2])
+   && !reg_mentioned_p (operands[2], operands[3])"
+  [(set (match_dup 0) (match_dup 1))
+   (parallel [(set (match_dup 1) (ashiftrt:SI (match_dup 1) (const_int 31)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 3) (match_dup 1))])
+
+;; Extend to register case.  Optimize case where source and destination
+;; registers match and cases where we can use cltd.
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(sign_extend:DI (match_operand:SI 1 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))
+   (clobber (match_scratch:SI 2))]
+  "reload_completed"
+  [(const_int 0)]
+{
+  split_double_mode (DImode, &operands[0], 1, &operands[3], &operands[4]);
+
+  if (true_regnum (operands[3]) != true_regnum (operands[1]))
+    emit_move_insn (operands[3], operands[1]);
+
+  /* Generate a cltd if possible and doing so it profitable.  */
+  if ((optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)
+      && true_regnum (operands[3]) == AX_REG
+      && true_regnum (operands[4]) == DX_REG)
+    {
+      emit_insn (gen_ashrsi3_cvt (operands[4], operands[3], GEN_INT (31)));
+      DONE;
+    }
+
+  if (true_regnum (operands[4]) != true_regnum (operands[1]))
+    emit_move_insn (operands[4], operands[1]);
+
+  emit_insn (gen_ashrsi3_cvt (operands[4], operands[4], GEN_INT (31)));
+  DONE;
+})
+
+(define_insn "extend<mode>di2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(sign_extend:DI
+	 (match_operand:SWI12 1 "nonimmediate_operand" "<r>m")))]
+  "TARGET_64BIT"
+  "movs{<imodesuffix>q|x}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "imovx")
+   (set_attr "mode" "DI")])
+
+(define_insn "extendhisi2"
+  [(set (match_operand:SI 0 "register_operand" "=*a,r")
+	(sign_extend:SI (match_operand:HI 1 "nonimmediate_operand" "*0,rm")))]
+  ""
+{
+  switch (get_attr_prefix_0f (insn))
+    {
+    case 0:
+      return "{cwtl|cwde}";
+    default:
+      return "movs{wl|x}\t{%1, %0|%0, %1}";
+    }
+}
+  [(set_attr "type" "imovx")
+   (set_attr "mode" "SI")
+   (set (attr "prefix_0f")
+     ;; movsx is short decodable while cwtl is vector decoded.
+     (if_then_else (and (eq_attr "cpu" "!k6")
+			(eq_attr "alternative" "0"))
+	(const_string "0")
+	(const_string "1")))
+   (set (attr "modrm")
+     (if_then_else (eq_attr "prefix_0f" "0")
+	(const_string "0")
+	(const_string "1")))])
+
+(define_insn "*extendhisi2_zext"
+  [(set (match_operand:DI 0 "register_operand" "=*a,r")
+	(zero_extend:DI
+	 (sign_extend:SI
+	  (match_operand:HI 1 "nonimmediate_operand" "*0,rm"))))]
+  "TARGET_64BIT"
+{
+  switch (get_attr_prefix_0f (insn))
+    {
+    case 0:
+      return "{cwtl|cwde}";
+    default:
+      return "movs{wl|x}\t{%1, %k0|%k0, %1}";
+    }
+}
+  [(set_attr "type" "imovx")
+   (set_attr "mode" "SI")
+   (set (attr "prefix_0f")
+     ;; movsx is short decodable while cwtl is vector decoded.
+     (if_then_else (and (eq_attr "cpu" "!k6")
+			(eq_attr "alternative" "0"))
+	(const_string "0")
+	(const_string "1")))
+   (set (attr "modrm")
+     (if_then_else (eq_attr "prefix_0f" "0")
+	(const_string "0")
+	(const_string "1")))])
+
+(define_insn "extendqisi2"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(sign_extend:SI (match_operand:QI 1 "nonimmediate_operand" "qm")))]
+  ""
+  "movs{bl|x}\t{%1, %0|%0, %1}"
+   [(set_attr "type" "imovx")
+    (set_attr "mode" "SI")])
+
+(define_insn "*extendqisi2_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (sign_extend:SI (match_operand:QI 1 "nonimmediate_operand" "qm"))))]
+  "TARGET_64BIT"
+  "movs{bl|x}\t{%1, %k0|%k0, %1}"
+   [(set_attr "type" "imovx")
+    (set_attr "mode" "SI")])
+
+(define_insn "extendqihi2"
+  [(set (match_operand:HI 0 "register_operand" "=*a,r")
+	(sign_extend:HI (match_operand:QI 1 "nonimmediate_operand" "*0,qm")))]
+  ""
+{
+  switch (get_attr_prefix_0f (insn))
+    {
+    case 0:
+      return "{cbtw|cbw}";
+    default:
+      return "movs{bw|x}\t{%1, %0|%0, %1}";
+    }
+}
+  [(set_attr "type" "imovx")
+   (set_attr "mode" "HI")
+   (set (attr "prefix_0f")
+     ;; movsx is short decodable while cwtl is vector decoded.
+     (if_then_else (and (eq_attr "cpu" "!k6")
+			(eq_attr "alternative" "0"))
+	(const_string "0")
+	(const_string "1")))
+   (set (attr "modrm")
+     (if_then_else (eq_attr "prefix_0f" "0")
+	(const_string "0")
+	(const_string "1")))])
+
+;; Conversions between float and double.
+
+;; These are all no-ops in the model used for the 80387.
+;; So just emit moves.
+
+;; %%% Kill these when call knows how to work out a DFmode push earlier.
+(define_split
+  [(set (match_operand:DF 0 "push_operand")
+	(float_extend:DF (match_operand:SF 1 "fp_register_operand")))]
+  "reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (const_int -8)))
+   (set (mem:DF (reg:P SP_REG)) (float_extend:DF (match_dup 1)))])
+
+(define_split
+  [(set (match_operand:XF 0 "push_operand")
+	(float_extend:XF (match_operand:MODEF 1 "fp_register_operand")))]
+  "reload_completed"
+  [(set (reg:P SP_REG) (plus:P (reg:P SP_REG) (match_dup 2)))
+   (set (mem:XF (reg:P SP_REG)) (float_extend:XF (match_dup 1)))]
+  "operands[2] = GEN_INT (-GET_MODE_SIZE (XFmode));")
+
+(define_expand "extendsfdf2"
+  [(set (match_operand:DF 0 "nonimmediate_operand")
+        (float_extend:DF (match_operand:SF 1 "general_operand")))]
+  "TARGET_80387 || (TARGET_SSE2 && TARGET_SSE_MATH)"
+{
+  /* ??? Needed for compress_float_constant since all fp constants
+     are TARGET_LEGITIMATE_CONSTANT_P.  */
+  if (GET_CODE (operands[1]) == CONST_DOUBLE)
+    {
+      if ((!TARGET_SSE2 || TARGET_MIX_SSE_I387)
+	  && standard_80387_constant_p (operands[1]) > 0)
+	{
+	  operands[1] = simplify_const_unary_operation
+	    (FLOAT_EXTEND, DFmode, operands[1], SFmode);
+	  emit_move_insn_1 (operands[0], operands[1]);
+	  DONE;
+	}
+      operands[1] = validize_mem (force_const_mem (SFmode, operands[1]));
+    }
+})
+
+/* For converting SF(xmm2) to DF(xmm1), use the following code instead of
+   cvtss2sd:
+      unpcklps xmm2,xmm2   ; packed conversion might crash on signaling NaNs
+      cvtps2pd xmm2,xmm1
+   We do the conversion post reload to avoid producing of 128bit spills
+   that might lead to ICE on 32bit target.  The sequence unlikely combine
+   anyway.  */
+(define_split
+  [(set (match_operand:DF 0 "register_operand")
+        (float_extend:DF
+	  (match_operand:SF 1 "nonimmediate_operand")))]
+  "TARGET_USE_VECTOR_FP_CONVERTS
+   && optimize_insn_for_speed_p ()
+   && reload_completed && SSE_REG_P (operands[0])"
+   [(set (match_dup 2)
+	 (float_extend:V2DF
+	   (vec_select:V2SF
+	     (match_dup 3)
+	     (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[2] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+  operands[3] = simplify_gen_subreg (V4SFmode, operands[0], DFmode, 0);
+  /* Use movss for loading from memory, unpcklps reg, reg for registers.
+     Try to avoid move when unpacking can be done in source.  */
+  if (REG_P (operands[1]))
+    {
+      /* If it is unsafe to overwrite upper half of source, we need
+	 to move to destination and unpack there.  */
+      if ((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+	   || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
+	  && true_regnum (operands[0]) != true_regnum (operands[1]))
+	{
+	  rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+	  emit_move_insn (tmp, operands[1]);
+	}
+      else
+	operands[3] = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+      emit_insn (gen_vec_interleave_lowv4sf (operands[3], operands[3],
+      		 			     operands[3]));
+    }
+  else
+    emit_insn (gen_vec_setv4sf_0 (operands[3],
+				  CONST0_RTX (V4SFmode), operands[1]));
+})
+
+;; It's more profitable to split and then extend in the same register.
+(define_peephole2
+  [(set (match_operand:DF 0 "register_operand")
+	(float_extend:DF
+	  (match_operand:SF 1 "memory_operand")))]
+  "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
+   && optimize_insn_for_speed_p ()
+   && SSE_REG_P (operands[0])"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (float_extend:DF (match_dup 2)))]
+  "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
+
+(define_insn "*extendsfdf2_mixed"
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
+        (float_extend:DF
+	  (match_operand:SF 1 "nonimmediate_operand" "fm,f,xm")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387"
+{
+  switch (which_alternative)
+    {
+    case 0:
+    case 1:
+      return output_387_reg_move (insn, operands);
+
+    case 2:
+      return "%vcvtss2sd\t{%1, %d0|%d0, %1}";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "fmov,fmov,ssecvt")
+   (set_attr "prefix" "orig,orig,maybe_vex")
+   (set_attr "mode" "SF,XF,DF")])
+
+(define_insn "*extendsfdf2_sse"
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=x")
+        (float_extend:DF (match_operand:SF 1 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH"
+  "%vcvtss2sd\t{%1, %d0|%d0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DF")])
+
+(define_insn "*extendsfdf2_i387"
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m")
+        (float_extend:DF (match_operand:SF 1 "nonimmediate_operand" "fm,f")))]
+  "TARGET_80387"
+  "* return output_387_reg_move (insn, operands);"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "SF,XF")])
+
+(define_expand "extend<mode>xf2"
+  [(set (match_operand:XF 0 "nonimmediate_operand")
+        (float_extend:XF (match_operand:MODEF 1 "general_operand")))]
+  "TARGET_80387"
+{
+  /* ??? Needed for compress_float_constant since all fp constants
+     are TARGET_LEGITIMATE_CONSTANT_P.  */
+  if (GET_CODE (operands[1]) == CONST_DOUBLE)
+    {
+      if (standard_80387_constant_p (operands[1]) > 0)
+	{
+	  operands[1] = simplify_const_unary_operation
+	    (FLOAT_EXTEND, XFmode, operands[1], <MODE>mode);
+	  emit_move_insn_1 (operands[0], operands[1]);
+	  DONE;
+	}
+      operands[1] = validize_mem (force_const_mem (<MODE>mode, operands[1]));
+    }
+})
+
+(define_insn "*extend<mode>xf2_i387"
+  [(set (match_operand:XF 0 "nonimmediate_operand" "=f,m")
+        (float_extend:XF
+	  (match_operand:MODEF 1 "nonimmediate_operand" "fm,f")))]
+  "TARGET_80387"
+  "* return output_387_reg_move (insn, operands);"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "<MODE>,XF")])
+
+;; %%% This seems bad bad news.
+;; This cannot output into an f-reg because there is no way to be sure
+;; of truncating in that case.  Otherwise this is just like a simple move
+;; insn.  So we pretend we can output to a reg in order to get better
+;; register preferencing, but we really use a stack slot.
+
+;; Conversion from DFmode to SFmode.
+
+(define_expand "truncdfsf2"
+  [(set (match_operand:SF 0 "nonimmediate_operand")
+	(float_truncate:SF
+	  (match_operand:DF 1 "nonimmediate_operand")))]
+  "TARGET_80387 || (TARGET_SSE2 && TARGET_SSE_MATH)"
+{
+  if (TARGET_SSE2 && TARGET_SSE_MATH && !TARGET_MIX_SSE_I387)
+    ;
+  else if (flag_unsafe_math_optimizations)
+    ;
+  else
+    {
+      rtx temp = assign_386_stack_local (SFmode, SLOT_TEMP);
+      emit_insn (gen_truncdfsf2_with_temp (operands[0], operands[1], temp));
+      DONE;
+    }
+})
+
+/* For converting DF(xmm2) to SF(xmm1), use the following code instead of
+   cvtsd2ss:
+      unpcklpd xmm2,xmm2   ; packed conversion might crash on signaling NaNs
+      cvtpd2ps xmm2,xmm1
+   We do the conversion post reload to avoid producing of 128bit spills
+   that might lead to ICE on 32bit target.  The sequence unlikely combine
+   anyway.  */
+(define_split
+  [(set (match_operand:SF 0 "register_operand")
+        (float_truncate:SF
+	  (match_operand:DF 1 "nonimmediate_operand")))]
+  "TARGET_USE_VECTOR_FP_CONVERTS
+   && optimize_insn_for_speed_p ()
+   && reload_completed && SSE_REG_P (operands[0])"
+   [(set (match_dup 2)
+	 (vec_concat:V4SF
+	   (float_truncate:V2SF
+	     (match_dup 4))
+	   (match_dup 3)))]
+{
+  operands[2] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+  operands[3] = CONST0_RTX (V2SFmode);
+  operands[4] = simplify_gen_subreg (V2DFmode, operands[0], SFmode, 0);
+  /* Use movsd for loading from memory, unpcklpd for registers.
+     Try to avoid move when unpacking can be done in source, or SSE3
+     movddup is available.  */
+  if (REG_P (operands[1]))
+    {
+      if (!TARGET_SSE3
+	  && true_regnum (operands[0]) != true_regnum (operands[1])
+	  && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+	      || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+	{
+	  rtx tmp = simplify_gen_subreg (DFmode, operands[0], SFmode, 0);
+	  emit_move_insn (tmp, operands[1]);
+	  operands[1] = tmp;
+	}
+      else if (!TARGET_SSE3)
+	operands[4] = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0);
+      emit_insn (gen_vec_dupv2df (operands[4], operands[1]));
+    }
+  else
+    emit_insn (gen_sse2_loadlpd (operands[4],
+				 CONST0_RTX (V2DFmode), operands[1]));
+})
+
+;; It's more profitable to split and then extend in the same register.
+(define_peephole2
+  [(set (match_operand:SF 0 "register_operand")
+	(float_truncate:SF
+	  (match_operand:DF 1 "memory_operand")))]
+  "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
+   && optimize_insn_for_speed_p ()
+   && SSE_REG_P (operands[0])"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (float_truncate:SF (match_dup 2)))]
+  "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
+
+(define_expand "truncdfsf2_with_temp"
+  [(parallel [(set (match_operand:SF 0)
+		   (float_truncate:SF (match_operand:DF 1)))
+	      (clobber (match_operand:SF 2))])])
+
+(define_insn "*truncdfsf_fast_mixed"
+  [(set (match_operand:SF 0 "nonimmediate_operand"   "=fm,x")
+        (float_truncate:SF
+          (match_operand:DF 1 "nonimmediate_operand" "f  ,xm")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387 && flag_unsafe_math_optimizations"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return output_387_reg_move (insn, operands);
+    case 1:
+      return "%vcvtsd2ss\t{%1, %d0|%d0, %1}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "fmov,ssecvt")
+   (set_attr "prefix" "orig,maybe_vex")
+   (set_attr "mode" "SF")])
+
+;; Yes, this one doesn't depend on flag_unsafe_math_optimizations,
+;; because nothing we do here is unsafe.
+(define_insn "*truncdfsf_fast_sse"
+  [(set (match_operand:SF 0 "nonimmediate_operand"   "=x")
+        (float_truncate:SF
+          (match_operand:DF 1 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH"
+  "%vcvtsd2ss\t{%1, %d0|%d0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SF")])
+
+(define_insn "*truncdfsf_fast_i387"
+  [(set (match_operand:SF 0 "nonimmediate_operand"   "=fm")
+        (float_truncate:SF
+          (match_operand:DF 1 "nonimmediate_operand" "f")))]
+  "TARGET_80387 && flag_unsafe_math_optimizations"
+  "* return output_387_reg_move (insn, operands);"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "SF")])
+
+(define_insn "*truncdfsf_mixed"
+  [(set (match_operand:SF 0 "nonimmediate_operand"   "=m,x ,?f,?x,?*r")
+	(float_truncate:SF
+	  (match_operand:DF 1 "nonimmediate_operand" "f ,xm,f ,f ,f")))
+   (clobber (match_operand:SF 2 "memory_operand"     "=X,X ,m ,m ,m"))]
+  "TARGET_MIX_SSE_I387"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return output_387_reg_move (insn, operands);
+    case 1:
+      return "%vcvtsd2ss\t{%1, %d0|%d0, %1}";
+
+    default:
+      return "#";
+    }
+}
+  [(set_attr "isa" "*,sse2,*,*,*")
+   (set_attr "type" "fmov,ssecvt,multi,multi,multi")
+   (set_attr "unit" "*,*,i387,i387,i387")
+   (set_attr "prefix" "orig,maybe_vex,orig,orig,orig")
+   (set_attr "mode" "SF")])
+
+(define_insn "*truncdfsf_i387"
+  [(set (match_operand:SF 0 "nonimmediate_operand"   "=m,?f,?x,?*r")
+	(float_truncate:SF
+	  (match_operand:DF 1 "nonimmediate_operand" "f ,f ,f ,f")))
+   (clobber (match_operand:SF 2 "memory_operand"     "=X,m ,m ,m"))]
+  "TARGET_80387"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return output_387_reg_move (insn, operands);
+
+    default:
+      return "#";
+    }
+}
+  [(set_attr "type" "fmov,multi,multi,multi")
+   (set_attr "unit" "*,i387,i387,i387")
+   (set_attr "mode" "SF")])
+
+(define_insn "*truncdfsf2_i387_1"
+  [(set (match_operand:SF 0 "memory_operand" "=m")
+	(float_truncate:SF
+	  (match_operand:DF 1 "register_operand" "f")))]
+  "TARGET_80387
+   && !(TARGET_SSE2 && TARGET_SSE_MATH)
+   && !TARGET_MIX_SSE_I387"
+  "* return output_387_reg_move (insn, operands);"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "SF")])
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand")
+	(float_truncate:SF
+	 (match_operand:DF 1 "fp_register_operand")))
+   (clobber (match_operand 2))]
+  "reload_completed"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (match_dup 2))]
+  "operands[1] = gen_rtx_REG (SFmode, true_regnum (operands[1]));")
+
+;; Conversion from XFmode to {SF,DF}mode
+
+(define_expand "truncxf<mode>2"
+  [(parallel [(set (match_operand:MODEF 0 "nonimmediate_operand")
+		   (float_truncate:MODEF
+		     (match_operand:XF 1 "register_operand")))
+	      (clobber (match_dup 2))])]
+  "TARGET_80387"
+{
+  if (flag_unsafe_math_optimizations)
+    {
+      rtx reg = REG_P (operands[0]) ? operands[0] : gen_reg_rtx (<MODE>mode);
+      emit_insn (gen_truncxf<mode>2_i387_noop (reg, operands[1]));
+      if (reg != operands[0])
+	emit_move_insn (operands[0], reg);
+      DONE;
+    }
+  else
+    operands[2] = assign_386_stack_local (<MODE>mode, SLOT_TEMP);
+})
+
+(define_insn "*truncxfsf2_mixed"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=m,?f,?x,?*r")
+	(float_truncate:SF
+	  (match_operand:XF 1 "register_operand"   "f ,f ,f ,f")))
+   (clobber (match_operand:SF 2 "memory_operand"   "=X,m ,m ,m"))]
+  "TARGET_80387"
+{
+  gcc_assert (!which_alternative);
+  return output_387_reg_move (insn, operands);
+}
+  [(set_attr "type" "fmov,multi,multi,multi")
+   (set_attr "unit" "*,i387,i387,i387")
+   (set_attr "mode" "SF")])
+
+(define_insn "*truncxfdf2_mixed"
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=m,?f,?x,?*r")
+	(float_truncate:DF
+	  (match_operand:XF 1 "register_operand"   "f ,f ,f  ,f")))
+   (clobber (match_operand:DF 2 "memory_operand"   "=X,m ,m  ,m"))]
+  "TARGET_80387"
+{
+  gcc_assert (!which_alternative);
+  return output_387_reg_move (insn, operands);
+}
+  [(set_attr "isa" "*,*,sse2,*")
+   (set_attr "type" "fmov,multi,multi,multi")
+   (set_attr "unit" "*,i387,i387,i387")
+   (set_attr "mode" "DF")])
+
+(define_insn "truncxf<mode>2_i387_noop"
+  [(set (match_operand:MODEF 0 "register_operand" "=f")
+	(float_truncate:MODEF
+	  (match_operand:XF 1 "register_operand" "f")))]
+  "TARGET_80387 && flag_unsafe_math_optimizations"
+  "* return output_387_reg_move (insn, operands);"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*truncxf<mode>2_i387"
+  [(set (match_operand:MODEF 0 "memory_operand" "=m")
+	(float_truncate:MODEF
+	  (match_operand:XF 1 "register_operand" "f")))]
+  "TARGET_80387"
+  "* return output_387_reg_move (insn, operands);"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:MODEF 0 "register_operand")
+	(float_truncate:MODEF
+	  (match_operand:XF 1 "register_operand")))
+   (clobber (match_operand:MODEF 2 "memory_operand"))]
+  "TARGET_80387 && reload_completed"
+  [(set (match_dup 2) (float_truncate:MODEF (match_dup 1)))
+   (set (match_dup 0) (match_dup 2))])
+
+(define_split
+  [(set (match_operand:MODEF 0 "memory_operand")
+	(float_truncate:MODEF
+	  (match_operand:XF 1 "register_operand")))
+   (clobber (match_operand:MODEF 2 "memory_operand"))]
+  "TARGET_80387"
+  [(set (match_dup 0) (float_truncate:MODEF (match_dup 1)))])
+
+;; Signed conversion to DImode.
+
+(define_expand "fix_truncxfdi2"
+  [(parallel [(set (match_operand:DI 0 "nonimmediate_operand")
+                   (fix:DI (match_operand:XF 1 "register_operand")))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_80387"
+{
+  if (TARGET_FISTTP)
+   {
+     emit_insn (gen_fix_truncdi_fisttp_i387_1 (operands[0], operands[1]));
+     DONE;
+   }
+})
+
+(define_expand "fix_trunc<mode>di2"
+  [(parallel [(set (match_operand:DI 0 "nonimmediate_operand")
+                   (fix:DI (match_operand:MODEF 1 "register_operand")))
+              (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_80387 || (TARGET_64BIT && SSE_FLOAT_MODE_P (<MODE>mode))"
+{
+  if (TARGET_FISTTP
+      && !(TARGET_64BIT && SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))
+   {
+     emit_insn (gen_fix_truncdi_fisttp_i387_1 (operands[0], operands[1]));
+     DONE;
+   }
+  if (TARGET_64BIT && SSE_FLOAT_MODE_P (<MODE>mode))
+   {
+     rtx out = REG_P (operands[0]) ? operands[0] : gen_reg_rtx (DImode);
+     emit_insn (gen_fix_trunc<mode>di_sse (out, operands[1]));
+     if (out != operands[0])
+	emit_move_insn (operands[0], out);
+     DONE;
+   }
+})
+
+;; Signed conversion to SImode.
+
+(define_expand "fix_truncxfsi2"
+  [(parallel [(set (match_operand:SI 0 "nonimmediate_operand")
+                   (fix:SI (match_operand:XF 1 "register_operand")))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_80387"
+{
+  if (TARGET_FISTTP)
+   {
+     emit_insn (gen_fix_truncsi_fisttp_i387_1 (operands[0], operands[1]));
+     DONE;
+   }
+})
+
+(define_expand "fix_trunc<mode>si2"
+  [(parallel [(set (match_operand:SI 0 "nonimmediate_operand")
+	           (fix:SI (match_operand:MODEF 1 "register_operand")))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_80387 || SSE_FLOAT_MODE_P (<MODE>mode)"
+{
+  if (TARGET_FISTTP
+      && !(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))
+   {
+     emit_insn (gen_fix_truncsi_fisttp_i387_1 (operands[0], operands[1]));
+     DONE;
+   }
+  if (SSE_FLOAT_MODE_P (<MODE>mode))
+   {
+     rtx out = REG_P (operands[0]) ? operands[0] : gen_reg_rtx (SImode);
+     emit_insn (gen_fix_trunc<mode>si_sse (out, operands[1]));
+     if (out != operands[0])
+	emit_move_insn (operands[0], out);
+     DONE;
+   }
+})
+
+;; Signed conversion to HImode.
+
+(define_expand "fix_trunc<mode>hi2"
+  [(parallel [(set (match_operand:HI 0 "nonimmediate_operand")
+	           (fix:HI (match_operand:X87MODEF 1 "register_operand")))
+              (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_80387
+   && !(SSE_FLOAT_MODE_P (<MODE>mode) && (!TARGET_FISTTP || TARGET_SSE_MATH))"
+{
+  if (TARGET_FISTTP)
+   {
+     emit_insn (gen_fix_trunchi_fisttp_i387_1 (operands[0], operands[1]));
+     DONE;
+   }
+})
+
+;; Unsigned conversion to SImode.
+
+(define_expand "fixuns_trunc<mode>si2"
+  [(parallel
+    [(set (match_operand:SI 0 "register_operand")
+	  (unsigned_fix:SI
+	    (match_operand:MODEF 1 "nonimmediate_operand")))
+     (use (match_dup 2))
+     (clobber (match_scratch:<ssevecmode> 3))
+     (clobber (match_scratch:<ssevecmode> 4))])]
+  "!TARGET_64BIT && TARGET_SSE2 && TARGET_SSE_MATH"
+{
+  enum machine_mode mode = <MODE>mode;
+  enum machine_mode vecmode = <ssevecmode>mode;
+  REAL_VALUE_TYPE TWO31r;
+  rtx two31;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  real_ldexp (&TWO31r, &dconst1, 31);
+  two31 = const_double_from_real_value (TWO31r, mode);
+  two31 = ix86_build_const_vector (vecmode, true, two31);
+  operands[2] = force_reg (vecmode, two31);
+})
+
+(define_insn_and_split "*fixuns_trunc<mode>_1"
+  [(set (match_operand:SI 0 "register_operand" "=&x,&x")
+	(unsigned_fix:SI
+	  (match_operand:MODEF 3 "nonimmediate_operand" "xm,xm")))
+   (use (match_operand:<ssevecmode> 4  "nonimmediate_operand" "m,x"))
+   (clobber (match_scratch:<ssevecmode> 1 "=x,&x"))
+   (clobber (match_scratch:<ssevecmode> 2 "=x,x"))]
+  "!TARGET_64BIT && TARGET_SSE2 && TARGET_SSE_MATH
+   && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  ix86_split_convert_uns_si_sse (operands);
+  DONE;
+})
+
+;; Unsigned conversion to HImode.
+;; Without these patterns, we'll try the unsigned SI conversion which
+;; is complex for SSE, rather than the signed SI conversion, which isn't.
+
+(define_expand "fixuns_trunc<mode>hi2"
+  [(set (match_dup 2)
+	(fix:SI (match_operand:MODEF 1 "nonimmediate_operand")))
+   (set (match_operand:HI 0 "nonimmediate_operand")
+	(subreg:HI (match_dup 2) 0))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH"
+  "operands[2] = gen_reg_rtx (SImode);")
+
+;; When SSE is available, it is always faster to use it!
+(define_insn "fix_trunc<MODEF:mode><SWI48:mode>_sse"
+  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
+	(fix:SWI48 (match_operand:MODEF 1 "nonimmediate_operand" "x,m")))]
+  "SSE_FLOAT_MODE_P (<MODEF:MODE>mode)
+   && (!TARGET_FISTTP || TARGET_SSE_MATH)"
+  "%vcvtt<MODEF:ssemodesuffix>2si<SWI48:rex64suffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "maybe_vex")
+   (set (attr "prefix_rex")
+	(if_then_else
+	  (match_test "<SWI48:MODE>mode == DImode")
+	  (const_string "1")
+	  (const_string "*")))
+   (set_attr "mode" "<MODEF:MODE>")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "bdver1_decode" "double,double")])
+
+;; Avoid vector decoded forms of the instruction.
+(define_peephole2
+  [(match_scratch:MODEF 2 "x")
+   (set (match_operand:SWI48 0 "register_operand")
+	(fix:SWI48 (match_operand:MODEF 1 "memory_operand")))]
+  "TARGET_AVOID_VECTOR_DECODE
+   && SSE_FLOAT_MODE_P (<MODEF:MODE>mode)
+   && optimize_insn_for_speed_p ()"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (fix:SWI48 (match_dup 2)))])
+
+(define_insn_and_split "fix_trunc<mode>_fisttp_i387_1"
+  [(set (match_operand:SWI248x 0 "nonimmediate_operand")
+	(fix:SWI248x (match_operand 1 "register_operand")))]
+  "X87_FLOAT_MODE_P (GET_MODE (operands[1]))
+   && TARGET_FISTTP
+   && !((SSE_FLOAT_MODE_P (GET_MODE (operands[1]))
+	 && (TARGET_64BIT || <MODE>mode != DImode))
+	&& TARGET_SSE_MATH)
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (memory_operand (operands[0], VOIDmode))
+    emit_insn (gen_fix_trunc<mode>_i387_fisttp (operands[0], operands[1]));
+  else
+    {
+      operands[2] = assign_386_stack_local (<MODE>mode, SLOT_TEMP);
+      emit_insn (gen_fix_trunc<mode>_i387_fisttp_with_temp (operands[0],
+							    operands[1],
+							    operands[2]));
+    }
+  DONE;
+}
+  [(set_attr "type" "fisttp")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "fix_trunc<mode>_i387_fisttp"
+  [(set (match_operand:SWI248x 0 "memory_operand" "=m")
+	(fix:SWI248x (match_operand 1 "register_operand" "f")))
+   (clobber (match_scratch:XF 2 "=&1f"))]
+  "X87_FLOAT_MODE_P (GET_MODE (operands[1]))
+   && TARGET_FISTTP
+   && !((SSE_FLOAT_MODE_P (GET_MODE (operands[1]))
+	 && (TARGET_64BIT || <MODE>mode != DImode))
+	&& TARGET_SSE_MATH)"
+  "* return output_fix_trunc (insn, operands, true);"
+  [(set_attr "type" "fisttp")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "fix_trunc<mode>_i387_fisttp_with_temp"
+  [(set (match_operand:SWI248x 0 "nonimmediate_operand" "=m,?r")
+	(fix:SWI248x (match_operand 1 "register_operand" "f,f")))
+   (clobber (match_operand:SWI248x 2 "memory_operand" "=X,m"))
+   (clobber (match_scratch:XF 3 "=&1f,&1f"))]
+  "X87_FLOAT_MODE_P (GET_MODE (operands[1]))
+   && TARGET_FISTTP
+   && !((SSE_FLOAT_MODE_P (GET_MODE (operands[1]))
+	&& (TARGET_64BIT || <MODE>mode != DImode))
+	&& TARGET_SSE_MATH)"
+  "#"
+  [(set_attr "type" "fisttp")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:SWI248x 0 "register_operand")
+	(fix:SWI248x (match_operand 1 "register_operand")))
+   (clobber (match_operand:SWI248x 2 "memory_operand"))
+   (clobber (match_scratch 3))]
+  "reload_completed"
+  [(parallel [(set (match_dup 2) (fix:SWI248x (match_dup 1)))
+	      (clobber (match_dup 3))])
+   (set (match_dup 0) (match_dup 2))])
+
+(define_split
+  [(set (match_operand:SWI248x 0 "memory_operand")
+	(fix:SWI248x (match_operand 1 "register_operand")))
+   (clobber (match_operand:SWI248x 2 "memory_operand"))
+   (clobber (match_scratch 3))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (fix:SWI248x (match_dup 1)))
+	      (clobber (match_dup 3))])])
+
+;; See the comments in i386.h near OPTIMIZE_MODE_SWITCHING for the description
+;; of the machinery. Please note the clobber of FLAGS_REG. In i387 control
+;; word calculation (inserted by LCM in mode switching pass) a FLAGS_REG
+;; clobbering insns can be used. Look at emit_i387_cw_initialization ()
+;; function in i386.c.
+(define_insn_and_split "*fix_trunc<mode>_i387_1"
+  [(set (match_operand:SWI248x 0 "nonimmediate_operand")
+	(fix:SWI248x (match_operand 1 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "X87_FLOAT_MODE_P (GET_MODE (operands[1]))
+   && !TARGET_FISTTP
+   && !(SSE_FLOAT_MODE_P (GET_MODE (operands[1]))
+	 && (TARGET_64BIT || <MODE>mode != DImode))
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  ix86_optimize_mode_switching[I387_TRUNC] = 1;
+
+  operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED);
+  operands[3] = assign_386_stack_local (HImode, SLOT_CW_TRUNC);
+  if (memory_operand (operands[0], VOIDmode))
+    emit_insn (gen_fix_trunc<mode>_i387 (operands[0], operands[1],
+					 operands[2], operands[3]));
+  else
+    {
+      operands[4] = assign_386_stack_local (<MODE>mode, SLOT_TEMP);
+      emit_insn (gen_fix_trunc<mode>_i387_with_temp (operands[0], operands[1],
+						     operands[2], operands[3],
+						     operands[4]));
+    }
+  DONE;
+}
+  [(set_attr "type" "fistp")
+   (set_attr "i387_cw" "trunc")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "fix_truncdi_i387"
+  [(set (match_operand:DI 0 "memory_operand" "=m")
+	(fix:DI (match_operand 1 "register_operand" "f")))
+   (use (match_operand:HI 2 "memory_operand" "m"))
+   (use (match_operand:HI 3 "memory_operand" "m"))
+   (clobber (match_scratch:XF 4 "=&1f"))]
+  "X87_FLOAT_MODE_P (GET_MODE (operands[1]))
+   && !TARGET_FISTTP
+   && !(TARGET_64BIT && SSE_FLOAT_MODE_P (GET_MODE (operands[1])))"
+  "* return output_fix_trunc (insn, operands, false);"
+  [(set_attr "type" "fistp")
+   (set_attr "i387_cw" "trunc")
+   (set_attr "mode" "DI")])
+
+(define_insn "fix_truncdi_i387_with_temp"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=m,?r")
+	(fix:DI (match_operand 1 "register_operand" "f,f")))
+   (use (match_operand:HI 2 "memory_operand" "m,m"))
+   (use (match_operand:HI 3 "memory_operand" "m,m"))
+   (clobber (match_operand:DI 4 "memory_operand" "=X,m"))
+   (clobber (match_scratch:XF 5 "=&1f,&1f"))]
+  "X87_FLOAT_MODE_P (GET_MODE (operands[1]))
+   && !TARGET_FISTTP
+   && !(TARGET_64BIT && SSE_FLOAT_MODE_P (GET_MODE (operands[1])))"
+  "#"
+  [(set_attr "type" "fistp")
+   (set_attr "i387_cw" "trunc")
+   (set_attr "mode" "DI")])
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(fix:DI (match_operand 1 "register_operand")))
+   (use (match_operand:HI 2 "memory_operand"))
+   (use (match_operand:HI 3 "memory_operand"))
+   (clobber (match_operand:DI 4 "memory_operand"))
+   (clobber (match_scratch 5))]
+  "reload_completed"
+  [(parallel [(set (match_dup 4) (fix:DI (match_dup 1)))
+	      (use (match_dup 2))
+	      (use (match_dup 3))
+	      (clobber (match_dup 5))])
+   (set (match_dup 0) (match_dup 4))])
+
+(define_split
+  [(set (match_operand:DI 0 "memory_operand")
+	(fix:DI (match_operand 1 "register_operand")))
+   (use (match_operand:HI 2 "memory_operand"))
+   (use (match_operand:HI 3 "memory_operand"))
+   (clobber (match_operand:DI 4 "memory_operand"))
+   (clobber (match_scratch 5))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (fix:DI (match_dup 1)))
+	      (use (match_dup 2))
+	      (use (match_dup 3))
+	      (clobber (match_dup 5))])])
+
+(define_insn "fix_trunc<mode>_i387"
+  [(set (match_operand:SWI24 0 "memory_operand" "=m")
+	(fix:SWI24 (match_operand 1 "register_operand" "f")))
+   (use (match_operand:HI 2 "memory_operand" "m"))
+   (use (match_operand:HI 3 "memory_operand" "m"))]
+  "X87_FLOAT_MODE_P (GET_MODE (operands[1]))
+   && !TARGET_FISTTP
+   && !SSE_FLOAT_MODE_P (GET_MODE (operands[1]))"
+  "* return output_fix_trunc (insn, operands, false);"
+  [(set_attr "type" "fistp")
+   (set_attr "i387_cw" "trunc")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "fix_trunc<mode>_i387_with_temp"
+  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=m,?r")
+	(fix:SWI24 (match_operand 1 "register_operand" "f,f")))
+   (use (match_operand:HI 2 "memory_operand" "m,m"))
+   (use (match_operand:HI 3 "memory_operand" "m,m"))
+   (clobber (match_operand:SWI24 4 "memory_operand" "=X,m"))]
+  "X87_FLOAT_MODE_P (GET_MODE (operands[1]))
+   && !TARGET_FISTTP
+   && !SSE_FLOAT_MODE_P (GET_MODE (operands[1]))"
+  "#"
+  [(set_attr "type" "fistp")
+   (set_attr "i387_cw" "trunc")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:SWI24 0 "register_operand")
+	(fix:SWI24 (match_operand 1 "register_operand")))
+   (use (match_operand:HI 2 "memory_operand"))
+   (use (match_operand:HI 3 "memory_operand"))
+   (clobber (match_operand:SWI24 4 "memory_operand"))]
+  "reload_completed"
+  [(parallel [(set (match_dup 4) (fix:SWI24 (match_dup 1)))
+	      (use (match_dup 2))
+	      (use (match_dup 3))])
+   (set (match_dup 0) (match_dup 4))])
+
+(define_split
+  [(set (match_operand:SWI24 0 "memory_operand")
+	(fix:SWI24 (match_operand 1 "register_operand")))
+   (use (match_operand:HI 2 "memory_operand"))
+   (use (match_operand:HI 3 "memory_operand"))
+   (clobber (match_operand:SWI24 4 "memory_operand"))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (fix:SWI24 (match_dup 1)))
+	      (use (match_dup 2))
+	      (use (match_dup 3))])])
+
+(define_insn "x86_fnstcw_1"
+  [(set (match_operand:HI 0 "memory_operand" "=m")
+	(unspec:HI [(reg:HI FPCR_REG)] UNSPEC_FSTCW))]
+  "TARGET_80387"
+  "fnstcw\t%0"
+  [(set (attr "length")
+	(symbol_ref "ix86_attr_length_address_default (insn) + 2"))
+   (set_attr "mode" "HI")
+   (set_attr "unit" "i387")
+   (set_attr "bdver1_decode" "vector")])
+
+(define_insn "x86_fldcw_1"
+  [(set (reg:HI FPCR_REG)
+	(unspec:HI [(match_operand:HI 0 "memory_operand" "m")] UNSPEC_FLDCW))]
+  "TARGET_80387"
+  "fldcw\t%0"
+  [(set (attr "length")
+	(symbol_ref "ix86_attr_length_address_default (insn) + 2"))
+   (set_attr "mode" "HI")
+   (set_attr "unit" "i387")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "vector")
+   (set_attr "bdver1_decode" "vector")])
+
+;; Conversion between fixed point and floating point.
+
+;; Even though we only accept memory inputs, the backend _really_
+;; wants to be able to do this between registers.  Thankfully, LRA
+;; will fix this up for us during register allocation.
+
+(define_insn "floathi<mode>2"
+  [(set (match_operand:X87MODEF 0 "register_operand" "=f")
+	(float:X87MODEF (match_operand:HI 1 "nonimmediate_operand" "m")))]
+  "TARGET_80387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)"
+  "fild%Z1\t%1"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "<MODE>")
+   (set_attr "fp_int_src" "true")])
+
+(define_insn "float<SWI48x:mode>xf2"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(float:XF (match_operand:SWI48x 1 "nonimmediate_operand" "m")))]
+  "TARGET_80387"
+  "fild%Z1\t%1"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "XF")
+   (set_attr "fp_int_src" "true")])
+
+(define_expand "float<SWI48:mode><MODEF:mode>2"
+  [(set (match_operand:MODEF 0 "register_operand")
+	(float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
+  "TARGET_80387 || (SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH)"
+{
+  if (!(SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH)
+      && !X87_ENABLE_FLOAT (<MODEF:MODE>mode, <SWI48:MODE>mode))
+    {
+      rtx reg = gen_reg_rtx (XFmode);
+      rtx (*insn)(rtx, rtx);
+
+      emit_insn (gen_float<SWI48:mode>xf2 (reg, operands[1]));
+
+      if (<MODEF:MODE>mode == SFmode)
+	insn = gen_truncxfsf2;
+      else if (<MODEF:MODE>mode == DFmode)
+	insn = gen_truncxfdf2;
+      else
+	gcc_unreachable ();
+
+      emit_insn (insn (operands[0], reg));
+      DONE;
+    }
+})
+
+(define_insn "*float<SWI48:mode><MODEF:mode>2_sse"
+  [(set (match_operand:MODEF 0 "register_operand" "=f,x,x")
+	(float:MODEF
+	  (match_operand:SWI48 1 "nonimmediate_operand" "m,r,m")))]
+  "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH"
+  "@
+   fild%Z1\t%1
+   %vcvtsi2<MODEF:ssemodesuffix><SWI48:rex64suffix>\t{%1, %d0|%d0, %1}
+   %vcvtsi2<MODEF:ssemodesuffix><SWI48:rex64suffix>\t{%1, %d0|%d0, %1}"
+  [(set_attr "type" "fmov,sseicvt,sseicvt")
+   (set_attr "prefix" "orig,maybe_vex,maybe_vex")
+   (set_attr "mode" "<MODEF:MODE>")
+   (set (attr "prefix_rex")
+     (if_then_else
+       (and (eq_attr "prefix" "maybe_vex")
+	    (match_test "<SWI48:MODE>mode == DImode"))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "unit" "i387,*,*")
+   (set_attr "athlon_decode" "*,double,direct")
+   (set_attr "amdfam10_decode" "*,vector,double")
+   (set_attr "bdver1_decode" "*,double,direct")
+   (set_attr "fp_int_src" "true")
+   (set (attr "enabled")
+     (cond [(eq_attr "alternative" "0")
+              (symbol_ref "TARGET_MIX_SSE_I387
+                           && X87_ENABLE_FLOAT (<MODEF:MODE>mode,
+                                                <SWI48:MODE>mode)")
+            (eq_attr "alternative" "1")
+              (symbol_ref "TARGET_INTER_UNIT_CONVERSIONS
+                           || optimize_function_for_size_p (cfun)")
+           ]
+           (symbol_ref "true")))
+   ])
+
+(define_insn "*float<SWI48x:mode><MODEF:mode>2_i387"
+  [(set (match_operand:MODEF 0 "register_operand" "=f")
+	(float:MODEF (match_operand:SWI48x 1 "nonimmediate_operand" "m")))]
+  "TARGET_80387 && X87_ENABLE_FLOAT (<MODEF:MODE>mode, <SWI48x:MODE>mode)"
+  "fild%Z1\t%1"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "<MODEF:MODE>")
+   (set_attr "fp_int_src" "true")])
+
+;; Try TARGET_USE_VECTOR_CONVERTS, but not so hard as to require extra memory
+;; slots when !TARGET_INTER_UNIT_MOVES_TO_VEC disables the general_regs
+;; alternative in sse2_loadld.
+(define_split
+  [(set (match_operand:MODEF 0 "register_operand")
+	(float:MODEF (match_operand:SI 1 "nonimmediate_operand")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
+   && reload_completed && SSE_REG_P (operands[0])
+   && (MEM_P (operands[1]) || TARGET_INTER_UNIT_MOVES_TO_VEC)"
+  [(const_int 0)]
+{
+  operands[3] = simplify_gen_subreg (<ssevecmode>mode, operands[0],
+				     <MODE>mode, 0);
+  operands[4] = simplify_gen_subreg (V4SImode, operands[0], <MODE>mode, 0);
+
+  emit_insn (gen_sse2_loadld (operands[4],
+			      CONST0_RTX (V4SImode), operands[1]));
+
+  if (<ssevecmode>mode == V4SFmode)
+    emit_insn (gen_floatv4siv4sf2 (operands[3], operands[4]));
+  else
+    emit_insn (gen_sse2_cvtdq2pd (operands[3], operands[4]));
+  DONE;
+})
+
+;; Avoid partial SSE register dependency stalls
+(define_split
+  [(set (match_operand:MODEF 0 "register_operand")
+	(float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+   && optimize_function_for_speed_p (cfun)
+   && reload_completed && SSE_REG_P (operands[0])"
+  [(const_int 0)]
+{
+  const enum machine_mode vmode = <MODEF:ssevecmode>mode;
+  const enum machine_mode mode = <MODEF:MODE>mode;
+  rtx t, op0 = simplify_gen_subreg (vmode, operands[0], mode, 0);
+
+  emit_move_insn (op0, CONST0_RTX (vmode));
+
+  t = gen_rtx_FLOAT (mode, operands[1]);
+  t = gen_rtx_VEC_DUPLICATE (vmode, t);
+  t = gen_rtx_VEC_MERGE (vmode, t, op0, const1_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, op0, t));
+  DONE;
+})
+
+;; Break partial reg stall for cvtsd2ss.
+
+(define_peephole2
+  [(set (match_operand:SF 0 "register_operand")
+        (float_truncate:SF
+	  (match_operand:DF 1 "nonimmediate_operand")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+   && optimize_function_for_speed_p (cfun)
+   && SSE_REG_P (operands[0])
+   && (!SSE_REG_P (operands[1])
+       || REGNO (operands[0]) != REGNO (operands[1]))"
+  [(set (match_dup 0)
+	(vec_merge:V4SF
+	  (vec_duplicate:V4SF
+	    (float_truncate:V2SF
+	      (match_dup 1)))
+	  (match_dup 0)
+	  (const_int 1)))]
+{
+  operands[0] = simplify_gen_subreg (V4SFmode, operands[0],
+				     SFmode, 0);
+  operands[1] = simplify_gen_subreg (V2DFmode, operands[1],
+				     DFmode, 0);
+  emit_move_insn (operands[0], CONST0_RTX (V4SFmode));
+})
+
+;; Break partial reg stall for cvtss2sd.
+
+(define_peephole2
+  [(set (match_operand:DF 0 "register_operand")
+        (float_extend:DF
+          (match_operand:SF 1 "nonimmediate_operand")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+   && optimize_function_for_speed_p (cfun)
+   && SSE_REG_P (operands[0])
+   && (!SSE_REG_P (operands[1])
+       || REGNO (operands[0]) != REGNO (operands[1]))"
+  [(set (match_dup 0)
+        (vec_merge:V2DF
+          (float_extend:V2DF
+            (vec_select:V2SF
+              (match_dup 1)
+              (parallel [(const_int 0) (const_int 1)])))
+          (match_dup 0)
+          (const_int 1)))]
+{
+  operands[0] = simplify_gen_subreg (V2DFmode, operands[0],
+				     DFmode, 0);
+  operands[1] = simplify_gen_subreg (V4SFmode, operands[1],
+				     SFmode, 0);
+  emit_move_insn (operands[0], CONST0_RTX (V2DFmode));
+})
+
+;; Avoid store forwarding (partial memory) stall penalty
+;; by passing DImode value through XMM registers.  */
+
+(define_insn "floatdi<X87MODEF:mode>2_i387_with_xmm"
+  [(set (match_operand:X87MODEF 0 "register_operand" "=f,f")
+	(float:X87MODEF
+	  (match_operand:DI 1 "nonimmediate_operand" "m,?r")))
+   (clobber (match_scratch:V4SI 3 "=X,x"))
+   (clobber (match_scratch:V4SI 4 "=X,x"))
+   (clobber (match_operand:DI 2 "memory_operand" "=X,m"))]
+  "TARGET_80387 && X87_ENABLE_FLOAT (<X87MODEF:MODE>mode, DImode)
+   && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC
+   && !TARGET_64BIT && optimize_function_for_speed_p (cfun)"
+  "#"
+  [(set_attr "type" "multi")
+   (set_attr "mode" "<X87MODEF:MODE>")
+   (set_attr "unit" "i387")
+   (set_attr "fp_int_src" "true")])
+
+(define_split
+  [(set (match_operand:X87MODEF 0 "fp_register_operand")
+	(float:X87MODEF (match_operand:DI 1 "register_operand")))
+   (clobber (match_scratch:V4SI 3))
+   (clobber (match_scratch:V4SI 4))
+   (clobber (match_operand:DI 2 "memory_operand"))]
+  "TARGET_80387 && X87_ENABLE_FLOAT (<X87MODEF:MODE>mode, DImode)
+   && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC
+   && !TARGET_64BIT && optimize_function_for_speed_p (cfun)
+   && reload_completed"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0) (float:X87MODEF (match_dup 2)))]
+{
+  /* The DImode arrived in a pair of integral registers (e.g. %edx:%eax).
+     Assemble the 64-bit DImode value in an xmm register.  */
+  emit_insn (gen_sse2_loadld (operands[3], CONST0_RTX (V4SImode),
+			      gen_rtx_SUBREG (SImode, operands[1], 0)));
+  emit_insn (gen_sse2_loadld (operands[4], CONST0_RTX (V4SImode),
+			      gen_rtx_SUBREG (SImode, operands[1], 4)));
+  emit_insn (gen_vec_interleave_lowv4si (operands[3], operands[3],
+  	    				 operands[4]));
+
+  operands[3] = gen_rtx_REG (DImode, REGNO (operands[3]));
+})
+
+(define_split
+  [(set (match_operand:X87MODEF 0 "fp_register_operand")
+	(float:X87MODEF (match_operand:DI 1 "memory_operand")))
+   (clobber (match_scratch:V4SI 3))
+   (clobber (match_scratch:V4SI 4))
+   (clobber (match_operand:DI 2 "memory_operand"))]
+  "TARGET_80387 && X87_ENABLE_FLOAT (<X87MODEF:MODE>mode, DImode)
+   && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC
+   && !TARGET_64BIT && optimize_function_for_speed_p (cfun)
+   && reload_completed"
+  [(set (match_dup 0) (float:X87MODEF (match_dup 1)))])
+
+(define_expand "floatuns<SWI12:mode><MODEF:mode>2"
+  [(set (match_operand:MODEF 0 "register_operand")
+	(unsigned_float:MODEF
+	  (match_operand:SWI12 1 "nonimmediate_operand")))]
+  "!TARGET_64BIT
+   && SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH"
+{
+  operands[1] = convert_to_mode (SImode, operands[1], 1);
+  emit_insn (gen_floatsi<MODEF:mode>2 (operands[0], operands[1]));
+  DONE;
+})
+
+;; Avoid store forwarding (partial memory) stall penalty by extending
+;; SImode value to DImode through XMM register instead of pushing two
+;; SImode values to stack. Note that even !TARGET_INTER_UNIT_MOVES_TO_VEC
+;; targets benefit from this optimization. Also note that fild
+;; loads from memory only.
+
+(define_insn "*floatunssi<mode>2_1"
+  [(set (match_operand:X87MODEF 0 "register_operand" "=f,f")
+	(unsigned_float:X87MODEF
+	  (match_operand:SI 1 "nonimmediate_operand" "x,m")))
+   (clobber (match_operand:DI 2 "memory_operand" "=m,m"))
+   (clobber (match_scratch:SI 3 "=X,x"))]
+  "!TARGET_64BIT
+   && TARGET_80387 && X87_ENABLE_FLOAT (<X87MODEF:MODE>mode, DImode)
+   && TARGET_SSE"
+  "#"
+  [(set_attr "type" "multi")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:X87MODEF 0 "register_operand")
+	(unsigned_float:X87MODEF
+	  (match_operand:SI 1 "register_operand")))
+   (clobber (match_operand:DI 2 "memory_operand"))
+   (clobber (match_scratch:SI 3))]
+  "!TARGET_64BIT
+   && TARGET_80387 && X87_ENABLE_FLOAT (<X87MODEF:MODE>mode, DImode)
+   && TARGET_SSE
+   && reload_completed"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0)
+	(float:X87MODEF (match_dup 2)))]
+  "operands[1] = simplify_gen_subreg (DImode, operands[1], SImode, 0);")
+
+(define_split
+  [(set (match_operand:X87MODEF 0 "register_operand")
+	(unsigned_float:X87MODEF
+	  (match_operand:SI 1 "memory_operand")))
+   (clobber (match_operand:DI 2 "memory_operand"))
+   (clobber (match_scratch:SI 3))]
+  "!TARGET_64BIT
+   && TARGET_80387 && X87_ENABLE_FLOAT (<X87MODEF:MODE>mode, DImode)
+   && TARGET_SSE
+   && reload_completed"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0)
+	(float:X87MODEF (match_dup 2)))]
+{
+  emit_move_insn (operands[3], operands[1]);
+  operands[3] = simplify_gen_subreg (DImode, operands[3], SImode, 0);
+})
+
+(define_expand "floatunssi<mode>2"
+  [(parallel
+     [(set (match_operand:X87MODEF 0 "register_operand")
+	   (unsigned_float:X87MODEF
+	     (match_operand:SI 1 "nonimmediate_operand")))
+      (clobber (match_dup 2))
+      (clobber (match_scratch:SI 3))])]
+  "!TARGET_64BIT
+   && ((TARGET_80387 && X87_ENABLE_FLOAT (<X87MODEF:MODE>mode, DImode)
+	&& TARGET_SSE)
+       || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))"
+{
+  if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+    {
+      ix86_expand_convert_uns_si<mode>_sse (operands[0], operands[1]);
+      DONE;
+    }
+  else
+    operands[2] = assign_386_stack_local (DImode, SLOT_TEMP);
+})
+
+(define_expand "floatunsdisf2"
+  [(use (match_operand:SF 0 "register_operand"))
+   (use (match_operand:DI 1 "nonimmediate_operand"))]
+  "TARGET_64BIT && TARGET_SSE_MATH"
+  "x86_emit_floatuns (operands); DONE;")
+
+(define_expand "floatunsdidf2"
+  [(use (match_operand:DF 0 "register_operand"))
+   (use (match_operand:DI 1 "nonimmediate_operand"))]
+  "(TARGET_64BIT || TARGET_KEEPS_VECTOR_ALIGNED_STACK)
+   && TARGET_SSE2 && TARGET_SSE_MATH"
+{
+  if (TARGET_64BIT)
+    x86_emit_floatuns (operands);
+  else
+    ix86_expand_convert_uns_didf_sse (operands[0], operands[1]);
+  DONE;
+})
+
+;; Load effective address instructions
+
+(define_insn_and_split "*lea<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(match_operand:SWI48 1 "address_no_seg_operand" "Ts"))]
+  ""
+{
+  if (SImode_address_operand (operands[1], VOIDmode))
+    {
+      gcc_assert (TARGET_64BIT);
+      return "lea{l}\t{%E1, %k0|%k0, %E1}";
+    }
+  else 
+    return "lea{<imodesuffix>}\t{%E1, %0|%0, %E1}";
+}
+  "reload_completed && ix86_avoid_lea_for_addr (insn, operands)"
+  [(const_int 0)]
+{
+  enum machine_mode mode = <MODE>mode;
+  rtx pat;
+
+  /* ix86_avoid_lea_for_addr re-recognizes insn and may
+     change operands[] array behind our back.  */
+  pat = PATTERN (curr_insn);
+
+  operands[0] = SET_DEST (pat);
+  operands[1] = SET_SRC (pat);
+
+  /* Emit all operations in SImode for zero-extended addresses.  */
+  if (SImode_address_operand (operands[1], VOIDmode))
+    mode = SImode;
+
+  ix86_split_lea_for_addr (curr_insn, operands, mode);
+
+  /* Zero-extend return register to DImode for zero-extended addresses.  */
+  if (mode != <MODE>mode)
+    emit_insn (gen_zero_extendsidi2
+    	       (operands[0], gen_lowpart (mode, operands[0])));
+
+  DONE;
+}
+  [(set_attr "type" "lea")
+   (set (attr "mode")
+     (if_then_else
+       (match_operand 1 "SImode_address_operand")
+       (const_string "SI")
+       (const_string "<MODE>")))])
+
+;; Add instructions
+
+(define_expand "add<mode>3"
+  [(set (match_operand:SDWIM 0 "nonimmediate_operand")
+	(plus:SDWIM (match_operand:SDWIM 1 "nonimmediate_operand")
+		    (match_operand:SDWIM 2 "<general_operand>")))]
+  ""
+  "ix86_expand_binary_operator (PLUS, <MODE>mode, operands); DONE;")
+
+(define_insn_and_split "*add<dwi>3_doubleword"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=r,o")
+	(plus:<DWI>
+	  (match_operand:<DWI> 1 "nonimmediate_operand" "%0,0")
+	  (match_operand:<DWI> 2 "<general_operand>" "ro<di>,r<di>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (PLUS, <DWI>mode, operands)"
+  "#"
+  "reload_completed"
+  [(parallel [(set (reg:CC FLAGS_REG)
+		   (unspec:CC [(match_dup 1) (match_dup 2)]
+			      UNSPEC_ADD_CARRY))
+	      (set (match_dup 0)
+		   (plus:DWIH (match_dup 1) (match_dup 2)))])
+   (parallel [(set (match_dup 3)
+		   (plus:DWIH
+		     (match_dup 4)
+		     (plus:DWIH
+		       (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
+		       (match_dup 5))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "split_double_mode (<DWI>mode, &operands[0], 3, &operands[0], &operands[3]);")
+
+(define_insn "*add<mode>3_cc"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_operand:SWI48 1 "nonimmediate_operand" "%0,0")
+	   (match_operand:SWI48 2 "<general_operand>" "r<i>,rm")]
+	  UNSPEC_ADD_CARRY))
+   (set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
+	(plus:SWI48 (match_dup 1) (match_dup 2)))]
+  "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+  "add{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "addqi3_cc"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_operand:QI 1 "nonimmediate_operand" "%0,0")
+	   (match_operand:QI 2 "general_operand" "qn,qm")]
+	  UNSPEC_ADD_CARRY))
+   (set (match_operand:QI 0 "nonimmediate_operand" "=qm,q")
+	(plus:QI (match_dup 1) (match_dup 2)))]
+  "ix86_binary_operator_ok (PLUS, QImode, operands)"
+  "add{b}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "QI")])
+
+(define_insn "*add<mode>_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r,rm,r,r")
+	(plus:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r")
+	  (match_operand:SWI48 2 "x86_64_general_operand" "rme,re,0,le")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LEA:
+      return "#";
+
+    case TYPE_INCDEC:
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (operands[2] == const1_rtx)
+        return "inc{<imodesuffix>}\t%0";
+      else
+        {
+	  gcc_assert (operands[2] == constm1_rtx);
+          return "dec{<imodesuffix>}\t%0";
+	}
+
+    default:
+      /* For most processors, ADD is faster than LEA.  This alternative
+	 was added to use ADD as much as possible.  */
+      if (which_alternative == 2)
+	{
+	  rtx tmp;
+	  tmp = operands[1], operands[1] = operands[2], operands[2] = tmp;
+	}
+        
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (x86_maybe_negate_const_int (&operands[2], <MODE>mode))
+        return "sub{<imodesuffix>}\t{%2, %0|%0, %2}";
+
+      return "add{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (cond [(eq_attr "alternative" "3")
+              (const_string "lea")
+	    (match_operand:SWI48 2 "incdec_operand")
+	      (const_string "incdec")
+	   ]
+	   (const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+;; It may seem that nonimmediate operand is proper one for operand 1.
+;; The addsi_1 pattern allows nonimmediate operand at that place and
+;; we take care in ix86_binary_operator_ok to not allow two memory
+;; operands so proper swapping will be done in reload.  This allow
+;; patterns constructed from addsi_1 to match.
+
+(define_insn "addsi_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+	(zero_extend:DI
+	  (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,r,r")
+		   (match_operand:SI 2 "x86_64_general_operand" "rme,0,le"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LEA:
+      return "#";
+
+    case TYPE_INCDEC:
+      if (operands[2] == const1_rtx)
+        return "inc{l}\t%k0";
+      else
+        {
+	  gcc_assert (operands[2] == constm1_rtx);
+          return "dec{l}\t%k0";
+	}
+
+    default:
+      /* For most processors, ADD is faster than LEA.  This alternative
+	 was added to use ADD as much as possible.  */
+      if (which_alternative == 1)
+	{
+	  rtx tmp;
+	  tmp = operands[1], operands[1] = operands[2], operands[2] = tmp;
+	}
+
+      if (x86_maybe_negate_const_int (&operands[2], SImode))
+        return "sub{l}\t{%2, %k0|%k0, %2}";
+
+      return "add{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set (attr "type")
+     (cond [(eq_attr "alternative" "2")
+	      (const_string "lea")
+	    (match_operand:SI 2 "incdec_operand")
+	      (const_string "incdec")
+	   ]
+	   (const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "SI")])
+
+(define_insn "*addhi_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r,r,Yp")
+	(plus:HI (match_operand:HI 1 "nonimmediate_operand" "%0,0,r,Yp")
+		 (match_operand:HI 2 "general_operand" "rn,rm,0,ln")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (PLUS, HImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LEA:
+      return "#";
+
+    case TYPE_INCDEC:
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (operands[2] == const1_rtx)
+	return "inc{w}\t%0";
+      else
+	{
+	  gcc_assert (operands[2] == constm1_rtx);
+	  return "dec{w}\t%0";
+	}
+
+    default:
+      /* For most processors, ADD is faster than LEA.  This alternative
+	 was added to use ADD as much as possible.  */
+      if (which_alternative == 2)
+	{
+	  rtx tmp;
+	  tmp = operands[1], operands[1] = operands[2], operands[2] = tmp;
+	}
+
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (x86_maybe_negate_const_int (&operands[2], HImode))
+	return "sub{w}\t{%2, %0|%0, %2}";
+
+      return "add{w}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (cond [(eq_attr "alternative" "3")
+              (const_string "lea")
+	    (match_operand:HI 2 "incdec_operand")
+	      (const_string "incdec")
+	   ]
+	   (const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "HI,HI,HI,SI")])
+
+;; %%% Potential partial reg stall on alternatives 3 and 4.  What to do?
+(define_insn "*addqi_1"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,q,r,r,Yp")
+	(plus:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,q,0,r,Yp")
+		 (match_operand:QI 2 "general_operand" "qn,qm,0,rn,0,ln")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (PLUS, QImode, operands)"
+{
+  bool widen = (which_alternative == 3 || which_alternative == 4);
+
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LEA:
+      return "#";
+
+    case TYPE_INCDEC:
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (operands[2] == const1_rtx)
+	return widen ? "inc{l}\t%k0" : "inc{b}\t%0";
+      else
+	{
+	  gcc_assert (operands[2] == constm1_rtx);
+	  return widen ? "dec{l}\t%k0" : "dec{b}\t%0";
+	}
+
+    default:
+      /* For most processors, ADD is faster than LEA.  These alternatives
+	 were added to use ADD as much as possible.  */
+      if (which_alternative == 2 || which_alternative == 4)
+	{
+	  rtx tmp;
+	  tmp = operands[1], operands[1] = operands[2], operands[2] = tmp;
+	}
+
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (x86_maybe_negate_const_int (&operands[2], QImode))
+	{
+	  if (widen)
+	    return "sub{l}\t{%2, %k0|%k0, %2}";
+	  else
+	    return "sub{b}\t{%2, %0|%0, %2}";
+	}
+      if (widen)
+        return "add{l}\t{%k2, %k0|%k0, %k2}";
+      else
+        return "add{b}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (cond [(eq_attr "alternative" "5")
+              (const_string "lea")
+	    (match_operand:QI 2 "incdec_operand")
+	      (const_string "incdec")
+	   ]
+	   (const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "QI,QI,QI,SI,SI,SI")])
+
+(define_insn "*addqi_1_slp"
+  [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm,q"))
+	(plus:QI (match_dup 0)
+		 (match_operand:QI 1 "general_operand" "qn,qm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "(! TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_INCDEC:
+      if (operands[1] == const1_rtx)
+	return "inc{b}\t%0";
+      else
+	{
+	  gcc_assert (operands[1] == constm1_rtx);
+	  return "dec{b}\t%0";
+	}
+
+    default:
+      if (x86_maybe_negate_const_int (&operands[1], QImode))
+	return "sub{b}\t{%1, %0|%0, %1}";
+
+      return "add{b}\t{%1, %0|%0, %1}";
+    }
+}
+  [(set (attr "type")
+     (if_then_else (match_operand:QI 1 "incdec_operand")
+	(const_string "incdec")
+	(const_string "alu1")))
+   (set (attr "memory")
+     (if_then_else (match_operand 1 "memory_operand")
+        (const_string "load")
+        (const_string "none")))
+   (set_attr "mode" "QI")])
+
+;; Split non destructive adds if we cannot use lea.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+	(plus:SWI48 (match_operand:SWI48 1 "register_operand")
+		    (match_operand:SWI48 2 "x86_64_nonmemory_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed && ix86_avoid_lea_for_add (insn, operands)"
+  [(set (match_dup 0) (match_dup 1))
+   (parallel [(set (match_dup 0) (plus:SWI48 (match_dup 0) (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+;; Convert add to the lea pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI 0 "register_operand")
+	(plus:SWI (match_operand:SWI 1 "register_operand")
+		  (match_operand:SWI 2 "<nonmemory_operand>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed && ix86_lea_for_add_ok (insn, operands)" 
+  [(const_int 0)]
+{
+  enum machine_mode mode = <MODE>mode;
+  rtx pat;
+
+  if (<MODE_SIZE> < GET_MODE_SIZE (SImode))
+    { 
+      mode = SImode; 
+      operands[0] = gen_lowpart (mode, operands[0]);
+      operands[1] = gen_lowpart (mode, operands[1]);
+      operands[2] = gen_lowpart (mode, operands[2]);
+    }
+
+  pat = gen_rtx_PLUS (mode, operands[1], operands[2]);
+
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat));
+  DONE;
+})
+
+;; Split non destructive adds if we cannot use lea.
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+  	(zero_extend:DI
+	  (plus:SI (match_operand:SI 1 "register_operand")
+		   (match_operand:SI 2 "x86_64_nonmemory_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && reload_completed && ix86_avoid_lea_for_add (insn, operands)"
+  [(set (match_dup 3) (match_dup 1))
+   (parallel [(set (match_dup 0)
+   	     	   (zero_extend:DI (plus:SI (match_dup 3) (match_dup 2))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "operands[3] = gen_lowpart (SImode, operands[0]);")
+
+;; Convert add to the lea pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(zero_extend:DI
+	  (plus:SI (match_operand:SI 1 "register_operand")
+		   (match_operand:SI 2 "x86_64_nonmemory_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && reload_completed && ix86_lea_for_add_ok (insn, operands)"
+  [(set (match_dup 0)
+	(zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))])
+
+(define_insn "*add<mode>_2"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (plus:SWI
+	    (match_operand:SWI 1 "nonimmediate_operand" "%0,0,<r>")
+	    (match_operand:SWI 2 "<general_operand>" "<g>,<r><i>,0"))
+	  (const_int 0)))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>,<r>m,<r>")
+	(plus:SWI (match_dup 1) (match_dup 2)))]
+  "ix86_match_ccmode (insn, CCGOCmode)
+   && ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_INCDEC:
+      if (operands[2] == const1_rtx)
+        return "inc{<imodesuffix>}\t%0";
+      else
+        {
+	  gcc_assert (operands[2] == constm1_rtx);
+          return "dec{<imodesuffix>}\t%0";
+	}
+
+    default:
+      if (which_alternative == 2)
+	{
+	  rtx tmp;
+	  tmp = operands[1], operands[1] = operands[2], operands[2] = tmp;
+	}
+        
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (x86_maybe_negate_const_int (&operands[2], <MODE>mode))
+        return "sub{<imodesuffix>}\t{%2, %0|%0, %2}";
+
+      return "add{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (if_then_else (match_operand:SWI 2 "incdec_operand")
+	(const_string "incdec")
+	(const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+;; See comment for addsi_1_zext why we do use nonimmediate_operand
+(define_insn "*addsi_2_zext"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,r")
+		   (match_operand:SI 2 "x86_64_general_operand" "rme,0"))
+	  (const_int 0)))
+   (set (match_operand:DI 0 "register_operand" "=r,r")
+	(zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))]
+  "TARGET_64BIT && ix86_match_ccmode (insn, CCGOCmode)
+   && ix86_binary_operator_ok (PLUS, SImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_INCDEC:
+      if (operands[2] == const1_rtx)
+        return "inc{l}\t%k0";
+      else
+	{
+	  gcc_assert (operands[2] == constm1_rtx);
+          return "dec{l}\t%k0";
+	}
+
+    default:
+      if (which_alternative == 1)
+	{
+	  rtx tmp;
+	  tmp = operands[1], operands[1] = operands[2], operands[2] = tmp;
+	}
+
+      if (x86_maybe_negate_const_int (&operands[2], SImode))
+        return "sub{l}\t{%2, %k0|%k0, %2}";
+
+      return "add{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set (attr "type")
+     (if_then_else (match_operand:SI 2 "incdec_operand")
+	(const_string "incdec")
+	(const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "SI")])
+
+(define_insn "*add<mode>_3"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (neg:SWI (match_operand:SWI 2 "<general_operand>" "<g>,0"))
+	  (match_operand:SWI 1 "nonimmediate_operand" "%0,<r>")))
+   (clobber (match_scratch:SWI 0 "=<r>,<r>"))]
+  "ix86_match_ccmode (insn, CCZmode)
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_INCDEC:
+      if (operands[2] == const1_rtx)
+        return "inc{<imodesuffix>}\t%0";
+      else
+        {
+	  gcc_assert (operands[2] == constm1_rtx);
+          return "dec{<imodesuffix>}\t%0";
+	}
+
+    default:
+      if (which_alternative == 1)
+	{
+	  rtx tmp;
+	  tmp = operands[1], operands[1] = operands[2], operands[2] = tmp;
+	}
+
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (x86_maybe_negate_const_int (&operands[2], <MODE>mode))
+        return "sub{<imodesuffix>}\t{%2, %0|%0, %2}";
+
+      return "add{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (if_then_else (match_operand:SWI 2 "incdec_operand")
+	(const_string "incdec")
+	(const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+;; See comment for addsi_1_zext why we do use nonimmediate_operand
+(define_insn "*addsi_3_zext"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (neg:SI (match_operand:SI 2 "x86_64_general_operand" "rme,0"))
+	  (match_operand:SI 1 "nonimmediate_operand" "%0,r")))
+   (set (match_operand:DI 0 "register_operand" "=r,r")
+	(zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))]
+  "TARGET_64BIT && ix86_match_ccmode (insn, CCZmode)
+   && ix86_binary_operator_ok (PLUS, SImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_INCDEC:
+      if (operands[2] == const1_rtx)
+        return "inc{l}\t%k0";
+      else
+        {
+	  gcc_assert (operands[2] == constm1_rtx);
+          return "dec{l}\t%k0";
+	}
+
+    default:
+      if (which_alternative == 1)
+	{
+	  rtx tmp;
+	  tmp = operands[1], operands[1] = operands[2], operands[2] = tmp;
+	}
+
+      if (x86_maybe_negate_const_int (&operands[2], SImode))
+        return "sub{l}\t{%2, %k0|%k0, %2}";
+
+      return "add{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set (attr "type")
+     (if_then_else (match_operand:SI 2 "incdec_operand")
+	(const_string "incdec")
+	(const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "SI")])
+
+; For comparisons against 1, -1 and 128, we may generate better code
+; by converting cmp to add, inc or dec as done by peephole2.  This pattern
+; is matched then.  We can't accept general immediate, because for
+; case of overflows,  the result is messed up.
+; Also carry flag is reversed compared to cmp, so this conversion is valid
+; only for comparisons not depending on it.
+
+(define_insn "*adddi_4"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (match_operand:DI 1 "nonimmediate_operand" "0")
+	  (match_operand:DI 2 "x86_64_immediate_operand" "e")))
+   (clobber (match_scratch:DI 0 "=rm"))]
+  "TARGET_64BIT
+   && ix86_match_ccmode (insn, CCGCmode)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_INCDEC:
+      if (operands[2] == constm1_rtx)
+        return "inc{q}\t%0";
+      else
+        {
+	  gcc_assert (operands[2] == const1_rtx);
+          return "dec{q}\t%0";
+	}
+
+    default:
+      if (x86_maybe_negate_const_int (&operands[2], DImode))
+	return "add{q}\t{%2, %0|%0, %2}";
+
+      return "sub{q}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (if_then_else (match_operand:DI 2 "incdec_operand")
+	(const_string "incdec")
+	(const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "DI")])
+
+; For comparisons against 1, -1 and 128, we may generate better code
+; by converting cmp to add, inc or dec as done by peephole2.  This pattern
+; is matched then.  We can't accept general immediate, because for
+; case of overflows,  the result is messed up.
+; Also carry flag is reversed compared to cmp, so this conversion is valid
+; only for comparisons not depending on it.
+
+(define_insn "*add<mode>_4"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (match_operand:SWI124 1 "nonimmediate_operand" "0")
+	  (match_operand:SWI124 2 "const_int_operand" "n")))
+   (clobber (match_scratch:SWI124 0 "=<r>m"))]
+  "ix86_match_ccmode (insn, CCGCmode)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_INCDEC:
+      if (operands[2] == constm1_rtx)
+        return "inc{<imodesuffix>}\t%0";
+      else
+        {
+	  gcc_assert (operands[2] == const1_rtx);
+          return "dec{<imodesuffix>}\t%0";
+	}
+
+    default:
+      if (x86_maybe_negate_const_int (&operands[2], <MODE>mode))
+	return "add{<imodesuffix>}\t{%2, %0|%0, %2}";
+
+      return "sub{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (if_then_else (match_operand:<MODE> 2 "incdec_operand")
+	(const_string "incdec")
+	(const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*add<mode>_5"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (plus:SWI
+	    (match_operand:SWI 1 "nonimmediate_operand" "%0,<r>")
+	    (match_operand:SWI 2 "<general_operand>" "<g>,0"))
+	  (const_int 0)))
+   (clobber (match_scratch:SWI 0 "=<r>,<r>"))]
+  "ix86_match_ccmode (insn, CCGOCmode)
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_INCDEC:
+      if (operands[2] == const1_rtx)
+        return "inc{<imodesuffix>}\t%0";
+      else
+        {
+          gcc_assert (operands[2] == constm1_rtx);
+          return "dec{<imodesuffix>}\t%0";
+	}
+
+    default:
+      if (which_alternative == 1)
+	{
+	  rtx tmp;
+	  tmp = operands[1], operands[1] = operands[2], operands[2] = tmp;
+	}
+
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (x86_maybe_negate_const_int (&operands[2], <MODE>mode))
+        return "sub{<imodesuffix>}\t{%2, %0|%0, %2}";
+
+      return "add{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (if_then_else (match_operand:SWI 2 "incdec_operand")
+	(const_string "incdec")
+	(const_string "alu")))
+   (set (attr "length_immediate")
+      (if_then_else
+	(and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+	(const_string "1")
+	(const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "addqi_ext_1"
+  [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q,Q")
+			 (const_int 8)
+			 (const_int 8))
+	(plus:SI
+	  (zero_extract:SI
+	    (match_operand 1 "ext_register_operand" "0,0")
+	    (const_int 8)
+	    (const_int 8))
+	  (match_operand:QI 2 "general_x64nomem_operand" "Qn,m")))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_INCDEC:
+      if (operands[2] == const1_rtx)
+	return "inc{b}\t%h0";
+      else
+        {
+	  gcc_assert (operands[2] == constm1_rtx);
+          return "dec{b}\t%h0";
+        }
+
+    default:
+      return "add{b}\t{%2, %h0|%h0, %2}";
+    }
+}
+  [(set_attr "isa" "*,nox64")
+   (set (attr "type")
+     (if_then_else (match_operand:QI 2 "incdec_operand")
+	(const_string "incdec")
+	(const_string "alu")))
+   (set_attr "modrm" "1")
+   (set_attr "mode" "QI")])
+
+(define_insn "*addqi_ext_2"
+  [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q")
+			 (const_int 8)
+			 (const_int 8))
+	(plus:SI
+	  (zero_extract:SI
+	    (match_operand 1 "ext_register_operand" "%0")
+	    (const_int 8)
+	    (const_int 8))
+	  (zero_extract:SI
+	    (match_operand 2 "ext_register_operand" "Q")
+	    (const_int 8)
+	    (const_int 8))))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "add{b}\t{%h2, %h0|%h0, %h2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "QI")])
+
+;; Add with jump on overflow.
+(define_expand "addv<mode>4"
+  [(parallel [(set (reg:CCO FLAGS_REG)
+		   (eq:CCO (plus:<DWI>
+			      (sign_extend:<DWI>
+				 (match_operand:SWI 1 "nonimmediate_operand"))
+			      (sign_extend:<DWI>
+				 (match_operand:SWI 2 "<general_operand>")))
+			   (sign_extend:<DWI>
+			      (plus:SWI (match_dup 1) (match_dup 2)))))
+	      (set (match_operand:SWI 0 "register_operand")
+		   (plus:SWI (match_dup 1) (match_dup 2)))])
+   (set (pc) (if_then_else
+	       (eq (reg:CCO FLAGS_REG) (const_int 0))
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  ""
+  "ix86_fixup_binary_operands_no_copy (PLUS, <MODE>mode, operands);")
+
+(define_insn "*addv<mode>4"
+  [(set (reg:CCO FLAGS_REG)
+	(eq:CCO (plus:<DWI>
+		   (sign_extend:<DWI>
+		      (match_operand:SWI 1 "nonimmediate_operand" "%0,0"))
+		   (sign_extend:<DWI>
+		      (match_operand:SWI 2 "<general_operand>" "<g>,<r><i>")))
+		(sign_extend:<DWI>
+		   (plus:SWI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>,<r>m")
+	(plus:SWI (match_dup 1) (match_dup 2)))]
+  "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+  "add{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+;; The lea patterns for modes less than 32 bits need to be matched by
+;; several insns converted to real lea by splitters.
+
+(define_insn_and_split "*lea_general_1"
+  [(set (match_operand 0 "register_operand" "=r")
+	(plus (plus (match_operand 1 "index_register_operand" "l")
+		    (match_operand 2 "register_operand" "r"))
+	      (match_operand 3 "immediate_operand" "i")))]
+  "(GET_MODE (operands[0]) == QImode || GET_MODE (operands[0]) == HImode)
+   && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && GET_MODE (operands[0]) == GET_MODE (operands[1])
+   && GET_MODE (operands[0]) == GET_MODE (operands[2])
+   && (GET_MODE (operands[0]) == GET_MODE (operands[3])
+       || GET_MODE (operands[3]) == VOIDmode)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  enum machine_mode mode = SImode;
+  rtx pat;
+
+  operands[0] = gen_lowpart (mode, operands[0]);
+  operands[1] = gen_lowpart (mode, operands[1]);
+  operands[2] = gen_lowpart (mode, operands[2]);
+  operands[3] = gen_lowpart (mode, operands[3]);
+
+  pat = gen_rtx_PLUS (mode, gen_rtx_PLUS (mode, operands[1], operands[2]),
+  		      operands[3]);
+
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat));
+  DONE;
+}
+  [(set_attr "type" "lea")
+   (set_attr "mode" "SI")])
+
+(define_insn_and_split "*lea_general_2"
+  [(set (match_operand 0 "register_operand" "=r")
+	(plus (mult (match_operand 1 "index_register_operand" "l")
+		    (match_operand 2 "const248_operand" "n"))
+	      (match_operand 3 "nonmemory_operand" "ri")))]
+  "(GET_MODE (operands[0]) == QImode || GET_MODE (operands[0]) == HImode)
+   && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && GET_MODE (operands[0]) == GET_MODE (operands[1])
+   && (GET_MODE (operands[0]) == GET_MODE (operands[3])
+       || GET_MODE (operands[3]) == VOIDmode)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  enum machine_mode mode = SImode;
+  rtx pat;
+
+  operands[0] = gen_lowpart (mode, operands[0]);
+  operands[1] = gen_lowpart (mode, operands[1]);
+  operands[3] = gen_lowpart (mode, operands[3]);
+
+  pat = gen_rtx_PLUS (mode, gen_rtx_MULT (mode, operands[1], operands[2]),
+		      operands[3]);
+
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat));
+  DONE;
+}
+  [(set_attr "type" "lea")
+   (set_attr "mode" "SI")])
+
+(define_insn_and_split "*lea_general_3"
+  [(set (match_operand 0 "register_operand" "=r")
+	(plus (plus (mult (match_operand 1 "index_register_operand" "l")
+			  (match_operand 2 "const248_operand" "n"))
+		    (match_operand 3 "register_operand" "r"))
+	      (match_operand 4 "immediate_operand" "i")))]
+  "(GET_MODE (operands[0]) == QImode || GET_MODE (operands[0]) == HImode)
+   && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && GET_MODE (operands[0]) == GET_MODE (operands[1])
+   && GET_MODE (operands[0]) == GET_MODE (operands[3])"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  enum machine_mode mode = SImode;
+  rtx pat;
+
+  operands[0] = gen_lowpart (mode, operands[0]);
+  operands[1] = gen_lowpart (mode, operands[1]);
+  operands[3] = gen_lowpart (mode, operands[3]);
+  operands[4] = gen_lowpart (mode, operands[4]);
+
+  pat = gen_rtx_PLUS (mode,
+  		      gen_rtx_PLUS (mode,
+				    gen_rtx_MULT (mode, operands[1],
+		      					operands[2]),
+				    operands[3]),
+  		      operands[4]);
+
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat));
+  DONE;
+}
+  [(set_attr "type" "lea")
+   (set_attr "mode" "SI")])
+
+(define_insn_and_split "*lea_general_4"
+  [(set (match_operand 0 "register_operand" "=r")
+	(any_or (ashift
+		  (match_operand 1 "index_register_operand" "l")
+		  (match_operand 2 "const_int_operand" "n"))
+		(match_operand 3 "const_int_operand" "n")))]
+  "(((GET_MODE (operands[0]) == QImode || GET_MODE (operands[0]) == HImode)
+      && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)))
+    || GET_MODE (operands[0]) == SImode
+    || (TARGET_64BIT && GET_MODE (operands[0]) == DImode))
+   && GET_MODE (operands[0]) == GET_MODE (operands[1])
+   && ((unsigned HOST_WIDE_INT) INTVAL (operands[2])) - 1 < 3
+   && ((unsigned HOST_WIDE_INT) INTVAL (operands[3])
+       < ((unsigned HOST_WIDE_INT) 1 << INTVAL (operands[2])))"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  enum machine_mode mode = GET_MODE (operands[0]);
+  rtx pat;
+
+  if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
+    { 
+      mode = SImode; 
+      operands[0] = gen_lowpart (mode, operands[0]);
+      operands[1] = gen_lowpart (mode, operands[1]);
+    }
+
+  operands[2] = GEN_INT (1 << INTVAL (operands[2]));
+
+  pat = plus_constant (mode, gen_rtx_MULT (mode, operands[1], operands[2]),
+		       INTVAL (operands[3]));
+
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat));
+  DONE;
+}
+  [(set_attr "type" "lea")
+   (set (attr "mode")
+      (if_then_else (match_operand:DI 0)
+	(const_string "DI")
+	(const_string "SI")))])
+
+;; Subtract instructions
+
+(define_expand "sub<mode>3"
+  [(set (match_operand:SDWIM 0 "nonimmediate_operand")
+	(minus:SDWIM (match_operand:SDWIM 1 "nonimmediate_operand")
+		     (match_operand:SDWIM 2 "<general_operand>")))]
+  ""
+  "ix86_expand_binary_operator (MINUS, <MODE>mode, operands); DONE;")
+
+(define_insn_and_split "*sub<dwi>3_doubleword"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=r,o")
+	(minus:<DWI>
+	  (match_operand:<DWI> 1 "nonimmediate_operand" "0,0")
+	  (match_operand:<DWI> 2 "<general_operand>" "ro<di>,r<di>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)"
+  "#"
+  "reload_completed"
+  [(parallel [(set (reg:CC FLAGS_REG)
+		   (compare:CC (match_dup 1) (match_dup 2)))
+	      (set (match_dup 0)
+		   (minus:DWIH (match_dup 1) (match_dup 2)))])
+   (parallel [(set (match_dup 3)
+		   (minus:DWIH
+		     (match_dup 4)
+		     (plus:DWIH
+		       (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
+		       (match_dup 5))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "split_double_mode (<DWI>mode, &operands[0], 3, &operands[0], &operands[3]);")
+
+(define_insn "*sub<mode>_1"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>")
+	(minus:SWI
+	  (match_operand:SWI 1 "nonimmediate_operand" "0,0")
+	  (match_operand:SWI 2 "<general_operand>" "<r><i>,<r>m")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)"
+  "sub{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*subsi_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (minus:SI (match_operand:SI 1 "register_operand" "0")
+		    (match_operand:SI 2 "x86_64_general_operand" "rme"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (MINUS, SImode, operands)"
+  "sub{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI")])
+
+(define_insn "*subqi_1_slp"
+  [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm,q"))
+	(minus:QI (match_dup 0)
+		  (match_operand:QI 1 "general_operand" "qn,qm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "(! TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "sub{b}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "QI")])
+
+(define_insn "*sub<mode>_2"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (minus:SWI
+	    (match_operand:SWI 1 "nonimmediate_operand" "0,0")
+	    (match_operand:SWI 2 "<general_operand>" "<r><i>,<r>m"))
+	  (const_int 0)))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>")
+	(minus:SWI (match_dup 1) (match_dup 2)))]
+  "ix86_match_ccmode (insn, CCGOCmode)
+   && ix86_binary_operator_ok (MINUS, <MODE>mode, operands)"
+  "sub{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*subsi_2_zext"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (minus:SI (match_operand:SI 1 "register_operand" "0")
+		    (match_operand:SI 2 "x86_64_general_operand" "rme"))
+	  (const_int 0)))
+   (set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (minus:SI (match_dup 1)
+		    (match_dup 2))))]
+  "TARGET_64BIT && ix86_match_ccmode (insn, CCGOCmode)
+   && ix86_binary_operator_ok (MINUS, SImode, operands)"
+  "sub{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI")])
+
+;; Subtract with jump on overflow.
+(define_expand "subv<mode>4"
+  [(parallel [(set (reg:CCO FLAGS_REG)
+		   (eq:CCO (minus:<DWI>
+			      (sign_extend:<DWI>
+				 (match_operand:SWI 1 "nonimmediate_operand"))
+			      (sign_extend:<DWI>
+				 (match_operand:SWI 2 "<general_operand>")))
+			   (sign_extend:<DWI>
+			      (minus:SWI (match_dup 1) (match_dup 2)))))
+	      (set (match_operand:SWI 0 "register_operand")
+		   (minus:SWI (match_dup 1) (match_dup 2)))])
+   (set (pc) (if_then_else
+	       (eq (reg:CCO FLAGS_REG) (const_int 0))
+	       (label_ref (match_operand 3))
+	       (pc)))]
+  ""
+  "ix86_fixup_binary_operands_no_copy (MINUS, <MODE>mode, operands);")
+
+(define_insn "*subv<mode>4"
+  [(set (reg:CCO FLAGS_REG)
+	(eq:CCO (minus:<DWI>
+		   (sign_extend:<DWI>
+		      (match_operand:SWI 1 "nonimmediate_operand" "0,0"))
+		   (sign_extend:<DWI>
+		      (match_operand:SWI 2 "<general_operand>" "<r><i>,<r>m")))
+		(sign_extend:<DWI>
+		   (minus:SWI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>")
+	(minus:SWI (match_dup 1) (match_dup 2)))]
+  "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)"
+  "sub{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*sub<mode>_3"
+  [(set (reg FLAGS_REG)
+	(compare (match_operand:SWI 1 "nonimmediate_operand" "0,0")
+		 (match_operand:SWI 2 "<general_operand>" "<r><i>,<r>m")))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>")
+	(minus:SWI (match_dup 1) (match_dup 2)))]
+  "ix86_match_ccmode (insn, CCmode)
+   && ix86_binary_operator_ok (MINUS, <MODE>mode, operands)"
+  "sub{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*subsi_3_zext"
+  [(set (reg FLAGS_REG)
+	(compare (match_operand:SI 1 "register_operand" "0")
+		 (match_operand:SI 2 "x86_64_general_operand" "rme")))
+   (set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (minus:SI (match_dup 1)
+		    (match_dup 2))))]
+  "TARGET_64BIT && ix86_match_ccmode (insn, CCmode)
+   && ix86_binary_operator_ok (MINUS, SImode, operands)"
+  "sub{l}\t{%2, %1|%1, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI")])
+
+;; Add with carry and subtract with borrow
+
+(define_expand "<plusminus_insn><mode>3_carry"
+  [(parallel
+    [(set (match_operand:SWI 0 "nonimmediate_operand")
+	  (plusminus:SWI
+	    (match_operand:SWI 1 "nonimmediate_operand")
+	    (plus:SWI (match_operator:SWI 4 "ix86_carry_flag_operator"
+		       [(match_operand 3 "flags_reg_operand")
+			(const_int 0)])
+		      (match_operand:SWI 2 "<general_operand>"))))
+     (clobber (reg:CC FLAGS_REG))])]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)")
+
+(define_insn "*<plusminus_insn><mode>3_carry"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>")
+	(plusminus:SWI
+	  (match_operand:SWI 1 "nonimmediate_operand" "<comm>0,0")
+	  (plus:SWI
+	    (match_operator 3 "ix86_carry_flag_operator"
+	     [(reg FLAGS_REG) (const_int 0)])
+	    (match_operand:SWI 2 "<general_operand>" "<r><i>,<r>m"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+  "<plusminus_carry_mnemonic>{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "use_carry" "1")
+   (set_attr "pent_pair" "pu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*addsi3_carry_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0")
+		   (plus:SI (match_operator 3 "ix86_carry_flag_operator"
+			     [(reg FLAGS_REG) (const_int 0)])
+			    (match_operand:SI 2 "x86_64_general_operand" "rme")))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands)"
+  "adc{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "use_carry" "1")
+   (set_attr "pent_pair" "pu")
+   (set_attr "mode" "SI")])
+
+(define_insn "*subsi3_carry_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (minus:SI (match_operand:SI 1 "register_operand" "0")
+		    (plus:SI (match_operator 3 "ix86_carry_flag_operator"
+			      [(reg FLAGS_REG) (const_int 0)])
+			     (match_operand:SI 2 "x86_64_general_operand" "rme")))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (MINUS, SImode, operands)"
+  "sbb{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "pent_pair" "pu")
+   (set_attr "mode" "SI")])
+
+;; ADCX instruction
+
+(define_insn "adcx<mode>3"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (plus:SWI48
+	    (match_operand:SWI48 1 "nonimmediate_operand" "%0")
+	    (plus:SWI48
+	      (match_operator 4 "ix86_carry_flag_operator"
+	       [(match_operand 3 "flags_reg_operand") (const_int 0)])
+	      (match_operand:SWI48 2 "nonimmediate_operand" "rm")))
+	  (const_int 0)))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+	(plus:SWI48 (match_dup 1)
+		    (plus:SWI48 (match_op_dup 4
+				 [(match_dup 3) (const_int 0)])
+				(match_dup 2))))]
+  "TARGET_ADX && ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+  "adcx\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "use_carry" "1")
+   (set_attr "mode" "<MODE>")])
+
+;; Overflow setting add instructions
+
+(define_insn "*add<mode>3_cconly_overflow"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (plus:SWI
+	    (match_operand:SWI 1 "nonimmediate_operand" "%0")
+	    (match_operand:SWI 2 "<general_operand>" "<g>"))
+	  (match_dup 1)))
+   (clobber (match_scratch:SWI 0 "=<r>"))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "add{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*add<mode>3_cc_overflow"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	    (plus:SWI
+		(match_operand:SWI 1 "nonimmediate_operand" "%0,0")
+		(match_operand:SWI 2 "<general_operand>" "<r><i>,<r>m"))
+	    (match_dup 1)))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m,<r>")
+	(plus:SWI (match_dup 1) (match_dup 2)))]
+  "ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+  "add{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*addsi3_zext_cc_overflow"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (plus:SI
+	    (match_operand:SI 1 "nonimmediate_operand" "%0")
+	    (match_operand:SI 2 "x86_64_general_operand" "rme"))
+	  (match_dup 1)))
+   (set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))]
+  "TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands)"
+  "add{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI")])
+
+;; The patterns that match these are at the end of this file.
+
+(define_expand "<plusminus_insn>xf3"
+  [(set (match_operand:XF 0 "register_operand")
+	(plusminus:XF
+	  (match_operand:XF 1 "register_operand")
+	  (match_operand:XF 2 "register_operand")))]
+  "TARGET_80387")
+
+(define_expand "<plusminus_insn><mode>3"
+  [(set (match_operand:MODEF 0 "register_operand")
+	(plusminus:MODEF
+	  (match_operand:MODEF 1 "register_operand")
+	  (match_operand:MODEF 2 "nonimmediate_operand")))]
+  "(TARGET_80387 && X87_ENABLE_ARITH (<MODE>mode))
+    || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)")
+
+;; Multiply instructions
+
+(define_expand "mul<mode>3"
+  [(parallel [(set (match_operand:SWIM248 0 "register_operand")
+		   (mult:SWIM248
+		     (match_operand:SWIM248 1 "register_operand")
+		     (match_operand:SWIM248 2 "<general_operand>")))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+(define_expand "mulqi3"
+  [(parallel [(set (match_operand:QI 0 "register_operand")
+		   (mult:QI
+		     (match_operand:QI 1 "register_operand")
+		     (match_operand:QI 2 "nonimmediate_operand")))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_QIMODE_MATH")
+
+;; On AMDFAM10
+;; IMUL reg32/64, reg32/64, imm8 	Direct
+;; IMUL reg32/64, mem32/64, imm8 	VectorPath
+;; IMUL reg32/64, reg32/64, imm32 	Direct
+;; IMUL reg32/64, mem32/64, imm32 	VectorPath
+;; IMUL reg32/64, reg32/64 		Direct
+;; IMUL reg32/64, mem32/64 		Direct
+;;
+;; On BDVER1, all above IMULs use DirectPath
+
+(define_insn "*mul<mode>3_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r,r,r")
+	(mult:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "%rm,rm,0")
+	  (match_operand:SWI48 2 "<general_operand>" "K,<i>,mr")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   imul{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
+   imul{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
+   imul{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "imul")
+   (set_attr "prefix_0f" "0,0,1")
+   (set (attr "athlon_decode")
+	(cond [(eq_attr "cpu" "athlon")
+		  (const_string "vector")
+	       (eq_attr "alternative" "1")
+		  (const_string "vector")
+	       (and (eq_attr "alternative" "2")
+		    (match_operand 1 "memory_operand"))
+		  (const_string "vector")]
+	      (const_string "direct")))
+   (set (attr "amdfam10_decode")
+	(cond [(and (eq_attr "alternative" "0,1")
+		    (match_operand 1 "memory_operand"))
+		  (const_string "vector")]
+	      (const_string "direct")))
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*mulsi3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+	(zero_extend:DI
+	  (mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0")
+		   (match_operand:SI 2 "x86_64_general_operand" "K,e,mr"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   imul{l}\t{%2, %1, %k0|%k0, %1, %2}
+   imul{l}\t{%2, %1, %k0|%k0, %1, %2}
+   imul{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "imul")
+   (set_attr "prefix_0f" "0,0,1")
+   (set (attr "athlon_decode")
+	(cond [(eq_attr "cpu" "athlon")
+		  (const_string "vector")
+	       (eq_attr "alternative" "1")
+		  (const_string "vector")
+	       (and (eq_attr "alternative" "2")
+		    (match_operand 1 "memory_operand"))
+		  (const_string "vector")]
+	      (const_string "direct")))
+   (set (attr "amdfam10_decode")
+	(cond [(and (eq_attr "alternative" "0,1")
+		    (match_operand 1 "memory_operand"))
+		  (const_string "vector")]
+	      (const_string "direct")))
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "SI")])
+
+;; On AMDFAM10
+;; IMUL reg16, reg16, imm8 	VectorPath
+;; IMUL reg16, mem16, imm8 	VectorPath
+;; IMUL reg16, reg16, imm16 	VectorPath
+;; IMUL reg16, mem16, imm16 	VectorPath
+;; IMUL reg16, reg16 		Direct
+;; IMUL reg16, mem16 		Direct
+;;
+;; On BDVER1, all HI MULs use DoublePath
+
+(define_insn "*mulhi3_1"
+  [(set (match_operand:HI 0 "register_operand" "=r,r,r")
+	(mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm,0")
+		 (match_operand:HI 2 "general_operand" "K,n,mr")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_HIMODE_MATH
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   imul{w}\t{%2, %1, %0|%0, %1, %2}
+   imul{w}\t{%2, %1, %0|%0, %1, %2}
+   imul{w}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "imul")
+   (set_attr "prefix_0f" "0,0,1")
+   (set (attr "athlon_decode")
+	(cond [(eq_attr "cpu" "athlon")
+		  (const_string "vector")
+	       (eq_attr "alternative" "1,2")
+		  (const_string "vector")]
+	      (const_string "direct")))
+   (set (attr "amdfam10_decode")
+	(cond [(eq_attr "alternative" "0,1")
+		  (const_string "vector")]
+	      (const_string "direct")))
+   (set_attr "bdver1_decode" "double")
+   (set_attr "mode" "HI")])
+
+;;On AMDFAM10 and BDVER1
+;; MUL reg8 	Direct
+;; MUL mem8 	Direct
+
+(define_insn "*mulqi3_1"
+  [(set (match_operand:QI 0 "register_operand" "=a")
+	(mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0")
+		 (match_operand:QI 2 "nonimmediate_operand" "qm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_QIMODE_MATH
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "mul{b}\t%2"
+  [(set_attr "type" "imul")
+   (set_attr "length_immediate" "0")
+   (set (attr "athlon_decode")
+     (if_then_else (eq_attr "cpu" "athlon")
+        (const_string "vector")
+        (const_string "direct")))
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "QI")])
+
+;; Multiply with jump on overflow.
+(define_expand "mulv<mode>4"
+  [(parallel [(set (reg:CCO FLAGS_REG)
+		   (eq:CCO (mult:<DWI>
+			      (sign_extend:<DWI>
+				 (match_operand:SWI48 1 "register_operand"))
+			      (sign_extend:<DWI>
+				 (match_operand:SWI48 2 "<general_operand>")))
+			   (sign_extend:<DWI>
+			      (mult:SWI48 (match_dup 1) (match_dup 2)))))
+	      (set (match_operand:SWI48 0 "register_operand")
+		   (mult:SWI48 (match_dup 1) (match_dup 2)))])
+   (set (pc) (if_then_else
+	       (eq (reg:CCO FLAGS_REG) (const_int 0))
+	       (label_ref (match_operand 3))
+	       (pc)))])
+
+(define_insn "*mulv<mode>4"
+  [(set (reg:CCO FLAGS_REG)
+	(eq:CCO (mult:<DWI>
+		   (sign_extend:<DWI>
+		      (match_operand:SWI 1 "nonimmediate_operand" "%rm,rm,0"))
+		   (sign_extend:<DWI>
+		      (match_operand:SWI 2 "<general_operand>" "K,<i>,mr")))
+		(sign_extend:<DWI>
+		   (mult:SWI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:SWI 0 "register_operand" "=r,r,r")
+	(mult:SWI (match_dup 1) (match_dup 2)))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   imul{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
+   imul{<imodesuffix>}\t{%2, %1, %0|%0, %1, %2}
+   imul{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "imul")
+   (set_attr "prefix_0f" "0,0,1")
+   (set (attr "athlon_decode")
+	(cond [(eq_attr "cpu" "athlon")
+		  (const_string "vector")
+	       (eq_attr "alternative" "1")
+		  (const_string "vector")
+	       (and (eq_attr "alternative" "2")
+		    (match_operand 1 "memory_operand"))
+		  (const_string "vector")]
+	      (const_string "direct")))
+   (set (attr "amdfam10_decode")
+	(cond [(and (eq_attr "alternative" "0,1")
+		    (match_operand 1 "memory_operand"))
+		  (const_string "vector")]
+	      (const_string "direct")))
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "<u>mul<mode><dwi>3"
+  [(parallel [(set (match_operand:<DWI> 0 "register_operand")
+		   (mult:<DWI>
+		     (any_extend:<DWI>
+		       (match_operand:DWIH 1 "nonimmediate_operand"))
+		     (any_extend:<DWI>
+		       (match_operand:DWIH 2 "register_operand"))))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+(define_expand "<u>mulqihi3"
+  [(parallel [(set (match_operand:HI 0 "register_operand")
+		   (mult:HI
+		     (any_extend:HI
+		       (match_operand:QI 1 "nonimmediate_operand"))
+		     (any_extend:HI
+		       (match_operand:QI 2 "register_operand"))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_QIMODE_MATH")
+
+(define_insn "*bmi2_umulditi3_1"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(mult:DI
+	  (match_operand:DI 2 "nonimmediate_operand" "%d")
+	  (match_operand:DI 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:DI 1 "register_operand" "=r")
+	(truncate:DI
+	  (lshiftrt:TI
+	    (mult:TI (zero_extend:TI (match_dup 2))
+		     (zero_extend:TI (match_dup 3)))
+	    (const_int 64))))]
+  "TARGET_64BIT && TARGET_BMI2
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "mulx\t{%3, %0, %1|%1, %0, %3}"
+  [(set_attr "type" "imulx")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "DI")])
+
+(define_insn "*bmi2_umulsidi3_1"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(mult:SI
+	  (match_operand:SI 2 "nonimmediate_operand" "%d")
+	  (match_operand:SI 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SI 1 "register_operand" "=r")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (mult:DI (zero_extend:DI (match_dup 2))
+		     (zero_extend:DI (match_dup 3)))
+	    (const_int 32))))]
+  "!TARGET_64BIT && TARGET_BMI2
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "mulx\t{%3, %0, %1|%1, %0, %3}"
+  [(set_attr "type" "imulx")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "SI")])
+
+(define_insn "*umul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=r,A")
+	(mult:<DWI>
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "%d,0"))
+	  (zero_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm,rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   #
+   mul{<imodesuffix>}\t%2"
+  [(set_attr "isa" "bmi2,*")
+   (set_attr "type" "imulx,imul")
+   (set_attr "length_immediate" "*,0")
+   (set (attr "athlon_decode")
+	(cond [(eq_attr "alternative" "1")
+		 (if_then_else (eq_attr "cpu" "athlon")
+		   (const_string "vector")
+		   (const_string "double"))]
+	      (const_string "*")))
+   (set_attr "amdfam10_decode" "*,double")
+   (set_attr "bdver1_decode" "*,direct")
+   (set_attr "prefix" "vex,orig")
+   (set_attr "mode" "<MODE>")])
+
+;; Convert mul to the mulx pattern to avoid flags dependency.
+(define_split
+ [(set (match_operand:<DWI> 0 "register_operand")
+       (mult:<DWI>
+	 (zero_extend:<DWI>
+	   (match_operand:DWIH 1 "register_operand"))
+	 (zero_extend:<DWI>
+	   (match_operand:DWIH 2 "nonimmediate_operand"))))
+  (clobber (reg:CC FLAGS_REG))]
+ "TARGET_BMI2 && reload_completed
+  && true_regnum (operands[1]) == DX_REG"
+  [(parallel [(set (match_dup 3)
+		   (mult:DWIH (match_dup 1) (match_dup 2)))
+	      (set (match_dup 4)
+		   (truncate:DWIH
+		     (lshiftrt:<DWI>
+		       (mult:<DWI> (zero_extend:<DWI> (match_dup 1))
+				   (zero_extend:<DWI> (match_dup 2)))
+		       (match_dup 5))))])]
+{
+  split_double_mode (<DWI>mode, &operands[0], 1, &operands[3], &operands[4]);
+
+  operands[5] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode));
+})
+
+(define_insn "*mul<mode><dwi>3_1"
+  [(set (match_operand:<DWI> 0 "register_operand" "=A")
+	(mult:<DWI>
+	  (sign_extend:<DWI>
+	    (match_operand:DWIH 1 "nonimmediate_operand" "%0"))
+	  (sign_extend:<DWI>
+	    (match_operand:DWIH 2 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "imul{<imodesuffix>}\t%2"
+  [(set_attr "type" "imul")
+   (set_attr "length_immediate" "0")
+   (set (attr "athlon_decode")
+     (if_then_else (eq_attr "cpu" "athlon")
+        (const_string "vector")
+        (const_string "double")))
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*<u>mulqihi3_1"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+	(mult:HI
+	  (any_extend:HI
+	    (match_operand:QI 1 "nonimmediate_operand" "%0"))
+	  (any_extend:HI
+	    (match_operand:QI 2 "nonimmediate_operand" "qm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_QIMODE_MATH
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "<sgnprefix>mul{b}\t%2"
+  [(set_attr "type" "imul")
+   (set_attr "length_immediate" "0")
+   (set (attr "athlon_decode")
+     (if_then_else (eq_attr "cpu" "athlon")
+        (const_string "vector")
+        (const_string "direct")))
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "QI")])
+
+(define_expand "<s>mul<mode>3_highpart"
+  [(parallel [(set (match_operand:SWI48 0 "register_operand")
+		   (truncate:SWI48
+		     (lshiftrt:<DWI>
+		       (mult:<DWI>
+			 (any_extend:<DWI>
+			   (match_operand:SWI48 1 "nonimmediate_operand"))
+			 (any_extend:<DWI>
+			   (match_operand:SWI48 2 "register_operand")))
+		       (match_dup 4))))
+	      (clobber (match_scratch:SWI48 3))
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  "operands[4] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode));")
+
+(define_insn "*<s>muldi3_highpart_1"
+  [(set (match_operand:DI 0 "register_operand" "=d")
+	(truncate:DI
+	  (lshiftrt:TI
+	    (mult:TI
+	      (any_extend:TI
+		(match_operand:DI 1 "nonimmediate_operand" "%a"))
+	      (any_extend:TI
+		(match_operand:DI 2 "nonimmediate_operand" "rm")))
+	    (const_int 64))))
+   (clobber (match_scratch:DI 3 "=1"))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "<sgnprefix>mul{q}\t%2"
+  [(set_attr "type" "imul")
+   (set_attr "length_immediate" "0")
+   (set (attr "athlon_decode")
+     (if_then_else (eq_attr "cpu" "athlon")
+        (const_string "vector")
+        (const_string "double")))
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "DI")])
+
+(define_insn "*<s>mulsi3_highpart_1"
+  [(set (match_operand:SI 0 "register_operand" "=d")
+	(truncate:SI
+	  (lshiftrt:DI
+	    (mult:DI
+	      (any_extend:DI
+		(match_operand:SI 1 "nonimmediate_operand" "%a"))
+	      (any_extend:DI
+		(match_operand:SI 2 "nonimmediate_operand" "rm")))
+	    (const_int 32))))
+   (clobber (match_scratch:SI 3 "=1"))
+   (clobber (reg:CC FLAGS_REG))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "<sgnprefix>mul{l}\t%2"
+  [(set_attr "type" "imul")
+   (set_attr "length_immediate" "0")
+   (set (attr "athlon_decode")
+     (if_then_else (eq_attr "cpu" "athlon")
+        (const_string "vector")
+        (const_string "double")))
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<s>mulsi3_highpart_zext"
+  [(set (match_operand:DI 0 "register_operand" "=d")
+	(zero_extend:DI (truncate:SI
+	  (lshiftrt:DI
+	    (mult:DI (any_extend:DI
+		       (match_operand:SI 1 "nonimmediate_operand" "%a"))
+		     (any_extend:DI
+		       (match_operand:SI 2 "nonimmediate_operand" "rm")))
+	    (const_int 32)))))
+   (clobber (match_scratch:SI 3 "=1"))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "<sgnprefix>mul{l}\t%2"
+  [(set_attr "type" "imul")
+   (set_attr "length_immediate" "0")
+   (set (attr "athlon_decode")
+     (if_then_else (eq_attr "cpu" "athlon")
+        (const_string "vector")
+        (const_string "double")))
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "SI")])
+
+;; The patterns that match these are at the end of this file.
+
+(define_expand "mulxf3"
+  [(set (match_operand:XF 0 "register_operand")
+	(mult:XF (match_operand:XF 1 "register_operand")
+		 (match_operand:XF 2 "register_operand")))]
+  "TARGET_80387")
+
+(define_expand "mul<mode>3"
+  [(set (match_operand:MODEF 0 "register_operand")
+	(mult:MODEF (match_operand:MODEF 1 "register_operand")
+		    (match_operand:MODEF 2 "nonimmediate_operand")))]
+  "(TARGET_80387 && X87_ENABLE_ARITH (<MODE>mode))
+    || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)")
+
+;; Divide instructions
+
+;; The patterns that match these are at the end of this file.
+
+(define_expand "divxf3"
+  [(set (match_operand:XF 0 "register_operand")
+	(div:XF (match_operand:XF 1 "register_operand")
+		(match_operand:XF 2 "register_operand")))]
+  "TARGET_80387")
+
+(define_expand "divdf3"
+  [(set (match_operand:DF 0 "register_operand")
+ 	(div:DF (match_operand:DF 1 "register_operand")
+ 		(match_operand:DF 2 "nonimmediate_operand")))]
+   "(TARGET_80387 && X87_ENABLE_ARITH (DFmode))
+    || (TARGET_SSE2 && TARGET_SSE_MATH)")
+
+(define_expand "divsf3"
+  [(set (match_operand:SF 0 "register_operand")
+	(div:SF (match_operand:SF 1 "register_operand")
+		(match_operand:SF 2 "nonimmediate_operand")))]
+  "(TARGET_80387 && X87_ENABLE_ARITH (SFmode))
+    || TARGET_SSE_MATH"
+{
+  if (TARGET_SSE_MATH
+      && TARGET_RECIP_DIV
+      && optimize_insn_for_speed_p ()
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      ix86_emit_swdivsf (operands[0], operands[1],
+			 operands[2], SFmode);
+      DONE;
+    }
+})
+
+;; Divmod instructions.
+
+(define_expand "divmod<mode>4"
+  [(parallel [(set (match_operand:SWIM248 0 "register_operand")
+		   (div:SWIM248
+		     (match_operand:SWIM248 1 "register_operand")
+		     (match_operand:SWIM248 2 "nonimmediate_operand")))
+	      (set (match_operand:SWIM248 3 "register_operand")
+		   (mod:SWIM248 (match_dup 1) (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+;; Split with 8bit unsigned divide:
+;; 	if (dividend an divisor are in [0-255])
+;;	   use 8bit unsigned integer divide
+;;	 else
+;;	   use original integer divide
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+	(div:SWI48 (match_operand:SWI48 2 "register_operand")
+		    (match_operand:SWI48 3 "nonimmediate_operand")))
+   (set (match_operand:SWI48 1 "register_operand")
+	(mod:SWI48 (match_dup 2) (match_dup 3)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_8BIT_IDIV
+   && TARGET_QIMODE_MATH
+   && can_create_pseudo_p ()
+   && !optimize_insn_for_size_p ()"
+  [(const_int 0)]
+  "ix86_split_idivmod (<MODE>mode, operands, true); DONE;")
+
+(define_insn_and_split "divmod<mode>4_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=a")
+	(div:SWI48 (match_operand:SWI48 2 "register_operand" "0")
+		   (match_operand:SWI48 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWI48 1 "register_operand" "=&d")
+	(mod:SWI48 (match_dup 2) (match_dup 3)))
+   (unspec [(const_int 0)] UNSPEC_DIV_ALREADY_SPLIT)
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 1)
+		   (ashiftrt:SWI48 (match_dup 4) (match_dup 5)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 0)
+	           (div:SWI48 (match_dup 2) (match_dup 3)))
+	      (set (match_dup 1)
+		   (mod:SWI48 (match_dup 2) (match_dup 3)))
+	      (use (match_dup 1))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[5] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
+
+  if (optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)
+    operands[4] = operands[2];
+  else
+    {
+      /* Avoid use of cltd in favor of a mov+shift.  */
+      emit_move_insn (operands[1], operands[2]);
+      operands[4] = operands[1];
+    }
+}
+  [(set_attr "type" "multi")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*divmod<mode>4"
+  [(set (match_operand:SWIM248 0 "register_operand" "=a")
+	(div:SWIM248 (match_operand:SWIM248 2 "register_operand" "0")
+		    (match_operand:SWIM248 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWIM248 1 "register_operand" "=&d")
+	(mod:SWIM248 (match_dup 2) (match_dup 3)))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "#"
+  "reload_completed"
+  [(parallel [(set (match_dup 1)
+		   (ashiftrt:SWIM248 (match_dup 4) (match_dup 5)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 0)
+	           (div:SWIM248 (match_dup 2) (match_dup 3)))
+	      (set (match_dup 1)
+		   (mod:SWIM248 (match_dup 2) (match_dup 3)))
+	      (use (match_dup 1))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[5] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
+
+  if (<MODE>mode != HImode
+      && (optimize_function_for_size_p (cfun) || TARGET_USE_CLTD))
+    operands[4] = operands[2];
+  else
+    {
+      /* Avoid use of cltd in favor of a mov+shift.  */
+      emit_move_insn (operands[1], operands[2]);
+      operands[4] = operands[1];
+    }
+}
+  [(set_attr "type" "multi")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*divmod<mode>4_noext"
+  [(set (match_operand:SWIM248 0 "register_operand" "=a")
+	(div:SWIM248 (match_operand:SWIM248 2 "register_operand" "0")
+		    (match_operand:SWIM248 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWIM248 1 "register_operand" "=d")
+	(mod:SWIM248 (match_dup 2) (match_dup 3)))
+   (use (match_operand:SWIM248 4 "register_operand" "1"))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "idiv{<imodesuffix>}\t%3"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "divmodqi4"
+  [(parallel [(set (match_operand:QI 0 "register_operand")
+		   (div:QI
+		     (match_operand:QI 1 "register_operand")
+		     (match_operand:QI 2 "nonimmediate_operand")))
+	      (set (match_operand:QI 3 "register_operand")
+		   (mod:QI (match_dup 1) (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_QIMODE_MATH"
+{
+  rtx div, mod, insn;
+  rtx tmp0, tmp1;
+  
+  tmp0 = gen_reg_rtx (HImode);
+  tmp1 = gen_reg_rtx (HImode);
+
+  /* Extend operands[1] to HImode.  Generate 8bit divide.  Result is
+     in AX.  */
+  emit_insn (gen_extendqihi2 (tmp1, operands[1]));
+  emit_insn (gen_divmodhiqi3 (tmp0, tmp1, operands[2]));
+
+  /* Extract remainder from AH.  */
+  tmp1 = gen_rtx_SIGN_EXTRACT (QImode, tmp0, GEN_INT (8), GEN_INT (8));
+  insn = emit_move_insn (operands[3], tmp1);
+
+  mod = gen_rtx_MOD (QImode, operands[1], operands[2]);
+  set_unique_reg_note (insn, REG_EQUAL, mod);
+
+  /* Extract quotient from AL.  */
+  insn = emit_move_insn (operands[0], gen_lowpart (QImode, tmp0));
+
+  div = gen_rtx_DIV (QImode, operands[1], operands[2]);
+  set_unique_reg_note (insn, REG_EQUAL, div);
+
+  DONE;
+})
+
+;; Divide AX by r/m8, with result stored in
+;; AL <- Quotient
+;; AH <- Remainder
+;; Change div/mod to HImode and extend the second argument to HImode
+;; so that mode of div/mod matches with mode of arguments.  Otherwise
+;; combine may fail.
+(define_insn "divmodhiqi3"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+	(ior:HI
+	  (ashift:HI
+	    (zero_extend:HI
+	      (truncate:QI
+		(mod:HI (match_operand:HI 1 "register_operand" "0")
+			(sign_extend:HI
+			  (match_operand:QI 2 "nonimmediate_operand" "qm")))))
+	    (const_int 8))
+	  (zero_extend:HI
+	    (truncate:QI
+	      (div:HI (match_dup 1) (sign_extend:HI (match_dup 2)))))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_QIMODE_MATH"
+  "idiv{b}\t%2"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "QI")])
+
+(define_expand "udivmod<mode>4"
+  [(parallel [(set (match_operand:SWIM248 0 "register_operand")
+		   (udiv:SWIM248
+		     (match_operand:SWIM248 1 "register_operand")
+		     (match_operand:SWIM248 2 "nonimmediate_operand")))
+	      (set (match_operand:SWIM248 3 "register_operand")
+		   (umod:SWIM248 (match_dup 1) (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+;; Split with 8bit unsigned divide:
+;; 	if (dividend an divisor are in [0-255])
+;;	   use 8bit unsigned integer divide
+;;	 else
+;;	   use original integer divide
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+	(udiv:SWI48 (match_operand:SWI48 2 "register_operand")
+		    (match_operand:SWI48 3 "nonimmediate_operand")))
+   (set (match_operand:SWI48 1 "register_operand")
+	(umod:SWI48 (match_dup 2) (match_dup 3)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_8BIT_IDIV
+   && TARGET_QIMODE_MATH
+   && can_create_pseudo_p ()
+   && !optimize_insn_for_size_p ()"
+  [(const_int 0)]
+  "ix86_split_idivmod (<MODE>mode, operands, false); DONE;")
+
+(define_insn_and_split "udivmod<mode>4_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=a")
+	(udiv:SWI48 (match_operand:SWI48 2 "register_operand" "0")
+		    (match_operand:SWI48 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWI48 1 "register_operand" "=&d")
+	(umod:SWI48 (match_dup 2) (match_dup 3)))
+   (unspec [(const_int 0)] UNSPEC_DIV_ALREADY_SPLIT)
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "#"
+  "reload_completed"
+  [(set (match_dup 1) (const_int 0))
+   (parallel [(set (match_dup 0)
+		   (udiv:SWI48 (match_dup 2) (match_dup 3)))
+	      (set (match_dup 1)
+		   (umod:SWI48 (match_dup 2) (match_dup 3)))
+	      (use (match_dup 1))
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(set_attr "type" "multi")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*udivmod<mode>4"
+  [(set (match_operand:SWIM248 0 "register_operand" "=a")
+	(udiv:SWIM248 (match_operand:SWIM248 2 "register_operand" "0")
+		      (match_operand:SWIM248 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWIM248 1 "register_operand" "=&d")
+	(umod:SWIM248 (match_dup 2) (match_dup 3)))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "#"
+  "reload_completed"
+  [(set (match_dup 1) (const_int 0))
+   (parallel [(set (match_dup 0)
+		   (udiv:SWIM248 (match_dup 2) (match_dup 3)))
+	      (set (match_dup 1)
+		   (umod:SWIM248 (match_dup 2) (match_dup 3)))
+	      (use (match_dup 1))
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  [(set_attr "type" "multi")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*udivmod<mode>4_noext"
+  [(set (match_operand:SWIM248 0 "register_operand" "=a")
+	(udiv:SWIM248 (match_operand:SWIM248 2 "register_operand" "0")
+		      (match_operand:SWIM248 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWIM248 1 "register_operand" "=d")
+	(umod:SWIM248 (match_dup 2) (match_dup 3)))
+   (use (match_operand:SWIM248 4 "register_operand" "1"))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "div{<imodesuffix>}\t%3"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "udivmodqi4"
+  [(parallel [(set (match_operand:QI 0 "register_operand")
+		   (udiv:QI
+		     (match_operand:QI 1 "register_operand")
+		     (match_operand:QI 2 "nonimmediate_operand")))
+	      (set (match_operand:QI 3 "register_operand")
+		   (umod:QI (match_dup 1) (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_QIMODE_MATH"
+{
+  rtx div, mod, insn;
+  rtx tmp0, tmp1;
+  
+  tmp0 = gen_reg_rtx (HImode);
+  tmp1 = gen_reg_rtx (HImode);
+
+  /* Extend operands[1] to HImode.  Generate 8bit divide.  Result is
+     in AX.  */
+  emit_insn (gen_zero_extendqihi2 (tmp1, operands[1]));
+  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, operands[2]));
+
+  /* Extract remainder from AH.  */
+  tmp1 = gen_rtx_ZERO_EXTRACT (SImode, tmp0, GEN_INT (8), GEN_INT (8));
+  tmp1 = simplify_gen_subreg (QImode, tmp1, SImode, 0);
+  insn = emit_move_insn (operands[3], tmp1);
+
+  mod = gen_rtx_UMOD (QImode, operands[1], operands[2]);
+  set_unique_reg_note (insn, REG_EQUAL, mod);
+
+  /* Extract quotient from AL.  */
+  insn = emit_move_insn (operands[0], gen_lowpart (QImode, tmp0));
+
+  div = gen_rtx_UDIV (QImode, operands[1], operands[2]);
+  set_unique_reg_note (insn, REG_EQUAL, div);
+
+  DONE;
+})
+
+(define_insn "udivmodhiqi3"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+	(ior:HI
+	  (ashift:HI
+	    (zero_extend:HI
+	      (truncate:QI
+		(mod:HI (match_operand:HI 1 "register_operand" "0")
+			(zero_extend:HI
+			  (match_operand:QI 2 "nonimmediate_operand" "qm")))))
+	    (const_int 8))
+	  (zero_extend:HI
+	    (truncate:QI
+	      (div:HI (match_dup 1) (zero_extend:HI (match_dup 2)))))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_QIMODE_MATH"
+  "div{b}\t%2"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "QI")])
+
+;; We cannot use div/idiv for double division, because it causes
+;; "division by zero" on the overflow and that's not what we expect
+;; from truncate.  Because true (non truncating) double division is
+;; never generated, we can't create this insn anyway.
+;
+;(define_insn ""
+;  [(set (match_operand:SI 0 "register_operand" "=a")
+;	(truncate:SI
+;	  (udiv:DI (match_operand:DI 1 "register_operand" "A")
+;		   (zero_extend:DI
+;		     (match_operand:SI 2 "nonimmediate_operand" "rm")))))
+;   (set (match_operand:SI 3 "register_operand" "=d")
+;	(truncate:SI
+;	  (umod:DI (match_dup 1) (zero_extend:DI (match_dup 2)))))
+;   (clobber (reg:CC FLAGS_REG))]
+;  ""
+;  "div{l}\t{%2, %0|%0, %2}"
+;  [(set_attr "type" "idiv")])
+
+;;- Logical AND instructions
+
+;; On Pentium, "test imm, reg" is pairable only with eax, ax, and al.
+;; Note that this excludes ah.
+
+(define_expand "testsi_ccno_1"
+  [(set (reg:CCNO FLAGS_REG)
+	(compare:CCNO
+	  (and:SI (match_operand:SI 0 "nonimmediate_operand")
+		  (match_operand:SI 1 "x86_64_nonmemory_operand"))
+	  (const_int 0)))])
+
+(define_expand "testqi_ccz_1"
+  [(set (reg:CCZ FLAGS_REG)
+        (compare:CCZ (and:QI (match_operand:QI 0 "nonimmediate_operand")
+			     (match_operand:QI 1 "nonmemory_operand"))
+		 (const_int 0)))])
+
+(define_expand "testdi_ccno_1"
+  [(set (reg:CCNO FLAGS_REG)
+	(compare:CCNO
+	  (and:DI (match_operand:DI 0 "nonimmediate_operand")
+		  (match_operand:DI 1 "x86_64_szext_general_operand"))
+	  (const_int 0)))]
+  "TARGET_64BIT && !(MEM_P (operands[0]) && MEM_P (operands[1]))")
+
+(define_insn "*testdi_1"
+  [(set (reg FLAGS_REG)
+	(compare
+	 (and:DI
+	  (match_operand:DI 0 "nonimmediate_operand" "%!*a,r,!*a,r,rm")
+	  (match_operand:DI 1 "x86_64_szext_general_operand" "Z,Z,e,e,re"))
+	 (const_int 0)))]
+  "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode)
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "@
+   test{l}\t{%k1, %k0|%k0, %k1}
+   test{l}\t{%k1, %k0|%k0, %k1}
+   test{q}\t{%1, %0|%0, %1}
+   test{q}\t{%1, %0|%0, %1}
+   test{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "test")
+   (set_attr "modrm" "0,1,0,1,1")
+   (set_attr "mode" "SI,SI,DI,DI,DI")])
+
+(define_insn "*testqi_1_maybe_si"
+  [(set (reg FLAGS_REG)
+        (compare
+	  (and:QI
+	    (match_operand:QI 0 "nonimmediate_operand" "%!*a,q,qm,r")
+	    (match_operand:QI 1 "general_operand" "n,n,qn,n"))
+	  (const_int 0)))]
+   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
+    && ix86_match_ccmode (insn,
+ 			 CONST_INT_P (operands[1])
+ 			 && INTVAL (operands[1]) >= 0 ? CCNOmode : CCZmode)"
+{
+  if (which_alternative == 3)
+    {
+      if (CONST_INT_P (operands[1]) && INTVAL (operands[1]) < 0)
+	operands[1] = GEN_INT (INTVAL (operands[1]) & 0xff);
+      return "test{l}\t{%1, %k0|%k0, %1}";
+    }
+  return "test{b}\t{%1, %0|%0, %1}";
+}
+  [(set_attr "type" "test")
+   (set_attr "modrm" "0,1,1,1")
+   (set_attr "mode" "QI,QI,QI,SI")
+   (set_attr "pent_pair" "uv,np,uv,np")])
+
+(define_insn "*test<mode>_1"
+  [(set (reg FLAGS_REG)
+	(compare
+	 (and:SWI124
+	  (match_operand:SWI124 0 "nonimmediate_operand" "%!*a,<r>,<r>m")
+	  (match_operand:SWI124 1 "<general_operand>" "<i>,<i>,<r><i>"))
+	 (const_int 0)))]
+  "ix86_match_ccmode (insn, CCNOmode)
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "test{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "test")
+   (set_attr "modrm" "0,1,1")
+   (set_attr "mode" "<MODE>")
+   (set_attr "pent_pair" "uv,np,uv")])
+
+(define_expand "testqi_ext_ccno_0"
+  [(set (reg:CCNO FLAGS_REG)
+	(compare:CCNO
+	  (and:SI
+	    (zero_extract:SI
+	      (match_operand 0 "ext_register_operand")
+	      (const_int 8)
+	      (const_int 8))
+	    (match_operand 1 "const_int_operand"))
+	  (const_int 0)))])
+
+(define_insn "*testqi_ext_0"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (and:SI
+	    (zero_extract:SI
+	      (match_operand 0 "ext_register_operand" "Q")
+	      (const_int 8)
+	      (const_int 8))
+	    (match_operand 1 "const_int_operand" "n"))
+	  (const_int 0)))]
+  "ix86_match_ccmode (insn, CCNOmode)"
+  "test{b}\t{%1, %h0|%h0, %1}"
+  [(set_attr "type" "test")
+   (set_attr "mode" "QI")
+   (set_attr "length_immediate" "1")
+   (set_attr "modrm" "1")
+   (set_attr "pent_pair" "np")])
+
+(define_insn "*testqi_ext_1"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (and:SI
+	    (zero_extract:SI
+	      (match_operand 0 "ext_register_operand" "Q,Q")
+	      (const_int 8)
+	      (const_int 8))
+	    (zero_extend:SI
+	      (match_operand:QI 1 "nonimmediate_x64nomem_operand" "Q,m")))
+	  (const_int 0)))]
+  "ix86_match_ccmode (insn, CCNOmode)"
+  "test{b}\t{%1, %h0|%h0, %1}"
+  [(set_attr "isa" "*,nox64")
+   (set_attr "type" "test")
+   (set_attr "mode" "QI")])
+
+(define_insn "*testqi_ext_2"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (and:SI
+	    (zero_extract:SI
+	      (match_operand 0 "ext_register_operand" "Q")
+	      (const_int 8)
+	      (const_int 8))
+	    (zero_extract:SI
+	      (match_operand 1 "ext_register_operand" "Q")
+	      (const_int 8)
+	      (const_int 8)))
+	  (const_int 0)))]
+  "ix86_match_ccmode (insn, CCNOmode)"
+  "test{b}\t{%h1, %h0|%h0, %h1}"
+  [(set_attr "type" "test")
+   (set_attr "mode" "QI")])
+
+;; Combine likes to form bit extractions for some tests.  Humor it.
+(define_insn "*testqi_ext_3"
+  [(set (reg FLAGS_REG)
+	(compare (zero_extract:SWI48
+		   (match_operand 0 "nonimmediate_operand" "rm")
+		   (match_operand:SWI48 1 "const_int_operand")
+		   (match_operand:SWI48 2 "const_int_operand"))
+		 (const_int 0)))]
+  "ix86_match_ccmode (insn, CCNOmode)
+   && ((TARGET_64BIT && GET_MODE (operands[0]) == DImode)
+       || GET_MODE (operands[0]) == SImode
+       || GET_MODE (operands[0]) == HImode
+       || GET_MODE (operands[0]) == QImode)
+   /* Ensure that resulting mask is zero or sign extended operand.  */
+   && INTVAL (operands[2]) >= 0
+   && ((INTVAL (operands[1]) > 0
+	&& INTVAL (operands[1]) + INTVAL (operands[2]) <= 32)
+       || (<MODE>mode == DImode
+	   && INTVAL (operands[1]) > 32
+	   && INTVAL (operands[1]) + INTVAL (operands[2]) == 64))"
+  "#")
+
+(define_split
+  [(set (match_operand 0 "flags_reg_operand")
+        (match_operator 1 "compare_operator"
+	  [(zero_extract
+	     (match_operand 2 "nonimmediate_operand")
+	     (match_operand 3 "const_int_operand")
+	     (match_operand 4 "const_int_operand"))
+	   (const_int 0)]))]
+  "ix86_match_ccmode (insn, CCNOmode)"
+  [(set (match_dup 0) (match_op_dup 1 [(match_dup 2) (const_int 0)]))]
+{
+  rtx val = operands[2];
+  HOST_WIDE_INT len = INTVAL (operands[3]);
+  HOST_WIDE_INT pos = INTVAL (operands[4]);
+  HOST_WIDE_INT mask;
+  enum machine_mode mode, submode;
+
+  mode = GET_MODE (val);
+  if (MEM_P (val))
+    {
+      /* ??? Combine likes to put non-volatile mem extractions in QImode
+	 no matter the size of the test.  So find a mode that works.  */
+      if (! MEM_VOLATILE_P (val))
+	{
+	  mode = smallest_mode_for_size (pos + len, MODE_INT);
+	  val = adjust_address (val, mode, 0);
+	}
+    }
+  else if (GET_CODE (val) == SUBREG
+	   && (submode = GET_MODE (SUBREG_REG (val)),
+	       GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (submode))
+	   && pos + len <= GET_MODE_BITSIZE (submode)
+	   && GET_MODE_CLASS (submode) == MODE_INT)
+    {
+      /* Narrow a paradoxical subreg to prevent partial register stalls.  */
+      mode = submode;
+      val = SUBREG_REG (val);
+    }
+  else if (mode == HImode && pos + len <= 8)
+    {
+      /* Small HImode tests can be converted to QImode.  */
+      mode = QImode;
+      val = gen_lowpart (QImode, val);
+    }
+
+  if (len == HOST_BITS_PER_WIDE_INT)
+    mask = -1;
+  else
+    mask = ((HOST_WIDE_INT)1 << len) - 1;
+  mask <<= pos;
+
+  operands[2] = gen_rtx_AND (mode, val, gen_int_mode (mask, mode));
+})
+
+;; Convert HImode/SImode test instructions with immediate to QImode ones.
+;; i386 does not allow to encode test with 8bit sign extended immediate, so
+;; this is relatively important trick.
+;; Do the conversion only post-reload to avoid limiting of the register class
+;; to QI regs.
+(define_split
+  [(set (match_operand 0 "flags_reg_operand")
+	(match_operator 1 "compare_operator"
+	  [(and (match_operand 2 "register_operand")
+	        (match_operand 3 "const_int_operand"))
+	   (const_int 0)]))]
+   "reload_completed
+    && QI_REG_P (operands[2])
+    && GET_MODE (operands[2]) != QImode
+    && ((ix86_match_ccmode (insn, CCZmode)
+    	 && !(INTVAL (operands[3]) & ~(255 << 8)))
+	|| (ix86_match_ccmode (insn, CCNOmode)
+	    && !(INTVAL (operands[3]) & ~(127 << 8))))"
+  [(set (match_dup 0)
+	(match_op_dup 1
+	  [(and:SI (zero_extract:SI (match_dup 2) (const_int 8) (const_int 8))
+		   (match_dup 3))
+	   (const_int 0)]))]
+{
+  operands[2] = gen_lowpart (SImode, operands[2]);
+  operands[3] = gen_int_mode (INTVAL (operands[3]) >> 8, SImode);
+})
+
+(define_split
+  [(set (match_operand 0 "flags_reg_operand")
+	(match_operator 1 "compare_operator"
+	  [(and (match_operand 2 "nonimmediate_operand")
+	        (match_operand 3 "const_int_operand"))
+	   (const_int 0)]))]
+   "reload_completed
+    && GET_MODE (operands[2]) != QImode
+    && (!REG_P (operands[2]) || ANY_QI_REG_P (operands[2]))
+    && ((ix86_match_ccmode (insn, CCZmode)
+	 && !(INTVAL (operands[3]) & ~255))
+	|| (ix86_match_ccmode (insn, CCNOmode)
+	    && !(INTVAL (operands[3]) & ~127)))"
+  [(set (match_dup 0)
+	(match_op_dup 1 [(and:QI (match_dup 2) (match_dup 3))
+			 (const_int 0)]))]
+{
+  operands[2] = gen_lowpart (QImode, operands[2]);
+  operands[3] = gen_lowpart (QImode, operands[3]);
+})
+
+(define_split
+  [(set (match_operand:SWI12 0 "mask_reg_operand")
+	(any_logic:SWI12 (match_operand:SWI12 1 "mask_reg_operand")
+			 (match_operand:SWI12 2 "mask_reg_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && reload_completed"
+  [(set (match_dup 0)
+	(any_logic:SWI12 (match_dup 1)
+			 (match_dup 2)))])
+
+(define_insn "*k<logic><mode>"
+  [(set (match_operand:SWI12 0 "mask_reg_operand" "=k")
+	(any_logic:SWI12 (match_operand:SWI12 1 "mask_reg_operand" "k")
+			 (match_operand:SWI12 2 "mask_reg_operand" "k")))]
+  "TARGET_AVX512F"
+  "k<logic>w\t{%2, %1, %0|%0, %1, %2}";
+  [(set_attr "mode" "<MODE>")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+;; %%% This used to optimize known byte-wide and operations to memory,
+;; and sometimes to QImode registers.  If this is considered useful,
+;; it should be done with splitters.
+
+(define_expand "and<mode>3"
+  [(set (match_operand:SWIM 0 "nonimmediate_operand")
+	(and:SWIM (match_operand:SWIM 1 "nonimmediate_operand")
+		  (match_operand:SWIM 2 "<general_szext_operand>")))]
+  ""
+{
+  enum machine_mode mode = <MODE>mode;
+  rtx (*insn) (rtx, rtx);
+
+  if (CONST_INT_P (operands[2]) && REG_P (operands[0]))
+    {
+      HOST_WIDE_INT ival = INTVAL (operands[2]);
+
+      if (ival == (HOST_WIDE_INT) 0xffffffff)
+	mode = SImode;
+      else if (ival == 0xffff)
+	mode = HImode;
+      else if (ival == 0xff)
+	mode = QImode;
+      }
+
+  if (mode == <MODE>mode)
+    {
+      ix86_expand_binary_operator (AND, <MODE>mode, operands);
+      DONE;
+    }
+
+  if (<MODE>mode == DImode)
+    insn = (mode == SImode)
+	   ? gen_zero_extendsidi2
+	   : (mode == HImode)
+	   ? gen_zero_extendhidi2
+	   : gen_zero_extendqidi2;
+  else if (<MODE>mode == SImode)
+    insn = (mode == HImode)
+	   ? gen_zero_extendhisi2
+	   : gen_zero_extendqisi2;
+  else if (<MODE>mode == HImode)
+    insn = gen_zero_extendqihi2;
+  else
+    gcc_unreachable ();
+
+  emit_insn (insn (operands[0], gen_lowpart (mode, operands[1])));
+  DONE;
+})
+
+(define_insn "*anddi_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r")
+	(and:DI
+	 (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm")
+	 (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm,L")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_IMOVX:
+      return "#";
+
+    default:
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (get_attr_mode (insn) == MODE_SI)
+	return "and{l}\t{%k2, %k0|%k0, %k2}";
+      else
+	return "and{q}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set_attr "type" "alu,alu,alu,imovx")
+   (set_attr "length_immediate" "*,*,*,0")
+   (set (attr "prefix_rex")
+     (if_then_else
+       (and (eq_attr "type" "imovx")
+	    (and (match_test "INTVAL (operands[2]) == 0xff")
+		 (match_operand 1 "ext_QIreg_operand")))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "mode" "SI,DI,DI,SI")])
+
+(define_insn "*andsi_1"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r,Ya")
+	(and:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,qm")
+		(match_operand:SI 2 "x86_64_general_operand" "re,rm,L")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (AND, SImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_IMOVX:
+      return "#";
+
+    default:
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      return "and{l}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set_attr "type" "alu,alu,imovx")
+   (set (attr "prefix_rex")
+     (if_then_else
+       (and (eq_attr "type" "imovx")
+	    (and (match_test "INTVAL (operands[2]) == 0xff")
+		 (match_operand 1 "ext_QIreg_operand")))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "length_immediate" "*,*,0")
+   (set_attr "mode" "SI")])
+
+;; See comment for addsi_1_zext why we do use nonimmediate_operand
+(define_insn "*andsi_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (and:SI (match_operand:SI 1 "nonimmediate_operand" "%0")
+		  (match_operand:SI 2 "x86_64_general_operand" "rme"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (AND, SImode, operands)"
+  "and{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI")])
+
+(define_insn "*andhi_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,r,Ya,!k")
+	(and:HI (match_operand:HI 1 "nonimmediate_operand" "%0,0,qm,k")
+		(match_operand:HI 2 "general_operand" "rn,rm,L,k")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (AND, HImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_IMOVX:
+      return "#";
+
+    case TYPE_MSKLOG:
+      return "kandw\t{%2, %1, %0|%0, %1, %2}";
+
+    default:
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      return "and{w}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set_attr "type" "alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,0,*")
+   (set (attr "prefix_rex")
+     (if_then_else
+       (and (eq_attr "type" "imovx")
+	    (match_operand 1 "ext_QIreg_operand"))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "mode" "HI,HI,SI,HI")])
+
+;; %%% Potential partial reg stall on alternative 2.  What to do?
+(define_insn "*andqi_1"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,!k")
+	(and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+		(match_operand:QI 2 "general_operand" "qn,qmn,rn,k")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (AND, QImode, operands)"
+  "@
+   and{b}\t{%2, %0|%0, %2}
+   and{b}\t{%2, %0|%0, %2}
+   and{l}\t{%k2, %k0|%k0, %k2}
+   kandw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set_attr "mode" "QI,QI,SI,HI")])
+
+(define_insn "*andqi_1_slp"
+  [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm,q"))
+	(and:QI (match_dup 0)
+		(match_operand:QI 1 "general_operand" "qn,qmn")))
+   (clobber (reg:CC FLAGS_REG))]
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "and{b}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "QI")])
+
+(define_insn "kandn<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=r,&r,!k")
+	(and:SWI12
+	  (not:SWI12
+	    (match_operand:SWI12 1 "register_operand" "r,0,k"))
+	  (match_operand:SWI12 2 "register_operand" "r,r,k")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F"
+  "@
+   andn\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   #
+   kandnw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "bmi,*,avx512f")
+   (set_attr "type" "bitmanip,*,msklog")
+   (set_attr "prefix" "*,*,vex")
+   (set_attr "btver2_decode" "direct,*,*")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:SWI12 0 "general_reg_operand")
+	(and:SWI12
+	  (not:SWI12
+	    (match_dup 0))
+	  (match_operand:SWI12 1 "general_reg_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && !TARGET_BMI && reload_completed"
+  [(set (match_dup 0)
+	(not:HI (match_dup 0)))
+   (parallel [(set (match_dup 0)
+		   (and:HI (match_dup 0)
+			   (match_dup 1)))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+;; Turn *anddi_1 into *andsi_1_zext if possible.
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(and:DI (subreg:DI (match_operand:SI 1 "register_operand") 0)
+		(match_operand:DI 2 "x86_64_zext_immediate_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT"
+  [(parallel [(set (match_dup 0)
+		   (zero_extend:DI (and:SI (match_dup 1) (match_dup 2))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "operands[2] = gen_lowpart (SImode, operands[2]);")
+
+(define_split
+  [(set (match_operand:SWI248 0 "register_operand")
+	(and:SWI248 (match_operand:SWI248 1 "nonimmediate_operand")
+		    (match_operand:SWI248 2 "const_int_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(const_int 0)]
+{
+  HOST_WIDE_INT ival = INTVAL (operands[2]);
+  enum machine_mode mode;
+  rtx (*insn) (rtx, rtx);
+
+  if (ival == (HOST_WIDE_INT) 0xffffffff)
+    mode = SImode;
+  else if (ival == 0xffff)
+    mode = HImode;
+  else
+    {
+      gcc_assert (ival == 0xff);
+      mode = QImode;
+    }
+
+  if (<MODE>mode == DImode)
+    insn = (mode == SImode)
+	   ? gen_zero_extendsidi2
+	   : (mode == HImode)
+	   ? gen_zero_extendhidi2
+	   : gen_zero_extendqidi2;
+  else
+    {
+      if (<MODE>mode != SImode)
+	/* Zero extend to SImode to avoid partial register stalls.  */
+	operands[0] = gen_lowpart (SImode, operands[0]);
+
+      insn = (mode == HImode)
+	     ? gen_zero_extendhisi2
+	     : gen_zero_extendqisi2;
+    }
+  emit_insn (insn (operands[0], gen_lowpart (mode, operands[1])));
+  DONE;
+})
+
+(define_split
+  [(set (match_operand 0 "register_operand")
+	(and (match_dup 0)
+	     (const_int -65536)))
+   (clobber (reg:CC FLAGS_REG))]
+  "(TARGET_FAST_PREFIX && !TARGET_PARTIAL_REG_STALL)
+    || optimize_function_for_size_p (cfun)"
+  [(set (strict_low_part (match_dup 1)) (const_int 0))]
+  "operands[1] = gen_lowpart (HImode, operands[0]);")
+
+(define_split
+  [(set (match_operand 0 "ext_register_operand")
+	(and (match_dup 0)
+	     (const_int -256)))
+   (clobber (reg:CC FLAGS_REG))]
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && reload_completed"
+  [(set (strict_low_part (match_dup 1)) (const_int 0))]
+  "operands[1] = gen_lowpart (QImode, operands[0]);")
+
+(define_split
+  [(set (match_operand 0 "ext_register_operand")
+	(and (match_dup 0)
+	     (const_int -65281)))
+   (clobber (reg:CC FLAGS_REG))]
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && reload_completed"
+  [(parallel [(set (zero_extract:SI (match_dup 0)
+				    (const_int 8)
+				    (const_int 8))
+		   (xor:SI
+		     (zero_extract:SI (match_dup 0)
+				      (const_int 8)
+				      (const_int 8))
+		     (zero_extract:SI (match_dup 0)
+				      (const_int 8)
+				      (const_int 8))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "operands[0] = gen_lowpart (SImode, operands[0]);")
+
+(define_insn "*anddi_2"
+  [(set (reg FLAGS_REG)
+	(compare
+	 (and:DI
+	  (match_operand:DI 1 "nonimmediate_operand" "%0,0,0")
+	  (match_operand:DI 2 "x86_64_szext_general_operand" "Z,rem,re"))
+	 (const_int 0)))
+   (set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm")
+	(and:DI (match_dup 1) (match_dup 2)))]
+  "TARGET_64BIT
+   && ix86_match_ccmode
+	(insn,
+	 /* If we are going to emit andl instead of andq, and the operands[2]
+	    constant might have the SImode sign bit set, make sure the sign
+	    flag isn't tested, because the instruction will set the sign flag
+	    based on bit 31 rather than bit 63.  If it isn't CONST_INT,
+	    conservatively assume it might have bit 31 set.  */
+	 (satisfies_constraint_Z (operands[2])
+	  && (!CONST_INT_P (operands[2])
+	      || val_signbit_known_set_p (SImode, INTVAL (operands[2]))))
+	 ? CCZmode : CCNOmode)
+   && ix86_binary_operator_ok (AND, DImode, operands)"
+  "@
+   and{l}\t{%k2, %k0|%k0, %k2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI,DI,DI")])
+
+(define_insn "*andqi_2_maybe_si"
+  [(set (reg FLAGS_REG)
+	(compare (and:QI
+		  (match_operand:QI 1 "nonimmediate_operand" "%0,0,0")
+		  (match_operand:QI 2 "general_operand" "qmn,qn,n"))
+		 (const_int 0)))
+   (set (match_operand:QI 0 "nonimmediate_operand" "=q,qm,*r")
+	(and:QI (match_dup 1) (match_dup 2)))]
+  "ix86_binary_operator_ok (AND, QImode, operands)
+   && ix86_match_ccmode (insn,
+			 CONST_INT_P (operands[2])
+			 && INTVAL (operands[2]) >= 0 ? CCNOmode : CCZmode)"
+{
+  if (which_alternative == 2)
+    {
+      if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) < 0)
+        operands[2] = GEN_INT (INTVAL (operands[2]) & 0xff);
+      return "and{l}\t{%2, %k0|%k0, %2}";
+    }
+  return "and{b}\t{%2, %0|%0, %2}";
+}
+  [(set_attr "type" "alu")
+   (set_attr "mode" "QI,QI,SI")])
+
+(define_insn "*and<mode>_2"
+  [(set (reg FLAGS_REG)
+	(compare (and:SWI124
+		  (match_operand:SWI124 1 "nonimmediate_operand" "%0,0")
+		  (match_operand:SWI124 2 "<general_operand>" "<g>,<r><i>"))
+		 (const_int 0)))
+   (set (match_operand:SWI124 0 "nonimmediate_operand" "=<r>,<r>m")
+	(and:SWI124 (match_dup 1) (match_dup 2)))]
+  "ix86_match_ccmode (insn, CCNOmode)
+   && ix86_binary_operator_ok (AND, <MODE>mode, operands)"
+  "and{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+;; See comment for addsi_1_zext why we do use nonimmediate_operand
+(define_insn "*andsi_2_zext"
+  [(set (reg FLAGS_REG)
+	(compare (and:SI
+		  (match_operand:SI 1 "nonimmediate_operand" "%0")
+		  (match_operand:SI 2 "x86_64_general_operand" "rme"))
+		 (const_int 0)))
+   (set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI (and:SI (match_dup 1) (match_dup 2))))]
+  "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode)
+   && ix86_binary_operator_ok (AND, SImode, operands)"
+  "and{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI")])
+
+(define_insn "*andqi_2_slp"
+  [(set (reg FLAGS_REG)
+	(compare (and:QI
+		   (match_operand:QI 0 "nonimmediate_operand" "+q,qm")
+		   (match_operand:QI 1 "nonimmediate_operand" "qmn,qn"))
+		 (const_int 0)))
+   (set (strict_low_part (match_dup 0))
+	(and:QI (match_dup 0) (match_dup 1)))]
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && ix86_match_ccmode (insn, CCNOmode)
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "and{b}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "QI")])
+
+;; ??? A bug in recog prevents it from recognizing a const_int as an
+;; operand to zero_extend in andqi_ext_1.  It was checking explicitly
+;; for a QImode operand, which of course failed.
+(define_insn "andqi_ext_0"
+  [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q")
+			 (const_int 8)
+			 (const_int 8))
+	(and:SI
+	  (zero_extract:SI
+	    (match_operand 1 "ext_register_operand" "0")
+	    (const_int 8)
+	    (const_int 8))
+	  (match_operand 2 "const_int_operand" "n")))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "and{b}\t{%2, %h0|%h0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "length_immediate" "1")
+   (set_attr "modrm" "1")
+   (set_attr "mode" "QI")])
+
+;; Generated by peephole translating test to and.  This shows up
+;; often in fp comparisons.
+(define_insn "*andqi_ext_0_cc"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (and:SI
+	    (zero_extract:SI
+	      (match_operand 1 "ext_register_operand" "0")
+	      (const_int 8)
+	      (const_int 8))
+	    (match_operand 2 "const_int_operand" "n"))
+	  (const_int 0)))
+   (set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q")
+			 (const_int 8)
+			 (const_int 8))
+	(and:SI
+	  (zero_extract:SI
+	    (match_dup 1)
+	    (const_int 8)
+	    (const_int 8))
+	  (match_dup 2)))]
+  "ix86_match_ccmode (insn, CCNOmode)"
+  "and{b}\t{%2, %h0|%h0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "length_immediate" "1")
+   (set_attr "modrm" "1")
+   (set_attr "mode" "QI")])
+
+(define_insn "*andqi_ext_1"
+  [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q,Q")
+			 (const_int 8)
+			 (const_int 8))
+	(and:SI
+	  (zero_extract:SI
+	    (match_operand 1 "ext_register_operand" "0,0")
+	    (const_int 8)
+	    (const_int 8))
+	  (zero_extend:SI
+	    (match_operand:QI 2 "nonimmediate_x64nomem_operand" "Q,m"))))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "and{b}\t{%2, %h0|%h0, %2}"
+  [(set_attr "isa" "*,nox64")
+   (set_attr "type" "alu")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "QI")])
+
+(define_insn "*andqi_ext_2"
+  [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q")
+			 (const_int 8)
+			 (const_int 8))
+	(and:SI
+	  (zero_extract:SI
+	    (match_operand 1 "ext_register_operand" "%0")
+	    (const_int 8)
+	    (const_int 8))
+	  (zero_extract:SI
+	    (match_operand 2 "ext_register_operand" "Q")
+	    (const_int 8)
+	    (const_int 8))))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "and{b}\t{%h2, %h0|%h0, %h2}"
+  [(set_attr "type" "alu")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "QI")])
+
+;; Convert wide AND instructions with immediate operand to shorter QImode
+;; equivalents when possible.
+;; Don't do the splitting with memory operands, since it introduces risk
+;; of memory mismatch stalls.  We may want to do the splitting for optimizing
+;; for size, but that can (should?) be handled by generic code instead.
+(define_split
+  [(set (match_operand 0 "register_operand")
+	(and (match_operand 1 "register_operand")
+	     (match_operand 2 "const_int_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+   "reload_completed
+    && QI_REG_P (operands[0])
+    && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+    && !(~INTVAL (operands[2]) & ~(255 << 8))
+    && GET_MODE (operands[0]) != QImode"
+  [(parallel [(set (zero_extract:SI (match_dup 0) (const_int 8) (const_int 8))
+		   (and:SI (zero_extract:SI (match_dup 1)
+					    (const_int 8) (const_int 8))
+			   (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[0] = gen_lowpart (SImode, operands[0]);
+  operands[1] = gen_lowpart (SImode, operands[1]);
+  operands[2] = gen_int_mode ((INTVAL (operands[2]) >> 8) & 0xff, SImode);
+})
+
+;; Since AND can be encoded with sign extended immediate, this is only
+;; profitable when 7th bit is not set.
+(define_split
+  [(set (match_operand 0 "register_operand")
+	(and (match_operand 1 "general_operand")
+	     (match_operand 2 "const_int_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+   "reload_completed
+    && ANY_QI_REG_P (operands[0])
+    && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+    && !(~INTVAL (operands[2]) & ~255)
+    && !(INTVAL (operands[2]) & 128)
+    && GET_MODE (operands[0]) != QImode"
+  [(parallel [(set (strict_low_part (match_dup 0))
+		   (and:QI (match_dup 1)
+			   (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[0] = gen_lowpart (QImode, operands[0]);
+  operands[1] = gen_lowpart (QImode, operands[1]);
+  operands[2] = gen_lowpart (QImode, operands[2]);
+})
+
+;; Logical inclusive and exclusive OR instructions
+
+;; %%% This used to optimize known byte-wide and operations to memory.
+;; If this is considered useful, it should be done with splitters.
+
+(define_expand "<code><mode>3"
+  [(set (match_operand:SWIM 0 "nonimmediate_operand")
+	(any_or:SWIM (match_operand:SWIM 1 "nonimmediate_operand")
+		     (match_operand:SWIM 2 "<general_operand>")))]
+  ""
+  "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
+
+(define_insn "*<code><mode>_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r,rm")
+	(any_or:SWI48
+	 (match_operand:SWI48 1 "nonimmediate_operand" "%0,0")
+	 (match_operand:SWI48 2 "<general_operand>" "<g>,r<i>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*<code>hi_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=r,rm,!k")
+	(any_or:HI
+	 (match_operand:HI 1 "nonimmediate_operand" "%0,0,k")
+	 (match_operand:HI 2 "general_operand" "<g>,r<i>,k")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, HImode, operands)"
+  "@
+  <logic>{w}\t{%2, %0|%0, %2}
+  <logic>{w}\t{%2, %0|%0, %2}
+  k<logic>w\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "alu,alu,msklog")
+   (set_attr "mode" "HI")])
+
+;; %%% Potential partial reg stall on alternative 2.  What to do?
+(define_insn "*<code>qi_1"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=q,m,r,!k")
+	(any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+		   (match_operand:QI 2 "general_operand" "qmn,qn,rn,k")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, QImode, operands)"
+  "@
+   <logic>{b}\t{%2, %0|%0, %2}
+   <logic>{b}\t{%2, %0|%0, %2}
+   <logic>{l}\t{%k2, %k0|%k0, %k2}
+   k<logic>w\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "alu,alu,alu,msklog")
+   (set_attr "mode" "QI,QI,SI,HI")])
+
+;; See comment for addsi_1_zext why we do use nonimmediate_operand
+(define_insn "*<code>si_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	 (any_or:SI (match_operand:SI 1 "nonimmediate_operand" "%0")
+		    (match_operand:SI 2 "x86_64_general_operand" "rme"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (<CODE>, SImode, operands)"
+  "<logic>{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<code>si_1_zext_imm"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(any_or:DI
+	 (zero_extend:DI (match_operand:SI 1 "register_operand" "%0"))
+	 (match_operand:DI 2 "x86_64_zext_immediate_operand" "Z")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (<CODE>, SImode, operands)"
+  "<logic>{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<code>qi_1_slp"
+  [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+q,m"))
+	(any_or:QI (match_dup 0)
+		   (match_operand:QI 1 "general_operand" "qmn,qn")))
+   (clobber (reg:CC FLAGS_REG))]
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "<logic>{b}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "QI")])
+
+(define_insn "*<code><mode>_2"
+  [(set (reg FLAGS_REG)
+	(compare (any_or:SWI
+		  (match_operand:SWI 1 "nonimmediate_operand" "%0,0")
+		  (match_operand:SWI 2 "<general_operand>" "<g>,<r><i>"))
+		 (const_int 0)))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>,<r>m")
+	(any_or:SWI (match_dup 1) (match_dup 2)))]
+  "ix86_match_ccmode (insn, CCNOmode)
+   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "kxnor<mode>"
+  [(set (match_operand:SWI12 0 "register_operand" "=r,!k")
+	(not:SWI12
+	  (xor:SWI12
+	    (match_operand:SWI12 1 "register_operand" "0,k")
+	    (match_operand:SWI12 2 "register_operand" "r,k"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F"
+  "@
+   #
+   kxnorw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "*,msklog")
+   (set_attr "prefix" "*,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:SWI12 0 "general_reg_operand")
+	(not:SWI12
+	  (xor:SWI12
+	    (match_dup 0)
+	    (match_operand:SWI12 1 "general_reg_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512F && reload_completed"
+   [(parallel [(set (match_dup 0)
+		    (xor:HI (match_dup 0)
+			    (match_dup 1)))
+	       (clobber (reg:CC FLAGS_REG))])
+    (set (match_dup 0)
+	 (not:HI (match_dup 0)))])
+
+(define_insn "kortestzhi"
+  [(set (reg:CCZ FLAGS_REG)
+	(compare:CCZ
+	  (ior:HI
+	    (match_operand:HI 0 "register_operand" "k")
+	    (match_operand:HI 1 "register_operand" "k"))
+	  (const_int 0)))]
+  "TARGET_AVX512F && ix86_match_ccmode (insn, CCZmode)"
+  "kortestw\t{%1, %0|%0, %1}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+(define_insn "kortestchi"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (ior:HI
+	    (match_operand:HI 0 "register_operand" "k")
+	    (match_operand:HI 1 "register_operand" "k"))
+	  (const_int -1)))]
+  "TARGET_AVX512F && ix86_match_ccmode (insn, CCCmode)"
+  "kortestw\t{%1, %0|%0, %1}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+(define_insn "kunpckhi"
+  [(set (match_operand:HI 0 "register_operand" "=k")
+	(ior:HI
+	  (ashift:HI
+	    (match_operand:HI 1 "register_operand" "k")
+	    (const_int 8))
+	  (zero_extend:HI (match_operand:QI 2 "register_operand" "k"))))]
+  "TARGET_AVX512F"
+  "kunpckbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "mode" "HI")
+   (set_attr "type" "msklog")
+   (set_attr "prefix" "vex")])
+
+;; See comment for addsi_1_zext why we do use nonimmediate_operand
+;; ??? Special case for immediate operand is missing - it is tricky.
+(define_insn "*<code>si_2_zext"
+  [(set (reg FLAGS_REG)
+	(compare (any_or:SI (match_operand:SI 1 "nonimmediate_operand" "%0")
+			    (match_operand:SI 2 "x86_64_general_operand" "rme"))
+		 (const_int 0)))
+   (set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI (any_or:SI (match_dup 1) (match_dup 2))))]
+  "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode)
+   && ix86_binary_operator_ok (<CODE>, SImode, operands)"
+  "<logic>{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<code>si_2_zext_imm"
+  [(set (reg FLAGS_REG)
+	(compare (any_or:SI
+		  (match_operand:SI 1 "nonimmediate_operand" "%0")
+		  (match_operand:SI 2 "x86_64_zext_immediate_operand" "Z"))
+		 (const_int 0)))
+   (set (match_operand:DI 0 "register_operand" "=r")
+	(any_or:DI (zero_extend:DI (match_dup 1)) (match_dup 2)))]
+  "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode)
+   && ix86_binary_operator_ok (<CODE>, SImode, operands)"
+  "<logic>{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<code>qi_2_slp"
+  [(set (reg FLAGS_REG)
+	(compare (any_or:QI (match_operand:QI 0 "nonimmediate_operand" "+q,qm")
+			    (match_operand:QI 1 "general_operand" "qmn,qn"))
+		 (const_int 0)))
+   (set (strict_low_part (match_dup 0))
+	(any_or:QI (match_dup 0) (match_dup 1)))]
+  "(!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && ix86_match_ccmode (insn, CCNOmode)
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "<logic>{b}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "QI")])
+
+(define_insn "*<code><mode>_3"
+  [(set (reg FLAGS_REG)
+	(compare (any_or:SWI
+		  (match_operand:SWI 1 "nonimmediate_operand" "%0")
+		  (match_operand:SWI 2 "<general_operand>" "<g>"))
+		 (const_int 0)))
+   (clobber (match_scratch:SWI 0 "=<r>"))]
+  "ix86_match_ccmode (insn, CCNOmode)
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "<logic>{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*<code>qi_ext_0"
+  [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q")
+			 (const_int 8)
+			 (const_int 8))
+	(any_or:SI
+	  (zero_extract:SI
+	    (match_operand 1 "ext_register_operand" "0")
+	    (const_int 8)
+	    (const_int 8))
+	  (match_operand 2 "const_int_operand" "n")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "<logic>{b}\t{%2, %h0|%h0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "length_immediate" "1")
+   (set_attr "modrm" "1")
+   (set_attr "mode" "QI")])
+
+(define_insn "*<code>qi_ext_1"
+  [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q,Q")
+			 (const_int 8)
+			 (const_int 8))
+	(any_or:SI
+	  (zero_extract:SI
+	    (match_operand 1 "ext_register_operand" "0,0")
+	    (const_int 8)
+	    (const_int 8))
+	  (zero_extend:SI
+	    (match_operand:QI 2 "nonimmediate_x64nomem_operand" "Q,m"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "<logic>{b}\t{%2, %h0|%h0, %2}"
+  [(set_attr "isa" "*,nox64")
+   (set_attr "type" "alu")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "QI")])
+
+(define_insn "*<code>qi_ext_2"
+  [(set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q")
+			 (const_int 8)
+			 (const_int 8))
+	(any_or:SI
+	  (zero_extract:SI (match_operand 1 "ext_register_operand" "0")
+	  		   (const_int 8)
+			   (const_int 8))
+	  (zero_extract:SI (match_operand 2 "ext_register_operand" "Q")
+	  		   (const_int 8)
+			   (const_int 8))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
+  "<logic>{b}\t{%h2, %h0|%h0, %h2}"
+  [(set_attr "type" "alu")
+   (set_attr "length_immediate" "0")
+   (set_attr "mode" "QI")])
+
+(define_split
+  [(set (match_operand 0 "register_operand")
+	(any_or (match_operand 1 "register_operand")
+		(match_operand 2 "const_int_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+   "reload_completed
+    && QI_REG_P (operands[0])
+    && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+    && !(INTVAL (operands[2]) & ~(255 << 8))
+    && GET_MODE (operands[0]) != QImode"
+  [(parallel [(set (zero_extract:SI (match_dup 0) (const_int 8) (const_int 8))
+		   (any_or:SI (zero_extract:SI (match_dup 1)
+					       (const_int 8) (const_int 8))
+			      (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[0] = gen_lowpart (SImode, operands[0]);
+  operands[1] = gen_lowpart (SImode, operands[1]);
+  operands[2] = gen_int_mode ((INTVAL (operands[2]) >> 8) & 0xff, SImode);
+})
+
+;; Since OR can be encoded with sign extended immediate, this is only
+;; profitable when 7th bit is set.
+(define_split
+  [(set (match_operand 0 "register_operand")
+	(any_or (match_operand 1 "general_operand")
+		(match_operand 2 "const_int_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+   "reload_completed
+    && ANY_QI_REG_P (operands[0])
+    && (!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+    && !(INTVAL (operands[2]) & ~255)
+    && (INTVAL (operands[2]) & 128)
+    && GET_MODE (operands[0]) != QImode"
+  [(parallel [(set (strict_low_part (match_dup 0))
+		   (any_or:QI (match_dup 1)
+			      (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[0] = gen_lowpart (QImode, operands[0]);
+  operands[1] = gen_lowpart (QImode, operands[1]);
+  operands[2] = gen_lowpart (QImode, operands[2]);
+})
+
+(define_expand "xorqi_cc_ext_1"
+  [(parallel [
+     (set (reg:CCNO FLAGS_REG)
+	  (compare:CCNO
+	    (xor:SI
+	      (zero_extract:SI
+		(match_operand 1 "ext_register_operand")
+		(const_int 8)
+		(const_int 8))
+	      (match_operand:QI 2 "const_int_operand"))
+	    (const_int 0)))
+     (set (zero_extract:SI (match_operand 0 "ext_register_operand")
+			   (const_int 8)
+			   (const_int 8))
+	  (xor:SI
+	    (zero_extract:SI
+	     (match_dup 1)
+	     (const_int 8)
+	     (const_int 8))
+	    (match_dup 2)))])])
+
+(define_insn "*xorqi_cc_ext_1"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (xor:SI
+	    (zero_extract:SI
+	      (match_operand 1 "ext_register_operand" "0,0")
+	      (const_int 8)
+	      (const_int 8))
+	    (match_operand:QI 2 "general_x64nomem_operand" "Qn,m"))
+	  (const_int 0)))
+   (set (zero_extract:SI (match_operand 0 "ext_register_operand" "=Q,Q")
+			 (const_int 8)
+			 (const_int 8))
+	(xor:SI
+	  (zero_extract:SI
+	   (match_dup 1)
+	   (const_int 8)
+	   (const_int 8))
+	  (match_dup 2)))]
+  "ix86_match_ccmode (insn, CCNOmode)"
+  "xor{b}\t{%2, %h0|%h0, %2}"
+  [(set_attr "isa" "*,nox64")
+   (set_attr "type" "alu")
+   (set_attr "modrm" "1")
+   (set_attr "mode" "QI")])
+
+;; Negation instructions
+
+(define_expand "neg<mode>2"
+  [(set (match_operand:SDWIM 0 "nonimmediate_operand")
+	(neg:SDWIM (match_operand:SDWIM 1 "nonimmediate_operand")))]
+  ""
+  "ix86_expand_unary_operator (NEG, <MODE>mode, operands); DONE;")
+
+(define_insn_and_split "*neg<dwi>2_doubleword"
+  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro")
+	(neg:<DWI> (match_operand:<DWI> 1 "nonimmediate_operand" "0")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_unary_operator_ok (NEG, <DWI>mode, operands)"
+  "#"
+  "reload_completed"
+  [(parallel
+    [(set (reg:CCZ FLAGS_REG)
+	  (compare:CCZ (neg:DWIH (match_dup 1)) (const_int 0)))
+     (set (match_dup 0) (neg:DWIH (match_dup 1)))])
+   (parallel
+    [(set (match_dup 2)
+	  (plus:DWIH (match_dup 3)
+		     (plus:DWIH (ltu:DWIH (reg:CC FLAGS_REG) (const_int 0))
+				(const_int 0))))
+     (clobber (reg:CC FLAGS_REG))])
+   (parallel
+    [(set (match_dup 2)
+	  (neg:DWIH (match_dup 2)))
+     (clobber (reg:CC FLAGS_REG))])]
+  "split_double_mode (<DWI>mode, &operands[0], 2, &operands[0], &operands[2]);")
+
+(define_insn "*neg<mode>2_1"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
+	(neg:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_unary_operator_ok (NEG, <MODE>mode, operands)"
+  "neg{<imodesuffix>}\t%0"
+  [(set_attr "type" "negnot")
+   (set_attr "mode" "<MODE>")])
+
+;; Combine is quite creative about this pattern.
+(define_insn "*negsi2_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(lshiftrt:DI
+	  (neg:DI (ashift:DI (match_operand:DI 1 "register_operand" "0")
+			     (const_int 32)))
+	(const_int 32)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_unary_operator_ok (NEG, SImode, operands)"
+  "neg{l}\t%k0"
+  [(set_attr "type" "negnot")
+   (set_attr "mode" "SI")])
+
+;; The problem with neg is that it does not perform (compare x 0),
+;; it really performs (compare 0 x), which leaves us with the zero
+;; flag being the only useful item.
+
+(define_insn "*neg<mode>2_cmpz"
+  [(set (reg:CCZ FLAGS_REG)
+	(compare:CCZ
+	  (neg:SWI (match_operand:SWI 1 "nonimmediate_operand" "0"))
+		   (const_int 0)))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
+	(neg:SWI (match_dup 1)))]
+  "ix86_unary_operator_ok (NEG, <MODE>mode, operands)"
+  "neg{<imodesuffix>}\t%0"
+  [(set_attr "type" "negnot")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*negsi2_cmpz_zext"
+  [(set (reg:CCZ FLAGS_REG)
+	(compare:CCZ
+	  (lshiftrt:DI
+	    (neg:DI (ashift:DI
+		      (match_operand:DI 1 "register_operand" "0")
+		      (const_int 32)))
+	    (const_int 32))
+	  (const_int 0)))
+   (set (match_operand:DI 0 "register_operand" "=r")
+	(lshiftrt:DI (neg:DI (ashift:DI (match_dup 1)
+					(const_int 32)))
+		     (const_int 32)))]
+  "TARGET_64BIT && ix86_unary_operator_ok (NEG, SImode, operands)"
+  "neg{l}\t%k0"
+  [(set_attr "type" "negnot")
+   (set_attr "mode" "SI")])
+
+;; Negate with jump on overflow.
+(define_expand "negv<mode>3"
+  [(parallel [(set (reg:CCO FLAGS_REG)
+		   (ne:CCO (match_operand:SWI 1 "register_operand")
+			   (match_dup 3)))
+	      (set (match_operand:SWI 0 "register_operand")
+		   (neg:SWI (match_dup 1)))])
+   (set (pc) (if_then_else
+	       (eq (reg:CCO FLAGS_REG) (const_int 0))
+	       (label_ref (match_operand 2))
+	       (pc)))]
+  ""
+{
+  operands[3]
+    = gen_int_mode (HOST_WIDE_INT_1U << (GET_MODE_BITSIZE (<MODE>mode) - 1),
+		    <MODE>mode);
+})
+
+(define_insn "*negv<mode>3"
+  [(set (reg:CCO FLAGS_REG)
+	(ne:CCO (match_operand:SWI 1 "nonimmediate_operand" "0")
+		(match_operand:SWI 2 "const_int_operand")))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
+	(neg:SWI (match_dup 1)))]
+  "ix86_unary_operator_ok (NEG, <MODE>mode, operands)
+   && mode_signbit_p (<MODE>mode, operands[2])"
+  "neg{<imodesuffix>}\t%0"
+  [(set_attr "type" "negnot")
+   (set_attr "mode" "<MODE>")])
+
+;; Changing of sign for FP values is doable using integer unit too.
+
+(define_expand "<code><mode>2"
+  [(set (match_operand:X87MODEF 0 "register_operand")
+	(absneg:X87MODEF (match_operand:X87MODEF 1 "register_operand")))]
+  "TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
+  "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;")
+
+(define_insn "*absneg<mode>2_mixed"
+  [(set (match_operand:MODEF 0 "register_operand" "=x,x,f,!r")
+	(match_operator:MODEF 3 "absneg_operator"
+	  [(match_operand:MODEF 1 "register_operand" "0,x,0,0")]))
+   (use (match_operand:<ssevecmode> 2 "nonimmediate_operand" "xm,0,X,X"))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (<MODE>mode)"
+  "#")
+
+(define_insn "*absneg<mode>2_sse"
+  [(set (match_operand:MODEF 0 "register_operand" "=x,x,!r")
+	(match_operator:MODEF 3 "absneg_operator"
+	  [(match_operand:MODEF 1 "register_operand" "0 ,x,0")]))
+   (use (match_operand:<ssevecmode> 2 "register_operand" "xm,0,X"))
+   (clobber (reg:CC FLAGS_REG))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH"
+  "#")
+
+(define_insn "*absneg<mode>2_i387"
+  [(set (match_operand:X87MODEF 0 "register_operand" "=f,!r")
+	(match_operator:X87MODEF 3 "absneg_operator"
+	  [(match_operand:X87MODEF 1 "register_operand" "0,0")]))
+   (use (match_operand 2))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_80387 && !(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
+  "#")
+
+(define_expand "<code>tf2"
+  [(set (match_operand:TF 0 "register_operand")
+	(absneg:TF (match_operand:TF 1 "register_operand")))]
+  "TARGET_SSE"
+  "ix86_expand_fp_absneg_operator (<CODE>, TFmode, operands); DONE;")
+
+(define_insn "*absnegtf2_sse"
+  [(set (match_operand:TF 0 "register_operand" "=x,x")
+	(match_operator:TF 3 "absneg_operator"
+	  [(match_operand:TF 1 "register_operand" "0,x")]))
+   (use (match_operand:TF 2 "nonimmediate_operand" "xm,0"))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_SSE"
+  "#")
+
+;; Splitters for fp abs and neg.
+
+(define_split
+  [(set (match_operand 0 "fp_register_operand")
+	(match_operator 1 "absneg_operator" [(match_dup 0)]))
+   (use (match_operand 2))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed"
+  [(set (match_dup 0) (match_op_dup 1 [(match_dup 0)]))])
+
+(define_split
+  [(set (match_operand 0 "register_operand")
+	(match_operator 3 "absneg_operator"
+	  [(match_operand 1 "register_operand")]))
+   (use (match_operand 2 "nonimmediate_operand"))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed && SSE_REG_P (operands[0])"
+  [(set (match_dup 0) (match_dup 3))]
+{
+  enum machine_mode mode = GET_MODE (operands[0]);
+  enum machine_mode vmode = GET_MODE (operands[2]);
+  rtx tmp;
+
+  operands[0] = simplify_gen_subreg (vmode, operands[0], mode, 0);
+  operands[1] = simplify_gen_subreg (vmode, operands[1], mode, 0);
+  if (operands_match_p (operands[0], operands[2]))
+    {
+      tmp = operands[1];
+      operands[1] = operands[2];
+      operands[2] = tmp;
+    }
+  if (GET_CODE (operands[3]) == ABS)
+    tmp = gen_rtx_AND (vmode, operands[1], operands[2]);
+  else
+    tmp = gen_rtx_XOR (vmode, operands[1], operands[2]);
+  operands[3] = tmp;
+})
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand")
+	(match_operator:SF 1 "absneg_operator" [(match_dup 0)]))
+   (use (match_operand:V4SF 2))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  rtx tmp;
+  operands[0] = gen_lowpart (SImode, operands[0]);
+  if (GET_CODE (operands[1]) == ABS)
+    {
+      tmp = gen_int_mode (0x7fffffff, SImode);
+      tmp = gen_rtx_AND (SImode, operands[0], tmp);
+    }
+  else
+    {
+      tmp = gen_int_mode (0x80000000, SImode);
+      tmp = gen_rtx_XOR (SImode, operands[0], tmp);
+    }
+  operands[1] = tmp;
+})
+
+(define_split
+  [(set (match_operand:DF 0 "register_operand")
+	(match_operator:DF 1 "absneg_operator" [(match_dup 0)]))
+   (use (match_operand 2))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  rtx tmp;
+  if (TARGET_64BIT)
+    {
+      tmp = gen_lowpart (DImode, operands[0]);
+      tmp = gen_rtx_ZERO_EXTRACT (DImode, tmp, const1_rtx, GEN_INT (63));
+      operands[0] = tmp;
+
+      if (GET_CODE (operands[1]) == ABS)
+	tmp = const0_rtx;
+      else
+	tmp = gen_rtx_NOT (DImode, tmp);
+    }
+  else
+    {
+      operands[0] = gen_highpart (SImode, operands[0]);
+      if (GET_CODE (operands[1]) == ABS)
+	{
+	  tmp = gen_int_mode (0x7fffffff, SImode);
+	  tmp = gen_rtx_AND (SImode, operands[0], tmp);
+	}
+      else
+	{
+	  tmp = gen_int_mode (0x80000000, SImode);
+	  tmp = gen_rtx_XOR (SImode, operands[0], tmp);
+	}
+    }
+  operands[1] = tmp;
+})
+
+(define_split
+  [(set (match_operand:XF 0 "register_operand")
+	(match_operator:XF 1 "absneg_operator" [(match_dup 0)]))
+   (use (match_operand 2))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (match_dup 1))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  rtx tmp;
+  operands[0] = gen_rtx_REG (SImode,
+			     true_regnum (operands[0])
+			     + (TARGET_64BIT ? 1 : 2));
+  if (GET_CODE (operands[1]) == ABS)
+    {
+      tmp = GEN_INT (0x7fff);
+      tmp = gen_rtx_AND (SImode, operands[0], tmp);
+    }
+  else
+    {
+      tmp = GEN_INT (0x8000);
+      tmp = gen_rtx_XOR (SImode, operands[0], tmp);
+    }
+  operands[1] = tmp;
+})
+
+;; Conditionalize these after reload. If they match before reload, we
+;; lose the clobber and ability to use integer instructions.
+
+(define_insn "*<code><mode>2_1"
+  [(set (match_operand:X87MODEF 0 "register_operand" "=f")
+	(absneg:X87MODEF (match_operand:X87MODEF 1 "register_operand" "0")))]
+  "TARGET_80387
+   && (reload_completed
+       || !(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))"
+  "f<absneg_mnemonic>"
+  [(set_attr "type" "fsgn")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*<code>extendsfdf2"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(absneg:DF (float_extend:DF
+		     (match_operand:SF 1 "register_operand" "0"))))]
+  "TARGET_80387 && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)"
+  "f<absneg_mnemonic>"
+  [(set_attr "type" "fsgn")
+   (set_attr "mode" "DF")])
+
+(define_insn "*<code>extendsfxf2"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(absneg:XF (float_extend:XF
+		     (match_operand:SF 1 "register_operand" "0"))))]
+  "TARGET_80387"
+  "f<absneg_mnemonic>"
+  [(set_attr "type" "fsgn")
+   (set_attr "mode" "XF")])
+
+(define_insn "*<code>extenddfxf2"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(absneg:XF (float_extend:XF
+		     (match_operand:DF 1 "register_operand" "0"))))]
+  "TARGET_80387"
+  "f<absneg_mnemonic>"
+  [(set_attr "type" "fsgn")
+   (set_attr "mode" "XF")])
+
+;; Copysign instructions
+
+(define_mode_iterator CSGNMODE [SF DF TF])
+(define_mode_attr CSGNVMODE [(SF "V4SF") (DF "V2DF") (TF "TF")])
+
+(define_expand "copysign<mode>3"
+  [(match_operand:CSGNMODE 0 "register_operand")
+   (match_operand:CSGNMODE 1 "nonmemory_operand")
+   (match_operand:CSGNMODE 2 "register_operand")]
+  "(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+   || (TARGET_SSE && (<MODE>mode == TFmode))"
+  "ix86_expand_copysign (operands); DONE;")
+
+(define_insn_and_split "copysign<mode>3_const"
+  [(set (match_operand:CSGNMODE 0 "register_operand" "=x")
+	(unspec:CSGNMODE
+	  [(match_operand:<CSGNVMODE> 1 "vector_move_operand" "xmC")
+	   (match_operand:CSGNMODE 2 "register_operand" "0")
+	   (match_operand:<CSGNVMODE> 3 "nonimmediate_operand" "xm")]
+	  UNSPEC_COPYSIGN))]
+  "(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+   || (TARGET_SSE && (<MODE>mode == TFmode))"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  "ix86_split_copysign_const (operands); DONE;")
+
+(define_insn "copysign<mode>3_var"
+  [(set (match_operand:CSGNMODE 0 "register_operand" "=x,x,x,x,x")
+	(unspec:CSGNMODE
+	  [(match_operand:CSGNMODE 2 "register_operand"	"x,0,0,x,x")
+	   (match_operand:CSGNMODE 3 "register_operand"	"1,1,x,1,x")
+	   (match_operand:<CSGNVMODE> 4 "nonimmediate_operand" "X,xm,xm,0,0")
+	   (match_operand:<CSGNVMODE> 5 "nonimmediate_operand" "0,xm,1,xm,1")]
+	  UNSPEC_COPYSIGN))
+   (clobber (match_scratch:<CSGNVMODE> 1 "=x,x,x,x,x"))]
+  "(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+   || (TARGET_SSE && (<MODE>mode == TFmode))"
+  "#")
+
+(define_split
+  [(set (match_operand:CSGNMODE 0 "register_operand")
+	(unspec:CSGNMODE
+	  [(match_operand:CSGNMODE 2 "register_operand")
+	   (match_operand:CSGNMODE 3 "register_operand")
+	   (match_operand:<CSGNVMODE> 4)
+	   (match_operand:<CSGNVMODE> 5)]
+	  UNSPEC_COPYSIGN))
+   (clobber (match_scratch:<CSGNVMODE> 1))]
+  "((SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+    || (TARGET_SSE && (<MODE>mode == TFmode)))
+   && reload_completed"
+  [(const_int 0)]
+  "ix86_split_copysign_var (operands); DONE;")
+
+;; One complement instructions
+
+(define_expand "one_cmpl<mode>2"
+  [(set (match_operand:SWIM 0 "nonimmediate_operand")
+	(not:SWIM (match_operand:SWIM 1 "nonimmediate_operand")))]
+  ""
+  "ix86_expand_unary_operator (NOT, <MODE>mode, operands); DONE;")
+
+(define_insn "*one_cmpl<mode>2_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
+	(not:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0")))]
+  "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
+  "not{<imodesuffix>}\t%0"
+  [(set_attr "type" "negnot")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*one_cmplhi2_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,!k")
+	(not:HI (match_operand:HI 1 "nonimmediate_operand" "0,k")))]
+  "ix86_unary_operator_ok (NOT, HImode, operands)"
+  "@
+   not{w}\t%0
+   knotw\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "*,avx512f")
+   (set_attr "type" "negnot,msklog")
+   (set_attr "prefix" "*,vex")
+   (set_attr "mode" "HI")])
+
+;; %%% Potential partial reg stall on alternative 1.  What to do?
+(define_insn "*one_cmplqi2_1"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,!k")
+	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
+  "ix86_unary_operator_ok (NOT, QImode, operands)"
+  "@
+   not{b}\t%0
+   not{l}\t%k0
+   knotw\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "*,*,avx512f")
+   (set_attr "type" "negnot,negnot,msklog")
+   (set_attr "prefix" "*,*,vex")
+   (set_attr "mode" "QI,SI,QI")])
+
+;; ??? Currently never generated - xor is used instead.
+(define_insn "*one_cmplsi2_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (not:SI (match_operand:SI 1 "register_operand" "0"))))]
+  "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
+  "not{l}\t%k0"
+  [(set_attr "type" "negnot")
+   (set_attr "mode" "SI")])
+
+(define_insn "*one_cmpl<mode>2_2"
+  [(set (reg FLAGS_REG)
+	(compare (not:SWI (match_operand:SWI 1 "nonimmediate_operand" "0"))
+		 (const_int 0)))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
+	(not:SWI (match_dup 1)))]
+  "ix86_match_ccmode (insn, CCNOmode)
+   && ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
+  "#"
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand 0 "flags_reg_operand")
+	(match_operator 2 "compare_operator"
+	  [(not:SWI (match_operand:SWI 3 "nonimmediate_operand"))
+	   (const_int 0)]))
+   (set (match_operand:SWI 1 "nonimmediate_operand")
+	(not:SWI (match_dup 3)))]
+  "ix86_match_ccmode (insn, CCNOmode)"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 2 [(xor:SWI (match_dup 3) (const_int -1))
+				    (const_int 0)]))
+	      (set (match_dup 1)
+		   (xor:SWI (match_dup 3) (const_int -1)))])])
+
+;; ??? Currently never generated - xor is used instead.
+(define_insn "*one_cmplsi2_2_zext"
+  [(set (reg FLAGS_REG)
+	(compare (not:SI (match_operand:SI 1 "register_operand" "0"))
+		 (const_int 0)))
+   (set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI (not:SI (match_dup 1))))]
+  "TARGET_64BIT && ix86_match_ccmode (insn, CCNOmode)
+   && ix86_unary_operator_ok (NOT, SImode, operands)"
+  "#"
+  [(set_attr "type" "alu1")
+   (set_attr "mode" "SI")])
+
+(define_split
+  [(set (match_operand 0 "flags_reg_operand")
+	(match_operator 2 "compare_operator"
+	  [(not:SI (match_operand:SI 3 "register_operand"))
+	   (const_int 0)]))
+   (set (match_operand:DI 1 "register_operand")
+	(zero_extend:DI (not:SI (match_dup 3))))]
+  "ix86_match_ccmode (insn, CCNOmode)"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 2 [(xor:SI (match_dup 3) (const_int -1))
+				    (const_int 0)]))
+	      (set (match_dup 1)
+		   (zero_extend:DI (xor:SI (match_dup 3) (const_int -1))))])])
+
+;; Shift instructions
+
+;; DImode shifts are implemented using the i386 "shift double" opcode,
+;; which is written as "sh[lr]d[lw] imm,reg,reg/mem".  If the shift count
+;; is variable, then the count is in %cl and the "imm" operand is dropped
+;; from the assembler input.
+;;
+;; This instruction shifts the target reg/mem as usual, but instead of
+;; shifting in zeros, bits are shifted in from reg operand.  If the insn
+;; is a left shift double, bits are taken from the high order bits of
+;; reg, else if the insn is a shift right double, bits are taken from the
+;; low order bits of reg.  So if %eax is "1234" and %edx is "5678",
+;; "shldl $8,%edx,%eax" leaves %edx unchanged and sets %eax to "2345".
+;;
+;; Since sh[lr]d does not change the `reg' operand, that is done
+;; separately, making all shifts emit pairs of shift double and normal
+;; shift.  Since sh[lr]d does not shift more than 31 bits, and we wish to
+;; support a 63 bit shift, each shift where the count is in a reg expands
+;; to a pair of shifts, a branch, a shift by 32 and a label.
+;;
+;; If the shift count is a constant, we need never emit more than one
+;; shift pair, instead using moves and sign extension for counts greater
+;; than 31.
+
+(define_expand "ashl<mode>3"
+  [(set (match_operand:SDWIM 0 "<shift_operand>")
+	(ashift:SDWIM (match_operand:SDWIM 1 "<ashl_input_operand>")
+		      (match_operand:QI 2 "nonmemory_operand")))]
+  ""
+  "ix86_expand_binary_operator (ASHIFT, <MODE>mode, operands); DONE;")
+
+(define_insn "*ashl<mode>3_doubleword"
+  [(set (match_operand:DWI 0 "register_operand" "=&r,r")
+	(ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "n,0")
+		    (match_operand:QI 2 "nonmemory_operand" "<S>c,<S>c")))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "#"
+  [(set_attr "type" "multi")])
+
+(define_split
+  [(set (match_operand:DWI 0 "register_operand")
+	(ashift:DWI (match_operand:DWI 1 "nonmemory_operand")
+		    (match_operand:QI 2 "nonmemory_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "(optimize && flag_peephole2) ? epilogue_completed : reload_completed"
+  [(const_int 0)]
+  "ix86_split_ashl (operands, NULL_RTX, <MODE>mode); DONE;")
+
+;; By default we don't ask for a scratch register, because when DWImode
+;; values are manipulated, registers are already at a premium.  But if
+;; we have one handy, we won't turn it away.
+
+(define_peephole2
+  [(match_scratch:DWIH 3 "r")
+   (parallel [(set (match_operand:<DWI> 0 "register_operand")
+		   (ashift:<DWI>
+		     (match_operand:<DWI> 1 "nonmemory_operand")
+		     (match_operand:QI 2 "nonmemory_operand")))
+	      (clobber (reg:CC FLAGS_REG))])
+   (match_dup 3)]
+  "TARGET_CMOVE"
+  [(const_int 0)]
+  "ix86_split_ashl (operands, operands[3], <DWI>mode); DONE;")
+
+(define_insn "x86_64_shld"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
+        (ior:DI (ashift:DI (match_dup 0)
+		  (match_operand:QI 2 "nonmemory_operand" "Jc"))
+		(lshiftrt:DI (match_operand:DI 1 "register_operand" "r")
+		  (minus:QI (const_int 64) (match_dup 2)))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT"
+  "shld{q}\t{%s2%1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "DI")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "vector")
+   (set_attr "bdver1_decode" "vector")])
+
+(define_insn "x86_shld"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "+r*m")
+        (ior:SI (ashift:SI (match_dup 0)
+		  (match_operand:QI 2 "nonmemory_operand" "Ic"))
+		(lshiftrt:SI (match_operand:SI 1 "register_operand" "r")
+		  (minus:QI (const_int 32) (match_dup 2)))))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "shld{l}\t{%s2%1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "SI")
+   (set_attr "pent_pair" "np")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "vector")
+   (set_attr "bdver1_decode" "vector")])
+
+(define_expand "x86_shift<mode>_adj_1"
+  [(set (reg:CCZ FLAGS_REG)
+	(compare:CCZ (and:QI (match_operand:QI 2 "register_operand")
+			     (match_dup 4))
+		     (const_int 0)))
+   (set (match_operand:SWI48 0 "register_operand")
+        (if_then_else:SWI48 (ne (reg:CCZ FLAGS_REG) (const_int 0))
+			    (match_operand:SWI48 1 "register_operand")
+			    (match_dup 0)))
+   (set (match_dup 1)
+	(if_then_else:SWI48 (ne (reg:CCZ FLAGS_REG) (const_int 0))
+			    (match_operand:SWI48 3 "register_operand")
+			    (match_dup 1)))]
+  "TARGET_CMOVE"
+  "operands[4] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode));")
+
+(define_expand "x86_shift<mode>_adj_2"
+  [(use (match_operand:SWI48 0 "register_operand"))
+   (use (match_operand:SWI48 1 "register_operand"))
+   (use (match_operand:QI 2 "register_operand"))]
+  ""
+{
+  rtx label = gen_label_rtx ();
+  rtx tmp;
+
+  emit_insn (gen_testqi_ccz_1 (operands[2],
+			       GEN_INT (GET_MODE_BITSIZE (<MODE>mode))));
+
+  tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
+  tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
+  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+			      gen_rtx_LABEL_REF (VOIDmode, label),
+			      pc_rtx);
+  tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+  JUMP_LABEL (tmp) = label;
+
+  emit_move_insn (operands[0], operands[1]);
+  ix86_expand_clear (operands[1]);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  DONE;
+})
+
+;; Avoid useless masking of count operand.
+(define_insn "*ashl<mode>3_mask"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
+	(ashift:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "0")
+	  (subreg:QI
+	    (and:SI
+	      (match_operand:SI 2 "register_operand" "c")
+	      (match_operand:SI 3 "const_int_operand" "n")) 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (ASHIFT, <MODE>mode, operands)
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1"
+{
+  return "sal{<imodesuffix>}\t{%b2, %0|%0, %b2}";
+}
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*bmi2_ashl<mode>3_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+		      (match_operand:SWI48 2 "register_operand" "r")))]
+  "TARGET_BMI2"
+  "shlx\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*ashl<mode>3_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
+	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,l,rm")
+		      (match_operand:QI 2 "nonmemory_operand" "c<S>,M,r")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (ASHIFT, <MODE>mode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LEA:
+    case TYPE_ISHIFTX:
+      return "#";
+
+    case TYPE_ALU:
+      gcc_assert (operands[2] == const1_rtx);
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      return "add{<imodesuffix>}\t%0, %0";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "sal{<imodesuffix>}\t%0";
+      else
+	return "sal{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set_attr "isa" "*,*,bmi2")
+   (set (attr "type")
+     (cond [(eq_attr "alternative" "1")
+	      (const_string "lea")
+	    (eq_attr "alternative" "2")
+	      (const_string "ishiftx")
+            (and (and (match_test "TARGET_DOUBLE_WITH_ADD")
+		      (match_operand 0 "register_operand"))
+		 (match_operand 2 "const1_operand"))
+	      (const_string "alu")
+	   ]
+	   (const_string "ishift")))
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (eq_attr "type" "alu")
+	    (and (eq_attr "type" "ishift")
+		 (and (match_operand 2 "const1_operand")
+		      (ior (match_test "TARGET_SHIFT1")
+			   (match_test "optimize_function_for_size_p (cfun)")))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+	(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+		      (match_operand:QI 2 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+	(ashift:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (<MODE>mode, operands[2]);")
+
+(define_insn "*bmi2_ashlsi3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "rm")
+		     (match_operand:SI 2 "register_operand" "r"))))]
+  "TARGET_64BIT && TARGET_BMI2"
+  "shlx\t{%2, %1, %k0|%k0, %1, %2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "SI")])
+
+(define_insn "*ashlsi3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+	(zero_extend:DI
+	  (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,l,rm")
+		     (match_operand:QI 2 "nonmemory_operand" "cI,M,r"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (ASHIFT, SImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LEA:
+    case TYPE_ISHIFTX:
+      return "#";
+
+    case TYPE_ALU:
+      gcc_assert (operands[2] == const1_rtx);
+      return "add{l}\t%k0, %k0";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "sal{l}\t%k0";
+      else
+	return "sal{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set_attr "isa" "*,*,bmi2")
+   (set (attr "type")
+     (cond [(eq_attr "alternative" "1")
+	      (const_string "lea")
+	    (eq_attr "alternative" "2")
+	      (const_string "ishiftx")
+            (and (match_test "TARGET_DOUBLE_WITH_ADD")
+		 (match_operand 2 "const1_operand"))
+	      (const_string "alu")
+	   ]
+	   (const_string "ishift")))
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (eq_attr "type" "alu")
+	    (and (eq_attr "type" "ishift")
+		 (and (match_operand 2 "const1_operand")
+		      (ior (match_test "TARGET_SHIFT1")
+			   (match_test "optimize_function_for_size_p (cfun)")))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "SI")])
+
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(zero_extend:DI
+	  (ashift:SI (match_operand:SI 1 "nonimmediate_operand")
+		     (match_operand:QI 2 "register_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2))))]
+  "operands[2] = gen_lowpart (SImode, operands[2]);")
+
+(define_insn "*ashlhi3_1"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp")
+	(ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l")
+		   (match_operand:QI 2 "nonmemory_operand" "cI,M")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (ASHIFT, HImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LEA:
+      return "#";
+
+    case TYPE_ALU:
+      gcc_assert (operands[2] == const1_rtx);
+      return "add{w}\t%0, %0";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "sal{w}\t%0";
+      else
+	return "sal{w}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (cond [(eq_attr "alternative" "1")
+	      (const_string "lea")
+            (and (and (match_test "TARGET_DOUBLE_WITH_ADD")
+		      (match_operand 0 "register_operand"))
+		 (match_operand 2 "const1_operand"))
+	      (const_string "alu")
+	   ]
+	   (const_string "ishift")))
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (eq_attr "type" "alu")
+	    (and (eq_attr "type" "ishift")
+		 (and (match_operand 2 "const1_operand")
+		      (ior (match_test "TARGET_SHIFT1")
+			   (match_test "optimize_function_for_size_p (cfun)")))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "HI,SI")])
+
+;; %%% Potential partial reg stall on alternative 1.  What to do?
+(define_insn "*ashlqi3_1"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,Yp")
+	(ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,l")
+		   (match_operand:QI 2 "nonmemory_operand" "cI,cI,M")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (ASHIFT, QImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_LEA:
+      return "#";
+
+    case TYPE_ALU:
+      gcc_assert (operands[2] == const1_rtx);
+      if (REG_P (operands[1]) && !ANY_QI_REG_P (operands[1]))
+        return "add{l}\t%k0, %k0";
+      else
+        return "add{b}\t%0, %0";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	{
+	  if (get_attr_mode (insn) == MODE_SI)
+	    return "sal{l}\t%k0";
+	  else
+	    return "sal{b}\t%0";
+	}
+      else
+	{
+	  if (get_attr_mode (insn) == MODE_SI)
+	    return "sal{l}\t{%2, %k0|%k0, %2}";
+	  else
+	    return "sal{b}\t{%2, %0|%0, %2}";
+	}
+    }
+}
+  [(set (attr "type")
+     (cond [(eq_attr "alternative" "2")
+	      (const_string "lea")
+            (and (and (match_test "TARGET_DOUBLE_WITH_ADD")
+		      (match_operand 0 "register_operand"))
+		 (match_operand 2 "const1_operand"))
+	      (const_string "alu")
+	   ]
+	   (const_string "ishift")))
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (eq_attr "type" "alu")
+	    (and (eq_attr "type" "ishift")
+		 (and (match_operand 2 "const1_operand")
+		      (ior (match_test "TARGET_SHIFT1")
+			   (match_test "optimize_function_for_size_p (cfun)")))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "QI,SI,SI")])
+
+(define_insn "*ashlqi3_1_slp"
+  [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
+	(ashift:QI (match_dup 0)
+		   (match_operand:QI 1 "nonmemory_operand" "cI")))
+   (clobber (reg:CC FLAGS_REG))]
+  "(optimize_function_for_size_p (cfun)
+    || !TARGET_PARTIAL_FLAG_REG_STALL
+    || (operands[1] == const1_rtx
+	&& (TARGET_SHIFT1
+	    || (TARGET_DOUBLE_WITH_ADD && REG_P (operands[0])))))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ALU:
+      gcc_assert (operands[1] == const1_rtx);
+      return "add{b}\t%0, %0";
+
+    default:
+      if (operands[1] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "sal{b}\t%0";
+      else
+	return "sal{b}\t{%1, %0|%0, %1}";
+    }
+}
+  [(set (attr "type")
+     (cond [(and (and (match_test "TARGET_DOUBLE_WITH_ADD")
+		      (match_operand 0 "register_operand"))
+		 (match_operand 1 "const1_operand"))
+	      (const_string "alu")
+	   ]
+	   (const_string "ishift1")))
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (eq_attr "type" "alu")
+	    (and (eq_attr "type" "ishift1")
+		 (and (match_operand 1 "const1_operand")
+		      (ior (match_test "TARGET_SHIFT1")
+			   (match_test "optimize_function_for_size_p (cfun)")))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "QI")])
+
+;; Convert ashift to the lea pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand 0 "register_operand")
+	(ashift (match_operand 1 "index_register_operand")
+                (match_operand:QI 2 "const_int_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "GET_MODE (operands[0]) == GET_MODE (operands[1])
+   && reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(const_int 0)]
+{
+  enum machine_mode mode = GET_MODE (operands[0]);
+  rtx pat;
+
+  if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
+    { 
+      mode = SImode; 
+      operands[0] = gen_lowpart (mode, operands[0]);
+      operands[1] = gen_lowpart (mode, operands[1]);
+    }
+
+  operands[2] = gen_int_mode (1 << INTVAL (operands[2]), mode);
+
+  pat = gen_rtx_MULT (mode, operands[1], operands[2]);
+
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0], pat));
+  DONE;
+})
+
+;; Convert ashift to the lea pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(zero_extend:DI
+	  (ashift:SI (match_operand:SI 1 "index_register_operand")
+		     (match_operand:QI 2 "const_int_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && reload_completed
+   && true_regnum (operands[0]) != true_regnum (operands[1])"
+  [(set (match_dup 0)
+	(zero_extend:DI (mult:SI (match_dup 1) (match_dup 2))))]
+{
+  operands[1] = gen_lowpart (SImode, operands[1]);
+  operands[2] = gen_int_mode (1 << INTVAL (operands[2]), SImode);
+})
+
+;; This pattern can't accept a variable shift count, since shifts by
+;; zero don't affect the flags.  We assume that shifts by constant
+;; zero are optimized away.
+(define_insn "*ashl<mode>3_cmp"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (ashift:SWI (match_operand:SWI 1 "nonimmediate_operand" "0")
+		      (match_operand:QI 2 "<shift_immediate_operand>" "<S>"))
+	  (const_int 0)))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
+	(ashift:SWI (match_dup 1) (match_dup 2)))]
+  "(optimize_function_for_size_p (cfun)
+    || !TARGET_PARTIAL_FLAG_REG_STALL
+    || (operands[2] == const1_rtx
+	&& (TARGET_SHIFT1
+	    || (TARGET_DOUBLE_WITH_ADD && REG_P (operands[0])))))
+   && ix86_match_ccmode (insn, CCGOCmode)
+   && ix86_binary_operator_ok (ASHIFT, <MODE>mode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ALU:
+      gcc_assert (operands[2] == const1_rtx);
+      return "add{<imodesuffix>}\t%0, %0";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "sal{<imodesuffix>}\t%0";
+      else
+	return "sal{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (cond [(and (and (match_test "TARGET_DOUBLE_WITH_ADD")
+		      (match_operand 0 "register_operand"))
+		 (match_operand 2 "const1_operand"))
+	      (const_string "alu")
+	   ]
+	   (const_string "ishift")))
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (eq_attr "type" "alu")
+	    (and (eq_attr "type" "ishift")
+		 (and (match_operand 2 "const1_operand")
+		      (ior (match_test "TARGET_SHIFT1")
+			   (match_test "optimize_function_for_size_p (cfun)")))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*ashlsi3_cmp_zext"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (ashift:SI (match_operand:SI 1 "register_operand" "0")
+		     (match_operand:QI 2 "const_1_to_31_operand" "I"))
+	  (const_int 0)))
+   (set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2))))]
+  "TARGET_64BIT
+   && (optimize_function_for_size_p (cfun)
+       || !TARGET_PARTIAL_FLAG_REG_STALL
+       || (operands[2] == const1_rtx
+	   && (TARGET_SHIFT1
+	       || TARGET_DOUBLE_WITH_ADD)))
+   && ix86_match_ccmode (insn, CCGOCmode)
+   && ix86_binary_operator_ok (ASHIFT, SImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ALU:
+      gcc_assert (operands[2] == const1_rtx);
+      return "add{l}\t%k0, %k0";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "sal{l}\t%k0";
+      else
+	return "sal{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set (attr "type")
+     (cond [(and (match_test "TARGET_DOUBLE_WITH_ADD")
+		 (match_operand 2 "const1_operand"))
+	      (const_string "alu")
+	   ]
+	   (const_string "ishift")))
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (eq_attr "type" "alu")
+	    (and (eq_attr "type" "ishift")
+		 (and (match_operand 2 "const1_operand")
+		      (ior (match_test "TARGET_SHIFT1")
+			   (match_test "optimize_function_for_size_p (cfun)")))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "SI")])
+
+(define_insn "*ashl<mode>3_cconly"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (ashift:SWI (match_operand:SWI 1 "register_operand" "0")
+		      (match_operand:QI 2 "<shift_immediate_operand>" "<S>"))
+	  (const_int 0)))
+   (clobber (match_scratch:SWI 0 "=<r>"))]
+  "(optimize_function_for_size_p (cfun)
+    || !TARGET_PARTIAL_FLAG_REG_STALL
+    || (operands[2] == const1_rtx
+	&& (TARGET_SHIFT1
+	    || TARGET_DOUBLE_WITH_ADD)))
+   && ix86_match_ccmode (insn, CCGOCmode)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ALU:
+      gcc_assert (operands[2] == const1_rtx);
+      return "add{<imodesuffix>}\t%0, %0";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "sal{<imodesuffix>}\t%0";
+      else
+	return "sal{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set (attr "type")
+     (cond [(and (and (match_test "TARGET_DOUBLE_WITH_ADD")
+		      (match_operand 0 "register_operand"))
+		 (match_operand 2 "const1_operand"))
+	      (const_string "alu")
+	   ]
+	   (const_string "ishift")))
+   (set (attr "length_immediate")
+     (if_then_else
+       (ior (eq_attr "type" "alu")
+	    (and (eq_attr "type" "ishift")
+		 (and (match_operand 2 "const1_operand")
+		      (ior (match_test "TARGET_SHIFT1")
+			   (match_test "optimize_function_for_size_p (cfun)")))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+;; See comment above `ashl<mode>3' about how this works.
+
+(define_expand "<shift_insn><mode>3"
+  [(set (match_operand:SDWIM 0 "<shift_operand>")
+	(any_shiftrt:SDWIM (match_operand:SDWIM 1 "<shift_operand>")
+			   (match_operand:QI 2 "nonmemory_operand")))]
+  ""
+  "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
+
+;; Avoid useless masking of count operand.
+(define_insn "*<shift_insn><mode>3_mask"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
+	(any_shiftrt:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "0")
+	  (subreg:QI
+	    (and:SI
+	      (match_operand:SI 2 "register_operand" "c")
+	      (match_operand:SI 3 "const_int_operand" "n")) 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1"
+{
+  return "<shift>{<imodesuffix>}\t{%b2, %0|%0, %b2}";
+}
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "*<shift_insn><mode>3_doubleword"
+  [(set (match_operand:DWI 0 "register_operand" "=r")
+	(any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0")
+			 (match_operand:QI 2 "nonmemory_operand" "<S>c")))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "#"
+  "(optimize && flag_peephole2) ? epilogue_completed : reload_completed"
+  [(const_int 0)]
+  "ix86_split_<shift_insn> (operands, NULL_RTX, <MODE>mode); DONE;"
+  [(set_attr "type" "multi")])
+
+;; By default we don't ask for a scratch register, because when DWImode
+;; values are manipulated, registers are already at a premium.  But if
+;; we have one handy, we won't turn it away.
+
+(define_peephole2
+  [(match_scratch:DWIH 3 "r")
+   (parallel [(set (match_operand:<DWI> 0 "register_operand")
+		   (any_shiftrt:<DWI>
+		     (match_operand:<DWI> 1 "register_operand")
+		     (match_operand:QI 2 "nonmemory_operand")))
+	      (clobber (reg:CC FLAGS_REG))])
+   (match_dup 3)]
+  "TARGET_CMOVE"
+  [(const_int 0)]
+  "ix86_split_<shift_insn> (operands, operands[3], <DWI>mode); DONE;")
+
+(define_insn "x86_64_shrd"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
+        (ior:DI (ashiftrt:DI (match_dup 0)
+		  (match_operand:QI 2 "nonmemory_operand" "Jc"))
+		(ashift:DI (match_operand:DI 1 "register_operand" "r")
+		  (minus:QI (const_int 64) (match_dup 2)))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT"
+  "shrd{q}\t{%s2%1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "DI")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "vector")
+   (set_attr "bdver1_decode" "vector")])
+
+(define_insn "x86_shrd"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "+r*m")
+        (ior:SI (ashiftrt:SI (match_dup 0)
+		  (match_operand:QI 2 "nonmemory_operand" "Ic"))
+		(ashift:SI (match_operand:SI 1 "register_operand" "r")
+		  (minus:QI (const_int 32) (match_dup 2)))))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "shrd{l}\t{%s2%1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "SI")
+   (set_attr "pent_pair" "np")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "vector")
+   (set_attr "bdver1_decode" "vector")])
+
+(define_insn "ashrdi3_cvt"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=*d,rm")
+	(ashiftrt:DI (match_operand:DI 1 "nonimmediate_operand" "*a,0")
+		     (match_operand:QI 2 "const_int_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && INTVAL (operands[2]) == 63
+   && (TARGET_USE_CLTD || optimize_function_for_size_p (cfun))
+   && ix86_binary_operator_ok (ASHIFTRT, DImode, operands)"
+  "@
+   {cqto|cqo}
+   sar{q}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "imovx,ishift")
+   (set_attr "prefix_0f" "0,*")
+   (set_attr "length_immediate" "0,*")
+   (set_attr "modrm" "0,1")
+   (set_attr "mode" "DI")])
+
+(define_insn "ashrsi3_cvt"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=*d,rm")
+	(ashiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "*a,0")
+		     (match_operand:QI 2 "const_int_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "INTVAL (operands[2]) == 31
+   && (TARGET_USE_CLTD || optimize_function_for_size_p (cfun))
+   && ix86_binary_operator_ok (ASHIFTRT, SImode, operands)"
+  "@
+   {cltd|cdq}
+   sar{l}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "imovx,ishift")
+   (set_attr "prefix_0f" "0,*")
+   (set_attr "length_immediate" "0,*")
+   (set_attr "modrm" "0,1")
+   (set_attr "mode" "SI")])
+
+(define_insn "*ashrsi3_cvt_zext"
+  [(set (match_operand:DI 0 "register_operand" "=*d,r")
+	(zero_extend:DI
+	  (ashiftrt:SI (match_operand:SI 1 "register_operand" "*a,0")
+		       (match_operand:QI 2 "const_int_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && INTVAL (operands[2]) == 31
+   && (TARGET_USE_CLTD || optimize_function_for_size_p (cfun))
+   && ix86_binary_operator_ok (ASHIFTRT, SImode, operands)"
+  "@
+   {cltd|cdq}
+   sar{l}\t{%2, %k0|%k0, %2}"
+  [(set_attr "type" "imovx,ishift")
+   (set_attr "prefix_0f" "0,*")
+   (set_attr "length_immediate" "0,*")
+   (set_attr "modrm" "0,1")
+   (set_attr "mode" "SI")])
+
+(define_expand "x86_shift<mode>_adj_3"
+  [(use (match_operand:SWI48 0 "register_operand"))
+   (use (match_operand:SWI48 1 "register_operand"))
+   (use (match_operand:QI 2 "register_operand"))]
+  ""
+{
+  rtx label = gen_label_rtx ();
+  rtx tmp;
+
+  emit_insn (gen_testqi_ccz_1 (operands[2],
+			       GEN_INT (GET_MODE_BITSIZE (<MODE>mode))));
+
+  tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
+  tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
+  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+			      gen_rtx_LABEL_REF (VOIDmode, label),
+			      pc_rtx);
+  tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+  JUMP_LABEL (tmp) = label;
+
+  emit_move_insn (operands[0], operands[1]);
+  emit_insn (gen_ashr<mode>3_cvt (operands[1], operands[1],
+				  GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1)));
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  DONE;
+})
+
+(define_insn "*bmi2_<shift_insn><mode>3_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			   (match_operand:SWI48 2 "register_operand" "r")))]
+  "TARGET_BMI2"
+  "<shift>x\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*<shift_insn><mode>3_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
+	(any_shiftrt:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm")
+	  (match_operand:QI 2 "nonmemory_operand" "c<S>,r")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ISHIFTX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<shift>{<imodesuffix>}\t%0";
+      else
+	return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set_attr "isa" "*,bmi2")
+   (set_attr "type" "ishift,ishiftx")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 2 "const1_operand")
+	    (ior (match_test "TARGET_SHIFT1")
+		 (match_test "optimize_function_for_size_p (cfun)")))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+	(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+			   (match_operand:QI 2 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+	(any_shiftrt:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (<MODE>mode, operands[2]);")
+
+(define_insn "*bmi2_<shift_insn>si3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "rm")
+			  (match_operand:SI 2 "register_operand" "r"))))]
+  "TARGET_64BIT && TARGET_BMI2"
+  "<shift>x\t{%2, %1, %k0|%k0, %1, %2}"
+  [(set_attr "type" "ishiftx")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<shift_insn>si3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(zero_extend:DI
+	  (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "0,rm")
+			  (match_operand:QI 2 "nonmemory_operand" "cI,r"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (<CODE>, SImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ISHIFTX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<shift>{l}\t%k0";
+      else
+	return "<shift>{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set_attr "isa" "*,bmi2")
+   (set_attr "type" "ishift,ishiftx")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 2 "const1_operand")
+	    (ior (match_test "TARGET_SHIFT1")
+		 (match_test "optimize_function_for_size_p (cfun)")))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "SI")])
+
+;; Convert shift to the shiftx pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(zero_extend:DI
+	  (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand")
+			  (match_operand:QI 2 "register_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2))))]
+  "operands[2] = gen_lowpart (SImode, operands[2]);")
+
+(define_insn "*<shift_insn><mode>3_1"
+  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=<r>m")
+	(any_shiftrt:SWI12
+	  (match_operand:SWI12 1 "nonimmediate_operand" "0")
+	  (match_operand:QI 2 "nonmemory_operand" "c<S>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
+  if (operands[2] == const1_rtx
+      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+    return "<shift>{<imodesuffix>}\t%0";
+  else
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
+}
+  [(set_attr "type" "ishift")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 2 "const1_operand")
+	    (ior (match_test "TARGET_SHIFT1")
+		 (match_test "optimize_function_for_size_p (cfun)")))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*<shift_insn>qi3_1_slp"
+  [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
+	(any_shiftrt:QI (match_dup 0)
+			(match_operand:QI 1 "nonmemory_operand" "cI")))
+   (clobber (reg:CC FLAGS_REG))]
+  "(optimize_function_for_size_p (cfun)
+    || !TARGET_PARTIAL_REG_STALL
+    || (operands[1] == const1_rtx
+	&& TARGET_SHIFT1))"
+{
+  if (operands[1] == const1_rtx
+      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+    return "<shift>{b}\t%0";
+  else
+    return "<shift>{b}\t{%1, %0|%0, %1}";
+}
+  [(set_attr "type" "ishift1")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 1 "const1_operand")
+	    (ior (match_test "TARGET_SHIFT1")
+		 (match_test "optimize_function_for_size_p (cfun)")))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "QI")])
+
+;; This pattern can't accept a variable shift count, since shifts by
+;; zero don't affect the flags.  We assume that shifts by constant
+;; zero are optimized away.
+(define_insn "*<shift_insn><mode>3_cmp"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (any_shiftrt:SWI
+	    (match_operand:SWI 1 "nonimmediate_operand" "0")
+	    (match_operand:QI 2 "<shift_immediate_operand>" "<S>"))
+	  (const_int 0)))
+   (set (match_operand:SWI 0 "nonimmediate_operand" "=<r>m")
+	(any_shiftrt:SWI (match_dup 1) (match_dup 2)))]
+  "(optimize_function_for_size_p (cfun)
+    || !TARGET_PARTIAL_FLAG_REG_STALL
+    || (operands[2] == const1_rtx
+	&& TARGET_SHIFT1))
+   && ix86_match_ccmode (insn, CCGOCmode)
+   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
+  if (operands[2] == const1_rtx
+      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+    return "<shift>{<imodesuffix>}\t%0";
+  else
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
+}
+  [(set_attr "type" "ishift")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 2 "const1_operand")
+	    (ior (match_test "TARGET_SHIFT1")
+		 (match_test "optimize_function_for_size_p (cfun)")))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*<shift_insn>si3_cmp_zext"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (any_shiftrt:SI (match_operand:SI 1 "register_operand" "0")
+			  (match_operand:QI 2 "const_1_to_31_operand" "I"))
+	  (const_int 0)))
+   (set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2))))]
+  "TARGET_64BIT
+   && (optimize_function_for_size_p (cfun)
+       || !TARGET_PARTIAL_FLAG_REG_STALL
+       || (operands[2] == const1_rtx
+	   && TARGET_SHIFT1))
+   && ix86_match_ccmode (insn, CCGOCmode)
+   && ix86_binary_operator_ok (<CODE>, SImode, operands)"
+{
+  if (operands[2] == const1_rtx
+      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+    return "<shift>{l}\t%k0";
+  else
+    return "<shift>{l}\t{%2, %k0|%k0, %2}";
+}
+  [(set_attr "type" "ishift")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 2 "const1_operand")
+	    (ior (match_test "TARGET_SHIFT1")
+		 (match_test "optimize_function_for_size_p (cfun)")))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "SI")])
+
+(define_insn "*<shift_insn><mode>3_cconly"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (any_shiftrt:SWI
+	    (match_operand:SWI 1 "register_operand" "0")
+	    (match_operand:QI 2 "<shift_immediate_operand>" "<S>"))
+	  (const_int 0)))
+   (clobber (match_scratch:SWI 0 "=<r>"))]
+  "(optimize_function_for_size_p (cfun)
+    || !TARGET_PARTIAL_FLAG_REG_STALL
+    || (operands[2] == const1_rtx
+	&& TARGET_SHIFT1))
+   && ix86_match_ccmode (insn, CCGOCmode)"
+{
+  if (operands[2] == const1_rtx
+      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+    return "<shift>{<imodesuffix>}\t%0";
+  else
+    return "<shift>{<imodesuffix>}\t{%2, %0|%0, %2}";
+}
+  [(set_attr "type" "ishift")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 2 "const1_operand")
+	    (ior (match_test "TARGET_SHIFT1")
+		 (match_test "optimize_function_for_size_p (cfun)")))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+;; Rotate instructions
+
+(define_expand "<rotate_insn>ti3"
+  [(set (match_operand:TI 0 "register_operand")
+	(any_rotate:TI (match_operand:TI 1 "register_operand")
+		       (match_operand:QI 2 "nonmemory_operand")))]
+  "TARGET_64BIT"
+{
+  if (const_1_to_63_operand (operands[2], VOIDmode))
+    emit_insn (gen_ix86_<rotate_insn>ti3_doubleword
+		(operands[0], operands[1], operands[2]));
+  else
+    FAIL;
+
+  DONE;
+})
+
+(define_expand "<rotate_insn>di3"
+  [(set (match_operand:DI 0 "shiftdi_operand")
+	(any_rotate:DI (match_operand:DI 1 "shiftdi_operand")
+		       (match_operand:QI 2 "nonmemory_operand")))]
+ ""
+{
+  if (TARGET_64BIT)
+    ix86_expand_binary_operator (<CODE>, DImode, operands);
+  else if (const_1_to_31_operand (operands[2], VOIDmode))
+    emit_insn (gen_ix86_<rotate_insn>di3_doubleword
+		(operands[0], operands[1], operands[2]));
+  else
+    FAIL;
+
+  DONE;
+})
+
+(define_expand "<rotate_insn><mode>3"
+  [(set (match_operand:SWIM124 0 "nonimmediate_operand")
+	(any_rotate:SWIM124 (match_operand:SWIM124 1 "nonimmediate_operand")
+			    (match_operand:QI 2 "nonmemory_operand")))]
+  ""
+  "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
+
+;; Avoid useless masking of count operand.
+(define_insn "*<rotate_insn><mode>3_mask"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm")
+	(any_rotate:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "0")
+	  (subreg:QI
+	    (and:SI
+	      (match_operand:SI 2 "register_operand" "c")
+	      (match_operand:SI 3 "const_int_operand" "n")) 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1"
+{
+  return "<rotate>{<imodesuffix>}\t{%b2, %0|%0, %b2}";
+}
+  [(set_attr "type" "rotate")
+   (set_attr "mode" "<MODE>")])
+
+;; Implement rotation using two double-precision
+;; shift instructions and a scratch register.
+
+(define_insn_and_split "ix86_rotl<dwi>3_doubleword"
+ [(set (match_operand:<DWI> 0 "register_operand" "=r")
+       (rotate:<DWI> (match_operand:<DWI> 1 "register_operand" "0")
+		     (match_operand:QI 2 "<shift_immediate_operand>" "<S>")))
+  (clobber (reg:CC FLAGS_REG))
+  (clobber (match_scratch:DWIH 3 "=&r"))]
+ ""
+ "#"
+ "reload_completed"
+ [(set (match_dup 3) (match_dup 4))
+  (parallel
+   [(set (match_dup 4)
+	 (ior:DWIH (ashift:DWIH (match_dup 4) (match_dup 2))
+		   (lshiftrt:DWIH (match_dup 5)
+				  (minus:QI (match_dup 6) (match_dup 2)))))
+    (clobber (reg:CC FLAGS_REG))])
+  (parallel
+   [(set (match_dup 5)
+	 (ior:DWIH (ashift:DWIH (match_dup 5) (match_dup 2))
+		   (lshiftrt:DWIH (match_dup 3)
+				  (minus:QI (match_dup 6) (match_dup 2)))))
+    (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[6] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode));
+
+  split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
+})
+
+(define_insn_and_split "ix86_rotr<dwi>3_doubleword"
+ [(set (match_operand:<DWI> 0 "register_operand" "=r")
+       (rotatert:<DWI> (match_operand:<DWI> 1 "register_operand" "0")
+		       (match_operand:QI 2 "<shift_immediate_operand>" "<S>")))
+  (clobber (reg:CC FLAGS_REG))
+  (clobber (match_scratch:DWIH 3 "=&r"))]
+ ""
+ "#"
+ "reload_completed"
+ [(set (match_dup 3) (match_dup 4))
+  (parallel
+   [(set (match_dup 4)
+	 (ior:DWIH (ashiftrt:DWIH (match_dup 4) (match_dup 2))
+		   (ashift:DWIH (match_dup 5)
+				(minus:QI (match_dup 6) (match_dup 2)))))
+    (clobber (reg:CC FLAGS_REG))])
+  (parallel
+   [(set (match_dup 5)
+	 (ior:DWIH (ashiftrt:DWIH (match_dup 5) (match_dup 2))
+		   (ashift:DWIH (match_dup 3)
+				(minus:QI (match_dup 6) (match_dup 2)))))
+    (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[6] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode));
+
+  split_double_mode (<DWI>mode, &operands[0], 1, &operands[4], &operands[5]);
+})
+
+(define_insn "*bmi2_rorx<mode>3_1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+			(match_operand:QI 2 "immediate_operand" "<S>")))]
+  "TARGET_BMI2"
+  "rorx\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "rotatex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*<rotate_insn><mode>3_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
+	(any_rotate:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm")
+	  (match_operand:QI 2 "nonmemory_operand" "c<S>,<S>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ROTATEX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<rotate>{<imodesuffix>}\t%0";
+      else
+	return "<rotate>{<imodesuffix>}\t{%2, %0|%0, %2}";
+    }
+}
+  [(set_attr "isa" "*,bmi2")
+   (set_attr "type" "rotate,rotatex")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (eq_attr "type" "rotate")
+	    (and (match_operand 2 "const1_operand")
+		 (ior (match_test "TARGET_SHIFT1")
+		      (match_test "optimize_function_for_size_p (cfun)"))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+;; Convert rotate to the rotatex pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+	(rotate:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+		      (match_operand:QI 2 "immediate_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+	(rotatert:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  operands[2]
+    = GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - INTVAL (operands[2]));
+})
+
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+	(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+			(match_operand:QI 2 "immediate_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+	(rotatert:SWI48 (match_dup 1) (match_dup 2)))])
+
+(define_insn "*bmi2_rorxsi3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (rotatert:SI (match_operand:SI 1 "nonimmediate_operand" "rm")
+		       (match_operand:QI 2 "immediate_operand" "I"))))]
+  "TARGET_64BIT && TARGET_BMI2"
+  "rorx\t{%2, %1, %k0|%k0, %1, %2}"
+  [(set_attr "type" "rotatex")
+   (set_attr "mode" "SI")])
+
+(define_insn "*<rotate_insn>si3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(zero_extend:DI
+	  (any_rotate:SI (match_operand:SI 1 "nonimmediate_operand" "0,rm")
+			 (match_operand:QI 2 "nonmemory_operand" "cI,I"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && ix86_binary_operator_ok (<CODE>, SImode, operands)"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_ROTATEX:
+      return "#";
+
+    default:
+      if (operands[2] == const1_rtx
+	  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+	return "<rotate>{l}\t%k0";
+      else
+	return "<rotate>{l}\t{%2, %k0|%k0, %2}";
+    }
+}
+  [(set_attr "isa" "*,bmi2")
+   (set_attr "type" "rotate,rotatex")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (eq_attr "type" "rotate")
+	    (and (match_operand 2 "const1_operand")
+		 (ior (match_test "TARGET_SHIFT1")
+		      (match_test "optimize_function_for_size_p (cfun)"))))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "SI")])
+
+;; Convert rotate to the rotatex pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(zero_extend:DI
+	  (rotate:SI (match_operand:SI 1 "nonimmediate_operand")
+		     (match_operand:QI 2 "immediate_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2))))]
+{
+  operands[2]
+    = GEN_INT (GET_MODE_BITSIZE (SImode) - INTVAL (operands[2]));
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(zero_extend:DI
+	  (rotatert:SI (match_operand:SI 1 "nonimmediate_operand")
+		       (match_operand:QI 2 "immediate_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+	(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2))))])
+
+(define_insn "*<rotate_insn><mode>3_1"
+  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=<r>m")
+	(any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0")
+			  (match_operand:QI 2 "nonmemory_operand" "c<S>")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
+  if (operands[2] == const1_rtx
+      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+    return "<rotate>{<imodesuffix>}\t%0";
+  else
+    return "<rotate>{<imodesuffix>}\t{%2, %0|%0, %2}";
+}
+  [(set_attr "type" "rotate")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 2 "const1_operand")
+	    (ior (match_test "TARGET_SHIFT1")
+		 (match_test "optimize_function_for_size_p (cfun)")))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*<rotate_insn>qi3_1_slp"
+  [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
+	(any_rotate:QI (match_dup 0)
+		       (match_operand:QI 1 "nonmemory_operand" "cI")))
+   (clobber (reg:CC FLAGS_REG))]
+  "(optimize_function_for_size_p (cfun)
+    || !TARGET_PARTIAL_REG_STALL
+    || (operands[1] == const1_rtx
+	&& TARGET_SHIFT1))"
+{
+  if (operands[1] == const1_rtx
+      && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun)))
+    return "<rotate>{b}\t%0";
+  else
+    return "<rotate>{b}\t{%1, %0|%0, %1}";
+}
+  [(set_attr "type" "rotate1")
+   (set (attr "length_immediate")
+     (if_then_else
+       (and (match_operand 1 "const1_operand")
+	    (ior (match_test "TARGET_SHIFT1")
+		 (match_test "optimize_function_for_size_p (cfun)")))
+       (const_string "0")
+       (const_string "*")))
+   (set_attr "mode" "QI")])
+
+(define_split
+ [(set (match_operand:HI 0 "register_operand")
+       (any_rotate:HI (match_dup 0) (const_int 8)))
+  (clobber (reg:CC FLAGS_REG))]
+ "reload_completed
+  && (TARGET_USE_XCHGB || optimize_function_for_size_p (cfun))"
+ [(parallel [(set (strict_low_part (match_dup 0))
+		  (bswap:HI (match_dup 0)))
+	     (clobber (reg:CC FLAGS_REG))])])
+
+;; Bit set / bit test instructions
+
+(define_expand "extv"
+  [(set (match_operand:SI 0 "register_operand")
+	(sign_extract:SI (match_operand:SI 1 "register_operand")
+			 (match_operand:SI 2 "const8_operand")
+			 (match_operand:SI 3 "const8_operand")))]
+  ""
+{
+  /* Handle extractions from %ah et al.  */
+  if (INTVAL (operands[2]) != 8 || INTVAL (operands[3]) != 8)
+    FAIL;
+
+  /* From mips.md: extract_bit_field doesn't verify that our source
+     matches the predicate, so check it again here.  */
+  if (! ext_register_operand (operands[1], VOIDmode))
+    FAIL;
+})
+
+(define_expand "extzv"
+  [(set (match_operand:SI 0 "register_operand")
+	(zero_extract:SI (match_operand 1 "ext_register_operand")
+			 (match_operand:SI 2 "const8_operand")
+			 (match_operand:SI 3 "const8_operand")))]
+  ""
+{
+  /* Handle extractions from %ah et al.  */
+  if (INTVAL (operands[2]) != 8 || INTVAL (operands[3]) != 8)
+    FAIL;
+
+  /* From mips.md: extract_bit_field doesn't verify that our source
+     matches the predicate, so check it again here.  */
+  if (! ext_register_operand (operands[1], VOIDmode))
+    FAIL;
+})
+
+(define_expand "insv"
+  [(set (zero_extract (match_operand 0 "register_operand")
+		      (match_operand 1 "const_int_operand")
+		      (match_operand 2 "const_int_operand"))
+        (match_operand 3 "register_operand"))]
+  ""
+{
+  rtx (*gen_mov_insv_1) (rtx, rtx);
+
+  if (ix86_expand_pinsr (operands))
+    DONE;
+
+  /* Handle insertions to %ah et al.  */
+  if (INTVAL (operands[1]) != 8 || INTVAL (operands[2]) != 8)
+    FAIL;
+
+  /* From mips.md: insert_bit_field doesn't verify that our source
+     matches the predicate, so check it again here.  */
+  if (! ext_register_operand (operands[0], VOIDmode))
+    FAIL;
+
+  gen_mov_insv_1 = (TARGET_64BIT
+		    ? gen_movdi_insv_1 : gen_movsi_insv_1);
+
+  emit_insn (gen_mov_insv_1 (operands[0], operands[3]));
+  DONE;
+})
+
+;; %%% bts, btr, btc, bt.
+;; In general these instructions are *slow* when applied to memory,
+;; since they enforce atomic operation.  When applied to registers,
+;; it depends on the cpu implementation.  They're never faster than
+;; the corresponding and/ior/xor operations, so with 32-bit there's
+;; no point.  But in 64-bit, we can't hold the relevant immediates
+;; within the instruction itself, so operating on bits in the high
+;; 32-bits of a register becomes easier.
+;;
+;; These are slow on Nocona, but fast on Athlon64.  We do require the use
+;; of btrq and btcq for corner cases of post-reload expansion of absdf and
+;; negdf respectively, so they can never be disabled entirely.
+
+(define_insn "*btsq"
+  [(set (zero_extract:DI (match_operand:DI 0 "register_operand" "+r")
+			 (const_int 1)
+			 (match_operand:DI 1 "const_0_to_63_operand"))
+	(const_int 1))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
+  "bts{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "DI")])
+
+(define_insn "*btrq"
+  [(set (zero_extract:DI (match_operand:DI 0 "register_operand" "+r")
+			 (const_int 1)
+			 (match_operand:DI 1 "const_0_to_63_operand"))
+	(const_int 0))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
+  "btr{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "DI")])
+
+(define_insn "*btcq"
+  [(set (zero_extract:DI (match_operand:DI 0 "register_operand" "+r")
+			 (const_int 1)
+			 (match_operand:DI 1 "const_0_to_63_operand"))
+	(not:DI (zero_extract:DI (match_dup 0) (const_int 1) (match_dup 1))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && (TARGET_USE_BT || reload_completed)"
+  "btc{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "DI")])
+
+;; Allow Nocona to avoid these instructions if a register is available.
+
+(define_peephole2
+  [(match_scratch:DI 2 "r")
+   (parallel [(set (zero_extract:DI
+		     (match_operand:DI 0 "register_operand")
+		     (const_int 1)
+		     (match_operand:DI 1 "const_0_to_63_operand"))
+		   (const_int 1))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_64BIT && !TARGET_USE_BT"
+  [(const_int 0)]
+{
+  HOST_WIDE_INT i = INTVAL (operands[1]), hi, lo;
+  rtx op1;
+
+  if (HOST_BITS_PER_WIDE_INT >= 64)
+    lo = (HOST_WIDE_INT)1 << i, hi = 0;
+  else if (i < HOST_BITS_PER_WIDE_INT)
+    lo = (HOST_WIDE_INT)1 << i, hi = 0;
+  else
+    lo = 0, hi = (HOST_WIDE_INT)1 << (i - HOST_BITS_PER_WIDE_INT);
+
+  op1 = immed_double_const (lo, hi, DImode);
+  if (i >= 31)
+    {
+      emit_move_insn (operands[2], op1);
+      op1 = operands[2];
+    }
+
+  emit_insn (gen_iordi3 (operands[0], operands[0], op1));
+  DONE;
+})
+
+(define_peephole2
+  [(match_scratch:DI 2 "r")
+   (parallel [(set (zero_extract:DI
+		     (match_operand:DI 0 "register_operand")
+		     (const_int 1)
+		     (match_operand:DI 1 "const_0_to_63_operand"))
+		   (const_int 0))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_64BIT && !TARGET_USE_BT"
+  [(const_int 0)]
+{
+  HOST_WIDE_INT i = INTVAL (operands[1]), hi, lo;
+  rtx op1;
+
+  if (HOST_BITS_PER_WIDE_INT >= 64)
+    lo = (HOST_WIDE_INT)1 << i, hi = 0;
+  else if (i < HOST_BITS_PER_WIDE_INT)
+    lo = (HOST_WIDE_INT)1 << i, hi = 0;
+  else
+    lo = 0, hi = (HOST_WIDE_INT)1 << (i - HOST_BITS_PER_WIDE_INT);
+
+  op1 = immed_double_const (~lo, ~hi, DImode);
+  if (i >= 32)
+    {
+      emit_move_insn (operands[2], op1);
+      op1 = operands[2];
+    }
+
+  emit_insn (gen_anddi3 (operands[0], operands[0], op1));
+  DONE;
+})
+
+(define_peephole2
+  [(match_scratch:DI 2 "r")
+   (parallel [(set (zero_extract:DI
+		     (match_operand:DI 0 "register_operand")
+		     (const_int 1)
+		     (match_operand:DI 1 "const_0_to_63_operand"))
+	      (not:DI (zero_extract:DI
+			(match_dup 0) (const_int 1) (match_dup 1))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_64BIT && !TARGET_USE_BT"
+  [(const_int 0)]
+{
+  HOST_WIDE_INT i = INTVAL (operands[1]), hi, lo;
+  rtx op1;
+
+  if (HOST_BITS_PER_WIDE_INT >= 64)
+    lo = (HOST_WIDE_INT)1 << i, hi = 0;
+  else if (i < HOST_BITS_PER_WIDE_INT)
+    lo = (HOST_WIDE_INT)1 << i, hi = 0;
+  else
+    lo = 0, hi = (HOST_WIDE_INT)1 << (i - HOST_BITS_PER_WIDE_INT);
+
+  op1 = immed_double_const (lo, hi, DImode);
+  if (i >= 31)
+    {
+      emit_move_insn (operands[2], op1);
+      op1 = operands[2];
+    }
+
+  emit_insn (gen_xordi3 (operands[0], operands[0], op1));
+  DONE;
+})
+
+(define_insn "*bt<mode>"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:SWI48
+	    (match_operand:SWI48 0 "register_operand" "r")
+	    (const_int 1)
+	    (match_operand:SWI48 1 "x86_64_nonmemory_operand" "rN"))
+	  (const_int 0)))]
+  "TARGET_USE_BT || optimize_function_for_size_p (cfun)"
+  "bt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "<MODE>")])
+
+;; Store-flag instructions.
+
+;; For all sCOND expanders, also expand the compare or test insn that
+;; generates cc0.  Generate an equality comparison if `seq' or `sne'.
+
+(define_insn_and_split "*setcc_di_1"
+  [(set (match_operand:DI 0 "register_operand" "=q")
+	(match_operator:DI 1 "ix86_comparison_operator"
+	  [(reg FLAGS_REG) (const_int 0)]))]
+  "TARGET_64BIT && !TARGET_PARTIAL_REG_STALL"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (zero_extend:DI (match_dup 2)))]
+{
+  PUT_MODE (operands[1], QImode);
+  operands[2] = gen_lowpart (QImode, operands[0]);
+})
+
+(define_insn_and_split "*setcc_si_1_and"
+  [(set (match_operand:SI 0 "register_operand" "=q")
+	(match_operator:SI 1 "ix86_comparison_operator"
+	  [(reg FLAGS_REG) (const_int 0)]))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_PARTIAL_REG_STALL
+   && TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun)"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 0) (zero_extend:SI (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  PUT_MODE (operands[1], QImode);
+  operands[2] = gen_lowpart (QImode, operands[0]);
+})
+
+(define_insn_and_split "*setcc_si_1_movzbl"
+  [(set (match_operand:SI 0 "register_operand" "=q")
+	(match_operator:SI 1 "ix86_comparison_operator"
+	  [(reg FLAGS_REG) (const_int 0)]))]
+  "!TARGET_PARTIAL_REG_STALL
+   && (!TARGET_ZERO_EXTEND_WITH_AND || optimize_function_for_size_p (cfun))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (zero_extend:SI (match_dup 2)))]
+{
+  PUT_MODE (operands[1], QImode);
+  operands[2] = gen_lowpart (QImode, operands[0]);
+})
+
+(define_insn "*setcc_qi"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm")
+	(match_operator:QI 1 "ix86_comparison_operator"
+	  [(reg FLAGS_REG) (const_int 0)]))]
+  ""
+  "set%C1\t%0"
+  [(set_attr "type" "setcc")
+   (set_attr "mode" "QI")])
+
+(define_insn "*setcc_qi_slp"
+  [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand" "+qm"))
+	(match_operator:QI 1 "ix86_comparison_operator"
+	  [(reg FLAGS_REG) (const_int 0)]))]
+  ""
+  "set%C1\t%0"
+  [(set_attr "type" "setcc")
+   (set_attr "mode" "QI")])
+
+;; In general it is not safe to assume too much about CCmode registers,
+;; so simplify-rtx stops when it sees a second one.  Under certain
+;; conditions this is safe on x86, so help combine not create
+;;
+;;	seta	%al
+;;	testb	%al, %al
+;;	sete	%al
+
+(define_split
+  [(set (match_operand:QI 0 "nonimmediate_operand")
+	(ne:QI (match_operator 1 "ix86_comparison_operator"
+	         [(reg FLAGS_REG) (const_int 0)])
+	    (const_int 0)))]
+  ""
+  [(set (match_dup 0) (match_dup 1))]
+  "PUT_MODE (operands[1], QImode);")
+
+(define_split
+  [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand"))
+	(ne:QI (match_operator 1 "ix86_comparison_operator"
+	         [(reg FLAGS_REG) (const_int 0)])
+	    (const_int 0)))]
+  ""
+  [(set (match_dup 0) (match_dup 1))]
+  "PUT_MODE (operands[1], QImode);")
+
+(define_split
+  [(set (match_operand:QI 0 "nonimmediate_operand")
+	(eq:QI (match_operator 1 "ix86_comparison_operator"
+	         [(reg FLAGS_REG) (const_int 0)])
+	    (const_int 0)))]
+  ""
+  [(set (match_dup 0) (match_dup 1))]
+{
+  rtx new_op1 = copy_rtx (operands[1]);
+  operands[1] = new_op1;
+  PUT_MODE (new_op1, QImode);
+  PUT_CODE (new_op1, ix86_reverse_condition (GET_CODE (new_op1),
+					     GET_MODE (XEXP (new_op1, 0))));
+
+  /* Make sure that (a) the CCmode we have for the flags is strong
+     enough for the reversed compare or (b) we have a valid FP compare.  */
+  if (! ix86_comparison_operator (new_op1, VOIDmode))
+    FAIL;
+})
+
+(define_split
+  [(set (strict_low_part (match_operand:QI 0 "nonimmediate_operand"))
+	(eq:QI (match_operator 1 "ix86_comparison_operator"
+	         [(reg FLAGS_REG) (const_int 0)])
+	    (const_int 0)))]
+  ""
+  [(set (match_dup 0) (match_dup 1))]
+{
+  rtx new_op1 = copy_rtx (operands[1]);
+  operands[1] = new_op1;
+  PUT_MODE (new_op1, QImode);
+  PUT_CODE (new_op1, ix86_reverse_condition (GET_CODE (new_op1),
+					     GET_MODE (XEXP (new_op1, 0))));
+
+  /* Make sure that (a) the CCmode we have for the flags is strong
+     enough for the reversed compare or (b) we have a valid FP compare.  */
+  if (! ix86_comparison_operator (new_op1, VOIDmode))
+    FAIL;
+})
+
+;; The SSE store flag instructions saves 0 or 0xffffffff to the result.
+;; subsequent logical operations are used to imitate conditional moves.
+;; 0xffffffff is NaN, but not in normalized form, so we can't represent
+;; it directly.
+
+(define_insn "setcc_<mode>_sse"
+  [(set (match_operand:MODEF 0 "register_operand" "=x,x")
+	(match_operator:MODEF 3 "sse_comparison_operator"
+	  [(match_operand:MODEF 1 "register_operand" "0,x")
+	   (match_operand:MODEF 2 "nonimmediate_operand" "xm,xm")]))]
+  "SSE_FLOAT_MODE_P (<MODE>mode)"
+  "@
+   cmp%D3<ssemodesuffix>\t{%2, %0|%0, %2}
+   vcmp%D3<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+;; Basic conditional jump instructions.
+;; We ignore the overflow flag for signed branch instructions.
+
+(define_insn "*jcc_1"
+  [(set (pc)
+	(if_then_else (match_operator 1 "ix86_comparison_operator"
+				      [(reg FLAGS_REG) (const_int 0)])
+		      (label_ref (match_operand 0))
+		      (pc)))]
+  ""
+  "%+j%C1\t%l0"
+  [(set_attr "type" "ibr")
+   (set_attr "modrm" "0")
+   (set (attr "length")
+	   (if_then_else (and (ge (minus (match_dup 0) (pc))
+				  (const_int -126))
+			      (lt (minus (match_dup 0) (pc))
+				  (const_int 128)))
+	     (const_int 2)
+	     (const_int 6)))])
+
+(define_insn "*jcc_2"
+  [(set (pc)
+	(if_then_else (match_operator 1 "ix86_comparison_operator"
+				      [(reg FLAGS_REG) (const_int 0)])
+		      (pc)
+		      (label_ref (match_operand 0))))]
+  ""
+  "%+j%c1\t%l0"
+  [(set_attr "type" "ibr")
+   (set_attr "modrm" "0")
+   (set (attr "length")
+	   (if_then_else (and (ge (minus (match_dup 0) (pc))
+				  (const_int -126))
+			      (lt (minus (match_dup 0) (pc))
+				  (const_int 128)))
+	     (const_int 2)
+	     (const_int 6)))])
+
+;; In general it is not safe to assume too much about CCmode registers,
+;; so simplify-rtx stops when it sees a second one.  Under certain
+;; conditions this is safe on x86, so help combine not create
+;;
+;;	seta	%al
+;;	testb	%al, %al
+;;	je	Lfoo
+
+(define_split
+  [(set (pc)
+	(if_then_else (ne (match_operator 0 "ix86_comparison_operator"
+				      [(reg FLAGS_REG) (const_int 0)])
+			  (const_int 0))
+		      (label_ref (match_operand 1))
+		      (pc)))]
+  ""
+  [(set (pc)
+	(if_then_else (match_dup 0)
+		      (label_ref (match_dup 1))
+		      (pc)))]
+  "PUT_MODE (operands[0], VOIDmode);")
+
+(define_split
+  [(set (pc)
+	(if_then_else (eq (match_operator 0 "ix86_comparison_operator"
+				      [(reg FLAGS_REG) (const_int 0)])
+			  (const_int 0))
+		      (label_ref (match_operand 1))
+		      (pc)))]
+  ""
+  [(set (pc)
+	(if_then_else (match_dup 0)
+		      (label_ref (match_dup 1))
+		      (pc)))]
+{
+  rtx new_op0 = copy_rtx (operands[0]);
+  operands[0] = new_op0;
+  PUT_MODE (new_op0, VOIDmode);
+  PUT_CODE (new_op0, ix86_reverse_condition (GET_CODE (new_op0),
+					     GET_MODE (XEXP (new_op0, 0))));
+
+  /* Make sure that (a) the CCmode we have for the flags is strong
+     enough for the reversed compare or (b) we have a valid FP compare.  */
+  if (! ix86_comparison_operator (new_op0, VOIDmode))
+    FAIL;
+})
+
+;; zero_extend in SImode is correct also for DImode, since this is what combine
+;; pass generates from shift insn with QImode operand.  Actually, the mode
+;; of operand 2 (bit offset operand) doesn't matter since bt insn takes
+;; appropriate modulo of the bit offset value.
+
+(define_insn_and_split "*jcc_bt<mode>"
+  [(set (pc)
+  	(if_then_else (match_operator 0 "bt_comparison_operator"
+			[(zero_extract:SWI48
+			   (match_operand:SWI48 1 "register_operand" "r")
+			   (const_int 1)
+			   (zero_extend:SI
+			     (match_operand:QI 2 "register_operand" "r")))
+			 (const_int 0)])
+		      (label_ref (match_operand 3))
+		      (pc)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT || optimize_function_for_size_p (cfun)"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:SWI48
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (const_int 0)))
+   (set (pc)
+	(if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+		      (label_ref (match_dup 3))
+		      (pc)))]
+{
+  operands[2] = simplify_gen_subreg (<MODE>mode, operands[2], QImode, 0);
+
+  PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+})
+
+;; Like *jcc_bt<mode>, but expect a SImode operand 2 instead of QImode
+;; zero extended to SImode.
+(define_insn_and_split "*jcc_bt<mode>_1"
+  [(set (pc)
+  	(if_then_else (match_operator 0 "bt_comparison_operator"
+			[(zero_extract:SWI48
+			   (match_operand:SWI48 1 "register_operand" "r")
+			   (const_int 1)
+			   (match_operand:SI 2 "register_operand" "r"))
+			 (const_int 0)])
+		      (label_ref (match_operand 3))
+		      (pc)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT || optimize_function_for_size_p (cfun)"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:SWI48
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (const_int 0)))
+   (set (pc)
+	(if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+		      (label_ref (match_dup 3))
+		      (pc)))]
+{
+  operands[2] = simplify_gen_subreg (<MODE>mode, operands[2], SImode, 0);
+
+  PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+})
+
+;; Avoid useless masking of bit offset operand.  "and" in SImode is correct
+;; also for DImode, this is what combine produces.
+(define_insn_and_split "*jcc_bt<mode>_mask"
+  [(set (pc)
+  	(if_then_else (match_operator 0 "bt_comparison_operator"
+			[(zero_extract:SWI48
+			   (match_operand:SWI48 1 "register_operand" "r")
+			   (const_int 1)
+			   (and:SI
+			     (match_operand:SI 2 "register_operand" "r")
+			     (match_operand:SI 3 "const_int_operand" "n")))])
+		      (label_ref (match_operand 4))
+		      (pc)))
+   (clobber (reg:CC FLAGS_REG))]
+  "(TARGET_USE_BT || optimize_function_for_size_p (cfun))
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:SWI48
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (const_int 0)))
+   (set (pc)
+	(if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+		      (label_ref (match_dup 4))
+		      (pc)))]
+{
+  operands[2] = simplify_gen_subreg (<MODE>mode, operands[2], SImode, 0);
+
+  PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+})
+
+(define_insn_and_split "*jcc_btsi_1"
+  [(set (pc)
+  	(if_then_else (match_operator 0 "bt_comparison_operator"
+			[(and:SI
+			   (lshiftrt:SI
+			     (match_operand:SI 1 "register_operand" "r")
+			     (match_operand:QI 2 "register_operand" "r"))
+			   (const_int 1))
+			 (const_int 0)])
+		      (label_ref (match_operand 3))
+		      (pc)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_BT || optimize_function_for_size_p (cfun)"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:SI
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (const_int 0)))
+   (set (pc)
+	(if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+		      (label_ref (match_dup 3))
+		      (pc)))]
+{
+  operands[2] = simplify_gen_subreg (SImode, operands[2], QImode, 0);
+
+  PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));
+})
+
+;; avoid useless masking of bit offset operand
+(define_insn_and_split "*jcc_btsi_mask_1"
+  [(set (pc)
+  	(if_then_else
+	  (match_operator 0 "bt_comparison_operator"
+	    [(and:SI
+	       (lshiftrt:SI
+		 (match_operand:SI 1 "register_operand" "r")
+		 (subreg:QI
+		   (and:SI
+		     (match_operand:SI 2 "register_operand" "r")
+		     (match_operand:SI 3 "const_int_operand" "n")) 0))
+	       (const_int 1))
+	     (const_int 0)])
+	  (label_ref (match_operand 4))
+	  (pc)))
+   (clobber (reg:CC FLAGS_REG))]
+  "(TARGET_USE_BT || optimize_function_for_size_p (cfun))
+   && (INTVAL (operands[3]) & 0x1f) == 0x1f"
+  "#"
+  "&& 1"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC
+	  (zero_extract:SI
+	    (match_dup 1)
+	    (const_int 1)
+	    (match_dup 2))
+	  (const_int 0)))
+   (set (pc)
+	(if_then_else (match_op_dup 0 [(reg:CCC FLAGS_REG) (const_int 0)])
+		      (label_ref (match_dup 4))
+		      (pc)))]
+  "PUT_CODE (operands[0], reverse_condition (GET_CODE (operands[0])));")
+
+;; Define combination compare-and-branch fp compare instructions to help
+;; combine.
+
+(define_insn "*jcc<mode>_0_i387"
+  [(set (pc)
+	(if_then_else (match_operator:CCFP 0 "ix86_fp_comparison_operator"
+			[(match_operand:X87MODEF 1 "register_operand" "f")
+			 (match_operand:X87MODEF 2 "const0_operand")])
+	  (label_ref (match_operand 3))
+	  (pc)))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 4 "=a"))]
+  "TARGET_80387 && !TARGET_CMOVE"
+  "#")
+
+(define_insn "*jcc<mode>_0_r_i387"
+  [(set (pc)
+	(if_then_else (match_operator:CCFP 0 "ix86_fp_comparison_operator"
+			[(match_operand:X87MODEF 1 "register_operand" "f")
+			 (match_operand:X87MODEF 2 "const0_operand")])
+	  (pc)
+	  (label_ref (match_operand 3))))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 4 "=a"))]
+  "TARGET_80387 && !TARGET_CMOVE"
+  "#")
+
+(define_insn "*jccxf_i387"
+  [(set (pc)
+	(if_then_else (match_operator:CCFP 0 "ix86_fp_comparison_operator"
+			[(match_operand:XF 1 "register_operand" "f")
+			 (match_operand:XF 2 "register_operand" "f")])
+	  (label_ref (match_operand 3))
+	  (pc)))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 4 "=a"))]
+  "TARGET_80387 && !TARGET_CMOVE"
+  "#")
+
+(define_insn "*jccxf_r_i387"
+  [(set (pc)
+	(if_then_else (match_operator:CCFP 0 "ix86_fp_comparison_operator"
+			[(match_operand:XF 1 "register_operand" "f")
+			 (match_operand:XF 2 "register_operand" "f")])
+	  (pc)
+	  (label_ref (match_operand 3))))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 4 "=a"))]
+  "TARGET_80387 && !TARGET_CMOVE"
+  "#")
+
+(define_insn "*jcc<mode>_i387"
+  [(set (pc)
+	(if_then_else (match_operator:CCFP 0 "ix86_fp_comparison_operator"
+			[(match_operand:MODEF 1 "register_operand" "f")
+			 (match_operand:MODEF 2 "nonimmediate_operand" "fm")])
+	  (label_ref (match_operand 3))
+	  (pc)))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 4 "=a"))]
+  "TARGET_80387 && !TARGET_CMOVE"
+  "#")
+
+(define_insn "*jcc<mode>_r_i387"
+  [(set (pc)
+	(if_then_else (match_operator:CCFP 0 "ix86_fp_comparison_operator"
+			[(match_operand:MODEF 1 "register_operand" "f")
+			 (match_operand:MODEF 2 "nonimmediate_operand" "fm")])
+	  (pc)
+	  (label_ref (match_operand 3))))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 4 "=a"))]
+  "TARGET_80387 && !TARGET_CMOVE"
+  "#")
+
+(define_insn "*jccu<mode>_i387"
+  [(set (pc)
+	(if_then_else (match_operator:CCFPU 0 "ix86_fp_comparison_operator"
+			[(match_operand:X87MODEF 1 "register_operand" "f")
+			 (match_operand:X87MODEF 2 "register_operand" "f")])
+	  (label_ref (match_operand 3))
+	  (pc)))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 4 "=a"))]
+  "TARGET_80387 && !TARGET_CMOVE"
+  "#")
+
+(define_insn "*jccu<mode>_r_i387"
+  [(set (pc)
+	(if_then_else (match_operator:CCFPU 0 "ix86_fp_comparison_operator"
+			[(match_operand:X87MODEF 1 "register_operand" "f")
+			 (match_operand:X87MODEF 2 "register_operand" "f")])
+	  (pc)
+	  (label_ref (match_operand 3))))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 4 "=a"))]
+  "TARGET_80387 && !TARGET_CMOVE"
+  "#")
+
+(define_split
+  [(set (pc)
+	(if_then_else (match_operator 0 "ix86_fp_comparison_operator"
+			[(match_operand:X87MODEF 1 "register_operand")
+			 (match_operand:X87MODEF 2 "nonimmediate_operand")])
+	  (match_operand 3)
+	  (match_operand 4)))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))]
+  "TARGET_80387 && !TARGET_CMOVE
+   && reload_completed"
+  [(const_int 0)]
+{
+  ix86_split_fp_branch (GET_CODE (operands[0]), operands[1], operands[2],
+	                operands[3], operands[4], NULL_RTX);
+  DONE;
+})
+
+(define_split
+  [(set (pc)
+	(if_then_else (match_operator 0 "ix86_fp_comparison_operator"
+			[(match_operand:X87MODEF 1 "register_operand")
+			 (match_operand:X87MODEF 2 "general_operand")])
+	  (match_operand 3)
+	  (match_operand 4)))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 5))]
+  "TARGET_80387 && !TARGET_CMOVE
+   && reload_completed"
+  [(const_int 0)]
+{
+  ix86_split_fp_branch (GET_CODE (operands[0]), operands[1], operands[2],
+			operands[3], operands[4], operands[5]);
+  DONE;
+})
+
+;; The order of operands in *jcc<fp>_<int>_i387 is forced by combine in
+;; simplify_comparison () function. Float operator is treated as RTX_OBJ
+;; with a precedence over other operators and is always put in the first
+;; place. Swap condition and operands to match ficom instruction.
+
+(define_insn "*jcc<X87MODEF:mode>_<SWI24:mode>_i387"
+  [(set (pc)
+	(if_then_else
+	  (match_operator:CCFP 0 "ix86_swapped_fp_comparison_operator"
+	    [(match_operator:X87MODEF 1 "float_operator"
+	      [(match_operand:SWI24 2 "nonimmediate_operand" "m")])
+	     (match_operand:X87MODEF 3 "register_operand" "f")])
+	  (label_ref (match_operand 4))
+	  (pc)))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 5 "=a"))]
+  "TARGET_80387 && !TARGET_CMOVE
+   && (TARGET_USE_<SWI24:MODE>MODE_FIOP
+       || optimize_function_for_size_p (cfun))"
+  "#")
+
+(define_insn "*jcc<X87MODEF:mode>_<SWI24:mode>_r_i387"
+  [(set (pc)
+	(if_then_else
+	  (match_operator:CCFP 0 "ix86_swapped_fp_comparison_operator"
+	    [(match_operator:X87MODEF 1 "float_operator"
+	      [(match_operand:SWI24 2 "nonimmediate_operand" "m")])
+	     (match_operand:X87MODEF 3 "register_operand" "f")])
+	  (pc)
+	  (label_ref (match_operand 4))))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 5 "=a"))]
+  "TARGET_80387 && !TARGET_CMOVE
+   && (TARGET_USE_<SWI24:MODE>MODE_FIOP
+       || optimize_function_for_size_p (cfun))"
+  "#")
+
+(define_split
+  [(set (pc)
+	(if_then_else
+	  (match_operator:CCFP 0 "ix86_swapped_fp_comparison_operator"
+	    [(match_operator:X87MODEF 1 "float_operator"
+	      [(match_operand:SWI24 2 "memory_operand")])
+	     (match_operand:X87MODEF 3 "register_operand")])
+	  (match_operand 4)
+	  (match_operand 5)))
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:CCFP FLAGS_REG))
+   (clobber (match_scratch:HI 6))]
+  "TARGET_80387 && !TARGET_CMOVE
+   && reload_completed"
+  [(const_int 0)]
+{
+  ix86_split_fp_branch (swap_condition (GET_CODE (operands[0])), operands[3],
+		        gen_rtx_FLOAT (GET_MODE (operands[1]), operands[2]),
+			operands[4], operands[5], operands[6]);
+  DONE;
+})
+
+;; Unconditional and other jump instructions
+
+(define_insn "jump"
+  [(set (pc)
+	(label_ref (match_operand 0)))]
+  ""
+  "jmp\t%l0"
+  [(set_attr "type" "ibr")
+   (set (attr "length")
+	   (if_then_else (and (ge (minus (match_dup 0) (pc))
+				  (const_int -126))
+			      (lt (minus (match_dup 0) (pc))
+				  (const_int 128)))
+	     (const_int 2)
+	     (const_int 5)))
+   (set_attr "modrm" "0")])
+
+(define_expand "indirect_jump"
+  [(set (pc) (match_operand 0 "indirect_branch_operand"))]
+  ""
+{
+  if (TARGET_X32)
+    operands[0] = convert_memory_address (word_mode, operands[0]);
+})
+
+(define_insn "*indirect_jump"
+  [(set (pc) (match_operand:W 0 "indirect_branch_operand" "rw"))]
+  ""
+  "jmp\t%A0"
+  [(set_attr "type" "ibr")
+   (set_attr "length_immediate" "0")])
+
+(define_expand "tablejump"
+  [(parallel [(set (pc) (match_operand 0 "indirect_branch_operand"))
+	      (use (label_ref (match_operand 1)))])]
+  ""
+{
+  /* In PIC mode, the table entries are stored GOT (32-bit) or PC (64-bit)
+     relative.  Convert the relative address to an absolute address.  */
+  if (flag_pic)
+    {
+      rtx op0, op1;
+      enum rtx_code code;
+
+      /* We can't use @GOTOFF for text labels on VxWorks;
+	 see gotoff_operand.  */
+      if (TARGET_64BIT || TARGET_VXWORKS_RTP)
+	{
+	  code = PLUS;
+	  op0 = operands[0];
+	  op1 = gen_rtx_LABEL_REF (Pmode, operands[1]);
+	}
+      else if (TARGET_MACHO || HAVE_AS_GOTOFF_IN_DATA)
+	{
+	  code = PLUS;
+	  op0 = operands[0];
+	  op1 = pic_offset_table_rtx;
+	}
+      else
+	{
+	  code = MINUS;
+	  op0 = pic_offset_table_rtx;
+	  op1 = operands[0];
+	}
+
+      operands[0] = expand_simple_binop (Pmode, code, op0, op1, NULL_RTX, 0,
+					 OPTAB_DIRECT);
+    }
+
+  if (TARGET_X32)
+    operands[0] = convert_memory_address (word_mode, operands[0]);
+})
+
+(define_insn "*tablejump_1"
+  [(set (pc) (match_operand:W 0 "indirect_branch_operand" "rw"))
+   (use (label_ref (match_operand 1)))]
+  ""
+  "jmp\t%A0"
+  [(set_attr "type" "ibr")
+   (set_attr "length_immediate" "0")])
+
+;; Convert setcc + movzbl to xor + setcc if operands don't overlap.
+
+(define_peephole2
+  [(set (reg FLAGS_REG) (match_operand 0))
+   (set (match_operand:QI 1 "register_operand")
+	(match_operator:QI 2 "ix86_comparison_operator"
+	  [(reg FLAGS_REG) (const_int 0)]))
+   (set (match_operand 3 "q_regs_operand")
+	(zero_extend (match_dup 1)))]
+  "(peep2_reg_dead_p (3, operands[1])
+    || operands_match_p (operands[1], operands[3]))
+   && ! reg_overlap_mentioned_p (operands[3], operands[0])"
+  [(set (match_dup 4) (match_dup 0))
+   (set (strict_low_part (match_dup 5))
+	(match_dup 2))]
+{
+  operands[4] = gen_rtx_REG (GET_MODE (operands[0]), FLAGS_REG);
+  operands[5] = gen_lowpart (QImode, operands[3]);
+  ix86_expand_clear (operands[3]);
+})
+
+(define_peephole2
+  [(parallel [(set (reg FLAGS_REG) (match_operand 0))
+	      (match_operand 4)])
+   (set (match_operand:QI 1 "register_operand")
+	(match_operator:QI 2 "ix86_comparison_operator"
+	  [(reg FLAGS_REG) (const_int 0)]))
+   (set (match_operand 3 "q_regs_operand")
+	(zero_extend (match_dup 1)))]
+  "(peep2_reg_dead_p (3, operands[1])
+    || operands_match_p (operands[1], operands[3]))
+   && ! reg_overlap_mentioned_p (operands[3], operands[0])"
+  [(parallel [(set (match_dup 5) (match_dup 0))
+	      (match_dup 4)])
+   (set (strict_low_part (match_dup 6))
+	(match_dup 2))]
+{
+  operands[5] = gen_rtx_REG (GET_MODE (operands[0]), FLAGS_REG);
+  operands[6] = gen_lowpart (QImode, operands[3]);
+  ix86_expand_clear (operands[3]);
+})
+
+;; Similar, but match zero extend with andsi3.
+
+(define_peephole2
+  [(set (reg FLAGS_REG) (match_operand 0))
+   (set (match_operand:QI 1 "register_operand")
+	(match_operator:QI 2 "ix86_comparison_operator"
+	  [(reg FLAGS_REG) (const_int 0)]))
+   (parallel [(set (match_operand:SI 3 "q_regs_operand")
+		   (and:SI (match_dup 3) (const_int 255)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "REGNO (operands[1]) == REGNO (operands[3])
+   && ! reg_overlap_mentioned_p (operands[3], operands[0])"
+  [(set (match_dup 4) (match_dup 0))
+   (set (strict_low_part (match_dup 5))
+	(match_dup 2))]
+{
+  operands[4] = gen_rtx_REG (GET_MODE (operands[0]), FLAGS_REG);
+  operands[5] = gen_lowpart (QImode, operands[3]);
+  ix86_expand_clear (operands[3]);
+})
+
+(define_peephole2
+  [(parallel [(set (reg FLAGS_REG) (match_operand 0))
+	      (match_operand 4)])
+   (set (match_operand:QI 1 "register_operand")
+	(match_operator:QI 2 "ix86_comparison_operator"
+	  [(reg FLAGS_REG) (const_int 0)]))
+   (parallel [(set (match_operand 3 "q_regs_operand")
+		   (zero_extend (match_dup 1)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "(peep2_reg_dead_p (3, operands[1])
+    || operands_match_p (operands[1], operands[3]))
+   && ! reg_overlap_mentioned_p (operands[3], operands[0])"
+  [(parallel [(set (match_dup 5) (match_dup 0))
+	      (match_dup 4)])
+   (set (strict_low_part (match_dup 6))
+	(match_dup 2))]
+{
+  operands[5] = gen_rtx_REG (GET_MODE (operands[0]), FLAGS_REG);
+  operands[6] = gen_lowpart (QImode, operands[3]);
+  ix86_expand_clear (operands[3]);
+})
+
+;; Call instructions.
+
+;; The predicates normally associated with named expanders are not properly
+;; checked for calls.  This is a bug in the generic code, but it isn't that
+;; easy to fix.  Ignore it for now and be prepared to fix things up.
+
+;; P6 processors will jump to the address after the decrement when %esp
+;; is used as a call operand, so they will execute return address as a code.
+;; See Pentium Pro errata 70, Pentium 2 errata A33 and Pentium 3 errata E17.
+
+;; Register constraint for call instruction.
+(define_mode_attr c [(SI "l") (DI "r")])
+
+;; Call subroutine returning no value.
+
+(define_expand "call"
+  [(call (match_operand:QI 0)
+	 (match_operand 1))
+   (use (match_operand 2))]
+  ""
+{
+  ix86_expand_call (NULL, operands[0], operands[1],
+		    operands[2], NULL, false);
+  DONE;
+})
+
+(define_expand "sibcall"
+  [(call (match_operand:QI 0)
+	 (match_operand 1))
+   (use (match_operand 2))]
+  ""
+{
+  ix86_expand_call (NULL, operands[0], operands[1],
+		    operands[2], NULL, true);
+  DONE;
+})
+
+(define_insn "*call"
+  [(call (mem:QI (match_operand:W 0 "call_insn_operand" "<c>zw"))
+	 (match_operand 1))]
+  "!SIBLING_CALL_P (insn)"
+  "* return ix86_output_call_insn (insn, operands[0]);"
+  [(set_attr "type" "call")])
+
+(define_insn "*call_rex64_ms_sysv"
+  [(match_parallel 2 "call_rex64_ms_sysv_operation"
+    [(call (mem:QI (match_operand:DI 0 "call_insn_operand" "rzw"))
+	   (match_operand 1))
+     (unspec [(const_int 0)] UNSPEC_MS_TO_SYSV_CALL)])]
+  "TARGET_64BIT && !SIBLING_CALL_P (insn)"
+  "* return ix86_output_call_insn (insn, operands[0]);"
+  [(set_attr "type" "call")])
+
+(define_insn "*sibcall"
+  [(call (mem:QI (match_operand:W 0 "sibcall_insn_operand" "Uz"))
+	 (match_operand 1))]
+  "SIBLING_CALL_P (insn)"
+  "* return ix86_output_call_insn (insn, operands[0]);"
+  [(set_attr "type" "call")])
+
+(define_expand "call_pop"
+  [(parallel [(call (match_operand:QI 0)
+		    (match_operand:SI 1))
+	      (set (reg:SI SP_REG)
+		   (plus:SI (reg:SI SP_REG)
+			    (match_operand:SI 3)))])]
+  "!TARGET_64BIT"
+{
+  ix86_expand_call (NULL, operands[0], operands[1],
+		    operands[2], operands[3], false);
+  DONE;
+})
+
+(define_insn "*call_pop"
+  [(call (mem:QI (match_operand:SI 0 "call_insn_operand" "lzm"))
+	 (match_operand 1))
+   (set (reg:SI SP_REG)
+	(plus:SI (reg:SI SP_REG)
+		 (match_operand:SI 2 "immediate_operand" "i")))]
+  "!TARGET_64BIT && !SIBLING_CALL_P (insn)"
+  "* return ix86_output_call_insn (insn, operands[0]);"
+  [(set_attr "type" "call")])
+
+(define_insn "*sibcall_pop"
+  [(call (mem:QI (match_operand:SI 0 "sibcall_insn_operand" "Uz"))
+	 (match_operand 1))
+   (set (reg:SI SP_REG)
+	(plus:SI (reg:SI SP_REG)
+		 (match_operand:SI 2 "immediate_operand" "i")))]
+  "!TARGET_64BIT && SIBLING_CALL_P (insn)"
+  "* return ix86_output_call_insn (insn, operands[0]);"
+  [(set_attr "type" "call")])
+
+;; Call subroutine, returning value in operand 0
+
+(define_expand "call_value"
+  [(set (match_operand 0)
+	(call (match_operand:QI 1)
+	      (match_operand 2)))
+   (use (match_operand 3))]
+  ""
+{
+  ix86_expand_call (operands[0], operands[1], operands[2],
+		    operands[3], NULL, false);
+  DONE;
+})
+
+(define_expand "sibcall_value"
+  [(set (match_operand 0)
+	(call (match_operand:QI 1)
+	      (match_operand 2)))
+   (use (match_operand 3))]
+  ""
+{
+  ix86_expand_call (operands[0], operands[1], operands[2],
+		    operands[3], NULL, true);
+  DONE;
+})
+
+(define_insn "*call_value"
+  [(set (match_operand 0)
+	(call (mem:QI (match_operand:W 1 "call_insn_operand" "<c>zw"))
+	      (match_operand 2)))]
+  "!SIBLING_CALL_P (insn)"
+  "* return ix86_output_call_insn (insn, operands[1]);"
+  [(set_attr "type" "callv")])
+
+(define_insn "*sibcall_value"
+  [(set (match_operand 0)
+	(call (mem:QI (match_operand:W 1 "sibcall_insn_operand" "Uz"))
+	      (match_operand 2)))]
+  "SIBLING_CALL_P (insn)"
+  "* return ix86_output_call_insn (insn, operands[1]);"
+  [(set_attr "type" "callv")])
+
+(define_insn "*call_value_rex64_ms_sysv"
+  [(match_parallel 3 "call_rex64_ms_sysv_operation"
+    [(set (match_operand 0)
+	  (call (mem:QI (match_operand:DI 1 "call_insn_operand" "rzw"))
+		(match_operand 2)))
+     (unspec [(const_int 0)] UNSPEC_MS_TO_SYSV_CALL)])]
+ "TARGET_64BIT && !SIBLING_CALL_P (insn)"
+  "* return ix86_output_call_insn (insn, operands[1]);"
+  [(set_attr "type" "callv")])
+
+(define_expand "call_value_pop"
+  [(parallel [(set (match_operand 0)
+		   (call (match_operand:QI 1)
+			 (match_operand:SI 2)))
+	      (set (reg:SI SP_REG)
+		   (plus:SI (reg:SI SP_REG)
+			    (match_operand:SI 4)))])]
+  "!TARGET_64BIT"
+{
+  ix86_expand_call (operands[0], operands[1], operands[2],
+		    operands[3], operands[4], false);
+  DONE;
+})
+
+(define_insn "*call_value_pop"
+  [(set (match_operand 0)
+	(call (mem:QI (match_operand:SI 1 "call_insn_operand" "lzm"))
+	      (match_operand 2)))
+   (set (reg:SI SP_REG)
+	(plus:SI (reg:SI SP_REG)
+		 (match_operand:SI 3 "immediate_operand" "i")))]
+  "!TARGET_64BIT && !SIBLING_CALL_P (insn)"
+  "* return ix86_output_call_insn (insn, operands[1]);"
+  [(set_attr "type" "callv")])
+
+(define_insn "*sibcall_value_pop"
+  [(set (match_operand 0)
+	(call (mem:QI (match_operand:SI 1 "sibcall_insn_operand" "Uz"))
+	      (match_operand 2)))
+   (set (reg:SI SP_REG)
+	(plus:SI (reg:SI SP_REG)
+		 (match_operand:SI 3 "immediate_operand" "i")))]
+  "!TARGET_64BIT && SIBLING_CALL_P (insn)"
+  "* return ix86_output_call_insn (insn, operands[1]);"
+  [(set_attr "type" "callv")])
+
+;; Call subroutine returning any type.
+
+(define_expand "untyped_call"
+  [(parallel [(call (match_operand 0)
+		    (const_int 0))
+	      (match_operand 1)
+	      (match_operand 2)])]
+  ""
+{
+  int i;
+
+  /* In order to give reg-stack an easier job in validating two
+     coprocessor registers as containing a possible return value,
+     simply pretend the untyped call returns a complex long double
+     value. 
+
+     We can't use SSE_REGPARM_MAX here since callee is unprototyped
+     and should have the default ABI.  */
+
+  ix86_expand_call ((TARGET_FLOAT_RETURNS_IN_80387
+		     ? gen_rtx_REG (XCmode, FIRST_FLOAT_REG) : NULL),
+		    operands[0], const0_rtx,
+		    GEN_INT ((TARGET_64BIT
+			      ? (ix86_abi == SYSV_ABI
+				 ? X86_64_SSE_REGPARM_MAX
+				 : X86_64_MS_SSE_REGPARM_MAX)
+			      : X86_32_SSE_REGPARM_MAX)
+		    	     - 1),
+		    NULL, false);
+
+  for (i = 0; i < XVECLEN (operands[2], 0); i++)
+    {
+      rtx set = XVECEXP (operands[2], 0, i);
+      emit_move_insn (SET_DEST (set), SET_SRC (set));
+    }
+
+  /* The optimizer does not know that the call sets the function value
+     registers we stored in the result block.  We avoid problems by
+     claiming that all hard registers are used and clobbered at this
+     point.  */
+  emit_insn (gen_blockage ());
+
+  DONE;
+})
+
+;; Prologue and epilogue instructions
+
+;; UNSPEC_VOLATILE is considered to use and clobber all hard registers and
+;; all of memory.  This blocks insns from being moved across this point.
+
+(define_insn "blockage"
+  [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)]
+  ""
+  ""
+  [(set_attr "length" "0")])
+
+;; Do not schedule instructions accessing memory across this point.
+
+(define_expand "memory_blockage"
+  [(set (match_dup 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BLOCKAGE))]
+  ""
+{
+  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+  MEM_VOLATILE_P (operands[0]) = 1;
+})
+
+(define_insn "*memory_blockage"
+  [(set (match_operand:BLK 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BLOCKAGE))]
+  ""
+  ""
+  [(set_attr "length" "0")])
+
+;; As USE insns aren't meaningful after reload, this is used instead
+;; to prevent deleting instructions setting registers for PIC code
+(define_insn "prologue_use"
+  [(unspec_volatile [(match_operand 0)] UNSPECV_PROLOGUE_USE)]
+  ""
+  ""
+  [(set_attr "length" "0")])
+
+;; Insn emitted into the body of a function to return from a function.
+;; This is only done if the function's epilogue is known to be simple.
+;; See comments for ix86_can_use_return_insn_p in i386.c.
+
+(define_expand "return"
+  [(simple_return)]
+  "ix86_can_use_return_insn_p ()"
+{
+  if (crtl->args.pops_args)
+    {
+      rtx popc = GEN_INT (crtl->args.pops_args);
+      emit_jump_insn (gen_simple_return_pop_internal (popc));
+      DONE;
+    }
+})
+
+;; We need to disable this for TARGET_SEH, as otherwise
+;; shrink-wrapped prologue gets enabled too.  This might exceed
+;; the maximum size of prologue in unwind information.
+
+(define_expand "simple_return"
+  [(simple_return)]
+  "!TARGET_SEH"
+{
+  if (crtl->args.pops_args)
+    {
+      rtx popc = GEN_INT (crtl->args.pops_args);
+      emit_jump_insn (gen_simple_return_pop_internal (popc));
+      DONE;
+    }
+})
+
+(define_insn "simple_return_internal"
+  [(simple_return)]
+  "reload_completed"
+  "ret"
+  [(set_attr "length" "1")
+   (set_attr "atom_unit" "jeu")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+;; Used by x86_machine_dependent_reorg to avoid penalty on single byte RET
+;; instruction Athlon and K8 have.
+
+(define_insn "simple_return_internal_long"
+  [(simple_return)
+   (unspec [(const_int 0)] UNSPEC_REP)]
+  "reload_completed"
+  "rep%; ret"
+  [(set_attr "length" "2")
+   (set_attr "atom_unit" "jeu")
+   (set_attr "length_immediate" "0")
+   (set_attr "prefix_rep" "1")
+   (set_attr "modrm" "0")])
+
+(define_insn "simple_return_pop_internal"
+  [(simple_return)
+   (use (match_operand:SI 0 "const_int_operand"))]
+  "reload_completed"
+  "ret\t%0"
+  [(set_attr "length" "3")
+   (set_attr "atom_unit" "jeu")
+   (set_attr "length_immediate" "2")
+   (set_attr "modrm" "0")])
+
+(define_insn "simple_return_indirect_internal"
+  [(simple_return)
+   (use (match_operand:SI 0 "register_operand" "r"))]
+  "reload_completed"
+  "jmp\t%A0"
+  [(set_attr "type" "ibr")
+   (set_attr "length_immediate" "0")])
+
+(define_insn "nop"
+  [(const_int 0)]
+  ""
+  "nop"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+;; Generate nops.  Operand 0 is the number of nops, up to 8.
+(define_insn "nops"
+  [(unspec_volatile [(match_operand 0 "const_int_operand")]
+		    UNSPECV_NOPS)]
+  "reload_completed"
+{
+  int num = INTVAL (operands[0]);
+
+  gcc_assert (IN_RANGE (num, 1, 8));
+
+  while (num--)
+    fputs ("\tnop\n", asm_out_file);
+
+  return "";
+}
+  [(set (attr "length") (symbol_ref "INTVAL (operands[0])"))
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+;; Pad to 16-byte boundary, max skip in op0.  Used to avoid
+;; branch prediction penalty for the third jump in a 16-byte
+;; block on K8.
+
+(define_insn "pad"
+  [(unspec_volatile [(match_operand 0)] UNSPECV_ALIGN)]
+  ""
+{
+#ifdef ASM_OUTPUT_MAX_SKIP_PAD
+  ASM_OUTPUT_MAX_SKIP_PAD (asm_out_file, 4, (int)INTVAL (operands[0]));
+#else
+  /* It is tempting to use ASM_OUTPUT_ALIGN here, but we don't want to do that.
+     The align insn is used to avoid 3 jump instructions in the row to improve
+     branch prediction and the benefits hardly outweigh the cost of extra 8
+     nops on the average inserted by full alignment pseudo operation.  */
+#endif
+  return "";
+}
+  [(set_attr "length" "16")])
+
+(define_expand "prologue"
+  [(const_int 0)]
+  ""
+  "ix86_expand_prologue (); DONE;")
+
+(define_insn "set_got"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(const_int 0)] UNSPEC_SET_GOT))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_64BIT"
+  "* return output_set_got (operands[0], NULL_RTX);"
+  [(set_attr "type" "multi")
+   (set_attr "length" "12")])
+
+(define_insn "set_got_labelled"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(label_ref (match_operand 1))]
+	 UNSPEC_SET_GOT))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_64BIT"
+  "* return output_set_got (operands[0], operands[1]);"
+  [(set_attr "type" "multi")
+   (set_attr "length" "12")])
+
+(define_insn "set_got_rex64"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec:DI [(const_int 0)] UNSPEC_SET_GOT))]
+  "TARGET_64BIT"
+  "lea{q}\t{_GLOBAL_OFFSET_TABLE_(%%rip), %0|%0, _GLOBAL_OFFSET_TABLE_[rip]}"
+  [(set_attr "type" "lea")
+   (set_attr "length_address" "4")
+   (set_attr "mode" "DI")])
+
+(define_insn "set_rip_rex64"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec:DI [(label_ref (match_operand 1))] UNSPEC_SET_RIP))]
+  "TARGET_64BIT"
+  "lea{q}\t{%l1(%%rip), %0|%0, %l1[rip]}"
+  [(set_attr "type" "lea")
+   (set_attr "length_address" "4")
+   (set_attr "mode" "DI")])
+
+(define_insn "set_got_offset_rex64"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec:DI
+	  [(label_ref (match_operand 1))]
+	  UNSPEC_SET_GOT_OFFSET))]
+  "TARGET_LP64"
+  "movabs{q}\t{$_GLOBAL_OFFSET_TABLE_-%l1, %0|%0, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_-%l1}"
+  [(set_attr "type" "imov")
+   (set_attr "length_immediate" "0")
+   (set_attr "length_address" "8")
+   (set_attr "mode" "DI")])
+
+(define_expand "epilogue"
+  [(const_int 0)]
+  ""
+  "ix86_expand_epilogue (1); DONE;")
+
+(define_expand "sibcall_epilogue"
+  [(const_int 0)]
+  ""
+  "ix86_expand_epilogue (0); DONE;")
+
+(define_expand "eh_return"
+  [(use (match_operand 0 "register_operand"))]
+  ""
+{
+  rtx tmp, sa = EH_RETURN_STACKADJ_RTX, ra = operands[0];
+
+  /* Tricky bit: we write the address of the handler to which we will
+     be returning into someone else's stack frame, one word below the
+     stack address we wish to restore.  */
+  tmp = gen_rtx_PLUS (Pmode, arg_pointer_rtx, sa);
+  tmp = plus_constant (Pmode, tmp, -UNITS_PER_WORD);
+  tmp = gen_rtx_MEM (Pmode, tmp);
+  emit_move_insn (tmp, ra);
+
+  emit_jump_insn (gen_eh_return_internal ());
+  emit_barrier ();
+  DONE;
+})
+
+(define_insn_and_split "eh_return_internal"
+  [(eh_return)]
+  ""
+  "#"
+  "epilogue_completed"
+  [(const_int 0)]
+  "ix86_expand_epilogue (2); DONE;")
+
+(define_insn "leave"
+  [(set (reg:SI SP_REG) (plus:SI (reg:SI BP_REG) (const_int 4)))
+   (set (reg:SI BP_REG) (mem:SI (reg:SI BP_REG)))
+   (clobber (mem:BLK (scratch)))]
+  "!TARGET_64BIT"
+  "leave"
+  [(set_attr "type" "leave")])
+
+(define_insn "leave_rex64"
+  [(set (reg:DI SP_REG) (plus:DI (reg:DI BP_REG) (const_int 8)))
+   (set (reg:DI BP_REG) (mem:DI (reg:DI BP_REG)))
+   (clobber (mem:BLK (scratch)))]
+  "TARGET_64BIT"
+  "leave"
+  [(set_attr "type" "leave")])
+
+;; Handle -fsplit-stack.
+
+(define_expand "split_stack_prologue"
+  [(const_int 0)]
+  ""
+{
+  ix86_expand_split_stack_prologue ();
+  DONE;
+})
+
+;; In order to support the call/return predictor, we use a return
+;; instruction which the middle-end doesn't see.
+(define_insn "split_stack_return"
+  [(unspec_volatile [(match_operand:SI 0 "const_int_operand")]
+		     UNSPECV_SPLIT_STACK_RETURN)]
+  ""
+{
+  if (operands[0] == const0_rtx)
+    return "ret";
+  else
+    return "ret\t%0";
+}
+  [(set_attr "atom_unit" "jeu")
+   (set_attr "modrm" "0")
+   (set (attr "length")
+	(if_then_else (match_operand:SI 0 "const0_operand")
+		      (const_int 1)
+		      (const_int 3)))
+   (set (attr "length_immediate")
+	(if_then_else (match_operand:SI 0 "const0_operand")
+		      (const_int 0)
+		      (const_int 2)))])
+
+;; If there are operand 0 bytes available on the stack, jump to
+;; operand 1.
+
+(define_expand "split_stack_space_check"
+  [(set (pc) (if_then_else
+	      (ltu (minus (reg SP_REG)
+			  (match_operand 0 "register_operand"))
+		   (unspec [(const_int 0)] UNSPEC_STACK_CHECK))
+	      (label_ref (match_operand 1))
+	      (pc)))]
+  ""
+{
+  rtx reg, size, limit;
+
+  reg = gen_reg_rtx (Pmode);
+  size = force_reg (Pmode, operands[0]);
+  emit_insn (gen_sub3_insn (reg, stack_pointer_rtx, size));
+  limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
+			  UNSPEC_STACK_CHECK);
+  limit = gen_rtx_MEM (Pmode, gen_rtx_CONST (Pmode, limit));
+  ix86_expand_branch (GEU, reg, limit, operands[1]);
+
+  DONE;
+})
+
+;; Bit manipulation instructions.
+
+(define_expand "ffs<mode>2"
+  [(set (match_dup 2) (const_int -1))
+   (parallel [(set (match_dup 3) (match_dup 4))
+	      (set (match_operand:SWI48 0 "register_operand")
+		   (ctz:SWI48
+		     (match_operand:SWI48 1 "nonimmediate_operand")))])
+   (set (match_dup 0) (if_then_else:SWI48
+			(eq (match_dup 3) (const_int 0))
+			(match_dup 2)
+			(match_dup 0)))
+   (parallel [(set (match_dup 0) (plus:SWI48 (match_dup 0) (const_int 1)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  enum machine_mode flags_mode;
+
+  if (<MODE>mode == SImode && !TARGET_CMOVE)
+    {
+      emit_insn (gen_ffssi2_no_cmove (operands[0], operands [1]));
+      DONE;
+    }
+
+  flags_mode = TARGET_BMI ? CCCmode : CCZmode;
+
+  operands[2] = gen_reg_rtx (<MODE>mode);
+  operands[3] = gen_rtx_REG (flags_mode, FLAGS_REG);
+  operands[4] = gen_rtx_COMPARE (flags_mode, operands[1], const0_rtx);
+})
+
+(define_insn_and_split "ffssi2_no_cmove"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(ffs:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))
+   (clobber (match_scratch:SI 2 "=&q"))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_CMOVE"
+  "#"
+  "&& reload_completed"
+  [(parallel [(set (match_dup 4) (match_dup 5))
+	      (set (match_dup 0) (ctz:SI (match_dup 1)))])
+   (set (strict_low_part (match_dup 3))
+	(eq:QI (match_dup 4) (const_int 0)))
+   (parallel [(set (match_dup 2) (neg:SI (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 0) (ior:SI (match_dup 0) (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1)))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  enum machine_mode flags_mode = TARGET_BMI ? CCCmode : CCZmode;
+
+  operands[3] = gen_lowpart (QImode, operands[2]);
+  operands[4] = gen_rtx_REG (flags_mode, FLAGS_REG);
+  operands[5] = gen_rtx_COMPARE (flags_mode, operands[1], const0_rtx);
+
+  ix86_expand_clear (operands[2]);
+})
+
+(define_insn "*tzcnt<mode>_1"
+  [(set (reg:CCC FLAGS_REG)
+	(compare:CCC (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+		     (const_int 0)))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+	(ctz:SWI48 (match_dup 1)))]
+  "TARGET_BMI"
+  "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*bsf<mode>_1"
+  [(set (reg:CCZ FLAGS_REG)
+	(compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+		     (const_int 0)))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+	(ctz:SWI48 (match_dup 1)))]
+  ""
+  "bsf{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "ctz<mode>2"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+	(ctz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+{
+  if (TARGET_BMI)
+    return "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}";
+  else if (optimize_function_for_size_p (cfun))
+    ;
+  else if (TARGET_GENERIC)
+    /* tzcnt expands to 'rep bsf' and we can use it even if !TARGET_BMI.  */
+    return "rep%; bsf{<imodesuffix>}\t{%1, %0|%0, %1}";
+
+  return "bsf{<imodesuffix>}\t{%1, %0|%0, %1}";
+}
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set (attr "prefix_rep")
+     (if_then_else
+       (ior (match_test "TARGET_BMI")
+	    (and (not (match_test "optimize_function_for_size_p (cfun)"))
+		 (match_test "TARGET_GENERIC")))
+       (const_string "1")
+       (const_string "0")))
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "clz<mode>2"
+  [(parallel
+     [(set (match_operand:SWI248 0 "register_operand")
+	   (minus:SWI248
+	     (match_dup 2)
+	     (clz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand"))))
+      (clobber (reg:CC FLAGS_REG))])
+   (parallel
+     [(set (match_dup 0) (xor:SWI248 (match_dup 0) (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  if (TARGET_LZCNT)
+    {
+      emit_insn (gen_clz<mode>2_lzcnt (operands[0], operands[1]));
+      DONE;
+    }
+  operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
+})
+
+(define_insn "clz<mode>2_lzcnt"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+	(clz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_LZCNT"
+  "lzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+;; BMI instructions.
+(define_insn "*bmi_andn_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
+        (and:SWI48
+          (not:SWI48
+            (match_operand:SWI48 1 "register_operand" "r,r"))
+            (match_operand:SWI48 2 "nonimmediate_operand" "r,m")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI"
+  "andn\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "direct, double")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi_bextr_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r,r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "nonimmediate_operand" "r,m")
+                       (match_operand:SWI48 2 "register_operand" "r,r")]
+                       UNSPEC_BEXTR))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI"
+  "bextr\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "direct, double")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*bmi_blsi_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (and:SWI48
+          (neg:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm"))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI"
+  "blsi\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*bmi_blsmsk_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (xor:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int -1))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI"
+  "blsmsk\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*bmi_blsr_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (and:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int -1))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_BMI"
+   "blsr\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+;; BMI2 instructions.
+(define_insn "bmi2_bzhi_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(and:SWI48 (lshiftrt:SWI48 (const_int -1)
+				   (match_operand:SWI48 2 "register_operand" "r"))
+		   (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_BMI2"
+  "bzhi\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pdep_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r")
+                       (match_operand:SWI48 2 "nonimmediate_operand" "rm")]
+                       UNSPEC_PDEP))]
+  "TARGET_BMI2"
+  "pdep\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bmi2_pext_<mode>3"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r")
+                       (match_operand:SWI48 2 "nonimmediate_operand" "rm")]
+                       UNSPEC_PEXT))]
+  "TARGET_BMI2"
+  "pext\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+;; TBM instructions.
+(define_insn "tbm_bextri_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (zero_extract:SWI48
+          (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+          (match_operand:SWI48 2 "const_0_to_255_operand" "n")
+          (match_operand:SWI48 3 "const_0_to_255_operand" "n")))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_TBM"
+{
+  operands[2] = GEN_INT (INTVAL (operands[2]) << 8 | INTVAL (operands[3]));
+  return "bextr\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*tbm_blcfill_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (and:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int 1))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_TBM"
+   "blcfill\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*tbm_blci_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (ior:SWI48
+          (not:SWI48
+            (plus:SWI48
+              (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+              (const_int 1)))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_TBM"
+   "blci\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*tbm_blcic_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (and:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int 1))
+          (not:SWI48
+            (match_dup 1))))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_TBM"
+   "blcic\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*tbm_blcmsk_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (xor:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int 1))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_TBM"
+   "blcmsk\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*tbm_blcs_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (ior:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int 1))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_TBM"
+   "blcs\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*tbm_blsfill_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (ior:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int -1))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_TBM"
+   "blsfill\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*tbm_blsic_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (ior:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int -1))
+          (not:SWI48
+            (match_dup 1))))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_TBM"
+   "blsic\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*tbm_t1mskc_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (ior:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int 1))
+          (not:SWI48
+            (match_dup 1))))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_TBM"
+   "t1mskc\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*tbm_tzmsk_<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (and:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int -1))
+          (not:SWI48
+            (match_dup 1))))
+   (clobber (reg:CC FLAGS_REG))]
+   "TARGET_TBM"
+   "tzmsk\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "bsr_rex64"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(minus:DI (const_int 63)
+		  (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT"
+  "bsr{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "DI")])
+
+(define_insn "bsr"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(minus:SI (const_int 31)
+		  (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "bsr{l}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "SI")])
+
+(define_insn "*bsrhi"
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(minus:HI (const_int 15)
+		  (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "bsr{w}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "HI")])
+
+(define_insn "popcount<mode>2"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+	(popcount:SWI248
+	  (match_operand:SWI248 1 "nonimmediate_operand" "rm")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_POPCNT"
+{
+#if TARGET_MACHO
+  return "popcnt\t{%1, %0|%0, %1}";
+#else
+  return "popcnt{<imodesuffix>}\t{%1, %0|%0, %1}";
+#endif
+}
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*popcount<mode>2_cmp"
+  [(set (reg FLAGS_REG)
+	(compare
+	  (popcount:SWI248
+	    (match_operand:SWI248 1 "nonimmediate_operand" "rm"))
+	  (const_int 0)))
+   (set (match_operand:SWI248 0 "register_operand" "=r")
+	(popcount:SWI248 (match_dup 1)))]
+  "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
+{
+#if TARGET_MACHO
+  return "popcnt\t{%1, %0|%0, %1}";
+#else
+  return "popcnt{<imodesuffix>}\t{%1, %0|%0, %1}";
+#endif
+}
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*popcountsi2_cmp_zext"
+  [(set (reg FLAGS_REG)
+        (compare
+          (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))
+          (const_int 0)))
+   (set (match_operand:DI 0 "register_operand" "=r")
+        (zero_extend:DI(popcount:SI (match_dup 1))))]
+  "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)"
+{
+#if TARGET_MACHO
+  return "popcnt\t{%1, %0|%0, %1}";
+#else
+  return "popcnt{l}\t{%1, %0|%0, %1}";
+#endif
+}
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "SI")])
+
+(define_expand "bswapdi2"
+  [(set (match_operand:DI 0 "register_operand")
+	(bswap:DI (match_operand:DI 1 "nonimmediate_operand")))]
+  "TARGET_64BIT"
+{
+  if (!TARGET_MOVBE)
+    operands[1] = force_reg (DImode, operands[1]);
+})
+
+(define_expand "bswapsi2"
+  [(set (match_operand:SI 0 "register_operand")
+	(bswap:SI (match_operand:SI 1 "nonimmediate_operand")))]
+  ""
+{
+  if (TARGET_MOVBE)
+    ;
+  else if (TARGET_BSWAP)
+    operands[1] = force_reg (SImode, operands[1]);
+  else
+    {
+      rtx x = operands[0];
+
+      emit_move_insn (x, operands[1]);
+      emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, x)));
+      emit_insn (gen_rotlsi3 (x, x, GEN_INT (16)));
+      emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, x)));
+      DONE;
+    }
+})
+
+(define_insn "*bswap<mode>2_movbe"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=r,r,m")
+	(bswap:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "0,m,r")))]
+  "TARGET_MOVBE
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "@
+    bswap\t%0
+    movbe\t{%1, %0|%0, %1}
+    movbe\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip,imov,imov")
+   (set_attr "modrm" "0,1,1")
+   (set_attr "prefix_0f" "*,1,1")
+   (set_attr "prefix_extra" "*,1,1")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*bswap<mode>2"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(bswap:SWI48 (match_operand:SWI48 1 "register_operand" "0")))]
+  "TARGET_BSWAP"
+  "bswap\t%0"
+  [(set_attr "type" "bitmanip")
+   (set_attr "modrm" "0")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*bswaphi_lowpart_1"
+  [(set (strict_low_part (match_operand:HI 0 "register_operand" "+Q,r"))
+	(bswap:HI (match_dup 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_XCHGB || optimize_function_for_size_p (cfun)"
+  "@
+    xchg{b}\t{%h0, %b0|%b0, %h0}
+    rol{w}\t{$8, %0|%0, 8}"
+  [(set_attr "length" "2,4")
+   (set_attr "mode" "QI,HI")])
+
+(define_insn "bswaphi_lowpart"
+  [(set (strict_low_part (match_operand:HI 0 "register_operand" "+r"))
+	(bswap:HI (match_dup 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "rol{w}\t{$8, %0|%0, 8}"
+  [(set_attr "length" "4")
+   (set_attr "mode" "HI")])
+
+(define_expand "paritydi2"
+  [(set (match_operand:DI 0 "register_operand")
+	(parity:DI (match_operand:DI 1 "register_operand")))]
+  "! TARGET_POPCNT"
+{
+  rtx scratch = gen_reg_rtx (QImode);
+  rtx cond;
+
+  emit_insn (gen_paritydi2_cmp (NULL_RTX, NULL_RTX,
+				NULL_RTX, operands[1]));
+
+  cond = gen_rtx_fmt_ee (ORDERED, QImode,
+			 gen_rtx_REG (CCmode, FLAGS_REG),
+			 const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, scratch, cond));
+
+  if (TARGET_64BIT)
+    emit_insn (gen_zero_extendqidi2 (operands[0], scratch));
+  else
+    {
+      rtx tmp = gen_reg_rtx (SImode);
+
+      emit_insn (gen_zero_extendqisi2 (tmp, scratch));
+      emit_insn (gen_zero_extendsidi2 (operands[0], tmp));
+    }
+  DONE;
+})
+
+(define_expand "paritysi2"
+  [(set (match_operand:SI 0 "register_operand")
+	(parity:SI (match_operand:SI 1 "register_operand")))]
+  "! TARGET_POPCNT"
+{
+  rtx scratch = gen_reg_rtx (QImode);
+  rtx cond;
+
+  emit_insn (gen_paritysi2_cmp (NULL_RTX, NULL_RTX, operands[1]));
+
+  cond = gen_rtx_fmt_ee (ORDERED, QImode,
+			 gen_rtx_REG (CCmode, FLAGS_REG),
+			 const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, scratch, cond));
+
+  emit_insn (gen_zero_extendqisi2 (operands[0], scratch));
+  DONE;
+})
+
+(define_insn_and_split "paritydi2_cmp"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_operand:DI 3 "register_operand" "0")]
+		   UNSPEC_PARITY))
+   (clobber (match_scratch:DI 0 "=r"))
+   (clobber (match_scratch:SI 1 "=&r"))
+   (clobber (match_scratch:HI 2 "=Q"))]
+  "! TARGET_POPCNT"
+  "#"
+  "&& reload_completed"
+  [(parallel
+     [(set (match_dup 1)
+	   (xor:SI (match_dup 1) (match_dup 4)))
+      (clobber (reg:CC FLAGS_REG))])
+   (parallel
+     [(set (reg:CC FLAGS_REG)
+	   (unspec:CC [(match_dup 1)] UNSPEC_PARITY))
+      (clobber (match_dup 1))
+      (clobber (match_dup 2))])]
+{
+  operands[4] = gen_lowpart (SImode, operands[3]);
+
+  if (TARGET_64BIT)
+    {
+      emit_move_insn (operands[1], gen_lowpart (SImode, operands[3]));
+      emit_insn (gen_lshrdi3 (operands[3], operands[3], GEN_INT (32)));
+    }
+  else
+    operands[1] = gen_highpart (SImode, operands[3]);
+})
+
+(define_insn_and_split "paritysi2_cmp"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_operand:SI 2 "register_operand" "0")]
+		   UNSPEC_PARITY))
+   (clobber (match_scratch:SI 0 "=r"))
+   (clobber (match_scratch:HI 1 "=&Q"))]
+  "! TARGET_POPCNT"
+  "#"
+  "&& reload_completed"
+  [(parallel
+     [(set (match_dup 1)
+	   (xor:HI (match_dup 1) (match_dup 3)))
+      (clobber (reg:CC FLAGS_REG))])
+   (parallel
+     [(set (reg:CC FLAGS_REG)
+	   (unspec:CC [(match_dup 1)] UNSPEC_PARITY))
+      (clobber (match_dup 1))])]
+{
+  operands[3] = gen_lowpart (HImode, operands[2]);
+
+  emit_move_insn (operands[1], gen_lowpart (HImode, operands[2]));
+  emit_insn (gen_lshrsi3 (operands[2], operands[2], GEN_INT (16)));
+})
+
+(define_insn "*parityhi2_cmp"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_operand:HI 1 "register_operand" "0")]
+		   UNSPEC_PARITY))
+   (clobber (match_scratch:HI 0 "=Q"))]
+  "! TARGET_POPCNT"
+  "xor{b}\t{%h0, %b0|%b0, %h0}"
+  [(set_attr "length" "2")
+   (set_attr "mode" "HI")])
+
+
+;; Thread-local storage patterns for ELF.
+;;
+;; Note that these code sequences must appear exactly as shown
+;; in order to allow linker relaxation.
+
+(define_insn "*tls_global_dynamic_32_gnu"
+  [(set (match_operand:SI 0 "register_operand" "=a")
+	(unspec:SI
+	 [(match_operand:SI 1 "register_operand" "b")
+	  (match_operand 2 "tls_symbolic_operand")
+	  (match_operand 3 "constant_call_address_operand" "z")]
+	 UNSPEC_TLS_GD))
+   (clobber (match_scratch:SI 4 "=d"))
+   (clobber (match_scratch:SI 5 "=c"))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_64BIT && TARGET_GNU_TLS"
+{
+  output_asm_insn
+    ("lea{l}\t{%E2@tlsgd(,%1,1), %0|%0, %E2@tlsgd[%1*1]}", operands);
+  if (TARGET_SUN_TLS)
+#ifdef HAVE_AS_IX86_TLSGDPLT
+    return "call\t%a2@tlsgdplt";
+#else
+    return "call\t%p3@plt";
+#endif
+  return "call\t%P3";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "12")])
+
+(define_expand "tls_global_dynamic_32"
+  [(parallel
+    [(set (match_operand:SI 0 "register_operand")
+	  (unspec:SI [(match_operand:SI 2 "register_operand")
+		      (match_operand 1 "tls_symbolic_operand")
+		      (match_operand 3 "constant_call_address_operand")]
+		     UNSPEC_TLS_GD))
+     (clobber (match_scratch:SI 4))
+     (clobber (match_scratch:SI 5))
+     (clobber (reg:CC FLAGS_REG))])])
+
+(define_insn "*tls_global_dynamic_64_<mode>"
+  [(set (match_operand:P 0 "register_operand" "=a")
+	(call:P
+	 (mem:QI (match_operand 2 "constant_call_address_operand" "z"))
+	 (match_operand 3)))
+   (unspec:P [(match_operand 1 "tls_symbolic_operand")]
+	     UNSPEC_TLS_GD)]
+  "TARGET_64BIT"
+{
+  if (!TARGET_X32)
+    fputs (ASM_BYTE "0x66\n", asm_out_file);
+  output_asm_insn
+    ("lea{q}\t{%E1@tlsgd(%%rip), %%rdi|rdi, %E1@tlsgd[rip]}", operands);
+  fputs (ASM_SHORT "0x6666\n", asm_out_file);
+  fputs ("\trex64\n", asm_out_file);
+  if (TARGET_SUN_TLS)
+    return "call\t%p2@plt";
+  return "call\t%P2";
+}
+  [(set_attr "type" "multi")
+   (set (attr "length")
+	(symbol_ref "TARGET_X32 ? 15 : 16"))])
+
+(define_insn "*tls_global_dynamic_64_largepic"
+  [(set (match_operand:DI 0 "register_operand" "=a")
+	(call:DI
+	 (mem:QI (plus:DI (match_operand:DI 2 "register_operand" "b")
+			  (match_operand:DI 3 "immediate_operand" "i")))
+	 (match_operand 4)))
+   (unspec:DI [(match_operand 1 "tls_symbolic_operand")]
+	     UNSPEC_TLS_GD)]
+  "TARGET_64BIT && ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF
+   && GET_CODE (operands[3]) == CONST
+   && GET_CODE (XEXP (operands[3], 0)) == UNSPEC
+   && XINT (XEXP (operands[3], 0), 1) == UNSPEC_PLTOFF"
+{
+  output_asm_insn
+    ("lea{q}\t{%E1@tlsgd(%%rip), %%rdi|rdi, %E1@tlsgd[rip]}", operands);
+  output_asm_insn ("movabs{q}\t{%3, %%rax|rax, %3}", operands);
+  output_asm_insn ("add{q}\t{%2, %%rax|rax, %2}", operands);
+  return "call\t{*%%rax|rax}";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "22")])
+
+(define_expand "tls_global_dynamic_64_<mode>"
+  [(parallel
+    [(set (match_operand:P 0 "register_operand")
+	  (call:P
+	   (mem:QI (match_operand 2))
+	   (const_int 0)))
+     (unspec:P [(match_operand 1 "tls_symbolic_operand")]
+	       UNSPEC_TLS_GD)])]
+  "TARGET_64BIT")
+
+(define_insn "*tls_local_dynamic_base_32_gnu"
+  [(set (match_operand:SI 0 "register_operand" "=a")
+	(unspec:SI
+	 [(match_operand:SI 1 "register_operand" "b")
+	  (match_operand 2 "constant_call_address_operand" "z")]
+	 UNSPEC_TLS_LD_BASE))
+   (clobber (match_scratch:SI 3 "=d"))
+   (clobber (match_scratch:SI 4 "=c"))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_64BIT && TARGET_GNU_TLS"
+{
+  output_asm_insn
+    ("lea{l}\t{%&@tlsldm(%1), %0|%0, %&@tlsldm[%1]}", operands);
+  if (TARGET_SUN_TLS)
+    {
+      if (HAVE_AS_IX86_TLSLDMPLT)
+	return "call\t%&@tlsldmplt";
+      else
+	return "call\t%p2@plt";
+    }
+  return "call\t%P2";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "11")])
+
+(define_expand "tls_local_dynamic_base_32"
+  [(parallel
+     [(set (match_operand:SI 0 "register_operand")
+	   (unspec:SI
+	    [(match_operand:SI 1 "register_operand")
+	     (match_operand 2 "constant_call_address_operand")]
+	    UNSPEC_TLS_LD_BASE))
+      (clobber (match_scratch:SI 3))
+      (clobber (match_scratch:SI 4))
+      (clobber (reg:CC FLAGS_REG))])])
+
+(define_insn "*tls_local_dynamic_base_64_<mode>"
+  [(set (match_operand:P 0 "register_operand" "=a")
+	(call:P
+	 (mem:QI (match_operand 1 "constant_call_address_operand" "z"))
+	 (match_operand 2)))
+   (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)]
+  "TARGET_64BIT"
+{
+  output_asm_insn
+    ("lea{q}\t{%&@tlsld(%%rip), %%rdi|rdi, %&@tlsld[rip]}", operands);
+  if (TARGET_SUN_TLS)
+    return "call\t%p1@plt";
+  return "call\t%P1";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "12")])
+
+(define_insn "*tls_local_dynamic_base_64_largepic"
+  [(set (match_operand:DI 0 "register_operand" "=a")
+	(call:DI
+	 (mem:QI (plus:DI (match_operand:DI 1 "register_operand" "b")
+			  (match_operand:DI 2 "immediate_operand" "i")))
+	 (match_operand 3)))
+   (unspec:DI [(const_int 0)] UNSPEC_TLS_LD_BASE)]
+  "TARGET_64BIT && ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF
+   && GET_CODE (operands[2]) == CONST
+   && GET_CODE (XEXP (operands[2], 0)) == UNSPEC
+   && XINT (XEXP (operands[2], 0), 1) == UNSPEC_PLTOFF"
+{
+  output_asm_insn
+    ("lea{q}\t{%&@tlsld(%%rip), %%rdi|rdi, %&@tlsld[rip]}", operands);
+  output_asm_insn ("movabs{q}\t{%2, %%rax|rax, %2}", operands);
+  output_asm_insn ("add{q}\t{%1, %%rax|rax, %1}", operands);
+  return "call\t{*%%rax|rax}";
+}
+  [(set_attr "type" "multi")
+   (set_attr "length" "22")])
+
+(define_expand "tls_local_dynamic_base_64_<mode>"
+  [(parallel
+     [(set (match_operand:P 0 "register_operand")
+	   (call:P
+	    (mem:QI (match_operand 1))
+	    (const_int 0)))
+      (unspec:P [(const_int 0)] UNSPEC_TLS_LD_BASE)])]
+  "TARGET_64BIT")
+
+;; Local dynamic of a single variable is a lose.  Show combine how
+;; to convert that back to global dynamic.
+
+(define_insn_and_split "*tls_local_dynamic_32_once"
+  [(set (match_operand:SI 0 "register_operand" "=a")
+	(plus:SI
+	 (unspec:SI [(match_operand:SI 1 "register_operand" "b")
+		     (match_operand 2 "constant_call_address_operand" "z")]
+		    UNSPEC_TLS_LD_BASE)
+	 (const:SI (unspec:SI
+		    [(match_operand 3 "tls_symbolic_operand")]
+		    UNSPEC_DTPOFF))))
+   (clobber (match_scratch:SI 4 "=d"))
+   (clobber (match_scratch:SI 5 "=c"))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "#"
+  ""
+  [(parallel
+     [(set (match_dup 0)
+	   (unspec:SI [(match_dup 1) (match_dup 3) (match_dup 2)]
+		      UNSPEC_TLS_GD))
+      (clobber (match_dup 4))
+      (clobber (match_dup 5))
+      (clobber (reg:CC FLAGS_REG))])])
+
+;; Segment register for the thread base ptr load
+(define_mode_attr tp_seg [(SI "gs") (DI "fs")])
+
+;; Load and add the thread base pointer from %<tp_seg>:0.
+(define_insn "*load_tp_x32"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(const_int 0)] UNSPEC_TP))]
+  "TARGET_X32"
+  "mov{l}\t{%%fs:0, %0|%0, DWORD PTR fs:0}"
+  [(set_attr "type" "imov")
+   (set_attr "modrm" "0")
+   (set_attr "length" "7")
+   (set_attr "memory" "load")
+   (set_attr "imm_disp" "false")])
+
+(define_insn "*load_tp_x32_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI (unspec:SI [(const_int 0)] UNSPEC_TP)))]
+  "TARGET_X32"
+  "mov{l}\t{%%fs:0, %k0|%k0, DWORD PTR fs:0}"
+  [(set_attr "type" "imov")
+   (set_attr "modrm" "0")
+   (set_attr "length" "7")
+   (set_attr "memory" "load")
+   (set_attr "imm_disp" "false")])
+
+(define_insn "*load_tp_<mode>"
+  [(set (match_operand:P 0 "register_operand" "=r")
+	(unspec:P [(const_int 0)] UNSPEC_TP))]
+  "!TARGET_X32"
+  "mov{<imodesuffix>}\t{%%<tp_seg>:0, %0|%0, <iptrsize> PTR <tp_seg>:0}"
+  [(set_attr "type" "imov")
+   (set_attr "modrm" "0")
+   (set_attr "length" "7")
+   (set_attr "memory" "load")
+   (set_attr "imm_disp" "false")])
+
+(define_insn "*add_tp_x32"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(plus:SI (unspec:SI [(const_int 0)] UNSPEC_TP)
+		 (match_operand:SI 1 "register_operand" "0")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_X32"
+  "add{l}\t{%%fs:0, %0|%0, DWORD PTR fs:0}"
+  [(set_attr "type" "alu")
+   (set_attr "modrm" "0")
+   (set_attr "length" "7")
+   (set_attr "memory" "load")
+   (set_attr "imm_disp" "false")])
+
+(define_insn "*add_tp_x32_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (plus:SI (unspec:SI [(const_int 0)] UNSPEC_TP)
+		   (match_operand:SI 1 "register_operand" "0"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_X32"
+  "add{l}\t{%%fs:0, %k0|%k0, DWORD PTR fs:0}"
+  [(set_attr "type" "alu")
+   (set_attr "modrm" "0")
+   (set_attr "length" "7")
+   (set_attr "memory" "load")
+   (set_attr "imm_disp" "false")])
+
+(define_insn "*add_tp_<mode>"
+  [(set (match_operand:P 0 "register_operand" "=r")
+	(plus:P (unspec:P [(const_int 0)] UNSPEC_TP)
+		(match_operand:P 1 "register_operand" "0")))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_X32"
+  "add{<imodesuffix>}\t{%%<tp_seg>:0, %0|%0, <iptrsize> PTR <tp_seg>:0}"
+  [(set_attr "type" "alu")
+   (set_attr "modrm" "0")
+   (set_attr "length" "7")
+   (set_attr "memory" "load")
+   (set_attr "imm_disp" "false")])
+
+;; The Sun linker took the AMD64 TLS spec literally and can only handle
+;; %rax as destination of the initial executable code sequence.
+(define_insn "tls_initial_exec_64_sun"
+  [(set (match_operand:DI 0 "register_operand" "=a")
+	(unspec:DI
+	 [(match_operand 1 "tls_symbolic_operand")]
+	 UNSPEC_TLS_IE_SUN))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_SUN_TLS"
+{
+  output_asm_insn
+    ("mov{q}\t{%%fs:0, %0|%0, QWORD PTR fs:0}", operands);
+  return "add{q}\t{%a1@gottpoff(%%rip), %0|%0, %a1@gottpoff[rip]}";
+}
+  [(set_attr "type" "multi")])
+
+;; GNU2 TLS patterns can be split.
+
+(define_expand "tls_dynamic_gnu2_32"
+  [(set (match_dup 3)
+	(plus:SI (match_operand:SI 2 "register_operand")
+		 (const:SI
+		  (unspec:SI [(match_operand 1 "tls_symbolic_operand")]
+			     UNSPEC_TLSDESC))))
+   (parallel
+    [(set (match_operand:SI 0 "register_operand")
+	  (unspec:SI [(match_dup 1) (match_dup 3)
+		      (match_dup 2) (reg:SI SP_REG)]
+		      UNSPEC_TLSDESC))
+     (clobber (reg:CC FLAGS_REG))])]
+  "!TARGET_64BIT && TARGET_GNU2_TLS"
+{
+  operands[3] = can_create_pseudo_p () ? gen_reg_rtx (Pmode) : operands[0];
+  ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
+
+(define_insn "*tls_dynamic_gnu2_lea_32"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(plus:SI (match_operand:SI 1 "register_operand" "b")
+		 (const:SI
+		  (unspec:SI [(match_operand 2 "tls_symbolic_operand")]
+			      UNSPEC_TLSDESC))))]
+  "!TARGET_64BIT && TARGET_GNU2_TLS"
+  "lea{l}\t{%E2@TLSDESC(%1), %0|%0, %E2@TLSDESC[%1]}"
+  [(set_attr "type" "lea")
+   (set_attr "mode" "SI")
+   (set_attr "length" "6")
+   (set_attr "length_address" "4")])
+
+(define_insn "*tls_dynamic_gnu2_call_32"
+  [(set (match_operand:SI 0 "register_operand" "=a")
+	(unspec:SI [(match_operand 1 "tls_symbolic_operand")
+		    (match_operand:SI 2 "register_operand" "0")
+		    ;; we have to make sure %ebx still points to the GOT
+		    (match_operand:SI 3 "register_operand" "b")
+		    (reg:SI SP_REG)]
+		   UNSPEC_TLSDESC))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_64BIT && TARGET_GNU2_TLS"
+  "call\t{*%a1@TLSCALL(%2)|[DWORD PTR [%2+%a1@TLSCALL]]}"
+  [(set_attr "type" "call")
+   (set_attr "length" "2")
+   (set_attr "length_address" "0")])
+
+(define_insn_and_split "*tls_dynamic_gnu2_combine_32"
+  [(set (match_operand:SI 0 "register_operand" "=&a")
+	(plus:SI
+	 (unspec:SI [(match_operand 3 "tls_modbase_operand")
+		     (match_operand:SI 4)
+		     (match_operand:SI 2 "register_operand" "b")
+		     (reg:SI SP_REG)]
+		    UNSPEC_TLSDESC)
+	 (const:SI (unspec:SI
+		    [(match_operand 1 "tls_symbolic_operand")]
+		    UNSPEC_DTPOFF))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_64BIT && TARGET_GNU2_TLS"
+  "#"
+  ""
+  [(set (match_dup 0) (match_dup 5))]
+{
+  operands[5] = can_create_pseudo_p () ? gen_reg_rtx (Pmode) : operands[0];
+  emit_insn (gen_tls_dynamic_gnu2_32 (operands[5], operands[1], operands[2]));
+})
+
+(define_expand "tls_dynamic_gnu2_64"
+  [(set (match_dup 2)
+	(unspec:DI [(match_operand 1 "tls_symbolic_operand")]
+		   UNSPEC_TLSDESC))
+   (parallel
+    [(set (match_operand:DI 0 "register_operand")
+	  (unspec:DI [(match_dup 1) (match_dup 2) (reg:DI SP_REG)]
+		     UNSPEC_TLSDESC))
+     (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_64BIT && TARGET_GNU2_TLS"
+{
+  operands[2] = can_create_pseudo_p () ? gen_reg_rtx (Pmode) : operands[0];
+  ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
+
+(define_insn "*tls_dynamic_gnu2_lea_64"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec:DI [(match_operand 1 "tls_symbolic_operand")]
+		   UNSPEC_TLSDESC))]
+  "TARGET_64BIT && TARGET_GNU2_TLS"
+  "lea{q}\t{%E1@TLSDESC(%%rip), %0|%0, %E1@TLSDESC[rip]}"
+  [(set_attr "type" "lea")
+   (set_attr "mode" "DI")
+   (set_attr "length" "7")
+   (set_attr "length_address" "4")])
+
+(define_insn "*tls_dynamic_gnu2_call_64"
+  [(set (match_operand:DI 0 "register_operand" "=a")
+	(unspec:DI [(match_operand 1 "tls_symbolic_operand")
+		    (match_operand:DI 2 "register_operand" "0")
+		    (reg:DI SP_REG)]
+		   UNSPEC_TLSDESC))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_GNU2_TLS"
+  "call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}"
+  [(set_attr "type" "call")
+   (set_attr "length" "2")
+   (set_attr "length_address" "0")])
+
+(define_insn_and_split "*tls_dynamic_gnu2_combine_64"
+  [(set (match_operand:DI 0 "register_operand" "=&a")
+	(plus:DI
+	 (unspec:DI [(match_operand 2 "tls_modbase_operand")
+		     (match_operand:DI 3)
+		     (reg:DI SP_REG)]
+		    UNSPEC_TLSDESC)
+	 (const:DI (unspec:DI
+		    [(match_operand 1 "tls_symbolic_operand")]
+		    UNSPEC_DTPOFF))))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && TARGET_GNU2_TLS"
+  "#"
+  ""
+  [(set (match_dup 0) (match_dup 4))]
+{
+  operands[4] = can_create_pseudo_p () ? gen_reg_rtx (Pmode) : operands[0];
+  emit_insn (gen_tls_dynamic_gnu2_64 (operands[4], operands[1]));
+})
+
+;; These patterns match the binary 387 instructions for addM3, subM3,
+;; mulM3 and divM3.  There are three patterns for each of DFmode and
+;; SFmode.  The first is the normal insn, the second the same insn but
+;; with one operand a conversion, and the third the same insn but with
+;; the other operand a conversion.  The conversion may be SFmode or
+;; SImode if the target mode DFmode, but only SImode if the target mode
+;; is SFmode.
+
+;; Gcc is slightly more smart about handling normal two address instructions
+;; so use special patterns for add and mull.
+
+(define_insn "*fop_<mode>_comm_mixed"
+  [(set (match_operand:MODEF 0 "register_operand" "=f,x,x")
+	(match_operator:MODEF 3 "binary_fp_operator"
+	  [(match_operand:MODEF 1 "nonimmediate_operand" "%0,0,x")
+	   (match_operand:MODEF 2 "nonimmediate_operand" "fm,xm,xm")]))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_MIX_SSE_I387
+   && COMMUTATIVE_ARITH_P (operands[3])
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+	(if_then_else (eq_attr "alternative" "1,2")
+	   (if_then_else (match_operand:MODEF 3 "mult_operator")
+	      (const_string "ssemul")
+	      (const_string "sseadd"))
+	   (if_then_else (match_operand:MODEF 3 "mult_operator")
+	      (const_string "fmul")
+	      (const_string "fop"))))
+   (set_attr "isa" "*,noavx,avx")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fop_<mode>_comm_sse"
+  [(set (match_operand:MODEF 0 "register_operand" "=x,v")
+	(match_operator:MODEF 3 "binary_fp_operator"
+	  [(match_operand:MODEF 1 "nonimmediate_operand" "%0,v")
+	   (match_operand:MODEF 2 "nonimmediate_operand" "xm,vm")]))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+   && COMMUTATIVE_ARITH_P (operands[3])
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (if_then_else (match_operand:MODEF 3 "mult_operator")
+	   (const_string "ssemul")
+	   (const_string "sseadd")))
+   (set_attr "isa" "noavx,avx")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fop_<mode>_comm_i387"
+  [(set (match_operand:MODEF 0 "register_operand" "=f")
+	(match_operator:MODEF 3 "binary_fp_operator"
+	  [(match_operand:MODEF 1 "nonimmediate_operand" "%0")
+	   (match_operand:MODEF 2 "nonimmediate_operand" "fm")]))]
+  "TARGET_80387 && X87_ENABLE_ARITH (<MODE>mode)
+   && COMMUTATIVE_ARITH_P (operands[3])
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+	(if_then_else (match_operand:MODEF 3 "mult_operator")
+	   (const_string "fmul")
+	   (const_string "fop")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fop_<mode>_1_mixed"
+  [(set (match_operand:MODEF 0 "register_operand" "=f,f,x,x")
+	(match_operator:MODEF 3 "binary_fp_operator"
+	  [(match_operand:MODEF 1 "nonimmediate_operand" "0,fm,0,x")
+	   (match_operand:MODEF 2 "nonimmediate_operand" "fm,0,xm,xm")]))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_MIX_SSE_I387
+   && !COMMUTATIVE_ARITH_P (operands[3])
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (cond [(and (eq_attr "alternative" "2,3")
+	            (match_operand:MODEF 3 "mult_operator"))
+                 (const_string "ssemul")
+	       (and (eq_attr "alternative" "2,3")
+	            (match_operand:MODEF 3 "div_operator"))
+                 (const_string "ssediv")
+	       (eq_attr "alternative" "2,3")
+                 (const_string "sseadd")
+	       (match_operand:MODEF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:MODEF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "isa" "*,*,noavx,avx")
+   (set_attr "prefix" "orig,orig,orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*rcpsf2_sse"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+	(unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")]
+		   UNSPEC_RCP))]
+  "TARGET_SSE_MATH"
+  "%vrcpss\t{%1, %d0|%d0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "atom_sse_attr" "rcp")
+   (set_attr "btver2_sse_attr" "rcp")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SF")])
+
+(define_insn "*fop_<mode>_1_sse"
+  [(set (match_operand:MODEF 0 "register_operand" "=x,x")
+	(match_operator:MODEF 3 "binary_fp_operator"
+	  [(match_operand:MODEF 1 "register_operand" "0,x")
+	   (match_operand:MODEF 2 "nonimmediate_operand" "xm,xm")]))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+   && !COMMUTATIVE_ARITH_P (operands[3])"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (cond [(match_operand:MODEF 3 "mult_operator")
+                 (const_string "ssemul")
+	       (match_operand:MODEF 3 "div_operator")
+                 (const_string "ssediv")
+              ]
+              (const_string "sseadd")))
+   (set_attr "isa" "noavx,avx")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+;; This pattern is not fully shadowed by the pattern above.
+(define_insn "*fop_<mode>_1_i387"
+  [(set (match_operand:MODEF 0 "register_operand" "=f,f")
+	(match_operator:MODEF 3 "binary_fp_operator"
+	  [(match_operand:MODEF 1 "nonimmediate_operand" "0,fm")
+	   (match_operand:MODEF 2 "nonimmediate_operand" "fm,0")]))]
+  "TARGET_80387 && X87_ENABLE_ARITH (<MODE>mode)
+   && !(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+   && !COMMUTATIVE_ARITH_P (operands[3])
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (cond [(match_operand:MODEF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:MODEF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "mode" "<MODE>")])
+
+;; ??? Add SSE splitters for these!
+(define_insn "*fop_<MODEF:mode>_2_i387"
+  [(set (match_operand:MODEF 0 "register_operand" "=f")
+	(match_operator:MODEF 3 "binary_fp_operator"
+	  [(float:MODEF
+	     (match_operand:SWI24 1 "nonimmediate_operand" "m"))
+	   (match_operand:MODEF 2 "register_operand" "0")]))]
+  "TARGET_80387 && X87_ENABLE_FLOAT (<MODEF:MODE>mode, <SWI24:MODE>mode)
+   && !(SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH)
+   && (TARGET_USE_<SWI24:MODE>MODE_FIOP
+       || optimize_function_for_size_p (cfun))"
+  { return output_387_binary_op (insn, operands); }
+  [(set (attr "type")
+        (cond [(match_operand:MODEF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:MODEF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "fp_int_src" "true")
+   (set_attr "mode" "<SWI24:MODE>")])
+
+(define_insn "*fop_<MODEF:mode>_3_i387"
+  [(set (match_operand:MODEF 0 "register_operand" "=f")
+	(match_operator:MODEF 3 "binary_fp_operator"
+	  [(match_operand:MODEF 1 "register_operand" "0")
+	   (float:MODEF
+	     (match_operand:SWI24 2 "nonimmediate_operand" "m"))]))]
+  "TARGET_80387 && X87_ENABLE_FLOAT (<MODEF:MODE>mode, <SWI24:MODE>mode)
+   && !(SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH)
+   && (TARGET_USE_<SWI24:MODE>MODE_FIOP
+       || optimize_function_for_size_p (cfun))"
+  { return output_387_binary_op (insn, operands); }
+  [(set (attr "type")
+        (cond [(match_operand:MODEF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:MODEF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "fp_int_src" "true")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fop_df_4_i387"
+  [(set (match_operand:DF 0 "register_operand" "=f,f")
+	(match_operator:DF 3 "binary_fp_operator"
+	   [(float_extend:DF
+	     (match_operand:SF 1 "nonimmediate_operand" "fm,0"))
+	    (match_operand:DF 2 "register_operand" "0,f")]))]
+  "TARGET_80387 && X87_ENABLE_ARITH (DFmode)
+   && !(TARGET_SSE2 && TARGET_SSE_MATH)
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (cond [(match_operand:DF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:DF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "mode" "SF")])
+
+(define_insn "*fop_df_5_i387"
+  [(set (match_operand:DF 0 "register_operand" "=f,f")
+	(match_operator:DF 3 "binary_fp_operator"
+	  [(match_operand:DF 1 "register_operand" "0,f")
+	   (float_extend:DF
+	    (match_operand:SF 2 "nonimmediate_operand" "fm,0"))]))]
+  "TARGET_80387 && X87_ENABLE_ARITH (DFmode)
+   && !(TARGET_SSE2 && TARGET_SSE_MATH)"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (cond [(match_operand:DF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:DF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "mode" "SF")])
+
+(define_insn "*fop_df_6_i387"
+  [(set (match_operand:DF 0 "register_operand" "=f,f")
+	(match_operator:DF 3 "binary_fp_operator"
+	  [(float_extend:DF
+	    (match_operand:SF 1 "register_operand" "0,f"))
+	   (float_extend:DF
+	    (match_operand:SF 2 "nonimmediate_operand" "fm,0"))]))]
+  "TARGET_80387 && X87_ENABLE_ARITH (DFmode)
+   && !(TARGET_SSE2 && TARGET_SSE_MATH)"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (cond [(match_operand:DF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:DF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "mode" "SF")])
+
+(define_insn "*fop_xf_comm_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(match_operator:XF 3 "binary_fp_operator"
+			[(match_operand:XF 1 "register_operand" "%0")
+			 (match_operand:XF 2 "register_operand" "f")]))]
+  "TARGET_80387
+   && COMMUTATIVE_ARITH_P (operands[3])"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (if_then_else (match_operand:XF 3 "mult_operator")
+           (const_string "fmul")
+           (const_string "fop")))
+   (set_attr "mode" "XF")])
+
+(define_insn "*fop_xf_1_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f,f")
+	(match_operator:XF 3 "binary_fp_operator"
+			[(match_operand:XF 1 "register_operand" "0,f")
+			 (match_operand:XF 2 "register_operand" "f,0")]))]
+  "TARGET_80387
+   && !COMMUTATIVE_ARITH_P (operands[3])"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (cond [(match_operand:XF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:XF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "mode" "XF")])
+
+(define_insn "*fop_xf_2_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(match_operator:XF 3 "binary_fp_operator"
+	  [(float:XF
+	     (match_operand:SWI24 1 "nonimmediate_operand" "m"))
+	   (match_operand:XF 2 "register_operand" "0")]))]
+  "TARGET_80387
+   && (TARGET_USE_<MODE>MODE_FIOP || optimize_function_for_size_p (cfun))"
+  { return output_387_binary_op (insn, operands); }
+  [(set (attr "type")
+        (cond [(match_operand:XF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:XF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "fp_int_src" "true")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fop_xf_3_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(match_operator:XF 3 "binary_fp_operator"
+	  [(match_operand:XF 1 "register_operand" "0")
+	   (float:XF
+	     (match_operand:SWI24 2 "nonimmediate_operand" "m"))]))]
+  "TARGET_80387
+   && (TARGET_USE_<MODE>MODE_FIOP || optimize_function_for_size_p (cfun))"
+  { return output_387_binary_op (insn, operands); }
+  [(set (attr "type")
+        (cond [(match_operand:XF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:XF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "fp_int_src" "true")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fop_xf_4_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f,f")
+	(match_operator:XF 3 "binary_fp_operator"
+	   [(float_extend:XF
+	      (match_operand:MODEF 1 "nonimmediate_operand" "fm,0"))
+	    (match_operand:XF 2 "register_operand" "0,f")]))]
+  "TARGET_80387"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (cond [(match_operand:XF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:XF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fop_xf_5_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f,f")
+	(match_operator:XF 3 "binary_fp_operator"
+	  [(match_operand:XF 1 "register_operand" "0,f")
+	   (float_extend:XF
+	     (match_operand:MODEF 2 "nonimmediate_operand" "fm,0"))]))]
+  "TARGET_80387"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (cond [(match_operand:XF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:XF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fop_xf_6_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f,f")
+	(match_operator:XF 3 "binary_fp_operator"
+	  [(float_extend:XF
+	     (match_operand:MODEF 1 "register_operand" "0,f"))
+	   (float_extend:XF
+	     (match_operand:MODEF 2 "nonimmediate_operand" "fm,0"))]))]
+  "TARGET_80387"
+  "* return output_387_binary_op (insn, operands);"
+  [(set (attr "type")
+        (cond [(match_operand:XF 3 "mult_operator")
+                 (const_string "fmul")
+               (match_operand:XF 3 "div_operator")
+                 (const_string "fdiv")
+              ]
+              (const_string "fop")))
+   (set_attr "mode" "<MODE>")])
+
+;; FPU special functions.
+
+;; This pattern implements a no-op XFmode truncation for
+;; all fancy i386 XFmode math functions.
+
+(define_insn "truncxf<mode>2_i387_noop_unspec"
+  [(set (match_operand:MODEF 0 "register_operand" "=f")
+	(unspec:MODEF [(match_operand:XF 1 "register_operand" "f")]
+	UNSPEC_TRUNC_NOOP))]
+  "TARGET_USE_FANCY_MATH_387"
+  "* return output_387_reg_move (insn, operands);"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "sqrtxf2"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(sqrt:XF (match_operand:XF 1 "register_operand" "0")))]
+  "TARGET_USE_FANCY_MATH_387"
+  "fsqrt"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")
+   (set_attr "athlon_decode" "direct")
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "bdver1_decode" "direct")])
+
+(define_insn "sqrt_extend<mode>xf2_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(sqrt:XF
+	  (float_extend:XF
+	    (match_operand:MODEF 1 "register_operand" "0"))))]
+  "TARGET_USE_FANCY_MATH_387"
+  "fsqrt"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")
+   (set_attr "athlon_decode" "direct")
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "bdver1_decode" "direct")])
+
+(define_insn "*rsqrtsf2_sse"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+	(unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")]
+		   UNSPEC_RSQRT))]
+  "TARGET_SSE_MATH"
+  "%vrsqrtss\t{%1, %d0|%d0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "atom_sse_attr" "rcp")
+   (set_attr "btver2_sse_attr" "rcp")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SF")])
+
+(define_expand "rsqrtsf2"
+  [(set (match_operand:SF 0 "register_operand")
+	(unspec:SF [(match_operand:SF 1 "nonimmediate_operand")]
+		   UNSPEC_RSQRT))]
+  "TARGET_SSE_MATH"
+{
+  ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 1);
+  DONE;
+})
+
+(define_insn "*sqrt<mode>2_sse"
+  [(set (match_operand:MODEF 0 "register_operand" "=x")
+	(sqrt:MODEF
+	  (match_operand:MODEF 1 "nonimmediate_operand" "xm")))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH"
+  "%vsqrt<ssemodesuffix>\t{%1, %d0|%d0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "atom_sse_attr" "sqrt")
+   (set_attr "btver2_sse_attr" "sqrt")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")
+   (set_attr "athlon_decode" "*")
+   (set_attr "amdfam10_decode" "*")
+   (set_attr "bdver1_decode" "*")])
+
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:MODEF 0 "register_operand")
+	(sqrt:MODEF
+	  (match_operand:MODEF 1 "nonimmediate_operand")))]
+  "(TARGET_USE_FANCY_MATH_387 && X87_ENABLE_ARITH (<MODE>mode))
+   || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
+{
+  if (<MODE>mode == SFmode
+      && TARGET_SSE_MATH
+      && TARGET_RECIP_SQRT
+      && !optimize_function_for_size_p (cfun)
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 0);
+      DONE;
+    }
+
+  if (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))
+    {
+      rtx op0 = gen_reg_rtx (XFmode);
+      rtx op1 = force_reg (<MODE>mode, operands[1]);
+
+      emit_insn (gen_sqrt_extend<mode>xf2_i387 (op0, op1));
+      emit_insn (gen_truncxf<mode>2_i387_noop_unspec (operands[0], op0));
+      DONE;
+   }
+})
+
+(define_insn "fpremxf4_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(match_operand:XF 2 "register_operand" "0")
+		    (match_operand:XF 3 "register_operand" "1")]
+		   UNSPEC_FPREM_F))
+   (set (match_operand:XF 1 "register_operand" "=u")
+	(unspec:XF [(match_dup 2) (match_dup 3)]
+		   UNSPEC_FPREM_U))
+   (set (reg:CCFP FPSR_REG)
+	(unspec:CCFP [(match_dup 2) (match_dup 3)]
+		     UNSPEC_C2_FLAG))]
+  "TARGET_USE_FANCY_MATH_387"
+  "fprem"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_expand "fmodxf3"
+  [(use (match_operand:XF 0 "register_operand"))
+   (use (match_operand:XF 1 "general_operand"))
+   (use (match_operand:XF 2 "general_operand"))]
+  "TARGET_USE_FANCY_MATH_387"
+{
+  rtx label = gen_label_rtx ();
+
+  rtx op1 = gen_reg_rtx (XFmode);
+  rtx op2 = gen_reg_rtx (XFmode);
+
+  emit_move_insn (op2, operands[2]);
+  emit_move_insn (op1, operands[1]);
+
+  emit_label (label);
+  emit_insn (gen_fpremxf4_i387 (op1, op2, op1, op2));
+  ix86_emit_fp_unordered_jump (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operands[0], op1);
+  DONE;
+})
+
+(define_expand "fmod<mode>3"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "general_operand"))
+   (use (match_operand:MODEF 2 "general_operand"))]
+  "TARGET_USE_FANCY_MATH_387"
+{
+  rtx (*gen_truncxf) (rtx, rtx);
+
+  rtx label = gen_label_rtx ();
+
+  rtx op1 = gen_reg_rtx (XFmode);
+  rtx op2 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_extend<mode>xf2 (op2, operands[2]));
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+
+  emit_label (label);
+  emit_insn (gen_fpremxf4_i387 (op1, op2, op1, op2));
+  ix86_emit_fp_unordered_jump (label);
+  LABEL_NUSES (label) = 1;
+
+  /* Truncate the result properly for strict SSE math.  */
+  if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+      && !TARGET_MIX_SSE_I387)
+    gen_truncxf = gen_truncxf<mode>2;
+  else
+    gen_truncxf = gen_truncxf<mode>2_i387_noop_unspec;
+
+  emit_insn (gen_truncxf (operands[0], op1));
+  DONE;
+})
+
+(define_insn "fprem1xf4_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(match_operand:XF 2 "register_operand" "0")
+		    (match_operand:XF 3 "register_operand" "1")]
+		   UNSPEC_FPREM1_F))
+   (set (match_operand:XF 1 "register_operand" "=u")
+	(unspec:XF [(match_dup 2) (match_dup 3)]
+		   UNSPEC_FPREM1_U))
+   (set (reg:CCFP FPSR_REG)
+	(unspec:CCFP [(match_dup 2) (match_dup 3)]
+		     UNSPEC_C2_FLAG))]
+  "TARGET_USE_FANCY_MATH_387"
+  "fprem1"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_expand "remainderxf3"
+  [(use (match_operand:XF 0 "register_operand"))
+   (use (match_operand:XF 1 "general_operand"))
+   (use (match_operand:XF 2 "general_operand"))]
+  "TARGET_USE_FANCY_MATH_387"
+{
+  rtx label = gen_label_rtx ();
+
+  rtx op1 = gen_reg_rtx (XFmode);
+  rtx op2 = gen_reg_rtx (XFmode);
+
+  emit_move_insn (op2, operands[2]);
+  emit_move_insn (op1, operands[1]);
+
+  emit_label (label);
+  emit_insn (gen_fprem1xf4_i387 (op1, op2, op1, op2));
+  ix86_emit_fp_unordered_jump (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operands[0], op1);
+  DONE;
+})
+
+(define_expand "remainder<mode>3"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "general_operand"))
+   (use (match_operand:MODEF 2 "general_operand"))]
+  "TARGET_USE_FANCY_MATH_387"
+{
+  rtx (*gen_truncxf) (rtx, rtx);
+
+  rtx label = gen_label_rtx ();
+
+  rtx op1 = gen_reg_rtx (XFmode);
+  rtx op2 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_extend<mode>xf2 (op2, operands[2]));
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+
+  emit_label (label);
+
+  emit_insn (gen_fprem1xf4_i387 (op1, op2, op1, op2));
+  ix86_emit_fp_unordered_jump (label);
+  LABEL_NUSES (label) = 1;
+
+  /* Truncate the result properly for strict SSE math.  */
+  if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+      && !TARGET_MIX_SSE_I387)
+    gen_truncxf = gen_truncxf<mode>2;
+  else
+    gen_truncxf = gen_truncxf<mode>2_i387_noop_unspec;
+
+  emit_insn (gen_truncxf (operands[0], op1));
+  DONE;
+})
+
+(define_int_iterator SINCOS
+	[UNSPEC_SIN
+	 UNSPEC_COS])
+
+(define_int_attr sincos
+	[(UNSPEC_SIN "sin")
+	 (UNSPEC_COS "cos")])
+
+(define_insn "*<sincos>xf2_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(match_operand:XF 1 "register_operand" "0")]
+		   SINCOS))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "f<sincos>"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_insn "*<sincos>_extend<mode>xf2_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(float_extend:XF
+		      (match_operand:MODEF 1 "register_operand" "0"))]
+		   SINCOS))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+  "f<sincos>"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+;; When sincos pattern is defined, sin and cos builtin functions will be
+;; expanded to sincos pattern with one of its outputs left unused.
+;; CSE pass will figure out if two sincos patterns can be combined,
+;; otherwise sincos pattern will be split back to sin or cos pattern,
+;; depending on the unused output.
+
+(define_insn "sincosxf3"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(match_operand:XF 2 "register_operand" "0")]
+		   UNSPEC_SINCOS_COS))
+   (set (match_operand:XF 1 "register_operand" "=u")
+        (unspec:XF [(match_dup 2)] UNSPEC_SINCOS_SIN))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "fsincos"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_split
+  [(set (match_operand:XF 0 "register_operand")
+	(unspec:XF [(match_operand:XF 2 "register_operand")]
+		   UNSPEC_SINCOS_COS))
+   (set (match_operand:XF 1 "register_operand")
+	(unspec:XF [(match_dup 2)] UNSPEC_SINCOS_SIN))]
+  "find_regno_note (insn, REG_UNUSED, REGNO (operands[0]))
+   && can_create_pseudo_p ()"
+  [(set (match_dup 1) (unspec:XF [(match_dup 2)] UNSPEC_SIN))])
+
+(define_split
+  [(set (match_operand:XF 0 "register_operand")
+	(unspec:XF [(match_operand:XF 2 "register_operand")]
+		   UNSPEC_SINCOS_COS))
+   (set (match_operand:XF 1 "register_operand")
+	(unspec:XF [(match_dup 2)] UNSPEC_SINCOS_SIN))]
+  "find_regno_note (insn, REG_UNUSED, REGNO (operands[1]))
+   && can_create_pseudo_p ()"
+  [(set (match_dup 0) (unspec:XF [(match_dup 2)] UNSPEC_COS))])
+
+(define_insn "sincos_extend<mode>xf3_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(float_extend:XF
+		      (match_operand:MODEF 2 "register_operand" "0"))]
+		   UNSPEC_SINCOS_COS))
+   (set (match_operand:XF 1 "register_operand" "=u")
+        (unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_SINCOS_SIN))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+  "fsincos"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_split
+  [(set (match_operand:XF 0 "register_operand")
+	(unspec:XF [(float_extend:XF
+		      (match_operand:MODEF 2 "register_operand"))]
+		   UNSPEC_SINCOS_COS))
+   (set (match_operand:XF 1 "register_operand")
+	(unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_SINCOS_SIN))]
+  "find_regno_note (insn, REG_UNUSED, REGNO (operands[0]))
+   && can_create_pseudo_p ()"
+  [(set (match_dup 1)
+	(unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_SIN))])
+
+(define_split
+  [(set (match_operand:XF 0 "register_operand")
+	(unspec:XF [(float_extend:XF
+		      (match_operand:MODEF 2 "register_operand"))]
+		   UNSPEC_SINCOS_COS))
+   (set (match_operand:XF 1 "register_operand")
+	(unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_SINCOS_SIN))]
+  "find_regno_note (insn, REG_UNUSED, REGNO (operands[1]))
+   && can_create_pseudo_p ()"
+  [(set (match_dup 0)
+	(unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_COS))])
+
+(define_expand "sincos<mode>3"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))
+   (use (match_operand:MODEF 2 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+  rtx op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_sincos_extend<mode>xf3_i387 (op0, op1, operands[2]));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[1], op1));
+  DONE;
+})
+
+(define_insn "fptanxf4_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(match_operand:XF 3 "const_double_operand" "F"))
+   (set (match_operand:XF 1 "register_operand" "=u")
+        (unspec:XF [(match_operand:XF 2 "register_operand" "0")]
+		   UNSPEC_TAN))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations
+   && standard_80387_constant_p (operands[3]) == 2"
+  "fptan"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_insn "fptan_extend<mode>xf4_i387"
+  [(set (match_operand:MODEF 0 "register_operand" "=f")
+	(match_operand:MODEF 3 "const_double_operand" "F"))
+   (set (match_operand:XF 1 "register_operand" "=u")
+        (unspec:XF [(float_extend:XF
+		      (match_operand:MODEF 2 "register_operand" "0"))]
+		   UNSPEC_TAN))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations
+   && standard_80387_constant_p (operands[3]) == 2"
+  "fptan"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_expand "tanxf2"
+  [(use (match_operand:XF 0 "register_operand"))
+   (use (match_operand:XF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  rtx one = gen_reg_rtx (XFmode);
+  rtx op2 = CONST1_RTX (XFmode); /* fld1 */
+
+  emit_insn (gen_fptanxf4_i387 (one, operands[0], operands[1], op2));
+  DONE;
+})
+
+(define_expand "tan<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+
+  rtx one = gen_reg_rtx (<MODE>mode);
+  rtx op2 = CONST1_RTX (<MODE>mode); /* fld1 */
+
+  emit_insn (gen_fptan_extend<mode>xf4_i387 (one, op0,
+					     operands[1], op2));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_insn "*fpatanxf3_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+        (unspec:XF [(match_operand:XF 1 "register_operand" "0")
+	            (match_operand:XF 2 "register_operand" "u")]
+	           UNSPEC_FPATAN))
+   (clobber (match_scratch:XF 3 "=2"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "fpatan"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_insn "fpatan_extend<mode>xf3_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+        (unspec:XF [(float_extend:XF
+		      (match_operand:MODEF 1 "register_operand" "0"))
+		    (float_extend:XF
+		      (match_operand:MODEF 2 "register_operand" "u"))]
+	           UNSPEC_FPATAN))
+   (clobber (match_scratch:XF 3 "=2"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+  "fpatan"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_expand "atan2xf3"
+  [(parallel [(set (match_operand:XF 0 "register_operand")
+		   (unspec:XF [(match_operand:XF 2 "register_operand")
+			       (match_operand:XF 1 "register_operand")]
+			      UNSPEC_FPATAN))
+	      (clobber (match_scratch:XF 3))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations")
+
+(define_expand "atan2<mode>3"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))
+   (use (match_operand:MODEF 2 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_fpatan_extend<mode>xf3_i387 (op0, operands[2], operands[1]));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "atanxf2"
+  [(parallel [(set (match_operand:XF 0 "register_operand")
+		   (unspec:XF [(match_dup 2)
+			       (match_operand:XF 1 "register_operand")]
+			      UNSPEC_FPATAN))
+	      (clobber (match_scratch:XF 3))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  operands[2] = gen_reg_rtx (XFmode);
+  emit_move_insn (operands[2], CONST1_RTX (XFmode));  /* fld1 */
+})
+
+(define_expand "atan<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+
+  rtx op2 = gen_reg_rtx (<MODE>mode);
+  emit_move_insn (op2, CONST1_RTX (<MODE>mode));  /* fld1 */
+
+  emit_insn (gen_fpatan_extend<mode>xf3_i387 (op0, op2, operands[1]));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "asinxf2"
+  [(set (match_dup 2)
+	(mult:XF (match_operand:XF 1 "register_operand")
+		 (match_dup 1)))
+   (set (match_dup 4) (minus:XF (match_dup 3) (match_dup 2)))
+   (set (match_dup 5) (sqrt:XF (match_dup 4)))
+   (parallel [(set (match_operand:XF 0 "register_operand")
+        	   (unspec:XF [(match_dup 5) (match_dup 1)]
+			      UNSPEC_FPATAN))
+   	      (clobber (match_scratch:XF 6))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  int i;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  for (i = 2; i < 6; i++)
+    operands[i] = gen_reg_rtx (XFmode);
+
+  emit_move_insn (operands[3], CONST1_RTX (XFmode));  /* fld1 */
+})
+
+(define_expand "asin<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "general_operand"))]
+ "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+  rtx op1 = gen_reg_rtx (XFmode);
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+  emit_insn (gen_asinxf2 (op0, op1));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "acosxf2"
+  [(set (match_dup 2)
+	(mult:XF (match_operand:XF 1 "register_operand")
+		 (match_dup 1)))
+   (set (match_dup 4) (minus:XF (match_dup 3) (match_dup 2)))
+   (set (match_dup 5) (sqrt:XF (match_dup 4)))
+   (parallel [(set (match_operand:XF 0 "register_operand")
+        	   (unspec:XF [(match_dup 1) (match_dup 5)]
+			      UNSPEC_FPATAN))
+   	      (clobber (match_scratch:XF 6))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  int i;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  for (i = 2; i < 6; i++)
+    operands[i] = gen_reg_rtx (XFmode);
+
+  emit_move_insn (operands[3], CONST1_RTX (XFmode));  /* fld1 */
+})
+
+(define_expand "acos<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "general_operand"))]
+ "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+  rtx op1 = gen_reg_rtx (XFmode);
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+  emit_insn (gen_acosxf2 (op0, op1));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_insn "fyl2xxf3_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+        (unspec:XF [(match_operand:XF 1 "register_operand" "0")
+		    (match_operand:XF 2 "register_operand" "u")]
+	           UNSPEC_FYL2X))
+   (clobber (match_scratch:XF 3 "=2"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "fyl2x"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_insn "fyl2x_extend<mode>xf3_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+        (unspec:XF [(float_extend:XF
+		      (match_operand:MODEF 1 "register_operand" "0"))
+		    (match_operand:XF 2 "register_operand" "u")]
+	           UNSPEC_FYL2X))
+   (clobber (match_scratch:XF 3 "=2"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+  "fyl2x"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_expand "logxf2"
+  [(parallel [(set (match_operand:XF 0 "register_operand")
+		   (unspec:XF [(match_operand:XF 1 "register_operand")
+			       (match_dup 2)] UNSPEC_FYL2X))
+	      (clobber (match_scratch:XF 3))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  operands[2] = gen_reg_rtx (XFmode);
+  emit_move_insn (operands[2], standard_80387_constant_rtx (4)); /* fldln2 */
+})
+
+(define_expand "log<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+
+  rtx op2 = gen_reg_rtx (XFmode);
+  emit_move_insn (op2, standard_80387_constant_rtx (4)); /* fldln2 */
+
+  emit_insn (gen_fyl2x_extend<mode>xf3_i387 (op0, operands[1], op2));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "log10xf2"
+  [(parallel [(set (match_operand:XF 0 "register_operand")
+		   (unspec:XF [(match_operand:XF 1 "register_operand")
+			       (match_dup 2)] UNSPEC_FYL2X))
+	      (clobber (match_scratch:XF 3))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  operands[2] = gen_reg_rtx (XFmode);
+  emit_move_insn (operands[2], standard_80387_constant_rtx (3)); /* fldlg2 */
+})
+
+(define_expand "log10<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+
+  rtx op2 = gen_reg_rtx (XFmode);
+  emit_move_insn (op2, standard_80387_constant_rtx (3)); /* fldlg2 */
+
+  emit_insn (gen_fyl2x_extend<mode>xf3_i387 (op0, operands[1], op2));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "log2xf2"
+  [(parallel [(set (match_operand:XF 0 "register_operand")
+		   (unspec:XF [(match_operand:XF 1 "register_operand")
+			       (match_dup 2)] UNSPEC_FYL2X))
+	      (clobber (match_scratch:XF 3))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  operands[2] = gen_reg_rtx (XFmode);
+  emit_move_insn (operands[2], CONST1_RTX (XFmode)); /* fld1 */
+})
+
+(define_expand "log2<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+
+  rtx op2 = gen_reg_rtx (XFmode);
+  emit_move_insn (op2, CONST1_RTX (XFmode)); /* fld1 */
+
+  emit_insn (gen_fyl2x_extend<mode>xf3_i387 (op0, operands[1], op2));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_insn "fyl2xp1xf3_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+        (unspec:XF [(match_operand:XF 1 "register_operand" "0")
+		    (match_operand:XF 2 "register_operand" "u")]
+	           UNSPEC_FYL2XP1))
+   (clobber (match_scratch:XF 3 "=2"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "fyl2xp1"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_insn "fyl2xp1_extend<mode>xf3_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+        (unspec:XF [(float_extend:XF
+		      (match_operand:MODEF 1 "register_operand" "0"))
+		    (match_operand:XF 2 "register_operand" "u")]
+	           UNSPEC_FYL2XP1))
+   (clobber (match_scratch:XF 3 "=2"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+  "fyl2xp1"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_expand "log1pxf2"
+  [(use (match_operand:XF 0 "register_operand"))
+   (use (match_operand:XF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  ix86_emit_i387_log1p (operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "log1p<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op0 = gen_reg_rtx (XFmode);
+
+  operands[1] = gen_rtx_FLOAT_EXTEND (XFmode, operands[1]);
+
+  ix86_emit_i387_log1p (op0, operands[1]);
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_insn "fxtractxf3_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(match_operand:XF 2 "register_operand" "0")]
+		   UNSPEC_XTRACT_FRACT))
+   (set (match_operand:XF 1 "register_operand" "=u")
+        (unspec:XF [(match_dup 2)] UNSPEC_XTRACT_EXP))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "fxtract"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_insn "fxtract_extend<mode>xf3_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(float_extend:XF
+		      (match_operand:MODEF 2 "register_operand" "0"))]
+		   UNSPEC_XTRACT_FRACT))
+   (set (match_operand:XF 1 "register_operand" "=u")
+        (unspec:XF [(float_extend:XF (match_dup 2))] UNSPEC_XTRACT_EXP))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+  "fxtract"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_expand "logbxf2"
+  [(parallel [(set (match_dup 2)
+		   (unspec:XF [(match_operand:XF 1 "register_operand")]
+			      UNSPEC_XTRACT_FRACT))
+	      (set (match_operand:XF 0 "register_operand")
+		   (unspec:XF [(match_dup 1)] UNSPEC_XTRACT_EXP))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "operands[2] = gen_reg_rtx (XFmode);")
+
+(define_expand "logb<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+  rtx op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_fxtract_extend<mode>xf3_i387 (op0, op1, operands[1]));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op1));
+  DONE;
+})
+
+(define_expand "ilogbxf2"
+  [(use (match_operand:SI 0 "register_operand"))
+   (use (match_operand:XF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0, op1;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op0 = gen_reg_rtx (XFmode);
+  op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_fxtractxf3_i387 (op0, op1, operands[1]));
+  emit_insn (gen_fix_truncxfsi2 (operands[0], op1));
+  DONE;
+})
+
+(define_expand "ilogb<mode>2"
+  [(use (match_operand:SI 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0, op1;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op0 = gen_reg_rtx (XFmode);
+  op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_fxtract_extend<mode>xf3_i387 (op0, op1, operands[1]));
+  emit_insn (gen_fix_truncxfsi2 (operands[0], op1));
+  DONE;
+})
+
+(define_insn "*f2xm1xf2_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(match_operand:XF 1 "register_operand" "0")]
+		   UNSPEC_F2XM1))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "f2xm1"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_insn "fscalexf4_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(match_operand:XF 2 "register_operand" "0")
+		    (match_operand:XF 3 "register_operand" "1")]
+		   UNSPEC_FSCALE_FRACT))
+   (set (match_operand:XF 1 "register_operand" "=u")
+	(unspec:XF [(match_dup 2) (match_dup 3)]
+		   UNSPEC_FSCALE_EXP))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "fscale"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_expand "expNcorexf3"
+  [(set (match_dup 3) (mult:XF (match_operand:XF 1 "register_operand")
+			       (match_operand:XF 2 "register_operand")))
+   (set (match_dup 4) (unspec:XF [(match_dup 3)] UNSPEC_FRNDINT))
+   (set (match_dup 5) (minus:XF (match_dup 3) (match_dup 4)))
+   (set (match_dup 6) (unspec:XF [(match_dup 5)] UNSPEC_F2XM1))
+   (set (match_dup 8) (plus:XF (match_dup 6) (match_dup 7)))
+   (parallel [(set (match_operand:XF 0 "register_operand")
+		   (unspec:XF [(match_dup 8) (match_dup 4)]
+			      UNSPEC_FSCALE_FRACT))
+	      (set (match_dup 9)
+		   (unspec:XF [(match_dup 8) (match_dup 4)]
+			      UNSPEC_FSCALE_EXP))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  int i;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  for (i = 3; i < 10; i++)
+    operands[i] = gen_reg_rtx (XFmode);
+
+  emit_move_insn (operands[7], CONST1_RTX (XFmode));  /* fld1 */
+})
+
+(define_expand "expxf2"
+  [(use (match_operand:XF 0 "register_operand"))
+   (use (match_operand:XF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  rtx op2;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op2 = gen_reg_rtx (XFmode);
+  emit_move_insn (op2, standard_80387_constant_rtx (5)); /* fldl2e */
+
+  emit_insn (gen_expNcorexf3 (operands[0], operands[1], op2));
+  DONE;
+})
+
+(define_expand "exp<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "general_operand"))]
+ "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0, op1;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op0 = gen_reg_rtx (XFmode);
+  op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+  emit_insn (gen_expxf2 (op0, op1));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "exp10xf2"
+  [(use (match_operand:XF 0 "register_operand"))
+   (use (match_operand:XF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  rtx op2;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op2 = gen_reg_rtx (XFmode);
+  emit_move_insn (op2, standard_80387_constant_rtx (6)); /* fldl2t */
+
+  emit_insn (gen_expNcorexf3 (operands[0], operands[1], op2));
+  DONE;
+})
+
+(define_expand "exp10<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "general_operand"))]
+ "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0, op1;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op0 = gen_reg_rtx (XFmode);
+  op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+  emit_insn (gen_exp10xf2 (op0, op1));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "exp2xf2"
+  [(use (match_operand:XF 0 "register_operand"))
+   (use (match_operand:XF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  rtx op2;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op2 = gen_reg_rtx (XFmode);
+  emit_move_insn (op2, CONST1_RTX (XFmode));  /* fld1 */
+
+  emit_insn (gen_expNcorexf3 (operands[0], operands[1], op2));
+  DONE;
+})
+
+(define_expand "exp2<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "general_operand"))]
+ "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0, op1;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op0 = gen_reg_rtx (XFmode);
+  op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+  emit_insn (gen_exp2xf2 (op0, op1));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "expm1xf2"
+  [(set (match_dup 3) (mult:XF (match_operand:XF 1 "register_operand")
+			       (match_dup 2)))
+   (set (match_dup 4) (unspec:XF [(match_dup 3)] UNSPEC_FRNDINT))
+   (set (match_dup 5) (minus:XF (match_dup 3) (match_dup 4)))
+   (set (match_dup 9) (float_extend:XF (match_dup 13)))
+   (set (match_dup 6) (unspec:XF [(match_dup 5)] UNSPEC_F2XM1))
+   (parallel [(set (match_dup 7)
+		   (unspec:XF [(match_dup 6) (match_dup 4)]
+			      UNSPEC_FSCALE_FRACT))
+	      (set (match_dup 8)
+		   (unspec:XF [(match_dup 6) (match_dup 4)]
+			      UNSPEC_FSCALE_EXP))])
+   (parallel [(set (match_dup 10)
+		   (unspec:XF [(match_dup 9) (match_dup 8)]
+			      UNSPEC_FSCALE_FRACT))
+	      (set (match_dup 11)
+		   (unspec:XF [(match_dup 9) (match_dup 8)]
+			      UNSPEC_FSCALE_EXP))])
+   (set (match_dup 12) (minus:XF (match_dup 10)
+				 (float_extend:XF (match_dup 13))))
+   (set (match_operand:XF 0 "register_operand")
+	(plus:XF (match_dup 12) (match_dup 7)))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  int i;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  for (i = 2; i < 13; i++)
+    operands[i] = gen_reg_rtx (XFmode);
+
+  operands[13]
+    = validize_mem (force_const_mem (SFmode, CONST1_RTX (SFmode))); /* fld1 */
+
+  emit_move_insn (operands[2], standard_80387_constant_rtx (5)); /* fldl2e */
+})
+
+(define_expand "expm1<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "general_operand"))]
+ "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0, op1;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op0 = gen_reg_rtx (XFmode);
+  op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+  emit_insn (gen_expm1xf2 (op0, op1));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "ldexpxf3"
+  [(match_operand:XF 0 "register_operand")
+   (match_operand:XF 1 "register_operand")
+   (match_operand:SI 2 "register_operand")]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  operands[3] = gen_reg_rtx (XFmode);
+  operands[4] = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_floatsixf2 (operands[3], operands[2]));
+  emit_insn (gen_fscalexf4_i387 (operands[0], operands[4],
+                                 operands[1], operands[3]));
+  DONE;
+})
+
+(define_expand "ldexp<mode>3"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "general_operand"))
+   (use (match_operand:SI 2 "register_operand"))]
+ "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0, op1;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op0 = gen_reg_rtx (XFmode);
+  op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+  emit_insn (gen_ldexpxf3 (op0, op1, operands[2]));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "scalbxf3"
+  [(parallel [(set (match_operand:XF 0 " register_operand")
+		   (unspec:XF [(match_operand:XF 1 "register_operand")
+			       (match_operand:XF 2 "register_operand")]
+			      UNSPEC_FSCALE_FRACT))
+	      (set (match_dup 3)
+		   (unspec:XF [(match_dup 1) (match_dup 2)]
+			      UNSPEC_FSCALE_EXP))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+{
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  operands[3] = gen_reg_rtx (XFmode);
+})
+
+(define_expand "scalb<mode>3"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "general_operand"))
+   (use (match_operand:MODEF 2 "general_operand"))]
+ "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0, op1, op2;
+
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  op0 = gen_reg_rtx (XFmode);
+  op1 = gen_reg_rtx (XFmode);
+  op2 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+  emit_insn (gen_extend<mode>xf2 (op2, operands[2]));
+  emit_insn (gen_scalbxf3 (op0, op1, op2));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+(define_expand "significandxf2"
+  [(parallel [(set (match_operand:XF 0 "register_operand")
+		   (unspec:XF [(match_operand:XF 1 "register_operand")]
+			      UNSPEC_XTRACT_FRACT))
+	      (set (match_dup 2)
+		   (unspec:XF [(match_dup 1)] UNSPEC_XTRACT_EXP))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "operands[2] = gen_reg_rtx (XFmode);")
+
+(define_expand "significand<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+  rtx op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_fxtract_extend<mode>xf3_i387 (op0, op1, operands[1]));
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+
+(define_insn "sse4_1_round<mode>2"
+  [(set (match_operand:MODEF 0 "register_operand" "=x")
+	(unspec:MODEF [(match_operand:MODEF 1 "register_operand" "x")
+		       (match_operand:SI 2 "const_0_to_15_operand" "n")]
+		      UNSPEC_ROUND))]
+  "TARGET_ROUND"
+  "%vround<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "rintxf2"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(match_operand:XF 1 "register_operand" "0")]
+		   UNSPEC_FRNDINT))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "frndint"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "XF")])
+
+(define_expand "rint<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "(TARGET_USE_FANCY_MATH_387
+    && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+	|| TARGET_MIX_SSE_I387)
+    && flag_unsafe_math_optimizations)
+   || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+       && !flag_trapping_math)"
+{
+  if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+      && !flag_trapping_math)
+    {
+      if (TARGET_ROUND)
+	emit_insn (gen_sse4_1_round<mode>2
+		   (operands[0], operands[1], GEN_INT (ROUND_MXCSR)));
+      else if (optimize_insn_for_size_p ())
+        FAIL;
+      else
+	ix86_expand_rint (operands[0], operands[1]);
+    }
+  else
+    {
+      rtx op0 = gen_reg_rtx (XFmode);
+      rtx op1 = gen_reg_rtx (XFmode);
+
+      emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+      emit_insn (gen_rintxf2 (op0, op1));
+
+      emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+    }
+  DONE;
+})
+
+(define_expand "round<mode>2"
+  [(match_operand:X87MODEF 0 "register_operand")
+   (match_operand:X87MODEF 1 "nonimmediate_operand")]
+  "(TARGET_USE_FANCY_MATH_387
+    && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+	|| TARGET_MIX_SSE_I387)
+    && flag_unsafe_math_optimizations)
+   || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+       && !flag_trapping_math && !flag_rounding_math)"
+{
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+      && !flag_trapping_math && !flag_rounding_math)
+    {
+      if (TARGET_ROUND)
+        {
+	  operands[1] = force_reg (<MODE>mode, operands[1]);
+	  ix86_expand_round_sse4 (operands[0], operands[1]);
+	}
+      else if (TARGET_64BIT || (<MODE>mode != DFmode))
+	ix86_expand_round (operands[0], operands[1]);
+      else
+	ix86_expand_rounddf_32 (operands[0], operands[1]);
+    }
+  else
+    {
+      operands[1] = force_reg (<MODE>mode, operands[1]);
+      ix86_emit_i387_round (operands[0], operands[1]);
+    }
+  DONE;
+})
+
+(define_insn_and_split "*fistdi2_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand")
+	(unspec:DI [(match_operand:XF 1 "register_operand")]
+		   UNSPEC_FIST))]
+  "TARGET_USE_FANCY_MATH_387
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (memory_operand (operands[0], VOIDmode))
+    emit_insn (gen_fistdi2 (operands[0], operands[1]));
+  else
+    {
+      operands[2] = assign_386_stack_local (DImode, SLOT_TEMP);
+      emit_insn (gen_fistdi2_with_temp (operands[0], operands[1],
+					 operands[2]));
+    }
+  DONE;
+}
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "DI")])
+
+(define_insn "fistdi2"
+  [(set (match_operand:DI 0 "memory_operand" "=m")
+	(unspec:DI [(match_operand:XF 1 "register_operand" "f")]
+		   UNSPEC_FIST))
+   (clobber (match_scratch:XF 2 "=&1f"))]
+  "TARGET_USE_FANCY_MATH_387"
+  "* return output_fix_trunc (insn, operands, false);"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "DI")])
+
+(define_insn "fistdi2_with_temp"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=m,?r")
+	(unspec:DI [(match_operand:XF 1 "register_operand" "f,f")]
+		   UNSPEC_FIST))
+   (clobber (match_operand:DI 2 "memory_operand" "=X,m"))
+   (clobber (match_scratch:XF 3 "=&1f,&1f"))]
+  "TARGET_USE_FANCY_MATH_387"
+  "#"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "DI")])
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(unspec:DI [(match_operand:XF 1 "register_operand")]
+		   UNSPEC_FIST))
+   (clobber (match_operand:DI 2 "memory_operand"))
+   (clobber (match_scratch 3))]
+  "reload_completed"
+  [(parallel [(set (match_dup 2) (unspec:DI [(match_dup 1)] UNSPEC_FIST))
+	      (clobber (match_dup 3))])
+   (set (match_dup 0) (match_dup 2))])
+
+(define_split
+  [(set (match_operand:DI 0 "memory_operand")
+	(unspec:DI [(match_operand:XF 1 "register_operand")]
+		   UNSPEC_FIST))
+   (clobber (match_operand:DI 2 "memory_operand"))
+   (clobber (match_scratch 3))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0) (unspec:DI [(match_dup 1)] UNSPEC_FIST))
+	      (clobber (match_dup 3))])])
+
+(define_insn_and_split "*fist<mode>2_1"
+  [(set (match_operand:SWI24 0 "register_operand")
+	(unspec:SWI24 [(match_operand:XF 1 "register_operand")]
+		      UNSPEC_FIST))]
+  "TARGET_USE_FANCY_MATH_387
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  operands[2] = assign_386_stack_local (<MODE>mode, SLOT_TEMP);
+  emit_insn (gen_fist<mode>2_with_temp (operands[0], operands[1],
+					operands[2]));
+  DONE;
+}
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "fist<mode>2"
+  [(set (match_operand:SWI24 0 "memory_operand" "=m")
+	(unspec:SWI24 [(match_operand:XF 1 "register_operand" "f")]
+		      UNSPEC_FIST))]
+  "TARGET_USE_FANCY_MATH_387"
+  "* return output_fix_trunc (insn, operands, false);"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "fist<mode>2_with_temp"
+  [(set (match_operand:SWI24 0 "register_operand" "=r")
+	(unspec:SWI24 [(match_operand:XF 1 "register_operand" "f")]
+		      UNSPEC_FIST))
+   (clobber (match_operand:SWI24 2 "memory_operand" "=m"))]
+  "TARGET_USE_FANCY_MATH_387"
+  "#"
+  [(set_attr "type" "fpspc")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:SWI24 0 "register_operand")
+	(unspec:SWI24 [(match_operand:XF 1 "register_operand")]
+		      UNSPEC_FIST))
+   (clobber (match_operand:SWI24 2 "memory_operand"))]
+  "reload_completed"
+  [(set (match_dup 2) (unspec:SWI24 [(match_dup 1)] UNSPEC_FIST))
+   (set (match_dup 0) (match_dup 2))])
+
+(define_split
+  [(set (match_operand:SWI24 0 "memory_operand")
+	(unspec:SWI24 [(match_operand:XF 1 "register_operand")]
+		      UNSPEC_FIST))
+   (clobber (match_operand:SWI24 2 "memory_operand"))]
+  "reload_completed"
+  [(set (match_dup 0) (unspec:SWI24 [(match_dup 1)] UNSPEC_FIST))])
+
+(define_expand "lrintxf<mode>2"
+  [(set (match_operand:SWI248x 0 "nonimmediate_operand")
+     (unspec:SWI248x [(match_operand:XF 1 "register_operand")]
+		     UNSPEC_FIST))]
+  "TARGET_USE_FANCY_MATH_387")
+
+(define_expand "lrint<MODEF:mode><SWI48:mode>2"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand")
+     (unspec:SWI48 [(match_operand:MODEF 1 "register_operand")]
+		   UNSPEC_FIX_NOTRUNC))]
+  "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH")
+
+(define_expand "lround<X87MODEF:mode><SWI248x:mode>2"
+  [(match_operand:SWI248x 0 "nonimmediate_operand")
+   (match_operand:X87MODEF 1 "register_operand")]
+  "(TARGET_USE_FANCY_MATH_387
+    && (!(SSE_FLOAT_MODE_P (<X87MODEF:MODE>mode) && TARGET_SSE_MATH)
+	|| TARGET_MIX_SSE_I387)
+    && flag_unsafe_math_optimizations)
+   || (SSE_FLOAT_MODE_P (<X87MODEF:MODE>mode) && TARGET_SSE_MATH
+       && <SWI248x:MODE>mode != HImode 
+       && ((<SWI248x:MODE>mode != DImode) || TARGET_64BIT)
+       && !flag_trapping_math && !flag_rounding_math)"
+{
+  if (optimize_insn_for_size_p ())
+    FAIL;
+
+  if (SSE_FLOAT_MODE_P (<X87MODEF:MODE>mode) && TARGET_SSE_MATH
+      && <SWI248x:MODE>mode != HImode
+      && ((<SWI248x:MODE>mode != DImode) || TARGET_64BIT)
+      && !flag_trapping_math && !flag_rounding_math)
+    ix86_expand_lround (operands[0], operands[1]);
+  else
+    ix86_emit_i387_round (operands[0], operands[1]);
+  DONE;
+})
+
+(define_int_iterator FRNDINT_ROUNDING
+	[UNSPEC_FRNDINT_FLOOR
+	 UNSPEC_FRNDINT_CEIL
+	 UNSPEC_FRNDINT_TRUNC])
+
+(define_int_iterator FIST_ROUNDING
+	[UNSPEC_FIST_FLOOR
+	 UNSPEC_FIST_CEIL])
+
+;; Base name for define_insn
+(define_int_attr rounding_insn
+	[(UNSPEC_FRNDINT_FLOOR "floor")
+	 (UNSPEC_FRNDINT_CEIL "ceil")
+	 (UNSPEC_FRNDINT_TRUNC "btrunc")
+	 (UNSPEC_FIST_FLOOR "floor")
+	 (UNSPEC_FIST_CEIL "ceil")])
+
+(define_int_attr rounding
+	[(UNSPEC_FRNDINT_FLOOR "floor")
+	 (UNSPEC_FRNDINT_CEIL "ceil")
+	 (UNSPEC_FRNDINT_TRUNC "trunc")
+	 (UNSPEC_FIST_FLOOR "floor")
+	 (UNSPEC_FIST_CEIL "ceil")])
+
+(define_int_attr ROUNDING
+	[(UNSPEC_FRNDINT_FLOOR "FLOOR")
+	 (UNSPEC_FRNDINT_CEIL "CEIL")
+	 (UNSPEC_FRNDINT_TRUNC "TRUNC")
+	 (UNSPEC_FIST_FLOOR "FLOOR")
+	 (UNSPEC_FIST_CEIL "CEIL")])
+
+;; Rounding mode control word calculation could clobber FLAGS_REG.
+(define_insn_and_split "frndintxf2_<rounding>"
+  [(set (match_operand:XF 0 "register_operand")
+	(unspec:XF [(match_operand:XF 1 "register_operand")]
+		   FRNDINT_ROUNDING))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  ix86_optimize_mode_switching[I387_<ROUNDING>] = 1;
+
+  operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED);
+  operands[3] = assign_386_stack_local (HImode, SLOT_CW_<ROUNDING>);
+
+  emit_insn (gen_frndintxf2_<rounding>_i387 (operands[0], operands[1],
+					     operands[2], operands[3]));
+  DONE;
+}
+  [(set_attr "type" "frndint")
+   (set_attr "i387_cw" "<rounding>")
+   (set_attr "mode" "XF")])
+
+(define_insn "frndintxf2_<rounding>_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(match_operand:XF 1 "register_operand" "0")]
+		   FRNDINT_ROUNDING))
+   (use (match_operand:HI 2 "memory_operand" "m"))
+   (use (match_operand:HI 3 "memory_operand" "m"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "fldcw\t%3\n\tfrndint\n\tfldcw\t%2"
+  [(set_attr "type" "frndint")
+   (set_attr "i387_cw" "<rounding>")
+   (set_attr "mode" "XF")])
+
+(define_expand "<rounding_insn>xf2"
+  [(parallel [(set (match_operand:XF 0 "register_operand")
+		   (unspec:XF [(match_operand:XF 1 "register_operand")]
+			      FRNDINT_ROUNDING))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations
+   && !optimize_insn_for_size_p ()")
+
+(define_expand "<rounding_insn><mode>2"
+  [(parallel [(set (match_operand:MODEF 0 "register_operand")
+		   (unspec:MODEF [(match_operand:MODEF 1 "register_operand")]
+				 FRNDINT_ROUNDING))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "(TARGET_USE_FANCY_MATH_387
+    && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+	|| TARGET_MIX_SSE_I387)
+    && flag_unsafe_math_optimizations)
+   || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+       && !flag_trapping_math)"
+{
+  if (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH
+      && !flag_trapping_math)
+    {
+      if (TARGET_ROUND)
+	emit_insn (gen_sse4_1_round<mode>2
+		   (operands[0], operands[1], GEN_INT (ROUND_<ROUNDING>)));
+      else if (optimize_insn_for_size_p ())
+	FAIL;
+      else if (TARGET_64BIT || (<MODE>mode != DFmode))
+	{
+	  if (ROUND_<ROUNDING> == ROUND_FLOOR)
+	    ix86_expand_floorceil (operands[0], operands[1], true);
+	  else if (ROUND_<ROUNDING> == ROUND_CEIL)
+	    ix86_expand_floorceil (operands[0], operands[1], false);
+	  else if (ROUND_<ROUNDING> == ROUND_TRUNC)
+	    ix86_expand_trunc (operands[0], operands[1]);
+	  else
+	    gcc_unreachable ();
+	}
+      else
+	{
+	  if (ROUND_<ROUNDING> == ROUND_FLOOR)
+	    ix86_expand_floorceildf_32 (operands[0], operands[1], true);
+	  else if (ROUND_<ROUNDING> == ROUND_CEIL)
+	    ix86_expand_floorceildf_32 (operands[0], operands[1], false);
+	  else if (ROUND_<ROUNDING> == ROUND_TRUNC)
+	    ix86_expand_truncdf_32 (operands[0], operands[1]);
+	  else
+	    gcc_unreachable ();
+	}
+    }
+  else
+    {
+      rtx op0, op1;
+
+      if (optimize_insn_for_size_p ())
+	FAIL;
+
+      op0 = gen_reg_rtx (XFmode);
+      op1 = gen_reg_rtx (XFmode);
+      emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+      emit_insn (gen_frndintxf2_<rounding> (op0, op1));
+
+      emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+    }
+  DONE;
+})
+
+;; Rounding mode control word calculation could clobber FLAGS_REG.
+(define_insn_and_split "frndintxf2_mask_pm"
+  [(set (match_operand:XF 0 "register_operand")
+	(unspec:XF [(match_operand:XF 1 "register_operand")]
+		   UNSPEC_FRNDINT_MASK_PM))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  ix86_optimize_mode_switching[I387_MASK_PM] = 1;
+
+  operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED);
+  operands[3] = assign_386_stack_local (HImode, SLOT_CW_MASK_PM);
+
+  emit_insn (gen_frndintxf2_mask_pm_i387 (operands[0], operands[1],
+					  operands[2], operands[3]));
+  DONE;
+}
+  [(set_attr "type" "frndint")
+   (set_attr "i387_cw" "mask_pm")
+   (set_attr "mode" "XF")])
+
+(define_insn "frndintxf2_mask_pm_i387"
+  [(set (match_operand:XF 0 "register_operand" "=f")
+	(unspec:XF [(match_operand:XF 1 "register_operand" "0")]
+		   UNSPEC_FRNDINT_MASK_PM))
+   (use (match_operand:HI 2 "memory_operand" "m"))
+   (use (match_operand:HI 3 "memory_operand" "m"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "fldcw\t%3\n\tfrndint\n\tfclex\n\tfldcw\t%2"
+  [(set_attr "type" "frndint")
+   (set_attr "i387_cw" "mask_pm")
+   (set_attr "mode" "XF")])
+
+(define_expand "nearbyintxf2"
+  [(parallel [(set (match_operand:XF 0 "register_operand")
+		   (unspec:XF [(match_operand:XF 1 "register_operand")]
+			      UNSPEC_FRNDINT_MASK_PM))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations")
+
+(define_expand "nearbyint<mode>2"
+  [(use (match_operand:MODEF 0 "register_operand"))
+   (use (match_operand:MODEF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)
+       || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations"
+{
+  rtx op0 = gen_reg_rtx (XFmode);
+  rtx op1 = gen_reg_rtx (XFmode);
+
+  emit_insn (gen_extend<mode>xf2 (op1, operands[1]));
+  emit_insn (gen_frndintxf2_mask_pm (op0, op1));
+
+  emit_insn (gen_truncxf<mode>2_i387_noop (operands[0], op0));
+  DONE;
+})
+
+;; Rounding mode control word calculation could clobber FLAGS_REG.
+(define_insn_and_split "*fist<mode>2_<rounding>_1"
+  [(set (match_operand:SWI248x 0 "nonimmediate_operand")
+	(unspec:SWI248x [(match_operand:XF 1 "register_operand")]
+			FIST_ROUNDING))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  ix86_optimize_mode_switching[I387_<ROUNDING>] = 1;
+
+  operands[2] = assign_386_stack_local (HImode, SLOT_CW_STORED);
+  operands[3] = assign_386_stack_local (HImode, SLOT_CW_<ROUNDING>);
+  if (memory_operand (operands[0], VOIDmode))
+    emit_insn (gen_fist<mode>2_<rounding> (operands[0], operands[1],
+					   operands[2], operands[3]));
+  else
+    {
+      operands[4] = assign_386_stack_local (<MODE>mode, SLOT_TEMP);
+      emit_insn (gen_fist<mode>2_<rounding>_with_temp
+		  (operands[0], operands[1], operands[2],
+		   operands[3], operands[4]));
+    }
+  DONE;
+}
+  [(set_attr "type" "fistp")
+   (set_attr "i387_cw" "<rounding>")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "fistdi2_<rounding>"
+  [(set (match_operand:DI 0 "memory_operand" "=m")
+	(unspec:DI [(match_operand:XF 1 "register_operand" "f")]
+		   FIST_ROUNDING))
+   (use (match_operand:HI 2 "memory_operand" "m"))
+   (use (match_operand:HI 3 "memory_operand" "m"))
+   (clobber (match_scratch:XF 4 "=&1f"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "* return output_fix_trunc (insn, operands, false);"
+  [(set_attr "type" "fistp")
+   (set_attr "i387_cw" "<rounding>")
+   (set_attr "mode" "DI")])
+
+(define_insn "fistdi2_<rounding>_with_temp"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=m,?r")
+	(unspec:DI [(match_operand:XF 1 "register_operand" "f,f")]
+		   FIST_ROUNDING))
+   (use (match_operand:HI 2 "memory_operand" "m,m"))
+   (use (match_operand:HI 3 "memory_operand" "m,m"))
+   (clobber (match_operand:DI 4 "memory_operand" "=X,m"))
+   (clobber (match_scratch:XF 5 "=&1f,&1f"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "#"
+  [(set_attr "type" "fistp")
+   (set_attr "i387_cw" "<rounding>")
+   (set_attr "mode" "DI")])
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(unspec:DI [(match_operand:XF 1 "register_operand")]
+		   FIST_ROUNDING))
+   (use (match_operand:HI 2 "memory_operand"))
+   (use (match_operand:HI 3 "memory_operand"))
+   (clobber (match_operand:DI 4 "memory_operand"))
+   (clobber (match_scratch 5))]
+  "reload_completed"
+  [(parallel [(set (match_dup 4)
+		   (unspec:DI [(match_dup 1)] FIST_ROUNDING))
+	      (use (match_dup 2))
+	      (use (match_dup 3))
+	      (clobber (match_dup 5))])
+   (set (match_dup 0) (match_dup 4))])
+
+(define_split
+  [(set (match_operand:DI 0 "memory_operand")
+	(unspec:DI [(match_operand:XF 1 "register_operand")]
+		   FIST_ROUNDING))
+   (use (match_operand:HI 2 "memory_operand"))
+   (use (match_operand:HI 3 "memory_operand"))
+   (clobber (match_operand:DI 4 "memory_operand"))
+   (clobber (match_scratch 5))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (unspec:DI [(match_dup 1)] FIST_ROUNDING))
+	      (use (match_dup 2))
+	      (use (match_dup 3))
+	      (clobber (match_dup 5))])])
+
+(define_insn "fist<mode>2_<rounding>"
+  [(set (match_operand:SWI24 0 "memory_operand" "=m")
+	(unspec:SWI24 [(match_operand:XF 1 "register_operand" "f")]
+		      FIST_ROUNDING))
+   (use (match_operand:HI 2 "memory_operand" "m"))
+   (use (match_operand:HI 3 "memory_operand" "m"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "* return output_fix_trunc (insn, operands, false);"
+  [(set_attr "type" "fistp")
+   (set_attr "i387_cw" "<rounding>")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "fist<mode>2_<rounding>_with_temp"
+  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=m,?r")
+	(unspec:SWI24 [(match_operand:XF 1 "register_operand" "f,f")]
+		      FIST_ROUNDING))
+   (use (match_operand:HI 2 "memory_operand" "m,m"))
+   (use (match_operand:HI 3 "memory_operand" "m,m"))
+   (clobber (match_operand:SWI24 4 "memory_operand" "=X,m"))]
+  "TARGET_USE_FANCY_MATH_387
+   && flag_unsafe_math_optimizations"
+  "#"
+  [(set_attr "type" "fistp")
+   (set_attr "i387_cw" "<rounding>")
+   (set_attr "mode" "<MODE>")])
+
+(define_split
+  [(set (match_operand:SWI24 0 "register_operand")
+	(unspec:SWI24 [(match_operand:XF 1 "register_operand")]
+		      FIST_ROUNDING))
+   (use (match_operand:HI 2 "memory_operand"))
+   (use (match_operand:HI 3 "memory_operand"))
+   (clobber (match_operand:SWI24 4 "memory_operand"))]
+  "reload_completed"
+  [(parallel [(set (match_dup 4)
+		   (unspec:SWI24 [(match_dup 1)] FIST_ROUNDING))
+	      (use (match_dup 2))
+	      (use (match_dup 3))])
+   (set (match_dup 0) (match_dup 4))])
+
+(define_split
+  [(set (match_operand:SWI24 0 "memory_operand")
+	(unspec:SWI24 [(match_operand:XF 1 "register_operand")]
+		      FIST_ROUNDING))
+   (use (match_operand:HI 2 "memory_operand"))
+   (use (match_operand:HI 3 "memory_operand"))
+   (clobber (match_operand:SWI24 4 "memory_operand"))]
+  "reload_completed"
+  [(parallel [(set (match_dup 0)
+		   (unspec:SWI24 [(match_dup 1)] FIST_ROUNDING))
+	      (use (match_dup 2))
+	      (use (match_dup 3))])])
+
+(define_expand "l<rounding_insn>xf<mode>2"
+  [(parallel [(set (match_operand:SWI248x 0 "nonimmediate_operand")
+		   (unspec:SWI248x [(match_operand:XF 1 "register_operand")]
+				   FIST_ROUNDING))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_USE_FANCY_MATH_387
+   && (!TARGET_SSE_MATH || TARGET_MIX_SSE_I387)
+   && flag_unsafe_math_optimizations")
+
+(define_expand "l<rounding_insn><MODEF:mode><SWI48:mode>2"
+  [(parallel [(set (match_operand:SWI48 0 "nonimmediate_operand")
+		   (unspec:SWI48 [(match_operand:MODEF 1 "register_operand")]
+				 FIST_ROUNDING))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "SSE_FLOAT_MODE_P (<MODEF:MODE>mode) && TARGET_SSE_MATH
+   && !flag_trapping_math"
+{
+  if (TARGET_64BIT && optimize_insn_for_size_p ())
+    FAIL;
+
+  if (ROUND_<ROUNDING> == ROUND_FLOOR)
+    ix86_expand_lfloorceil (operands[0], operands[1], true);
+  else if (ROUND_<ROUNDING> == ROUND_CEIL)
+    ix86_expand_lfloorceil (operands[0], operands[1], false);
+  else
+    gcc_unreachable ();
+
+  DONE;
+})
+
+(define_insn "fxam<mode>2_i387"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+	(unspec:HI
+	  [(match_operand:X87MODEF 1 "register_operand" "f")]
+	  UNSPEC_FXAM))]
+  "TARGET_USE_FANCY_MATH_387"
+  "fxam\n\tfnstsw\t%0"
+  [(set_attr "type" "multi")
+   (set_attr "length" "4")
+   (set_attr "unit" "i387")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn_and_split "fxam<mode>2_i387_with_temp"
+  [(set (match_operand:HI 0 "register_operand")
+	(unspec:HI
+	  [(match_operand:MODEF 1 "memory_operand")]
+	  UNSPEC_FXAM_MEM))]
+  "TARGET_USE_FANCY_MATH_387
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 2)(match_dup 1))
+   (set (match_dup 0)
+	(unspec:HI [(match_dup 2)] UNSPEC_FXAM))]
+{
+  operands[2] = gen_reg_rtx (<MODE>mode);
+
+  MEM_VOLATILE_P (operands[1]) = 1;
+}
+  [(set_attr "type" "multi")
+   (set_attr "unit" "i387")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "isinfxf2"
+  [(use (match_operand:SI 0 "register_operand"))
+   (use (match_operand:XF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && ix86_libc_has_function (function_c99_misc)"
+{
+  rtx mask = GEN_INT (0x45);
+  rtx val = GEN_INT (0x05);
+
+  rtx cond;
+
+  rtx scratch = gen_reg_rtx (HImode);
+  rtx res = gen_reg_rtx (QImode);
+
+  emit_insn (gen_fxamxf2_i387 (scratch, operands[1]));
+
+  emit_insn (gen_andqi_ext_0 (scratch, scratch, mask));
+  emit_insn (gen_cmpqi_ext_3 (scratch, val));
+  cond = gen_rtx_fmt_ee (EQ, QImode,
+			 gen_rtx_REG (CCmode, FLAGS_REG),
+			 const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, res, cond));
+  emit_insn (gen_zero_extendqisi2 (operands[0], res));
+  DONE;
+})
+
+(define_expand "isinf<mode>2"
+  [(use (match_operand:SI 0 "register_operand"))
+   (use (match_operand:MODEF 1 "nonimmediate_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && ix86_libc_has_function (function_c99_misc)
+   && !(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
+{
+  rtx mask = GEN_INT (0x45);
+  rtx val = GEN_INT (0x05);
+
+  rtx cond;
+
+  rtx scratch = gen_reg_rtx (HImode);
+  rtx res = gen_reg_rtx (QImode);
+
+  /* Remove excess precision by forcing value through memory. */
+  if (memory_operand (operands[1], VOIDmode))
+    emit_insn (gen_fxam<mode>2_i387_with_temp (scratch, operands[1]));
+  else
+    {
+      rtx temp = assign_386_stack_local (<MODE>mode, SLOT_TEMP);
+
+      emit_move_insn (temp, operands[1]);
+      emit_insn (gen_fxam<mode>2_i387_with_temp (scratch, temp));
+    }
+
+  emit_insn (gen_andqi_ext_0 (scratch, scratch, mask));
+  emit_insn (gen_cmpqi_ext_3 (scratch, val));
+  cond = gen_rtx_fmt_ee (EQ, QImode,
+			 gen_rtx_REG (CCmode, FLAGS_REG),
+			 const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, res, cond));
+  emit_insn (gen_zero_extendqisi2 (operands[0], res));
+  DONE;
+})
+
+(define_expand "signbitxf2"
+  [(use (match_operand:SI 0 "register_operand"))
+   (use (match_operand:XF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387"
+{
+  rtx scratch = gen_reg_rtx (HImode);
+
+  emit_insn (gen_fxamxf2_i387 (scratch, operands[1]));
+  emit_insn (gen_andsi3 (operands[0],
+	     gen_lowpart (SImode, scratch), GEN_INT (0x200)));
+  DONE;
+})
+
+(define_insn "movmsk_df"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI
+	  [(match_operand:DF 1 "register_operand" "x")]
+	  UNSPEC_MOVMSK))]
+  "SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH"
+  "%vmovmskpd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DF")])
+
+;; Use movmskpd in SSE mode to avoid store forwarding stall
+;; for 32bit targets and movq+shrq sequence for 64bit targets.
+(define_expand "signbitdf2"
+  [(use (match_operand:SI 0 "register_operand"))
+   (use (match_operand:DF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   || (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH)"
+{
+  if (SSE_FLOAT_MODE_P (DFmode) && TARGET_SSE_MATH)
+    {
+      emit_insn (gen_movmsk_df (operands[0], operands[1]));
+      emit_insn (gen_andsi3 (operands[0], operands[0], const1_rtx));
+    }
+  else
+    {
+      rtx scratch = gen_reg_rtx (HImode);
+
+      emit_insn (gen_fxamdf2_i387 (scratch, operands[1]));
+      emit_insn (gen_andsi3 (operands[0],
+		 gen_lowpart (SImode, scratch), GEN_INT (0x200)));
+    }
+  DONE;
+})
+
+(define_expand "signbitsf2"
+  [(use (match_operand:SI 0 "register_operand"))
+   (use (match_operand:SF 1 "register_operand"))]
+  "TARGET_USE_FANCY_MATH_387
+   && !(SSE_FLOAT_MODE_P (SFmode) && TARGET_SSE_MATH)"
+{
+  rtx scratch = gen_reg_rtx (HImode);
+
+  emit_insn (gen_fxamsf2_i387 (scratch, operands[1]));
+  emit_insn (gen_andsi3 (operands[0],
+	     gen_lowpart (SImode, scratch), GEN_INT (0x200)));
+  DONE;
+})
+
+;; Block operation instructions
+
+(define_insn "cld"
+  [(unspec_volatile [(const_int 0)] UNSPECV_CLD)]
+  ""
+  "cld"
+  [(set_attr "length" "1")
+   (set_attr "length_immediate" "0")
+   (set_attr "modrm" "0")])
+
+(define_expand "movmem<mode>"
+  [(use (match_operand:BLK 0 "memory_operand"))
+   (use (match_operand:BLK 1 "memory_operand"))
+   (use (match_operand:SWI48 2 "nonmemory_operand"))
+   (use (match_operand:SWI48 3 "const_int_operand"))
+   (use (match_operand:SI 4 "const_int_operand"))
+   (use (match_operand:SI 5 "const_int_operand"))
+   (use (match_operand:SI 6 ""))
+   (use (match_operand:SI 7 ""))
+   (use (match_operand:SI 8 ""))]
+  ""
+{
+ if (ix86_expand_set_or_movmem (operands[0], operands[1],
+			        operands[2], NULL, operands[3],
+			        operands[4], operands[5],
+				operands[6], operands[7],
+				operands[8], false))
+   DONE;
+ else
+   FAIL;
+})
+
+;; Most CPUs don't like single string operations
+;; Handle this case here to simplify previous expander.
+
+(define_expand "strmov"
+  [(set (match_dup 4) (match_operand 3 "memory_operand"))
+   (set (match_operand 1 "memory_operand") (match_dup 4))
+   (parallel [(set (match_operand 0 "register_operand") (match_dup 5))
+	      (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_operand 2 "register_operand") (match_dup 6))
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  rtx adjust = GEN_INT (GET_MODE_SIZE (GET_MODE (operands[1])));
+
+  /* If .md ever supports :P for Pmode, these can be directly
+     in the pattern above.  */
+  operands[5] = gen_rtx_PLUS (Pmode, operands[0], adjust);
+  operands[6] = gen_rtx_PLUS (Pmode, operands[2], adjust);
+
+  /* Can't use this if the user has appropriated esi or edi.  */
+  if ((TARGET_SINGLE_STRINGOP || optimize_insn_for_size_p ())
+      && !(fixed_regs[SI_REG] || fixed_regs[DI_REG]))
+    {
+      emit_insn (gen_strmov_singleop (operands[0], operands[1],
+				      operands[2], operands[3],
+				      operands[5], operands[6]));
+      DONE;
+    }
+
+  operands[4] = gen_reg_rtx (GET_MODE (operands[1]));
+})
+
+(define_expand "strmov_singleop"
+  [(parallel [(set (match_operand 1 "memory_operand")
+		   (match_operand 3 "memory_operand"))
+	      (set (match_operand 0 "register_operand")
+		   (match_operand 4))
+	      (set (match_operand 2 "register_operand")
+		   (match_operand 5))])]
+  ""
+  "ix86_current_function_needs_cld = 1;")
+
+(define_insn "*strmovdi_rex_1"
+  [(set (mem:DI (match_operand:P 2 "register_operand" "0"))
+	(mem:DI (match_operand:P 3 "register_operand" "1")))
+   (set (match_operand:P 0 "register_operand" "=D")
+	(plus:P (match_dup 2)
+		(const_int 8)))
+   (set (match_operand:P 1 "register_operand" "=S")
+	(plus:P (match_dup 3)
+		(const_int 8)))]
+  "TARGET_64BIT
+   && !(fixed_regs[SI_REG] || fixed_regs[DI_REG])"
+  "%^movsq"
+  [(set_attr "type" "str")
+   (set_attr "memory" "both")
+   (set_attr "mode" "DI")])
+
+(define_insn "*strmovsi_1"
+  [(set (mem:SI (match_operand:P 2 "register_operand" "0"))
+	(mem:SI (match_operand:P 3 "register_operand" "1")))
+   (set (match_operand:P 0 "register_operand" "=D")
+	(plus:P (match_dup 2)
+		(const_int 4)))
+   (set (match_operand:P 1 "register_operand" "=S")
+	(plus:P (match_dup 3)
+		(const_int 4)))]
+  "!(fixed_regs[SI_REG] || fixed_regs[DI_REG])"
+  "%^movs{l|d}"
+  [(set_attr "type" "str")
+   (set_attr "memory" "both")
+   (set_attr "mode" "SI")])
+
+(define_insn "*strmovhi_1"
+  [(set (mem:HI (match_operand:P 2 "register_operand" "0"))
+	(mem:HI (match_operand:P 3 "register_operand" "1")))
+   (set (match_operand:P 0 "register_operand" "=D")
+	(plus:P (match_dup 2)
+		(const_int 2)))
+   (set (match_operand:P 1 "register_operand" "=S")
+	(plus:P (match_dup 3)
+		(const_int 2)))]
+  "!(fixed_regs[SI_REG] || fixed_regs[DI_REG])"
+  "%^movsw"
+  [(set_attr "type" "str")
+   (set_attr "memory" "both")
+   (set_attr "mode" "HI")])
+
+(define_insn "*strmovqi_1"
+  [(set (mem:QI (match_operand:P 2 "register_operand" "0"))
+	(mem:QI (match_operand:P 3 "register_operand" "1")))
+   (set (match_operand:P 0 "register_operand" "=D")
+	(plus:P (match_dup 2)
+		(const_int 1)))
+   (set (match_operand:P 1 "register_operand" "=S")
+	(plus:P (match_dup 3)
+		(const_int 1)))]
+  "!(fixed_regs[SI_REG] || fixed_regs[DI_REG])"
+  "%^movsb"
+  [(set_attr "type" "str")
+   (set_attr "memory" "both")
+   (set (attr "prefix_rex")
+	(if_then_else
+	  (match_test "<P:MODE>mode == DImode")
+	  (const_string "0")
+	  (const_string "*")))
+   (set_attr "mode" "QI")])
+
+(define_expand "rep_mov"
+  [(parallel [(set (match_operand 4 "register_operand") (const_int 0))
+	      (set (match_operand 0 "register_operand")
+		   (match_operand 5))
+	      (set (match_operand 2 "register_operand")
+		   (match_operand 6))
+	      (set (match_operand 1 "memory_operand")
+		   (match_operand 3 "memory_operand"))
+	      (use (match_dup 4))])]
+  ""
+  "ix86_current_function_needs_cld = 1;")
+
+(define_insn "*rep_movdi_rex64"
+  [(set (match_operand:P 2 "register_operand" "=c") (const_int 0))
+   (set (match_operand:P 0 "register_operand" "=D")
+        (plus:P (ashift:P (match_operand:P 5 "register_operand" "2")
+			  (const_int 3))
+		(match_operand:P 3 "register_operand" "0")))
+   (set (match_operand:P 1 "register_operand" "=S")
+        (plus:P (ashift:P (match_dup 5) (const_int 3))
+		(match_operand:P 4 "register_operand" "1")))
+   (set (mem:BLK (match_dup 3))
+	(mem:BLK (match_dup 4)))
+   (use (match_dup 5))]
+  "TARGET_64BIT
+   && !(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])"
+  "%^rep{%;} movsq"
+  [(set_attr "type" "str")
+   (set_attr "prefix_rep" "1")
+   (set_attr "memory" "both")
+   (set_attr "mode" "DI")])
+
+(define_insn "*rep_movsi"
+  [(set (match_operand:P 2 "register_operand" "=c") (const_int 0))
+   (set (match_operand:P 0 "register_operand" "=D")
+        (plus:P (ashift:P (match_operand:P 5 "register_operand" "2")
+			  (const_int 2))
+		 (match_operand:P 3 "register_operand" "0")))
+   (set (match_operand:P 1 "register_operand" "=S")
+        (plus:P (ashift:P (match_dup 5) (const_int 2))
+		(match_operand:P 4 "register_operand" "1")))
+   (set (mem:BLK (match_dup 3))
+	(mem:BLK (match_dup 4)))
+   (use (match_dup 5))]
+  "!(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])"
+  "%^rep{%;} movs{l|d}"
+  [(set_attr "type" "str")
+   (set_attr "prefix_rep" "1")
+   (set_attr "memory" "both")
+   (set_attr "mode" "SI")])
+
+(define_insn "*rep_movqi"
+  [(set (match_operand:P 2 "register_operand" "=c") (const_int 0))
+   (set (match_operand:P 0 "register_operand" "=D")
+        (plus:P (match_operand:P 3 "register_operand" "0")
+		(match_operand:P 5 "register_operand" "2")))
+   (set (match_operand:P 1 "register_operand" "=S")
+        (plus:P (match_operand:P 4 "register_operand" "1") (match_dup 5)))
+   (set (mem:BLK (match_dup 3))
+	(mem:BLK (match_dup 4)))
+   (use (match_dup 5))]
+  "!(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])"
+  "%^rep{%;} movsb"
+  [(set_attr "type" "str")
+   (set_attr "prefix_rep" "1")
+   (set_attr "memory" "both")
+   (set_attr "mode" "QI")])
+
+(define_expand "setmem<mode>"
+   [(use (match_operand:BLK 0 "memory_operand"))
+    (use (match_operand:SWI48 1 "nonmemory_operand"))
+    (use (match_operand:QI 2 "nonmemory_operand"))
+    (use (match_operand 3 "const_int_operand"))
+    (use (match_operand:SI 4 "const_int_operand"))
+    (use (match_operand:SI 5 "const_int_operand"))
+    (use (match_operand:SI 6 ""))
+    (use (match_operand:SI 7 ""))
+    (use (match_operand:SI 8 ""))]
+  ""
+{
+ if (ix86_expand_set_or_movmem (operands[0], NULL,
+			        operands[1], operands[2],
+				operands[3], operands[4],
+			        operands[5], operands[6],
+				operands[7], operands[8], true))
+   DONE;
+ else
+   FAIL;
+})
+
+;; Most CPUs don't like single string operations
+;; Handle this case here to simplify previous expander.
+
+(define_expand "strset"
+  [(set (match_operand 1 "memory_operand")
+	(match_operand 2 "register_operand"))
+   (parallel [(set (match_operand 0 "register_operand")
+		   (match_dup 3))
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  if (GET_MODE (operands[1]) != GET_MODE (operands[2]))
+    operands[1] = adjust_address_nv (operands[1], GET_MODE (operands[2]), 0);
+
+  /* If .md ever supports :P for Pmode, this can be directly
+     in the pattern above.  */
+  operands[3] = gen_rtx_PLUS (Pmode, operands[0],
+			      GEN_INT (GET_MODE_SIZE (GET_MODE
+						      (operands[2]))));
+  /* Can't use this if the user has appropriated eax or edi.  */
+  if ((TARGET_SINGLE_STRINGOP || optimize_insn_for_size_p ())
+      && !(fixed_regs[AX_REG] || fixed_regs[DI_REG]))
+    {
+      emit_insn (gen_strset_singleop (operands[0], operands[1], operands[2],
+				      operands[3]));
+      DONE;
+    }
+})
+
+(define_expand "strset_singleop"
+  [(parallel [(set (match_operand 1 "memory_operand")
+		   (match_operand 2 "register_operand"))
+	      (set (match_operand 0 "register_operand")
+		   (match_operand 3))
+	      (unspec [(const_int 0)] UNSPEC_STOS)])]
+  ""
+  "ix86_current_function_needs_cld = 1;")
+
+(define_insn "*strsetdi_rex_1"
+  [(set (mem:DI (match_operand:P 1 "register_operand" "0"))
+	(match_operand:DI 2 "register_operand" "a"))
+   (set (match_operand:P 0 "register_operand" "=D")
+	(plus:P (match_dup 1)
+		(const_int 8)))
+   (unspec [(const_int 0)] UNSPEC_STOS)]
+  "TARGET_64BIT
+   && !(fixed_regs[AX_REG] || fixed_regs[DI_REG])"
+  "%^stosq"
+  [(set_attr "type" "str")
+   (set_attr "memory" "store")
+   (set_attr "mode" "DI")])
+
+(define_insn "*strsetsi_1"
+  [(set (mem:SI (match_operand:P 1 "register_operand" "0"))
+	(match_operand:SI 2 "register_operand" "a"))
+   (set (match_operand:P 0 "register_operand" "=D")
+	(plus:P (match_dup 1)
+		(const_int 4)))
+   (unspec [(const_int 0)] UNSPEC_STOS)]
+  "!(fixed_regs[AX_REG] || fixed_regs[DI_REG])"
+  "%^stos{l|d}"
+  [(set_attr "type" "str")
+   (set_attr "memory" "store")
+   (set_attr "mode" "SI")])
+
+(define_insn "*strsethi_1"
+  [(set (mem:HI (match_operand:P 1 "register_operand" "0"))
+	(match_operand:HI 2 "register_operand" "a"))
+   (set (match_operand:P 0 "register_operand" "=D")
+	(plus:P (match_dup 1)
+		(const_int 2)))
+   (unspec [(const_int 0)] UNSPEC_STOS)]
+  "!(fixed_regs[AX_REG] || fixed_regs[DI_REG])"
+  "%^stosw"
+  [(set_attr "type" "str")
+   (set_attr "memory" "store")
+   (set_attr "mode" "HI")])
+
+(define_insn "*strsetqi_1"
+  [(set (mem:QI (match_operand:P 1 "register_operand" "0"))
+	(match_operand:QI 2 "register_operand" "a"))
+   (set (match_operand:P 0 "register_operand" "=D")
+	(plus:P (match_dup 1)
+		(const_int 1)))
+   (unspec [(const_int 0)] UNSPEC_STOS)]
+  "!(fixed_regs[AX_REG] || fixed_regs[DI_REG])"
+  "%^stosb"
+  [(set_attr "type" "str")
+   (set_attr "memory" "store")
+   (set (attr "prefix_rex")
+	(if_then_else
+	  (match_test "<P:MODE>mode == DImode")
+	  (const_string "0")
+	  (const_string "*")))
+   (set_attr "mode" "QI")])
+
+(define_expand "rep_stos"
+  [(parallel [(set (match_operand 1 "register_operand") (const_int 0))
+	      (set (match_operand 0 "register_operand")
+		   (match_operand 4))
+	      (set (match_operand 2 "memory_operand") (const_int 0))
+	      (use (match_operand 3 "register_operand"))
+	      (use (match_dup 1))])]
+  ""
+  "ix86_current_function_needs_cld = 1;")
+
+(define_insn "*rep_stosdi_rex64"
+  [(set (match_operand:P 1 "register_operand" "=c") (const_int 0))
+   (set (match_operand:P 0 "register_operand" "=D")
+        (plus:P (ashift:P (match_operand:P 4 "register_operand" "1")
+			  (const_int 3))
+		 (match_operand:P 3 "register_operand" "0")))
+   (set (mem:BLK (match_dup 3))
+	(const_int 0))
+   (use (match_operand:DI 2 "register_operand" "a"))
+   (use (match_dup 4))]
+  "TARGET_64BIT
+   && !(fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])"
+  "%^rep{%;} stosq"
+  [(set_attr "type" "str")
+   (set_attr "prefix_rep" "1")
+   (set_attr "memory" "store")
+   (set_attr "mode" "DI")])
+
+(define_insn "*rep_stossi"
+  [(set (match_operand:P 1 "register_operand" "=c") (const_int 0))
+   (set (match_operand:P 0 "register_operand" "=D")
+        (plus:P (ashift:P (match_operand:P 4 "register_operand" "1")
+			  (const_int 2))
+		 (match_operand:P 3 "register_operand" "0")))
+   (set (mem:BLK (match_dup 3))
+	(const_int 0))
+   (use (match_operand:SI 2 "register_operand" "a"))
+   (use (match_dup 4))]
+  "!(fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])"
+  "%^rep{%;} stos{l|d}"
+  [(set_attr "type" "str")
+   (set_attr "prefix_rep" "1")
+   (set_attr "memory" "store")
+   (set_attr "mode" "SI")])
+
+(define_insn "*rep_stosqi"
+  [(set (match_operand:P 1 "register_operand" "=c") (const_int 0))
+   (set (match_operand:P 0 "register_operand" "=D")
+        (plus:P (match_operand:P 3 "register_operand" "0")
+		(match_operand:P 4 "register_operand" "1")))
+   (set (mem:BLK (match_dup 3))
+	(const_int 0))
+   (use (match_operand:QI 2 "register_operand" "a"))
+   (use (match_dup 4))]
+  "!(fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])"
+  "%^rep{%;} stosb"
+  [(set_attr "type" "str")
+   (set_attr "prefix_rep" "1")
+   (set_attr "memory" "store")
+   (set (attr "prefix_rex")
+	(if_then_else
+	  (match_test "<P:MODE>mode == DImode")
+	  (const_string "0")
+	  (const_string "*")))
+   (set_attr "mode" "QI")])
+
+(define_expand "cmpstrnsi"
+  [(set (match_operand:SI 0 "register_operand")
+	(compare:SI (match_operand:BLK 1 "general_operand")
+		    (match_operand:BLK 2 "general_operand")))
+   (use (match_operand 3 "general_operand"))
+   (use (match_operand 4 "immediate_operand"))]
+  ""
+{
+  rtx addr1, addr2, out, outlow, count, countreg, align;
+
+  if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
+    FAIL;
+
+  /* Can't use this if the user has appropriated ecx, esi or edi.  */
+  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
+    FAIL;
+
+  out = operands[0];
+  if (!REG_P (out))
+    out = gen_reg_rtx (SImode);
+
+  addr1 = copy_addr_to_reg (XEXP (operands[1], 0));
+  addr2 = copy_addr_to_reg (XEXP (operands[2], 0));
+  if (addr1 != XEXP (operands[1], 0))
+    operands[1] = replace_equiv_address_nv (operands[1], addr1);
+  if (addr2 != XEXP (operands[2], 0))
+    operands[2] = replace_equiv_address_nv (operands[2], addr2);
+
+  count = operands[3];
+  countreg = ix86_zero_extend_to_Pmode (count);
+
+  /* %%% Iff we are testing strict equality, we can use known alignment
+     to good advantage.  This may be possible with combine, particularly
+     once cc0 is dead.  */
+  align = operands[4];
+
+  if (CONST_INT_P (count))
+    {
+      if (INTVAL (count) == 0)
+	{
+	  emit_move_insn (operands[0], const0_rtx);
+	  DONE;
+	}
+      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countreg, align,
+				     operands[1], operands[2]));
+    }
+  else
+    {
+      rtx (*gen_cmp) (rtx, rtx);
+
+      gen_cmp = (TARGET_64BIT
+		 ? gen_cmpdi_1 : gen_cmpsi_1);
+
+      emit_insn (gen_cmp (countreg, countreg));
+      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countreg, align,
+				  operands[1], operands[2]));
+    }
+
+  outlow = gen_lowpart (QImode, out);
+  emit_insn (gen_cmpintqi (outlow));
+  emit_move_insn (out, gen_rtx_SIGN_EXTEND (SImode, outlow));
+
+  if (operands[0] != out)
+    emit_move_insn (operands[0], out);
+
+  DONE;
+})
+
+;; Produce a tri-state integer (-1, 0, 1) from condition codes.
+
+(define_expand "cmpintqi"
+  [(set (match_dup 1)
+	(gtu:QI (reg:CC FLAGS_REG) (const_int 0)))
+   (set (match_dup 2)
+	(ltu:QI (reg:CC FLAGS_REG) (const_int 0)))
+   (parallel [(set (match_operand:QI 0 "register_operand")
+		   (minus:QI (match_dup 1)
+			     (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+{
+  operands[1] = gen_reg_rtx (QImode);
+  operands[2] = gen_reg_rtx (QImode);
+})
+
+;; memcmp recognizers.  The `cmpsb' opcode does nothing if the count is
+;; zero.  Emit extra code to make sure that a zero-length compare is EQ.
+
+(define_expand "cmpstrnqi_nz_1"
+  [(parallel [(set (reg:CC FLAGS_REG)
+		   (compare:CC (match_operand 4 "memory_operand")
+			       (match_operand 5 "memory_operand")))
+	      (use (match_operand 2 "register_operand"))
+	      (use (match_operand:SI 3 "immediate_operand"))
+	      (clobber (match_operand 0 "register_operand"))
+	      (clobber (match_operand 1 "register_operand"))
+	      (clobber (match_dup 2))])]
+  ""
+  "ix86_current_function_needs_cld = 1;")
+
+(define_insn "*cmpstrnqi_nz_1"
+  [(set (reg:CC FLAGS_REG)
+	(compare:CC (mem:BLK (match_operand:P 4 "register_operand" "0"))
+		    (mem:BLK (match_operand:P 5 "register_operand" "1"))))
+   (use (match_operand:P 6 "register_operand" "2"))
+   (use (match_operand:SI 3 "immediate_operand" "i"))
+   (clobber (match_operand:P 0 "register_operand" "=S"))
+   (clobber (match_operand:P 1 "register_operand" "=D"))
+   (clobber (match_operand:P 2 "register_operand" "=c"))]
+  "!(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])"
+  "%^repz{%;} cmpsb"
+  [(set_attr "type" "str")
+   (set_attr "mode" "QI")
+   (set (attr "prefix_rex")
+	(if_then_else
+	  (match_test "<P:MODE>mode == DImode")
+	  (const_string "0")
+	  (const_string "*")))
+   (set_attr "prefix_rep" "1")])
+
+;; The same, but the count is not known to not be zero.
+
+(define_expand "cmpstrnqi_1"
+  [(parallel [(set (reg:CC FLAGS_REG)
+		(if_then_else:CC (ne (match_operand 2 "register_operand")
+				     (const_int 0))
+		  (compare:CC (match_operand 4 "memory_operand")
+			      (match_operand 5 "memory_operand"))
+		  (const_int 0)))
+	      (use (match_operand:SI 3 "immediate_operand"))
+	      (use (reg:CC FLAGS_REG))
+	      (clobber (match_operand 0 "register_operand"))
+	      (clobber (match_operand 1 "register_operand"))
+	      (clobber (match_dup 2))])]
+  ""
+  "ix86_current_function_needs_cld = 1;")
+
+(define_insn "*cmpstrnqi_1"
+  [(set (reg:CC FLAGS_REG)
+	(if_then_else:CC (ne (match_operand:P 6 "register_operand" "2")
+			     (const_int 0))
+	  (compare:CC (mem:BLK (match_operand:P 4 "register_operand" "0"))
+		      (mem:BLK (match_operand:P 5 "register_operand" "1")))
+	  (const_int 0)))
+   (use (match_operand:SI 3 "immediate_operand" "i"))
+   (use (reg:CC FLAGS_REG))
+   (clobber (match_operand:P 0 "register_operand" "=S"))
+   (clobber (match_operand:P 1 "register_operand" "=D"))
+   (clobber (match_operand:P 2 "register_operand" "=c"))]
+  "!(fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])"
+  "%^repz{%;} cmpsb"
+  [(set_attr "type" "str")
+   (set_attr "mode" "QI")
+   (set (attr "prefix_rex")
+	(if_then_else
+	  (match_test "<P:MODE>mode == DImode")
+	  (const_string "0")
+	  (const_string "*")))
+   (set_attr "prefix_rep" "1")])
+
+(define_expand "strlen<mode>"
+  [(set (match_operand:P 0 "register_operand")
+	(unspec:P [(match_operand:BLK 1 "general_operand")
+		   (match_operand:QI 2 "immediate_operand")
+		   (match_operand 3 "immediate_operand")]
+		  UNSPEC_SCAS))]
+  ""
+{
+ if (ix86_expand_strlen (operands[0], operands[1], operands[2], operands[3]))
+   DONE;
+ else
+   FAIL;
+})
+
+(define_expand "strlenqi_1"
+  [(parallel [(set (match_operand 0 "register_operand")
+		   (match_operand 2))
+	      (clobber (match_operand 1 "register_operand"))
+	      (clobber (reg:CC FLAGS_REG))])]
+  ""
+  "ix86_current_function_needs_cld = 1;")
+
+(define_insn "*strlenqi_1"
+  [(set (match_operand:P 0 "register_operand" "=&c")
+	(unspec:P [(mem:BLK (match_operand:P 5 "register_operand" "1"))
+		   (match_operand:QI 2 "register_operand" "a")
+		   (match_operand:P 3 "immediate_operand" "i")
+		   (match_operand:P 4 "register_operand" "0")] UNSPEC_SCAS))
+   (clobber (match_operand:P 1 "register_operand" "=D"))
+   (clobber (reg:CC FLAGS_REG))]
+  "!(fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])"
+  "%^repnz{%;} scasb"
+  [(set_attr "type" "str")
+   (set_attr "mode" "QI")
+   (set (attr "prefix_rex")
+	(if_then_else
+	  (match_test "<P:MODE>mode == DImode")
+	  (const_string "0")
+	  (const_string "*")))
+   (set_attr "prefix_rep" "1")])
+
+;; Peephole optimizations to clean up after cmpstrn*.  This should be
+;; handled in combine, but it is not currently up to the task.
+;; When used for their truth value, the cmpstrn* expanders generate
+;; code like this:
+;;
+;;   repz cmpsb
+;;   seta 	%al
+;;   setb 	%dl
+;;   cmpb 	%al, %dl
+;;   jcc	label
+;;
+;; The intermediate three instructions are unnecessary.
+
+;; This one handles cmpstrn*_nz_1...
+(define_peephole2
+  [(parallel[
+     (set (reg:CC FLAGS_REG)
+	  (compare:CC (mem:BLK (match_operand 4 "register_operand"))
+		      (mem:BLK (match_operand 5 "register_operand"))))
+     (use (match_operand 6 "register_operand"))
+     (use (match_operand:SI 3 "immediate_operand"))
+     (clobber (match_operand 0 "register_operand"))
+     (clobber (match_operand 1 "register_operand"))
+     (clobber (match_operand 2 "register_operand"))])
+   (set (match_operand:QI 7 "register_operand")
+	(gtu:QI (reg:CC FLAGS_REG) (const_int 0)))
+   (set (match_operand:QI 8 "register_operand")
+	(ltu:QI (reg:CC FLAGS_REG) (const_int 0)))
+   (set (reg FLAGS_REG)
+	(compare (match_dup 7) (match_dup 8)))
+  ]
+  "peep2_reg_dead_p (4, operands[7]) && peep2_reg_dead_p (4, operands[8])"
+  [(parallel[
+     (set (reg:CC FLAGS_REG)
+	  (compare:CC (mem:BLK (match_dup 4))
+		      (mem:BLK (match_dup 5))))
+     (use (match_dup 6))
+     (use (match_dup 3))
+     (clobber (match_dup 0))
+     (clobber (match_dup 1))
+     (clobber (match_dup 2))])])
+
+;; ...and this one handles cmpstrn*_1.
+(define_peephole2
+  [(parallel[
+     (set (reg:CC FLAGS_REG)
+	  (if_then_else:CC (ne (match_operand 6 "register_operand")
+			       (const_int 0))
+	    (compare:CC (mem:BLK (match_operand 4 "register_operand"))
+		        (mem:BLK (match_operand 5 "register_operand")))
+	    (const_int 0)))
+     (use (match_operand:SI 3 "immediate_operand"))
+     (use (reg:CC FLAGS_REG))
+     (clobber (match_operand 0 "register_operand"))
+     (clobber (match_operand 1 "register_operand"))
+     (clobber (match_operand 2 "register_operand"))])
+   (set (match_operand:QI 7 "register_operand")
+	(gtu:QI (reg:CC FLAGS_REG) (const_int 0)))
+   (set (match_operand:QI 8 "register_operand")
+	(ltu:QI (reg:CC FLAGS_REG) (const_int 0)))
+   (set (reg FLAGS_REG)
+	(compare (match_dup 7) (match_dup 8)))
+  ]
+  "peep2_reg_dead_p (4, operands[7]) && peep2_reg_dead_p (4, operands[8])"
+  [(parallel[
+     (set (reg:CC FLAGS_REG)
+	  (if_then_else:CC (ne (match_dup 6)
+			       (const_int 0))
+	    (compare:CC (mem:BLK (match_dup 4))
+			(mem:BLK (match_dup 5)))
+	    (const_int 0)))
+     (use (match_dup 3))
+     (use (reg:CC FLAGS_REG))
+     (clobber (match_dup 0))
+     (clobber (match_dup 1))
+     (clobber (match_dup 2))])])
+
+;; Conditional move instructions.
+
+(define_expand "mov<mode>cc"
+  [(set (match_operand:SWIM 0 "register_operand")
+	(if_then_else:SWIM (match_operand 1 "comparison_operator")
+			   (match_operand:SWIM 2 "<general_operand>")
+			   (match_operand:SWIM 3 "<general_operand>")))]
+  ""
+  "if (ix86_expand_int_movcc (operands)) DONE; else FAIL;")
+
+;; Data flow gets confused by our desire for `sbbl reg,reg', and clearing
+;; the register first winds up with `sbbl $0,reg', which is also weird.
+;; So just document what we're doing explicitly.
+
+(define_expand "x86_mov<mode>cc_0_m1"
+  [(parallel
+    [(set (match_operand:SWI48 0 "register_operand")
+	  (if_then_else:SWI48
+	    (match_operator:SWI48 2 "ix86_carry_flag_operator"
+	     [(match_operand 1 "flags_reg_operand")
+	      (const_int 0)])
+	    (const_int -1)
+	    (const_int 0)))
+     (clobber (reg:CC FLAGS_REG))])])
+
+(define_insn "*x86_mov<mode>cc_0_m1"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(if_then_else:SWI48 (match_operator 1 "ix86_carry_flag_operator"
+			     [(reg FLAGS_REG) (const_int 0)])
+	  (const_int -1)
+	  (const_int 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "sbb{<imodesuffix>}\t%0, %0"
+  ; Since we don't have the proper number of operands for an alu insn,
+  ; fill in all the blanks.
+  [(set_attr "type" "alu")
+   (set_attr "use_carry" "1")
+   (set_attr "pent_pair" "pu")
+   (set_attr "memory" "none")
+   (set_attr "imm_disp" "false")
+   (set_attr "mode" "<MODE>")
+   (set_attr "length_immediate" "0")])
+
+(define_insn "*x86_mov<mode>cc_0_m1_se"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(sign_extract:SWI48 (match_operator 1 "ix86_carry_flag_operator"
+			     [(reg FLAGS_REG) (const_int 0)])
+			    (const_int 1)
+			    (const_int 0)))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "sbb{<imodesuffix>}\t%0, %0"
+  [(set_attr "type" "alu")
+   (set_attr "use_carry" "1")
+   (set_attr "pent_pair" "pu")
+   (set_attr "memory" "none")
+   (set_attr "imm_disp" "false")
+   (set_attr "mode" "<MODE>")
+   (set_attr "length_immediate" "0")])
+
+(define_insn "*x86_mov<mode>cc_0_m1_neg"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(neg:SWI48 (match_operator 1 "ix86_carry_flag_operator"
+		    [(reg FLAGS_REG) (const_int 0)])))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "sbb{<imodesuffix>}\t%0, %0"
+  [(set_attr "type" "alu")
+   (set_attr "use_carry" "1")
+   (set_attr "pent_pair" "pu")
+   (set_attr "memory" "none")
+   (set_attr "imm_disp" "false")
+   (set_attr "mode" "<MODE>")
+   (set_attr "length_immediate" "0")])
+
+(define_insn "*mov<mode>cc_noc"
+  [(set (match_operand:SWI248 0 "register_operand" "=r,r")
+	(if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator"
+			       [(reg FLAGS_REG) (const_int 0)])
+	  (match_operand:SWI248 2 "nonimmediate_operand" "rm,0")
+	  (match_operand:SWI248 3 "nonimmediate_operand" "0,rm")))]
+  "TARGET_CMOVE && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+  "@
+   cmov%O2%C1\t{%2, %0|%0, %2}
+   cmov%O2%c1\t{%3, %0|%0, %3}"
+  [(set_attr "type" "icmov")
+   (set_attr "mode" "<MODE>")])
+
+;; Don't do conditional moves with memory inputs.  This splitter helps
+;; register starved x86_32 by forcing inputs into registers before reload.
+(define_split
+  [(set (match_operand:SWI248 0 "register_operand")
+	(if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator"
+			       [(reg FLAGS_REG) (const_int 0)])
+	  (match_operand:SWI248 2 "nonimmediate_operand")
+	  (match_operand:SWI248 3 "nonimmediate_operand")))]
+  "!TARGET_64BIT && TARGET_CMOVE
+   && TARGET_AVOID_MEM_OPND_FOR_CMOVE
+   && (MEM_P (operands[2]) || MEM_P (operands[3]))
+   && can_create_pseudo_p ()
+   && optimize_insn_for_speed_p ()"
+  [(set (match_dup 0)
+	(if_then_else:SWI248 (match_dup 1) (match_dup 2) (match_dup 3)))]
+{
+  if (MEM_P (operands[2]))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+  if (MEM_P (operands[3]))
+    operands[3] = force_reg (<MODE>mode, operands[3]);
+})
+
+(define_insn "*movqicc_noc"
+  [(set (match_operand:QI 0 "register_operand" "=r,r")
+	(if_then_else:QI (match_operator 1 "ix86_comparison_operator"
+			   [(reg FLAGS_REG) (const_int 0)])
+		      (match_operand:QI 2 "register_operand" "r,0")
+		      (match_operand:QI 3 "register_operand" "0,r")))]
+  "TARGET_CMOVE && !TARGET_PARTIAL_REG_STALL"
+  "#"
+  [(set_attr "type" "icmov")
+   (set_attr "mode" "QI")])
+
+(define_split
+  [(set (match_operand:SWI12 0 "register_operand")
+	(if_then_else:SWI12 (match_operator 1 "ix86_comparison_operator"
+			      [(reg FLAGS_REG) (const_int 0)])
+		      (match_operand:SWI12 2 "register_operand")
+		      (match_operand:SWI12 3 "register_operand")))]
+  "TARGET_CMOVE && !TARGET_PARTIAL_REG_STALL
+   && reload_completed"
+  [(set (match_dup 0)
+	(if_then_else:SI (match_dup 1) (match_dup 2) (match_dup 3)))]
+{
+  operands[0] = gen_lowpart (SImode, operands[0]);
+  operands[2] = gen_lowpart (SImode, operands[2]);
+  operands[3] = gen_lowpart (SImode, operands[3]);
+})
+
+;; Don't do conditional moves with memory inputs
+(define_peephole2
+  [(match_scratch:SWI248 2 "r")
+   (set (match_operand:SWI248 0 "register_operand")
+	(if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator"
+			       [(reg FLAGS_REG) (const_int 0)])
+	  (match_dup 0)
+	  (match_operand:SWI248 3 "memory_operand")))]
+  "TARGET_CMOVE && TARGET_AVOID_MEM_OPND_FOR_CMOVE
+   && optimize_insn_for_speed_p ()"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0)
+	(if_then_else:SWI248 (match_dup 1) (match_dup 0) (match_dup 2)))])
+
+(define_peephole2
+  [(match_scratch:SWI248 2 "r")
+   (set (match_operand:SWI248 0 "register_operand")
+	(if_then_else:SWI248 (match_operator 1 "ix86_comparison_operator"
+			       [(reg FLAGS_REG) (const_int 0)])
+	  (match_operand:SWI248 3 "memory_operand")
+	  (match_dup 0)))]
+  "TARGET_CMOVE && TARGET_AVOID_MEM_OPND_FOR_CMOVE
+   && optimize_insn_for_speed_p ()"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0)
+	(if_then_else:SWI248 (match_dup 1) (match_dup 2) (match_dup 0)))])
+
+(define_expand "mov<mode>cc"
+  [(set (match_operand:X87MODEF 0 "register_operand")
+	(if_then_else:X87MODEF
+	  (match_operand 1 "comparison_operator")
+	  (match_operand:X87MODEF 2 "register_operand")
+	  (match_operand:X87MODEF 3 "register_operand")))]
+  "(TARGET_80387 && TARGET_CMOVE)
+   || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
+  "if (ix86_expand_fp_movcc (operands)) DONE; else FAIL;")
+
+(define_insn "*movxfcc_1"
+  [(set (match_operand:XF 0 "register_operand" "=f,f")
+	(if_then_else:XF (match_operator 1 "fcmov_comparison_operator"
+				[(reg FLAGS_REG) (const_int 0)])
+		      (match_operand:XF 2 "register_operand" "f,0")
+		      (match_operand:XF 3 "register_operand" "0,f")))]
+  "TARGET_80387 && TARGET_CMOVE"
+  "@
+   fcmov%F1\t{%2, %0|%0, %2}
+   fcmov%f1\t{%3, %0|%0, %3}"
+  [(set_attr "type" "fcmov")
+   (set_attr "mode" "XF")])
+
+(define_insn "*movdfcc_1"
+  [(set (match_operand:DF 0 "register_operand" "=f,f,&r,&r,r ,r")
+	(if_then_else:DF (match_operator 1 "fcmov_comparison_operator"
+				[(reg FLAGS_REG) (const_int 0)])
+		      (match_operand:DF 2 "nonimmediate_operand"
+					       "f ,0,rm,0 ,rm,0")
+		      (match_operand:DF 3 "nonimmediate_operand"
+					       "0 ,f,0 ,rm,0, rm")))]
+  "TARGET_80387 && TARGET_CMOVE
+   && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+  "@
+   fcmov%F1\t{%2, %0|%0, %2}
+   fcmov%f1\t{%3, %0|%0, %3}
+   #
+   #
+   cmov%O2%C1\t{%2, %0|%0, %2}
+   cmov%O2%c1\t{%3, %0|%0, %3}"
+  [(set_attr "isa" "*,*,nox64,nox64,x64,x64")
+   (set_attr "type" "fcmov,fcmov,multi,multi,icmov,icmov")
+   (set_attr "mode" "DF,DF,DI,DI,DI,DI")])
+
+(define_split
+  [(set (match_operand:DF 0 "register_and_not_any_fp_reg_operand")
+	(if_then_else:DF (match_operator 1 "fcmov_comparison_operator"
+				[(reg FLAGS_REG) (const_int 0)])
+		      (match_operand:DF 2 "nonimmediate_operand")
+		      (match_operand:DF 3 "nonimmediate_operand")))]
+  "!TARGET_64BIT && reload_completed"
+  [(set (match_dup 2)
+	(if_then_else:SI (match_dup 1) (match_dup 4) (match_dup 5)))
+   (set (match_dup 3)
+	(if_then_else:SI (match_dup 1) (match_dup 6) (match_dup 7)))]
+{
+  split_double_mode (DImode, &operands[2], 2, &operands[4], &operands[6]);
+  split_double_mode (DImode, &operands[0], 1, &operands[2], &operands[3]);
+})
+
+(define_insn "*movsfcc_1_387"
+  [(set (match_operand:SF 0 "register_operand" "=f,f,r,r")
+	(if_then_else:SF (match_operator 1 "fcmov_comparison_operator"
+				[(reg FLAGS_REG) (const_int 0)])
+		      (match_operand:SF 2 "nonimmediate_operand" "f,0,rm,0")
+		      (match_operand:SF 3 "nonimmediate_operand" "0,f,0,rm")))]
+  "TARGET_80387 && TARGET_CMOVE
+   && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+  "@
+   fcmov%F1\t{%2, %0|%0, %2}
+   fcmov%f1\t{%3, %0|%0, %3}
+   cmov%O2%C1\t{%2, %0|%0, %2}
+   cmov%O2%c1\t{%3, %0|%0, %3}"
+  [(set_attr "type" "fcmov,fcmov,icmov,icmov")
+   (set_attr "mode" "SF,SF,SI,SI")])
+
+;; Don't do conditional moves with memory inputs.  This splitter helps
+;; register starved x86_32 by forcing inputs into registers before reload.
+(define_split
+  [(set (match_operand:MODEF 0 "register_operand")
+	(if_then_else:MODEF (match_operator 1 "ix86_comparison_operator"
+			      [(reg FLAGS_REG) (const_int 0)])
+	  (match_operand:MODEF 2 "nonimmediate_operand")
+	  (match_operand:MODEF 3 "nonimmediate_operand")))]
+  "!TARGET_64BIT && TARGET_80387 && TARGET_CMOVE
+   && TARGET_AVOID_MEM_OPND_FOR_CMOVE
+   && (MEM_P (operands[2]) || MEM_P (operands[3]))
+   && can_create_pseudo_p ()
+   && optimize_insn_for_speed_p ()"
+  [(set (match_dup 0)
+	(if_then_else:MODEF (match_dup 1) (match_dup 2) (match_dup 3)))]
+{
+  if (MEM_P (operands[2]))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+  if (MEM_P (operands[3]))
+    operands[3] = force_reg (<MODE>mode, operands[3]);
+})
+
+;; Don't do conditional moves with memory inputs
+(define_peephole2
+  [(match_scratch:MODEF 2 "r")
+   (set (match_operand:MODEF 0 "register_and_not_any_fp_reg_operand")
+	(if_then_else:MODEF (match_operator 1 "fcmov_comparison_operator"
+			      [(reg FLAGS_REG) (const_int 0)])
+	  (match_dup 0)
+	  (match_operand:MODEF 3 "memory_operand")))]
+  "(<MODE>mode != DFmode || TARGET_64BIT)
+   && TARGET_80387 && TARGET_CMOVE
+   && TARGET_AVOID_MEM_OPND_FOR_CMOVE
+   && optimize_insn_for_speed_p ()"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0)
+	(if_then_else:MODEF (match_dup 1) (match_dup 0) (match_dup 2)))])
+
+(define_peephole2
+  [(match_scratch:MODEF 2 "r")
+   (set (match_operand:MODEF 0 "register_and_not_any_fp_reg_operand")
+	(if_then_else:MODEF (match_operator 1 "fcmov_comparison_operator"
+			      [(reg FLAGS_REG) (const_int 0)])
+	  (match_operand:MODEF 3 "memory_operand")
+	  (match_dup 0)))]
+  "(<MODE>mode != DFmode || TARGET_64BIT)
+   && TARGET_80387 && TARGET_CMOVE
+   && TARGET_AVOID_MEM_OPND_FOR_CMOVE
+   && optimize_insn_for_speed_p ()"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0)
+	(if_then_else:MODEF (match_dup 1) (match_dup 2) (match_dup 0)))])
+
+;; All moves in XOP pcmov instructions are 128 bits and hence we restrict
+;; the scalar versions to have only XMM registers as operands.
+
+;; XOP conditional move
+(define_insn "*xop_pcmov_<mode>"
+  [(set (match_operand:MODEF 0 "register_operand" "=x")
+	(if_then_else:MODEF
+	  (match_operand:MODEF 1 "register_operand" "x")
+	  (match_operand:MODEF 2 "register_operand" "x")
+	  (match_operand:MODEF 3 "register_operand" "x")))]
+  "TARGET_XOP"
+  "vpcmov\t{%1, %3, %2, %0|%0, %2, %3, %1}"
+  [(set_attr "type" "sse4arg")])
+
+;; These versions of the min/max patterns are intentionally ignorant of
+;; their behavior wrt -0.0 and NaN (via the commutative operand mark).
+;; Since both the tree-level MAX_EXPR and the rtl-level SMAX operator
+;; are undefined in this condition, we're certain this is correct.
+
+(define_insn "<code><mode>3"
+  [(set (match_operand:MODEF 0 "register_operand" "=x,v")
+	(smaxmin:MODEF
+	  (match_operand:MODEF 1 "nonimmediate_operand" "%0,v")
+	  (match_operand:MODEF 2 "nonimmediate_operand" "xm,vm")))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH"
+  "@
+   <maxmin_float><ssemodesuffix>\t{%2, %0|%0, %2}
+   v<maxmin_float><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "type" "sseadd")
+   (set_attr "mode" "<MODE>")])
+
+;; These versions of the min/max patterns implement exactly the operations
+;;   min = (op1 < op2 ? op1 : op2)
+;;   max = (!(op1 < op2) ? op1 : op2)
+;; Their operands are not commutative, and thus they may be used in the
+;; presence of -0.0 and NaN.
+
+(define_int_iterator IEEE_MAXMIN
+	[UNSPEC_IEEE_MAX
+	 UNSPEC_IEEE_MIN])
+
+(define_int_attr ieee_maxmin
+	[(UNSPEC_IEEE_MAX "max")
+	 (UNSPEC_IEEE_MIN "min")])
+
+(define_insn "*ieee_s<ieee_maxmin><mode>3"
+  [(set (match_operand:MODEF 0 "register_operand" "=x,x")
+	(unspec:MODEF
+	  [(match_operand:MODEF 1 "register_operand" "0,x")
+	   (match_operand:MODEF 2 "nonimmediate_operand" "xm,xm")]
+	  IEEE_MAXMIN))]
+  "SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH"
+  "@
+   <ieee_maxmin><ssemodesuffix>\t{%2, %0|%0, %2}
+   v<ieee_maxmin><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "type" "sseadd")
+   (set_attr "mode" "<MODE>")])
+
+;; Make two stack loads independent:
+;;   fld aa              fld aa
+;;   fld %st(0)     ->   fld bb
+;;   fmul bb             fmul %st(1), %st
+;;
+;; Actually we only match the last two instructions for simplicity.
+(define_peephole2
+  [(set (match_operand 0 "fp_register_operand")
+	(match_operand 1 "fp_register_operand"))
+   (set (match_dup 0)
+	(match_operator 2 "binary_fp_operator"
+	   [(match_dup 0)
+	    (match_operand 3 "memory_operand")]))]
+  "REGNO (operands[0]) != REGNO (operands[1])"
+  [(set (match_dup 0) (match_dup 3))
+   (set (match_dup 0) (match_dup 4))]
+
+  ;; The % modifier is not operational anymore in peephole2's, so we have to
+  ;; swap the operands manually in the case of addition and multiplication.
+{
+  rtx op0, op1;
+
+  if (COMMUTATIVE_ARITH_P (operands[2]))
+    op0 = operands[0], op1 = operands[1];
+  else
+    op0 = operands[1], op1 = operands[0];
+
+  operands[4] = gen_rtx_fmt_ee (GET_CODE (operands[2]),
+				GET_MODE (operands[2]),
+				op0, op1);
+})
+
+;; Conditional addition patterns
+(define_expand "add<mode>cc"
+  [(match_operand:SWI 0 "register_operand")
+   (match_operand 1 "ordered_comparison_operator")
+   (match_operand:SWI 2 "register_operand")
+   (match_operand:SWI 3 "const_int_operand")]
+  ""
+  "if (ix86_expand_int_addcc (operands)) DONE; else FAIL;")
+
+;; Misc patterns (?)
+
+;; This pattern exists to put a dependency on all ebp-based memory accesses.
+;; Otherwise there will be nothing to keep
+;;
+;; [(set (reg ebp) (reg esp))]
+;; [(set (reg esp) (plus (reg esp) (const_int -160000)))
+;;  (clobber (eflags)]
+;; [(set (mem (plus (reg ebp) (const_int -160000))) (const_int 0))]
+;;
+;; in proper program order.
+
+(define_insn "pro_epilogue_adjust_stack_<mode>_add"
+  [(set (match_operand:P 0 "register_operand" "=r,r")
+	(plus:P (match_operand:P 1 "register_operand" "0,r")
+	        (match_operand:P 2 "<nonmemory_operand>" "r<i>,l<i>")))
+   (clobber (reg:CC FLAGS_REG))
+   (clobber (mem:BLK (scratch)))]
+  ""
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_IMOV:
+      return "mov{<imodesuffix>}\t{%1, %0|%0, %1}";
+
+    case TYPE_ALU:
+      gcc_assert (rtx_equal_p (operands[0], operands[1]));
+      if (x86_maybe_negate_const_int (&operands[2], <MODE>mode))
+	return "sub{<imodesuffix>}\t{%2, %0|%0, %2}";
+
+      return "add{<imodesuffix>}\t{%2, %0|%0, %2}";
+
+    default:
+      operands[2] = SET_SRC (XVECEXP (PATTERN (insn), 0, 0));
+      return "lea{<imodesuffix>}\t{%E2, %0|%0, %E2}";
+    }
+}
+  [(set (attr "type")
+	(cond [(and (eq_attr "alternative" "0")
+		    (not (match_test "TARGET_OPT_AGU")))
+		 (const_string "alu")
+	       (match_operand:<MODE> 2 "const0_operand")
+		 (const_string "imov")
+	      ]
+	      (const_string "lea")))
+   (set (attr "length_immediate")
+	(cond [(eq_attr "type" "imov")
+		 (const_string "0")
+	       (and (eq_attr "type" "alu")
+		    (match_operand 2 "const128_operand"))
+		 (const_string "1")
+	      ]
+	      (const_string "*")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "pro_epilogue_adjust_stack_<mode>_sub"
+  [(set (match_operand:P 0 "register_operand" "=r")
+	(minus:P (match_operand:P 1 "register_operand" "0")
+		 (match_operand:P 2 "register_operand" "r")))
+   (clobber (reg:CC FLAGS_REG))
+   (clobber (mem:BLK (scratch)))]
+  ""
+  "sub{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "alu")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "allocate_stack_worker_probe_<mode>"
+  [(set (match_operand:P 0 "register_operand" "=a")
+	(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")]
+			    UNSPECV_STACK_PROBE))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_target_stack_probe ()"
+  "call\t___chkstk_ms"
+  [(set_attr "type" "multi")
+   (set_attr "length" "5")])
+
+(define_expand "allocate_stack"
+  [(match_operand 0 "register_operand")
+   (match_operand 1 "general_operand")]
+  "ix86_target_stack_probe ()"
+{
+  rtx x;
+
+#ifndef CHECK_STACK_LIMIT
+#define CHECK_STACK_LIMIT 0
+#endif
+
+  if (CHECK_STACK_LIMIT && CONST_INT_P (operands[1])
+      && INTVAL (operands[1]) < CHECK_STACK_LIMIT)
+    x = operands[1];
+  else
+    {
+      rtx (*insn) (rtx, rtx);
+
+      x = copy_to_mode_reg (Pmode, operands[1]);
+
+      insn = (TARGET_64BIT
+	      ? gen_allocate_stack_worker_probe_di
+	      : gen_allocate_stack_worker_probe_si);
+
+      emit_insn (insn (x, x));
+    }
+
+  x = expand_simple_binop (Pmode, MINUS, stack_pointer_rtx, x,
+			   stack_pointer_rtx, 0, OPTAB_DIRECT);
+
+  if (x != stack_pointer_rtx)
+    emit_move_insn (stack_pointer_rtx, x);
+
+  emit_move_insn (operands[0], virtual_stack_dynamic_rtx);
+  DONE;
+})
+
+;; Use IOR for stack probes, this is shorter.
+(define_expand "probe_stack"
+  [(match_operand 0 "memory_operand")]
+  ""
+{
+  rtx (*gen_ior3) (rtx, rtx, rtx);
+
+  gen_ior3 = (GET_MODE (operands[0]) == DImode
+	      ? gen_iordi3 : gen_iorsi3);
+
+  emit_insn (gen_ior3 (operands[0], operands[0], const0_rtx));
+  DONE;
+})
+
+(define_insn "adjust_stack_and_probe<mode>"
+  [(set (match_operand:P 0 "register_operand" "=r")
+	(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")]
+			    UNSPECV_PROBE_STACK_RANGE))
+   (set (reg:P SP_REG)
+        (minus:P (reg:P SP_REG) (match_operand:P 2 "const_int_operand" "n")))
+   (clobber (reg:CC FLAGS_REG))
+   (clobber (mem:BLK (scratch)))]
+  ""
+  "* return output_adjust_stack_and_probe (operands[0]);"
+  [(set_attr "type" "multi")])
+
+(define_insn "probe_stack_range<mode>"
+  [(set (match_operand:P 0 "register_operand" "=r")
+	(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
+			    (match_operand:P 2 "const_int_operand" "n")]
+			    UNSPECV_PROBE_STACK_RANGE))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "* return output_probe_stack_range (operands[0], operands[2]);"
+  [(set_attr "type" "multi")])
+
+(define_expand "builtin_setjmp_receiver"
+  [(label_ref (match_operand 0))]
+  "!TARGET_64BIT && flag_pic"
+{
+#if TARGET_MACHO
+  if (TARGET_MACHO)
+    {
+      rtx xops[3];
+      rtx picreg = gen_rtx_REG (Pmode, PIC_OFFSET_TABLE_REGNUM);
+      rtx label_rtx = gen_label_rtx ();
+      emit_insn (gen_set_got_labelled (pic_offset_table_rtx, label_rtx));
+      xops[0] = xops[1] = picreg;
+      xops[2] = machopic_gen_offset (gen_rtx_LABEL_REF (SImode, label_rtx));
+      ix86_expand_binary_operator (MINUS, SImode, xops);
+    }
+  else
+#endif
+    emit_insn (gen_set_got (pic_offset_table_rtx));
+  DONE;
+})
+
+(define_insn_and_split "nonlocal_goto_receiver"
+  [(unspec_volatile [(const_int 0)] UNSPECV_NLGR)]
+  "TARGET_MACHO && !TARGET_64BIT && flag_pic"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  if (crtl->uses_pic_offset_table)
+    {
+      rtx xops[3];
+      rtx label_rtx = gen_label_rtx ();
+      rtx tmp;
+
+      /* Get a new pic base.  */
+      emit_insn (gen_set_got_labelled (pic_offset_table_rtx, label_rtx));
+      /* Correct this with the offset from the new to the old.  */
+      xops[0] = xops[1] = pic_offset_table_rtx;
+      label_rtx = gen_rtx_LABEL_REF (SImode, label_rtx);
+      tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, label_rtx),
+			    UNSPEC_MACHOPIC_OFFSET);
+      xops[2] = gen_rtx_CONST (Pmode, tmp);
+      ix86_expand_binary_operator (MINUS, SImode, xops);
+    }
+  else
+    /* No pic reg restore needed.  */
+    emit_note (NOTE_INSN_DELETED);
+
+  DONE;
+})
+
+;; Avoid redundant prefixes by splitting HImode arithmetic to SImode.
+;; Do not split instructions with mask registers.
+(define_split
+  [(set (match_operand 0 "general_reg_operand")
+	(match_operator 3 "promotable_binary_operator"
+	   [(match_operand 1 "general_reg_operand")
+	    (match_operand 2 "aligned_operand")]))
+   (clobber (reg:CC FLAGS_REG))]
+  "! TARGET_PARTIAL_REG_STALL && reload_completed
+   && ((GET_MODE (operands[0]) == HImode
+	&& ((optimize_function_for_speed_p (cfun) && !TARGET_FAST_PREFIX)
+            /* ??? next two lines just !satisfies_constraint_K (...) */
+	    || !CONST_INT_P (operands[2])
+	    || satisfies_constraint_K (operands[2])))
+       || (GET_MODE (operands[0]) == QImode
+	   && (TARGET_PROMOTE_QImode || optimize_function_for_size_p (cfun))))"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 3 [(match_dup 1) (match_dup 2)]))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[0] = gen_lowpart (SImode, operands[0]);
+  operands[1] = gen_lowpart (SImode, operands[1]);
+  if (GET_CODE (operands[3]) != ASHIFT)
+    operands[2] = gen_lowpart (SImode, operands[2]);
+  PUT_MODE (operands[3], SImode);
+})
+
+; Promote the QImode tests, as i386 has encoding of the AND
+; instruction with 32-bit sign-extended immediate and thus the
+; instruction size is unchanged, except in the %eax case for
+; which it is increased by one byte, hence the ! optimize_size.
+(define_split
+  [(set (match_operand 0 "flags_reg_operand")
+	(match_operator 2 "compare_operator"
+	  [(and (match_operand 3 "aligned_operand")
+		(match_operand 4 "const_int_operand"))
+	   (const_int 0)]))
+   (set (match_operand 1 "register_operand")
+	(and (match_dup 3) (match_dup 4)))]
+  "! TARGET_PARTIAL_REG_STALL && reload_completed
+   && optimize_insn_for_speed_p ()
+   && ((GET_MODE (operands[1]) == HImode && ! TARGET_FAST_PREFIX)
+       || (GET_MODE (operands[1]) == QImode && TARGET_PROMOTE_QImode))
+   /* Ensure that the operand will remain sign-extended immediate.  */
+   && ix86_match_ccmode (insn, INTVAL (operands[4]) >= 0 ? CCNOmode : CCZmode)"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 2 [(and:SI (match_dup 3) (match_dup 4))
+			            (const_int 0)]))
+	      (set (match_dup 1)
+		   (and:SI (match_dup 3) (match_dup 4)))])]
+{
+  operands[4]
+    = gen_int_mode (INTVAL (operands[4])
+		    & GET_MODE_MASK (GET_MODE (operands[1])), SImode);
+  operands[1] = gen_lowpart (SImode, operands[1]);
+  operands[3] = gen_lowpart (SImode, operands[3]);
+})
+
+; Don't promote the QImode tests, as i386 doesn't have encoding of
+; the TEST instruction with 32-bit sign-extended immediate and thus
+; the instruction size would at least double, which is not what we
+; want even with ! optimize_size.
+(define_split
+  [(set (match_operand 0 "flags_reg_operand")
+	(match_operator 1 "compare_operator"
+	  [(and (match_operand:HI 2 "aligned_operand")
+		(match_operand:HI 3 "const_int_operand"))
+	   (const_int 0)]))]
+  "! TARGET_PARTIAL_REG_STALL && reload_completed
+   && ! TARGET_FAST_PREFIX
+   && optimize_insn_for_speed_p ()
+   /* Ensure that the operand will remain sign-extended immediate.  */
+   && ix86_match_ccmode (insn, INTVAL (operands[3]) >= 0 ? CCNOmode : CCZmode)"
+  [(set (match_dup 0)
+	(match_op_dup 1 [(and:SI (match_dup 2) (match_dup 3))
+		         (const_int 0)]))]
+{
+  operands[3]
+    = gen_int_mode (INTVAL (operands[3])
+		    & GET_MODE_MASK (GET_MODE (operands[2])), SImode);
+  operands[2] = gen_lowpart (SImode, operands[2]);
+})
+
+(define_split
+  [(set (match_operand 0 "register_operand")
+	(neg (match_operand 1 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "! TARGET_PARTIAL_REG_STALL && reload_completed
+   && (GET_MODE (operands[0]) == HImode
+       || (GET_MODE (operands[0]) == QImode
+	   && (TARGET_PROMOTE_QImode
+	       || optimize_insn_for_size_p ())))"
+  [(parallel [(set (match_dup 0)
+		   (neg:SI (match_dup 1)))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[0] = gen_lowpart (SImode, operands[0]);
+  operands[1] = gen_lowpart (SImode, operands[1]);
+})
+
+;; Do not split instructions with mask regs.
+(define_split
+  [(set (match_operand 0 "general_reg_operand")
+	(not (match_operand 1 "general_reg_operand")))]
+  "! TARGET_PARTIAL_REG_STALL && reload_completed
+   && (GET_MODE (operands[0]) == HImode
+       || (GET_MODE (operands[0]) == QImode
+	   && (TARGET_PROMOTE_QImode
+	       || optimize_insn_for_size_p ())))"
+  [(set (match_dup 0)
+	(not:SI (match_dup 1)))]
+{
+  operands[0] = gen_lowpart (SImode, operands[0]);
+  operands[1] = gen_lowpart (SImode, operands[1]);
+})
+
+;; RTL Peephole optimizations, run before sched2.  These primarily look to
+;; transform a complex memory operation into two memory to register operations.
+
+;; Don't push memory operands
+(define_peephole2
+  [(set (match_operand:SWI 0 "push_operand")
+	(match_operand:SWI 1 "memory_operand"))
+   (match_scratch:SWI 2 "<r>")]
+  "!(TARGET_PUSH_MEMORY || optimize_insn_for_size_p ())
+   && !RTX_FRAME_RELATED_P (peep2_next_insn (0))"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (match_dup 2))])
+
+;; We need to handle SFmode only, because DFmode and XFmode are split to
+;; SImode pushes.
+(define_peephole2
+  [(set (match_operand:SF 0 "push_operand")
+	(match_operand:SF 1 "memory_operand"))
+   (match_scratch:SF 2 "r")]
+  "!(TARGET_PUSH_MEMORY || optimize_insn_for_size_p ())
+   && !RTX_FRAME_RELATED_P (peep2_next_insn (0))"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (match_dup 2))])
+
+;; Don't move an immediate directly to memory when the instruction
+;; gets too big, or if LCP stalls are a problem for 16-bit moves.
+(define_peephole2
+  [(match_scratch:SWI124 1 "<r>")
+   (set (match_operand:SWI124 0 "memory_operand")
+        (const_int 0))]
+  "optimize_insn_for_speed_p ()
+   && ((<MODE>mode == HImode
+       && TARGET_LCP_STALL)
+       || (!TARGET_USE_MOV0
+          && TARGET_SPLIT_LONG_MOVES
+          && get_attr_length (insn) >= ix86_cur_cost ()->large_insn))
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (match_dup 2) (const_int 0))
+	      (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 0) (match_dup 1))]
+  "operands[2] = gen_lowpart (SImode, operands[1]);")
+
+(define_peephole2
+  [(match_scratch:SWI124 2 "<r>")
+   (set (match_operand:SWI124 0 "memory_operand")
+        (match_operand:SWI124 1 "immediate_operand"))]
+  "optimize_insn_for_speed_p ()
+   && ((<MODE>mode == HImode
+       && TARGET_LCP_STALL)
+       || (TARGET_SPLIT_LONG_MOVES
+          && get_attr_length (insn) >= ix86_cur_cost ()->large_insn))"
+  [(set (match_dup 2) (match_dup 1))
+   (set (match_dup 0) (match_dup 2))])
+
+;; Don't compare memory with zero, load and use a test instead.
+(define_peephole2
+  [(set (match_operand 0 "flags_reg_operand")
+ 	(match_operator 1 "compare_operator"
+	  [(match_operand:SI 2 "memory_operand")
+	   (const_int 0)]))
+   (match_scratch:SI 3 "r")]
+  "optimize_insn_for_speed_p () && ix86_match_ccmode (insn, CCNOmode)"
+  [(set (match_dup 3) (match_dup 2))
+   (set (match_dup 0) (match_op_dup 1 [(match_dup 3) (const_int 0)]))])
+
+;; NOT is not pairable on Pentium, while XOR is, but one byte longer.
+;; Don't split NOTs with a displacement operand, because resulting XOR
+;; will not be pairable anyway.
+;;
+;; On AMD K6, NOT is vector decoded with memory operand that cannot be
+;; represented using a modRM byte.  The XOR replacement is long decoded,
+;; so this split helps here as well.
+;;
+;; Note: Can't do this as a regular split because we can't get proper
+;; lifetime information then.
+
+(define_peephole2
+  [(set (match_operand:SWI124 0 "nonimmediate_operand")
+	(not:SWI124 (match_operand:SWI124 1 "nonimmediate_operand")))]
+  "optimize_insn_for_speed_p ()
+   && ((TARGET_NOT_UNPAIRABLE
+	&& (!MEM_P (operands[0])
+	    || !memory_displacement_operand (operands[0], <MODE>mode)))
+       || (TARGET_NOT_VECTORMODE
+	   && long_memory_operand (operands[0], <MODE>mode)))
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (match_dup 0)
+		   (xor:SWI124 (match_dup 1) (const_int -1)))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+;; Non pairable "test imm, reg" instructions can be translated to
+;; "and imm, reg" if reg dies.  The "and" form is also shorter (one
+;; byte opcode instead of two, have a short form for byte operands),
+;; so do it for other CPUs as well.  Given that the value was dead,
+;; this should not create any new dependencies.  Pass on the sub-word
+;; versions if we're concerned about partial register stalls.
+
+(define_peephole2
+  [(set (match_operand 0 "flags_reg_operand")
+	(match_operator 1 "compare_operator"
+	  [(and:SI (match_operand:SI 2 "register_operand")
+		   (match_operand:SI 3 "immediate_operand"))
+	   (const_int 0)]))]
+  "ix86_match_ccmode (insn, CCNOmode)
+   && (true_regnum (operands[2]) != AX_REG
+       || satisfies_constraint_K (operands[3]))
+   && peep2_reg_dead_p (1, operands[2])"
+  [(parallel
+     [(set (match_dup 0)
+	   (match_op_dup 1 [(and:SI (match_dup 2) (match_dup 3))
+		            (const_int 0)]))
+      (set (match_dup 2)
+	   (and:SI (match_dup 2) (match_dup 3)))])])
+
+;; We don't need to handle HImode case, because it will be promoted to SImode
+;; on ! TARGET_PARTIAL_REG_STALL
+
+(define_peephole2
+  [(set (match_operand 0 "flags_reg_operand")
+	(match_operator 1 "compare_operator"
+	  [(and:QI (match_operand:QI 2 "register_operand")
+		   (match_operand:QI 3 "immediate_operand"))
+	   (const_int 0)]))]
+  "! TARGET_PARTIAL_REG_STALL
+   && ix86_match_ccmode (insn, CCNOmode)
+   && true_regnum (operands[2]) != AX_REG
+   && peep2_reg_dead_p (1, operands[2])"
+  [(parallel
+     [(set (match_dup 0)
+	   (match_op_dup 1 [(and:QI (match_dup 2) (match_dup 3))
+		            (const_int 0)]))
+      (set (match_dup 2)
+	   (and:QI (match_dup 2) (match_dup 3)))])])
+
+(define_peephole2
+  [(set (match_operand 0 "flags_reg_operand")
+	(match_operator 1 "compare_operator"
+	  [(and:SI
+	     (zero_extract:SI
+	       (match_operand 2 "ext_register_operand")
+	       (const_int 8)
+	       (const_int 8))
+	     (match_operand 3 "const_int_operand"))
+	   (const_int 0)]))]
+  "! TARGET_PARTIAL_REG_STALL
+   && ix86_match_ccmode (insn, CCNOmode)
+   && true_regnum (operands[2]) != AX_REG
+   && peep2_reg_dead_p (1, operands[2])"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 1
+		     [(and:SI
+			(zero_extract:SI
+			  (match_dup 2)
+			  (const_int 8)
+			  (const_int 8))
+			(match_dup 3))
+		      (const_int 0)]))
+	      (set (zero_extract:SI (match_dup 2)
+				    (const_int 8)
+				    (const_int 8))
+		   (and:SI
+		     (zero_extract:SI
+		       (match_dup 2)
+		       (const_int 8)
+		       (const_int 8))
+		     (match_dup 3)))])])
+
+;; Don't do logical operations with memory inputs.
+(define_peephole2
+  [(match_scratch:SI 2 "r")
+   (parallel [(set (match_operand:SI 0 "register_operand")
+                   (match_operator:SI 3 "arith_or_logical_operator"
+                     [(match_dup 0)
+                      (match_operand:SI 1 "memory_operand")]))
+              (clobber (reg:CC FLAGS_REG))])]
+  "!(TARGET_READ_MODIFY || optimize_insn_for_size_p ())"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 0)
+                   (match_op_dup 3 [(match_dup 0) (match_dup 2)]))
+              (clobber (reg:CC FLAGS_REG))])])
+
+(define_peephole2
+  [(match_scratch:SI 2 "r")
+   (parallel [(set (match_operand:SI 0 "register_operand")
+                   (match_operator:SI 3 "arith_or_logical_operator"
+                     [(match_operand:SI 1 "memory_operand")
+                      (match_dup 0)]))
+              (clobber (reg:CC FLAGS_REG))])]
+  "!(TARGET_READ_MODIFY || optimize_insn_for_size_p ())"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 0)
+                   (match_op_dup 3 [(match_dup 2) (match_dup 0)]))
+              (clobber (reg:CC FLAGS_REG))])])
+
+;; Prefer Load+RegOp to Mov+MemOp.  Watch out for cases when the memory address
+;; refers to the destination of the load!
+
+(define_peephole2
+  [(set (match_operand:SI 0 "register_operand")
+        (match_operand:SI 1 "register_operand"))
+   (parallel [(set (match_dup 0)
+                   (match_operator:SI 3 "commutative_operator"
+                     [(match_dup 0)
+                      (match_operand:SI 2 "memory_operand")]))
+              (clobber (reg:CC FLAGS_REG))])]
+  "REGNO (operands[0]) != REGNO (operands[1])
+   && GENERAL_REGNO_P (REGNO (operands[0]))
+   && GENERAL_REGNO_P (REGNO (operands[1]))"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel [(set (match_dup 0)
+                   (match_op_dup 3 [(match_dup 0) (match_dup 1)]))
+              (clobber (reg:CC FLAGS_REG))])]
+  "operands[4] = replace_rtx (operands[2], operands[0], operands[1]);")
+
+(define_peephole2
+  [(set (match_operand 0 "register_operand")
+        (match_operand 1 "register_operand"))
+   (set (match_dup 0)
+                   (match_operator 3 "commutative_operator"
+                     [(match_dup 0)
+                      (match_operand 2 "memory_operand")]))]
+  "REGNO (operands[0]) != REGNO (operands[1])
+   && ((MMX_REG_P (operands[0]) && MMX_REG_P (operands[1])) 
+       || (SSE_REG_P (operands[0]) && SSE_REG_P (operands[1])))"
+  [(set (match_dup 0) (match_dup 2))
+   (set (match_dup 0)
+        (match_op_dup 3 [(match_dup 0) (match_dup 1)]))])
+
+; Don't do logical operations with memory outputs
+;
+; These two don't make sense for PPro/PII -- we're expanding a 4-uop
+; instruction into two 1-uop insns plus a 2-uop insn.  That last has
+; the same decoder scheduling characteristics as the original.
+
+(define_peephole2
+  [(match_scratch:SI 2 "r")
+   (parallel [(set (match_operand:SI 0 "memory_operand")
+                   (match_operator:SI 3 "arith_or_logical_operator"
+                     [(match_dup 0)
+                      (match_operand:SI 1 "nonmemory_operand")]))
+              (clobber (reg:CC FLAGS_REG))])]
+  "!(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   /* Do not split stack checking probes.  */
+   && GET_CODE (operands[3]) != IOR && operands[1] != const0_rtx"
+  [(set (match_dup 2) (match_dup 0))
+   (parallel [(set (match_dup 2)
+                   (match_op_dup 3 [(match_dup 2) (match_dup 1)]))
+              (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 0) (match_dup 2))])
+
+(define_peephole2
+  [(match_scratch:SI 2 "r")
+   (parallel [(set (match_operand:SI 0 "memory_operand")
+                   (match_operator:SI 3 "arith_or_logical_operator"
+                     [(match_operand:SI 1 "nonmemory_operand")
+                      (match_dup 0)]))
+              (clobber (reg:CC FLAGS_REG))])]
+  "!(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   /* Do not split stack checking probes.  */
+   && GET_CODE (operands[3]) != IOR && operands[1] != const0_rtx"
+  [(set (match_dup 2) (match_dup 0))
+   (parallel [(set (match_dup 2)
+                   (match_op_dup 3 [(match_dup 1) (match_dup 2)]))
+              (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 0) (match_dup 2))])
+
+;; Attempt to use arith or logical operations with memory outputs with
+;; setting of flags.
+(define_peephole2
+  [(set (match_operand:SWI 0 "register_operand")
+	(match_operand:SWI 1 "memory_operand"))
+   (parallel [(set (match_dup 0)
+		   (match_operator:SWI 3 "plusminuslogic_operator"
+		     [(match_dup 0)
+		      (match_operand:SWI 2 "<nonmemory_operand>")]))
+	      (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 1) (match_dup 0))
+   (set (reg FLAGS_REG) (compare (match_dup 0) (const_int 0)))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && peep2_reg_dead_p (4, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])
+   && (<MODE>mode != QImode
+       || immediate_operand (operands[2], QImode)
+       || q_regs_operand (operands[2], QImode))
+   && ix86_match_ccmode (peep2_next_insn (3),
+			 (GET_CODE (operands[3]) == PLUS
+			  || GET_CODE (operands[3]) == MINUS)
+			 ? CCGOCmode : CCNOmode)"
+  [(parallel [(set (match_dup 4) (match_dup 5))
+	      (set (match_dup 1) (match_op_dup 3 [(match_dup 1)
+						  (match_dup 2)]))])]
+{
+  operands[4] = SET_DEST (PATTERN (peep2_next_insn (3)));
+  operands[5] = gen_rtx_fmt_ee (GET_CODE (operands[3]), <MODE>mode,
+				copy_rtx (operands[1]),
+				copy_rtx (operands[2]));
+  operands[5] = gen_rtx_COMPARE (GET_MODE (operands[4]),
+				 operands[5], const0_rtx);
+})
+
+(define_peephole2
+  [(parallel [(set (match_operand:SWI 0 "register_operand")
+		   (match_operator:SWI 2 "plusminuslogic_operator"
+		     [(match_dup 0)
+		      (match_operand:SWI 1 "memory_operand")]))
+	      (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 1) (match_dup 0))
+   (set (reg FLAGS_REG) (compare (match_dup 0) (const_int 0)))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && GET_CODE (operands[2]) != MINUS
+   && peep2_reg_dead_p (3, operands[0])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && ix86_match_ccmode (peep2_next_insn (2),
+			 GET_CODE (operands[2]) == PLUS
+			 ? CCGOCmode : CCNOmode)"
+  [(parallel [(set (match_dup 3) (match_dup 4))
+	      (set (match_dup 1) (match_op_dup 2 [(match_dup 1)
+						  (match_dup 0)]))])]
+{
+  operands[3] = SET_DEST (PATTERN (peep2_next_insn (2)));
+  operands[4] = gen_rtx_fmt_ee (GET_CODE (operands[2]), <MODE>mode,
+				copy_rtx (operands[1]),
+				copy_rtx (operands[0]));
+  operands[4] = gen_rtx_COMPARE (GET_MODE (operands[3]),
+				 operands[4], const0_rtx);
+})
+
+(define_peephole2
+  [(set (match_operand:SWI12 0 "register_operand")
+	(match_operand:SWI12 1 "memory_operand"))
+   (parallel [(set (match_operand:SI 4 "register_operand")
+		   (match_operator:SI 3 "plusminuslogic_operator"
+		     [(match_dup 4)
+		      (match_operand:SI 2 "nonmemory_operand")]))
+	      (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 1) (match_dup 0))
+   (set (reg FLAGS_REG) (compare (match_dup 0) (const_int 0)))]
+  "(TARGET_READ_MODIFY_WRITE || optimize_insn_for_size_p ())
+   && REG_P (operands[0]) && REG_P (operands[4])
+   && REGNO (operands[0]) == REGNO (operands[4])
+   && peep2_reg_dead_p (4, operands[0])
+   && (<MODE>mode != QImode
+       || immediate_operand (operands[2], SImode)
+       || q_regs_operand (operands[2], SImode))
+   && !reg_overlap_mentioned_p (operands[0], operands[1])
+   && !reg_overlap_mentioned_p (operands[0], operands[2])
+   && ix86_match_ccmode (peep2_next_insn (3),
+			 (GET_CODE (operands[3]) == PLUS
+			  || GET_CODE (operands[3]) == MINUS)
+			 ? CCGOCmode : CCNOmode)"
+  [(parallel [(set (match_dup 4) (match_dup 5))
+	      (set (match_dup 1) (match_dup 6))])]
+{
+  operands[2] = gen_lowpart (<MODE>mode, operands[2]);
+  operands[4] = SET_DEST (PATTERN (peep2_next_insn (3)));
+  operands[5] = gen_rtx_fmt_ee (GET_CODE (operands[3]), <MODE>mode,
+				copy_rtx (operands[1]), operands[2]);
+  operands[5] = gen_rtx_COMPARE (GET_MODE (operands[4]),
+				 operands[5], const0_rtx);
+  operands[6] = gen_rtx_fmt_ee (GET_CODE (operands[3]), <MODE>mode,
+				copy_rtx (operands[1]),
+				copy_rtx (operands[2]));
+})
+
+;; Attempt to always use XOR for zeroing registers.
+(define_peephole2
+  [(set (match_operand 0 "register_operand")
+	(match_operand 1 "const0_operand"))]
+  "GET_MODE_SIZE (GET_MODE (operands[0])) <= UNITS_PER_WORD
+   && (! TARGET_USE_MOV0 || optimize_insn_for_size_p ())
+   && GENERAL_REG_P (operands[0])
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (match_dup 0) (const_int 0))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "operands[0] = gen_lowpart (word_mode, operands[0]);")
+
+(define_peephole2
+  [(set (strict_low_part (match_operand 0 "register_operand"))
+	(const_int 0))]
+  "(GET_MODE (operands[0]) == QImode
+    || GET_MODE (operands[0]) == HImode)
+   && (! TARGET_USE_MOV0 || optimize_insn_for_size_p ())
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (strict_low_part (match_dup 0)) (const_int 0))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+;; For HI, SI and DI modes, or $-1,reg is smaller than mov $-1,reg.
+(define_peephole2
+  [(set (match_operand:SWI248 0 "register_operand")
+	(const_int -1))]
+  "(optimize_insn_for_size_p () || TARGET_MOVE_M1_VIA_OR)
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (match_dup 0) (const_int -1))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  if (<MODE_SIZE> < GET_MODE_SIZE (SImode))
+    operands[0] = gen_lowpart (SImode, operands[0]);
+})
+
+;; Attempt to convert simple lea to add/shift.
+;; These can be created by move expanders.
+;; Disable PLUS peepholes on TARGET_OPT_AGU, since all
+;; relevant lea instructions were already split.
+
+(define_peephole2
+  [(set (match_operand:SWI48 0 "register_operand")
+  	(plus:SWI48 (match_dup 0)
+		    (match_operand:SWI48 1 "<nonmemory_operand>")))]
+  "!TARGET_OPT_AGU
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (match_dup 0) (plus:SWI48 (match_dup 0) (match_dup 1)))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+(define_peephole2
+  [(set (match_operand:SWI48 0 "register_operand")
+  	(plus:SWI48 (match_operand:SWI48 1 "<nonmemory_operand>")
+		    (match_dup 0)))]
+  "!TARGET_OPT_AGU
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (match_dup 0) (plus:SWI48 (match_dup 0) (match_dup 1)))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+(define_peephole2
+  [(set (match_operand:DI 0 "register_operand")
+  	(zero_extend:DI
+	  (plus:SI (match_operand:SI 1 "register_operand")
+		   (match_operand:SI 2 "nonmemory_operand"))))]
+  "TARGET_64BIT && !TARGET_OPT_AGU
+   && REGNO (operands[0]) == REGNO (operands[1])
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (match_dup 0)
+		   (zero_extend:DI (plus:SI (match_dup 1) (match_dup 2))))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+(define_peephole2
+  [(set (match_operand:DI 0 "register_operand")
+  	(zero_extend:DI
+	  (plus:SI (match_operand:SI 1 "nonmemory_operand")
+		   (match_operand:SI 2 "register_operand"))))]
+  "TARGET_64BIT && !TARGET_OPT_AGU
+   && REGNO (operands[0]) == REGNO (operands[2])
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (match_dup 0)
+		   (zero_extend:DI (plus:SI (match_dup 2) (match_dup 1))))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+(define_peephole2
+  [(set (match_operand:SWI48 0 "register_operand")
+  	(mult:SWI48 (match_dup 0)
+		    (match_operand:SWI48 1 "const_int_operand")))]
+  "exact_log2 (INTVAL (operands[1])) >= 0
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
+
+(define_peephole2
+  [(set (match_operand:DI 0 "register_operand")
+  	(zero_extend:DI
+	  (mult:SI (match_operand:SI 1 "register_operand")
+		   (match_operand:SI 2 "const_int_operand"))))]
+  "TARGET_64BIT
+   && exact_log2 (INTVAL (operands[2])) >= 0
+   && REGNO (operands[0]) == REGNO (operands[1])
+   && peep2_regno_dead_p (0, FLAGS_REG)"
+  [(parallel [(set (match_dup 0)
+		   (zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));")
+
+;; The ESP adjustments can be done by the push and pop instructions.  Resulting
+;; code is shorter, since push is only 1 byte, while add imm, %esp is 3 bytes.
+;; On many CPUs it is also faster, since special hardware to avoid esp
+;; dependencies is present.
+
+;; While some of these conversions may be done using splitters, we use
+;; peepholes in order to allow combine_stack_adjustments pass to see
+;; nonobfuscated RTL.
+
+;; Convert prologue esp subtractions to push.
+;; We need register to push.  In order to keep verify_flow_info happy we have
+;; two choices
+;; - use scratch and clobber it in order to avoid dependencies
+;; - use already live register
+;; We can't use the second way right now, since there is no reliable way how to
+;; verify that given register is live.  First choice will also most likely in
+;; fewer dependencies.  On the place of esp adjustments it is very likely that
+;; call clobbered registers are dead.  We may want to use base pointer as an
+;; alternative when no register is available later.
+
+(define_peephole2
+  [(match_scratch:W 1 "r")
+   (parallel [(set (reg:P SP_REG)
+		   (plus:P (reg:P SP_REG)
+			   (match_operand:P 0 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))
+	      (clobber (mem:BLK (scratch)))])]
+  "(TARGET_SINGLE_PUSH || optimize_insn_for_size_p ())
+   && INTVAL (operands[0]) == -GET_MODE_SIZE (word_mode)"
+  [(clobber (match_dup 1))
+   (parallel [(set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1))
+	      (clobber (mem:BLK (scratch)))])])
+
+(define_peephole2
+  [(match_scratch:W 1 "r")
+   (parallel [(set (reg:P SP_REG)
+		   (plus:P (reg:P SP_REG)
+			   (match_operand:P 0 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))
+	      (clobber (mem:BLK (scratch)))])]
+  "(TARGET_DOUBLE_PUSH || optimize_insn_for_size_p ())
+   && INTVAL (operands[0]) == -2*GET_MODE_SIZE (word_mode)"
+  [(clobber (match_dup 1))
+   (set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1))
+   (parallel [(set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1))
+	      (clobber (mem:BLK (scratch)))])])
+
+;; Convert esp subtractions to push.
+(define_peephole2
+  [(match_scratch:W 1 "r")
+   (parallel [(set (reg:P SP_REG)
+		   (plus:P (reg:P SP_REG)
+			   (match_operand:P 0 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "(TARGET_SINGLE_PUSH || optimize_insn_for_size_p ())
+   && INTVAL (operands[0]) == -GET_MODE_SIZE (word_mode)"
+  [(clobber (match_dup 1))
+   (set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1))])
+
+(define_peephole2
+  [(match_scratch:W 1 "r")
+   (parallel [(set (reg:P SP_REG)
+		   (plus:P (reg:P SP_REG)
+			   (match_operand:P 0 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "(TARGET_DOUBLE_PUSH || optimize_insn_for_size_p ())
+   && INTVAL (operands[0]) == -2*GET_MODE_SIZE (word_mode)"
+  [(clobber (match_dup 1))
+   (set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1))
+   (set (mem:W (pre_dec:P (reg:P SP_REG))) (match_dup 1))])
+
+;; Convert epilogue deallocator to pop.
+(define_peephole2
+  [(match_scratch:W 1 "r")
+   (parallel [(set (reg:P SP_REG)
+		   (plus:P (reg:P SP_REG)
+			   (match_operand:P 0 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))
+	      (clobber (mem:BLK (scratch)))])]
+  "(TARGET_SINGLE_POP || optimize_insn_for_size_p ())
+   && INTVAL (operands[0]) == GET_MODE_SIZE (word_mode)"
+  [(parallel [(set (match_dup 1) (mem:W (post_inc:P (reg:P SP_REG))))
+	      (clobber (mem:BLK (scratch)))])])
+
+;; Two pops case is tricky, since pop causes dependency
+;; on destination register.  We use two registers if available.
+(define_peephole2
+  [(match_scratch:W 1 "r")
+   (match_scratch:W 2 "r")
+   (parallel [(set (reg:P SP_REG)
+		   (plus:P (reg:P SP_REG)
+			   (match_operand:P 0 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))
+	      (clobber (mem:BLK (scratch)))])]
+  "(TARGET_DOUBLE_POP || optimize_insn_for_size_p ())
+   && INTVAL (operands[0]) == 2*GET_MODE_SIZE (word_mode)"
+  [(parallel [(set (match_dup 1) (mem:W (post_inc:P (reg:P SP_REG))))
+	      (clobber (mem:BLK (scratch)))])
+   (set (match_dup 2) (mem:W (post_inc:P (reg:P SP_REG))))])
+
+(define_peephole2
+  [(match_scratch:W 1 "r")
+   (parallel [(set (reg:P SP_REG)
+		   (plus:P (reg:P SP_REG)
+			   (match_operand:P 0 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))
+	      (clobber (mem:BLK (scratch)))])]
+  "optimize_insn_for_size_p ()
+   && INTVAL (operands[0]) == 2*GET_MODE_SIZE (word_mode)"
+  [(parallel [(set (match_dup 1) (mem:W (post_inc:P (reg:P SP_REG))))
+	      (clobber (mem:BLK (scratch)))])
+   (set (match_dup 1) (mem:W (post_inc:P (reg:P SP_REG))))])
+
+;; Convert esp additions to pop.
+(define_peephole2
+  [(match_scratch:W 1 "r")
+   (parallel [(set (reg:P SP_REG)
+		   (plus:P (reg:P SP_REG)
+			   (match_operand:P 0 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "INTVAL (operands[0]) == GET_MODE_SIZE (word_mode)"
+  [(set (match_dup 1) (mem:W (post_inc:P (reg:P SP_REG))))])
+
+;; Two pops case is tricky, since pop causes dependency
+;; on destination register.  We use two registers if available.
+(define_peephole2
+  [(match_scratch:W 1 "r")
+   (match_scratch:W 2 "r")
+   (parallel [(set (reg:P SP_REG)
+		   (plus:P (reg:P SP_REG)
+			   (match_operand:P 0 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "INTVAL (operands[0]) == 2*GET_MODE_SIZE (word_mode)"
+  [(set (match_dup 1) (mem:W (post_inc:P (reg:P SP_REG))))
+   (set (match_dup 2) (mem:W (post_inc:P (reg:P SP_REG))))])
+
+(define_peephole2
+  [(match_scratch:W 1 "r")
+   (parallel [(set (reg:P SP_REG)
+		   (plus:P (reg:P SP_REG)
+			   (match_operand:P 0 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "optimize_insn_for_size_p ()
+   && INTVAL (operands[0]) == 2*GET_MODE_SIZE (word_mode)"
+  [(set (match_dup 1) (mem:W (post_inc:P (reg:P SP_REG))))
+   (set (match_dup 1) (mem:W (post_inc:P (reg:P SP_REG))))])
+
+;; Convert compares with 1 to shorter inc/dec operations when CF is not
+;; required and register dies.  Similarly for 128 to -128.
+(define_peephole2
+  [(set (match_operand 0 "flags_reg_operand")
+	(match_operator 1 "compare_operator"
+	  [(match_operand 2 "register_operand")
+	   (match_operand 3 "const_int_operand")]))]
+  "(((!TARGET_FUSE_CMP_AND_BRANCH || optimize_insn_for_size_p ())
+     && incdec_operand (operands[3], GET_MODE (operands[3])))
+    || (!TARGET_FUSE_CMP_AND_BRANCH
+	&& INTVAL (operands[3]) == 128))
+   && ix86_match_ccmode (insn, CCGCmode)
+   && peep2_reg_dead_p (1, operands[2])"
+  [(parallel [(set (match_dup 0)
+		   (match_op_dup 1 [(match_dup 2) (match_dup 3)]))
+	      (clobber (match_dup 2))])])
+
+;; Convert imul by three, five and nine into lea
+(define_peephole2
+  [(parallel
+    [(set (match_operand:SWI48 0 "register_operand")
+	  (mult:SWI48 (match_operand:SWI48 1 "register_operand")
+		      (match_operand:SWI48 2 "const359_operand")))
+     (clobber (reg:CC FLAGS_REG))])]
+  "!TARGET_PARTIAL_REG_STALL
+   || <MODE>mode == SImode
+   || optimize_function_for_size_p (cfun)"
+  [(set (match_dup 0)
+	(plus:SWI48 (mult:SWI48 (match_dup 1) (match_dup 2))
+		    (match_dup 1)))]
+  "operands[2] = GEN_INT (INTVAL (operands[2]) - 1);")
+
+(define_peephole2
+  [(parallel
+    [(set (match_operand:SWI48 0 "register_operand")
+	  (mult:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+		      (match_operand:SWI48 2 "const359_operand")))
+     (clobber (reg:CC FLAGS_REG))])]
+  "optimize_insn_for_speed_p ()
+   && (!TARGET_PARTIAL_REG_STALL || <MODE>mode == SImode)"
+  [(set (match_dup 0) (match_dup 1))
+   (set (match_dup 0)
+	(plus:SWI48 (mult:SWI48 (match_dup 0) (match_dup 2))
+		    (match_dup 0)))]
+  "operands[2] = GEN_INT (INTVAL (operands[2]) - 1);")
+
+;; imul $32bit_imm, mem, reg is vector decoded, while
+;; imul $32bit_imm, reg, reg is direct decoded.
+(define_peephole2
+  [(match_scratch:SWI48 3 "r")
+   (parallel [(set (match_operand:SWI48 0 "register_operand")
+		   (mult:SWI48 (match_operand:SWI48 1 "memory_operand")
+			       (match_operand:SWI48 2 "immediate_operand")))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_SLOW_IMUL_IMM32_MEM && optimize_insn_for_speed_p ()
+   && !satisfies_constraint_K (operands[2])"
+  [(set (match_dup 3) (match_dup 1))
+   (parallel [(set (match_dup 0) (mult:SWI48 (match_dup 3) (match_dup 2)))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+(define_peephole2
+  [(match_scratch:SI 3 "r")
+   (parallel [(set (match_operand:DI 0 "register_operand")
+		   (zero_extend:DI
+		     (mult:SI (match_operand:SI 1 "memory_operand")
+			      (match_operand:SI 2 "immediate_operand"))))
+	      (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_64BIT
+   && TARGET_SLOW_IMUL_IMM32_MEM && optimize_insn_for_speed_p ()
+   && !satisfies_constraint_K (operands[2])"
+  [(set (match_dup 3) (match_dup 1))
+   (parallel [(set (match_dup 0)
+		   (zero_extend:DI (mult:SI (match_dup 3) (match_dup 2))))
+	      (clobber (reg:CC FLAGS_REG))])])
+
+;; imul $8/16bit_imm, regmem, reg is vector decoded.
+;; Convert it into imul reg, reg
+;; It would be better to force assembler to encode instruction using long
+;; immediate, but there is apparently no way to do so.
+(define_peephole2
+  [(parallel [(set (match_operand:SWI248 0 "register_operand")
+		   (mult:SWI248
+		    (match_operand:SWI248 1 "nonimmediate_operand")
+		    (match_operand:SWI248 2 "const_int_operand")))
+	      (clobber (reg:CC FLAGS_REG))])
+   (match_scratch:SWI248 3 "r")]
+  "TARGET_SLOW_IMUL_IMM8 && optimize_insn_for_speed_p ()
+   && satisfies_constraint_K (operands[2])"
+  [(set (match_dup 3) (match_dup 2))
+   (parallel [(set (match_dup 0) (mult:SWI248 (match_dup 0) (match_dup 3)))
+	      (clobber (reg:CC FLAGS_REG))])]
+{
+  if (!rtx_equal_p (operands[0], operands[1]))
+    emit_move_insn (operands[0], operands[1]);
+})
+
+;; After splitting up read-modify operations, array accesses with memory
+;; operands might end up in form:
+;;  sall    $2, %eax
+;;  movl    4(%esp), %edx
+;;  addl    %edx, %eax
+;; instead of pre-splitting:
+;;  sall    $2, %eax
+;;  addl    4(%esp), %eax
+;; Turn it into:
+;;  movl    4(%esp), %edx
+;;  leal    (%edx,%eax,4), %eax
+
+(define_peephole2
+  [(match_scratch:W 5 "r")
+   (parallel [(set (match_operand 0 "register_operand")
+		   (ashift (match_operand 1 "register_operand")
+			   (match_operand 2 "const_int_operand")))
+	       (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_operand 3 "register_operand")
+		   (plus (match_dup 0)
+			 (match_operand 4 "x86_64_general_operand")))
+		   (clobber (reg:CC FLAGS_REG))])]
+  "IN_RANGE (INTVAL (operands[2]), 1, 3)
+   /* Validate MODE for lea.  */
+   && ((!TARGET_PARTIAL_REG_STALL
+	&& (GET_MODE (operands[0]) == QImode
+	    || GET_MODE (operands[0]) == HImode))
+       || GET_MODE (operands[0]) == SImode
+       || (TARGET_64BIT && GET_MODE (operands[0]) == DImode))
+   && (rtx_equal_p (operands[0], operands[3])
+       || peep2_reg_dead_p (2, operands[0]))
+   /* We reorder load and the shift.  */
+   && !reg_overlap_mentioned_p (operands[0], operands[4])"
+  [(set (match_dup 5) (match_dup 4))
+   (set (match_dup 0) (match_dup 1))]
+{
+  enum machine_mode op1mode = GET_MODE (operands[1]);
+  enum machine_mode mode = op1mode == DImode ? DImode : SImode;
+  int scale = 1 << INTVAL (operands[2]);
+  rtx index = gen_lowpart (word_mode, operands[1]);
+  rtx base = gen_lowpart (word_mode, operands[5]);
+  rtx dest = gen_lowpart (mode, operands[3]);
+
+  operands[1] = gen_rtx_PLUS (word_mode, base,
+			      gen_rtx_MULT (word_mode, index, GEN_INT (scale)));
+  operands[5] = base;
+  if (mode != word_mode)
+    operands[1] = gen_rtx_SUBREG (mode, operands[1], 0);
+  if (op1mode != word_mode)
+    operands[5] = gen_rtx_SUBREG (op1mode, operands[5], 0);
+  operands[0] = dest;
+})
+
+;; We used to use "int $5", in honor of #BR which maps to interrupt vector 5.
+;; That, however, is usually mapped by the OS to SIGSEGV, which is often
+;; caught for use by garbage collectors and the like.  Using an insn that
+;; maps to SIGILL makes it more likely the program will rightfully die.
+;; Keeping with tradition, "6" is in honor of #UD.
+(define_insn "trap"
+  [(trap_if (const_int 1) (const_int 6))]
+  ""
+{
+#ifdef HAVE_AS_IX86_UD2
+  return "ud2";
+#else
+  return ASM_SHORT "0x0b0f";
+#endif
+}
+  [(set_attr "length" "2")])
+
+(define_expand "prefetch"
+  [(prefetch (match_operand 0 "address_operand")
+	     (match_operand:SI 1 "const_int_operand")
+	     (match_operand:SI 2 "const_int_operand"))]
+  "TARGET_PREFETCH_SSE || TARGET_PRFCHW || TARGET_PREFETCHWT1"
+{
+  bool write = INTVAL (operands[1]) != 0;
+  int locality = INTVAL (operands[2]);
+
+  gcc_assert (IN_RANGE (locality, 0, 3));
+
+  /* Use 3dNOW prefetch in case we are asking for write prefetch not
+     supported by SSE counterpart or the SSE prefetch is not available
+     (K6 machines).  Otherwise use SSE prefetch as it allows specifying
+     of locality.  */
+  if (TARGET_PREFETCHWT1 && write && locality <= 2)
+    operands[2] = const2_rtx;
+  else if (TARGET_PRFCHW && (write || !TARGET_PREFETCH_SSE))
+    operands[2] = GEN_INT (3);
+  else
+    operands[1] = const0_rtx;
+})
+
+(define_insn "*prefetch_sse"
+  [(prefetch (match_operand 0 "address_operand" "p")
+	     (const_int 0)
+	     (match_operand:SI 1 "const_int_operand"))]
+  "TARGET_PREFETCH_SSE"
+{
+  static const char * const patterns[4] = {
+   "prefetchnta\t%a0", "prefetcht2\t%a0", "prefetcht1\t%a0", "prefetcht0\t%a0"
+  };
+
+  int locality = INTVAL (operands[1]);
+  gcc_assert (IN_RANGE (locality, 0, 3));
+
+  return patterns[locality];
+}
+  [(set_attr "type" "sse")
+   (set_attr "atom_sse_attr" "prefetch")
+   (set (attr "length_address")
+	(symbol_ref "memory_address_length (operands[0], false)"))
+   (set_attr "memory" "none")])
+
+(define_insn "*prefetch_3dnow"
+  [(prefetch (match_operand 0 "address_operand" "p")
+	     (match_operand:SI 1 "const_int_operand" "n")
+	     (const_int 3))]
+  "TARGET_PRFCHW"
+{
+  if (INTVAL (operands[1]) == 0)
+    return "prefetch\t%a0";
+  else
+    return "prefetchw\t%a0";
+}
+  [(set_attr "type" "mmx")
+   (set (attr "length_address")
+	(symbol_ref "memory_address_length (operands[0], false)"))
+   (set_attr "memory" "none")])
+
+(define_insn "*prefetch_prefetchwt1_<mode>"
+  [(prefetch (match_operand:P 0 "address_operand" "p")
+	     (const_int 1)
+	     (const_int 2))]
+  "TARGET_PREFETCHWT1"
+  "prefetchwt1\t%a0";
+  [(set_attr "type" "sse")
+   (set (attr "length_address")
+	(symbol_ref "memory_address_length (operands[0], false)"))
+   (set_attr "memory" "none")])
+
+(define_expand "stack_protect_set"
+  [(match_operand 0 "memory_operand")
+   (match_operand 1 "memory_operand")]
+  "TARGET_SSP_TLS_GUARD"
+{
+  rtx (*insn)(rtx, rtx);
+
+#ifdef TARGET_THREAD_SSP_OFFSET
+  operands[1] = GEN_INT (TARGET_THREAD_SSP_OFFSET);
+  insn = (TARGET_LP64
+	  ? gen_stack_tls_protect_set_di
+	  : gen_stack_tls_protect_set_si);
+#else
+  insn = (TARGET_LP64
+	  ? gen_stack_protect_set_di
+	  : gen_stack_protect_set_si);
+#endif
+
+  emit_insn (insn (operands[0], operands[1]));
+  DONE;
+})
+
+(define_insn "stack_protect_set_<mode>"
+  [(set (match_operand:PTR 0 "memory_operand" "=m")
+	(unspec:PTR [(match_operand:PTR 1 "memory_operand" "m")]
+		    UNSPEC_SP_SET))
+   (set (match_scratch:PTR 2 "=&r") (const_int 0))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_SSP_TLS_GUARD"
+  "mov{<imodesuffix>}\t{%1, %2|%2, %1}\;mov{<imodesuffix>}\t{%2, %0|%0, %2}\;xor{l}\t%k2, %k2"
+  [(set_attr "type" "multi")])
+
+(define_insn "stack_tls_protect_set_<mode>"
+  [(set (match_operand:PTR 0 "memory_operand" "=m")
+	(unspec:PTR [(match_operand:PTR 1 "const_int_operand" "i")]
+		    UNSPEC_SP_TLS_SET))
+   (set (match_scratch:PTR 2 "=&r") (const_int 0))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "mov{<imodesuffix>}\t{%@:%P1, %2|%2, <iptrsize> PTR %@:%P1}\;mov{<imodesuffix>}\t{%2, %0|%0, %2}\;xor{l}\t%k2, %k2"
+  [(set_attr "type" "multi")])
+
+(define_expand "stack_protect_test"
+  [(match_operand 0 "memory_operand")
+   (match_operand 1 "memory_operand")
+   (match_operand 2)]
+  "TARGET_SSP_TLS_GUARD"
+{
+  rtx flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+
+  rtx (*insn)(rtx, rtx, rtx);
+
+#ifdef TARGET_THREAD_SSP_OFFSET
+  operands[1] = GEN_INT (TARGET_THREAD_SSP_OFFSET);
+  insn = (TARGET_LP64
+	  ? gen_stack_tls_protect_test_di
+	  : gen_stack_tls_protect_test_si);
+#else
+  insn = (TARGET_LP64
+	  ? gen_stack_protect_test_di
+	  : gen_stack_protect_test_si);
+#endif
+
+  emit_insn (insn (flags, operands[0], operands[1]));
+
+  emit_jump_insn (gen_cbranchcc4 (gen_rtx_EQ (VOIDmode, flags, const0_rtx),
+				  flags, const0_rtx, operands[2]));
+  DONE;
+})
+
+(define_insn "stack_protect_test_<mode>"
+  [(set (match_operand:CCZ 0 "flags_reg_operand")
+	(unspec:CCZ [(match_operand:PTR 1 "memory_operand" "m")
+		     (match_operand:PTR 2 "memory_operand" "m")]
+		    UNSPEC_SP_TEST))
+   (clobber (match_scratch:PTR 3 "=&r"))]
+  "TARGET_SSP_TLS_GUARD"
+  "mov{<imodesuffix>}\t{%1, %3|%3, %1}\;xor{<imodesuffix>}\t{%2, %3|%3, %2}"
+  [(set_attr "type" "multi")])
+
+(define_insn "stack_tls_protect_test_<mode>"
+  [(set (match_operand:CCZ 0 "flags_reg_operand")
+	(unspec:CCZ [(match_operand:PTR 1 "memory_operand" "m")
+		     (match_operand:PTR 2 "const_int_operand" "i")]
+		    UNSPEC_SP_TLS_TEST))
+   (clobber (match_scratch:PTR 3 "=r"))]
+  ""
+  "mov{<imodesuffix>}\t{%1, %3|%3, %1}\;xor{<imodesuffix>}\t{%@:%P2, %3|%3, <iptrsize> PTR %@:%P2}"
+  [(set_attr "type" "multi")])
+
+(define_insn "sse4_2_crc32<mode>"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI
+	  [(match_operand:SI 1 "register_operand" "0")
+	   (match_operand:SWI124 2 "nonimmediate_operand" "<r>m")]
+	  UNSPEC_CRC32))]
+  "TARGET_SSE4_2 || TARGET_CRC32"
+  "crc32{<imodesuffix>}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix_extra" "1")
+   (set (attr "prefix_data16")
+     (if_then_else (match_operand:HI 2)
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "prefix_rex")
+     (if_then_else (match_operand:QI 2 "ext_QIreg_operand")
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "mode" "SI")])
+
+(define_insn "sse4_2_crc32di"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec:DI
+	  [(match_operand:DI 1 "register_operand" "0")
+	   (match_operand:DI 2 "nonimmediate_operand" "rm")]
+	  UNSPEC_CRC32))]
+  "TARGET_64BIT && (TARGET_SSE4_2 || TARGET_CRC32)"
+  "crc32{q}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "DI")])
+
+(define_insn "rdpmc"
+  [(set (match_operand:DI 0 "register_operand" "=A")
+  	(unspec_volatile:DI [(match_operand:SI 1 "register_operand" "c")]
+			    UNSPECV_RDPMC))]
+  "!TARGET_64BIT"
+  "rdpmc"
+  [(set_attr "type" "other")
+   (set_attr "length" "2")])
+
+(define_insn "rdpmc_rex64"
+  [(set (match_operand:DI 0 "register_operand" "=a")
+  	(unspec_volatile:DI [(match_operand:SI 2 "register_operand" "c")]
+			    UNSPECV_RDPMC))
+   (set (match_operand:DI 1 "register_operand" "=d")
+	(unspec_volatile:DI [(match_dup 2)] UNSPECV_RDPMC))]
+  "TARGET_64BIT"
+  "rdpmc"
+  [(set_attr "type" "other")
+   (set_attr "length" "2")])
+
+(define_insn "rdtsc"
+  [(set (match_operand:DI 0 "register_operand" "=A")
+	(unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSC))]
+  "!TARGET_64BIT"
+  "rdtsc"
+  [(set_attr "type" "other")
+   (set_attr "length" "2")])
+
+(define_insn "rdtsc_rex64"
+  [(set (match_operand:DI 0 "register_operand" "=a")
+	(unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSC))
+   (set (match_operand:DI 1 "register_operand" "=d")
+	(unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSC))]
+  "TARGET_64BIT"
+  "rdtsc"
+  [(set_attr "type" "other")
+   (set_attr "length" "2")])
+
+(define_insn "rdtscp"
+  [(set (match_operand:DI 0 "register_operand" "=A")
+	(unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSCP))
+   (set (match_operand:SI 1 "register_operand" "=c")
+	(unspec_volatile:SI [(const_int 0)] UNSPECV_RDTSCP))]
+  "!TARGET_64BIT"
+  "rdtscp"
+  [(set_attr "type" "other")
+   (set_attr "length" "3")])
+
+(define_insn "rdtscp_rex64"
+  [(set (match_operand:DI 0 "register_operand" "=a")
+	(unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSCP))
+   (set (match_operand:DI 1 "register_operand" "=d")
+	(unspec_volatile:DI [(const_int 0)] UNSPECV_RDTSCP))
+   (set (match_operand:SI 2 "register_operand" "=c")
+	(unspec_volatile:SI [(const_int 0)] UNSPECV_RDTSCP))]
+  "TARGET_64BIT"
+  "rdtscp"
+  [(set_attr "type" "other")
+   (set_attr "length" "3")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; FXSR, XSAVE and XSAVEOPT instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "fxsave"
+  [(set (match_operand:BLK 0 "memory_operand" "=m")
+	(unspec_volatile:BLK [(const_int 0)] UNSPECV_FXSAVE))]
+  "TARGET_FXSR"
+  "fxsave\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "store")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
+
+(define_insn "fxsave64"
+  [(set (match_operand:BLK 0 "memory_operand" "=m")
+	(unspec_volatile:BLK [(const_int 0)] UNSPECV_FXSAVE64))]
+  "TARGET_64BIT && TARGET_FXSR"
+  "fxsave64\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "store")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
+
+(define_insn "fxrstor"
+  [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "m")]
+		    UNSPECV_FXRSTOR)]
+  "TARGET_FXSR"
+  "fxrstor\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "load")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
+
+(define_insn "fxrstor64"
+  [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "m")]
+		    UNSPECV_FXRSTOR64)]
+  "TARGET_64BIT && TARGET_FXSR"
+  "fxrstor64\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "load")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
+
+(define_int_iterator ANY_XSAVE
+	[UNSPECV_XSAVE
+	 (UNSPECV_XSAVEOPT "TARGET_XSAVEOPT")])
+
+(define_int_iterator ANY_XSAVE64
+	[UNSPECV_XSAVE64
+	 (UNSPECV_XSAVEOPT64 "TARGET_XSAVEOPT")])
+
+(define_int_attr xsave
+	[(UNSPECV_XSAVE "xsave")
+	 (UNSPECV_XSAVE64 "xsave64")
+	 (UNSPECV_XSAVEOPT "xsaveopt")
+	 (UNSPECV_XSAVEOPT64 "xsaveopt64")])
+
+(define_insn "<xsave>"
+  [(set (match_operand:BLK 0 "memory_operand" "=m")
+	(unspec_volatile:BLK
+	 [(match_operand:DI 1 "register_operand" "A")]
+	 ANY_XSAVE))]
+  "!TARGET_64BIT && TARGET_XSAVE"
+  "<xsave>\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "store")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
+
+(define_insn "<xsave>_rex64"
+  [(set (match_operand:BLK 0 "memory_operand" "=m")
+	(unspec_volatile:BLK
+	 [(match_operand:SI 1 "register_operand" "a")
+	  (match_operand:SI 2 "register_operand" "d")]
+	 ANY_XSAVE))]
+  "TARGET_64BIT && TARGET_XSAVE"
+  "<xsave>\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "store")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
+
+(define_insn "<xsave>"
+  [(set (match_operand:BLK 0 "memory_operand" "=m")
+	(unspec_volatile:BLK
+	 [(match_operand:SI 1 "register_operand" "a")
+	  (match_operand:SI 2 "register_operand" "d")]
+	 ANY_XSAVE64))]
+  "TARGET_64BIT && TARGET_XSAVE"
+  "<xsave>\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "store")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
+
+(define_insn "xrstor"
+   [(unspec_volatile:BLK
+     [(match_operand:BLK 0 "memory_operand" "m")
+      (match_operand:DI 1 "register_operand" "A")]
+     UNSPECV_XRSTOR)]
+  "!TARGET_64BIT && TARGET_XSAVE"
+  "xrstor\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "load")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
+
+(define_insn "xrstor_rex64"
+   [(unspec_volatile:BLK
+     [(match_operand:BLK 0 "memory_operand" "m")
+      (match_operand:SI 1 "register_operand" "a")
+      (match_operand:SI 2 "register_operand" "d")]
+     UNSPECV_XRSTOR)]
+  "TARGET_64BIT && TARGET_XSAVE"
+  "xrstor\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "load")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 3"))])
+
+(define_insn "xrstor64"
+   [(unspec_volatile:BLK
+     [(match_operand:BLK 0 "memory_operand" "m")
+      (match_operand:SI 1 "register_operand" "a")
+      (match_operand:SI 2 "register_operand" "d")]
+     UNSPECV_XRSTOR64)]
+  "TARGET_64BIT && TARGET_XSAVE"
+  "xrstor64\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "load")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 4"))])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Floating-point instructions for atomic compound assignments
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Clobber all floating-point registers on environment save and restore
+; to ensure that the TOS value saved at fnstenv is valid after fldenv.
+(define_insn "fnstenv"
+  [(set (match_operand:BLK 0 "memory_operand" "=m")
+	(unspec_volatile:BLK [(const_int 0)] UNSPECV_FNSTENV))
+   (clobber (reg:HI FPCR_REG))
+   (clobber (reg:XF ST0_REG))
+   (clobber (reg:XF ST1_REG))
+   (clobber (reg:XF ST2_REG))
+   (clobber (reg:XF ST3_REG))
+   (clobber (reg:XF ST4_REG))
+   (clobber (reg:XF ST5_REG))
+   (clobber (reg:XF ST6_REG))
+   (clobber (reg:XF ST7_REG))]
+  "TARGET_80387"
+  "fnstenv\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "store")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 2"))])
+
+(define_insn "fldenv"
+  [(unspec_volatile [(match_operand:BLK 0 "memory_operand" "m")]
+		    UNSPECV_FLDENV)
+   (clobber (reg:CCFP FPSR_REG))
+   (clobber (reg:HI FPCR_REG))
+   (clobber (reg:XF ST0_REG))
+   (clobber (reg:XF ST1_REG))
+   (clobber (reg:XF ST2_REG))
+   (clobber (reg:XF ST3_REG))
+   (clobber (reg:XF ST4_REG))
+   (clobber (reg:XF ST5_REG))
+   (clobber (reg:XF ST6_REG))
+   (clobber (reg:XF ST7_REG))]
+  "TARGET_80387"
+  "fldenv\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "load")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 2"))])
+
+(define_insn "fnstsw"
+  [(set (match_operand:HI 0 "memory_operand" "=m")
+	(unspec_volatile:HI [(const_int 0)] UNSPECV_FNSTSW))]
+  "TARGET_80387"
+  "fnstsw\t%0"
+  [(set_attr "type" "other")
+   (set_attr "memory" "store")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 2"))])
+
+(define_insn "fnclex"
+  [(unspec_volatile [(const_int 0)] UNSPECV_FNCLEX)]
+  "TARGET_80387"
+  "fnclex"
+  [(set_attr "type" "other")
+   (set_attr "memory" "none")
+   (set_attr "length" "2")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; LWP instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "lwp_llwpcb"
+  [(unspec_volatile [(match_operand 0 "register_operand" "r")]
+		    UNSPECV_LLWP_INTRINSIC)]
+  "TARGET_LWP")
+
+(define_insn "*lwp_llwpcb<mode>1"
+  [(unspec_volatile [(match_operand:P 0 "register_operand" "r")]
+		    UNSPECV_LLWP_INTRINSIC)]
+  "TARGET_LWP"
+  "llwpcb\t%0"
+  [(set_attr "type" "lwp")
+   (set_attr "mode" "<MODE>")
+   (set_attr "length" "5")])
+
+(define_expand "lwp_slwpcb"
+  [(set (match_operand 0 "register_operand" "=r")
+	(unspec_volatile [(const_int 0)] UNSPECV_SLWP_INTRINSIC))]
+  "TARGET_LWP"
+{
+  rtx (*insn)(rtx);
+
+  insn = (Pmode == DImode
+	  ? gen_lwp_slwpcbdi
+	  : gen_lwp_slwpcbsi);
+
+  emit_insn (insn (operands[0]));
+  DONE;
+})
+
+(define_insn "lwp_slwpcb<mode>"
+  [(set (match_operand:P 0 "register_operand" "=r")
+	(unspec_volatile:P [(const_int 0)] UNSPECV_SLWP_INTRINSIC))]
+  "TARGET_LWP"
+  "slwpcb\t%0"
+  [(set_attr "type" "lwp")
+   (set_attr "mode" "<MODE>")
+   (set_attr "length" "5")])
+
+(define_expand "lwp_lwpval<mode>3"
+  [(unspec_volatile [(match_operand:SWI48 1 "register_operand" "r")
+    	    	     (match_operand:SI 2 "nonimmediate_operand" "rm")
+		     (match_operand:SI 3 "const_int_operand" "i")]
+		    UNSPECV_LWPVAL_INTRINSIC)]
+  "TARGET_LWP"
+  ;; Avoid unused variable warning.
+  "(void) operands[0];")
+
+(define_insn "*lwp_lwpval<mode>3_1"
+  [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r")
+    	    	     (match_operand:SI 1 "nonimmediate_operand" "rm")
+		     (match_operand:SI 2 "const_int_operand" "i")]
+		    UNSPECV_LWPVAL_INTRINSIC)]
+  "TARGET_LWP"
+  "lwpval\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "lwp")
+   (set_attr "mode" "<MODE>")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 9"))])
+
+(define_expand "lwp_lwpins<mode>3"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec_volatile:CCC [(match_operand:SWI48 1 "register_operand" "r")
+			      (match_operand:SI 2 "nonimmediate_operand" "rm")
+			      (match_operand:SI 3 "const_int_operand" "i")]
+			     UNSPECV_LWPINS_INTRINSIC))
+   (set (match_operand:QI 0 "nonimmediate_operand" "=qm")
+	(eq:QI (reg:CCC FLAGS_REG) (const_int 0)))]
+  "TARGET_LWP")
+
+(define_insn "*lwp_lwpins<mode>3_1"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec_volatile:CCC [(match_operand:SWI48 0 "register_operand" "r")
+			      (match_operand:SI 1 "nonimmediate_operand" "rm")
+			      (match_operand:SI 2 "const_int_operand" "i")]
+			     UNSPECV_LWPINS_INTRINSIC))]
+  "TARGET_LWP"
+  "lwpins\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "lwp")
+   (set_attr "mode" "<MODE>")
+   (set (attr "length")
+        (symbol_ref "ix86_attr_length_address_default (insn) + 9"))])
+
+(define_int_iterator RDFSGSBASE
+	[UNSPECV_RDFSBASE
+	 UNSPECV_RDGSBASE])
+
+(define_int_iterator WRFSGSBASE
+	[UNSPECV_WRFSBASE
+	 UNSPECV_WRGSBASE])
+
+(define_int_attr fsgs
+	[(UNSPECV_RDFSBASE "fs")
+	 (UNSPECV_RDGSBASE "gs")
+	 (UNSPECV_WRFSBASE "fs")
+	 (UNSPECV_WRGSBASE "gs")])
+
+(define_insn "rd<fsgs>base<mode>"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(unspec_volatile:SWI48 [(const_int 0)] RDFSGSBASE))]
+  "TARGET_64BIT && TARGET_FSGSBASE"
+  "rd<fsgs>base\t%0"
+  [(set_attr "type" "other")
+   (set_attr "prefix_extra" "2")])
+
+(define_insn "wr<fsgs>base<mode>"
+  [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r")]
+		    WRFSGSBASE)]
+  "TARGET_64BIT && TARGET_FSGSBASE"
+  "wr<fsgs>base\t%0"
+  [(set_attr "type" "other")
+   (set_attr "prefix_extra" "2")])
+
+(define_insn "rdrand<mode>_1"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+	(unspec_volatile:SWI248 [(const_int 0)] UNSPECV_RDRAND))
+   (set (reg:CCC FLAGS_REG)
+	(unspec_volatile:CCC [(const_int 0)] UNSPECV_RDRAND))]
+  "TARGET_RDRND"
+  "rdrand\t%0"
+  [(set_attr "type" "other")
+   (set_attr "prefix_extra" "1")])
+
+(define_insn "rdseed<mode>_1"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+	(unspec_volatile:SWI248 [(const_int 0)] UNSPECV_RDSEED))
+   (set (reg:CCC FLAGS_REG)
+	(unspec_volatile:CCC [(const_int 0)] UNSPECV_RDSEED))]
+  "TARGET_RDSEED"
+  "rdseed\t%0"
+  [(set_attr "type" "other")
+   (set_attr "prefix_extra" "1")])
+
+(define_expand "pause"
+  [(set (match_dup 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_PAUSE))]
+  ""
+{
+  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+  MEM_VOLATILE_P (operands[0]) = 1;
+})
+
+;; Use "rep; nop", instead of "pause", to support older assemblers.
+;; They have the same encoding.
+(define_insn "*pause"
+  [(set (match_operand:BLK 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_PAUSE))]
+  ""
+  "rep%; nop"
+  [(set_attr "length" "2")
+   (set_attr "memory" "unknown")])
+
+(define_expand "xbegin"
+  [(set (match_operand:SI 0 "register_operand")
+	(unspec_volatile:SI [(const_int 0)] UNSPECV_XBEGIN))]
+  "TARGET_RTM"
+{
+  rtx label = gen_label_rtx ();
+
+  /* xbegin is emitted as jump_insn, so reload won't be able
+     to reload its operand.  Force the value into AX hard register.  */
+  rtx ax_reg = gen_rtx_REG (SImode, AX_REG);
+  emit_move_insn (ax_reg, constm1_rtx);
+
+  emit_jump_insn (gen_xbegin_1 (ax_reg, label));
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operands[0], ax_reg);
+
+  DONE;
+})
+
+(define_insn "xbegin_1"
+  [(set (pc)
+	(if_then_else (ne (unspec [(const_int 0)] UNSPEC_XBEGIN_ABORT)
+			  (const_int 0))
+		      (label_ref (match_operand 1))
+		      (pc)))
+   (set (match_operand:SI 0 "register_operand" "+a")
+	(unspec_volatile:SI [(match_dup 0)] UNSPECV_XBEGIN))]
+  "TARGET_RTM"
+  "xbegin\t%l1"
+  [(set_attr "type" "other")
+   (set_attr "length" "6")])
+
+(define_insn "xend"
+  [(unspec_volatile [(const_int 0)] UNSPECV_XEND)]
+  "TARGET_RTM"
+  "xend"
+  [(set_attr "type" "other")
+   (set_attr "length" "3")])
+
+(define_insn "xabort"
+  [(unspec_volatile [(match_operand:SI 0 "const_0_to_255_operand" "n")]
+		    UNSPECV_XABORT)]
+  "TARGET_RTM"
+  "xabort\t%0"
+  [(set_attr "type" "other")
+   (set_attr "length" "3")])
+
+(define_expand "xtest"
+  [(set (match_operand:QI 0 "register_operand")
+	(unspec_volatile:QI [(const_int 0)] UNSPECV_XTEST))]
+  "TARGET_RTM"
+{
+  emit_insn (gen_xtest_1 ());
+
+  ix86_expand_setcc (operands[0], NE,
+		     gen_rtx_REG (CCZmode, FLAGS_REG), const0_rtx);
+  DONE;
+})
+
+(define_insn "xtest_1"
+  [(set (reg:CCZ FLAGS_REG)
+	(unspec_volatile:CCZ [(const_int 0)] UNSPECV_XTEST))]
+  "TARGET_RTM"
+  "xtest"
+  [(set_attr "type" "other")
+   (set_attr "length" "3")])
+
+(include "mmx.md")
+(include "sse.md")
+(include "sync.md")
diff --git a/gcc-4.9/gcc/config/i386/i386.opt b/gcc-4.9/gcc/config/i386/i386.opt
new file mode 100644
index 000000000..0f463a238
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386.opt
@@ -0,0 +1,796 @@
+; Options for the IA-32 and AMD64 ports of the compiler.
+
+; Copyright (C) 2005-2014 Free Software Foundation, Inc.
+;
+; This file is part of GCC.
+;
+; GCC is free software; you can redistribute it and/or modify it under
+; the terms of the GNU General Public License as published by the Free
+; Software Foundation; either version 3, or (at your option) any later
+; version.
+;
+; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+; for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with GCC; see the file COPYING3.  If not see
+; <http://www.gnu.org/licenses/>.
+
+HeaderInclude
+config/i386/i386-opts.h
+
+; Bit flags that specify the ISA we are compiling for.
+Variable
+HOST_WIDE_INT ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT
+
+; A mask of ix86_isa_flags that includes bit X if X was set or cleared
+; on the command line.
+Variable
+HOST_WIDE_INT ix86_isa_flags_explicit
+
+TargetVariable
+int recip_mask = RECIP_MASK_DEFAULT
+
+Variable
+int recip_mask_explicit
+
+TargetSave
+int x_recip_mask_explicit
+
+;; Definitions to add to the cl_target_option structure
+;; -march= processor
+TargetSave
+unsigned char arch
+
+;; -mtune= processor
+TargetSave
+unsigned char tune
+
+;; -march= processor-string
+TargetSave
+const char *x_ix86_arch_string
+
+;; -mtune= processor-string
+TargetSave
+const char *x_ix86_tune_string
+
+;; CPU schedule model
+TargetSave
+unsigned char schedule
+
+;; branch cost
+TargetSave
+unsigned char branch_cost
+
+;; which flags were passed by the user
+TargetSave
+HOST_WIDE_INT x_ix86_isa_flags_explicit
+
+;; which flags were passed by the user
+Variable
+int ix86_target_flags_explicit
+
+;; which flags were passed by the user
+TargetSave
+HOST_WIDE_INT x_ix86_target_flags_explicit
+
+;; whether -mtune was not specified
+TargetSave
+unsigned char tune_defaulted
+
+;; whether -march was specified
+TargetSave
+unsigned char arch_specified
+
+;; -mcmodel= model
+TargetSave
+enum cmodel x_ix86_cmodel
+
+;; -mabi=
+TargetSave
+enum calling_abi x_ix86_abi
+
+;; -masm=
+TargetSave
+enum asm_dialect x_ix86_asm_dialect
+
+;; -mbranch-cost=
+TargetSave
+int x_ix86_branch_cost
+
+;; -mdump-tune-features= 
+TargetSave
+int x_ix86_dump_tunes
+
+;; -mstackrealign=
+TargetSave
+int x_ix86_force_align_arg_pointer
+
+;; -mforce-drap= 
+TargetSave
+int x_ix86_force_drap
+
+;; -mincoming-stack-boundary=
+TargetSave
+int x_ix86_incoming_stack_boundary_arg
+
+;; -maddress-mode=
+TargetSave
+enum pmode x_ix86_pmode
+
+;; -mpreferred-stack-boundary= 
+TargetSave
+int x_ix86_preferred_stack_boundary_arg
+
+;; -mrecip=
+TargetSave
+const char *x_ix86_recip_name
+
+;; -mregparm=
+TargetSave
+int x_ix86_regparm
+
+;; -mlarge-data-threshold=
+TargetSave
+int x_ix86_section_threshold
+
+;; -msse2avx=
+TargetSave
+int x_ix86_sse2avx
+
+;; -mstack-protector-guard=
+TargetSave
+enum stack_protector_guard x_ix86_stack_protector_guard
+
+;; -mstringop-strategy=
+TargetSave
+enum stringop_alg x_ix86_stringop_alg
+
+;; -mtls-dialect=
+TargetSave
+enum tls_dialect x_ix86_tls_dialect
+
+;; -mtune-ctrl=
+TargetSave
+const char *x_ix86_tune_ctrl_string
+
+;; -mmemcpy-strategy=
+TargetSave
+const char *x_ix86_tune_memcpy_strategy
+
+;; -mmemset-strategy=
+TargetSave
+const char *x_ix86_tune_memset_strategy
+
+;; -mno-default=
+TargetSave
+int x_ix86_tune_no_default
+
+;; -mveclibabi=
+TargetSave
+enum ix86_veclibabi x_ix86_veclibabi_type
+
+;; x86 options
+m128bit-long-double
+Target RejectNegative Report Mask(128BIT_LONG_DOUBLE) Save
+sizeof(long double) is 16
+
+m80387
+Target Report Mask(80387) Save
+Use hardware fp
+
+m96bit-long-double
+Target RejectNegative Report InverseMask(128BIT_LONG_DOUBLE) Save
+sizeof(long double) is 12
+
+mlong-double-80
+Target Report RejectNegative Negative(mlong-double-64) InverseMask(LONG_DOUBLE_64) Save
+Use 80-bit long double
+
+mlong-double-64
+Target Report RejectNegative Negative(mlong-double-128) Mask(LONG_DOUBLE_64) InverseMask(LONG_DOUBLE_128) Save
+Use 64-bit long double
+
+mlong-double-128
+Target Report RejectNegative Negative(mlong-double-80) Mask(LONG_DOUBLE_128) InverseMask(LONG_DOUBLE_64) Save
+Use 128-bit long double
+
+maccumulate-outgoing-args
+Target Report Mask(ACCUMULATE_OUTGOING_ARGS) Save
+Reserve space for outgoing arguments in the function prologue
+
+malign-double
+Target Report Mask(ALIGN_DOUBLE) Save
+Align some doubles on dword boundary
+
+malign-functions=
+Target RejectNegative Joined UInteger
+Function starts are aligned to this power of 2
+
+malign-jumps=
+Target RejectNegative Joined UInteger
+Jump targets are aligned to this power of 2
+
+malign-loops=
+Target RejectNegative Joined UInteger
+Loop code aligned to this power of 2
+
+malign-stringops
+Target RejectNegative Report InverseMask(NO_ALIGN_STRINGOPS, ALIGN_STRINGOPS) Save
+Align destination of the string operations
+
+march=
+Target RejectNegative Joined Var(ix86_arch_string)
+Generate code for given CPU
+
+masm=
+Target RejectNegative Joined Enum(asm_dialect) Var(ix86_asm_dialect) Init(ASM_ATT)
+Use given assembler dialect
+
+Enum
+Name(asm_dialect) Type(enum asm_dialect)
+Known assembler dialects (for use with the -masm-dialect= option):
+
+EnumValue
+Enum(asm_dialect) String(intel) Value(ASM_INTEL)
+
+EnumValue
+Enum(asm_dialect) String(att) Value(ASM_ATT)
+
+mbranch-cost=
+Target RejectNegative Joined UInteger Var(ix86_branch_cost)
+Branches are this expensive (1-5, arbitrary units)
+
+mlarge-data-threshold=
+Target RejectNegative Joined UInteger Var(ix86_section_threshold) Init(DEFAULT_LARGE_SECTION_THRESHOLD)
+Data greater than given threshold will go into .ldata section in x86-64 medium model
+
+mcmodel=
+Target RejectNegative Joined Enum(cmodel) Var(ix86_cmodel) Init(CM_32)
+Use given x86-64 code model
+
+Enum
+Name(cmodel) Type(enum cmodel)
+Known code models (for use with the -mcmodel= option):
+
+EnumValue
+Enum(cmodel) String(small) Value(CM_SMALL)
+
+EnumValue
+Enum(cmodel) String(medium) Value(CM_MEDIUM)
+
+EnumValue
+Enum(cmodel) String(large) Value(CM_LARGE)
+
+EnumValue
+Enum(cmodel) String(32) Value(CM_32)
+
+EnumValue
+Enum(cmodel) String(kernel) Value(CM_KERNEL)
+
+maddress-mode=
+Target RejectNegative Joined Enum(pmode) Var(ix86_pmode) Init(PMODE_SI)
+Use given address mode
+
+Enum
+Name(pmode) Type(enum pmode)
+Known address mode (for use with the -maddress-mode= option):
+
+EnumValue
+Enum(pmode) String(short) Value(PMODE_SI)
+
+EnumValue
+Enum(pmode) String(long) Value(PMODE_DI)
+
+mcpu=
+Target RejectNegative Joined Undocumented Alias(mtune=) Warn(%<-mcpu=%> is deprecated; use %<-mtune=%> or %<-march=%> instead)
+
+mfancy-math-387
+Target RejectNegative Report InverseMask(NO_FANCY_MATH_387, USE_FANCY_MATH_387) Save
+Generate sin, cos, sqrt for FPU
+
+mforce-drap
+Target Report Var(ix86_force_drap)
+Always use Dynamic Realigned Argument Pointer (DRAP) to realign stack
+
+mfp-ret-in-387
+Target Report Mask(FLOAT_RETURNS) Save
+Return values of functions in FPU registers
+
+mfpmath=
+Target RejectNegative Joined Var(ix86_fpmath) Enum(fpmath_unit) Init(FPMATH_387) Save
+Generate floating point mathematics using given instruction set
+
+Enum
+Name(fpmath_unit) Type(enum fpmath_unit)
+Valid arguments to -mfpmath=:
+
+EnumValue
+Enum(fpmath_unit) String(387) Value(FPMATH_387)
+
+EnumValue
+Enum(fpmath_unit) String(sse) Value(FPMATH_SSE)
+
+EnumValue
+Enum(fpmath_unit) String(387,sse) Value({(enum fpmath_unit) (FPMATH_SSE | FPMATH_387)})
+
+EnumValue
+Enum(fpmath_unit) String(387+sse) Value({(enum fpmath_unit) (FPMATH_SSE | FPMATH_387)})
+
+EnumValue
+Enum(fpmath_unit) String(sse,387) Value({(enum fpmath_unit) (FPMATH_SSE | FPMATH_387)})
+
+EnumValue
+Enum(fpmath_unit) String(sse+387) Value({(enum fpmath_unit) (FPMATH_SSE | FPMATH_387)})
+
+EnumValue
+Enum(fpmath_unit) String(both) Value({(enum fpmath_unit) (FPMATH_SSE | FPMATH_387)})
+
+mhard-float
+Target RejectNegative Mask(80387) Save
+Use hardware fp
+
+mieee-fp
+Target Report Mask(IEEE_FP) Save
+Use IEEE math for fp comparisons
+
+minline-all-stringops
+Target Report Mask(INLINE_ALL_STRINGOPS) Save
+Inline all known string operations
+
+minline-stringops-dynamically
+Target Report Mask(INLINE_STRINGOPS_DYNAMICALLY) Save
+Inline memset/memcpy string operations, but perform inline version only for small blocks
+
+mintel-syntax
+Target Undocumented Alias(masm=, intel, att) Warn(%<-mintel-syntax%> and %<-mno-intel-syntax%> are deprecated; use %<-masm=intel%> and %<-masm=att%> instead)
+;; Deprecated
+
+mms-bitfields
+Target Report Mask(MS_BITFIELD_LAYOUT) Save
+Use native (MS) bitfield layout
+
+mno-align-stringops
+Target RejectNegative Report Mask(NO_ALIGN_STRINGOPS) Undocumented Save
+
+mno-fancy-math-387
+Target RejectNegative Report Mask(NO_FANCY_MATH_387) Undocumented Save
+
+mno-push-args
+Target RejectNegative Report Mask(NO_PUSH_ARGS) Undocumented Save
+
+mno-red-zone
+Target RejectNegative Report Mask(NO_RED_ZONE) Undocumented Save
+
+momit-leaf-frame-pointer
+Target Report Mask(OMIT_LEAF_FRAME_POINTER) Save
+Omit the frame pointer in leaf functions
+
+mpc32
+Target RejectNegative Report
+Set 80387 floating-point precision to 32-bit
+
+mpc64
+Target RejectNegative Report
+Set 80387 floating-point precision to 64-bit
+
+mpc80
+Target RejectNegative Report
+Set 80387 floating-point precision to 80-bit
+
+mpreferred-stack-boundary=
+Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
+Attempt to keep stack aligned to this power of 2
+
+mincoming-stack-boundary=
+Target RejectNegative Joined UInteger Var(ix86_incoming_stack_boundary_arg)
+Assume incoming stack aligned to this power of 2
+
+mpush-args
+Target Report InverseMask(NO_PUSH_ARGS, PUSH_ARGS) Save
+Use push instructions to save outgoing arguments
+
+mred-zone
+Target RejectNegative Report InverseMask(NO_RED_ZONE, RED_ZONE) Save
+Use red-zone in the x86-64 code
+
+mregparm=
+Target RejectNegative Joined UInteger Var(ix86_regparm)
+Number of registers used to pass integer arguments
+
+mrtd
+Target Report Mask(RTD) Save
+Alternate calling convention
+
+msoft-float
+Target InverseMask(80387) Save
+Do not use hardware fp
+
+msseregparm
+Target RejectNegative Mask(SSEREGPARM) Save
+Use SSE register passing conventions for SF and DF mode
+
+mstackrealign
+Target Report Var(ix86_force_align_arg_pointer) Init(-1)
+Realign stack in prologue
+
+mstack-arg-probe
+Target Report Mask(STACK_PROBE) Save
+Enable stack probing
+
+mmemcpy-strategy=
+Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
+Specify memcpy expansion strategy when expected size is known
+
+mmemset-strategy=
+Target RejectNegative Joined Var(ix86_tune_memset_strategy)
+Specify memset expansion strategy when expected size is known
+
+mstringop-strategy=
+Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
+Chose strategy to generate stringop using
+
+Enum
+Name(stringop_alg) Type(enum stringop_alg)
+Valid arguments to -mstringop-strategy=:
+
+EnumValue
+Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
+
+EnumValue
+Enum(stringop_alg) String(libcall) Value(libcall)
+
+EnumValue
+Enum(stringop_alg) String(rep_4byte) Value(rep_prefix_4_byte)
+
+EnumValue
+Enum(stringop_alg) String(rep_8byte) Value(rep_prefix_8_byte)
+
+EnumValue
+Enum(stringop_alg) String(byte_loop) Value(loop_1_byte)
+
+EnumValue
+Enum(stringop_alg) String(loop) Value(loop)
+
+EnumValue
+Enum(stringop_alg) String(unrolled_loop) Value(unrolled_loop)
+
+EnumValue
+Enum(stringop_alg) String(vector_loop) Value(vector_loop)
+
+mtls-dialect=
+Target RejectNegative Joined Var(ix86_tls_dialect) Enum(tls_dialect) Init(TLS_DIALECT_GNU)
+Use given thread-local storage dialect
+
+Enum
+Name(tls_dialect) Type(enum tls_dialect)
+Known TLS dialects (for use with the -mtls-dialect= option):
+
+EnumValue
+Enum(tls_dialect) String(gnu) Value(TLS_DIALECT_GNU)
+
+EnumValue
+Enum(tls_dialect) String(gnu2) Value(TLS_DIALECT_GNU2)
+
+mtls-direct-seg-refs
+Target Report Mask(TLS_DIRECT_SEG_REFS)
+Use direct references against %gs when accessing tls data
+
+mtune=
+Target RejectNegative Joined Var(ix86_tune_string)
+Schedule code for given CPU
+
+mtune-ctrl=
+Target RejectNegative Joined Var(ix86_tune_ctrl_string)
+Fine grain control of tune features
+
+mno-default
+Target RejectNegative Var(ix86_tune_no_default) Init(0)
+Clear all tune features
+
+mdump-tune-features
+Target RejectNegative Var(ix86_dump_tunes) Init(0)
+
+mabi=
+Target RejectNegative Joined Var(ix86_abi) Enum(calling_abi) Init(SYSV_ABI)
+Generate code that conforms to the given ABI
+
+Enum
+Name(calling_abi) Type(enum calling_abi)
+Known ABIs (for use with the -mabi= option):
+
+EnumValue
+Enum(calling_abi) String(sysv) Value(SYSV_ABI)
+
+EnumValue
+Enum(calling_abi) String(ms) Value(MS_ABI)
+
+mveclibabi=
+Target RejectNegative Joined Var(ix86_veclibabi_type) Enum(ix86_veclibabi) Init(ix86_veclibabi_type_none)
+Vector library ABI to use
+
+Enum
+Name(ix86_veclibabi) Type(enum ix86_veclibabi)
+Known vectorization library ABIs (for use with the -mveclibabi= option):
+
+EnumValue
+Enum(ix86_veclibabi) String(svml) Value(ix86_veclibabi_type_svml)
+
+EnumValue
+Enum(ix86_veclibabi) String(acml) Value(ix86_veclibabi_type_acml)
+
+mvect8-ret-in-mem
+Target Report Mask(VECT8_RETURNS) Save
+Return 8-byte vectors in memory
+
+mrecip
+Target Report Mask(RECIP) Save
+Generate reciprocals instead of divss and sqrtss.
+
+mrecip=
+Target Report RejectNegative Joined Var(ix86_recip_name)
+Control generation of reciprocal estimates.
+
+mcld
+Target Report Mask(CLD) Save
+Generate cld instruction in the function prologue.
+
+mvzeroupper
+Target Report Mask(VZEROUPPER) Save
+Generate vzeroupper instruction before a transfer of control flow out of
+the function.
+
+mdispatch-scheduler
+Target RejectNegative Var(flag_dispatch_scheduler)
+Do dispatch scheduling if processor is bdver1 or bdver2 or bdver3 or bdver4 and Haifa scheduling
+is selected.
+
+mprefer-avx128
+Target Report Mask(PREFER_AVX128) SAVE
+Use 128-bit AVX instructions instead of 256-bit AVX instructions in the auto-vectorizer.
+
+;; ISA support
+
+m32
+Target RejectNegative Negative(m64) Report InverseMask(ISA_64BIT) Var(ix86_isa_flags) Save
+Generate 32bit i386 code
+
+m64
+Target RejectNegative Negative(mx32) Report Mask(ABI_64) Var(ix86_isa_flags) Save
+Generate 64bit x86-64 code
+
+mx32
+Target RejectNegative Negative(m16) Report Mask(ABI_X32) Var(ix86_isa_flags) Save
+Generate 32bit x86-64 code
+
+m16
+Target RejectNegative Negative(m32) Report Mask(CODE16) InverseMask(ISA_64BIT) Var(ix86_isa_flags) Save
+Generate 16bit i386 code
+
+mmmx
+Target Report Mask(ISA_MMX) Var(ix86_isa_flags) Save
+Support MMX built-in functions
+
+m3dnow
+Target Report Mask(ISA_3DNOW) Var(ix86_isa_flags) Save
+Support 3DNow! built-in functions
+
+m3dnowa
+Target Undocumented Mask(ISA_3DNOW_A) Var(ix86_isa_flags) Save
+Support Athlon 3Dnow! built-in functions
+
+msse
+Target Report Mask(ISA_SSE) Var(ix86_isa_flags) Save
+Support MMX and SSE built-in functions and code generation
+
+msse2
+Target Report Mask(ISA_SSE2) Var(ix86_isa_flags) Save
+Support MMX, SSE and SSE2 built-in functions and code generation
+
+msse3
+Target Report Mask(ISA_SSE3) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2 and SSE3 built-in functions and code generation
+
+mssse3
+Target Report Mask(ISA_SSSE3) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation
+
+msse4.1
+Target Report Mask(ISA_SSE4_1) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3 and SSE4.1 built-in functions and code generation
+
+msse4.2
+Target Report Mask(ISA_SSE4_2) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1 and SSE4.2 built-in functions and code generation
+
+msse4
+Target RejectNegative Report Mask(ISA_SSE4_2) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1 and SSE4.2 built-in functions and code generation
+
+mno-sse4
+Target RejectNegative Report InverseMask(ISA_SSE4_1) Var(ix86_isa_flags) Save
+Do not support SSE4.1 and SSE4.2 built-in functions and code generation
+
+msse5
+Target Undocumented Alias(mavx) Warn(%<-msse5%> was removed)
+;; Deprecated
+
+mavx
+Target Report Mask(ISA_AVX) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AVX built-in functions and code generation
+
+mavx2
+Target Report Mask(ISA_AVX2) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation
+
+mavx512f
+Target Report Mask(ISA_AVX512F) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F built-in functions and code generation
+
+mavx512pf
+Target Report Mask(ISA_AVX512PF) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512PF built-in functions and code generation
+
+mavx512er
+Target Report Mask(ISA_AVX512ER) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512ER built-in functions and code generation
+
+mavx512cd
+Target Report Mask(ISA_AVX512CD) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512CD built-in functions and code generation
+
+mfma
+Target Report Mask(ISA_FMA) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and FMA built-in functions and code generation
+
+msse4a
+Target Report Mask(ISA_SSE4A) Var(ix86_isa_flags) Save
+Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation
+
+mfma4
+Target Report Mask(ISA_FMA4) Var(ix86_isa_flags) Save
+Support FMA4 built-in functions and code generation 
+
+mxop
+Target Report Mask(ISA_XOP) Var(ix86_isa_flags) Save
+Support XOP built-in functions and code generation 
+
+mlwp
+Target Report Mask(ISA_LWP) Var(ix86_isa_flags) Save
+Support LWP built-in functions and code generation 
+
+mabm
+Target Report Mask(ISA_ABM) Var(ix86_isa_flags) Save
+Support code generation of Advanced Bit Manipulation (ABM) instructions.
+
+mpopcnt
+Target Report Mask(ISA_POPCNT) Var(ix86_isa_flags) Save
+Support code generation of popcnt instruction.
+
+mbmi
+Target Report Mask(ISA_BMI) Var(ix86_isa_flags) Save
+Support BMI built-in functions and code generation
+
+mbmi2
+Target Report Mask(ISA_BMI2) Var(ix86_isa_flags) Save
+Support BMI2 built-in functions and code generation
+
+mlzcnt
+Target Report Mask(ISA_LZCNT) Var(ix86_isa_flags) Save
+Support LZCNT built-in function and code generation
+
+mhle
+Target Report Mask(ISA_HLE) Var(ix86_isa_flags) Save
+Support Hardware Lock Elision prefixes
+
+mrdseed
+Target Report Mask(ISA_RDSEED) Var(ix86_isa_flags) Save
+Support RDSEED instruction
+
+mprfchw
+Target Report Mask(ISA_PRFCHW) Var(ix86_isa_flags) Save
+Support PREFETCHW instruction
+
+madx
+Target Report Mask(ISA_ADX) Var(ix86_isa_flags) Save
+Support flag-preserving add-carry instructions
+
+mfxsr
+Target Report Mask(ISA_FXSR) Var(ix86_isa_flags) Save
+Support FXSAVE and FXRSTOR instructions
+
+mxsave
+Target Report Mask(ISA_XSAVE) Var(ix86_isa_flags) Save
+Support XSAVE and XRSTOR instructions
+
+mxsaveopt
+Target Report Mask(ISA_XSAVEOPT) Var(ix86_isa_flags) Save
+Support XSAVEOPT instruction
+
+mtbm
+Target Report Mask(ISA_TBM) Var(ix86_isa_flags) Save
+Support TBM built-in functions and code generation
+
+mcx16
+Target Report Mask(ISA_CX16) Var(ix86_isa_flags) Save
+Support code generation of cmpxchg16b instruction.
+
+msahf
+Target Report Mask(ISA_SAHF) Var(ix86_isa_flags) Save
+Support code generation of sahf instruction in 64bit x86-64 code.
+
+mmovbe
+Target Report Mask(ISA_MOVBE) Var(ix86_isa_flags) Save
+Support code generation of movbe instruction.
+
+mcrc32
+Target Report Mask(ISA_CRC32) Var(ix86_isa_flags) Save
+Support code generation of crc32 instruction.
+
+maes
+Target Report Mask(ISA_AES) Var(ix86_isa_flags) Save
+Support AES built-in functions and code generation
+
+msha
+Target Report Mask(ISA_SHA) Var(ix86_isa_flags) Save
+Support SHA1 and SHA256 built-in functions and code generation
+
+mpclmul
+Target Report Mask(ISA_PCLMUL) Var(ix86_isa_flags) Save
+Support PCLMUL built-in functions and code generation
+
+msse2avx
+Target Report Var(ix86_sse2avx)
+Encode SSE instructions with VEX prefix
+
+mfsgsbase
+Target Report Mask(ISA_FSGSBASE) Var(ix86_isa_flags) Save
+Support FSGSBASE built-in functions and code generation
+
+mrdrnd
+Target Report Mask(ISA_RDRND) Var(ix86_isa_flags) Save
+Support RDRND built-in functions and code generation
+
+mf16c
+Target Report Mask(ISA_F16C) Var(ix86_isa_flags) Save
+Support F16C built-in functions and code generation
+
+mprefetchwt1
+Target Report Mask(ISA_PREFETCHWT1) Var(ix86_isa_flags) Save
+Support PREFETCHWT1 built-in functions and code generation
+
+mfentry
+Target Report Var(flag_fentry) Init(-1)
+Emit profiling counter call at function entry before prologue.
+
+m8bit-idiv
+Target Report Mask(USE_8BIT_IDIV) Save
+Expand 32bit/64bit integer divide into 8bit unsigned integer divide with run-time check
+
+mavx256-split-unaligned-load
+Target Report Mask(AVX256_SPLIT_UNALIGNED_LOAD) Save
+Split 32-byte AVX unaligned load
+
+mavx256-split-unaligned-store
+Target Report Mask(AVX256_SPLIT_UNALIGNED_STORE) Save
+Split 32-byte AVX unaligned store
+
+mrtm
+Target Report Mask(ISA_RTM) Var(ix86_isa_flags) Save
+Support RTM built-in functions and code generation
+
+mstack-protector-guard=
+Target RejectNegative Joined Enum(stack_protector_guard) Var(ix86_stack_protector_guard) Init(SSP_TLS)
+Use given stack-protector guard
+
+Enum
+Name(stack_protector_guard) Type(enum stack_protector_guard)
+Known stack protector guard (for use with the -mstack-protector-guard= option):
+
+EnumValue
+Enum(stack_protector_guard) String(tls) Value(SSP_TLS)
+
+EnumValue
+Enum(stack_protector_guard) String(global) Value(SSP_GLOBAL)
diff --git a/gcc-4.9/gcc/config/i386/i386elf.h b/gcc-4.9/gcc/config/i386/i386elf.h
new file mode 100644
index 000000000..73e119ddd
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/i386elf.h
@@ -0,0 +1,103 @@
+/* Target definitions for GCC for Intel 80386 using ELF
+   Copyright (C) 1988-2014 Free Software Foundation, Inc.
+
+   Derived from sysv4.h written by Ron Guilmette (rfg@netcom.com).
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* The ELF ABI for the i386 says that records and unions are returned
+   in memory.  */
+
+#define SUBTARGET_RETURN_IN_MEMORY(TYPE, FNTYPE) \
+	(TYPE_MODE (TYPE) == BLKmode \
+	 || (VECTOR_MODE_P (TYPE_MODE (TYPE)) && int_size_in_bytes (TYPE) == 8))
+
+#undef CPP_SPEC
+#define CPP_SPEC ""
+
+#define ENDFILE_SPEC "crtend.o%s"
+
+#define STARTFILE_SPEC "%{!shared: \
+			 %{!symbolic: \
+			  %{pg:gcrt0.o%s}%{!pg:%{p:mcrt0.o%s}%{!p:crt0.o%s}}}}\
+			crtbegin.o%s"
+
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n) \
+  (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n])
+
+/* The routine used to output sequences of byte values.  We use a special
+   version of this for most svr4 targets because doing so makes the
+   generated assembly code more compact (and thus faster to assemble)
+   as well as more readable.  Note that if we find subparts of the
+   character sequence which end with NUL (and which are shorter than
+   ELF_STRING_LIMIT) we output those using ASM_OUTPUT_LIMITED_STRING.  */
+
+#undef ASM_OUTPUT_ASCII
+#define ASM_OUTPUT_ASCII(FILE, STR, LENGTH)				\
+  do									\
+    {									\
+      const unsigned char *_ascii_bytes =				\
+        (const unsigned char *) (STR);					\
+      const unsigned char *limit = _ascii_bytes + (LENGTH);		\
+      unsigned bytes_in_chunk = 0;					\
+      for (; _ascii_bytes < limit; _ascii_bytes++)			\
+	{								\
+	  const unsigned char *p;					\
+	  if (bytes_in_chunk >= 64)					\
+	    {								\
+	      fputc ('\n', (FILE));					\
+	      bytes_in_chunk = 0;					\
+	    }								\
+	  for (p = _ascii_bytes; p < limit && *p != '\0'; p++)		\
+	    continue;							\
+	  if (p < limit && (p - _ascii_bytes) <= (long) ELF_STRING_LIMIT) \
+	    {								\
+	      if (bytes_in_chunk > 0)					\
+		{							\
+		  fputc ('\n', (FILE));					\
+		  bytes_in_chunk = 0;					\
+		}							\
+	      ASM_OUTPUT_LIMITED_STRING ((FILE), (const char *) _ascii_bytes); \
+	      _ascii_bytes = p;						\
+	    }								\
+	  else								\
+	    {								\
+	      if (bytes_in_chunk == 0)					\
+		fputs (ASM_BYTE, (FILE));				\
+	      else							\
+		fputc (',', (FILE));					\
+	      fprintf ((FILE), "0x%02x", *_ascii_bytes);			\
+	      bytes_in_chunk += 5;					\
+	    }								\
+	}								\
+      if (bytes_in_chunk > 0)						\
+	fputc ('\n', (FILE));						\
+    }									\
+  while (0)
+
+#define LOCAL_LABEL_PREFIX	"."
+
+/* Switch into a generic section.  */
+#define TARGET_ASM_NAMED_SECTION  default_elf_asm_named_section
+
+#undef BSS_SECTION_ASM_OP
+#define BSS_SECTION_ASM_OP "\t.section\t.bss"
+
+#undef ASM_OUTPUT_ALIGNED_BSS
+#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \
+  asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN)
diff --git a/gcc-4.9/gcc/config/i386/ia32intrin.h b/gcc-4.9/gcc/config/i386/ia32intrin.h
new file mode 100644
index 000000000..5e7c893fe
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/ia32intrin.h
@@ -0,0 +1,293 @@
+/* Copyright (C) 2009-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+/* 32bit bsf */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsfd (int __X)
+{
+  return __builtin_ctz (__X);
+}
+
+/* 32bit bsr */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsrd (int __X)
+{
+  return __builtin_ia32_bsrsi (__X);
+}
+
+/* 32bit bswap */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bswapd (int __X)
+{
+  return __builtin_bswap32 (__X);
+}
+
+#ifndef __SSE4_2__
+#pragma GCC push_options
+#pragma GCC target("sse4.2")
+#define __DISABLE_SSE4_2__
+#endif /* __SSE4_2__ */
+
+/* 32bit accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32b (unsigned int __C, unsigned char __V)
+{
+  return __builtin_ia32_crc32qi (__C, __V);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32w (unsigned int __C, unsigned short __V)
+{
+  return __builtin_ia32_crc32hi (__C, __V);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32d (unsigned int __C, unsigned int __V)
+{
+  return __builtin_ia32_crc32si (__C, __V);
+}
+
+#ifdef __DISABLE_SSE4_2__
+#undef __DISABLE_SSE4_2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_2__ */
+
+/* 32bit popcnt */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__popcntd (unsigned int __X)
+{
+  return __builtin_popcount (__X);
+}
+
+/* rdpmc */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rdpmc (int __S)
+{
+  return __builtin_ia32_rdpmc (__S);
+}
+
+/* rdtsc */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rdtsc (void)
+{
+  return __builtin_ia32_rdtsc ();
+}
+
+/* rdtscp */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rdtscp (unsigned int *__A)
+{
+  return __builtin_ia32_rdtscp (__A);
+}
+
+/* 8bit rol */
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolb (unsigned char __X, int __C)
+{
+  return __builtin_ia32_rolqi (__X, __C);
+}
+
+/* 16bit rol */
+extern __inline unsigned short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolw (unsigned short __X, int __C)
+{
+  return __builtin_ia32_rolhi (__X, __C);
+}
+
+/* 32bit rol */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rold (unsigned int __X, int __C)
+{
+  return (__X << __C) | (__X >> (32 - __C));
+}
+
+/* 8bit ror */
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorb (unsigned char __X, int __C)
+{
+  return __builtin_ia32_rorqi (__X, __C);
+}
+
+/* 16bit ror */
+extern __inline unsigned short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorw (unsigned short __X, int __C)
+{
+  return __builtin_ia32_rorhi (__X, __C);
+}
+
+/* 32bit ror */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rord (unsigned int __X, int __C)
+{
+  return (__X >> __C) | (__X << (32 - __C));
+}
+
+/* Pause */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__pause (void)
+{
+  __builtin_ia32_pause ();
+}
+
+#ifdef __x86_64__
+/* 64bit bsf */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsfq (long long __X)
+{
+  return __builtin_ctzll (__X);
+}
+
+/* 64bit bsr */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsrq (long long __X)
+{
+  return __builtin_ia32_bsrdi (__X);
+}
+
+/* 64bit bswap */
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bswapq (long long __X)
+{
+  return __builtin_bswap64 (__X);
+}
+
+#ifndef __SSE4_2__
+#pragma GCC push_options
+#pragma GCC target("sse4.2")
+#define __DISABLE_SSE4_2__
+#endif /* __SSE4_2__ */
+
+/* 64bit accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32q (unsigned long long __C, unsigned long long __V)
+{
+  return __builtin_ia32_crc32di (__C, __V);
+}
+
+#ifdef __DISABLE_SSE4_2__
+#undef __DISABLE_SSE4_2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_2__ */
+
+/* 64bit popcnt */
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__popcntq (unsigned long long __X)
+{
+  return __builtin_popcountll (__X);
+}
+
+/* 64bit rol */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolq (unsigned long long __X, int __C)
+{
+  return (__X << __C) | (__X >> (64 - __C));
+}
+
+/* 64bit ror */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorq (unsigned long long __X, int __C)
+{
+  return (__X >> __C) | (__X << (64 - __C));
+}
+
+/* Read flags register */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__readeflags (void)
+{
+  return __builtin_ia32_readeflags_u64 ();
+}
+
+/* Write flags register */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__writeeflags (unsigned long long X)
+{
+  __builtin_ia32_writeeflags_u64 (X);
+}
+
+#define _bswap64(a)		__bswapq(a)
+#define _popcnt64(a)		__popcntq(a)
+#define _lrotl(a,b)		__rolq((a), (b))
+#define _lrotr(a,b)		__rorq((a), (b))
+#else
+#define _lrotl(a,b)		__rold((a), (b))
+#define _lrotr(a,b)		__rord((a), (b))
+
+/* Read flags register */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__readeflags (void)
+{
+  return __builtin_ia32_readeflags_u32 ();
+}
+
+/* Write flags register */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__writeeflags (unsigned int X)
+{
+  __builtin_ia32_writeeflags_u32 (X);
+}
+
+#endif
+
+#define _bit_scan_forward(a)	__bsfd(a)
+#define _bit_scan_reverse(a)	__bsrd(a)
+#define _bswap(a)		__bswapd(a)
+#define _popcnt32(a)		__popcntd(a)
+#define _rdpmc(a)		__rdpmc(a)
+#define _rdtsc()		__rdtsc()
+#define _rdtscp(a)		__rdtscp(a)
+#define _rotwl(a,b)		__rolw((a), (b))
+#define _rotwr(a,b)		__rorw((a), (b))
+#define _rotl(a,b)		__rold((a), (b))
+#define _rotr(a,b)		__rord((a), (b))
diff --git a/gcc-4.9/gcc/config/i386/immintrin.h b/gcc-4.9/gcc/config/i386/immintrin.h
new file mode 100644
index 000000000..73b485992
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/immintrin.h
@@ -0,0 +1,177 @@
+/* Copyright (C) 2008-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#define _IMMINTRIN_H_INCLUDED
+
+#include <mmintrin.h>
+
+#include <xmmintrin.h>
+
+#include <emmintrin.h>
+
+#include <pmmintrin.h>
+
+#include <tmmintrin.h>
+
+#include <smmintrin.h>
+
+#include <wmmintrin.h>
+
+#include <avxintrin.h>
+
+#include <avx2intrin.h>
+
+#include <avx512fintrin.h>
+
+#include <avx512erintrin.h>
+
+#include <avx512pfintrin.h>
+
+#include <avx512cdintrin.h>
+
+#include <shaintrin.h>
+
+#include <lzcntintrin.h>
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#include <fmaintrin.h>
+
+#include <f16cintrin.h>
+
+#include <rtmintrin.h>
+
+#include <xtestintrin.h>
+
+#ifndef __RDRND__
+#pragma GCC push_options
+#pragma GCC target("rdrnd")
+#define __DISABLE_RDRND__
+#endif /* __RDRND__ */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand16_step (unsigned short *__P)
+{
+  return __builtin_ia32_rdrand16_step (__P);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand32_step (unsigned int *__P)
+{
+  return __builtin_ia32_rdrand32_step (__P);
+}
+#ifdef __DISABLE_RDRND__
+#undef __DISABLE_RDRND__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDRND__ */
+
+#ifdef  __x86_64__
+
+#ifndef __FSGSBASE__
+#pragma GCC push_options
+#pragma GCC target("fsgsbase")
+#define __DISABLE_FSGSBASE__
+#endif /* __FSGSBASE__ */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readfsbase_u32 (void)
+{
+  return __builtin_ia32_rdfsbase32 ();
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readfsbase_u64 (void)
+{
+  return __builtin_ia32_rdfsbase64 ();
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readgsbase_u32 (void)
+{
+  return __builtin_ia32_rdgsbase32 ();
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readgsbase_u64 (void)
+{
+  return __builtin_ia32_rdgsbase64 ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writefsbase_u32 (unsigned int __B)
+{
+  __builtin_ia32_wrfsbase32 (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writefsbase_u64 (unsigned long long __B)
+{
+  __builtin_ia32_wrfsbase64 (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writegsbase_u32 (unsigned int __B)
+{
+  __builtin_ia32_wrgsbase32 (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writegsbase_u64 (unsigned long long __B)
+{
+  __builtin_ia32_wrgsbase64 (__B);
+}
+#ifdef __DISABLE_FSGSBASE__
+#undef __DISABLE_FSGSBASE__
+#pragma GCC pop_options
+#endif /* __DISABLE_FSGSBASE__ */
+
+#ifndef __RDRND__
+#pragma GCC push_options
+#pragma GCC target("rdrnd")
+#define __DISABLE_RDRND__
+#endif /* __RDRND__ */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand64_step (unsigned long long *__P)
+{
+  return __builtin_ia32_rdrand64_step (__P);
+}
+#ifdef __DISABLE_RDRND__
+#undef __DISABLE_RDRND__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDRND__ */
+
+#endif /* __x86_64__  */
+
+#endif /* _IMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/interix.opt b/gcc-4.9/gcc/config/i386/interix.opt
new file mode 100644
index 000000000..a8c7230a2
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/interix.opt
@@ -0,0 +1,34 @@
+; Interix-specific options.
+
+; Copyright (C) 2005-2014 Free Software Foundation, Inc.
+;
+; This file is part of GCC.
+;
+; GCC is free software; you can redistribute it and/or modify it under
+; the terms of the GNU General Public License as published by the Free
+; Software Foundation; either version 3, or (at your option) any later
+; version.
+;
+; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+; for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with GCC; see the file COPYING3.  If not see
+; <http://www.gnu.org/licenses/>.
+
+dynamic
+Driver
+
+G
+Driver
+
+posix
+Driver
+
+mpe-aligned-commons
+Target Var(use_pe_aligned_common) Init(HAVE_GAS_ALIGNED_COMM)
+Use the GNU extension to the PE format for aligned common data
+
+; This comment is to ensure we retain the blank line above.
diff --git a/gcc-4.9/gcc/config/i386/k6.md b/gcc-4.9/gcc/config/i386/k6.md
new file mode 100644
index 000000000..dadb39201
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/k6.md
@@ -0,0 +1,266 @@
+;; AMD K6/K6-2 Scheduling
+;; Copyright (C) 2002-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; The K6 architecture is quite similar to PPro.  Important difference is
+;; that there are only two decoders and they seems to be much slower than
+;; any of the execution units.  So we have to pay much more attention to
+;; proper scheduling for the decoders.
+;; FIXME: We don't do that right now.  A good start would be to sort the
+;;        instructions based on length.
+;;
+;; This description is based on data from the following documents:
+;;
+;;    "AMD-K6 Processor Data Sheet (Preliminary information)"
+;;    Advanced Micro Devices, Inc., 1998.
+;;
+;;    "AMD-K6 Processor Code Optimization Application Note"
+;;    Advanced Micro Devices, Inc., 2000.
+;;
+;; CPU execution units of the K6:
+;;
+;; store	describes the Store unit.  This unit is not modelled
+;;		completely and it is only used to model lea operation.
+;;		Otherwise it lies outside of any critical path.
+;; load		describes the Load unit
+;; alux		describes the Integer X unit
+;; mm		describes the Multimedia unit, which shares a pipe
+;;		with the Integer X unit.  This unit is used for MMX,
+;;		which is not implemented for K6.
+;; aluy		describes the Integer Y unit
+;; fpu		describes the FPU unit
+;; branch	describes the Branch unit
+;;
+;; The fp unit is not pipelined, and it can only do one operation per two
+;; cycles, including fxcg.
+;;
+;; Generally this is a very poor description, but at least no worse than
+;; the old description, and a lot easier to extend to something more
+;; reasonable if anyone still cares enough about this architecture in 2004.
+;;
+;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
+
+(define_automaton "k6_decoder,k6_load_unit,k6_store_unit,k6_integer_units,k6_fpu_unit,k6_branch_unit")
+
+;; The K6 instruction decoding begins before the on-chip instruction cache is
+;; filled.  Depending on the length of the instruction, two simple instructions
+;; can be decoded in two parallel short decoders, or one complex instruction can
+;; be decoded in either the long or the vector decoder.  For all practical
+;; purposes, the long and vector decoder can be modelled as one decoder.
+(define_cpu_unit "k6_decode_short0" "k6_decoder")
+(define_cpu_unit "k6_decode_short1" "k6_decoder")
+(define_cpu_unit "k6_decode_long" "k6_decoder")
+(exclusion_set "k6_decode_long" "k6_decode_short0,k6_decode_short1")
+(define_reservation "k6_decode_short" "k6_decode_short0|k6_decode_short1")
+(define_reservation "k6_decode_vector" "k6_decode_long")
+
+(define_cpu_unit "k6_store" "k6_store_unit")
+(define_cpu_unit "k6_load" "k6_load_unit")
+(define_cpu_unit "k6_alux,k6_aluy" "k6_integer_units")
+(define_cpu_unit "k6_fpu" "k6_fpu_unit")
+(define_cpu_unit "k6_branch" "k6_branch_unit")
+
+;; Shift instructions and certain arithmetic are issued only on Integer X.
+(define_insn_reservation "k6_alux_only" 1
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot")
+				   (eq_attr "memory" "none")))
+			 "k6_decode_short,k6_alux")
+
+(define_insn_reservation "k6_alux_only_load" 3
+			 (and (eq_attr "cpu" "k6")
+			       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot")
+				    (eq_attr "memory" "load")))
+			 "k6_decode_short,k6_load,k6_alux")
+
+(define_insn_reservation "k6_alux_only_store" 3
+			 (and (eq_attr "cpu" "k6")
+			       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot")
+				    (eq_attr "memory" "store,both,unknown")))
+			 "k6_decode_long,k6_load,k6_alux,k6_store")
+
+;; Integer divide and multiply can only be issued on Integer X, too.
+(define_insn_reservation "k6_alu_imul" 2
+			 (and (eq_attr "cpu" "k6")
+			      (eq_attr "type" "imul"))
+			 "k6_decode_vector,k6_alux*3")
+
+(define_insn_reservation "k6_alu_imul_load" 4
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "load")))
+			 "k6_decode_vector,k6_load,k6_alux*3")
+
+(define_insn_reservation "k6_alu_imul_store" 4
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "imul")
+				   (eq_attr "memory" "store,both,unknown")))
+			 "k6_decode_vector,k6_load,k6_alux*3,k6_store")
+
+;; ??? Guessed latencies based on the old pipeline description.
+(define_insn_reservation "k6_alu_idiv" 17
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "idiv")
+				   (eq_attr "memory" "none")))
+			 "k6_decode_vector,k6_alux*17")
+
+(define_insn_reservation "k6_alu_idiv_mem" 19
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "idiv")
+				   (eq_attr "memory" "!none")))
+			 "k6_decode_vector,k6_load,k6_alux*17")
+
+;; Basic word and doubleword ALU ops can be issued on both Integer units.
+(define_insn_reservation "k6_alu" 1
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
+				   (eq_attr "memory" "none")))
+			 "k6_decode_short,k6_alux|k6_aluy")
+
+(define_insn_reservation "k6_alu_load" 3
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
+				   (eq_attr "memory" "load")))
+			 "k6_decode_short,k6_load,k6_alux|k6_aluy")
+
+(define_insn_reservation "k6_alu_store" 3
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
+				   (eq_attr "memory" "store,both,unknown")))
+			 "k6_decode_long,k6_load,k6_alux|k6_aluy,k6_store")
+
+;; A "load immediate" operation does not require execution at all,
+;; it is available immediately after decoding.  Special-case this.
+(define_insn_reservation "k6_alu_imov" 1
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "imov")
+				   (and (eq_attr "memory" "none")
+					(match_operand 1 "nonimmediate_operand"))))
+			 "k6_decode_short,k6_alux|k6_aluy")
+
+(define_insn_reservation "k6_alu_imov_imm" 0
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "imov")
+				   (and (eq_attr "memory" "none")
+					(match_operand 1 "immediate_operand"))))
+			 "k6_decode_short")
+
+(define_insn_reservation "k6_alu_imov_load" 2
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "load")))
+			 "k6_decode_short,k6_load")
+
+(define_insn_reservation "k6_alu_imov_store" 1
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "store")))
+			 "k6_decode_short,k6_store")
+
+(define_insn_reservation "k6_alu_imov_both" 2
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "imov")
+				   (eq_attr "memory" "both,unknown")))
+			 "k6_decode_long,k6_load,k6_alux|k6_aluy")
+
+;; The branch unit.
+(define_insn_reservation "k6_branch_call" 1
+			 (and (eq_attr "cpu" "k6")
+			      (eq_attr "type" "call,callv"))
+			 "k6_decode_vector,k6_branch")
+
+(define_insn_reservation "k6_branch_branch" 1
+			 (and (eq_attr "cpu" "k6")
+			      (eq_attr "type" "ibr"))
+			 "k6_decode_short,k6_branch")
+
+;; The load and units have two pipeline stages.  The load latency is
+;; two cycles.
+(define_insn_reservation "k6_load_pop" 3
+			 (and (eq_attr "cpu" "k6")
+			      (ior (eq_attr "type" "pop")
+				   (eq_attr "memory" "load,both")))
+			 "k6_decode_short,k6_load")
+
+(define_insn_reservation "k6_load_leave" 5
+			 (and (eq_attr "cpu" "k6")
+			      (eq_attr "type" "leave"))
+			 "k6_decode_long,k6_load,(k6_alux|k6_aluy)*2")
+
+;; ??? From the old pipeline description.  Egad!
+;; ??? Apparently we take care of this reservation in adjust_cost.
+(define_insn_reservation "k6_load_str" 10
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "str")
+				   (eq_attr "memory" "load,both")))
+			 "k6_decode_vector,k6_load*10")
+
+;; The store unit handles lea and push.  It is otherwise unmodelled.
+(define_insn_reservation "k6_store_lea" 2
+			 (and (eq_attr "cpu" "k6")
+			      (eq_attr "type" "lea"))
+			 "k6_decode_short,k6_store,k6_alux|k6_aluy")
+
+(define_insn_reservation "k6_store_push" 2
+			 (and (eq_attr "cpu" "k6")
+			      (ior (eq_attr "type" "push")
+				   (eq_attr "memory" "store,both")))
+			 "k6_decode_short,k6_store")
+
+(define_insn_reservation "k6_store_str" 10
+			 (and (eq_attr "cpu" "k6")
+			      (eq_attr "type" "str"))
+			 "k6_store*10")
+
+;; Most FPU instructions have latency 2 and throughput 2.
+(define_insn_reservation "k6_fpu" 2
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "fop,fmov,fcmp,fistp")
+				   (eq_attr "memory" "none")))
+			 "k6_decode_vector,k6_fpu*2")
+
+(define_insn_reservation "k6_fpu_load" 6
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "fop,fmov,fcmp,fistp")
+				   (eq_attr "memory" "load,both")))
+			 "k6_decode_short,k6_load,k6_fpu*2")
+
+(define_insn_reservation "k6_fpu_store" 6
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "fop,fmov,fcmp,fistp")
+				   (eq_attr "memory" "store")))
+			 "k6_decode_short,k6_store,k6_fpu*2")
+
+(define_insn_reservation "k6_fpu_fmul" 2
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "fmul")
+				   (eq_attr "memory" "none")))
+			 "k6_decode_short,k6_fpu*2")
+
+(define_insn_reservation "k6_fpu_fmul_load" 2
+			 (and (eq_attr "cpu" "k6")
+			      (and (eq_attr "type" "fmul")
+				   (eq_attr "memory" "load,both")))
+			 "k6_decode_short,k6_load,k6_fpu*2")
+
+;; ??? Guessed latencies from the old pipeline description.
+(define_insn_reservation "k6_fpu_expensive" 56
+			 (and (eq_attr "cpu" "k6")
+			      (eq_attr "type" "fdiv,fpspc"))
+			 "k6_decode_short,k6_fpu*56")
+
diff --git a/gcc-4.9/gcc/config/i386/kfreebsd-gnu.h b/gcc-4.9/gcc/config/i386/kfreebsd-gnu.h
new file mode 100644
index 000000000..e487205a7
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/kfreebsd-gnu.h
@@ -0,0 +1,22 @@
+/* Definitions for Intel 386 running kFreeBSD-based GNU systems with ELF format
+   Copyright (C) 2011-2014 Free Software Foundation, Inc.
+   Contributed by Robert Millan.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define GNU_USER_LINK_EMULATION "elf_i386_fbsd"
+#define GLIBC_DYNAMIC_LINKER "/lib/ld.so.1"
diff --git a/gcc-4.9/gcc/config/i386/kfreebsd-gnu64.h b/gcc-4.9/gcc/config/i386/kfreebsd-gnu64.h
new file mode 100644
index 000000000..1c75c8eb5
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/kfreebsd-gnu64.h
@@ -0,0 +1,27 @@
+/* Definitions for AMD x86-64 running kFreeBSD-based GNU systems with ELF format
+   Copyright (C) 2011-2014 Free Software Foundation, Inc.
+   Contributed by Robert Millan.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define GNU_USER_LINK_EMULATION32 "elf_i386_fbsd"
+#define GNU_USER_LINK_EMULATION64 "elf_x86_64_fbsd"
+#define GNU_USER_LINK_EMULATIONX32 "elf32_x86_64_fbsd"
+
+#define GLIBC_DYNAMIC_LINKER32 "/lib/ld.so.1"
+#define GLIBC_DYNAMIC_LINKER64 "/lib/ld-kfreebsd-x86-64.so.1"
+#define GLIBC_DYNAMIC_LINKERX32 "/lib/ld-kfreebsd-x32.so.1"
diff --git a/gcc-4.9/gcc/config/i386/knetbsd-gnu.h b/gcc-4.9/gcc/config/i386/knetbsd-gnu.h
new file mode 100644
index 000000000..23bf12922
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/knetbsd-gnu.h
@@ -0,0 +1,21 @@
+/* Definitions for Intel 386 running kNetBSD-based GNU systems with ELF format
+   Copyright (C) 2004-2014 Free Software Foundation, Inc.
+   Contributed by Robert Millan.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define GNU_USER_LINK_EMULATION "elf_i386"
diff --git a/gcc-4.9/gcc/config/i386/kopensolaris-gnu.h b/gcc-4.9/gcc/config/i386/kopensolaris-gnu.h
new file mode 100644
index 000000000..73ca5518a
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/kopensolaris-gnu.h
@@ -0,0 +1,21 @@
+/* Definitions for Intel 386 running kOpenSolaris-based GNU systems with ELF format
+   Copyright (C) 2009-2014 Free Software Foundation, Inc.
+   Contributed by Robert Millan.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define GNU_USER_LINK_EMULATION "elf_i386"
diff --git a/gcc-4.9/gcc/config/i386/linux-common.h b/gcc-4.9/gcc/config/i386/linux-common.h
new file mode 100644
index 000000000..1eaf024a6
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/linux-common.h
@@ -0,0 +1,55 @@
+/* Definitions for Intel 386 running Linux-based GNU systems with ELF format.
+   Copyright (C) 2012-2014 Free Software Foundation, Inc.
+   Contributed by Ilya Enkovich.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#undef TARGET_OS_CPP_BUILTINS
+#define TARGET_OS_CPP_BUILTINS()               \
+  do                                           \
+    {                                          \
+      GNU_USER_TARGET_OS_CPP_BUILTINS();       \
+      ANDROID_TARGET_OS_CPP_BUILTINS();	       \
+    }                                          \
+  while (0)
+
+#undef CC1_SPEC
+#define CC1_SPEC \
+  LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
+		       GNU_USER_TARGET_CC1_SPEC " " ANDROID_CC1_SPEC)
+
+#undef	LINK_SPEC
+#define LINK_SPEC \
+  LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LINK_SPEC, \
+		       GNU_USER_TARGET_LINK_SPEC " " ANDROID_LINK_SPEC)
+
+#undef  LIB_SPEC
+#define LIB_SPEC \
+  LINUX_OR_ANDROID_LD (GNU_USER_TARGET_LIB_SPEC, \
+		    GNU_USER_TARGET_NO_PTHREADS_LIB_SPEC " " ANDROID_LIB_SPEC)
+
+#undef  STARTFILE_SPEC
+#define STARTFILE_SPEC \
+  LINUX_OR_ANDROID_LD (GNU_USER_TARGET_STARTFILE_SPEC, \
+		       ANDROID_STARTFILE_SPEC)
+
+#undef  ENDFILE_SPEC
+#define ENDFILE_SPEC \
+  LINUX_OR_ANDROID_LD (GNU_USER_TARGET_MATHFILE_SPEC " " \
+		       GNU_USER_TARGET_ENDFILE_SPEC,	 \
+		       GNU_USER_TARGET_MATHFILE_SPEC " " \
+		       ANDROID_ENDFILE_SPEC)
diff --git a/gcc-4.9/gcc/config/i386/linux.h b/gcc-4.9/gcc/config/i386/linux.h
new file mode 100644
index 000000000..1fb1e0321
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/linux.h
@@ -0,0 +1,23 @@
+/* Definitions for Intel 386 running Linux-based GNU systems with ELF format.
+   Copyright (C) 1994-2014 Free Software Foundation, Inc.
+   Contributed by Eric Youngdale.
+   Modified for stabs-in-ELF by H.J. Lu.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define GNU_USER_LINK_EMULATION "elf_i386"
+#define GLIBC_DYNAMIC_LINKER "/lib/ld-linux.so.2"
diff --git a/gcc-4.9/gcc/config/i386/linux64.h b/gcc-4.9/gcc/config/i386/linux64.h
new file mode 100644
index 000000000..a90171e8c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/linux64.h
@@ -0,0 +1,32 @@
+/* Definitions for AMD x86-64 running Linux-based GNU systems with ELF format.
+   Copyright (C) 2001-2014 Free Software Foundation, Inc.
+   Contributed by Jan Hubicka <jh@suse.cz>, based on linux.h.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#define GNU_USER_LINK_EMULATION32 "elf_i386"
+#define GNU_USER_LINK_EMULATION64 "elf_x86_64"
+#define GNU_USER_LINK_EMULATIONX32 "elf32_x86_64"
+
+#define GLIBC_DYNAMIC_LINKER32 "/lib/ld-linux.so.2"
+#define GLIBC_DYNAMIC_LINKER64 "/lib64/ld-linux-x86-64.so.2"
+#define GLIBC_DYNAMIC_LINKERX32 "/libx32/ld-linux-x32.so.2"
diff --git a/gcc-4.9/gcc/config/i386/lwpintrin.h b/gcc-4.9/gcc/config/i386/lwpintrin.h
new file mode 100644
index 000000000..1cd046a99
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/lwpintrin.h
@@ -0,0 +1,105 @@
+/* Copyright (C) 2007-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _LWPINTRIN_H_INCLUDED
+#define _LWPINTRIN_H_INCLUDED
+
+#ifndef __LWP__
+#pragma GCC push_options
+#pragma GCC target("lwp")
+#define __DISABLE_LWP__
+#endif /* __LWP__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__llwpcb (void *pcbAddress)
+{
+  __builtin_ia32_llwpcb (pcbAddress);
+}
+
+extern __inline void * __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__slwpcb (void)
+{
+  return __builtin_ia32_slwpcb ();
+}
+
+#ifdef __OPTIMIZE__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpval32 (unsigned int data2, unsigned int data1, unsigned int flags)
+{
+  __builtin_ia32_lwpval32 (data2, data1, flags);
+}
+
+#ifdef __x86_64__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpval64 (unsigned long long data2, unsigned int data1, unsigned int flags)
+{
+  __builtin_ia32_lwpval64 (data2, data1, flags);
+}
+#endif
+#else
+#define __lwpval32(D2, D1, F) \
+  (__builtin_ia32_lwpval32 ((unsigned int) (D2), (unsigned int) (D1), \
+			    (unsigned int) (F)))
+#ifdef __x86_64__
+#define __lwpval64(D2, D1, F) \
+  (__builtin_ia32_lwpval64 ((unsigned long long) (D2), (unsigned int) (D1), \
+			    (unsigned int) (F)))
+#endif
+#endif
+
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpins32 (unsigned int data2, unsigned int data1, unsigned int flags)
+{
+  return __builtin_ia32_lwpins32 (data2, data1, flags);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpins64 (unsigned long long data2, unsigned int data1, unsigned int flags)
+{
+  return __builtin_ia32_lwpins64 (data2, data1, flags);
+}
+#endif
+#else
+#define __lwpins32(D2, D1, F) \
+  (__builtin_ia32_lwpins32 ((unsigned int) (D2), (unsigned int) (D1), \
+			    (unsigned int) (F)))
+#ifdef __x86_64__
+#define __lwpins64(D2, D1, F) \
+  (__builtin_ia32_lwpins64 ((unsigned long long) (D2), (unsigned int) (D1), \
+			    (unsigned int) (F)))
+#endif
+#endif
+
+#ifdef __DISABLE_LWP__
+#undef __DISABLE_LWP__
+#pragma GCC pop_options
+#endif /* __DISABLE_LWP__ */
+
+#endif /* _LWPINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/lynx.h b/gcc-4.9/gcc/config/i386/lynx.h
new file mode 100644
index 000000000..910930e71
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/lynx.h
@@ -0,0 +1,87 @@
+/* Definitions for LynxOS on i386.
+   Copyright (C) 1993-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define TARGET_OS_CPP_BUILTINS()		\
+  do						\
+    {						\
+      builtin_define ("__LITTLE_ENDIAN__");	\
+      builtin_define ("__x86__");		\
+    }						\
+  while (0)
+
+/* The svr4 ABI for the i386 says that records and unions are returned
+   in memory.  */
+
+#define DEFAULT_PCC_STRUCT_RETURN 1
+
+/* BSS_SECTION_ASM_OP gets defined i386/unix.h.  */
+
+#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \
+  asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN)
+
+/* LynxOS's GDB counts the floating point registers from 16.  */
+
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n)						\
+  (TARGET_64BIT ? dbx64_register_map[n]					\
+   : (n) == 0 ? 0							\
+   : (n) == 1 ? 2							\
+   : (n) == 2 ? 1							\
+   : (n) == 3 ? 3							\
+   : (n) == 4 ? 6							\
+   : (n) == 5 ? 7							\
+   : (n) == 6 ? 5							\
+   : (n) == 7 ? 4							\
+   : ((n) >= FIRST_STACK_REG && (n) <= LAST_STACK_REG) ? (int) (n) + 8	\
+   : (-1))
+  
+/* A C statement to output to the stdio stream FILE an assembler
+   command to advance the location counter to a multiple of 1<<LOG
+   bytes if it is within MAX_SKIP bytes.
+
+   This is used to align code labels according to Intel recommendations.  */
+
+#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
+#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP)			\
+  do {									\
+    if ((LOG) != 0) {							\
+      if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG));	\
+      else fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP));	\
+    }									\
+  } while (0)
+#endif
+
+/* Undefine SUBTARGET_EXTRA_SPECS it is empty anyway.  We define it in
+   config/lynx.h.  */
+
+#undef SUBTARGET_EXTRA_SPECS
+
+/* Undefine the definition from att.h to enable our default.  */
+
+#undef ASM_OUTPUT_ALIGN
+
+/* Undefine the definition from elfos.h to enable our default.  */
+
+#undef PREFERRED_DEBUGGING_TYPE
+
+/* The file i386.c defines TARGET_HAVE_TLS unconditionally if
+   HAVE_AS_TLS is defined.  HAVE_AS_TLS is defined as gas support for
+   TLS is detected by configure.  We undefine it here.  */
+
+#undef HAVE_AS_TLS
diff --git a/gcc-4.9/gcc/config/i386/lzcntintrin.h b/gcc-4.9/gcc/config/i386/lzcntintrin.h
new file mode 100644
index 000000000..b680a3539
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/lzcntintrin.h
@@ -0,0 +1,75 @@
+/* Copyright (C) 2009-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+
+#ifndef _LZCNTINTRIN_H_INCLUDED
+#define _LZCNTINTRIN_H_INCLUDED
+
+#ifndef __LZCNT__
+#pragma GCC push_options
+#pragma GCC target("lzcnt")
+#define __DISABLE_LZCNT__
+#endif /* __LZCNT__ */
+
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lzcnt16 (unsigned short __X)
+{
+  return __builtin_clzs (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lzcnt32 (unsigned int __X)
+{
+  return __builtin_clz (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lzcnt_u32 (unsigned int __X)
+{
+  return __builtin_clz (__X);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lzcnt64 (unsigned long long __X)
+{
+  return __builtin_clzll (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lzcnt_u64 (unsigned long long __X)
+{
+  return __builtin_clzll (__X);
+}
+#endif
+
+#ifdef __DISABLE_LZCNT__
+#undef __DISABLE_LZCNT__
+#pragma GCC pop_options
+#endif /* __DISABLE_LZCNT__ */
+
+#endif /* _LZCNTINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/mingw-pthread.h b/gcc-4.9/gcc/config/i386/mingw-pthread.h
new file mode 100644
index 000000000..99753cf03
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/mingw-pthread.h
@@ -0,0 +1,21 @@
+/* Defines that pthread library shall be enabled by default
+   for target.
+   Copyright (C) 2011-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define TARGET_USE_PTHREAD_BY_DEFAULT 1
diff --git a/gcc-4.9/gcc/config/i386/mingw-stdint.h b/gcc-4.9/gcc/config/i386/mingw-stdint.h
new file mode 100644
index 000000000..1589d96bf
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/mingw-stdint.h
@@ -0,0 +1,50 @@
+/* Definitions for <stdint.h> types on systems using mingw.
+   Copyright (C) 2009-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define SIG_ATOMIC_TYPE "int"
+
+#define INT8_TYPE "signed char"
+#define INT16_TYPE "short int"
+#define INT32_TYPE "int"
+#define INT64_TYPE "long long int"
+#define UINT8_TYPE "unsigned char"
+#define UINT16_TYPE "short unsigned int"
+#define UINT32_TYPE "unsigned int"
+#define UINT64_TYPE "long long unsigned int"
+
+#define INT_LEAST8_TYPE "signed char"
+#define INT_LEAST16_TYPE "short int"
+#define INT_LEAST32_TYPE "int"
+#define INT_LEAST64_TYPE "long long int"
+#define UINT_LEAST8_TYPE "unsigned char"
+#define UINT_LEAST16_TYPE "short unsigned int"
+#define UINT_LEAST32_TYPE "unsigned int"
+#define UINT_LEAST64_TYPE "long long unsigned int"
+
+#define INT_FAST8_TYPE "signed char"
+#define INT_FAST16_TYPE "short int"
+#define INT_FAST32_TYPE "int"
+#define INT_FAST64_TYPE "long long int"
+#define UINT_FAST8_TYPE "unsigned char"
+#define UINT_FAST16_TYPE "short unsigned int"
+#define UINT_FAST32_TYPE "unsigned int"
+#define UINT_FAST64_TYPE "long long unsigned int"
+
+#define INTPTR_TYPE (TARGET_64BIT ? "long long int" : "int")
+#define UINTPTR_TYPE (TARGET_64BIT ? "long long unsigned int" : "unsigned int")
diff --git a/gcc-4.9/gcc/config/i386/mingw-w64.h b/gcc-4.9/gcc/config/i386/mingw-w64.h
new file mode 100644
index 000000000..b7436be04
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/mingw-w64.h
@@ -0,0 +1,86 @@
+/* Operating system specific defines to be used when targeting GCC for
+   hosting on Windows 32/64 via mingw-w64 runtime, using GNU tools and
+   the Windows API Library.
+   Copyright (C) 2009-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Enable -municode feature and support optional pthread support.  */
+
+#undef CPP_SPEC
+#define CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{mthreads:-D_MT} " \
+		 "%{municode:-DUNICODE} " \
+		 "%{" SPEC_PTHREAD1 ":-D_REENTRANT} " \
+		 "%{" SPEC_PTHREAD2 ":-U_REENTRANT} "
+
+#undef STARTFILE_SPEC
+#define STARTFILE_SPEC "%{shared|mdll:dllcrt2%O%s} \
+  %{!shared:%{!mdll:%{!municode:crt2%O%s}}} \
+  %{!shared:%{!mdll:%{municode:crt2u%O%s}}} \
+  %{pg:gcrt2%O%s} \
+  crtbegin.o%s"
+
+/* Enable multilib.  */
+
+#undef ASM_SPEC
+#define ASM_SPEC "%{m32:--32} %{m64:--64}"
+
+#undef LIB_SPEC
+#define LIB_SPEC "%{pg:-lgmon} %{" SPEC_PTHREAD1 ":-lpthread} " \
+		 "%{" SPEC_PTHREAD2 ": } " \
+		 "%{mwindows:-lgdi32 -lcomdlg32} " \
+		 "-ladvapi32 -lshell32 -luser32 -lkernel32"
+
+#undef SPEC_32
+#undef SPEC_64
+#if TARGET_64BIT_DEFAULT
+#define SPEC_32 "m32"
+#define SPEC_64 "!m32"
+#else
+#define SPEC_32 "!m64"
+#define SPEC_64 "m64"
+#endif
+
+#undef SUB_LINK_ENTRY32
+#undef SUB_LINK_ENTRY64
+#define SUB_LINK_ENTRY32 "-e _DllMainCRTStartup@12"
+#if defined(USE_MINGW64_LEADING_UNDERSCORES)
+#define SUB_LINK_ENTRY64 "-e _DllMainCRTStartup"
+#else
+#define SUB_LINK_ENTRY64 "-e DllMainCRTStartup"
+#endif
+
+#undef SUB_LINK_SPEC
+#undef SUB_LINK_ENTRY
+#define SUB_LINK_SPEC "%{" SPEC_64 ":-m i386pep} %{" SPEC_32 ":-m i386pe}"
+#define SUB_LINK_ENTRY "%{" SPEC_64 ":" SUB_LINK_ENTRY64 "} %{" SPEC_32 ":" SUB_LINK_ENTRY32 "}"
+
+#undef MULTILIB_DEFAULTS
+#if TARGET_64BIT_DEFAULT
+#define MULTILIB_DEFAULTS { "m64" }
+#else
+#define MULTILIB_DEFAULTS { "m32" }
+#endif
+
+#undef LINK_SPEC
+#define LINK_SPEC SUB_LINK_SPEC " %{mwindows:--subsystem windows} \
+  %{mconsole:--subsystem console} \
+  %{shared: %{mdll: %eshared and mdll are not compatible}} \
+  %{shared: --shared} %{mdll:--dll} \
+  %{static:-Bstatic} %{!static:-Bdynamic} \
+  %{shared|mdll: " SUB_LINK_ENTRY " --enable-auto-image-base} \
+  %(shared_libgcc_undefs)"
diff --git a/gcc-4.9/gcc/config/i386/mingw-w64.opt b/gcc-4.9/gcc/config/i386/mingw-w64.opt
new file mode 100644
index 000000000..90e01f362
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/mingw-w64.opt
@@ -0,0 +1,25 @@
+; MinGW-w64-specific options.
+
+; Copyright (C) 2009-2014 Free Software Foundation, Inc.
+;
+; This file is part of GCC.
+;
+; GCC is free software; you can redistribute it and/or modify it under
+; the terms of the GNU General Public License as published by the Free
+; Software Foundation; either version 3, or (at your option) any later
+; version.
+;
+; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+; for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with GCC; see the file COPYING3.  If not see
+; <http://www.gnu.org/licenses/>.
+
+municode
+Target
+Use unicode startup and define UNICODE macro
+
+; Retain blank line above.
diff --git a/gcc-4.9/gcc/config/i386/mingw.opt b/gcc-4.9/gcc/config/i386/mingw.opt
new file mode 100644
index 000000000..44fecb0cc
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/mingw.opt
@@ -0,0 +1,35 @@
+; MinGW-specific options.
+
+; Copyright (C) 2008-2014 Free Software Foundation, Inc.
+;
+; This file is part of GCC.
+;
+; GCC is free software; you can redistribute it and/or modify it under
+; the terms of the GNU General Public License as published by the Free
+; Software Foundation; either version 3, or (at your option) any later
+; version.
+;
+; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+; for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with GCC; see the file COPYING3.  If not see
+; <http://www.gnu.org/licenses/>.
+
+pthread
+Driver
+
+no-pthread
+Driver
+
+Wpedantic-ms-format
+C ObjC C++ ObjC++ Var(warn_pedantic_ms_format) Init(1) Warning
+Warn about none ISO msvcrt scanf/printf width extensions
+
+fset-stack-executable
+Common Report Var(flag_setstackexecutable) Init(1) Optimization
+For nested functions on stack executable permission is set.
+
+; Need to retain blank line above.
diff --git a/gcc-4.9/gcc/config/i386/mingw32.h b/gcc-4.9/gcc/config/i386/mingw32.h
new file mode 100644
index 000000000..f56382095
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/mingw32.h
@@ -0,0 +1,248 @@
+/* Operating system specific defines to be used when targeting GCC for
+   hosting on Windows32, using GNU tools and the Windows32 API Library.
+   Copyright (C) 1997-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#undef DEFAULT_ABI
+#define DEFAULT_ABI MS_ABI
+
+/* By default, target has a 80387, uses IEEE compatible arithmetic,
+   returns float values in the 387 and needs stack probes.
+   We also align doubles to 64-bits for MSVC default compatibility.
+   Additionally we enable MS_BITFIELD_LAYOUT by default.  */
+
+#undef TARGET_SUBTARGET_DEFAULT
+#define TARGET_SUBTARGET_DEFAULT \
+	(MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS \
+	 | MASK_STACK_PROBE | MASK_ALIGN_DOUBLE \
+	 | MASK_MS_BITFIELD_LAYOUT)
+
+/* See i386/crtdll.h for an alternative definition. _INTEGRAL_MAX_BITS
+   is for compatibility with native compiler.  */
+#define EXTRA_OS_CPP_BUILTINS()					\
+  do								\
+    {								\
+      builtin_define ("__MSVCRT__");				\
+      builtin_define ("__MINGW32__");			   	\
+      builtin_define ("_WIN32");				\
+      builtin_define_std ("WIN32");				\
+      builtin_define_std ("WINNT");				\
+      builtin_define_with_int_value ("_INTEGRAL_MAX_BITS",	\
+				     TYPE_PRECISION (intmax_type_node));\
+      if (TARGET_64BIT && ix86_abi == MS_ABI)			\
+	{							\
+	  builtin_define ("__MINGW64__");			\
+	  builtin_define_std ("WIN64");				\
+	  builtin_define ("_WIN64");				\
+	}							\
+    }								\
+  while (0)
+
+#ifndef TARGET_USE_PTHREAD_BY_DEFAULT
+#define SPEC_PTHREAD1 "pthread"
+#define SPEC_PTHREAD2 "!no-pthread"
+#else
+#define SPEC_PTHREAD1 "!no-pthread"
+#define SPEC_PTHREAD2 "pthread"
+#endif
+
+#undef SUB_LINK_ENTRY32
+#undef SUB_LINK_ENTRY64
+#define SUB_LINK_ENTRY32 "-e _DllMainCRTStartup@12"
+#if defined(USE_MINGW64_LEADING_UNDERSCORES)
+#define SUB_LINK_ENTRY64 "-e _DllMainCRTStartup"
+#else
+#define SUB_LINK_ENTRY64 "-e DllMainCRTStartup"
+#endif
+
+#undef SUB_LINK_ENTRY
+#if TARGET_64BIT_DEFAULT
+#define SUB_LINK_ENTRY SUB_LINK_ENTRY64
+#else
+#define SUB_LINK_ENTRY SUB_LINK_ENTRY32
+#endif
+
+#undef NATIVE_SYSTEM_HEADER_COMPONENT
+#define NATIVE_SYSTEM_HEADER_COMPONENT "MINGW"
+
+#undef CPP_SPEC
+#define CPP_SPEC "%{posix:-D_POSIX_SOURCE} %{mthreads:-D_MT} " \
+		 "%{" SPEC_PTHREAD1 ":-D_REENTRANT} " \
+		 "%{" SPEC_PTHREAD2 ": } "
+
+/* For Windows applications, include more libraries, but always include
+   kernel32.  */
+#undef LIB_SPEC
+#define LIB_SPEC "%{pg:-lgmon} %{" SPEC_PTHREAD1 ":-lpthread} " \
+		 "%{" SPEC_PTHREAD2 ": } " \
+		 "%{mwindows:-lgdi32 -lcomdlg32} " \
+                 "-ladvapi32 -lshell32 -luser32 -lkernel32"
+
+/* Weak symbols do not get resolved if using a Windows dll import lib.
+   Make the unwind registration references strong undefs.  */
+#if DWARF2_UNWIND_INFO
+/* DW2-unwind is just available for 32-bit mode.  */
+#if TARGET_64BIT_DEFAULT
+#error DW2 unwind is not available for 64-bit.
+#endif
+#define SHARED_LIBGCC_UNDEFS_SPEC \
+ "%{shared-libgcc: -u ___register_frame_info -u ___deregister_frame_info}"
+#else
+#define SHARED_LIBGCC_UNDEFS_SPEC ""
+#endif
+
+#undef  SUBTARGET_EXTRA_SPECS
+#define SUBTARGET_EXTRA_SPECS						\
+  { "shared_libgcc_undefs", SHARED_LIBGCC_UNDEFS_SPEC }
+
+#define LINK_SPEC "%{mwindows:--subsystem windows} \
+  %{mconsole:--subsystem console} \
+  %{shared: %{mdll: %eshared and mdll are not compatible}} \
+  %{shared: --shared} %{mdll:--dll} \
+  %{static:-Bstatic} %{!static:-Bdynamic} \
+  %{shared|mdll: " SUB_LINK_ENTRY " --enable-auto-image-base} \
+  %(shared_libgcc_undefs)"
+
+/* Include in the mingw32 libraries with libgcc */
+#ifdef ENABLE_SHARED_LIBGCC
+#define SHARED_LIBGCC_SPEC " \
+ %{static|static-libgcc:-lgcc -lgcc_eh} \
+ %{!static: \
+   %{!static-libgcc: \
+     %{!shared: \
+       %{!shared-libgcc:-lgcc -lgcc_eh} \
+       %{shared-libgcc:-lgcc_s -lgcc} \
+      } \
+     %{shared:-lgcc_s -lgcc} \
+    } \
+  } "
+#else
+#define SHARED_LIBGCC_SPEC " -lgcc "
+#endif
+#undef REAL_LIBGCC_SPEC
+#define REAL_LIBGCC_SPEC \
+  "%{mthreads:-lmingwthrd} -lmingw32 \
+   "SHARED_LIBGCC_SPEC" \
+   -lmoldname -lmingwex -lmsvcrt"
+
+#undef STARTFILE_SPEC
+#define STARTFILE_SPEC "%{shared|mdll:dllcrt2%O%s} \
+  %{!shared:%{!mdll:crt2%O%s}} %{pg:gcrt2%O%s} \
+  crtbegin.o%s"
+
+#undef ENDFILE_SPEC
+#define ENDFILE_SPEC \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  crtend.o%s"
+
+/* Override startfile prefix defaults.  */
+#ifndef STANDARD_STARTFILE_PREFIX_1
+#define STANDARD_STARTFILE_PREFIX_1 "/mingw/lib/"
+#endif
+#ifndef STANDARD_STARTFILE_PREFIX_2
+#define STANDARD_STARTFILE_PREFIX_2 ""
+#endif
+
+/* For native mingw-version we need to take care that NATIVE_SYSTEM_HEADER_DIR
+   macro contains POSIX-style path.  See bug 52947.  */
+#undef NATIVE_SYSTEM_HEADER_DIR
+#define NATIVE_SYSTEM_HEADER_DIR "/mingw/include"
+
+/* Output STRING, a string representing a filename, to FILE.
+   We canonicalize it to be in Unix format (backslashes are replaced
+   forward slashes.  */
+#undef OUTPUT_QUOTED_STRING
+#define OUTPUT_QUOTED_STRING(FILE, STRING)               \
+do {						         \
+  const char *_string = (const char *) (STRING);	 \
+  char c;					         \
+						         \
+  putc ('\"', (FILE));				         \
+						         \
+  while ((c = *_string++) != 0)			         \
+    {						         \
+      if (c == '\\')				         \
+	c = '/';				         \
+						         \
+      if (ISPRINT (c))                                   \
+        {                                                \
+          if (c == '\"')			         \
+	    putc ('\\', (FILE));		         \
+          putc (c, (FILE));			         \
+        }                                                \
+      else                                               \
+        fprintf ((FILE), "\\%03o", (unsigned char) c);	 \
+    }						         \
+						         \
+  putc ('\"', (FILE));					 \
+} while (0)
+
+/* Define as short unsigned for compatibility with MS runtime.  */
+#undef WINT_TYPE
+#define WINT_TYPE "short unsigned int"
+
+/* mingw32 uses the  -mthreads option to enable thread support.  */
+#undef GOMP_SELF_SPECS
+#define GOMP_SELF_SPECS "%{fopenmp|ftree-parallelize-loops=*: " \
+			"-mthreads -pthread}"
+#undef GTM_SELF_SPECS
+#define GTM_SELF_SPECS "%{fgnu-tm:-mthreads -pthread}"
+
+/* mingw32 atexit function is safe to use in shared libraries.  Use it
+   to register C++ static destructors.  */
+#define TARGET_CXX_USE_ATEXIT_FOR_CXA_ATEXIT hook_bool_void_true
+
+/* Contains a pointer to type target_ovr_attr defining the target specific
+   overrides of format attributes.  See c-format.h for structure
+   definition.  */
+#undef TARGET_OVERRIDES_FORMAT_ATTRIBUTES
+#define TARGET_OVERRIDES_FORMAT_ATTRIBUTES mingw_format_attribute_overrides
+
+/* Specify the count of elements in TARGET_OVERRIDES_ATTRIBUTE.  */
+#undef TARGET_OVERRIDES_FORMAT_ATTRIBUTES_COUNT
+#define TARGET_OVERRIDES_FORMAT_ATTRIBUTES_COUNT 3
+
+/* Custom initialization for warning -Wpedantic-ms-format for c-format.  */
+#undef TARGET_OVERRIDES_FORMAT_INIT
+#define TARGET_OVERRIDES_FORMAT_INIT msformat_init
+
+/* MS specific format attributes for ms_printf, ms_scanf, ms_strftime.  */
+#undef TARGET_FORMAT_TYPES
+#define TARGET_FORMAT_TYPES mingw_format_attributes
+
+#undef TARGET_N_FORMAT_TYPES
+#define TARGET_N_FORMAT_TYPES 3
+
+/* Let defaults.h definition of TARGET_USE_JCR_SECTION apply. */
+#undef TARGET_USE_JCR_SECTION
+
+#define HAVE_ENABLE_EXECUTE_STACK
+#undef  CHECK_EXECUTE_STACK_ENABLED
+#define CHECK_EXECUTE_STACK_ENABLED flag_setstackexecutable
+
+/* This matches SHLIB_SONAME and SHLIB_SOVERSION in t-cygming. */
+/* This matches SHLIB_SONAME and SHLIB_SOVERSION in t-cygwin. */
+#if DWARF2_UNWIND_INFO
+#define LIBGCC_EH_EXTN "_dw2"
+#else
+#define LIBGCC_EH_EXTN "_sjlj"
+#endif
+#define LIBGCC_SONAME "libgcc_s" LIBGCC_EH_EXTN "-1.dll"
+
+/* We should find a way to not have to update this manually.  */
+#define LIBGCJ_SONAME "libgcj" /*LIBGCC_EH_EXTN*/ "-13.dll"
diff --git a/gcc-4.9/gcc/config/i386/mm3dnow.h b/gcc-4.9/gcc/config/i386/mm3dnow.h
new file mode 100644
index 000000000..bf847f939
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/mm3dnow.h
@@ -0,0 +1,218 @@
+/* Copyright (C) 2004-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the mm3dnow.h (of supposedly AMD origin) included with
+   MSVC 7.1.  */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+#ifndef __3dNOW__
+#pragma GCC push_options
+#pragma GCC target("3dnow")
+#define __DISABLE_3dNOW__
+#endif /* __3dNOW__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_femms (void)
+{
+  __builtin_ia32_femms();
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgusb (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pavgusb ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pf2id (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pf2id ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfacc (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfacc ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfadd (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfadd ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfcmpeq (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfcmpeq ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfcmpge (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfcmpge ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfcmpgt (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfcmpgt ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfmax (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfmax ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfmin (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfmin ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfmul (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfmul ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrcp (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pfrcp ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrcpit1 (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfrcpit1 ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrcpit2 (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfrcpit2 ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrsqrt (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pfrsqrt ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrsqit1 (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfrsqit1 ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfsub (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfsub ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfsubr (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfsubr ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pi2fd (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pi2fd ((__v2si)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhrw (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pmulhrw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetch (void *__P)
+{
+  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_from_float (float __A)
+{
+  return __extension__ (__m64)(__v2sf){ __A, 0.0f };
+}
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_to_float (__m64 __A)
+{
+  union { __v2sf v; float a[2]; } __tmp;
+  __tmp.v = (__v2sf)__A;
+  return __tmp.a[0];
+}
+
+#ifdef __3dNOW_A__
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pf2iw (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pf2iw ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfnacc (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfnacc ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfpnacc (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfpnacc ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pi2fw (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pi2fw ((__v2si)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pswapd (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pswapdsf ((__v2sf)__A);
+}
+
+#endif /* __3dNOW_A__ */
+
+#ifdef __DISABLE_3dNOW__
+#undef __DISABLE_3dNOW__
+#pragma GCC pop_options
+#endif /* __DISABLE_3dNOW__ */
+
+#endif /* _MM3DNOW_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/mmintrin.h b/gcc-4.9/gcc/config/i386/mmintrin.h
new file mode 100644
index 000000000..b351200e5
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/mmintrin.h
@@ -0,0 +1,942 @@
+/* Copyright (C) 2002-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef _MMINTRIN_H_INCLUDED
+#define _MMINTRIN_H_INCLUDED
+
+#ifndef __MMX__
+#pragma GCC push_options
+#pragma GCC target("mmx")
+#define __DISABLE_MMX__
+#endif /* __MMX__ */
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
+
+/* Internal data types for implementing the intrinsics.  */
+typedef int __v2si __attribute__ ((__vector_size__ (8)));
+typedef short __v4hi __attribute__ ((__vector_size__ (8)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+typedef long long __v1di __attribute__ ((__vector_size__ (8)));
+typedef float __v2sf __attribute__ ((__vector_size__ (8)));
+
+/* Empty the multimedia state.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+  __builtin_ia32_emms ();
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_empty (void)
+{
+  _mm_empty ();
+}
+
+/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si64 (int __i)
+{
+  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
+}
+
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_from_int (int __i)
+{
+  return _mm_cvtsi32_si64 (__i);
+}
+
+#ifdef __x86_64__
+/* Convert I to a __m64 object.  */
+
+/* Intel intrinsic.  */
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_from_int64 (long long __i)
+{
+  return (__m64) __i;
+}
+
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_m64 (long long __i)
+{
+  return (__m64) __i;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_si64 (long long __i)
+{
+  return (__m64) __i;
+}
+
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi64x (long long __i)
+{
+  return (__m64) __i;
+}
+#endif
+
+/* Convert the lower 32 bits of the __m64 object into an integer.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si32 (__m64 __i)
+{
+  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_to_int (__m64 __i)
+{
+  return _mm_cvtsi64_si32 (__i);
+}
+
+#ifdef __x86_64__
+/* Convert the __m64 object to a 64bit integer.  */
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_to_int64 (__m64 __i)
+{
+  return (long long)__i;
+}
+
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtm64_si64 (__m64 __i)
+{
+  return (long long)__i;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si64x (__m64 __i)
+{
+  return (long long)__i;
+}
+#endif
+
+/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
+   the result, and the four 16-bit values from M2 into the upper four 8-bit
+   values of the result, all with signed saturation.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_packsswb (__m64 __m1, __m64 __m2)
+{
+  return _mm_packs_pi16 (__m1, __m2);
+}
+
+/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
+   the result, and the two 32-bit values from M2 into the upper two 16-bit
+   values of the result, all with signed saturation.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_packssdw (__m64 __m1, __m64 __m2)
+{
+  return _mm_packs_pi32 (__m1, __m2);
+}
+
+/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
+   the result, and the four 16-bit values from M2 into the upper four 8-bit
+   values of the result, all with unsigned saturation.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pu16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_packuswb (__m64 __m1, __m64 __m2)
+{
+  return _mm_packs_pu16 (__m1, __m2);
+}
+
+/* Interleave the four 8-bit values from the high half of M1 with the four
+   8-bit values from the high half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckhbw (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpackhi_pi8 (__m1, __m2);
+}
+
+/* Interleave the two 16-bit values from the high half of M1 with the two
+   16-bit values from the high half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckhwd (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpackhi_pi16 (__m1, __m2);
+}
+
+/* Interleave the 32-bit value from the high half of M1 with the 32-bit
+   value from the high half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckhdq (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpackhi_pi32 (__m1, __m2);
+}
+
+/* Interleave the four 8-bit values from the low half of M1 with the four
+   8-bit values from the low half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpcklbw (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpacklo_pi8 (__m1, __m2);
+}
+
+/* Interleave the two 16-bit values from the low half of M1 with the two
+   16-bit values from the low half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpcklwd (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpacklo_pi16 (__m1, __m2);
+}
+
+/* Interleave the 32-bit value from the low half of M1 with the 32-bit
+   value from the low half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckldq (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpacklo_pi32 (__m1, __m2);
+}
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddb (__m64 __m1, __m64 __m2)
+{
+  return _mm_add_pi8 (__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddw (__m64 __m1, __m64 __m2)
+{
+  return _mm_add_pi16 (__m1, __m2);
+}
+
+/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddd (__m64 __m1, __m64 __m2)
+{
+  return _mm_add_pi32 (__m1, __m2);
+}
+
+/* Add the 64-bit values in M1 to the 64-bit values in M2.  */
+#ifndef __SSE2__
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#define __DISABLE_SSE2__
+#endif /* __SSE2__ */
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_si64 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
+}
+#ifdef __DISABLE_SSE2__
+#undef __DISABLE_SSE2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE2__ */
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
+   saturated arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddsb (__m64 __m1, __m64 __m2)
+{
+  return _mm_adds_pi8 (__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
+   saturated arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddsw (__m64 __m1, __m64 __m2)
+{
+  return _mm_adds_pi16 (__m1, __m2);
+}
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
+   saturated arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddusb (__m64 __m1, __m64 __m2)
+{
+  return _mm_adds_pu8 (__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
+   saturated arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddusw (__m64 __m1, __m64 __m2)
+{
+  return _mm_adds_pu16 (__m1, __m2);
+}
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubb (__m64 __m1, __m64 __m2)
+{
+  return _mm_sub_pi8 (__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubw (__m64 __m1, __m64 __m2)
+{
+  return _mm_sub_pi16 (__m1, __m2);
+}
+
+/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubd (__m64 __m1, __m64 __m2)
+{
+  return _mm_sub_pi32 (__m1, __m2);
+}
+
+/* Add the 64-bit values in M1 to the 64-bit values in M2.  */
+#ifndef __SSE2__
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#define __DISABLE_SSE2__
+#endif /* __SSE2__ */
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_si64 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
+}
+#ifdef __DISABLE_SSE2__
+#undef __DISABLE_SSE2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE2__ */
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
+   saturating arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubsb (__m64 __m1, __m64 __m2)
+{
+  return _mm_subs_pi8 (__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
+   signed saturating arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubsw (__m64 __m1, __m64 __m2)
+{
+  return _mm_subs_pi16 (__m1, __m2);
+}
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
+   unsigned saturating arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pu8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubusb (__m64 __m1, __m64 __m2)
+{
+  return _mm_subs_pu8 (__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
+   unsigned saturating arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pu16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubusw (__m64 __m1, __m64 __m2)
+{
+  return _mm_subs_pu16 (__m1, __m2);
+}
+
+/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
+   four 32-bit intermediate results, which are then summed by pairs to
+   produce two 32-bit results.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaddwd (__m64 __m1, __m64 __m2)
+{
+  return _mm_madd_pi16 (__m1, __m2);
+}
+
+/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
+   M2 and produce the high 16 bits of the 32-bit results.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhw (__m64 __m1, __m64 __m2)
+{
+  return _mm_mulhi_pi16 (__m1, __m2);
+}
+
+/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
+   the low 16 bits of the results.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmullw (__m64 __m1, __m64 __m2)
+{
+  return _mm_mullo_pi16 (__m1, __m2);
+}
+
+/* Shift four 16-bit values in M left by COUNT.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_pi16 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllw (__m64 __m, __m64 __count)
+{
+  return _mm_sll_pi16 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi16 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllwi (__m64 __m, int __count)
+{
+  return _mm_slli_pi16 (__m, __count);
+}
+
+/* Shift two 32-bit values in M left by COUNT.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_pi32 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pslld (__m64 __m, __m64 __count)
+{
+  return _mm_sll_pi32 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi32 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pslldi (__m64 __m, int __count)
+{
+  return _mm_slli_pi32 (__m, __count);
+}
+
+/* Shift the 64-bit value in M left by COUNT.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_si64 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllq (__m64 __m, __m64 __count)
+{
+  return _mm_sll_si64 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si64 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllqi (__m64 __m, int __count)
+{
+  return _mm_slli_si64 (__m, __count);
+}
+
+/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_pi16 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psraw (__m64 __m, __m64 __count)
+{
+  return _mm_sra_pi16 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_pi16 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrawi (__m64 __m, int __count)
+{
+  return _mm_srai_pi16 (__m, __count);
+}
+
+/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_pi32 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrad (__m64 __m, __m64 __count)
+{
+  return _mm_sra_pi32 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_pi32 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psradi (__m64 __m, int __count)
+{
+  return _mm_srai_pi32 (__m, __count);
+}
+
+/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_pi16 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlw (__m64 __m, __m64 __count)
+{
+  return _mm_srl_pi16 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi16 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlwi (__m64 __m, int __count)
+{
+  return _mm_srli_pi16 (__m, __count);
+}
+
+/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_pi32 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrld (__m64 __m, __m64 __count)
+{
+  return _mm_srl_pi32 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi32 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrldi (__m64 __m, int __count)
+{
+  return _mm_srli_pi32 (__m, __count);
+}
+
+/* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_si64 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlq (__m64 __m, __m64 __count)
+{
+  return _mm_srl_si64 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si64 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlqi (__m64 __m, int __count)
+{
+  return _mm_srli_si64 (__m, __count);
+}
+
+/* Bit-wise AND the 64-bit values in M1 and M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si64 (__m64 __m1, __m64 __m2)
+{
+  return __builtin_ia32_pand (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pand (__m64 __m1, __m64 __m2)
+{
+  return _mm_and_si64 (__m1, __m2);
+}
+
+/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
+   64-bit value in M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_si64 (__m64 __m1, __m64 __m2)
+{
+  return __builtin_ia32_pandn (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pandn (__m64 __m1, __m64 __m2)
+{
+  return _mm_andnot_si64 (__m1, __m2);
+}
+
+/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si64 (__m64 __m1, __m64 __m2)
+{
+  return __builtin_ia32_por (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_por (__m64 __m1, __m64 __m2)
+{
+  return _mm_or_si64 (__m1, __m2);
+}
+
+/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si64 (__m64 __m1, __m64 __m2)
+{
+  return __builtin_ia32_pxor (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pxor (__m64 __m1, __m64 __m2)
+{
+  return _mm_xor_si64 (__m1, __m2);
+}
+
+/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
+   test is true and zero if false.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpeqb (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpeq_pi8 (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpgtb (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpgt_pi8 (__m1, __m2);
+}
+
+/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
+   the test is true and zero if false.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpeqw (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpeq_pi16 (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpgtw (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpgt_pi16 (__m1, __m2);
+}
+
+/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
+   the test is true and zero if false.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpeqd (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpeq_pi32 (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpgtd (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpgt_pi32 (__m1, __m2);
+}
+
+/* Creates a 64-bit zero.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si64 (void)
+{
+  return (__m64)0LL;
+}
+
+/* Creates a vector of two 32-bit values; I0 is least significant.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi32 (int __i1, int __i0)
+{
+  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
+}
+
+/* Creates a vector of four 16-bit values; W0 is least significant.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
+{
+  return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
+}
+
+/* Creates a vector of eight 8-bit values; B0 is least significant.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
+	     char __b3, char __b2, char __b1, char __b0)
+{
+  return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
+					       __b4, __b5, __b6, __b7);
+}
+
+/* Similar, but with the arguments in reverse order.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pi32 (int __i0, int __i1)
+{
+  return _mm_set_pi32 (__i1, __i0);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
+{
+  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
+	      char __b4, char __b5, char __b6, char __b7)
+{
+  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+/* Creates a vector of two 32-bit values, both elements containing I.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pi32 (int __i)
+{
+  return _mm_set_pi32 (__i, __i);
+}
+
+/* Creates a vector of four 16-bit values, all elements containing W.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pi16 (short __w)
+{
+  return _mm_set_pi16 (__w, __w, __w, __w);
+}
+
+/* Creates a vector of eight 8-bit values, all elements containing B.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pi8 (char __b)
+{
+  return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
+}
+#ifdef __DISABLE_MMX__
+#undef __DISABLE_MMX__
+#pragma GCC pop_options
+#endif /* __DISABLE_MMX__ */
+
+#endif /* _MMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/mmx.md b/gcc-4.9/gcc/config/i386/mmx.md
new file mode 100644
index 000000000..214acde23
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/mmx.md
@@ -0,0 +1,1613 @@
+;; GCC machine description for MMX and 3dNOW! instructions
+;; Copyright (C) 2005-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; The MMX and 3dNOW! patterns are in the same file because they use
+;; the same register file, and 3dNOW! adds a number of extensions to
+;; the base integer MMX isa.
+
+;; Note!  Except for the basic move instructions, *all* of these
+;; patterns are outside the normal optabs namespace.  This is because
+;; use of these registers requires the insertion of emms or femms
+;; instructions to return to normal fpu mode.  The compiler doesn't
+;; know how to do that itself, which means it's up to the user.  Which
+;; means that we should never use any of these patterns except at the
+;; direction of the user via a builtin.
+
+(define_c_enum "unspec" [
+  UNSPEC_MOVNTQ
+  UNSPEC_PFRCP
+  UNSPEC_PFRCPIT1
+  UNSPEC_PFRCPIT2
+  UNSPEC_PFRSQRT
+  UNSPEC_PFRSQIT1
+])
+
+(define_c_enum "unspecv" [
+  UNSPECV_EMMS
+  UNSPECV_FEMMS
+])
+
+;; 8 byte integral modes handled by MMX (and by extension, SSE)
+(define_mode_iterator MMXMODEI [V8QI V4HI V2SI])
+(define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI V1DI])
+
+;; All 8-byte vector modes handled by MMX
+(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF])
+
+;; Mix-n-match
+(define_mode_iterator MMXMODE12 [V8QI V4HI])
+(define_mode_iterator MMXMODE24 [V4HI V2SI])
+(define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
+
+;; Mapping from integer vector mode to mnemonic suffix
+(define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI "q")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Move patterns
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; All of these patterns are enabled for MMX as well as 3dNOW.
+;; This is essential for maintaining stable calling conventions.
+
+(define_expand "mov<mode>"
+  [(set (match_operand:MMXMODE 0 "nonimmediate_operand")
+	(match_operand:MMXMODE 1 "nonimmediate_operand"))]
+  "TARGET_MMX"
+{
+  ix86_expand_vector_move (<MODE>mode, operands);
+  DONE;
+})
+
+(define_insn "*mov<mode>_internal"
+  [(set (match_operand:MMXMODE 0 "nonimmediate_operand"
+    "=r ,o ,r,r ,m ,?!y,!y,?!y,m  ,r   ,?!Ym,v,v,v,m,*x,*x,*x,m ,r ,Yi,!Ym,*Yi")
+	(match_operand:MMXMODE 1 "vector_move_operand"
+    "rCo,rC,C,rm,rC,C  ,!y,m  ,?!y,?!Yn,r   ,C,v,m,v,C ,*x,m ,*x,Yj,r ,*Yj,!Yn"))]
+  "TARGET_MMX
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  switch (get_attr_type (insn))
+    {
+    case TYPE_MULTI:
+      return "#";
+
+    case TYPE_IMOV:
+      if (get_attr_mode (insn) == MODE_SI)
+	return "mov{l}\t{%1, %k0|%k0, %1}";
+      else
+	return "mov{q}\t{%1, %0|%0, %1}";
+
+    case TYPE_MMX:
+      return "pxor\t%0, %0";
+
+    case TYPE_MMXMOV:
+      /* Handle broken assemblers that require movd instead of movq.  */
+      if (!HAVE_AS_IX86_INTERUNIT_MOVQ
+	  && (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1])))
+	return "movd\t{%1, %0|%0, %1}";
+      return "movq\t{%1, %0|%0, %1}";
+
+    case TYPE_SSECVT:
+      if (SSE_REG_P (operands[0]))
+	return "movq2dq\t{%1, %0|%0, %1}";
+      else
+	return "movdq2q\t{%1, %0|%0, %1}";
+
+    case TYPE_SSELOG1:
+      return standard_sse_constant_opcode (insn, operands[1]);
+
+    case TYPE_SSEMOV:
+      switch (get_attr_mode (insn))
+	{
+	case MODE_DI:
+	  /* Handle broken assemblers that require movd instead of movq.  */
+	  if (!HAVE_AS_IX86_INTERUNIT_MOVQ
+	      && (GENERAL_REG_P (operands[0]) || GENERAL_REG_P (operands[1])))
+	    return "%vmovd\t{%1, %0|%0, %1}";
+	  return "%vmovq\t{%1, %0|%0, %1}";
+	case MODE_TI:
+	  return "%vmovdqa\t{%1, %0|%0, %1}";
+	case MODE_XI:
+	  return "vmovdqa64\t{%g1, %g0|%g0, %g1}";
+
+	case MODE_V2SF:
+	  if (TARGET_AVX && REG_P (operands[0]))
+	    return "vmovlps\t{%1, %0, %0|%0, %0, %1}";
+	  return "%vmovlps\t{%1, %0|%0, %1}";
+	case MODE_V4SF:
+	  return "%vmovaps\t{%1, %0|%0, %1}";
+
+	default:
+	  gcc_unreachable ();
+	}
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set (attr "isa")
+     (cond [(eq_attr "alternative" "0,1")
+	      (const_string "nox64")
+	    (eq_attr "alternative" "2,3,4,9,10,11,12,13,14,19,20")
+	      (const_string "x64")
+	   ]
+	   (const_string "*")))
+   (set (attr "type")
+     (cond [(eq_attr "alternative" "0,1")
+	      (const_string "multi")
+	    (eq_attr "alternative" "2,3,4")
+	      (const_string "imov")
+	    (eq_attr "alternative" "5")
+	      (const_string "mmx")
+	    (eq_attr "alternative" "6,7,8,9,10")
+	      (const_string "mmxmov")
+	    (eq_attr "alternative" "11,15")
+	      (const_string "sselog1")
+	    (eq_attr "alternative" "21,22")
+	      (const_string "ssecvt")
+	   ]
+	   (const_string "ssemov")))
+   (set (attr "prefix_rex")
+     (if_then_else (eq_attr "alternative" "9,10,19,20")
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "prefix")
+     (if_then_else (eq_attr "type" "sselog1,ssemov")
+       (const_string "maybe_vex")
+       (const_string "orig")))
+   (set (attr "prefix_data16")
+     (if_then_else
+       (and (eq_attr "type" "ssemov") (eq_attr "mode" "DI"))
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "mode")
+     (cond [(eq_attr "alternative" "2")
+	      (const_string "SI")
+	    (eq_attr "alternative" "11,12,15,16")
+	      (cond [(ior (match_operand 0 "ext_sse_reg_operand")
+			  (match_operand 1 "ext_sse_reg_operand"))
+			(const_string "XI")
+		     (match_test "<MODE>mode == V2SFmode")
+		       (const_string "V4SF")
+		     (ior (not (match_test "TARGET_SSE2"))
+			  (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+		       (const_string "V4SF")
+		     (match_test "TARGET_AVX")
+		       (const_string "TI")
+		     (match_test "optimize_function_for_size_p (cfun)")
+		       (const_string "V4SF")
+		    ]
+		    (const_string "TI"))
+
+	    (and (eq_attr "alternative" "13,14,17,18")
+	    	 (ior (match_test "<MODE>mode == V2SFmode")
+		      (not (match_test "TARGET_SSE2"))))
+	      (const_string "V2SF")
+	   ]
+	   (const_string "DI")))])
+
+(define_split
+  [(set (match_operand:MMXMODE 0 "nonimmediate_operand")
+        (match_operand:MMXMODE 1 "general_operand"))]
+  "!TARGET_64BIT && reload_completed
+   && !(MMX_REG_P (operands[0]) || SSE_REG_P (operands[0]))
+   && !(MMX_REG_P (operands[1]) || SSE_REG_P (operands[1]))"
+  [(const_int 0)]
+  "ix86_split_long_move (operands); DONE;")
+
+(define_expand "movmisalign<mode>"
+  [(set (match_operand:MMXMODE 0 "nonimmediate_operand")
+	(match_operand:MMXMODE 1 "nonimmediate_operand"))]
+  "TARGET_MMX"
+{
+  ix86_expand_vector_move (<MODE>mode, operands);
+  DONE;
+})
+
+(define_insn "sse_movntq"
+  [(set (match_operand:DI 0 "memory_operand" "=m")
+	(unspec:DI [(match_operand:DI 1 "register_operand" "y")]
+		   UNSPEC_MOVNTQ))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  "movntq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxmov")
+   (set_attr "mode" "DI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel single-precision floating point arithmetic
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "mmx_addv2sf3"
+  [(set (match_operand:V2SF 0 "register_operand")
+	(plus:V2SF
+	  (match_operand:V2SF 1 "nonimmediate_operand")
+	  (match_operand:V2SF 2 "nonimmediate_operand")))]
+  "TARGET_3DNOW"
+  "ix86_fixup_binary_operands_no_copy (PLUS, V2SFmode, operands);")
+
+(define_insn "*mmx_addv2sf3"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(plus:V2SF (match_operand:V2SF 1 "nonimmediate_operand" "%0")
+		   (match_operand:V2SF 2 "nonimmediate_operand" "ym")))]
+  "TARGET_3DNOW && ix86_binary_operator_ok (PLUS, V2SFmode, operands)"
+  "pfadd\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_expand "mmx_subv2sf3"
+  [(set (match_operand:V2SF 0 "register_operand")
+        (minus:V2SF (match_operand:V2SF 1 "register_operand")
+		    (match_operand:V2SF 2 "nonimmediate_operand")))]
+  "TARGET_3DNOW")
+
+(define_expand "mmx_subrv2sf3"
+  [(set (match_operand:V2SF 0 "register_operand")
+        (minus:V2SF (match_operand:V2SF 2 "register_operand")
+		    (match_operand:V2SF 1 "nonimmediate_operand")))]
+  "TARGET_3DNOW")
+
+(define_insn "*mmx_subv2sf3"
+  [(set (match_operand:V2SF 0 "register_operand" "=y,y")
+        (minus:V2SF (match_operand:V2SF 1 "nonimmediate_operand" "0,ym")
+		    (match_operand:V2SF 2 "nonimmediate_operand" "ym,0")))]
+  "TARGET_3DNOW && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "@
+   pfsub\t{%2, %0|%0, %2}
+   pfsubr\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_expand "mmx_mulv2sf3"
+  [(set (match_operand:V2SF 0 "register_operand")
+	(mult:V2SF (match_operand:V2SF 1 "nonimmediate_operand")
+		   (match_operand:V2SF 2 "nonimmediate_operand")))]
+  "TARGET_3DNOW"
+  "ix86_fixup_binary_operands_no_copy (MULT, V2SFmode, operands);")
+
+(define_insn "*mmx_mulv2sf3"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(mult:V2SF (match_operand:V2SF 1 "nonimmediate_operand" "%0")
+		   (match_operand:V2SF 2 "nonimmediate_operand" "ym")))]
+  "TARGET_3DNOW && ix86_binary_operator_ok (MULT, V2SFmode, operands)"
+  "pfmul\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxmul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+;; ??? For !flag_finite_math_only, the representation with SMIN/SMAX
+;; isn't really correct, as those rtl operators aren't defined when
+;; applied to NaNs.  Hopefully the optimizers won't get too smart on us.
+
+(define_expand "mmx_<code>v2sf3"
+  [(set (match_operand:V2SF 0 "register_operand")
+        (smaxmin:V2SF
+	  (match_operand:V2SF 1 "nonimmediate_operand")
+	  (match_operand:V2SF 2 "nonimmediate_operand")))]
+  "TARGET_3DNOW"
+{
+  if (!flag_finite_math_only)
+    operands[1] = force_reg (V2SFmode, operands[1]);
+  ix86_fixup_binary_operands_no_copy (<CODE>, V2SFmode, operands);
+})
+
+(define_insn "*mmx_<code>v2sf3_finite"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+        (smaxmin:V2SF
+	  (match_operand:V2SF 1 "nonimmediate_operand" "%0")
+	  (match_operand:V2SF 2 "nonimmediate_operand" "ym")))]
+  "TARGET_3DNOW && flag_finite_math_only
+   && ix86_binary_operator_ok (<CODE>, V2SFmode, operands)"
+  "pf<maxmin_float>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "*mmx_<code>v2sf3"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+        (smaxmin:V2SF
+	  (match_operand:V2SF 1 "register_operand" "0")
+	  (match_operand:V2SF 2 "nonimmediate_operand" "ym")))]
+  "TARGET_3DNOW"
+  "pf<maxmin_float>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_rcpv2sf2"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+        (unspec:V2SF [(match_operand:V2SF 1 "nonimmediate_operand" "ym")]
+		     UNSPEC_PFRCP))]
+  "TARGET_3DNOW"
+  "pfrcp\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmx")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_rcpit1v2sf3"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "0")
+		      (match_operand:V2SF 2 "nonimmediate_operand" "ym")]
+		     UNSPEC_PFRCPIT1))]
+  "TARGET_3DNOW"
+  "pfrcpit1\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmx")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_rcpit2v2sf3"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "0")
+		      (match_operand:V2SF 2 "nonimmediate_operand" "ym")]
+		     UNSPEC_PFRCPIT2))]
+  "TARGET_3DNOW"
+  "pfrcpit2\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmx")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_rsqrtv2sf2"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(unspec:V2SF [(match_operand:V2SF 1 "nonimmediate_operand" "ym")]
+		     UNSPEC_PFRSQRT))]
+  "TARGET_3DNOW"
+  "pfrsqrt\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmx")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_rsqit1v2sf3"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(unspec:V2SF [(match_operand:V2SF 1 "register_operand" "0")
+		      (match_operand:V2SF 2 "nonimmediate_operand" "ym")]
+		     UNSPEC_PFRSQIT1))]
+  "TARGET_3DNOW"
+  "pfrsqit1\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmx")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_haddv2sf3"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(vec_concat:V2SF
+	  (plus:SF
+	    (vec_select:SF
+	      (match_operand:V2SF 1 "register_operand" "0")
+	      (parallel [(const_int  0)]))
+	    (vec_select:SF (match_dup 1) (parallel [(const_int 1)])))
+	  (plus:SF
+            (vec_select:SF
+	      (match_operand:V2SF 2 "nonimmediate_operand" "ym")
+	      (parallel [(const_int  0)]))
+	    (vec_select:SF (match_dup 2) (parallel [(const_int 1)])))))]
+  "TARGET_3DNOW"
+  "pfacc\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_hsubv2sf3"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(vec_concat:V2SF
+	  (minus:SF
+	    (vec_select:SF
+	      (match_operand:V2SF 1 "register_operand" "0")
+	      (parallel [(const_int  0)]))
+	    (vec_select:SF (match_dup 1) (parallel [(const_int 1)])))
+	  (minus:SF
+            (vec_select:SF
+	      (match_operand:V2SF 2 "nonimmediate_operand" "ym")
+	      (parallel [(const_int  0)]))
+	    (vec_select:SF (match_dup 2) (parallel [(const_int 1)])))))]
+  "TARGET_3DNOW_A"
+  "pfnacc\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_addsubv2sf3"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+        (vec_merge:V2SF
+          (plus:V2SF
+            (match_operand:V2SF 1 "register_operand" "0")
+            (match_operand:V2SF 2 "nonimmediate_operand" "ym"))
+          (minus:V2SF (match_dup 1) (match_dup 2))
+          (const_int 1)))]
+  "TARGET_3DNOW_A"
+  "pfpnacc\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel single-precision floating point comparisons
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "mmx_eqv2sf3"
+  [(set (match_operand:V2SI 0 "register_operand")
+	(eq:V2SI (match_operand:V2SF 1 "nonimmediate_operand")
+		 (match_operand:V2SF 2 "nonimmediate_operand")))]
+  "TARGET_3DNOW"
+  "ix86_fixup_binary_operands_no_copy (EQ, V2SFmode, operands);")
+
+(define_insn "*mmx_eqv2sf3"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(eq:V2SI (match_operand:V2SF 1 "nonimmediate_operand" "%0")
+		 (match_operand:V2SF 2 "nonimmediate_operand" "ym")))]
+  "TARGET_3DNOW && ix86_binary_operator_ok (EQ, V2SFmode, operands)"
+  "pfcmpeq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxcmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_gtv2sf3"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(gt:V2SI (match_operand:V2SF 1 "register_operand" "0")
+		 (match_operand:V2SF 2 "nonimmediate_operand" "ym")))]
+  "TARGET_3DNOW"
+  "pfcmpgt\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxcmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_gev2sf3"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(ge:V2SI (match_operand:V2SF 1 "register_operand" "0")
+		 (match_operand:V2SF 2 "nonimmediate_operand" "ym")))]
+  "TARGET_3DNOW"
+  "pfcmpge\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxcmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel single-precision floating point conversion operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "mmx_pf2id"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(fix:V2SI (match_operand:V2SF 1 "nonimmediate_operand" "ym")))]
+  "TARGET_3DNOW"
+  "pf2id\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_pf2iw"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(sign_extend:V2SI
+	  (ss_truncate:V2HI
+	    (fix:V2SI
+	      (match_operand:V2SF 1 "nonimmediate_operand" "ym")))))]
+  "TARGET_3DNOW_A"
+  "pf2iw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_pi2fw"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(float:V2SF
+	  (sign_extend:V2SI
+	    (truncate:V2HI
+	      (match_operand:V2SI 1 "nonimmediate_operand" "ym")))))]
+  "TARGET_3DNOW_A"
+  "pi2fw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "mmx_floatv2si2"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(float:V2SF (match_operand:V2SI 1 "nonimmediate_operand" "ym")))]
+  "TARGET_3DNOW"
+  "pi2fd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel single-precision floating point element swizzling
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "mmx_pswapdv2sf2"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(vec_select:V2SF (match_operand:V2SF 1 "nonimmediate_operand" "ym")
+			 (parallel [(const_int 1) (const_int 0)])))]
+  "TARGET_3DNOW_A"
+  "pswapd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2SF")])
+
+(define_insn "*vec_dupv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=y")
+	(vec_duplicate:V2SF
+	  (match_operand:SF 1 "register_operand" "0")))]
+  "TARGET_MMX"
+  "punpckldq\t%0, %0"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "mode" "DI")])
+
+(define_insn "*mmx_concatv2sf"
+  [(set (match_operand:V2SF 0 "register_operand"     "=y,y")
+	(vec_concat:V2SF
+	  (match_operand:SF 1 "nonimmediate_operand" " 0,rm")
+	  (match_operand:SF 2 "vector_move_operand"  "ym,C")))]
+  "TARGET_MMX && !TARGET_SSE"
+  "@
+   punpckldq\t{%2, %0|%0, %2}
+   movd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxcvt,mmxmov")
+   (set_attr "mode" "DI")])
+
+(define_expand "vec_setv2sf"
+  [(match_operand:V2SF 0 "register_operand")
+   (match_operand:SF 1 "register_operand")
+   (match_operand 2 "const_int_operand")]
+  "TARGET_MMX"
+{
+  ix86_expand_vector_set (false, operands[0], operands[1],
+			  INTVAL (operands[2]));
+  DONE;
+})
+
+;; Avoid combining registers from different units in a single alternative,
+;; see comment above inline_secondary_memory_needed function in i386.c
+(define_insn_and_split "*vec_extractv2sf_0"
+  [(set (match_operand:SF 0 "nonimmediate_operand"     "=x, m,y ,m,f,r")
+	(vec_select:SF
+	  (match_operand:V2SF 1 "nonimmediate_operand" " xm,x,ym,y,m,m")
+	  (parallel [(const_int 0)])))]
+  "TARGET_MMX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  if (REG_P (operands[1]))
+    operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));
+  else
+    operands[1] = adjust_address (operands[1], SFmode, 0);
+})
+
+;; Avoid combining registers from different units in a single alternative,
+;; see comment above inline_secondary_memory_needed function in i386.c
+(define_insn "*vec_extractv2sf_1"
+  [(set (match_operand:SF 0 "nonimmediate_operand"     "=y,x,y,x,f,r")
+	(vec_select:SF
+	  (match_operand:V2SF 1 "nonimmediate_operand" " 0,0,o,o,o,o")
+	  (parallel [(const_int 1)])))]
+  "TARGET_MMX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "@
+   punpckhdq\t%0, %0
+   unpckhps\t%0, %0
+   #
+   #
+   #
+   #"
+  [(set_attr "type" "mmxcvt,sselog1,mmxmov,ssemov,fmov,imov")
+   (set_attr "mode" "DI,V4SF,SF,SF,SF,SF")])
+
+(define_split
+  [(set (match_operand:SF 0 "register_operand")
+	(vec_select:SF
+	  (match_operand:V2SF 1 "memory_operand")
+	  (parallel [(const_int 1)])))]
+  "TARGET_MMX && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  "operands[1] = adjust_address (operands[1], SFmode, 4);")
+
+(define_expand "vec_extractv2sf"
+  [(match_operand:SF 0 "register_operand")
+   (match_operand:V2SF 1 "register_operand")
+   (match_operand 2 "const_int_operand")]
+  "TARGET_MMX"
+{
+  ix86_expand_vector_extract (false, operands[0], operands[1],
+			      INTVAL (operands[2]));
+  DONE;
+})
+
+(define_expand "vec_initv2sf"
+  [(match_operand:V2SF 0 "register_operand")
+   (match_operand 1)]
+  "TARGET_SSE"
+{
+  ix86_expand_vector_init (false, operands[0], operands[1]);
+  DONE;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel integral arithmetic
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "mmx_<plusminus_insn><mode>3"
+  [(set (match_operand:MMXMODEI8 0 "register_operand")
+	(plusminus:MMXMODEI8
+	  (match_operand:MMXMODEI8 1 "nonimmediate_operand")
+	  (match_operand:MMXMODEI8 2 "nonimmediate_operand")))]
+  "TARGET_MMX || (TARGET_SSE2 && <MODE>mode == V1DImode)"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+(define_insn "*mmx_<plusminus_insn><mode>3"
+  [(set (match_operand:MMXMODEI8 0 "register_operand" "=y")
+        (plusminus:MMXMODEI8
+	  (match_operand:MMXMODEI8 1 "nonimmediate_operand" "<comm>0")
+	  (match_operand:MMXMODEI8 2 "nonimmediate_operand" "ym")))]
+  "(TARGET_MMX || (TARGET_SSE2 && <MODE>mode == V1DImode))
+   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "p<plusminus_mnemonic><mmxvecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_<plusminus_insn><mode>3"
+  [(set (match_operand:MMXMODE12 0 "register_operand")
+	(sat_plusminus:MMXMODE12
+	  (match_operand:MMXMODE12 1 "nonimmediate_operand")
+	  (match_operand:MMXMODE12 2 "nonimmediate_operand")))]
+  "TARGET_MMX"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+(define_insn "*mmx_<plusminus_insn><mode>3"
+  [(set (match_operand:MMXMODE12 0 "register_operand" "=y")
+        (sat_plusminus:MMXMODE12
+	  (match_operand:MMXMODE12 1 "nonimmediate_operand" "<comm>0")
+	  (match_operand:MMXMODE12 2 "nonimmediate_operand" "ym")))]
+  "TARGET_MMX && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "p<plusminus_mnemonic><mmxvecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_mulv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand")
+        (mult:V4HI (match_operand:V4HI 1 "nonimmediate_operand")
+		   (match_operand:V4HI 2 "nonimmediate_operand")))]
+  "TARGET_MMX"
+  "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);")
+
+(define_insn "*mmx_mulv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+        (mult:V4HI (match_operand:V4HI 1 "nonimmediate_operand" "%0")
+		   (match_operand:V4HI 2 "nonimmediate_operand" "ym")))]
+  "TARGET_MMX && ix86_binary_operator_ok (MULT, V4HImode, operands)"
+  "pmullw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxmul")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_smulv4hi3_highpart"
+  [(set (match_operand:V4HI 0 "register_operand")
+	(truncate:V4HI
+	  (lshiftrt:V4SI
+	    (mult:V4SI
+	      (sign_extend:V4SI
+		(match_operand:V4HI 1 "nonimmediate_operand"))
+	      (sign_extend:V4SI
+		(match_operand:V4HI 2 "nonimmediate_operand")))
+	    (const_int 16))))]
+  "TARGET_MMX"
+  "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);")
+
+(define_insn "*mmx_smulv4hi3_highpart"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(truncate:V4HI
+	  (lshiftrt:V4SI
+	    (mult:V4SI
+	      (sign_extend:V4SI
+		(match_operand:V4HI 1 "nonimmediate_operand" "%0"))
+	      (sign_extend:V4SI
+		(match_operand:V4HI 2 "nonimmediate_operand" "ym")))
+	    (const_int 16))))]
+  "TARGET_MMX && ix86_binary_operator_ok (MULT, V4HImode, operands)"
+  "pmulhw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxmul")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_umulv4hi3_highpart"
+  [(set (match_operand:V4HI 0 "register_operand")
+	(truncate:V4HI
+	  (lshiftrt:V4SI
+	    (mult:V4SI
+	      (zero_extend:V4SI
+		(match_operand:V4HI 1 "nonimmediate_operand"))
+	      (zero_extend:V4SI
+		(match_operand:V4HI 2 "nonimmediate_operand")))
+	    (const_int 16))))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);")
+
+(define_insn "*mmx_umulv4hi3_highpart"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(truncate:V4HI
+	  (lshiftrt:V4SI
+	    (mult:V4SI
+	      (zero_extend:V4SI
+		(match_operand:V4HI 1 "nonimmediate_operand" "%0"))
+	      (zero_extend:V4SI
+		(match_operand:V4HI 2 "nonimmediate_operand" "ym")))
+	  (const_int 16))))]
+  "(TARGET_SSE || TARGET_3DNOW_A)
+   && ix86_binary_operator_ok (MULT, V4HImode, operands)"
+  "pmulhuw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxmul")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_pmaddwd"
+  [(set (match_operand:V2SI 0 "register_operand")
+        (plus:V2SI
+	  (mult:V2SI
+	    (sign_extend:V2SI
+	      (vec_select:V2HI
+		(match_operand:V4HI 1 "nonimmediate_operand")
+		(parallel [(const_int 0) (const_int 2)])))
+	    (sign_extend:V2SI
+	      (vec_select:V2HI
+		(match_operand:V4HI 2 "nonimmediate_operand")
+		(parallel [(const_int 0) (const_int 2)]))))
+	  (mult:V2SI
+	    (sign_extend:V2SI
+	      (vec_select:V2HI (match_dup 1)
+		(parallel [(const_int 1) (const_int 3)])))
+	    (sign_extend:V2SI
+	      (vec_select:V2HI (match_dup 2)
+		(parallel [(const_int 1) (const_int 3)]))))))]
+  "TARGET_MMX"
+  "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);")
+
+(define_insn "*mmx_pmaddwd"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+        (plus:V2SI
+	  (mult:V2SI
+	    (sign_extend:V2SI
+	      (vec_select:V2HI
+		(match_operand:V4HI 1 "nonimmediate_operand" "%0")
+		(parallel [(const_int 0) (const_int 2)])))
+	    (sign_extend:V2SI
+	      (vec_select:V2HI
+		(match_operand:V4HI 2 "nonimmediate_operand" "ym")
+		(parallel [(const_int 0) (const_int 2)]))))
+	  (mult:V2SI
+	    (sign_extend:V2SI
+	      (vec_select:V2HI (match_dup 1)
+		(parallel [(const_int 1) (const_int 3)])))
+	    (sign_extend:V2SI
+	      (vec_select:V2HI (match_dup 2)
+		(parallel [(const_int 1) (const_int 3)]))))))]
+  "TARGET_MMX && ix86_binary_operator_ok (MULT, V4HImode, operands)"
+  "pmaddwd\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxmul")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_pmulhrwv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand")
+	(truncate:V4HI
+	  (lshiftrt:V4SI
+	    (plus:V4SI
+	      (mult:V4SI
+	        (sign_extend:V4SI
+		  (match_operand:V4HI 1 "nonimmediate_operand"))
+	        (sign_extend:V4SI
+		  (match_operand:V4HI 2 "nonimmediate_operand")))
+	      (const_vector:V4SI [(const_int 32768) (const_int 32768)
+				  (const_int 32768) (const_int 32768)]))
+	    (const_int 16))))]
+  "TARGET_3DNOW"
+  "ix86_fixup_binary_operands_no_copy (MULT, V4HImode, operands);")
+
+(define_insn "*mmx_pmulhrwv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(truncate:V4HI
+	  (lshiftrt:V4SI
+	    (plus:V4SI
+	      (mult:V4SI
+	        (sign_extend:V4SI
+		  (match_operand:V4HI 1 "nonimmediate_operand" "%0"))
+	        (sign_extend:V4SI
+		  (match_operand:V4HI 2 "nonimmediate_operand" "ym")))
+	      (const_vector:V4SI [(const_int 32768) (const_int 32768)
+				  (const_int 32768) (const_int 32768)]))
+	    (const_int 16))))]
+  "TARGET_3DNOW && ix86_binary_operator_ok (MULT, V4HImode, operands)"
+  "pmulhrw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxmul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "DI")])
+
+(define_expand "sse2_umulv1siv1di3"
+  [(set (match_operand:V1DI 0 "register_operand")
+        (mult:V1DI
+	  (zero_extend:V1DI
+	    (vec_select:V1SI
+	      (match_operand:V2SI 1 "nonimmediate_operand")
+	      (parallel [(const_int 0)])))
+	  (zero_extend:V1DI
+	    (vec_select:V1SI
+	      (match_operand:V2SI 2 "nonimmediate_operand")
+	      (parallel [(const_int 0)])))))]
+  "TARGET_SSE2"
+  "ix86_fixup_binary_operands_no_copy (MULT, V2SImode, operands);")
+
+(define_insn "*sse2_umulv1siv1di3"
+  [(set (match_operand:V1DI 0 "register_operand" "=y")
+        (mult:V1DI
+	  (zero_extend:V1DI
+	    (vec_select:V1SI
+	      (match_operand:V2SI 1 "nonimmediate_operand" "%0")
+	      (parallel [(const_int 0)])))
+	  (zero_extend:V1DI
+	    (vec_select:V1SI
+	      (match_operand:V2SI 2 "nonimmediate_operand" "ym")
+	      (parallel [(const_int 0)])))))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V2SImode, operands)"
+  "pmuludq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxmul")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_<code>v4hi3"
+  [(set (match_operand:V4HI 0 "register_operand")
+        (smaxmin:V4HI
+	  (match_operand:V4HI 1 "nonimmediate_operand")
+	  (match_operand:V4HI 2 "nonimmediate_operand")))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, V4HImode, operands);")
+
+(define_insn "*mmx_<code>v4hi3"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+        (smaxmin:V4HI
+	  (match_operand:V4HI 1 "nonimmediate_operand" "%0")
+	  (match_operand:V4HI 2 "nonimmediate_operand" "ym")))]
+  "(TARGET_SSE || TARGET_3DNOW_A)
+   && ix86_binary_operator_ok (<CODE>, V4HImode, operands)"
+  "p<maxmin_int>w\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_<code>v8qi3"
+  [(set (match_operand:V8QI 0 "register_operand")
+        (umaxmin:V8QI
+	  (match_operand:V8QI 1 "nonimmediate_operand")
+	  (match_operand:V8QI 2 "nonimmediate_operand")))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, V8QImode, operands);")
+
+(define_insn "*mmx_<code>v8qi3"
+  [(set (match_operand:V8QI 0 "register_operand" "=y")
+        (umaxmin:V8QI
+	  (match_operand:V8QI 1 "nonimmediate_operand" "%0")
+	  (match_operand:V8QI 2 "nonimmediate_operand" "ym")))]
+  "(TARGET_SSE || TARGET_3DNOW_A)
+   && ix86_binary_operator_ok (<CODE>, V8QImode, operands)"
+  "p<maxmin_int>b\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_ashr<mode>3"
+  [(set (match_operand:MMXMODE24 0 "register_operand" "=y")
+        (ashiftrt:MMXMODE24
+	  (match_operand:MMXMODE24 1 "register_operand" "0")
+	  (match_operand:SI 2 "nonmemory_operand" "yN")))]
+  "TARGET_MMX"
+  "psra<mmxvecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxshft")
+   (set (attr "length_immediate")
+     (if_then_else (match_operand 2 "const_int_operand")
+       (const_string "1")
+       (const_string "0")))
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_<shift_insn><mode>3"
+  [(set (match_operand:MMXMODE248 0 "register_operand" "=y")
+        (any_lshift:MMXMODE248
+	  (match_operand:MMXMODE248 1 "register_operand" "0")
+	  (match_operand:SI 2 "nonmemory_operand" "yN")))]
+  "TARGET_MMX"
+  "p<vshift><mmxvecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxshft")
+   (set (attr "length_immediate")
+     (if_then_else (match_operand 2 "const_int_operand")
+       (const_string "1")
+       (const_string "0")))
+   (set_attr "mode" "DI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel integral comparisons
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "mmx_eq<mode>3"
+  [(set (match_operand:MMXMODEI 0 "register_operand")
+        (eq:MMXMODEI
+	  (match_operand:MMXMODEI 1 "nonimmediate_operand")
+	  (match_operand:MMXMODEI 2 "nonimmediate_operand")))]
+  "TARGET_MMX"
+  "ix86_fixup_binary_operands_no_copy (EQ, <MODE>mode, operands);")
+
+(define_insn "*mmx_eq<mode>3"
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
+        (eq:MMXMODEI
+	  (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0")
+	  (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")))]
+  "TARGET_MMX && ix86_binary_operator_ok (EQ, <MODE>mode, operands)"
+  "pcmpeq<mmxvecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxcmp")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_gt<mode>3"
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
+        (gt:MMXMODEI
+	  (match_operand:MMXMODEI 1 "register_operand" "0")
+	  (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")))]
+  "TARGET_MMX"
+  "pcmpgt<mmxvecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxcmp")
+   (set_attr "mode" "DI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel integral logical operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "mmx_andnot<mode>3"
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
+	(and:MMXMODEI
+	  (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand" "0"))
+	  (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")))]
+  "TARGET_MMX"
+  "pandn\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_<code><mode>3"
+  [(set (match_operand:MMXMODEI 0 "register_operand")
+	(any_logic:MMXMODEI
+	  (match_operand:MMXMODEI 1 "nonimmediate_operand")
+	  (match_operand:MMXMODEI 2 "nonimmediate_operand")))]
+  "TARGET_MMX"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+(define_insn "*mmx_<code><mode>3"
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
+        (any_logic:MMXMODEI
+	  (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0")
+	  (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")))]
+  "TARGET_MMX && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "p<logic>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxadd")
+   (set_attr "mode" "DI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel integral element swizzling
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "mmx_packsswb"
+  [(set (match_operand:V8QI 0 "register_operand" "=y")
+	(vec_concat:V8QI
+	  (ss_truncate:V4QI
+	    (match_operand:V4HI 1 "register_operand" "0"))
+	  (ss_truncate:V4QI
+	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))]
+  "TARGET_MMX"
+  "packsswb\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxshft")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_packssdw"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(vec_concat:V4HI
+	  (ss_truncate:V2HI
+	    (match_operand:V2SI 1 "register_operand" "0"))
+	  (ss_truncate:V2HI
+	    (match_operand:V2SI 2 "nonimmediate_operand" "ym"))))]
+  "TARGET_MMX"
+  "packssdw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxshft")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_packuswb"
+  [(set (match_operand:V8QI 0 "register_operand" "=y")
+	(vec_concat:V8QI
+	  (us_truncate:V4QI
+	    (match_operand:V4HI 1 "register_operand" "0"))
+	  (us_truncate:V4QI
+	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))))]
+  "TARGET_MMX"
+  "packuswb\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxshft")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_punpckhbw"
+  [(set (match_operand:V8QI 0 "register_operand" "=y")
+	(vec_select:V8QI
+	  (vec_concat:V16QI
+	    (match_operand:V8QI 1 "register_operand" "0")
+	    (match_operand:V8QI 2 "nonimmediate_operand" "ym"))
+          (parallel [(const_int 4) (const_int 12)
+                     (const_int 5) (const_int 13)
+                     (const_int 6) (const_int 14)
+                     (const_int 7) (const_int 15)])))]
+  "TARGET_MMX"
+  "punpckhbw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_punpcklbw"
+  [(set (match_operand:V8QI 0 "register_operand" "=y")
+	(vec_select:V8QI
+	  (vec_concat:V16QI
+	    (match_operand:V8QI 1 "register_operand" "0")
+	    (match_operand:V8QI 2 "nonimmediate_operand" "ym"))
+          (parallel [(const_int 0) (const_int 8)
+                     (const_int 1) (const_int 9)
+                     (const_int 2) (const_int 10)
+                     (const_int 3) (const_int 11)])))]
+  "TARGET_MMX"
+  "punpcklbw\t{%2, %0|%0, %k2}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_punpckhwd"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "register_operand" "0")
+	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))
+          (parallel [(const_int 2) (const_int 6)
+                     (const_int 3) (const_int 7)])))]
+  "TARGET_MMX"
+  "punpckhwd\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_punpcklwd"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(vec_select:V4HI
+	  (vec_concat:V8HI
+	    (match_operand:V4HI 1 "register_operand" "0")
+	    (match_operand:V4HI 2 "nonimmediate_operand" "ym"))
+          (parallel [(const_int 0) (const_int 4)
+                     (const_int 1) (const_int 5)])))]
+  "TARGET_MMX"
+  "punpcklwd\t{%2, %0|%0, %k2}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_punpckhdq"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(vec_select:V2SI
+	  (vec_concat:V4SI
+	    (match_operand:V2SI 1 "register_operand" "0")
+	    (match_operand:V2SI 2 "nonimmediate_operand" "ym"))
+	  (parallel [(const_int 1)
+		     (const_int 3)])))]
+  "TARGET_MMX"
+  "punpckhdq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_punpckldq"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(vec_select:V2SI
+	  (vec_concat:V4SI
+	    (match_operand:V2SI 1 "register_operand" "0")
+	    (match_operand:V2SI 2 "nonimmediate_operand" "ym"))
+	  (parallel [(const_int 0)
+		     (const_int 2)])))]
+  "TARGET_MMX"
+  "punpckldq\t{%2, %0|%0, %k2}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_pinsrw"
+  [(set (match_operand:V4HI 0 "register_operand")
+        (vec_merge:V4HI
+          (vec_duplicate:V4HI
+            (match_operand:SI 2 "nonimmediate_operand"))
+	  (match_operand:V4HI 1 "register_operand")
+          (match_operand:SI 3 "const_0_to_3_operand")))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+{
+  operands[2] = gen_lowpart (HImode, operands[2]);
+  operands[3] = GEN_INT (1 << INTVAL (operands[3]));
+})
+
+(define_insn "*mmx_pinsrw"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+        (vec_merge:V4HI
+          (vec_duplicate:V4HI
+            (match_operand:HI 2 "nonimmediate_operand" "rm"))
+	  (match_operand:V4HI 1 "register_operand" "0")
+          (match_operand:SI 3 "const_int_operand")))]
+  "(TARGET_SSE || TARGET_3DNOW_A)
+   && ((unsigned) exact_log2 (INTVAL (operands[3]))
+       < GET_MODE_NUNITS (V4HImode))"
+{
+  operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])));
+  if (MEM_P (operands[2]))
+    return "pinsrw\t{%3, %2, %0|%0, %2, %3}";
+  else
+    return "pinsrw\t{%3, %k2, %0|%0, %k2, %3}";
+}
+  [(set_attr "type" "mmxcvt")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_pextrw"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+        (zero_extend:SI
+	  (vec_select:HI
+	    (match_operand:V4HI 1 "register_operand" "y")
+	    (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")]))))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  "pextrw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_pshufw"
+  [(match_operand:V4HI 0 "register_operand")
+   (match_operand:V4HI 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_int_operand")]
+  "TARGET_SSE || TARGET_3DNOW_A"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_mmx_pshufw_1 (operands[0], operands[1],
+                               GEN_INT ((mask >> 0) & 3),
+                               GEN_INT ((mask >> 2) & 3),
+                               GEN_INT ((mask >> 4) & 3),
+                               GEN_INT ((mask >> 6) & 3)));
+  DONE;
+})
+
+(define_insn "mmx_pshufw_1"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+        (vec_select:V4HI
+          (match_operand:V4HI 1 "nonimmediate_operand" "ym")
+          (parallel [(match_operand 2 "const_0_to_3_operand")
+                     (match_operand 3 "const_0_to_3_operand")
+                     (match_operand 4 "const_0_to_3_operand")
+                     (match_operand 5 "const_0_to_3_operand")])))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= INTVAL (operands[4]) << 4;
+  mask |= INTVAL (operands[5]) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "pshufw\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "mmxcvt")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_pswapdv2si2"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(vec_select:V2SI
+	  (match_operand:V2SI 1 "nonimmediate_operand" "ym")
+	  (parallel [(const_int 1) (const_int 0)])))]
+  "TARGET_3DNOW_A"
+  "pswapd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "DI")])
+
+(define_insn "*vec_dupv4hi"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(vec_duplicate:V4HI
+	  (truncate:HI
+	    (match_operand:SI 1 "register_operand" "0"))))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  "pshufw\t{$0, %0, %0|%0, %0, 0}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "DI")])
+
+(define_insn "*vec_dupv2si"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(vec_duplicate:V2SI
+	  (match_operand:SI 1 "register_operand" "0")))]
+  "TARGET_MMX"
+  "punpckldq\t%0, %0"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "mode" "DI")])
+
+(define_insn "*mmx_concatv2si"
+  [(set (match_operand:V2SI 0 "register_operand"     "=y,y")
+	(vec_concat:V2SI
+	  (match_operand:SI 1 "nonimmediate_operand" " 0,rm")
+	  (match_operand:SI 2 "vector_move_operand"  "ym,C")))]
+  "TARGET_MMX && !TARGET_SSE"
+  "@
+   punpckldq\t{%2, %0|%0, %2}
+   movd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxcvt,mmxmov")
+   (set_attr "mode" "DI")])
+
+(define_expand "vec_setv2si"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:SI 1 "register_operand")
+   (match_operand 2 "const_int_operand")]
+  "TARGET_MMX"
+{
+  ix86_expand_vector_set (false, operands[0], operands[1],
+			  INTVAL (operands[2]));
+  DONE;
+})
+
+;; Avoid combining registers from different units in a single alternative,
+;; see comment above inline_secondary_memory_needed function in i386.c
+(define_insn_and_split "*vec_extractv2si_0"
+  [(set (match_operand:SI 0 "nonimmediate_operand"     "=x,m,y, m,r")
+	(vec_select:SI
+	  (match_operand:V2SI 1 "nonimmediate_operand" "xm,x,ym,y,m")
+	  (parallel [(const_int 0)])))]
+  "TARGET_MMX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  if (REG_P (operands[1]))
+    operands[1] = gen_rtx_REG (SImode, REGNO (operands[1]));
+  else
+    operands[1] = adjust_address (operands[1], SImode, 0);
+})
+
+;; Avoid combining registers from different units in a single alternative,
+;; see comment above inline_secondary_memory_needed function in i386.c
+(define_insn "*vec_extractv2si_1"
+  [(set (match_operand:SI 0 "nonimmediate_operand"     "=y,x,x,x,y,x,r")
+	(vec_select:SI
+	  (match_operand:V2SI 1 "nonimmediate_operand" " 0,0,x,0,o,o,o")
+	  (parallel [(const_int 1)])))]
+  "TARGET_MMX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "@
+   punpckhdq\t%0, %0
+   punpckhdq\t%0, %0
+   pshufd\t{$85, %1, %0|%0, %1, 85}
+   unpckhps\t%0, %0
+   #
+   #
+   #"
+  [(set (attr "isa")
+     (if_then_else (eq_attr "alternative" "1,2")
+       (const_string "sse2")
+       (const_string "*")))
+   (set_attr "type" "mmxcvt,sselog1,sselog1,sselog1,mmxmov,ssemov,imov")
+   (set_attr "length_immediate" "*,*,1,*,*,*,*")
+   (set_attr "mode" "DI,TI,TI,V4SF,SI,SI,SI")])
+
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(vec_select:SI
+	  (match_operand:V2SI 1 "memory_operand")
+	  (parallel [(const_int 1)])))]
+  "TARGET_MMX && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  "operands[1] = adjust_address (operands[1], SImode, 4);")
+
+(define_insn_and_split "*vec_extractv2si_zext_mem"
+  [(set (match_operand:DI 0 "register_operand" "=y,x,r")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V2SI 1 "memory_operand" "o,o,o")
+	    (parallel [(match_operand:SI 2 "const_0_to_1_operand")]))))]
+  "TARGET_64BIT && TARGET_MMX"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
+{
+  operands[1] = adjust_address (operands[1], SImode, INTVAL (operands[2]) * 4);
+})
+
+(define_expand "vec_extractv2si"
+  [(match_operand:SI 0 "register_operand")
+   (match_operand:V2SI 1 "register_operand")
+   (match_operand 2 "const_int_operand")]
+  "TARGET_MMX"
+{
+  ix86_expand_vector_extract (false, operands[0], operands[1],
+			      INTVAL (operands[2]));
+  DONE;
+})
+
+(define_expand "vec_initv2si"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand 1)]
+  "TARGET_SSE"
+{
+  ix86_expand_vector_init (false, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "vec_setv4hi"
+  [(match_operand:V4HI 0 "register_operand")
+   (match_operand:HI 1 "register_operand")
+   (match_operand 2 "const_int_operand")]
+  "TARGET_MMX"
+{
+  ix86_expand_vector_set (false, operands[0], operands[1],
+			  INTVAL (operands[2]));
+  DONE;
+})
+
+(define_expand "vec_extractv4hi"
+  [(match_operand:HI 0 "register_operand")
+   (match_operand:V4HI 1 "register_operand")
+   (match_operand 2 "const_int_operand")]
+  "TARGET_MMX"
+{
+  ix86_expand_vector_extract (false, operands[0], operands[1],
+			      INTVAL (operands[2]));
+  DONE;
+})
+
+(define_expand "vec_initv4hi"
+  [(match_operand:V4HI 0 "register_operand")
+   (match_operand 1)]
+  "TARGET_SSE"
+{
+  ix86_expand_vector_init (false, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "vec_setv8qi"
+  [(match_operand:V8QI 0 "register_operand")
+   (match_operand:QI 1 "register_operand")
+   (match_operand 2 "const_int_operand")]
+  "TARGET_MMX"
+{
+  ix86_expand_vector_set (false, operands[0], operands[1],
+			  INTVAL (operands[2]));
+  DONE;
+})
+
+(define_expand "vec_extractv8qi"
+  [(match_operand:QI 0 "register_operand")
+   (match_operand:V8QI 1 "register_operand")
+   (match_operand 2 "const_int_operand")]
+  "TARGET_MMX"
+{
+  ix86_expand_vector_extract (false, operands[0], operands[1],
+			      INTVAL (operands[2]));
+  DONE;
+})
+
+(define_expand "vec_initv8qi"
+  [(match_operand:V8QI 0 "register_operand")
+   (match_operand 1)]
+  "TARGET_SSE"
+{
+  ix86_expand_vector_init (false, operands[0], operands[1]);
+  DONE;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Miscellaneous
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "mmx_uavgv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand")
+	(truncate:V8QI
+	  (lshiftrt:V8HI
+	    (plus:V8HI
+	      (plus:V8HI
+		(zero_extend:V8HI
+		  (match_operand:V8QI 1 "nonimmediate_operand"))
+		(zero_extend:V8HI
+		  (match_operand:V8QI 2 "nonimmediate_operand")))
+	      (const_vector:V8HI [(const_int 1) (const_int 1)
+				  (const_int 1) (const_int 1)
+				  (const_int 1) (const_int 1)
+				  (const_int 1) (const_int 1)]))
+	    (const_int 1))))]
+  "TARGET_SSE || TARGET_3DNOW"
+  "ix86_fixup_binary_operands_no_copy (PLUS, V8QImode, operands);")
+
+(define_insn "*mmx_uavgv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand" "=y")
+	(truncate:V8QI
+	  (lshiftrt:V8HI
+	    (plus:V8HI
+	      (plus:V8HI
+		(zero_extend:V8HI
+		  (match_operand:V8QI 1 "nonimmediate_operand" "%0"))
+		(zero_extend:V8HI
+		  (match_operand:V8QI 2 "nonimmediate_operand" "ym")))
+	      (const_vector:V8HI [(const_int 1) (const_int 1)
+				  (const_int 1) (const_int 1)
+				  (const_int 1) (const_int 1)
+				  (const_int 1) (const_int 1)]))
+	    (const_int 1))))]
+  "(TARGET_SSE || TARGET_3DNOW)
+   && ix86_binary_operator_ok (PLUS, V8QImode, operands)"
+{
+  /* These two instructions have the same operation, but their encoding
+     is different.  Prefer the one that is de facto standard.  */
+  if (TARGET_SSE || TARGET_3DNOW_A)
+    return "pavgb\t{%2, %0|%0, %2}";
+  else
+    return "pavgusb\t{%2, %0|%0, %2}";
+}
+  [(set_attr "type" "mmxshft")
+   (set (attr "prefix_extra")
+     (if_then_else
+       (not (ior (match_test "TARGET_SSE")
+		 (match_test "TARGET_3DNOW_A")))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_uavgv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand")
+	(truncate:V4HI
+	  (lshiftrt:V4SI
+	    (plus:V4SI
+	      (plus:V4SI
+		(zero_extend:V4SI
+		  (match_operand:V4HI 1 "nonimmediate_operand"))
+		(zero_extend:V4SI
+		  (match_operand:V4HI 2 "nonimmediate_operand")))
+	      (const_vector:V4SI [(const_int 1) (const_int 1)
+				  (const_int 1) (const_int 1)]))
+	    (const_int 1))))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  "ix86_fixup_binary_operands_no_copy (PLUS, V4HImode, operands);")
+
+(define_insn "*mmx_uavgv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(truncate:V4HI
+	  (lshiftrt:V4SI
+	    (plus:V4SI
+	      (plus:V4SI
+		(zero_extend:V4SI
+		  (match_operand:V4HI 1 "nonimmediate_operand" "%0"))
+		(zero_extend:V4SI
+		  (match_operand:V4HI 2 "nonimmediate_operand" "ym")))
+	      (const_vector:V4SI [(const_int 1) (const_int 1)
+				  (const_int 1) (const_int 1)]))
+	    (const_int 1))))]
+  "(TARGET_SSE || TARGET_3DNOW_A)
+   && ix86_binary_operator_ok (PLUS, V4HImode, operands)"
+  "pavgw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxshft")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_psadbw"
+  [(set (match_operand:V1DI 0 "register_operand" "=y")
+        (unspec:V1DI [(match_operand:V8QI 1 "register_operand" "0")
+		      (match_operand:V8QI 2 "nonimmediate_operand" "ym")]
+		     UNSPEC_PSADBW))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  "psadbw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "mmxshft")
+   (set_attr "mode" "DI")])
+
+(define_insn "mmx_pmovmskb"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:V8QI 1 "register_operand" "y")]
+		   UNSPEC_MOVMSK))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  "pmovmskb\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_maskmovq"
+  [(set (match_operand:V8QI 0 "memory_operand")
+	(unspec:V8QI [(match_operand:V8QI 1 "register_operand")
+		      (match_operand:V8QI 2 "register_operand")
+		      (match_dup 0)]
+		     UNSPEC_MASKMOV))]
+  "TARGET_SSE || TARGET_3DNOW_A")
+
+(define_insn "*mmx_maskmovq"
+  [(set (mem:V8QI (match_operand:P 0 "register_operand" "D"))
+	(unspec:V8QI [(match_operand:V8QI 1 "register_operand" "y")
+		      (match_operand:V8QI 2 "register_operand" "y")
+		      (mem:V8QI (match_dup 0))]
+		     UNSPEC_MASKMOV))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  ;; @@@ check ordering of operands in intel/nonintel syntax
+  "maskmovq\t{%2, %1|%1, %2}"
+  [(set_attr "type" "mmxcvt")
+   (set_attr "mode" "DI")])
+
+(define_expand "mmx_emms"
+  [(match_par_dup 0 [(const_int 0)])]
+  "TARGET_MMX"
+{
+  int regno;
+
+  operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (17));
+
+  XVECEXP (operands[0], 0, 0)
+    = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx),
+			       UNSPECV_EMMS);
+
+  for (regno = 0; regno < 8; regno++)
+    {
+      XVECEXP (operands[0], 0, regno + 1)
+	= gen_rtx_CLOBBER (VOIDmode,
+			   gen_rtx_REG (XFmode, FIRST_STACK_REG + regno));
+
+      XVECEXP (operands[0], 0, regno + 9)
+	= gen_rtx_CLOBBER (VOIDmode,
+			   gen_rtx_REG (DImode, FIRST_MMX_REG + regno));
+    }
+})
+
+(define_insn "*mmx_emms"
+  [(match_parallel 0 "emms_operation"
+    [(unspec_volatile [(const_int 0)] UNSPECV_EMMS)])]
+  "TARGET_MMX"
+  "emms"
+  [(set_attr "type" "mmx")
+   (set_attr "modrm" "0")
+   (set_attr "memory" "none")])
+
+(define_expand "mmx_femms"
+  [(match_par_dup 0 [(const_int 0)])]
+  "TARGET_3DNOW"
+{
+  int regno;
+
+  operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (17));
+
+  XVECEXP (operands[0], 0, 0)
+    = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx),
+			       UNSPECV_FEMMS);
+
+  for (regno = 0; regno < 8; regno++)
+    {
+      XVECEXP (operands[0], 0, regno + 1)
+	= gen_rtx_CLOBBER (VOIDmode,
+			   gen_rtx_REG (XFmode, FIRST_STACK_REG + regno));
+
+      XVECEXP (operands[0], 0, regno + 9)
+	= gen_rtx_CLOBBER (VOIDmode,
+			   gen_rtx_REG (DImode, FIRST_MMX_REG + regno));
+    }
+})
+
+(define_insn "*mmx_femms"
+  [(match_parallel 0 "emms_operation"
+    [(unspec_volatile [(const_int 0)] UNSPECV_FEMMS)])]
+  "TARGET_3DNOW"
+  "femms"
+  [(set_attr "type" "mmx")
+   (set_attr "modrm" "0")
+   (set_attr "memory" "none")])
diff --git a/gcc-4.9/gcc/config/i386/msformat-c.c b/gcc-4.9/gcc/config/i386/msformat-c.c
new file mode 100644
index 000000000..304d48f20
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/msformat-c.c
@@ -0,0 +1,195 @@
+/* Check calls to formatted I/O functions (-Wformat).
+   Copyright (C) 1992-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "flags.h"
+#include "c-family/c-common.h"
+#include "intl.h"
+#include "diagnostic.h"
+#include "langhooks.h"
+#include "c-family/c-format.h"
+#include "alloc-pool.h"
+
+/* Mingw specific format attributes ms_printf, ms_scanf, and ms_strftime.  */
+
+static format_length_info ms_printf_length_specs[] =
+{
+  { "h", FMT_LEN_h, STD_C89, NULL, FMT_LEN_none, STD_C89, 0 },
+  { "l", FMT_LEN_l, STD_C89, NULL, FMT_LEN_none, STD_C89, 0 },
+  { "I32", FMT_LEN_l, STD_EXT, NULL, FMT_LEN_none, STD_C89, 1 },
+  { "I64", FMT_LEN_ll, STD_EXT, NULL, FMT_LEN_none, STD_C89, 1 },
+  { "I", FMT_LEN_L, STD_EXT, NULL, FMT_LEN_none, STD_C89, 1 },
+  { NULL, FMT_LEN_none, STD_C89, NULL, FMT_LEN_none, STD_C89, 0 }
+};
+
+static const format_flag_spec ms_printf_flag_specs[] =
+{
+  { ' ',  0, 0, N_("' ' flag"),        N_("the ' ' printf flag"),              STD_C89 },
+  { '+',  0, 0, N_("'+' flag"),        N_("the '+' printf flag"),              STD_C89 },
+  { '#',  0, 0, N_("'#' flag"),        N_("the '#' printf flag"),              STD_C89 },
+  { '0',  0, 0, N_("'0' flag"),        N_("the '0' printf flag"),              STD_C89 },
+  { '-',  0, 0, N_("'-' flag"),        N_("the '-' printf flag"),              STD_C89 },
+  { '\'', 0, 0, N_("''' flag"),        N_("the ''' printf flag"),              STD_EXT },
+  { 'w',  0, 0, N_("field width"),     N_("field width in printf format"),     STD_C89 },
+  { 'p',  0, 0, N_("precision"),       N_("precision in printf format"),       STD_C89 },
+  { 'L',  0, 0, N_("length modifier"), N_("length modifier in printf format"), STD_C89 },
+  { 0, 0, 0, NULL, NULL, STD_C89 }
+};
+
+static const format_flag_pair ms_printf_flag_pairs[] =
+{
+  { ' ', '+', 1, 0   },
+  { '0', '-', 1, 0   }, { '0', 'p', 1, 'i' },
+  { 0, 0, 0, 0 }
+};
+
+static const format_flag_spec ms_scanf_flag_specs[] =
+{
+  { '*',  0, 0, N_("assignment suppression"), N_("the assignment suppression scanf feature"), STD_C89 },
+  { 'a',  0, 0, N_("'a' flag"),               N_("the 'a' scanf flag"),                       STD_EXT },
+  { 'w',  0, 0, N_("field width"),            N_("field width in scanf format"),              STD_C89 },
+  { 'L',  0, 0, N_("length modifier"),        N_("length modifier in scanf format"),          STD_C89 },
+  { '\'', 0, 0, N_("''' flag"),               N_("the ''' scanf flag"),                       STD_EXT },
+  { 0, 0, 0, NULL, NULL, STD_C89 }
+};
+
+static const format_flag_pair ms_scanf_flag_pairs[] =
+{
+  { '*', 'L', 0, 0 },
+  { 0, 0, 0, 0 }
+};
+
+static const format_flag_spec ms_strftime_flag_specs[] =
+{
+  { '#', 0,   0, N_("'#' flag"),     N_("the '#' strftime flag"),          STD_EXT },
+  { 0, 0, 0, NULL, NULL, STD_C89 }
+};
+
+static const format_flag_pair ms_strftime_flag_pairs[] =
+{
+  { 0, 0, 0, 0 }
+};
+
+static const format_char_info ms_print_char_table[] =
+{
+  /* C89 conversion specifiers.  */
+  { "di",  0, STD_C89, { T89_I,   BADLEN,  T89_S,   T89_L,   T9L_LL,  T99_SST,  BADLEN, BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN  }, "-wp0 +'",  "i",  NULL },
+  { "oxX", 0, STD_C89, { T89_UI,  BADLEN,  T89_US,  T89_UL,  T9L_ULL, T99_ST, BADLEN, BADLEN, BADLEN, BADLEN,  BADLEN,  BADLEN }, "-wp0#",     "i",  NULL },
+  { "u",   0, STD_C89, { T89_UI,  BADLEN,  T89_US,  T89_UL,  T9L_ULL, T99_ST, BADLEN, BADLEN, BADLEN, BADLEN,  BADLEN,  BADLEN }, "-wp0'",    "i",  NULL },
+  { "fgG", 0, STD_C89, { T89_D,   BADLEN,  BADLEN,  T99_D,   BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN, BADLEN, BADLEN }, "-wp0 +#'", "",   NULL },
+  { "eE",  0, STD_C89, { T89_D,   BADLEN,  BADLEN,  T99_D,   BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN, BADLEN, BADLEN }, "-wp0 +#",  "",   NULL },
+  { "c",   0, STD_C89, { T89_I,   BADLEN,  T89_S,  T94_WI,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "-w",        "",   NULL },
+  { "s",   1, STD_C89, { T89_C,   BADLEN,  T89_S,  T94_W,   BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "-wp",       "cR", NULL },
+  { "p",   1, STD_C89, { T89_V,   BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "-w",        "c",  NULL },
+  { "n",   1, STD_C89, { T89_I,   BADLEN,  T89_S,   T89_L,   T9L_LL,  BADLEN,  BADLEN, BADLEN,  T99_IM,  BADLEN,  BADLEN,  BADLEN }, "",          "W",  NULL },
+  /* X/Open conversion specifiers.  */
+  { "C",   0, STD_EXT, { TEX_WI,  BADLEN,  T89_S,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "-w",        "",   NULL },
+  { "S",   1, STD_EXT, { TEX_W,   BADLEN,  T89_S,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "-wp",       "R",  NULL },
+  { NULL,  0, STD_C89, NOLENGTHS, NULL, NULL, NULL }
+};
+
+static const format_char_info ms_scan_char_table[] =
+{
+  /* C89 conversion specifiers.  */
+  { "di",    1, STD_C89, { T89_I,   BADLEN,  T89_S,   T89_L,   T9L_LL,  T99_SST,  BADLEN, BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "*w'", "W",   NULL },
+  { "u",     1, STD_C89, { T89_UI,  BADLEN,  T89_US,  T89_UL,  T9L_ULL, T99_ST, BADLEN,  BADLEN, BADLEN, BADLEN,  BADLEN,  BADLEN }, "*w'", "W",   NULL },
+  { "oxX",   1, STD_C89, { T89_UI,  BADLEN,  T89_US,  T89_UL,  T9L_ULL, T99_ST, BADLEN,  BADLEN, BADLEN, BADLEN,  BADLEN,  BADLEN }, "*w",   "W",   NULL },
+  { "efgEG", 1, STD_C89, { T89_F,   BADLEN,  BADLEN,  T89_D,   BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN, BADLEN, BADLEN }, "*w'",  "W",   NULL },
+  { "c",     1, STD_C89, { T89_C,   BADLEN,  T89_S,  T94_W,   BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "*w",   "cW",  NULL },
+  { "s",     1, STD_C89, { T89_C,   BADLEN,  T89_S,  T94_W,   BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "*aw",  "cW",  NULL },
+  { "[",     1, STD_C89, { T89_C,   BADLEN,  BADLEN,  T94_W,   BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "*aw",  "cW[", NULL },
+  { "p",     2, STD_C89, { T89_V,   BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "*w",   "W",   NULL },
+  { "n",     1, STD_C89, { T89_I,   BADLEN,  T89_S,   T89_L,   T9L_LL,  BADLEN,  BADLEN, BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "",     "W",   NULL },
+  /* X/Open conversion specifiers.  */
+  { "C",     1, STD_EXT, { TEX_W,   BADLEN,  T89_S,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "*w",   "W",   NULL },
+  { "S",     1, STD_EXT, { TEX_W,   BADLEN,  T89_S,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN,  BADLEN }, "*aw",  "W",   NULL },
+  { NULL, 0, STD_C89, NOLENGTHS, NULL, NULL, NULL }
+};
+
+static const format_char_info ms_time_char_table[] =
+{
+  /* C89 conversion specifiers.  */
+  { "ABZab",		0, STD_C89, NOLENGTHS, "#",     "",   NULL },
+  { "cx",		0, STD_C89, NOLENGTHS, "#",      "3",  NULL },
+  { "HIMSUWdmw",	0, STD_C89, NOLENGTHS, "#",  "",   NULL },
+  { "j",		0, STD_C89, NOLENGTHS, "#",  "",  NULL },
+  { "p",		0, STD_C89, NOLENGTHS, "#",      "",   NULL },
+  { "X",		0, STD_C89, NOLENGTHS, "#",      "",   NULL },
+  { "y",		0, STD_C89, NOLENGTHS, "#", "4",  NULL },
+  { "Y",		0, STD_C89, NOLENGTHS, "#", "",  NULL },
+  { "%",		0, STD_C89, NOLENGTHS, "",       "",   NULL },
+  /* C99 conversion specifiers.  */
+  { "z",		0, STD_C99, NOLENGTHS, "#",      "",  NULL },
+  { NULL,		0, STD_C89, NOLENGTHS, NULL, NULL, NULL }
+};
+
+EXPORTED_CONST format_kind_info mingw_format_attributes[3] =
+{
+  { "ms_printf",   ms_printf_length_specs,  ms_print_char_table, " +#0-'", NULL,
+    ms_printf_flag_specs, ms_printf_flag_pairs,
+    FMT_FLAG_ARG_CONVERT|FMT_FLAG_DOLLAR_MULTIPLE|FMT_FLAG_USE_DOLLAR|FMT_FLAG_EMPTY_PREC_OK,
+    'w', 0, 'p', 0, 'L', 0,
+    &integer_type_node, &integer_type_node
+  },
+  { "ms_scanf",    ms_printf_length_specs,   ms_scan_char_table,  "*'", NULL,
+    ms_scanf_flag_specs, ms_scanf_flag_pairs,
+    FMT_FLAG_ARG_CONVERT|FMT_FLAG_SCANF_A_KLUDGE|FMT_FLAG_USE_DOLLAR|FMT_FLAG_ZERO_WIDTH_BAD|FMT_FLAG_DOLLAR_GAP_POINTER_OK,
+    'w', 0, 0, '*', 'L', 0,
+    NULL, NULL
+  },
+  { "ms_strftime", NULL,                 ms_time_char_table,  "", "#",
+    ms_strftime_flag_specs, ms_strftime_flag_pairs,
+    FMT_FLAG_FANCY_PERCENT_OK, 0, 0, 0, 0, 0, 0,
+    NULL, NULL
+  }
+};
+
+/* Default overrides for printf, scanf and strftime.  */
+EXPORTED_CONST target_ovr_attr mingw_format_attribute_overrides[4] =
+{
+  { "ms_printf", "printf" },
+  { "ms_scanf", "scanf" },
+  { "ms_strftime", "strftime" }
+};
+
+/* Setup for option Wpedantic-ms-format.  */
+
+#ifdef TARGET_OVERRIDES_FORMAT_INIT
+
+/* Make sure TARGET_OVERRIDES_FORMAT_INIT is prototyped.  */
+extern void TARGET_OVERRIDES_FORMAT_INIT (void);
+
+/* Helper.  */
+#define C89_OR_EXT (warn_pedantic_ms_format ? STD_EXT : STD_C89)
+
+void
+TARGET_OVERRIDES_FORMAT_INIT (void)
+{
+  ms_printf_length_specs[2].std = C89_OR_EXT; /* I32 */
+  ms_printf_length_specs[3].std = C89_OR_EXT; /* I64 */
+  ms_printf_length_specs[4].std = C89_OR_EXT; /* I */
+}
+
+#undef C89_OR_EXT
+
+#endif
diff --git a/gcc-4.9/gcc/config/i386/netbsd-elf.h b/gcc-4.9/gcc/config/i386/netbsd-elf.h
new file mode 100644
index 000000000..e575b39cb
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/netbsd-elf.h
@@ -0,0 +1,121 @@
+/* Definitions of target machine for GCC,
+   for i386/ELF NetBSD systems.
+   Copyright (C) 2001-2014 Free Software Foundation, Inc.
+   Contributed by matthew green <mrg@eterna.com.au>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define TARGET_OS_CPP_BUILTINS()		\
+  do						\
+    {						\
+      NETBSD_OS_CPP_BUILTINS_ELF();		\
+    }						\
+  while (0)
+
+
+/* Extra specs needed for NetBSD/i386 ELF.  */
+
+#undef SUBTARGET_EXTRA_SPECS
+#define SUBTARGET_EXTRA_SPECS			\
+  { "netbsd_cpp_spec", NETBSD_CPP_SPEC },	\
+  { "netbsd_entry_point", NETBSD_ENTRY_POINT },
+
+
+/* Provide a LINK_SPEC appropriate for a NetBSD/i386 ELF target.  */
+
+#undef LINK_SPEC
+#define LINK_SPEC NETBSD_LINK_SPEC_ELF
+
+#define NETBSD_ENTRY_POINT "__start"
+
+
+/* Provide a CPP_SPEC appropriate for NetBSD.  */
+
+#undef CPP_SPEC
+#define CPP_SPEC "%(netbsd_cpp_spec)"
+
+
+/* Make gcc agree with <machine/ansi.h> */
+
+#undef SIZE_TYPE
+#define SIZE_TYPE "unsigned int"
+
+#undef PTRDIFF_TYPE
+#define PTRDIFF_TYPE "int"
+
+#undef ASM_APP_ON
+#define ASM_APP_ON "#APP\n"
+
+#undef ASM_APP_OFF
+#define ASM_APP_OFF "#NO_APP\n"
+
+#undef ASM_COMMENT_START
+#define ASM_COMMENT_START "#"
+
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n)  svr4_dbx_register_map[n]
+
+
+/* Output assembler code to FILE to call the profiler.  */
+
+#undef NO_PROFILE_COUNTERS
+#define NO_PROFILE_COUNTERS	1
+
+#undef FUNCTION_PROFILER
+#define FUNCTION_PROFILER(FILE, LABELNO)				\
+{									\
+  if (flag_pic)								\
+    fprintf (FILE, "\tcall __mcount@PLT\n");				\
+  else									\
+    fprintf (FILE, "\tcall __mcount\n");				\
+}
+
+
+#undef HAS_INIT_SECTION
+
+/* This is how we tell the assembler that two symbols have the same value.  */
+
+#define ASM_OUTPUT_DEF(FILE,NAME1,NAME2) \
+  do { assemble_name(FILE, NAME1); 	 \
+       fputs(" = ", FILE);		 \
+       assemble_name(FILE, NAME2);	 \
+       fputc('\n', FILE); } while (0)
+
+/* A C statement to output to the stdio stream FILE an assembler
+   command to advance the location counter to a multiple of 1<<LOG
+   bytes if it is within MAX_SKIP bytes.
+
+   This is used to align code labels according to Intel recommendations.  */
+
+#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
+#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE, LOG, MAX_SKIP)			\
+  if ((LOG) != 0) {							\
+    if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG));	\
+    else fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP));	\
+  }
+#endif
+
+/* We always use gas here, so we don't worry about ECOFF assembler
+   problems.  */
+#undef TARGET_GAS
+#define TARGET_GAS	1
+
+/* Default to pcc-struct-return, because this is the ELF abi and
+   we don't care about compatibility with older gcc versions.  */
+#define DEFAULT_PCC_STRUCT_RETURN 1
+
+#define HAVE_ENABLE_EXECUTE_STACK
diff --git a/gcc-4.9/gcc/config/i386/netbsd64.h b/gcc-4.9/gcc/config/i386/netbsd64.h
new file mode 100644
index 000000000..f990835bd
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/netbsd64.h
@@ -0,0 +1,69 @@
+/* Definitions of target machine for GCC,
+   for x86-64/ELF NetBSD systems.
+   Copyright (C) 2002-2014 Free Software Foundation, Inc.
+   Contributed by Wasabi Systems, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define TARGET_OS_CPP_BUILTINS()		\
+  do						\
+    {						\
+      NETBSD_OS_CPP_BUILTINS_ELF();		\
+    }						\
+  while (0)
+
+
+/* Extra specs needed for NetBSD/x86-64 ELF.  */
+
+#undef SUBTARGET_EXTRA_SPECS
+#define SUBTARGET_EXTRA_SPECS			\
+  { "netbsd_cpp_spec", NETBSD_CPP_SPEC },	\
+  { "netbsd_link_spec", NETBSD_LINK_SPEC_ELF },	\
+  { "netbsd_entry_point", NETBSD_ENTRY_POINT },
+
+
+/* Provide a LINK_SPEC appropriate for a NetBSD/x86-64 ELF target.  */
+
+#undef LINK_SPEC
+#define LINK_SPEC \
+  "%{m32:-m elf_i386} \
+   %{m64:-m elf_x86_64} \
+   %(netbsd_link_spec)"
+
+#define NETBSD_ENTRY_POINT "_start"
+
+
+/* Provide a CPP_SPEC appropriate for NetBSD.  */
+
+#undef CPP_SPEC
+#define CPP_SPEC "%(netbsd_cpp_spec)"
+
+
+/* Output assembler code to FILE to call the profiler.  */
+
+#undef FUNCTION_PROFILER
+#define FUNCTION_PROFILER(FILE, LABELNO)				\
+{									\
+  if (TARGET_64BIT && flag_pic)						\
+    fprintf (FILE, "\tcall *__mcount@PLT\n");				\
+  else if (flag_pic)							\
+    fprintf (FILE, "\tcall *__mcount@PLT\n");				\
+  else									\
+    fprintf (FILE, "\tcall __mcount\n");				\
+}
+
+#define HAVE_ENABLE_EXECUTE_STACK
diff --git a/gcc-4.9/gcc/config/i386/nmmintrin.h b/gcc-4.9/gcc/config/i386/nmmintrin.h
new file mode 100644
index 000000000..9fc710736
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/nmmintrin.h
@@ -0,0 +1,33 @@
+/* Copyright (C) 2007-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 10.0.  */
+
+#ifndef _NMMINTRIN_H_INCLUDED
+#define _NMMINTRIN_H_INCLUDED
+
+/* We just include SSE4.1 header file.  */
+#include <smmintrin.h>
+
+#endif /* _NMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/nto.h b/gcc-4.9/gcc/config/i386/nto.h
new file mode 100644
index 000000000..2abb98751
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/nto.h
@@ -0,0 +1,105 @@
+/* Definitions for Intel 386 running QNX/Neutrino.
+   Copyright (C) 2002-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#undef  DEFAULT_PCC_STRUCT_RETURN
+#define DEFAULT_PCC_STRUCT_RETURN 1
+
+#undef TARGET_OS_CPP_BUILTINS
+#define TARGET_OS_CPP_BUILTINS()		\
+  do						\
+    {						\
+        builtin_define ("__X86__");		\
+        builtin_define ("__QNXNTO__");		\
+        builtin_define ("__QNX__");		\
+        builtin_define ("__ELF__");		\
+        builtin_define ("__LITTLEENDIAN__");	\
+        builtin_assert ("system=qnx");		\
+        builtin_assert ("system=qnxnto");	\
+        builtin_assert ("system=nto");		\
+        builtin_assert ("system=unix");		\
+    }						\
+  while (0)
+
+#undef THREAD_MODEL_SPEC
+#define THREAD_MODEL_SPEC "posix"
+
+#ifdef CROSS_DIRECTORY_STRUCTURE
+#define SYSROOT_SUFFIX_SPEC "x86"
+#endif
+
+#ifndef CROSS_DIRECTORY_STRUCTURE
+#undef MD_EXEC_PREFIX
+#define MD_EXEC_PREFIX "/usr/ccs/bin/"
+
+#undef MD_STARTFILE_PREFIX
+#define MD_STARTFILE_PREFIX "/usr/ccs/lib/"
+#endif
+
+#undef STARTFILE_SPEC
+#define STARTFILE_SPEC \
+"%{!shared: \
+  %{!symbolic: \
+    %{pg:mcrt1.o%s} \
+    %{!pg:%{p:mcrt1.o%s} \
+    %{!p:crt1.o%s}}}} \
+crti.o%s \
+%{fexceptions: crtbegin.o%s} \
+%{!fexceptions: %R/lib/crtbegin.o}"
+
+#undef ENDFILE_SPEC
+#define ENDFILE_SPEC \
+  "crtend.o%s crtn.o%s"
+
+#undef LINK_SPEC
+#define LINK_SPEC \
+  "%{h*} %{v:-V} \
+   %{static:-dn -Bstatic} \
+   %{shared:-G -dy -z text} \
+   %{symbolic:-Bsymbolic -G -dy -z text} \
+   %{G:-G} \
+   %{YP,*} \
+   %{!YP,*:%{p:-Y P,%R/lib} \
+    %{!p:-Y P,%R/lib}} \
+   %{Qy:} %{!Qn:-Qy} \
+   -m i386nto \
+   %{!shared: --dynamic-linker /usr/lib/ldqnx.so.2}"
+
+#undef	LIB_SPEC
+#define LIB_SPEC "%{!shared:%{!symbolic:-lc}}"
+
+#undef  ASM_SPEC
+#define ASM_SPEC ""
+
+#undef SIZE_TYPE
+#define SIZE_TYPE "unsigned int"
+
+#undef PTRDIFF_TYPE
+#define PTRDIFF_TYPE "int"
+
+#undef WCHAR_TYPE
+#define WCHAR_TYPE "long unsigned int"
+
+#undef WCHAR_TYPE_SIZE
+#define WCHAR_TYPE_SIZE BITS_PER_WORD
+
+#define NO_IMPLICIT_EXTERN_C 1
+
+#define TARGET_POSIX_IO
+
+#undef DBX_REGISTER_NUMBER
diff --git a/gcc-4.9/gcc/config/i386/nto.opt b/gcc-4.9/gcc/config/i386/nto.opt
new file mode 100644
index 000000000..007894201
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/nto.opt
@@ -0,0 +1,32 @@
+; QNX options.
+
+; Copyright (C) 2011-2014 Free Software Foundation, Inc.
+;
+; This file is part of GCC.
+;
+; GCC is free software; you can redistribute it and/or modify it under
+; the terms of the GNU General Public License as published by the Free
+; Software Foundation; either version 3, or (at your option) any later
+; version.
+;
+; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+; WARRANTY; without even the implied warranty of MERCHANTABILITY or
+; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+; for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with GCC; see the file COPYING3.  If not see
+; <http://www.gnu.org/licenses/>.
+
+; See the GCC internals manual (options.texi) for a description of
+; this file's format.
+
+; Please try to keep this file in ASCII collating order.
+
+G
+Driver
+
+YP,
+Driver Joined
+
+; This comment is to ensure we retain the blank line above.
diff --git a/gcc-4.9/gcc/config/i386/openbsd.h b/gcc-4.9/gcc/config/i386/openbsd.h
new file mode 100644
index 000000000..f313d5cd1
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/openbsd.h
@@ -0,0 +1,101 @@
+/* Configuration for an OpenBSD i386 target.
+   Copyright (C) 1999-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+
+/* This goes away when the math-emulator is fixed */
+#undef TARGET_SUBTARGET_DEFAULT
+#define TARGET_SUBTARGET_DEFAULT \
+  (MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_NO_FANCY_MATH_387)
+
+#define TARGET_OS_CPP_BUILTINS()		\
+  do						\
+    {						\
+	builtin_define ("__unix__");		\
+	builtin_define ("__OpenBSD__");		\
+	builtin_assert ("system=unix");		\
+	builtin_assert ("system=bsd");		\
+	builtin_assert ("system=OpenBSD");	\
+    }						\
+  while (0)
+
+/* Layout of source language data types.  */
+
+/* This must agree with <machine/ansi.h> */
+#undef SIZE_TYPE
+#define SIZE_TYPE "unsigned int"
+
+#undef PTRDIFF_TYPE
+#define PTRDIFF_TYPE "int"
+
+#undef WCHAR_TYPE
+#define WCHAR_TYPE "int"
+
+#undef WCHAR_TYPE_SIZE
+#define WCHAR_TYPE_SIZE 32
+
+/* Assembler format: overall framework.  */
+
+#undef ASM_APP_ON
+#define ASM_APP_ON "#APP\n"
+
+#undef ASM_APP_OFF
+#define ASM_APP_OFF "#NO_APP\n"
+
+/* Stack & calling: aggregate returns.  */
+
+/* Don't default to pcc-struct-return, because gcc is the only compiler, and
+   we want to retain compatibility with older gcc versions.  */
+#define DEFAULT_PCC_STRUCT_RETURN 0
+
+/* Assembler format: alignment output.  */
+
+/* Kludgy test: when gas is upgraded, it will have p2align, and no problems
+   with nops.  */
+#ifndef HAVE_GAS_MAX_SKIP_P2ALIGN
+/* i386 OpenBSD still uses an older gas that doesn't insert nops by default
+   when the .align directive demands to insert extra space in the text
+   segment.  */
+#undef ASM_OUTPUT_ALIGN
+#define ASM_OUTPUT_ALIGN(FILE,LOG) \
+  if ((LOG)!=0) fprintf ((FILE), "\t.align %d,0x90\n", (LOG))
+#endif
+
+/* Stack & calling: profiling.  */
+
+/* OpenBSD's profiler recovers all information from the stack pointer.
+   The icky part is not here, but in machine/profile.h.  */
+#undef FUNCTION_PROFILER
+#define FUNCTION_PROFILER(FILE, LABELNO)  \
+  fputs (flag_pic ? "\tcall mcount@PLT\n": "\tcall mcount\n", FILE);
+
+/* Assembler format: exception region output.  */
+
+/* All configurations that don't use elf must be explicit about not using
+   dwarf unwind information.  */
+#define DWARF2_UNWIND_INFO 0
+
+#undef ASM_PREFERRED_EH_DATA_FORMAT
+
+#undef ASM_COMMENT_START
+#define ASM_COMMENT_START ";#"
+
+/* OpenBSD gas currently does not support quad, so do not use it.  */
+#undef ASM_QUAD
+
+#define TARGET_HAVE_NAMED_SECTIONS false
diff --git a/gcc-4.9/gcc/config/i386/openbsdelf.h b/gcc-4.9/gcc/config/i386/openbsdelf.h
new file mode 100644
index 000000000..46ae0b6cd
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/openbsdelf.h
@@ -0,0 +1,119 @@
+/* Configuration for an OpenBSD i386 target.
+   
+   Copyright (C) 2005-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define TARGET_OS_CPP_BUILTINS()		\
+  do						\
+    {						\
+    	OPENBSD_OS_CPP_BUILTINS();		\
+    }						\
+  while (0)
+
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n) \
+  (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n])
+
+/* This must agree with <machine/_types.h>.  */
+#undef SIZE_TYPE
+#define SIZE_TYPE "long unsigned int"
+
+#undef PTRDIFF_TYPE
+#define PTRDIFF_TYPE "long int"
+
+#undef WCHAR_TYPE
+#define WCHAR_TYPE "int"
+
+#undef WCHAR_TYPE_SIZE
+#define WCHAR_TYPE_SIZE 32
+
+#undef WINT_TYPE
+#define WINT_TYPE "int"
+
+/* Don't default to pcc-struct-return, because gcc is the only compiler, and
+   we want to retain compatibility with older gcc versions.  */
+
+#undef DEFAULT_PCC_STRUCT_RETURN
+#define DEFAULT_PCC_STRUCT_RETURN 0
+
+/* Override the default comment-starter of "/".  */
+#undef ASM_COMMENT_START
+#define ASM_COMMENT_START "#"
+
+#undef ASM_APP_ON
+#define ASM_APP_ON "#APP\n"
+
+#undef ASM_APP_OFF
+#define ASM_APP_OFF "#NO_APP\n"
+
+/* A C statement to output to the stdio stream FILE an assembler
+   command to advance the location counter to a multiple of 1<<LOG
+   bytes if it is within MAX_SKIP bytes.
+
+   This is used to align code labels according to Intel recommendations.  */
+
+#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
+#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP)			\
+  do {									\
+    if ((LOG) != 0) {							\
+      if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG));	\
+      else {								\
+	fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP));	\
+	/* Make sure that we have at least 8 byte alignment if > 8 byte \
+	   alignment is preferred.  */					\
+	if ((LOG) > 3							\
+	    && (1 << (LOG)) > ((MAX_SKIP) + 1)				\
+	    && (MAX_SKIP) >= 7)						\
+	  fputs ("\t.p2align 3\n", (FILE));				\
+      }									\
+    }									\
+  } while (0)
+#endif
+
+/* OpenBSD's profiler recovers all information from the stack pointer.
+   The icky part is not here, but in <machine/profile.h>.  */
+#undef FUNCTION_PROFILER
+#define FUNCTION_PROFILER(FILE, LABELNO)  \
+  fputs (flag_pic ? "\tcall __mcount@PLT\n": "\tcall __mcount\n", FILE);
+
+#undef LINK_SPEC
+#define LINK_SPEC \
+  "%{!shared:%{!nostdlib:%{!r:%{!e*:-e __start}}}} \
+   %{shared:-shared} %{R*} \
+   %{static:-Bstatic} \
+   %{!static:-Bdynamic} \
+   %{assert*} \
+   -dynamic-linker /usr/libexec/ld.so"
+
+#undef STARTFILE_SPEC
+#define STARTFILE_SPEC "\
+	%{!shared: %{pg:gcrt0%O%s} %{!pg:%{p:gcrt0%O%s} %{!p:crt0%O%s}} \
+	crtbegin%O%s} %{shared:crtbeginS%O%s}"
+
+#undef ENDFILE_SPEC
+#define ENDFILE_SPEC "%{!shared:crtend%O%s} %{shared:crtendS%O%s}"
+
+#define OBSD_HAS_CORRECT_SPECS
+
+#define HAVE_ENABLE_EXECUTE_STACK
+
+/* Put all *tf routines in libgcc.  */
+#undef LIBGCC2_HAS_TF_MODE
+#define LIBGCC2_HAS_TF_MODE 1
+#define LIBGCC2_TF_CEXT q
+#define TF_SIZE 113
diff --git a/gcc-4.9/gcc/config/i386/pentium.md b/gcc-4.9/gcc/config/i386/pentium.md
new file mode 100644
index 000000000..97fc55e2a
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/pentium.md
@@ -0,0 +1,306 @@
+;; Pentium Scheduling
+;; Copyright (C) 2002-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+;;
+;; The Pentium is an in-order core with two integer pipelines.
+
+;; True for insns that behave like prefixed insns on the Pentium.
+(define_attr "pent_prefix" "false,true"
+  (if_then_else (ior (eq_attr "prefix_0f" "1")
+  		     (ior (eq_attr "prefix_data16" "1")
+			  (eq_attr "prefix_rep" "1")))
+    (const_string "true")
+    (const_string "false")))
+
+;; Categorize how an instruction slots.
+
+;; The non-MMX Pentium slots an instruction with prefixes on U pipe only,
+;; while MMX Pentium can slot it on either U or V.  Model non-MMX Pentium
+;; rules, because it results in noticeably better code on non-MMX Pentium
+;; and doesn't hurt much on MMX.  (Prefixed instructions are not very
+;; common, so the scheduler usually has a non-prefixed insn to pair).
+
+(define_attr "pent_pair" "uv,pu,pv,np"
+  (cond [(eq_attr "imm_disp" "true")
+	   (const_string "np")
+	 (ior (eq_attr "type" "alu1,alu,imov,icmp,test,lea,incdec")
+	      (and (eq_attr "type" "pop,push")
+		   (eq_attr "memory" "!both")))
+	   (if_then_else (eq_attr "pent_prefix" "true")
+	     (const_string "pu")
+	     (const_string "uv"))
+	 (eq_attr "type" "ibr")
+	   (const_string "pv")
+	 (and (eq_attr "type" "ishift")
+	      (match_operand 2 "const_int_operand"))
+	   (const_string "pu")
+	 (and (eq_attr "type" "rotate")
+	      (match_operand 2 "const1_operand"))
+	   (const_string "pu")
+	 (and (eq_attr "type" "ishift1")
+	      (match_operand 1 "const_int_operand"))
+	   (const_string "pu")
+	 (and (eq_attr "type" "rotate1")
+	      (match_operand 1 "const1_operand"))
+	   (const_string "pu")
+	 (and (eq_attr "type" "call")
+	      (match_operand 0 "constant_call_address_operand"))
+	   (const_string "pv")
+	 (and (eq_attr "type" "callv")
+	      (match_operand 1 "constant_call_address_operand"))
+	   (const_string "pv")
+	]
+	(const_string "np")))
+
+(define_automaton "pentium,pentium_fpu")
+
+;; Pentium do have U and V pipes.  Instruction to both pipes
+;; are always issued together, much like on VLIW.
+;;
+;;                    predecode
+;;                   /         \
+;;               decodeu     decodev
+;;             /    |           |
+;;           fpu executeu    executev
+;;            |     |           |
+;;           fpu  retire     retire
+;;            |
+;;           fpu
+;; We add dummy "port" pipes allocated only first cycle of
+;; instruction to specify this behavior.
+
+(define_cpu_unit "pentium-portu,pentium-portv" "pentium")
+(define_cpu_unit "pentium-u,pentium-v" "pentium")
+(absence_set "pentium-portu" "pentium-u,pentium-v")
+(presence_set "pentium-portv" "pentium-portu")
+
+;; Floating point instructions can overlap with new issue of integer
+;; instructions.  We model only first cycle of FP pipeline, as it is
+;; fully pipelined.
+(define_cpu_unit "pentium-fp" "pentium_fpu")
+
+;; There is non-pipelined multiplier unit used for complex operations.
+(define_cpu_unit "pentium-fmul" "pentium_fpu")
+
+;; Pentium preserves memory ordering, so when load-execute-store
+;; instruction is executed together with other instruction loading
+;; data, the execution of the other instruction is delayed to very
+;; last cycle of first instruction, when data are bypassed.
+;; We model this by allocating "memory" unit when store is pending
+;; and using conflicting load units together.
+
+(define_cpu_unit "pentium-memory" "pentium")
+(define_cpu_unit "pentium-load0" "pentium")
+(define_cpu_unit "pentium-load1" "pentium")
+(absence_set "pentium-load0,pentium-load1" "pentium-memory")
+
+(define_reservation "pentium-load" "(pentium-load0 | pentium-load1)")
+(define_reservation "pentium-np" "(pentium-u + pentium-v)")
+(define_reservation "pentium-uv" "(pentium-u | pentium-v)")
+(define_reservation "pentium-portuv" "(pentium-portu | pentium-portv)")
+(define_reservation "pentium-firstu" "(pentium-u + pentium-portu)")
+(define_reservation "pentium-firstv" "(pentium-v + pentium-portuv)")
+(define_reservation "pentium-firstuv" "(pentium-uv + pentium-portuv)")
+(define_reservation "pentium-firstuload" "(pentium-load + pentium-firstu)")
+(define_reservation "pentium-firstvload" "(pentium-load + pentium-firstv)")
+(define_reservation "pentium-firstuvload" "(pentium-load + pentium-firstuv)
+					   | (pentium-firstv,pentium-v,
+					      (pentium-load+pentium-firstv))")
+(define_reservation "pentium-firstuboth" "(pentium-load + pentium-firstu
+					   + pentium-memory)")
+(define_reservation "pentium-firstvboth" "(pentium-load + pentium-firstv
+					   + pentium-memory)")
+(define_reservation "pentium-firstuvboth" "(pentium-load + pentium-firstuv
+					    + pentium-memory)
+					   | (pentium-firstv,pentium-v,
+					      (pentium-load+pentium-firstv))")
+
+;; Few common long latency instructions
+(define_insn_reservation "pent_mul" 11
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "imul"))
+  "pentium-np*11")
+
+(define_insn_reservation "pent_str" 12
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "str"))
+  "pentium-np*12")
+
+;; Integer division and some other long latency instruction block all
+;; units, including the FP pipe.  There is no value in modeling the
+;; latency of these instructions and not modeling the latency
+;; decreases the size of the DFA.
+(define_insn_reservation "pent_block" 1
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "idiv"))
+  "pentium-np+pentium-fp")
+
+;;  Moves usually have one cycle penalty, but there are exceptions.
+(define_insn_reservation "pent_fmov" 1
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "type" "fmov")
+	    (eq_attr "memory" "none,load")))
+  "(pentium-fp+pentium-np)")
+
+(define_insn_reservation "pent_fpmovxf" 3
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "type" "fmov")
+	    (and (eq_attr "memory" "load,store")
+		 (eq_attr "mode" "XF"))))
+  "(pentium-fp+pentium-np)*3")
+
+(define_insn_reservation "pent_fpstore" 2
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "type" "fmov")
+	    (ior (match_operand 1 "immediate_operand")
+		 (eq_attr "memory" "store"))))
+  "(pentium-fp+pentium-np)*2")
+
+(define_insn_reservation "pent_imov" 1
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "imov"))
+  "pentium-firstuv")
+
+;; Push and pop instructions have 1 cycle latency and special
+;; hardware bypass allows them to be paired with other push,pop
+;; and call instructions.
+(define_bypass 0 "pent_push,pent_pop" "pent_push,pent_pop,pent_call")
+(define_insn_reservation "pent_push" 1
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "type" "push")
+	    (eq_attr "memory" "store")))
+  "pentium-firstuv")
+
+(define_insn_reservation "pent_pop" 1
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "pop,leave"))
+  "pentium-firstuv")
+
+;; Call and branch instruction can execute in either pipe, but
+;; they are only pairable when in the v pipe.
+(define_insn_reservation "pent_call" 10
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "call,callv"))
+  "pentium-firstv,pentium-v*9")
+
+(define_insn_reservation "pent_branch" 1
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "ibr"))
+  "pentium-firstv")
+
+;; Floating point instruction dispatch in U pipe, but continue
+;; in FP pipeline allowing other instructions to be executed.
+(define_insn_reservation "pent_fp" 3
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "fop,fistp"))
+  "(pentium-firstu+pentium-fp),nothing,nothing")
+
+;; First two cycles of fmul are not pipelined.
+(define_insn_reservation "pent_fmul" 3
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "fmul"))
+  "(pentium-firstuv+pentium-fp+pentium-fmul),pentium-fmul,nothing")
+
+;; Long latency FP instructions overlap with integer instructions,
+;; but only last 2 cycles with FP ones.
+(define_insn_reservation "pent_fdiv" 39
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "fdiv"))
+  "(pentium-np+pentium-fp+pentium-fmul),
+   (pentium-fp+pentium-fmul)*36,pentium-fmul*2")
+
+(define_insn_reservation "pent_fpspc" 70
+  (and (eq_attr "cpu" "pentium")
+       (eq_attr "type" "fpspc"))
+  "(pentium-np+pentium-fp+pentium-fmul),
+   (pentium-fp+pentium-fmul)*67,pentium-fmul*2")
+
+;; Integer instructions.  Load/execute/store takes 3 cycles,
+;; load/execute 2 cycles and execute only one cycle.
+(define_insn_reservation "pent_uv_both" 3
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "uv")
+	    (eq_attr "memory" "both")))
+  "pentium-firstuvboth,pentium-uv+pentium-memory,pentium-uv")
+
+(define_insn_reservation "pent_u_both" 3
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "pu")
+	    (eq_attr "memory" "both")))
+  "pentium-firstuboth,pentium-u+pentium-memory,pentium-u")
+
+(define_insn_reservation "pent_v_both" 3
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "pv")
+	    (eq_attr "memory" "both")))
+  "pentium-firstvboth,pentium-v+pentium-memory,pentium-v")
+
+(define_insn_reservation "pent_np_both" 3
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "np")
+	    (eq_attr "memory" "both")))
+  "pentium-np,pentium-np,pentium-np")
+
+(define_insn_reservation "pent_uv_load" 2
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "uv")
+	    (eq_attr "memory" "load")))
+  "pentium-firstuvload,pentium-uv")
+
+(define_insn_reservation "pent_u_load" 2
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "pu")
+	    (eq_attr "memory" "load")))
+  "pentium-firstuload,pentium-u")
+
+(define_insn_reservation "pent_v_load" 2
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "pv")
+	    (eq_attr "memory" "load")))
+  "pentium-firstvload,pentium-v")
+
+(define_insn_reservation "pent_np_load" 2
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "np")
+	    (eq_attr "memory" "load")))
+  "pentium-np,pentium-np")
+
+(define_insn_reservation "pent_uv" 1
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "uv")
+	    (eq_attr "memory" "none")))
+  "pentium-firstuv")
+
+(define_insn_reservation "pent_u" 1
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "pu")
+	    (eq_attr "memory" "none")))
+  "pentium-firstu")
+
+(define_insn_reservation "pent_v" 1
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "pv")
+	    (eq_attr "memory" "none")))
+  "pentium-firstv")
+
+(define_insn_reservation "pent_np" 1
+  (and (eq_attr "cpu" "pentium")
+       (and (eq_attr "pent_pair" "np")
+	    (eq_attr "memory" "none")))
+  "pentium-np")
+
diff --git a/gcc-4.9/gcc/config/i386/pmm_malloc.h b/gcc-4.9/gcc/config/i386/pmm_malloc.h
new file mode 100644
index 000000000..3be2f3545
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/pmm_malloc.h
@@ -0,0 +1,57 @@
+/* Copyright (C) 2004-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _MM_MALLOC_H_INCLUDED
+#define _MM_MALLOC_H_INCLUDED
+
+#include <stdlib.h>
+
+/* We can't depend on <stdlib.h> since the prototype of posix_memalign
+   may not be visible.  */
+#ifndef __cplusplus
+extern int posix_memalign (void **, size_t, size_t);
+#else
+extern "C" int posix_memalign (void **, size_t, size_t) throw ();
+#endif
+
+static __inline void *
+_mm_malloc (size_t size, size_t alignment)
+{
+  void *ptr;
+  if (alignment == 1)
+    return malloc (size);
+  if (alignment == 2 || (sizeof (void *) == 8 && alignment == 4))
+    alignment = sizeof (void *);
+  if (posix_memalign (&ptr, alignment, size) == 0)
+    return ptr;
+  else
+    return NULL;
+}
+
+static __inline void
+_mm_free (void * ptr)
+{
+  free (ptr);
+}
+
+#endif /* _MM_MALLOC_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/pmmintrin.h b/gcc-4.9/gcc/config/i386/pmmintrin.h
new file mode 100644
index 000000000..6a795005c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/pmmintrin.h
@@ -0,0 +1,132 @@
+/* Copyright (C) 2003-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef _PMMINTRIN_H_INCLUDED
+#define _PMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE2 and SSE header files*/
+#include <emmintrin.h>
+
+#ifndef __SSE3__
+#pragma GCC push_options
+#pragma GCC target("sse3")
+#define __DISABLE_SSE3__
+#endif /* __SSE3__ */
+
+/* Additional bits in the MXCSR.  */
+#define _MM_DENORMALS_ZERO_MASK		0x0040
+#define _MM_DENORMALS_ZERO_ON		0x0040
+#define _MM_DENORMALS_ZERO_OFF		0x0000
+
+#define _MM_SET_DENORMALS_ZERO_MODE(mode) \
+  _mm_setcsr ((_mm_getcsr () & ~_MM_DENORMALS_ZERO_MASK) | (mode))
+#define _MM_GET_DENORMALS_ZERO_MODE() \
+  (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_ps (__m128 __X, __m128 __Y)
+{
+  return (__m128) __builtin_ia32_addsubps ((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_ps (__m128 __X, __m128 __Y)
+{
+  return (__m128) __builtin_ia32_haddps ((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_ps (__m128 __X, __m128 __Y)
+{
+  return (__m128) __builtin_ia32_hsubps ((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehdup_ps (__m128 __X)
+{
+  return (__m128) __builtin_ia32_movshdup ((__v4sf)__X);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_moveldup_ps (__m128 __X)
+{
+  return (__m128) __builtin_ia32_movsldup ((__v4sf)__X);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) __builtin_ia32_addsubpd ((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) __builtin_ia32_haddpd ((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) __builtin_ia32_hsubpd ((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loaddup_pd (double const *__P)
+{
+  return _mm_load1_pd (__P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movedup_pd (__m128d __X)
+{
+  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lddqu_si128 (__m128i const *__P)
+{
+  return (__m128i) __builtin_ia32_lddqu ((char const *)__P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_monitor (void const * __P, unsigned int __E, unsigned int __H)
+{
+  __builtin_ia32_monitor (__P, __E, __H);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mwait (unsigned int __E, unsigned int __H)
+{
+  __builtin_ia32_mwait (__E, __H);
+}
+
+#ifdef __DISABLE_SSE3__
+#undef __DISABLE_SSE3__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE3__ */
+
+#endif /* _PMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/popcntintrin.h b/gcc-4.9/gcc/config/i386/popcntintrin.h
new file mode 100644
index 000000000..41845d868
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/popcntintrin.h
@@ -0,0 +1,53 @@
+/* Copyright (C) 2009-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _POPCNTINTRIN_H_INCLUDED
+#define _POPCNTINTRIN_H_INCLUDED
+
+#ifndef __POPCNT__
+#pragma GCC push_options
+#pragma GCC target("popcnt")
+#define __DISABLE_POPCNT__
+#endif /* __POPCNT__ */
+
+/* Calculate a number of bits set to 1.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_u32 (unsigned int __X)
+{
+  return __builtin_popcount (__X);
+}
+
+#ifdef __x86_64__
+extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_u64 (unsigned long long __X)
+{
+  return __builtin_popcountll (__X);
+}
+#endif
+
+#ifdef __DISABLE_POPCNT__
+#undef __DISABLE_POPCNT__
+#pragma GCC pop_options
+#endif  /* __DISABLE_POPCNT__ */
+
+#endif /* _POPCNTINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/ppro.md b/gcc-4.9/gcc/config/i386/ppro.md
new file mode 100644
index 000000000..25b2a546c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/ppro.md
@@ -0,0 +1,758 @@
+;; Scheduling for the Intel P6 family of processors
+;; Copyright (C) 2004-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.  */
+
+;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
+;; and Xeon lines of CPUs.  The DFA scheduler description in this file is
+;; based on information that can be found in the following three documents:
+;;
+;;    "P6 Family of Processors Hardware Developer's Manual",
+;;    Intel, September 1999.
+;;
+;;    "Intel Architecture Optimization Manual",
+;;    Intel, 1999 (Order Number: 245127-001).
+;;
+;;    "How to optimize for the Pentium family of microprocessors",
+;;    by Agner Fog, PhD.
+;;
+;; The P6 pipeline has three major components:
+;;   1) the FETCH/DECODE unit, an in-order issue front-end
+;;   2) the DISPATCH/EXECUTE unit, which is the out-of-order core
+;;   3) the RETIRE unit, an in-order retirement unit
+;;
+;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
+;; retirement unit are naturally in-order.
+;;
+;;                       BUS INTERFACE UNIT
+;;                     /                   \
+;;                L1 ICACHE             L1 DCACHE
+;;              /     |     \              |     \
+;;       DECODER0  DECODER1  DECODER2  DISP/EXEC  RETIRE
+;;              \     |     /              |        |
+;;            INSTRUCTION POOL   __________|_______/
+;;          (inc. reorder buffer)
+;;
+;; Since the P6 CPUs execute instructions out-of-order, the most important
+;; consideration in performance tuning is making sure enough micro-ops are
+;; ready for execution in the out-of-order core, while not stalling the
+;; decoder.
+;;
+;; TODO:
+;; - Find a less crude way to model complex instructions, in
+;;   particular how many cycles they take to be decoded.
+;; - Include decoder latencies in the total reservation latencies.
+;;   This isn't necessary right now because we assume for every
+;;   instruction that it never blocks a decoder.
+;; - Figure out where the p0 and p1 reservations come from.  These
+;;   appear not to be in the manual
+;; - Lots more because I'm sure this is still far from optimal :-)
+
+;; The ppro_idiv and ppro_fdiv automata are used to model issue
+;; latencies of idiv and fdiv type insns.
+(define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")
+
+;; Simple instructions of the register-register form have only one uop.
+;; Load instructions are also only one uop.  Store instructions decode to
+;; two uops, and simple read-modify instructions also take two uops.
+;; Simple instructions of the register-memory form have two to three uops.
+;; Simple read-modify-write instructions have four uops.  The rules for
+;; the decoder are simple:
+;;  - an instruction with 1 uop can be decoded by any of the three
+;;    decoders in one cycle.
+;;  - an instruction with 1 to 4 uops can be decoded only by decoder 0
+;;    but still in only one cycle.
+;;  - a complex (microcode) instruction can also only be decoded by
+;;    decoder 0, and this takes an unspecified number of cycles.
+;;
+;; The goal is to schedule such that we have a few-one-one uops sequence
+;; in each cycle, to decode as many instructions per cycle as possible.
+(define_cpu_unit "decoder0" "ppro_decoder")
+(define_cpu_unit "decoder1" "ppro_decoder")
+(define_cpu_unit "decoder2" "ppro_decoder")
+
+;; We first wish to find an instruction for decoder0, so exclude
+;; decoder1 and decoder2 from being reserved until decoder 0 is
+;; reserved.
+(presence_set "decoder1" "decoder0")
+(presence_set "decoder2" "decoder0")
+
+;; Most instructions can be decoded on any of the three decoders.
+(define_reservation "decodern" "(decoder0|decoder1|decoder2)")
+
+;; The out-of-order core has five pipelines.  During each cycle, the core
+;; may dispatch zero or one uop on the port of any of the five pipelines
+;; so the maximum number of dispatched uops per cycle is 5.  In practicer,
+;; 3 uops per cycle is more realistic.
+;;
+;; Two of the five pipelines contain several execution units:
+;;
+;; Port 0	Port 1		Port 2		Port 3		Port 4
+;; ALU		ALU		LOAD		SAC		SDA
+;; FPU		JUE
+;; AGU		MMX
+;; MMX		P3FPU
+;; P3FPU
+;;
+;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
+;;  JUE = Jump Execution Unit, AGU = Address Generation Unit)
+;;
+(define_cpu_unit "p0,p1" "ppro_core")
+(define_cpu_unit "p2" "ppro_load")
+(define_cpu_unit "p3,p4" "ppro_store")
+(define_cpu_unit "idiv" "ppro_idiv")
+(define_cpu_unit "fdiv" "ppro_fdiv")
+
+;; Only the irregular instructions have to be modeled here.  A load
+;; increases the latency by 2 or 3, or by nothing if the manual gives
+;; a latency already.  Store latencies are not accounted for.
+;;
+;; The simple instructions follow a very regular pattern of 1 uop per
+;; reg-reg operation, 1 uop per load on port 2. and 2 uops per store
+;; on port 4 and port 3.  These instructions are modelled at the bottom
+;; of this file.
+;;
+;; For microcoded instructions we don't know how many uops are produced.
+;; These instructions are the "complex" ones in the Intel manuals.  All
+;; we _do_ know is that they typically produce four or more uops, so
+;; they can only be decoded on decoder0.  Modelling their latencies
+;; doesn't make sense because we don't know how these instructions are
+;; executed in the core.  So we just model that they can only be decoded
+;; on decoder 0, and say that it takes a little while before the result
+;; is available.
+(define_insn_reservation "ppro_complex_insn" 6
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (eq_attr "type" "other,multi,call,callv,str"))
+			 "decoder0")
+
+;; imov with memory operands does not use the integer units.
+(define_insn_reservation "ppro_imov" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "imov")))
+			 "decodern,(p0|p1)")
+
+(define_insn_reservation "ppro_imov_load" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "imov")))
+			 "decodern,p2")
+
+(define_insn_reservation "ppro_imov_store" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "store")
+				   (eq_attr "type" "imov")))
+			 "decoder0,p4+p3")
+
+;; imovx always decodes to one uop, and also doesn't use the integer
+;; units if it has memory operands.
+(define_insn_reservation "ppro_imovx" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "imovx")))
+			 "decodern,(p0|p1)")
+
+(define_insn_reservation "ppro_imovx_load" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "imovx")))
+			 "decodern,p2")
+
+;; lea executes on port 0 with latency one and throughput 1.
+(define_insn_reservation "ppro_lea" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "lea")))
+			 "decodern,p0")
+
+;; Shift and rotate execute on port 0 with latency and throughput 1.
+;; The load and store units need to be reserved when memory operands
+;; are involved.
+(define_insn_reservation "ppro_shift_rotate" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
+			 "decodern,p0")
+
+(define_insn_reservation "ppro_shift_rotate_mem" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "!none")
+				   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
+			 "decoder0,p2+p0,p4+p3")
+
+
+;; The P6 has a sophisticated branch prediction mechanism to minimize
+;; latencies due to branching.  In particular, it has a fast way to
+;; execute branches that are taken multiple times (such as in loops).
+;; Branches not taken suffer no penalty, and correctly predicted
+;; branches cost only one fetch cycle.  Mispredicted branches are very
+;; costly: typically 15 cycles and possibly as many as 26 cycles.
+;;
+;; Unfortunately all this makes it quite difficult to properly model
+;; the latencies for the compiler.  Here I've made the choice to be
+;; optimistic and assume branches are often predicted correctly, so
+;; they have latency 1, and the decoders are not blocked.
+;;
+;; In addition, the model assumes a branch always decodes to only 1 uop,
+;; which is not exactly true because there are a few instructions that
+;; decode to 2 uops or microcode.  But this probably gives the best
+;; results because we can assume these instructions can decode on all
+;; decoders.
+(define_insn_reservation "ppro_branch" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "ibr")))
+			 "decodern,p1")
+
+;; ??? Indirect branches probably have worse latency than this.
+(define_insn_reservation "ppro_indirect_branch" 6
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "!none")
+				   (eq_attr "type" "ibr")))
+			 "decoder0,p2+p1")
+
+(define_insn_reservation "ppro_leave" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (eq_attr "type" "leave"))
+			 "decoder0,p2+(p0|p1),(p0|p1)")
+
+;; imul has throughput one, but latency 4, and can only execute on port 0.
+(define_insn_reservation "ppro_imul" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "imul")))
+			 "decodern,p0")
+
+(define_insn_reservation "ppro_imul_mem" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "!none")
+				   (eq_attr "type" "imul")))
+			 "decoder0,p2+p0")
+
+;; div and idiv are very similar, so we model them the same.
+;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
+;; These issue latencies are modelled via the ppro_div automaton.
+(define_insn_reservation "ppro_idiv_QI" 19
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "QI")
+					(eq_attr "type" "idiv"))))
+			 "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
+
+(define_insn_reservation "ppro_idiv_QI_load" 19
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "QI")
+					(eq_attr "type" "idiv"))))
+			 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
+
+(define_insn_reservation "ppro_idiv_HI" 23
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "HI")
+					(eq_attr "type" "idiv"))))
+			 "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
+
+(define_insn_reservation "ppro_idiv_HI_load" 23
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "HI")
+					(eq_attr "type" "idiv"))))
+			 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
+
+(define_insn_reservation "ppro_idiv_SI" 39
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SI")
+					(eq_attr "type" "idiv"))))
+			 "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
+
+(define_insn_reservation "ppro_idiv_SI_load" 39
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "SI")
+					(eq_attr "type" "idiv"))))
+			 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")
+
+;; Floating point operations always execute on port 0.
+;; ??? where do these latencies come from? fadd has latency 3 and
+;;     has throughput "1/cycle (align with FADD)".  What do they
+;;     mean and how can we model that?
+(define_insn_reservation "ppro_fop" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none,unknown")
+				   (eq_attr "type" "fop")))
+			 "decodern,p0")
+
+(define_insn_reservation "ppro_fop_load" 5
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "fop")))
+			 "decoder0,p2+p0,p0")
+
+(define_insn_reservation "ppro_fop_store" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "store")
+				   (eq_attr "type" "fop")))
+			 "decoder0,p0,p0,p0+p4+p3")
+
+(define_insn_reservation "ppro_fop_both" 5
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "both")
+				   (eq_attr "type" "fop")))
+			 "decoder0,p2+p0,p0+p4+p3")
+
+(define_insn_reservation "ppro_fsgn" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (eq_attr "type" "fsgn"))
+			 "decodern,p0")
+
+(define_insn_reservation "ppro_fistp" 5
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (eq_attr "type" "fistp"))
+			 "decoder0,p0*2,p4+p3")
+
+(define_insn_reservation "ppro_fcmov" 2
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (eq_attr "type" "fcmov"))
+			 "decoder0,p0*2")
+
+(define_insn_reservation "ppro_fcmp" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "fcmp")))
+			 "decodern,p0")
+
+(define_insn_reservation "ppro_fcmp_load" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "fcmp")))
+			 "decoder0,p2+p0")
+
+(define_insn_reservation "ppro_fmov" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "fmov")))
+			 "decodern,p0")
+
+(define_insn_reservation "ppro_fmov_load" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "!XF")
+					(eq_attr "type" "fmov"))))
+			 "decodern,p2")
+
+(define_insn_reservation "ppro_fmov_XF_load" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "XF")
+					(eq_attr "type" "fmov"))))
+			 "decoder0,(p2+p0)*2")
+
+(define_insn_reservation "ppro_fmov_store" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "store")
+				   (and (eq_attr "mode" "!XF")
+					(eq_attr "type" "fmov"))))
+			 "decodern,p0")
+
+(define_insn_reservation "ppro_fmov_XF_store" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "store")
+				   (and (eq_attr "mode" "XF")
+					(eq_attr "type" "fmov"))))
+			 "decoder0,(p0+p4),(p0+p3)")
+
+;; fmul executes on port 0 with latency 5.  It has issue latency 2,
+;; but we don't model this.
+(define_insn_reservation "ppro_fmul" 5
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "fmul")))
+			 "decoder0,p0*2")
+
+(define_insn_reservation "ppro_fmul_load" 6
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "fmul")))
+			 "decoder0,p2+p0,p0")
+
+;; fdiv latencies depend on the mode of the operands.  XFmode gives
+;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
+;; Division by a power of 2 takes only 9 cycles, but we cannot model
+;; that.  Throughput is equal to latency - 1, which we model using the
+;; ppro_div automaton.
+(define_insn_reservation "ppro_fdiv_SF" 18
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "decodern,p0+fdiv,fdiv*16")
+
+(define_insn_reservation "ppro_fdiv_SF_load" 19
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "decoder0,p2+p0+fdiv,fdiv*16")
+
+(define_insn_reservation "ppro_fdiv_DF" 32
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "DF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "decodern,p0+fdiv,fdiv*30")
+
+(define_insn_reservation "ppro_fdiv_DF_load" 33
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "DF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "decoder0,p2+p0+fdiv,fdiv*30")
+
+(define_insn_reservation "ppro_fdiv_XF" 38
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "XF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "decodern,p0+fdiv,fdiv*36")
+
+(define_insn_reservation "ppro_fdiv_XF_load" 39
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "XF")
+					(eq_attr "type" "fdiv,fpspc"))))
+			 "decoder0,p2+p0+fdiv,fdiv*36")
+
+;; MMX instructions can execute on either port 0 or port 1 with a
+;; throughput of 1/cycle.
+;;   on port 0:	- ALU (latency 1)
+;;		- Multiplier Unit (latency 3)
+;;   on port 1:	- ALU (latency 1)
+;;		- Shift Unit (latency 1)
+;;
+;; MMX instructions are either of the type reg-reg, or read-modify, and
+;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
+;; so they behave as "simple" instructions that need no special modelling.
+;; We only have to model mmxshft and mmxmul.
+(define_insn_reservation "ppro_mmx_shft" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "mmxshft")))
+			 "decodern,p1")
+
+(define_insn_reservation "ppro_mmx_shft_load" 2
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "mmxshft")))
+			 "decoder0,p2+p1")
+
+(define_insn_reservation "ppro_mmx_mul" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "mmxmul")))
+			 "decodern,p0")
+
+(define_insn_reservation "ppro_mmx_mul_load" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (eq_attr "type" "mmxmul")))
+			 "decoder0,p2+p0")
+
+(define_insn_reservation "ppro_sse_mmxcvt" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "mode" "DI")
+				   (eq_attr "type" "mmxcvt")))
+			 "decodern,p1")
+
+;; FIXME: These are Pentium III only, but we cannot tell here if
+;; we're generating code for PentiumPro/Pentium II or Pentium III
+;; (define_insn_reservation "ppro_sse_mmxshft" 2
+;;			 (and (eq_attr "cpu" "pentiumpro")
+;;			      (and (eq_attr "mode" "DI")
+;;				   (eq_attr "type" "mmxshft")))
+;;			 "decodern,p0")
+
+;; SSE is very complicated, and takes a bit more effort.
+;; ??? I assumed that all SSE instructions decode on decoder0,
+;;     but is this correct?
+
+;; The sfence instruction.
+(define_insn_reservation "ppro_sse_sfence" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "unknown")
+				   (eq_attr "type" "sse")))
+			 "decoder0,p4+p3")
+
+;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
+(define_insn_reservation "ppro_sse_SF" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "mode" "SF")
+				   (eq_attr "type" "sse")))
+			 "decodern,p0")
+
+(define_insn_reservation "ppro_sse_add_SF" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "sseadd,sseadd1"))))
+			 "decodern,p1")
+
+(define_insn_reservation "ppro_sse_add_SF_load" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "sseadd,sseadd1"))))
+			 "decoder0,p2+p1")
+
+(define_insn_reservation "ppro_sse_cmp_SF" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssecmp"))))
+			 "decoder0,p1")
+
+(define_insn_reservation "ppro_sse_cmp_SF_load" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssecmp"))))
+			 "decoder0,p2+p1")
+
+(define_insn_reservation "ppro_sse_comi_SF" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssecomi"))))
+			 "decodern,p0")
+
+(define_insn_reservation "ppro_sse_comi_SF_load" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssecomi"))))
+			 "decoder0,p2+p0")
+
+(define_insn_reservation "ppro_sse_mul_SF" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssemul"))))
+			"decodern,p0")
+
+(define_insn_reservation "ppro_sse_mul_SF_load" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssemul"))))
+			"decoder0,p2+p0")
+
+;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
+(define_insn_reservation "ppro_sse_div_SF" 18
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssediv"))))
+			 "decoder0,p0*17")
+
+(define_insn_reservation "ppro_sse_div_SF_load" 18
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssediv"))))
+			 "decoder0,(p2+p0),p0*16")
+
+(define_insn_reservation "ppro_sse_icvt_SF" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "mode" "SF")
+				   (eq_attr "type" "sseicvt")))
+			 "decoder0,(p2+p1)*2")
+
+(define_insn_reservation "ppro_sse_icvt_SI" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "mode" "SI")
+				   (eq_attr "type" "sseicvt")))
+			 "decoder0,(p2+p1)")
+
+(define_insn_reservation "ppro_sse_mov_SF" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssemov"))))
+			 "decoder0,(p0|p1)")
+
+(define_insn_reservation "ppro_sse_mov_SF_load" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssemov"))))
+			 "decoder0,p2+(p0|p1)")
+
+(define_insn_reservation "ppro_sse_mov_SF_store" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "store")
+				   (and (eq_attr "mode" "SF")
+					(eq_attr "type" "ssemov"))))
+			 "decoder0,p4+p3")
+
+(define_insn_reservation "ppro_sse_V4SF" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "mode" "V4SF")
+				   (eq_attr "type" "sse")))
+			 "decoder0,p1*2")
+
+(define_insn_reservation "ppro_sse_add_V4SF" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "sseadd,sseadd1"))))
+			 "decoder0,p1*2")
+
+(define_insn_reservation "ppro_sse_add_V4SF_load" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "sseadd,sseadd1"))))
+			 "decoder0,(p2+p1)*2")
+
+(define_insn_reservation "ppro_sse_cmp_V4SF" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssecmp"))))
+			 "decoder0,p1*2")
+
+(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssecmp"))))
+			 "decoder0,(p2+p1)*2")
+
+(define_insn_reservation "ppro_sse_cvt_V4SF" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none,unknown")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssecvt"))))
+			 "decoder0,p1*2")
+
+(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "!none,unknown")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssecmp"))))
+			 "decoder0,p1,p4+p3")
+
+(define_insn_reservation "ppro_sse_mul_V4SF" 5
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssemul"))))
+			"decoder0,p0*2")
+
+(define_insn_reservation "ppro_sse_mul_V4SF_load" 5
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssemul"))))
+			"decoder0,(p2+p0)*2")
+
+;; FIXME: p0 really closed this long???
+(define_insn_reservation "ppro_sse_div_V4SF" 48
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssediv"))))
+			 "decoder0,p0*34")
+
+(define_insn_reservation "ppro_sse_div_V4SF_load" 48
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssediv"))))
+			 "decoder0,(p2+p0)*2,p0*32")
+
+(define_insn_reservation "ppro_sse_log_V4SF" 2
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
+			 "decodern,p1")
+
+(define_insn_reservation "ppro_sse_log_V4SF_load" 2
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
+			 "decoder0,(p2+p1)")
+
+(define_insn_reservation "ppro_sse_mov_V4SF" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssemov"))))
+			 "decoder0,(p0|p1)*2")
+
+(define_insn_reservation "ppro_sse_mov_V4SF_load" 2
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssemov"))))
+			 "decoder0,p2*2")
+
+(define_insn_reservation "ppro_sse_mov_V4SF_store" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "store")
+				   (and (eq_attr "mode" "V4SF")
+					(eq_attr "type" "ssemov"))))
+			 "decoder0,(p4+p3)*2")
+
+;; All other instructions are modelled as simple instructions.
+;; We have already modelled all i387 floating point instructions, so all
+;; other instructions execute on either port 0 or port 1.  This includes
+;; the ALU units, and the MMX units.
+;;
+;; reg-reg instructions produce 1 uop so they can be decoded on any of
+;; the three decoders.
+(define_insn_reservation "ppro_insn" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "none,unknown")
+				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
+			 "decodern,(p0|p1)")
+
+;; read-modify and register-memory instructions have 2 or three uops,
+;; so they have to be decoded on decoder0.
+(define_insn_reservation "ppro_insn_load" 3
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "load")
+				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
+			 "decoder0,p2+(p0|p1)")
+
+(define_insn_reservation "ppro_insn_store" 1
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "store")
+				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
+			 "decoder0,(p0|p1),p4+p3")
+
+;; read-modify-store instructions produce 4 uops so they have to be
+;; decoded on decoder0 as well.
+(define_insn_reservation "ppro_insn_both" 4
+			 (and (eq_attr "cpu" "pentiumpro")
+			      (and (eq_attr "memory" "both")
+				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
+			 "decoder0,p2+(p0|p1),p4+p3")
+
diff --git a/gcc-4.9/gcc/config/i386/predicates.md b/gcc-4.9/gcc/config/i386/predicates.md
new file mode 100644
index 000000000..0492241fd
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/predicates.md
@@ -0,0 +1,1424 @@
+;; Predicate definitions for IA-32 and x86-64.
+;; Copyright (C) 2004-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; Return true if OP is either a i387 or SSE fp register.
+(define_predicate "any_fp_register_operand"
+  (and (match_code "reg")
+       (match_test "ANY_FP_REGNO_P (REGNO (op))")))
+
+;; Return true if OP is an i387 fp register.
+(define_predicate "fp_register_operand"
+  (and (match_code "reg")
+       (match_test "STACK_REGNO_P (REGNO (op))")))
+
+;; Return true if OP is a non-fp register_operand.
+(define_predicate "register_and_not_any_fp_reg_operand"
+  (and (match_code "reg")
+       (not (match_test "ANY_FP_REGNO_P (REGNO (op))"))))
+
+;; True if the operand is a GENERAL class register.
+(define_predicate "general_reg_operand"
+  (and (match_code "reg")
+       (match_test "GENERAL_REG_P (op)")))
+
+;; Return true if OP is a register operand other than an i387 fp register.
+(define_predicate "register_and_not_fp_reg_operand"
+  (and (match_code "reg")
+       (not (match_test "STACK_REGNO_P (REGNO (op))"))))
+
+;; True if the operand is an MMX register.
+(define_predicate "mmx_reg_operand"
+  (and (match_code "reg")
+       (match_test "MMX_REGNO_P (REGNO (op))")))
+
+;; True if the operand is an SSE register.
+(define_predicate "sse_reg_operand"
+  (and (match_code "reg")
+       (match_test "SSE_REGNO_P (REGNO (op))")))
+
+;; True if the operand is an AVX-512 new register.
+(define_predicate "ext_sse_reg_operand"
+  (and (match_code "reg")
+       (match_test "EXT_REX_SSE_REGNO_P (REGNO (op))")))
+
+;; True if the operand is an AVX-512 mask register.
+(define_predicate "mask_reg_operand"
+  (and (match_code "reg")
+       (match_test "MASK_REGNO_P (REGNO (op))")))
+
+;; True if the operand is a Q_REGS class register.
+(define_predicate "q_regs_operand"
+  (match_operand 0 "register_operand")
+{
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+  return ANY_QI_REG_P (op);
+})
+
+;; Match an SI or HImode register for a zero_extract.
+(define_special_predicate "ext_register_operand"
+  (match_operand 0 "register_operand")
+{
+  if ((!TARGET_64BIT || GET_MODE (op) != DImode)
+      && GET_MODE (op) != SImode && GET_MODE (op) != HImode)
+    return false;
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+
+  /* Be careful to accept only registers having upper parts.  */
+  return (REG_P (op)
+	  && (REGNO (op) > LAST_VIRTUAL_REGISTER || REGNO (op) <= BX_REG));
+})
+
+;; Match nonimmediate operands, but exclude memory operands on 64bit targets.
+(define_predicate "nonimmediate_x64nomem_operand"
+  (if_then_else (match_test "TARGET_64BIT")
+    (match_operand 0 "register_operand")
+    (match_operand 0 "nonimmediate_operand")))
+
+;; Match general operands, but exclude memory operands on 64bit targets.
+(define_predicate "general_x64nomem_operand"
+  (if_then_else (match_test "TARGET_64BIT")
+    (match_operand 0 "nonmemory_operand")
+    (match_operand 0 "general_operand")))
+
+;; Return true if op is the AX register.
+(define_predicate "ax_reg_operand"
+  (and (match_code "reg")
+       (match_test "REGNO (op) == AX_REG")))
+
+;; Return true if op is the flags register.
+(define_predicate "flags_reg_operand"
+  (and (match_code "reg")
+       (match_test "REGNO (op) == FLAGS_REG")))
+
+;; Return true if op is one of QImode registers: %[abcd][hl].
+(define_predicate "QIreg_operand"
+  (match_test "QI_REG_P (op)"))
+
+;; Return true if op is a QImode register operand other than
+;; %[abcd][hl].
+(define_predicate "ext_QIreg_operand"
+  (and (match_code "reg")
+       (match_test "TARGET_64BIT")
+       (match_test "REGNO (op) > BX_REG")))
+
+;; Return true if VALUE can be stored in a sign extended immediate field.
+(define_predicate "x86_64_immediate_operand"
+  (match_code "const_int,symbol_ref,label_ref,const")
+{
+  if (!TARGET_64BIT)
+    return immediate_operand (op, mode);
+
+  switch (GET_CODE (op))
+    {
+    case CONST_INT:
+      /* CONST_DOUBLES never match, since HOST_BITS_PER_WIDE_INT is known
+         to be at least 32 and this all acceptable constants are
+	 represented as CONST_INT.  */
+      if (HOST_BITS_PER_WIDE_INT == 32)
+	return true;
+      else
+	{
+	  HOST_WIDE_INT val = trunc_int_for_mode (INTVAL (op), DImode);
+	  return trunc_int_for_mode (val, SImode) == val;
+	}
+      break;
+
+    case SYMBOL_REF:
+      /* For certain code models, the symbolic references are known to fit.
+	 in CM_SMALL_PIC model we know it fits if it is local to the shared
+	 library.  Don't count TLS SYMBOL_REFs here, since they should fit
+	 only if inside of UNSPEC handled below.  */
+      /* TLS symbols are not constant.  */
+      if (SYMBOL_REF_TLS_MODEL (op))
+	return false;
+      return (ix86_cmodel == CM_SMALL || ix86_cmodel == CM_KERNEL
+	      || (ix86_cmodel == CM_MEDIUM && !SYMBOL_REF_FAR_ADDR_P (op)));
+
+    case LABEL_REF:
+      /* For certain code models, the code is near as well.  */
+      return (ix86_cmodel == CM_SMALL || ix86_cmodel == CM_MEDIUM
+	      || ix86_cmodel == CM_KERNEL);
+
+    case CONST:
+      /* We also may accept the offsetted memory references in certain
+	 special cases.  */
+      if (GET_CODE (XEXP (op, 0)) == UNSPEC)
+	switch (XINT (XEXP (op, 0), 1))
+	  {
+	  case UNSPEC_GOTPCREL:
+	  case UNSPEC_DTPOFF:
+	  case UNSPEC_GOTNTPOFF:
+	  case UNSPEC_NTPOFF:
+	    return true;
+	  default:
+	    break;
+	  }
+
+      if (GET_CODE (XEXP (op, 0)) == PLUS)
+	{
+	  rtx op1 = XEXP (XEXP (op, 0), 0);
+	  rtx op2 = XEXP (XEXP (op, 0), 1);
+	  HOST_WIDE_INT offset;
+
+	  if (ix86_cmodel == CM_LARGE)
+	    return false;
+	  if (!CONST_INT_P (op2))
+	    return false;
+	  offset = trunc_int_for_mode (INTVAL (op2), DImode);
+	  switch (GET_CODE (op1))
+	    {
+	    case SYMBOL_REF:
+	      /* TLS symbols are not constant.  */
+	      if (SYMBOL_REF_TLS_MODEL (op1))
+		return false;
+	      /* For CM_SMALL assume that latest object is 16MB before
+		 end of 31bits boundary.  We may also accept pretty
+		 large negative constants knowing that all objects are
+		 in the positive half of address space.  */
+	      if ((ix86_cmodel == CM_SMALL
+		   || (ix86_cmodel == CM_MEDIUM
+		       && !SYMBOL_REF_FAR_ADDR_P (op1)))
+		  && offset < 16*1024*1024
+		  && trunc_int_for_mode (offset, SImode) == offset)
+		return true;
+	      /* For CM_KERNEL we know that all object resist in the
+		 negative half of 32bits address space.  We may not
+		 accept negative offsets, since they may be just off
+		 and we may accept pretty large positive ones.  */
+	      if (ix86_cmodel == CM_KERNEL
+		  && offset > 0
+		  && trunc_int_for_mode (offset, SImode) == offset)
+		return true;
+	      break;
+
+	    case LABEL_REF:
+	      /* These conditions are similar to SYMBOL_REF ones, just the
+		 constraints for code models differ.  */
+	      if ((ix86_cmodel == CM_SMALL || ix86_cmodel == CM_MEDIUM)
+		  && offset < 16*1024*1024
+		  && trunc_int_for_mode (offset, SImode) == offset)
+		return true;
+	      if (ix86_cmodel == CM_KERNEL
+		  && offset > 0
+		  && trunc_int_for_mode (offset, SImode) == offset)
+		return true;
+	      break;
+
+	    case UNSPEC:
+	      switch (XINT (op1, 1))
+		{
+		case UNSPEC_DTPOFF:
+		case UNSPEC_NTPOFF:
+		  if (trunc_int_for_mode (offset, SImode) == offset)
+		    return true;
+		}
+	      break;
+
+	    default:
+	      break;
+	    }
+	}
+      break;
+
+      default:
+	gcc_unreachable ();
+    }
+
+  return false;
+})
+
+;; Return true if VALUE can be stored in the zero extended immediate field.
+(define_predicate "x86_64_zext_immediate_operand"
+  (match_code "const_double,const_int,symbol_ref,label_ref,const")
+{
+  switch (GET_CODE (op))
+    {
+    case CONST_DOUBLE:
+      if (HOST_BITS_PER_WIDE_INT == 32)
+	return (GET_MODE (op) == VOIDmode && !CONST_DOUBLE_HIGH (op));
+      else
+	return false;
+
+    case CONST_INT:
+      if (HOST_BITS_PER_WIDE_INT == 32)
+	return INTVAL (op) >= 0;
+      else
+	return !(INTVAL (op) & ~(HOST_WIDE_INT) 0xffffffff);
+
+    case SYMBOL_REF:
+      /* For certain code models, the symbolic references are known to fit.  */
+      /* TLS symbols are not constant.  */
+      if (SYMBOL_REF_TLS_MODEL (op))
+	return false;
+      return (ix86_cmodel == CM_SMALL
+	      || (ix86_cmodel == CM_MEDIUM
+		  && !SYMBOL_REF_FAR_ADDR_P (op)));
+
+    case LABEL_REF:
+      /* For certain code models, the code is near as well.  */
+      return ix86_cmodel == CM_SMALL || ix86_cmodel == CM_MEDIUM;
+
+    case CONST:
+      /* We also may accept the offsetted memory references in certain
+	 special cases.  */
+      if (GET_CODE (XEXP (op, 0)) == PLUS)
+	{
+	  rtx op1 = XEXP (XEXP (op, 0), 0);
+	  rtx op2 = XEXP (XEXP (op, 0), 1);
+
+	  if (ix86_cmodel == CM_LARGE)
+	    return false;
+	  switch (GET_CODE (op1))
+	    {
+	    case SYMBOL_REF:
+	      /* TLS symbols are not constant.  */
+	      if (SYMBOL_REF_TLS_MODEL (op1))
+		return false;
+	      /* For small code model we may accept pretty large positive
+		 offsets, since one bit is available for free.  Negative
+		 offsets are limited by the size of NULL pointer area
+		 specified by the ABI.  */
+	      if ((ix86_cmodel == CM_SMALL
+		   || (ix86_cmodel == CM_MEDIUM
+		       && !SYMBOL_REF_FAR_ADDR_P (op1)))
+		  && CONST_INT_P (op2)
+		  && trunc_int_for_mode (INTVAL (op2), DImode) > -0x10000
+		  && trunc_int_for_mode (INTVAL (op2), SImode) == INTVAL (op2))
+		return true;
+	      /* ??? For the kernel, we may accept adjustment of
+		 -0x10000000, since we know that it will just convert
+		 negative address space to positive, but perhaps this
+		 is not worthwhile.  */
+	      break;
+
+	    case LABEL_REF:
+	      /* These conditions are similar to SYMBOL_REF ones, just the
+		 constraints for code models differ.  */
+	      if ((ix86_cmodel == CM_SMALL || ix86_cmodel == CM_MEDIUM)
+		  && CONST_INT_P (op2)
+		  && trunc_int_for_mode (INTVAL (op2), DImode) > -0x10000
+		  && trunc_int_for_mode (INTVAL (op2), SImode) == INTVAL (op2))
+		return true;
+	      break;
+
+	    default:
+	      return false;
+	    }
+	}
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+  return false;
+})
+
+;; Return true if OP is general operand representable on x86_64.
+(define_predicate "x86_64_general_operand"
+  (if_then_else (match_test "TARGET_64BIT")
+    (ior (match_operand 0 "nonimmediate_operand")
+	 (match_operand 0 "x86_64_immediate_operand"))
+    (match_operand 0 "general_operand")))
+
+;; Return true if OP is representable on x86_64 as zero-extended operand.
+;; This predicate is used in zero-extending conversion operations that
+;; require non-VOIDmode immediate operands.
+(define_predicate "x86_64_zext_operand"
+  (if_then_else (match_test "TARGET_64BIT")
+    (ior (match_operand 0 "nonimmediate_operand")
+	 (and (match_operand 0 "x86_64_zext_immediate_operand")
+	      (match_test "GET_MODE (op) != VOIDmode")))
+    (match_operand 0 "nonimmediate_operand")))
+
+;; Return true if OP is general operand representable on x86_64
+;; as either sign extended or zero extended constant.
+(define_predicate "x86_64_szext_general_operand"
+  (if_then_else (match_test "TARGET_64BIT")
+    (ior (match_operand 0 "nonimmediate_operand")
+	 (match_operand 0 "x86_64_immediate_operand")
+	 (match_operand 0 "x86_64_zext_immediate_operand"))
+    (match_operand 0 "general_operand")))
+
+;; Return true if OP is nonmemory operand representable on x86_64.
+(define_predicate "x86_64_nonmemory_operand"
+  (if_then_else (match_test "TARGET_64BIT")
+    (ior (match_operand 0 "register_operand")
+	 (match_operand 0 "x86_64_immediate_operand"))
+    (match_operand 0 "nonmemory_operand")))
+
+;; Return true if OP is nonmemory operand representable on x86_64.
+(define_predicate "x86_64_szext_nonmemory_operand"
+  (if_then_else (match_test "TARGET_64BIT")
+    (ior (match_operand 0 "register_operand")
+	 (match_operand 0 "x86_64_immediate_operand")
+	 (match_operand 0 "x86_64_zext_immediate_operand"))
+    (match_operand 0 "nonmemory_operand")))
+
+;; Return true when operand is PIC expression that can be computed by lea
+;; operation.
+(define_predicate "pic_32bit_operand"
+  (match_code "const,symbol_ref,label_ref")
+{
+  if (!flag_pic)
+    return false;
+
+  /* Rule out relocations that translate into 64bit constants.  */
+  if (TARGET_64BIT && GET_CODE (op) == CONST)
+    {
+      op = XEXP (op, 0);
+      if (GET_CODE (op) == PLUS && CONST_INT_P (XEXP (op, 1)))
+	op = XEXP (op, 0);
+      if (GET_CODE (op) == UNSPEC
+	  && (XINT (op, 1) == UNSPEC_GOTOFF
+	      || XINT (op, 1) == UNSPEC_GOT))
+	return false;
+    }
+
+  return symbolic_operand (op, mode);
+})
+
+;; Return true if OP is nonmemory operand acceptable by movabs patterns.
+(define_predicate "x86_64_movabs_operand"
+  (and (match_operand 0 "nonmemory_operand")
+       (not (match_operand 0 "pic_32bit_operand"))))
+
+;; Return true if OP is either a symbol reference or a sum of a symbol
+;; reference and a constant.
+(define_predicate "symbolic_operand"
+  (match_code "symbol_ref,label_ref,const")
+{
+  switch (GET_CODE (op))
+    {
+    case SYMBOL_REF:
+    case LABEL_REF:
+      return true;
+
+    case CONST:
+      op = XEXP (op, 0);
+      if (GET_CODE (op) == SYMBOL_REF
+	  || GET_CODE (op) == LABEL_REF
+	  || (GET_CODE (op) == UNSPEC
+	      && (XINT (op, 1) == UNSPEC_GOT
+		  || XINT (op, 1) == UNSPEC_GOTOFF
+		  || XINT (op, 1) == UNSPEC_PCREL
+		  || XINT (op, 1) == UNSPEC_GOTPCREL)))
+	return true;
+      if (GET_CODE (op) != PLUS
+	  || !CONST_INT_P (XEXP (op, 1)))
+	return false;
+
+      op = XEXP (op, 0);
+      if (GET_CODE (op) == SYMBOL_REF
+	  || GET_CODE (op) == LABEL_REF)
+	return true;
+      /* Only @GOTOFF gets offsets.  */
+      if (GET_CODE (op) != UNSPEC
+	  || XINT (op, 1) != UNSPEC_GOTOFF)
+	return false;
+
+      op = XVECEXP (op, 0, 0);
+      if (GET_CODE (op) == SYMBOL_REF
+	  || GET_CODE (op) == LABEL_REF)
+	return true;
+      return false;
+
+    default:
+      gcc_unreachable ();
+    }
+})
+
+;; Return true if OP is a symbolic operand that resolves locally.
+(define_predicate "local_symbolic_operand"
+  (match_code "const,label_ref,symbol_ref")
+{
+  if (GET_CODE (op) == CONST
+      && GET_CODE (XEXP (op, 0)) == PLUS
+      && CONST_INT_P (XEXP (XEXP (op, 0), 1)))
+    op = XEXP (XEXP (op, 0), 0);
+
+  if (GET_CODE (op) == LABEL_REF)
+    return true;
+
+  if (GET_CODE (op) != SYMBOL_REF)
+    return false;
+
+  if (SYMBOL_REF_TLS_MODEL (op))
+    return false;
+
+  /* Dll-imported symbols are always external.  */
+  if (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op))
+    return false;
+  if (SYMBOL_REF_LOCAL_P (op))
+    return true;
+
+  /* There is, however, a not insubstantial body of code in the rest of
+     the compiler that assumes it can just stick the results of
+     ASM_GENERATE_INTERNAL_LABEL in a symbol_ref and have done.  */
+  /* ??? This is a hack.  Should update the body of the compiler to
+     always create a DECL an invoke targetm.encode_section_info.  */
+  if (strncmp (XSTR (op, 0), internal_label_prefix,
+	       internal_label_prefix_len) == 0)
+    return true;
+
+  return false;
+})
+
+;; Test for a legitimate @GOTOFF operand.
+;;
+;; VxWorks does not impose a fixed gap between segments; the run-time
+;; gap can be different from the object-file gap.  We therefore can't
+;; use @GOTOFF unless we are absolutely sure that the symbol is in the
+;; same segment as the GOT.  Unfortunately, the flexibility of linker
+;; scripts means that we can't be sure of that in general, so assume
+;; that @GOTOFF is never valid on VxWorks.
+(define_predicate "gotoff_operand"
+  (and (not (match_test "TARGET_VXWORKS_RTP"))
+       (match_operand 0 "local_symbolic_operand")))
+
+;; Test for various thread-local symbols.
+(define_special_predicate "tls_symbolic_operand"
+  (and (match_code "symbol_ref")
+       (match_test "SYMBOL_REF_TLS_MODEL (op)")))
+
+(define_special_predicate "tls_modbase_operand"
+  (and (match_code "symbol_ref")
+       (match_test "op == ix86_tls_module_base ()")))
+
+;; Test for a pc-relative call operand
+(define_predicate "constant_call_address_operand"
+  (match_code "symbol_ref")
+{
+  if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
+    return false;
+  if (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op))
+    return false;
+  return true;
+})
+
+;; P6 processors will jump to the address after the decrement when %esp
+;; is used as a call operand, so they will execute return address as a code.
+;; See Pentium Pro errata 70, Pentium 2 errata A33 and Pentium 3 errata E17.
+
+(define_predicate "call_register_no_elim_operand"
+  (match_operand 0 "register_operand")
+{
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+
+  if (!TARGET_64BIT && op == stack_pointer_rtx)
+    return false;
+
+  return register_no_elim_operand (op, mode);
+})
+
+;; True for any non-virtual or eliminable register.  Used in places where
+;; instantiation of such a register may cause the pattern to not be recognized.
+(define_predicate "register_no_elim_operand"
+  (match_operand 0 "register_operand")
+{
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+  return !(op == arg_pointer_rtx
+	   || op == frame_pointer_rtx
+	   || IN_RANGE (REGNO (op),
+			FIRST_PSEUDO_REGISTER, LAST_VIRTUAL_REGISTER));
+})
+
+;; Similarly, but include the stack pointer.  This is used to prevent esp
+;; from being used as an index reg.
+(define_predicate "index_register_operand"
+  (match_operand 0 "register_operand")
+{
+  if (GET_CODE (op) == SUBREG)
+    op = SUBREG_REG (op);
+  if (reload_in_progress || reload_completed)
+    return REG_OK_FOR_INDEX_STRICT_P (op);
+  else
+    return REG_OK_FOR_INDEX_NONSTRICT_P (op);
+})
+
+;; Return false if this is any eliminable register.  Otherwise general_operand.
+(define_predicate "general_no_elim_operand"
+  (if_then_else (match_code "reg,subreg")
+    (match_operand 0 "register_no_elim_operand")
+    (match_operand 0 "general_operand")))
+
+;; Return false if this is any eliminable register.  Otherwise
+;; register_operand or a constant.
+(define_predicate "nonmemory_no_elim_operand"
+  (ior (match_operand 0 "register_no_elim_operand")
+       (match_operand 0 "immediate_operand")))
+
+;; Test for a valid operand for indirect branch.
+(define_predicate "indirect_branch_operand"
+  (ior (match_operand 0 "register_operand")
+       (and (not (match_test "TARGET_X32"))
+	    (match_operand 0 "memory_operand"))))
+
+;; Test for a valid operand for a call instruction.
+;; Allow constant call address operands in Pmode only.
+(define_special_predicate "call_insn_operand"
+  (ior (match_test "constant_call_address_operand
+		     (op, mode == VOIDmode ? mode : Pmode)")
+       (match_operand 0 "call_register_no_elim_operand")
+       (and (not (match_test "TARGET_X32"))
+	    (match_operand 0 "memory_operand"))))
+
+;; Similarly, but for tail calls, in which we cannot allow memory references.
+(define_special_predicate "sibcall_insn_operand"
+  (ior (match_test "constant_call_address_operand
+		     (op, mode == VOIDmode ? mode : Pmode)")
+       (match_operand 0 "register_no_elim_operand")))
+
+;; Return true if OP is a call from MS ABI to SYSV ABI function.
+(define_predicate "call_rex64_ms_sysv_operation"
+  (match_code "parallel")
+{
+  unsigned creg_size = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
+  unsigned i;
+
+  if ((unsigned) XVECLEN (op, 0) != creg_size + 2)
+    return false;
+
+  for (i = 0; i < creg_size; i++)
+    {
+      rtx elt = XVECEXP (op, 0, i+2);
+      enum machine_mode mode;
+      unsigned regno;
+
+      if (GET_CODE (elt) != CLOBBER
+          || GET_CODE (SET_DEST (elt)) != REG)
+        return false;
+
+      regno = x86_64_ms_sysv_extra_clobbered_registers[i];
+      mode = SSE_REGNO_P (regno) ? TImode : DImode;
+
+      if (GET_MODE (SET_DEST (elt)) != mode
+	  || REGNO (SET_DEST (elt)) != regno)
+	return false;
+    }
+  return true;
+})
+
+;; Match exactly zero.
+(define_predicate "const0_operand"
+  (match_code "const_int,const_double,const_vector")
+{
+  if (mode == VOIDmode)
+    mode = GET_MODE (op);
+  return op == CONST0_RTX (mode);
+})
+
+;; Match one or vector filled with ones.
+(define_predicate "const1_operand"
+  (match_code "const_int,const_double,const_vector")
+{
+  if (mode == VOIDmode)
+    mode = GET_MODE (op);
+  return op == CONST1_RTX (mode);
+})
+
+;; Match exactly eight.
+(define_predicate "const8_operand"
+  (and (match_code "const_int")
+       (match_test "INTVAL (op) == 8")))
+
+;; Match exactly 128.
+(define_predicate "const128_operand"
+  (and (match_code "const_int")
+       (match_test "INTVAL (op) == 128")))
+
+;; Match exactly 0x0FFFFFFFF in anddi as a zero-extension operation
+(define_predicate "const_32bit_mask"
+  (and (match_code "const_int")
+       (match_test "trunc_int_for_mode (INTVAL (op), DImode)
+		    == (HOST_WIDE_INT) 0xffffffff")))
+
+;; Match 2, 4, or 8.  Used for leal multiplicands.
+(define_predicate "const248_operand"
+  (match_code "const_int")
+{
+  HOST_WIDE_INT i = INTVAL (op);
+  return i == 2 || i == 4 || i == 8;
+})
+
+;; Match 2, 3, 6, or 7
+(define_predicate "const2367_operand"
+  (match_code "const_int")
+{
+  HOST_WIDE_INT i = INTVAL (op);
+  return i == 2 || i == 3 || i == 6 || i == 7;
+})
+
+;; Match 1, 2, 4, or 8
+(define_predicate "const1248_operand"
+  (match_code "const_int")
+{
+  HOST_WIDE_INT i = INTVAL (op);
+  return i == 1 || i == 2 || i == 4 || i == 8;
+})
+
+;; Match 3, 5, or 9.  Used for leal multiplicands.
+(define_predicate "const359_operand"
+  (match_code "const_int")
+{
+  HOST_WIDE_INT i = INTVAL (op);
+  return i == 3 || i == 5 || i == 9;
+})
+
+;; Match 4 or 8 to 11.  Used for embeded rounding.
+(define_predicate "const_4_or_8_to_11_operand"
+  (match_code "const_int")
+{
+  HOST_WIDE_INT i = INTVAL (op);
+  return i == 4 || (i >= 8 && i <= 11);
+})
+
+;; Match 4 or 8. Used for SAE.
+(define_predicate "const48_operand"
+  (match_code "const_int")
+{
+  HOST_WIDE_INT i = INTVAL (op);
+  return i == 4 || i == 8;
+})
+
+;; Match 0 or 1.
+(define_predicate "const_0_to_1_operand"
+  (and (match_code "const_int")
+       (ior (match_test "op == const0_rtx")
+	    (match_test "op == const1_rtx"))))
+
+;; Match 0 to 3.
+(define_predicate "const_0_to_3_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 3)")))
+
+;; Match 0 to 4.
+(define_predicate "const_0_to_4_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 4)")))
+
+;; Match 0 to 5.
+(define_predicate "const_0_to_5_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 5)")))
+
+;; Match 0 to 7.
+(define_predicate "const_0_to_7_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 7)")))
+
+;; Match 0 to 15.
+(define_predicate "const_0_to_15_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 15)")))
+
+;; Match 0 to 31.
+(define_predicate "const_0_to_31_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
+
+;; Match 0 to 63.
+(define_predicate "const_0_to_63_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 63)")))
+
+;; Match 0 to 255.
+(define_predicate "const_0_to_255_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 255)")))
+
+;; Match (0 to 255) * 8
+(define_predicate "const_0_to_255_mul_8_operand"
+  (match_code "const_int")
+{
+  unsigned HOST_WIDE_INT val = INTVAL (op);
+  return val <= 255*8 && val % 8 == 0;
+})
+
+;; Return true if OP is CONST_INT >= 1 and <= 31 (a valid operand
+;; for shift & compare patterns, as shifting by 0 does not change flags).
+(define_predicate "const_1_to_31_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 1, 31)")))
+
+;; Return true if OP is CONST_INT >= 1 and <= 63 (a valid operand
+;; for 64bit shift & compare patterns, as shifting by 0 does not change flags).
+(define_predicate "const_1_to_63_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 1, 63)")))
+
+;; Match 2 or 3.
+(define_predicate "const_2_to_3_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 2, 3)")))
+
+;; Match 4 to 5.
+(define_predicate "const_4_to_5_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 4, 5)")))
+
+;; Match 4 to 7.
+(define_predicate "const_4_to_7_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 4, 7)")))
+
+;; Match 6 to 7.
+(define_predicate "const_6_to_7_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 6, 7)")))
+
+;; Match 8 to 9.
+(define_predicate "const_8_to_9_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 8, 9)")))
+
+;; Match 8 to 11.
+(define_predicate "const_8_to_11_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 8, 11)")))
+
+;; Match 8 to 15.
+(define_predicate "const_8_to_15_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 8, 15)")))
+
+;; Match 10 to 11.
+(define_predicate "const_10_to_11_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 10, 11)")))
+
+;; Match 12 to 13.
+(define_predicate "const_12_to_13_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 12, 13)")))
+
+;; Match 12 to 15.
+(define_predicate "const_12_to_15_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 12, 15)")))
+
+;; Match 14 to 15.
+(define_predicate "const_14_to_15_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 14, 15)")))
+
+;; Match 16 to 19.
+(define_predicate "const_16_to_19_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 16, 19)")))
+
+;; Match 16 to 31.
+(define_predicate "const_16_to_31_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 16, 31)")))
+
+;; Match 20 to 23.
+(define_predicate "const_20_to_23_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 20, 23)")))
+
+;; Match 24 to 27.
+(define_predicate "const_24_to_27_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 24, 27)")))
+
+;; Match 28 to 31.
+(define_predicate "const_28_to_31_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 28, 31)")))
+
+;; True if this is a constant appropriate for an increment or decrement.
+(define_predicate "incdec_operand"
+  (match_code "const_int")
+{
+  /* On Pentium4, the inc and dec operations causes extra dependency on flag
+     registers, since carry flag is not set.  */
+  if (!TARGET_USE_INCDEC && !optimize_insn_for_size_p ())
+    return false;
+  return op == const1_rtx || op == constm1_rtx;
+})
+
+;; True for registers, or 1 or -1.  Used to optimize double-word shifts.
+(define_predicate "reg_or_pm1_operand"
+  (ior (match_operand 0 "register_operand")
+       (and (match_code "const_int")
+	    (ior (match_test "op == const1_rtx")
+		 (match_test "op == constm1_rtx")))))
+
+;; True if OP is acceptable as operand of DImode shift expander.
+(define_predicate "shiftdi_operand"
+  (if_then_else (match_test "TARGET_64BIT")
+    (match_operand 0 "nonimmediate_operand")
+    (match_operand 0 "register_operand")))
+
+(define_predicate "ashldi_input_operand"
+  (if_then_else (match_test "TARGET_64BIT")
+    (match_operand 0 "nonimmediate_operand")
+    (match_operand 0 "reg_or_pm1_operand")))
+
+;; Return true if OP is a vector load from the constant pool with just
+;; the first element nonzero.
+(define_predicate "zero_extended_scalar_load_operand"
+  (match_code "mem")
+{
+  unsigned n_elts;
+  op = maybe_get_pool_constant (op);
+
+  if (!(op && GET_CODE (op) == CONST_VECTOR))
+    return false;
+
+  n_elts = CONST_VECTOR_NUNITS (op);
+
+  for (n_elts--; n_elts > 0; n_elts--)
+    {
+      rtx elt = CONST_VECTOR_ELT (op, n_elts);
+      if (elt != CONST0_RTX (GET_MODE_INNER (GET_MODE (op))))
+	return false;
+    }
+  return true;
+})
+
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "vector_all_ones_operand"
+  (match_code "const_vector")
+{
+  int nunits = GET_MODE_NUNITS (mode);
+
+  if (GET_CODE (op) == CONST_VECTOR
+      && CONST_VECTOR_NUNITS (op) == nunits)
+    {
+      int i;
+      for (i = 0; i < nunits; ++i)
+        {
+          rtx x = CONST_VECTOR_ELT (op, i);
+          if (x != constm1_rtx)
+            return false;
+        }
+      return true;
+    }
+
+  return false;
+})
+
+; Return true when OP is operand acceptable for standard SSE move.
+(define_predicate "vector_move_operand"
+  (ior (match_operand 0 "nonimmediate_operand")
+       (match_operand 0 "const0_operand")))
+
+;; Return true when OP is either nonimmediate operand, or any
+;; CONST_VECTOR.
+(define_predicate "nonimmediate_or_const_vector_operand"
+  (ior (match_operand 0 "nonimmediate_operand")
+       (match_code "const_vector")))
+
+;; Return true when OP is nonimmediate or standard SSE constant.
+(define_predicate "nonimmediate_or_sse_const_operand"
+  (match_operand 0 "general_operand")
+{
+  if (nonimmediate_operand (op, mode))
+    return true;
+  if (standard_sse_constant_p (op) > 0)
+    return true;
+  return false;
+})
+
+;; Return true if OP is a register or a zero.
+(define_predicate "reg_or_0_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "const0_operand")))
+
+;; Return true for RTX codes that force SImode address.
+(define_predicate "SImode_address_operand"
+  (match_code "subreg,zero_extend,and"))
+
+;; Return true if op if a valid address for LEA, and does not contain
+;; a segment override.  Defined as a special predicate to allow
+;; mode-less const_int operands pass to address_operand.
+(define_special_predicate "address_no_seg_operand"
+  (match_operand 0 "address_operand")
+{
+  struct ix86_address parts;
+  int ok;
+
+  ok = ix86_decompose_address (op, &parts);
+  gcc_assert (ok);
+  return parts.seg == SEG_DEFAULT;
+})
+
+;; Return true if op if a valid base register, displacement or
+;; sum of base register and displacement for VSIB addressing.
+(define_predicate "vsib_address_operand"
+  (match_operand 0 "address_operand")
+{
+  struct ix86_address parts;
+  int ok;
+  rtx disp;
+
+  ok = ix86_decompose_address (op, &parts);
+  gcc_assert (ok);
+  if (parts.index || parts.seg != SEG_DEFAULT)
+    return false;
+
+  /* VSIB addressing doesn't support (%rip).  */
+  if (parts.disp)
+    {
+      disp = parts.disp;
+      if (GET_CODE (disp) == CONST)
+	{
+	  disp = XEXP (disp, 0);
+	  if (GET_CODE (disp) == PLUS)
+	    disp = XEXP (disp, 0);
+	  if (GET_CODE (disp) == UNSPEC)
+	    switch (XINT (disp, 1))
+	      {
+	      case UNSPEC_GOTPCREL:
+	      case UNSPEC_PCREL:
+	      case UNSPEC_GOTNTPOFF:
+		return false;
+	      }
+	}
+      if (TARGET_64BIT
+	  && flag_pic
+	  && (GET_CODE (disp) == SYMBOL_REF
+	      || GET_CODE (disp) == LABEL_REF))
+	return false;
+    }
+
+  return true;
+})
+
+(define_predicate "vsib_mem_operator"
+  (match_code "mem"))
+
+;; Return true if the rtx is known to be at least 32 bits aligned.
+(define_predicate "aligned_operand"
+  (match_operand 0 "general_operand")
+{
+  struct ix86_address parts;
+  int ok;
+
+  /* Registers and immediate operands are always "aligned".  */
+  if (!MEM_P (op))
+    return true;
+
+  /* All patterns using aligned_operand on memory operands ends up
+     in promoting memory operand to 64bit and thus causing memory mismatch.  */
+  if (TARGET_MEMORY_MISMATCH_STALL && !optimize_insn_for_size_p ())
+    return false;
+
+  /* Don't even try to do any aligned optimizations with volatiles.  */
+  if (MEM_VOLATILE_P (op))
+    return false;
+
+  if (MEM_ALIGN (op) >= 32)
+    return true;
+
+  op = XEXP (op, 0);
+
+  /* Pushes and pops are only valid on the stack pointer.  */
+  if (GET_CODE (op) == PRE_DEC
+      || GET_CODE (op) == POST_INC)
+    return true;
+
+  /* Decode the address.  */
+  ok = ix86_decompose_address (op, &parts);
+  gcc_assert (ok);
+
+  if (parts.base && GET_CODE (parts.base) == SUBREG)
+    parts.base = SUBREG_REG (parts.base);
+  if (parts.index && GET_CODE (parts.index) == SUBREG)
+    parts.index = SUBREG_REG (parts.index);
+
+  /* Look for some component that isn't known to be aligned.  */
+  if (parts.index)
+    {
+      if (REGNO_POINTER_ALIGN (REGNO (parts.index)) * parts.scale < 32)
+	return false;
+    }
+  if (parts.base)
+    {
+      if (REGNO_POINTER_ALIGN (REGNO (parts.base)) < 32)
+	return false;
+    }
+  if (parts.disp)
+    {
+      if (!CONST_INT_P (parts.disp)
+	  || (INTVAL (parts.disp) & 3))
+	return false;
+    }
+
+  /* Didn't find one -- this must be an aligned address.  */
+  return true;
+})
+
+;; Return true if OP is memory operand with a displacement.
+(define_predicate "memory_displacement_operand"
+  (match_operand 0 "memory_operand")
+{
+  struct ix86_address parts;
+  int ok;
+
+  ok = ix86_decompose_address (XEXP (op, 0), &parts);
+  gcc_assert (ok);
+  return parts.disp != NULL_RTX;
+})
+
+;; Return true if OP is memory operand with a displacement only.
+(define_predicate "memory_displacement_only_operand"
+  (match_operand 0 "memory_operand")
+{
+  struct ix86_address parts;
+  int ok;
+
+  if (TARGET_64BIT)
+    return false;
+
+  ok = ix86_decompose_address (XEXP (op, 0), &parts);
+  gcc_assert (ok);
+
+  if (parts.base || parts.index)
+    return false;
+
+  return parts.disp != NULL_RTX;
+})
+
+;; Return true if OP is memory operand which will need zero or
+;; one register at most, not counting stack pointer or frame pointer.
+(define_predicate "cmpxchg8b_pic_memory_operand"
+  (match_operand 0 "memory_operand")
+{
+  struct ix86_address parts;
+  int ok;
+
+  if (TARGET_64BIT || !flag_pic)
+    return true;
+
+  ok = ix86_decompose_address (XEXP (op, 0), &parts);
+  gcc_assert (ok);
+
+  if (parts.base && GET_CODE (parts.base) == SUBREG)
+    parts.base = SUBREG_REG (parts.base);
+  if (parts.index && GET_CODE (parts.index) == SUBREG)
+    parts.index = SUBREG_REG (parts.index);
+
+  if (parts.base == NULL_RTX
+      || parts.base == arg_pointer_rtx
+      || parts.base == frame_pointer_rtx
+      || parts.base == hard_frame_pointer_rtx
+      || parts.base == stack_pointer_rtx)
+    return true;
+
+  if (parts.index == NULL_RTX
+      || parts.index == arg_pointer_rtx
+      || parts.index == frame_pointer_rtx
+      || parts.index == hard_frame_pointer_rtx
+      || parts.index == stack_pointer_rtx)
+    return true;
+
+  return false;
+})
+
+
+;; Return true if OP is memory operand that cannot be represented
+;; by the modRM array.
+(define_predicate "long_memory_operand"
+  (and (match_operand 0 "memory_operand")
+       (match_test "memory_address_length (op, false)")))
+
+;; Return true if OP is a comparison operator that can be issued by fcmov.
+(define_predicate "fcmov_comparison_operator"
+  (match_operand 0 "comparison_operator")
+{
+  enum machine_mode inmode = GET_MODE (XEXP (op, 0));
+  enum rtx_code code = GET_CODE (op);
+
+  if (inmode == CCFPmode || inmode == CCFPUmode)
+    {
+      if (!ix86_trivial_fp_comparison_operator (op, mode))
+	return false;
+      code = ix86_fp_compare_code_to_integer (code);
+    }
+  /* i387 supports just limited amount of conditional codes.  */
+  switch (code)
+    {
+    case LTU: case GTU: case LEU: case GEU:
+      if (inmode == CCmode || inmode == CCFPmode || inmode == CCFPUmode
+	  || inmode == CCCmode)
+	return true;
+      return false;
+    case ORDERED: case UNORDERED:
+    case EQ: case NE:
+      return true;
+    default:
+      return false;
+    }
+})
+
+;; Return true if OP is a comparison that can be used in the CMPSS/CMPPS insns.
+;; The first set are supported directly; the second set can't be done with
+;; full IEEE support, i.e. NaNs.
+
+(define_predicate "sse_comparison_operator"
+  (ior (match_code "eq,ne,lt,le,unordered,unge,ungt,ordered")
+       (and (match_test "TARGET_AVX")
+	    (match_code "ge,gt,uneq,unle,unlt,ltgt"))))
+
+(define_predicate "ix86_comparison_int_operator"
+  (match_code "ne,eq,ge,gt,le,lt"))
+
+(define_predicate "ix86_comparison_uns_operator"
+  (match_code "ne,eq,geu,gtu,leu,ltu"))
+
+(define_predicate "bt_comparison_operator"
+  (match_code "ne,eq"))
+
+;; Return true if OP is a valid comparison operator in valid mode.
+(define_predicate "ix86_comparison_operator"
+  (match_operand 0 "comparison_operator")
+{
+  enum machine_mode inmode = GET_MODE (XEXP (op, 0));
+  enum rtx_code code = GET_CODE (op);
+
+  if (inmode == CCFPmode || inmode == CCFPUmode)
+    return ix86_trivial_fp_comparison_operator (op, mode);
+
+  switch (code)
+    {
+    case EQ: case NE:
+      return true;
+    case LT: case GE:
+      if (inmode == CCmode || inmode == CCGCmode
+	  || inmode == CCGOCmode || inmode == CCNOmode)
+	return true;
+      return false;
+    case LTU: case GTU: case LEU: case GEU:
+      if (inmode == CCmode || inmode == CCCmode)
+	return true;
+      return false;
+    case ORDERED: case UNORDERED:
+      if (inmode == CCmode)
+	return true;
+      return false;
+    case GT: case LE:
+      if (inmode == CCmode || inmode == CCGCmode || inmode == CCNOmode)
+	return true;
+      return false;
+    default:
+      return false;
+    }
+})
+
+;; Return true if OP is a valid comparison operator
+;; testing carry flag to be set.
+(define_predicate "ix86_carry_flag_operator"
+  (match_code "ltu,lt,unlt,gtu,gt,ungt,le,unle,ge,unge,ltgt,uneq")
+{
+  enum machine_mode inmode = GET_MODE (XEXP (op, 0));
+  enum rtx_code code = GET_CODE (op);
+
+  if (inmode == CCFPmode || inmode == CCFPUmode)
+    {
+      if (!ix86_trivial_fp_comparison_operator (op, mode))
+	return false;
+      code = ix86_fp_compare_code_to_integer (code);
+    }
+  else if (inmode == CCCmode)
+   return code == LTU || code == GTU;
+  else if (inmode != CCmode)
+    return false;
+
+  return code == LTU;
+})
+
+;; Return true if this comparison only requires testing one flag bit.
+(define_predicate "ix86_trivial_fp_comparison_operator"
+  (match_code "gt,ge,unlt,unle,uneq,ltgt,ordered,unordered"))
+
+;; Return true if we know how to do this comparison.  Others require
+;; testing more than one flag bit, and we let the generic middle-end
+;; code do that.
+(define_predicate "ix86_fp_comparison_operator"
+  (if_then_else (match_test "ix86_fp_comparison_strategy (GET_CODE (op))
+                             == IX86_FPCMP_ARITH")
+               (match_operand 0 "comparison_operator")
+               (match_operand 0 "ix86_trivial_fp_comparison_operator")))
+
+;; Same as above, but for swapped comparison used in *jcc<fp>_<int>_i387.
+(define_predicate "ix86_swapped_fp_comparison_operator"
+  (match_operand 0 "comparison_operator")
+{
+  enum rtx_code code = GET_CODE (op);
+  bool ret;
+
+  PUT_CODE (op, swap_condition (code));
+  ret = ix86_fp_comparison_operator (op, mode);
+  PUT_CODE (op, code);
+  return ret;
+})
+
+;; Nearly general operand, but accept any const_double, since we wish
+;; to be able to drop them into memory rather than have them get pulled
+;; into registers.
+(define_predicate "cmp_fp_expander_operand"
+  (ior (match_code "const_double")
+       (match_operand 0 "general_operand")))
+
+;; Return true if this is a valid binary floating-point operation.
+(define_predicate "binary_fp_operator"
+  (match_code "plus,minus,mult,div"))
+
+;; Return true if this is a multiply operation.
+(define_predicate "mult_operator"
+  (match_code "mult"))
+
+;; Return true if this is a division operation.
+(define_predicate "div_operator"
+  (match_code "div"))
+
+;; Return true if this is a plus, minus, and, ior or xor operation.
+(define_predicate "plusminuslogic_operator"
+  (match_code "plus,minus,and,ior,xor"))
+
+;; Return true if this is a float extend operation.
+(define_predicate "float_operator"
+  (match_code "float"))
+
+;; Return true for ARITHMETIC_P.
+(define_predicate "arith_or_logical_operator"
+  (match_code "plus,mult,and,ior,xor,smin,smax,umin,umax,compare,minus,div,
+	       mod,udiv,umod,ashift,rotate,ashiftrt,lshiftrt,rotatert"))
+
+;; Return true for COMMUTATIVE_P.
+(define_predicate "commutative_operator"
+  (match_code "plus,mult,and,ior,xor,smin,smax,umin,umax"))
+
+;; Return true if OP is a binary operator that can be promoted to wider mode.
+(define_predicate "promotable_binary_operator"
+  (ior (match_code "plus,minus,and,ior,xor,ashift")
+       (and (match_code "mult")
+	    (match_test "TARGET_TUNE_PROMOTE_HIMODE_IMUL"))))
+
+(define_predicate "compare_operator"
+  (match_code "compare"))
+
+(define_predicate "absneg_operator"
+  (match_code "abs,neg"))
+
+;; Return true if OP is misaligned memory operand
+(define_predicate "misaligned_operand"
+  (and (match_code "mem")
+       (match_test "MEM_ALIGN (op) < GET_MODE_ALIGNMENT (mode)")))
+
+;; Return true if OP is a emms operation, known to be a PARALLEL.
+(define_predicate "emms_operation"
+  (match_code "parallel")
+{
+  unsigned i;
+
+  if (XVECLEN (op, 0) != 17)
+    return false;
+
+  for (i = 0; i < 8; i++)
+    {
+      rtx elt = XVECEXP (op, 0, i+1);
+
+      if (GET_CODE (elt) != CLOBBER
+	  || GET_CODE (SET_DEST (elt)) != REG
+	  || GET_MODE (SET_DEST (elt)) != XFmode
+	  || REGNO (SET_DEST (elt)) != FIRST_STACK_REG + i)
+        return false;
+
+      elt = XVECEXP (op, 0, i+9);
+
+      if (GET_CODE (elt) != CLOBBER
+	  || GET_CODE (SET_DEST (elt)) != REG
+	  || GET_MODE (SET_DEST (elt)) != DImode
+	  || REGNO (SET_DEST (elt)) != FIRST_MMX_REG + i)
+	return false;
+    }
+  return true;
+})
+
+;; Return true if OP is a vzeroall operation, known to be a PARALLEL.
+(define_predicate "vzeroall_operation"
+  (match_code "parallel")
+{
+  unsigned i, nregs = TARGET_64BIT ? 16 : 8;
+
+  if ((unsigned) XVECLEN (op, 0) != 1 + nregs)
+    return false;
+
+  for (i = 0; i < nregs; i++)
+    {
+      rtx elt = XVECEXP (op, 0, i+1);
+
+      if (GET_CODE (elt) != SET
+	  || GET_CODE (SET_DEST (elt)) != REG
+	  || GET_MODE (SET_DEST (elt)) != V8SImode
+	  || REGNO (SET_DEST (elt)) != SSE_REGNO (i)
+	  || SET_SRC (elt) != CONST0_RTX (V8SImode))
+	return false;
+    }
+  return true;
+})
+
+;; return true if OP is a vzeroupper operation.
+(define_predicate "vzeroupper_operation"
+  (and (match_code "unspec_volatile")
+       (match_test "XINT (op, 1) == UNSPECV_VZEROUPPER")))
+
+;; Return true if OP is a parallel for a vbroadcast permute.
+
+(define_predicate "avx_vbroadcast_operand"
+  (and (match_code "parallel")
+       (match_code "const_int" "a"))
+{
+  rtx elt = XVECEXP (op, 0, 0);
+  int i, nelt = XVECLEN (op, 0);
+
+  /* Don't bother checking there are the right number of operands,
+     merely that they're all identical.  */
+  for (i = 1; i < nelt; ++i)
+    if (XVECEXP (op, 0, i) != elt)
+      return false;
+  return true;
+})
+
+;; Return true if OP is a proper third operand to vpblendw256.
+(define_predicate "avx2_pblendw_operand"
+  (match_code "const_int")
+{
+  HOST_WIDE_INT val = INTVAL (op);
+  HOST_WIDE_INT low = val & 0xff;
+  return val == ((low << 8) | low);
+})
+
+;; Return true if OP is nonimmediate_operand or CONST_VECTOR.
+(define_predicate "general_vector_operand"
+  (ior (match_operand 0 "nonimmediate_operand")
+       (match_code "const_vector")))
+
+;; Return true if OP is either -1 constant or stored in register.
+(define_predicate "register_or_constm1_operand"
+  (ior (match_operand 0 "register_operand")
+       (and (match_code "const_int")
+	    (match_test "op == constm1_rtx"))))
diff --git a/gcc-4.9/gcc/config/i386/prfchwintrin.h b/gcc-4.9/gcc/config/i386/prfchwintrin.h
new file mode 100644
index 000000000..b2f5772ec
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/prfchwintrin.h
@@ -0,0 +1,37 @@
+/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _X86INTRIN_H_INCLUDED && !defined _MM3DNOW_H_INCLUDED
+# error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef _PRFCHWINTRIN_H_INCLUDED
+#define _PRFCHWINTRIN_H_INCLUDED
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetchw (void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+
+#endif /* _PRFCHWINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/rdos.h b/gcc-4.9/gcc/config/i386/rdos.h
new file mode 100644
index 000000000..e8370c6c6
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/rdos.h
@@ -0,0 +1,39 @@
+/* Definitions for RDOS on i386.
+   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* RDOS uses .exe suffix */
+#undef TARGET_EXECUTABLE_SUFFIX
+#define TARGET_EXECUTABLE_SUFFIX ".exe"
+
+#undef TARGET_TLS_DIRECT_SEG_REFS_DEFAULT
+#define TARGET_TLS_DIRECT_SEG_REFS_DEFAULT MASK_TLS_DIRECT_SEG_REFS
+
+#undef DEFAULT_TLS_SEG_REG
+#define DEFAULT_TLS_SEG_REG SEG_GS 
+
+#undef TARGET_RDOS
+#define TARGET_RDOS 1
+
+#define TARGET_OS_CPP_BUILTINS()		\
+  do						\
+    {						\
+      builtin_define ("__RDOS__");		\
+      builtin_assert ("system=rdos");		\
+    }						\
+  while (0)
diff --git a/gcc-4.9/gcc/config/i386/rdos64.h b/gcc-4.9/gcc/config/i386/rdos64.h
new file mode 100644
index 000000000..e6f089a00
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/rdos64.h
@@ -0,0 +1,24 @@
+/* Definitions for RDOS on x86_64.
+   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#undef REAL_PIC_OFFSET_TABLE_REGNUM
+#define REAL_PIC_OFFSET_TABLE_REGNUM  R15_REG
+
+#undef DEFAULT_LARGE_SECTION_THRESHOLD
+#define DEFAULT_LARGE_SECTION_THRESHOLD 16
diff --git a/gcc-4.9/gcc/config/i386/rdseedintrin.h b/gcc-4.9/gcc/config/i386/rdseedintrin.h
new file mode 100644
index 000000000..0ab18e552
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/rdseedintrin.h
@@ -0,0 +1,66 @@
+/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _X86INTRIN_H_INCLUDED
+# error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _RDSEEDINTRIN_H_INCLUDED
+#define _RDSEEDINTRIN_H_INCLUDED
+
+#ifndef __RDSEED__
+#pragma GCC push_options
+#pragma GCC target("rdseed")
+#define __DISABLE_RDSEED__
+#endif /* __RDSEED__ */
+
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdseed16_step (unsigned short *p)
+{
+    return __builtin_ia32_rdseed_hi_step (p);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdseed32_step (unsigned int *p)
+{
+    return __builtin_ia32_rdseed_si_step (p);
+}
+
+#ifdef __x86_64__
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdseed64_step (unsigned long long *p)
+{
+    return __builtin_ia32_rdseed_di_step (p);
+}
+#endif
+
+#ifdef __DISABLE_RDSEED__
+#undef __DISABLE_RDSEED__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDSEED__ */
+
+#endif /* _RDSEEDINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/rtemself.h b/gcc-4.9/gcc/config/i386/rtemself.h
new file mode 100644
index 000000000..7c3a19ce6
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/rtemself.h
@@ -0,0 +1,40 @@
+/* Definitions for rtems targeting an ix86 using ELF.
+   Copyright (C) 1996-2014 Free Software Foundation, Inc.
+   Contributed by Joel Sherrill (joel@OARcorp.com).
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Specify predefined symbols in preprocessor.  */
+
+#define TARGET_OS_CPP_BUILTINS()		\
+  do						\
+    {						\
+	builtin_define ("__rtems__");		\
+	builtin_define ("__USE_INIT_FINI__");	\
+	builtin_assert ("system=rtems");	\
+    }						\
+  while (0)
+
+#undef LONG_DOUBLE_TYPE_SIZE
+#define LONG_DOUBLE_TYPE_SIZE (TARGET_80387 ? 80 : 64)
+
+#undef LIBGCC2_LONG_DOUBLE_TYPE_SIZE
+#ifdef _SOFT_FLOAT
+#define LIBGCC2_LONG_DOUBLE_TYPE_SIZE 64
+#else
+#define LIBGCC2_LONG_DOUBLE_TYPE_SIZE 80
+#endif
diff --git a/gcc-4.9/gcc/config/i386/rtmintrin.h b/gcc-4.9/gcc/config/i386/rtmintrin.h
new file mode 100644
index 000000000..ac40d228a
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/rtmintrin.h
@@ -0,0 +1,84 @@
+/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _RTMINTRIN_H_INCLUDED
+#define _RTMINTRIN_H_INCLUDED
+
+#ifndef __RTM__
+#pragma GCC push_options
+#pragma GCC target("rtm")
+#define __DISABLE_RTM__
+#endif /* __RTM__ */
+
+#define _XBEGIN_STARTED		(~0u)
+#define _XABORT_EXPLICIT	(1 << 0)
+#define _XABORT_RETRY		(1 << 1)
+#define _XABORT_CONFLICT	(1 << 2)
+#define _XABORT_CAPACITY	(1 << 3)
+#define _XABORT_DEBUG		(1 << 4)
+#define _XABORT_NESTED		(1 << 5)
+#define _XABORT_CODE(x)		(((x) >> 24) & 0xFF)
+
+/* Start an RTM code region.  Return _XBEGIN_STARTED on success and the
+   abort condition otherwise.  */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xbegin (void)
+{
+  return __builtin_ia32_xbegin ();
+}
+
+/* Specify the end of an RTM code region.  If it corresponds to the
+   outermost transaction, then attempts the transaction commit.  If the
+   commit fails, then control is transferred to the outermost transaction
+   fallback handler.  */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xend (void)
+{
+  __builtin_ia32_xend ();
+}
+
+/* Force an RTM abort condition. The control is transferred to the
+   outermost transaction fallback handler with the abort condition IMM.  */
+#ifdef __OPTIMIZE__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xabort (const unsigned int imm)
+{
+  __builtin_ia32_xabort (imm);
+}
+#else
+#define _xabort(N)  __builtin_ia32_xabort (N)
+#endif /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_RTM__
+#undef __DISABLE_RTM__
+#pragma GCC pop_options
+#endif /* __DISABLE_RTM__ */
+
+#endif /* _RTMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/shaintrin.h b/gcc-4.9/gcc/config/i386/shaintrin.h
new file mode 100644
index 000000000..d8a3da3da
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/shaintrin.h
@@ -0,0 +1,98 @@
+/* Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _SHAINTRIN_H_INCLUDED
+#define _SHAINTRIN_H_INCLUDED
+
+#ifndef __SHA__
+#pragma GCC push_options
+#pragma GCC target("sha")
+#define __DISABLE_SHA__
+#endif /* __SHA__ */
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1msg1_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_sha1msg1 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1msg2_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_sha1msg2 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1nexte_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_sha1nexte ((__v4si) __A, (__v4si) __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1rnds4_epu32 (__m128i __A, __m128i __B, const int __I)
+{
+  return (__m128i) __builtin_ia32_sha1rnds4 ((__v4si) __A, (__v4si) __B, __I);
+}
+#else
+#define _mm_sha1rnds4_epu32(A, B, I)				    \
+  ((__m128i) __builtin_ia32_sha1rnds4 ((__v4si)(__m128i)A,	    \
+				       (__v4si)(__m128i)B, (int)I))
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha256msg1_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_sha256msg1 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha256msg2_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_sha256msg2 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha256rnds2_epu32 (__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_sha256rnds2 ((__v4si) __A, (__v4si) __B,
+					       (__v4si) __C);
+}
+
+#ifdef __DISABLE_SHA__
+#undef __DISABLE_SHA__
+#pragma GCC pop_options
+#endif /* __DISABLE_SHA__ */
+
+#endif /* _SHAINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/slm.md b/gcc-4.9/gcc/config/i386/slm.md
new file mode 100644
index 000000000..e3a8328c4
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/slm.md
@@ -0,0 +1,758 @@
+;; Slivermont(SLM) Scheduling
+;; Copyright (C) 2009-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+;;
+;; Silvermont has 2 out-of-order IEC, 2 in-order FEC and 1 in-order MEC.
+
+
+(define_automaton "slm")
+
+;;  EU: Execution Unit
+;;  Silvermont EUs are connected by port 0 or port 1.
+
+;;  SLM has two ports: port 0 and port 1 connecting to all execution units
+(define_cpu_unit "slm-port-0,slm-port-1" "slm")
+
+(define_cpu_unit "slm-ieu-0, slm-ieu-1,
+                  slm-imul, slm-feu-0, slm-feu-1"
+                  "slm")
+
+(define_reservation "slm-all-ieu" "(slm-ieu-0 + slm-ieu-1 + slm-imul)")
+(define_reservation "slm-all-feu" "(slm-feu-0 + slm-feu-1)")
+(define_reservation "slm-all-eu" "(slm-all-ieu + slm-all-feu)")
+(define_reservation "slm-fp-0" "(slm-port-0 + slm-feu-0)")
+
+;; Some EUs have duplicated copied and can be accessed via either
+;; port 0 or port 1
+;; (define_reservation "slm-port-either" "(slm-port-0 | slm-port-1)"
+(define_reservation "slm-port-dual" "(slm-port-0 + slm-port-1)")
+
+;;; fmul insn can have 4 or 5 cycles latency
+(define_reservation "slm-fmul-5c"
+                    "(slm-port-0 + slm-feu-0), slm-feu-0, nothing*3")
+(define_reservation "slm-fmul-4c" "(slm-port-0 + slm-feu-0), nothing*3")
+
+;;; fadd can has 3 cycles latency depends on instruction forms
+(define_reservation "slm-fadd-3c" "(slm-port-1 + slm-feu-1), nothing*2")
+(define_reservation "slm-fadd-4c"
+                    "(slm-port-1 + slm-feu-1), slm-feu-1, nothing*2")
+
+;;; imul insn has 3 cycles latency for SI operands
+(define_reservation "slm-imul-32"
+                    "(slm-port-1 + slm-imul), nothing*2")
+(define_reservation "slm-imul-mem-32"
+                    "(slm-port-1 + slm-imul + slm-port-0), nothing*2")
+;;; imul has 4 cycles latency for DI operands with 1/2 tput
+(define_reservation "slm-imul-64"
+                    "(slm-port-1 + slm-imul), slm-imul, nothing*2")
+
+;;; dual-execution instructions can have 1,2,4,5 cycles latency depends on
+;;; instruction forms
+(define_reservation "slm-dual-1c" "(slm-port-dual + slm-all-eu)")
+(define_reservation "slm-dual-2c"
+                    "(slm-port-dual + slm-all-eu, nothing)")
+
+;;; Most of simple ALU instructions have 1 cycle latency. Some of them
+;;; issue in port 0, some in port 0 and some in either port.
+(define_reservation "slm-simple-0" "(slm-port-0 + slm-ieu-0)")
+(define_reservation "slm-simple-1" "(slm-port-1 + slm-ieu-1)")
+(define_reservation "slm-simple-either" "(slm-simple-0 | slm-simple-1)")
+
+;;; Complex macro-instruction has variants of latency, and uses both ports.
+(define_reservation "slm-complex" "(slm-port-dual + slm-all-eu)")
+
+(define_insn_reservation  "slm_other" 9
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "other")
+            (eq_attr "atom_unit" "!jeu")))
+  "slm-complex, slm-all-eu*8")
+
+;; return has type "other" with atom_unit "jeu"
+(define_insn_reservation  "slm_other_2" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "other")
+            (eq_attr "atom_unit" "jeu")))
+  "slm-dual-1c")
+
+(define_insn_reservation  "slm_multi" 9
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "multi"))
+  "slm-complex, slm-all-eu*8")
+
+;; Normal alu insns without carry
+(define_insn_reservation  "slm_alu" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "alu")
+            (and (eq_attr "memory" "none")
+                 (eq_attr "use_carry" "0"))))
+  "slm-simple-either")
+
+;; Normal alu insns without carry, but use MEC.
+(define_insn_reservation  "slm_alu_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "alu")
+            (and (eq_attr "memory" "!none")
+                 (eq_attr "use_carry" "0"))))
+  "slm-simple-either")
+
+;; Alu insn consuming CF, such as add/sbb
+(define_insn_reservation  "slm_alu_carry" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "alu")
+            (and (eq_attr "memory" "none")
+                 (eq_attr "use_carry" "1"))))
+  "slm-simple-either, nothing")
+
+;; Alu insn consuming CF, such as add/sbb
+(define_insn_reservation  "slm_alu_carry_mem" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "alu")
+            (and (eq_attr "memory" "!none")
+                (eq_attr "use_carry" "1"))))
+  "slm-simple-either, nothing")
+
+(define_insn_reservation  "slm_alu1" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "alu1")
+            (eq_attr "memory" "none") (eq_attr "prefix_0f" "0")))
+  "slm-simple-either")
+
+;; bsf and bsf insn
+(define_insn_reservation  "slm_alu1_1" 10
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "alu1")
+            (eq_attr "memory" "none") (eq_attr "prefix_0f" "1")))
+  "slm-simple-1, slm-ieu-1*9")
+
+(define_insn_reservation  "slm_alu1_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "alu1")
+            (eq_attr "memory" "!none")))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_negnot" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "negnot")
+            (eq_attr "memory" "none")))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_negnot_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "negnot")
+            (eq_attr "memory" "!none")))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_imov" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "imov")
+            (eq_attr "memory" "none")))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_imov_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "imov")
+            (eq_attr "memory" "!none")))
+  "slm-simple-0")
+
+;; 16<-16, 32<-32
+(define_insn_reservation  "slm_imovx" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "imovx")
+            (and (eq_attr "memory" "none")
+                 (ior (and (match_operand:HI 0 "register_operand")
+                           (match_operand:HI 1 "general_operand"))
+                      (and (match_operand:SI 0 "register_operand")
+                           (match_operand:SI 1 "general_operand"))))))
+  "slm-simple-either")
+
+;; 16<-16, 32<-32, mem
+(define_insn_reservation  "slm_imovx_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "imovx")
+            (and (eq_attr "memory" "!none")
+                 (ior (and (match_operand:HI 0 "register_operand")
+                           (match_operand:HI 1 "general_operand"))
+                      (and (match_operand:SI 0 "register_operand")
+                           (match_operand:SI 1 "general_operand"))))))
+  "slm-simple-either")
+
+;; 32<-16, 32<-8, 64<-16, 64<-8, 64<-32, 8<-8
+(define_insn_reservation  "slm_imovx_2" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "imovx")
+            (and (eq_attr "memory" "none")
+                 (ior (match_operand:QI 0 "register_operand")
+                      (ior (and (match_operand:SI 0 "register_operand")
+                                (not (match_operand:SI 1 "general_operand")))
+                           (match_operand:DI 0 "register_operand"))))))
+  "slm-simple-either")
+
+;; 32<-16, 32<-8, 64<-16, 64<-8, 64<-32, 8<-8, mem
+(define_insn_reservation  "slm_imovx_2_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "imovx")
+            (and (eq_attr "memory" "!none")
+                 (ior (match_operand:QI 0 "register_operand")
+                      (ior (and (match_operand:SI 0 "register_operand")
+                                (not (match_operand:SI 1 "general_operand")))
+                           (match_operand:DI 0 "register_operand"))))))
+  "slm-simple-0")
+
+;; 16<-8
+(define_insn_reservation  "slm_imovx_3" 3
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "imovx")
+            (and (match_operand:HI 0 "register_operand")
+                 (match_operand:QI 1 "general_operand"))))
+  "slm-simple-0, nothing*2")
+
+(define_insn_reservation  "slm_lea" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "lea")
+            (eq_attr "mode" "!HI")))
+  "slm-simple-either")
+
+;; lea 16bit address is complex insn
+(define_insn_reservation  "slm_lea_2" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "lea")
+            (eq_attr "mode" "HI")))
+  "slm-complex, slm-all-eu")
+
+(define_insn_reservation  "slm_incdec" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "incdec")
+            (eq_attr "memory" "none")))
+  "slm-simple-0")
+
+(define_insn_reservation  "slm_incdec_mem" 3
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "incdec")
+            (eq_attr "memory" "!none")))
+  "slm-simple-0, nothing*2")
+
+;; simple shift instruction use SHIFT eu, none memory
+(define_insn_reservation  "slm_ishift" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ishift")
+            (and (eq_attr "memory" "none") (eq_attr "prefix_0f" "0"))))
+  "slm-simple-0")
+
+;; simple shift instruction use SHIFT eu, memory
+(define_insn_reservation  "slm_ishift_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ishift")
+            (and (eq_attr "memory" "!none") (eq_attr "prefix_0f" "0"))))
+  "slm-simple-0")
+
+;; DF shift (prefixed with 0f) is complex insn with latency of 4 cycles
+(define_insn_reservation  "slm_ishift_3" 4
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ishift")
+            (eq_attr "prefix_0f" "1")))
+  "slm-complex, slm-all-eu*3")
+
+(define_insn_reservation  "slm_ishift1" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ishift1")
+            (eq_attr "memory" "none")))
+  "slm-simple-0")
+
+(define_insn_reservation  "slm_ishift1_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ishift1")
+            (eq_attr "memory" "!none")))
+  "slm-simple-0")
+
+(define_insn_reservation  "slm_rotate" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "rotate")
+            (eq_attr "memory" "none")))
+  "slm-simple-0")
+
+(define_insn_reservation  "slm_rotate_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "rotate")
+            (eq_attr "memory" "!none")))
+  "slm-simple-0")
+
+(define_insn_reservation  "slm_rotate1" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "rotate1")
+            (eq_attr "memory" "none")))
+  "slm-simple-0")
+
+(define_insn_reservation  "slm_rotate1_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "rotate1")
+            (eq_attr "memory" "!none")))
+  "slm-simple-0")
+
+(define_insn_reservation  "slm_imul" 3
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "imul")
+            (and (eq_attr "memory" "none") (eq_attr "mode" "SI"))))
+  "slm-imul-32")
+
+(define_insn_reservation  "slm_imul_mem" 3
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "imul")
+            (and (eq_attr "memory" "!none") (eq_attr "mode" "SI"))))
+  "slm-imul-mem-32")
+
+;; latency set to 4 as common 64x64 imul with 1/2 tput
+(define_insn_reservation  "slm_imul_3" 4
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "imul")
+            (eq_attr "mode" "!SI")))
+  "slm-imul-64")
+
+(define_insn_reservation  "slm_idiv" 33
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "idiv"))
+  "slm-complex, slm-all-eu*16, nothing*16")
+
+(define_insn_reservation  "slm_icmp" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "icmp")
+            (eq_attr "memory" "none")))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_icmp_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "icmp")
+            (eq_attr "memory" "!none")))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_test" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "test")
+            (eq_attr "memory" "none")))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_test_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "test")
+            (eq_attr "memory" "!none")))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_ibr" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ibr")
+            (eq_attr "memory" "!load")))
+  "slm-simple-1")
+
+;; complex if jump target is from address
+(define_insn_reservation  "slm_ibr_2" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ibr")
+            (eq_attr "memory" "load")))
+  "slm-complex, slm-all-eu")
+
+(define_insn_reservation  "slm_setcc" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "setcc")
+            (eq_attr "memory" "!store")))
+  "slm-simple-either")
+
+;; 2 cycles complex if target is in memory
+(define_insn_reservation  "slm_setcc_2" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "setcc")
+            (eq_attr "memory" "store")))
+  "slm-complex, slm-all-eu")
+
+(define_insn_reservation  "slm_icmov" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "icmov")
+            (eq_attr "memory" "none")))
+  "slm-simple-either, nothing")
+
+(define_insn_reservation  "slm_icmov_mem" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "icmov")
+            (eq_attr "memory" "!none")))
+  "slm-simple-0, nothing")
+
+;; UCODE if segreg, ignored
+(define_insn_reservation  "slm_push" 2
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "push"))
+  "slm-dual-2c")
+
+;; pop r64 is 1 cycle. UCODE if segreg, ignored
+(define_insn_reservation  "slm_pop" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "pop")
+            (eq_attr "mode" "DI")))
+  "slm-dual-1c")
+
+;; pop non-r64 is 2 cycles. UCODE if segreg, ignored
+(define_insn_reservation  "slm_pop_2" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "pop")
+            (eq_attr "mode" "!DI")))
+  "slm-dual-2c")
+
+;; UCODE if segreg, ignored
+(define_insn_reservation  "slm_call" 1
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "call"))
+  "slm-dual-1c")
+
+(define_insn_reservation  "slm_callv" 1
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "callv"))
+  "slm-dual-1c")
+
+(define_insn_reservation  "slm_leave" 3
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "leave"))
+  "slm-complex, slm-all-eu*2")
+
+(define_insn_reservation  "slm_str" 3
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "str"))
+  "slm-complex, slm-all-eu*2")
+
+(define_insn_reservation  "slm_sselog" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sselog")
+            (eq_attr "memory" "none")))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_sselog_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sselog")
+            (eq_attr "memory" "!none")))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_sselog1" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sselog1")
+            (eq_attr "memory" "none")))
+  "slm-simple-0")
+
+(define_insn_reservation  "slm_sselog1_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sselog1")
+            (eq_attr "memory" "!none")))
+  "slm-simple-0")
+
+;; not pmad, not psad
+(define_insn_reservation  "slm_sseiadd" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseiadd")
+            (and (not (match_operand:V2DI 0 "register_operand"))
+                 (and (eq_attr "atom_unit" "!simul")
+                      (eq_attr "atom_unit" "!complex")))))
+  "slm-simple-either")
+
+;; pmad, psad and 64
+(define_insn_reservation  "slm_sseiadd_2" 4
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseiadd")
+            (and (not (match_operand:V2DI 0 "register_operand"))
+                 (and (eq_attr "atom_unit" "simul" )
+                      (eq_attr "mode" "DI")))))
+  "slm-fmul-4c")
+
+;; pmad, psad and 128
+(define_insn_reservation  "slm_sseiadd_3" 5
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseiadd")
+            (and (not (match_operand:V2DI 0 "register_operand"))
+                 (and (eq_attr "atom_unit" "simul" )
+                      (eq_attr "mode" "TI")))))
+  "slm-fmul-5c")
+
+;; if paddq(64 bit op), phadd/phsub
+(define_insn_reservation  "slm_sseiadd_4" 4
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseiadd")
+            (ior (match_operand:V2DI 0 "register_operand")
+                 (eq_attr "atom_unit" "complex"))))
+  "slm-fadd-4c")
+
+;; if immediate op.
+(define_insn_reservation  "slm_sseishft" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseishft")
+            (and (eq_attr "atom_unit" "!sishuf")
+                 (match_operand 2 "immediate_operand"))))
+  "slm-simple-either")
+
+;; if palignr or psrldq
+(define_insn_reservation  "slm_sseishft_2" 1
+  (and (eq_attr "cpu" "slm")
+       (ior (eq_attr "type" "sseishft1")
+	    (and (eq_attr "type" "sseishft")
+		 (and (eq_attr "atom_unit" "sishuf")
+		      (match_operand 2 "immediate_operand")))))
+  "slm-simple-0")
+
+;; if reg/mem op
+(define_insn_reservation  "slm_sseishft_3" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseishft")
+            (not (match_operand 2 "immediate_operand"))))
+  "slm-complex, slm-all-eu")
+
+(define_insn_reservation  "slm_sseimul" 5
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "sseimul"))
+  "slm-fmul-5c")
+
+;; rcpss or rsqrtss
+(define_insn_reservation  "slm_sse" 4
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sse")
+            (and (eq_attr "atom_sse_attr" "rcp") (eq_attr "mode" "SF"))))
+  "slm-fmul-4c")
+
+;; movshdup, movsldup. Suggest to type sseishft
+(define_insn_reservation  "slm_sse_2" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sse")
+            (eq_attr "atom_sse_attr" "movdup")))
+  "slm-simple-0")
+
+;; lfence
+(define_insn_reservation  "slm_sse_3" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sse")
+            (eq_attr "atom_sse_attr" "lfence")))
+  "slm-simple-either")
+
+;; sfence,clflush,mfence, prefetch
+(define_insn_reservation  "slm_sse_4" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sse")
+            (ior (eq_attr "atom_sse_attr" "fence")
+                 (eq_attr "atom_sse_attr" "prefetch"))))
+  "slm-simple-0")
+
+;; rcpps, rsqrtss, sqrt, ldmxcsr
+(define_insn_reservation  "slm_sse_5" 9
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sse")
+            (ior (ior (eq_attr "atom_sse_attr" "sqrt")
+                      (eq_attr "atom_sse_attr" "mxcsr"))
+                 (and (eq_attr "atom_sse_attr" "rcp")
+                      (eq_attr "mode" "V4SF")))))
+  "slm-complex, slm-all-eu*7, nothing")
+
+;; xmm->xmm
+(define_insn_reservation  "slm_ssemov" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssemov")
+            (and (match_operand 0 "register_operand" "xy")
+                 (match_operand 1 "register_operand" "xy"))))
+  "slm-simple-either")
+
+;; reg->xmm
+(define_insn_reservation  "slm_ssemov_2" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssemov")
+            (and (match_operand 0 "register_operand" "xy")
+                 (match_operand 1 "register_operand" "r"))))
+  "slm-simple-0")
+
+;; xmm->reg
+(define_insn_reservation  "slm_ssemov_3" 3
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssemov")
+            (and (match_operand 0 "register_operand" "r")
+                 (match_operand 1 "register_operand" "xy"))))
+  "slm-simple-0, nothing*2")
+
+;; mov mem
+(define_insn_reservation  "slm_ssemov_4" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssemov")
+            (and (eq_attr "movu" "0") (eq_attr "memory" "!none"))))
+  "slm-simple-0")
+
+;; movu mem
+(define_insn_reservation  "slm_ssemov_5" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssemov")
+            (ior (eq_attr "movu" "1") (eq_attr "memory" "!none"))))
+  "slm-simple-0, nothing")
+
+;; no memory simple
+(define_insn_reservation  "slm_sseadd" 3
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseadd")
+            (and (eq_attr "memory" "none")
+                 (and (eq_attr "mode" "!V2DF")
+                      (eq_attr "atom_unit" "!complex")))))
+  "slm-fadd-3c")
+
+;; memory simple
+(define_insn_reservation  "slm_sseadd_mem" 3
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseadd")
+            (and (eq_attr "memory" "!none")
+                 (and (eq_attr "mode" "!V2DF")
+                      (eq_attr "atom_unit" "!complex")))))
+  "slm-fadd-3c")
+
+;; maxps, minps, *pd, hadd, hsub
+(define_insn_reservation  "slm_sseadd_3" 4
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseadd")
+            (ior (eq_attr "mode" "V2DF") (eq_attr "atom_unit" "complex"))))
+  "slm-fadd-4c")
+
+;; Except dppd/dpps
+(define_insn_reservation  "slm_ssemul" 5
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssemul")
+            (eq_attr "mode" "!SF")))
+  "slm-fmul-5c")
+
+;; Except dppd/dpps, 4 cycle if mulss
+(define_insn_reservation  "slm_ssemul_2" 4
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssemul")
+            (eq_attr "mode" "SF")))
+  "slm-fmul-4c")
+
+(define_insn_reservation  "slm_ssecmp" 1
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "ssecmp"))
+  "slm-simple-either")
+
+(define_insn_reservation  "slm_ssecomi" 1
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "ssecomi"))
+  "slm-simple-0")
+
+;; no memory and cvtpi2ps, cvtps2pi, cvttps2pi
+(define_insn_reservation  "slm_ssecvt" 5
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssecvt")
+            (ior (and (match_operand:V2SI 0 "register_operand")
+                      (match_operand:V4SF 1 "register_operand"))
+                 (and (match_operand:V4SF 0 "register_operand")
+                      (match_operand:V2SI 1 "register_operand")))))
+  "slm-fp-0, slm-feu-0, nothing*3")
+
+;; memory and cvtpi2ps, cvtps2pi, cvttps2pi
+(define_insn_reservation  "slm_ssecvt_mem" 5
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssecvt")
+            (ior (and (match_operand:V2SI 0 "register_operand")
+                      (match_operand:V4SF 1 "memory_operand"))
+                 (and (match_operand:V4SF 0 "register_operand")
+                      (match_operand:V2SI 1 "memory_operand")))))
+"slm-fp-0, slm-feu-0, nothing*3")
+
+;; cvtpd2pi, cvtpi2pd
+(define_insn_reservation  "slm_ssecvt_1" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssecvt")
+            (ior (and (match_operand:V2DF 0 "register_operand")
+                      (match_operand:V2SI 1 "register_operand"))
+                 (and (match_operand:V2SI 0 "register_operand")
+                      (match_operand:V2DF 1 "register_operand")))))
+  "slm-fp-0, slm-feu-0")
+
+;; memory and cvtpd2pi, cvtpi2pd
+(define_insn_reservation  "slm_ssecvt_1_mem" 2
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssecvt")
+            (ior (and (match_operand:V2DF 0 "register_operand")
+                      (match_operand:V2SI 1 "memory_operand"))
+                 (and (match_operand:V2SI 0 "register_operand")
+                      (match_operand:V2DF 1 "memory_operand")))))
+  "slm-fp-0, slm-feu-0")
+
+;; otherwise. 4 cycles average for cvtss2sd
+(define_insn_reservation  "slm_ssecvt_3" 4
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "ssecvt")
+            (not (ior (and (match_operand:V2SI 0 "register_operand")
+                           (match_operand:V4SF 1 "nonimmediate_operand"))
+                      (and (match_operand:V4SF 0 "register_operand")
+                           (match_operand:V2SI 1 "nonimmediate_operand"))))))
+  "slm-fp-0, nothing*3")
+
+;; memory and cvtsi2sd
+(define_insn_reservation  "slm_sseicvt" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseicvt")
+            (and (match_operand:V2DF 0 "register_operand")
+                 (match_operand:SI 1 "nonimmediate_operand"))))
+  "slm-fp-0")
+
+;; otherwise. 8 cycles average for cvtsd2si
+(define_insn_reservation  "slm_sseicvt_2" 4
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "sseicvt")
+            (not (and (match_operand:V2DF 0 "register_operand")
+                      (match_operand:SI 1 "memory_operand")))))
+  "slm-fp-0, nothing*3")
+
+(define_insn_reservation  "slm_ssediv" 13
+  (and (eq_attr "cpu" "slm")
+       (eq_attr "type" "ssediv"))
+  "slm-fp-0, slm-feu-0*10, nothing*2")
+
+;; simple for fmov
+(define_insn_reservation  "slm_fmov" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "fmov")
+            (eq_attr "memory" "none")))
+  "slm-simple-either")
+
+;; simple for fmov
+(define_insn_reservation  "slm_fmov_mem" 1
+  (and (eq_attr "cpu" "slm")
+       (and (eq_attr "type" "fmov")
+            (eq_attr "memory" "!none")))
+  "slm-simple-either")
+
+;; Define bypass here
+
+;; There will be 0 cycle stall from cmp/test to jcc
+
+;; There will be 1 cycle stall from flag producer to cmov and adc/sbb
+(define_bypass 2 "slm_icmp, slm_test, slm_alu, slm_alu_carry,
+                  slm_alu1, slm_negnot, slm_incdec, slm_ishift,
+                  slm_ishift1, slm_rotate, slm_rotate1"
+                 "slm_icmov, slm_alu_carry")
+
+;; lea to shift source stall is 1 cycle
+(define_bypass 2 "slm_lea"
+                 "slm_ishift, slm_ishift1, slm_rotate, slm_rotate1"
+                 "!ix86_dep_by_shift_count")
+
+;; non-lea to shift count stall is 1 cycle
+(define_bypass 2 "slm_alu_carry,
+                  slm_alu,slm_alu1,slm_negnot,slm_imov,slm_imovx,
+                  slm_incdec,slm_ishift,slm_ishift1,slm_rotate,
+                  slm_rotate1, slm_setcc, slm_icmov, slm_pop,
+                  slm_alu_mem, slm_alu_carry_mem, slm_alu1_mem,
+                  slm_imovx_mem, slm_imovx_2_mem,
+                  slm_imov_mem, slm_icmov_mem, slm_fmov_mem"
+                 "slm_ishift, slm_ishift1, slm_rotate, slm_rotate1,
+                  slm_ishift_mem, slm_ishift1_mem,
+                  slm_rotate_mem, slm_rotate1_mem"
+                 "ix86_dep_by_shift_count")
diff --git a/gcc-4.9/gcc/config/i386/smmintrin.h b/gcc-4.9/gcc/config/i386/smmintrin.h
new file mode 100644
index 000000000..886ace43f
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/smmintrin.h
@@ -0,0 +1,862 @@
+/* Copyright (C) 2007-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 10.0.  */
+
+#ifndef _SMMINTRIN_H_INCLUDED
+#define _SMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSSE3, SSE3, SSE2 and SSE header
+   files.  */
+#include <tmmintrin.h>
+
+#ifndef __SSE4_1__
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define __DISABLE_SSE4_1__
+#endif /* __SSE4_1__ */
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT	0x00
+#define _MM_FROUND_TO_NEG_INF		0x01
+#define _MM_FROUND_TO_POS_INF		0x02
+#define _MM_FROUND_TO_ZERO		0x03
+#define _MM_FROUND_CUR_DIRECTION	0x04
+
+#define _MM_FROUND_RAISE_EXC		0x00
+#define _MM_FROUND_NO_EXC		0x08
+
+#define _MM_FROUND_NINT		\
+  (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR	\
+  (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL		\
+  (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC	\
+  (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT		\
+  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT	\
+  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+
+/* Test Instruction */
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+   (__V & __M) == 0.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+   (__V & ~__M) == 0.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+   (__V & __M) != 0 && (__V & ~__M) != 0.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Macros for packed integer 128-bit comparison intrinsics.  */
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+#define _mm_test_all_ones(V) \
+  _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
+
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
+
+/* Packed/scalar double precision floating point rounding.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_pd (__m128d __V, const int __M)
+{
+  return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_sd(__m128d __D, __m128d __V, const int __M)
+{
+  return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
+					   (__v2df)__V,
+					   __M);
+}
+#else
+#define _mm_round_pd(V, M) \
+  ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M)))
+
+#define _mm_round_sd(D, V, M)						\
+  ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D),		\
+				     (__v2df)(__m128d)(V), (int)(M)))
+#endif
+
+/* Packed/scalar single precision floating point rounding.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_ps (__m128 __V, const int __M)
+{
+  return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_ss (__m128 __D, __m128 __V, const int __M)
+{
+  return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
+					  (__v4sf)__V,
+					  __M);
+}
+#else
+#define _mm_round_ps(V, M) \
+  ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M)))
+
+#define _mm_round_ss(D, V, M)						\
+  ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D),		\
+				    (__v4sf)(__m128)(V), (int)(M)))
+#endif
+
+/* Macros for ceil/floor intrinsics.  */
+#define _mm_ceil_pd(V)	   _mm_round_pd ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(D, V)  _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_pd(V)	   _mm_round_pd((V), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
+
+#define _mm_ceil_ps(V)	   _mm_round_ps ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(D, V)  _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(V)	   _mm_round_ps ((V), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
+
+/* SSE4.1 */
+
+/* Integer blend instructions - select data from 2 sources using
+   constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
+					      (__v8hi)__Y,
+					      __M);
+}
+#else
+#define _mm_blend_epi16(X, Y, M)					\
+  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X),		\
+					(__v8hi)(__m128i)(Y), (int)(M)))
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X,
+					       (__v16qi)__Y,
+					       (__v16qi)__M);
+}
+
+/* Single precision floating point blend instructions - select data
+   from 2 sources using constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
+{
+  return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
+					  (__v4sf)__Y,
+					  __M);
+}
+#else
+#define _mm_blend_ps(X, Y, M)						\
+  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X),		\
+				    (__v4sf)(__m128)(Y), (int)(M)))
+#endif
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
+{
+  return (__m128) __builtin_ia32_blendvps ((__v4sf)__X,
+					   (__v4sf)__Y,
+					   (__v4sf)__M);
+}
+
+/* Double precision floating point blend instructions - select data
+   from 2 sources using constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
+{
+  return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
+					   (__v2df)__Y,
+					   __M);
+}
+#else
+#define _mm_blend_pd(X, Y, M)						\
+  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X),		\
+				     (__v2df)(__m128d)(Y), (int)(M)))
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
+{
+  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X,
+					    (__v2df)__Y,
+					    (__v2df)__M);
+}
+
+/* Dot product instructions with mask-defined summing and zeroing parts
+   of result.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
+{
+  return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
+				       (__v4sf)__Y,
+				       __M);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
+{
+  return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
+					(__v2df)__Y,
+					__M);
+}
+#else
+#define _mm_dp_ps(X, Y, M)						\
+  ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X),			\
+				 (__v4sf)(__m128)(Y), (int)(M)))
+
+#define _mm_dp_pd(X, Y, M)						\
+  ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X),			\
+				  (__v2df)(__m128d)(Y), (int)(M)))
+#endif
+
+/* Packed integer 64-bit comparison, zeroing or filling with ones
+   corresponding parts of result.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pcmpeqq ((__v2di)__X, (__v2di)__Y);
+}
+
+/*  Min/max packed integer instructions.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Packed integer 32-bit multiplication with truncation of upper
+   halves of results.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Packed integer 32-bit multiplication of 2 pairs of operands
+   with two 64-bit results.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Insert single precision float into packed single precision array
+   element selected by index N.  The bits [7-6] of N define S
+   index, the bits [5-4] define D index, and bits [3-0] define
+   zeroing mask for D.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
+{
+  return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
+					      (__v4sf)__S,
+					      __N);
+}
+#else
+#define _mm_insert_ps(D, S, N)						\
+  ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D),		\
+					(__v4sf)(__m128)(S), (int)(N)))
+#endif
+
+/* Helper macro to create the N value for _mm_insert_ps.  */
+#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
+
+/* Extract binary representation of single precision float from packed
+   single precision array element of X selected by index N.  */
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_ps (__m128 __X, const int __N)
+{
+  union { int i; float f; } __tmp;
+  __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
+  return __tmp.i;
+}
+#else
+#define _mm_extract_ps(X, N)						\
+  (__extension__							\
+   ({									\
+     union { int i; float f; } __tmp;					\
+     __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); \
+     __tmp.i;								\
+   }))
+#endif
+
+/* Extract binary representation of single precision float into
+   D from packed single precision array element of S selected
+   by index N.  */
+#define _MM_EXTRACT_FLOAT(D, S, N) \
+  { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }
+  
+/* Extract specified single precision float element into the lower
+   part of __m128.  */
+#define _MM_PICK_OUT_PS(X, N)				\
+  _mm_insert_ps (_mm_setzero_ps (), (X), 		\
+		 _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))
+
+/* Insert integer, S, into packed integer array element of D
+   selected by index N.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi8 (__m128i __D, int __S, const int __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
+						 __S, __N);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi32 (__m128i __D, int __S, const int __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
+						 __S, __N);
+}
+
+#ifdef __x86_64__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
+						 __S, __N);
+}
+#endif
+#else
+#define _mm_insert_epi8(D, S, N)					\
+  ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D),	\
+					   (int)(S), (int)(N)))
+
+#define _mm_insert_epi32(D, S, N)				\
+  ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D),	\
+					  (int)(S), (int)(N)))
+
+#ifdef __x86_64__
+#define _mm_insert_epi64(D, S, N)					\
+  ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D),		\
+					  (long long)(S), (int)(N)))
+#endif
+#endif
+
+/* Extract integer from packed integer array element of X selected by
+   index N.  */
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi8 (__m128i __X, const int __N)
+{
+   return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi32 (__m128i __X, const int __N)
+{
+   return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
+}
+
+#ifdef __x86_64__
+extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi64 (__m128i __X, const int __N)
+{
+  return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
+}
+#endif
+#else
+#define _mm_extract_epi8(X, N) \
+  ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
+#define _mm_extract_epi32(X, N) \
+  ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))
+
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) \
+  ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
+#endif
+#endif
+
+/* Return horizontal packed word minimum and its index in bits [15:0]
+   and bits [18:16] respectively.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_minpos_epu16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X);
+}
+
+/* Packed integer sign-extension.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X);
+}
+
+/* Packed integer zero-extension. */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X);
+}
+
+/* Pack 8 double words from 2 operands into 8 words of result with
+   unsigned saturation. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Sum absolute 8-bit integer difference of adjacent groups of 4
+   byte integers in the first 2 operands.  Starting offsets within
+   operands are determined by the 3rd mask operand.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
+					      (__v16qi)__Y, __M);
+}
+#else
+#define _mm_mpsadbw_epu8(X, Y, M)					\
+  ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X),		\
+					(__v16qi)(__m128i)(Y), (int)(M)))
+#endif
+
+/* Load double quadword using non-temporal aligned hint.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_load_si128 (__m128i *__X)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X);
+}
+
+#ifndef __SSE4_2__
+#pragma GCC push_options
+#pragma GCC target("sse4.2")
+#define __DISABLE_SSE4_2__
+#endif /* __SSE4_2__ */
+
+/* These macros specify the source data format.  */
+#define _SIDD_UBYTE_OPS			0x00
+#define _SIDD_UWORD_OPS			0x01
+#define _SIDD_SBYTE_OPS			0x02
+#define _SIDD_SWORD_OPS			0x03
+
+/* These macros specify the comparison operation.  */
+#define _SIDD_CMP_EQUAL_ANY		0x00
+#define _SIDD_CMP_RANGES		0x04
+#define _SIDD_CMP_EQUAL_EACH		0x08
+#define _SIDD_CMP_EQUAL_ORDERED		0x0c
+
+/* These macros specify the polarity.  */
+#define _SIDD_POSITIVE_POLARITY		0x00
+#define _SIDD_NEGATIVE_POLARITY		0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY	0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY	0x30
+
+/* These macros specify the output selection in _mm_cmpXstri ().  */
+#define _SIDD_LEAST_SIGNIFICANT		0x00
+#define _SIDD_MOST_SIGNIFICANT		0x40
+
+/* These macros specify the output selection in _mm_cmpXstrm ().  */
+#define _SIDD_BIT_MASK			0x00
+#define _SIDD_UNIT_MASK			0x40
+
+/* Intrinsics for text/string processing.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X,
+						(__v16qi)__Y,
+						__M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistri (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistri128 ((__v16qi)__X,
+				      (__v16qi)__Y,
+				      __M);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX,
+						(__v16qi)__Y, __LY,
+						__M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX,
+				      (__v16qi)__Y, __LY,
+				      __M);
+}
+#else
+#define _mm_cmpistrm(X, Y, M)						\
+  ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X),	\
+					  (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistri(X, Y, M)						\
+  ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X),		\
+				      (__v16qi)(__m128i)(Y), (int)(M)))
+
+#define _mm_cmpestrm(X, LX, Y, LY, M)					\
+  ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X),	\
+					  (int)(LX), (__v16qi)(__m128i)(Y), \
+					  (int)(LY), (int)(M)))
+#define _mm_cmpestri(X, LX, Y, LY, M)					\
+  ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX),	\
+				      (__v16qi)(__m128i)(Y), (int)(LY),	\
+				      (int)(M)))
+#endif
+
+/* Intrinsics for text/string processing and reading values of
+   EFlags.  */
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistria128 ((__v16qi)__X,
+				       (__v16qi)__Y,
+				       __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistric128 ((__v16qi)__X,
+				       (__v16qi)__Y,
+				       __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistro (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistrio128 ((__v16qi)__X,
+				       (__v16qi)__Y,
+				       __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistris128 ((__v16qi)__X,
+				       (__v16qi)__Y,
+				       __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistriz128 ((__v16qi)__X,
+				       (__v16qi)__Y,
+				       __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX,
+				       (__v16qi)__Y, __LY,
+				       __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX,
+				       (__v16qi)__Y, __LY,
+				       __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX,
+				       (__v16qi)__Y, __LY,
+				       __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX,
+				       (__v16qi)__Y, __LY,
+				       __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX,
+				       (__v16qi)__Y, __LY,
+				       __M);
+}
+#else
+#define _mm_cmpistra(X, Y, M)						\
+  ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X),		\
+				       (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrc(X, Y, M)						\
+  ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X),		\
+				       (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistro(X, Y, M)						\
+  ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X),		\
+				       (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrs(X, Y, M)						\
+  ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X),		\
+				       (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrz(X, Y, M)						\
+  ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X),		\
+				       (__v16qi)(__m128i)(Y), (int)(M)))
+
+#define _mm_cmpestra(X, LX, Y, LY, M)					\
+  ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \
+				       (__v16qi)(__m128i)(Y), (int)(LY), \
+				       (int)(M)))
+#define _mm_cmpestrc(X, LX, Y, LY, M)					\
+  ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \
+				       (__v16qi)(__m128i)(Y), (int)(LY), \
+				       (int)(M)))
+#define _mm_cmpestro(X, LX, Y, LY, M)					\
+  ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \
+				       (__v16qi)(__m128i)(Y), (int)(LY), \
+				       (int)(M)))
+#define _mm_cmpestrs(X, LX, Y, LY, M)					\
+  ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \
+				       (__v16qi)(__m128i)(Y), (int)(LY), \
+				       (int)(M)))
+#define _mm_cmpestrz(X, LX, Y, LY, M)					\
+  ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \
+				       (__v16qi)(__m128i)(Y), (int)(LY), \
+				       (int)(M)))
+#endif
+
+/* Packed integer 64-bit comparison, zeroing or filling with ones
+   corresponding parts of result.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pcmpgtq ((__v2di)__X, (__v2di)__Y);
+}
+
+#ifdef __DISABLE_SSE4_2__
+#undef __DISABLE_SSE4_2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_2__ */
+
+#ifdef __DISABLE_SSE4_1__
+#undef __DISABLE_SSE4_1__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_1__ */
+
+#include <popcntintrin.h>
+
+#ifndef __SSE4_1__
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define __DISABLE_SSE4_1__
+#endif /* __SSE4_1__ */
+
+#ifndef __SSE4_2__
+#pragma GCC push_options
+#pragma GCC target("sse4.2")
+#define __DISABLE_SSE4_2__
+#endif /* __SSE4_1__ */
+
+/* Accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u8 (unsigned int __C, unsigned char __V)
+{
+  return __builtin_ia32_crc32qi (__C, __V);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u16 (unsigned int __C, unsigned short __V)
+{
+  return __builtin_ia32_crc32hi (__C, __V);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u32 (unsigned int __C, unsigned int __V)
+{
+  return __builtin_ia32_crc32si (__C, __V);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
+{
+  return __builtin_ia32_crc32di (__C, __V);
+}
+#endif
+
+#ifdef __DISABLE_SSE4_2__
+#undef __DISABLE_SSE4_2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_2__ */
+
+#ifdef __DISABLE_SSE4_1__
+#undef __DISABLE_SSE4_1__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_1__ */
+
+#endif /* _SMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/sol2-9.h b/gcc-4.9/gcc/config/i386/sol2-9.h
new file mode 100644
index 000000000..9ae88aae5
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/sol2-9.h
@@ -0,0 +1,23 @@
+/* Target definitions for GCC for Intel 80386 running Solaris 9
+   Copyright (C) 2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Solaris 9 only guarantees 4-byte stack alignment as required by the i386
+   psABI, so realign it as necessary for SSE instructions.  */
+#undef STACK_REALIGN_DEFAULT
+#define STACK_REALIGN_DEFAULT 1
diff --git a/gcc-4.9/gcc/config/i386/sol2-bi.h b/gcc-4.9/gcc/config/i386/sol2-bi.h
new file mode 100644
index 000000000..66d17801f
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/sol2-bi.h
@@ -0,0 +1,109 @@
+/* Definitions of target machine for GCC, for bi-arch Solaris 2/x86.
+   Copyright (C) 2004-2014 Free Software Foundation, Inc.
+   Contributed by CodeSourcery, LLC.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Override i386/sol2.h version: return 8-byte vectors in MMX registers if
+   possible, matching Sun Studio 12 Update 1+ compilers and other x86
+   targets.  */
+#undef TARGET_SUBTARGET_DEFAULT
+#define TARGET_SUBTARGET_DEFAULT \
+	(MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS)
+
+#define SUBTARGET_OPTIMIZATION_OPTIONS				\
+  { OPT_LEVELS_1_PLUS, OPT_momit_leaf_frame_pointer, NULL, 1 }
+
+/* GNU as understands --32 and --64, but the native Solaris
+   assembler requires -xarch=generic or -xarch=generic64 instead.  */
+#ifdef USE_GAS
+#define ASM_CPU32_DEFAULT_SPEC "--32"
+#define ASM_CPU64_DEFAULT_SPEC "--64"
+#else
+#define ASM_CPU32_DEFAULT_SPEC "-xarch=generic"
+#define ASM_CPU64_DEFAULT_SPEC "-xarch=generic64"
+#endif
+
+#undef ASM_CPU_SPEC
+#define ASM_CPU_SPEC "%(asm_cpu_default)"
+
+/* Don't let i386/x86-64.h override i386/sol2.h version.  Still cannot use
+   -K PIC with the Solaris 10+ assembler, it gives many warnings:
+	Absolute relocation is used for symbol "<symbol>"  */
+#undef ASM_SPEC
+#define ASM_SPEC ASM_SPEC_BASE
+
+/* We do not need to search a special directory for startup files.  */
+#undef MD_STARTFILE_PREFIX
+
+#define DEFAULT_ARCH32_P !TARGET_64BIT_DEFAULT
+
+#define ARCH64_SUBDIR "amd64"
+
+#ifdef USE_GLD
+/* Since binutils 2.21, GNU ld supports new *_sol2 emulations to strictly
+   follow the Solaris 2 ABI.  Prefer them if present.  */
+#ifdef HAVE_LD_SOL2_EMULATION
+#define ARCH32_EMULATION "elf_i386_sol2"
+#define ARCH64_EMULATION "elf_x86_64_sol2"
+#else
+#define ARCH32_EMULATION "elf_i386"
+#define ARCH64_EMULATION "elf_x86_64"
+#endif
+#endif
+
+#undef ASM_COMMENT_START
+#define ASM_COMMENT_START "/"
+
+/* The native Solaris assembler can't calculate the difference between
+   symbols in different sections, which causes problems for -fPIC jump
+   tables in .rodata.  */
+#ifndef HAVE_AS_IX86_DIFF_SECT_DELTA
+#undef JUMP_TABLES_IN_TEXT_SECTION
+#define JUMP_TABLES_IN_TEXT_SECTION 1
+
+/* The native Solaris assembler cannot handle the SYMBOL-. syntax, but
+   requires SYMBOL@rel/@rel64 instead.  */
+#define ASM_OUTPUT_DWARF_PCREL(FILE, SIZE, LABEL)	\
+  do {							\
+    fputs (integer_asm_op (SIZE, FALSE), FILE);		\
+    assemble_name (FILE, LABEL);			\
+    fputs (SIZE == 8 ? "@rel64" : "@rel", FILE);	\
+  } while (0)
+#endif
+
+/* As in sol2.h, override the default from i386/x86-64.h to work around
+   Sun as TLS bug.  */
+#undef  ASM_OUTPUT_ALIGNED_COMMON
+#define ASM_OUTPUT_ALIGNED_COMMON(FILE, NAME, SIZE, ALIGN)		\
+  do									\
+    {									\
+      if (TARGET_SUN_TLS						\
+	  && in_section							\
+	  && ((in_section->common.flags & SECTION_TLS) == SECTION_TLS))	\
+	switch_to_section (bss_section);				\
+      x86_elf_aligned_common (FILE, NAME, SIZE, ALIGN);			\
+    }									\
+  while  (0)
+
+#define USE_IX86_FRAME_POINTER 1
+#define USE_X86_64_FRAME_POINTER 1
+
+#undef NO_PROFILE_COUNTERS
+
+#undef MCOUNT_NAME
+#define MCOUNT_NAME "_mcount"
diff --git a/gcc-4.9/gcc/config/i386/sol2.h b/gcc-4.9/gcc/config/i386/sol2.h
new file mode 100644
index 000000000..8a21a5910
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/sol2.h
@@ -0,0 +1,189 @@
+/* Target definitions for GCC for Intel 80386 running Solaris 2
+   Copyright (C) 1993-2014 Free Software Foundation, Inc.
+   Contributed by Fred Fish (fnf@cygnus.com).
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Augment i386/unix.h version to return 8-byte vectors in memory, matching
+   Sun Studio compilers until version 12, the only ones supported on
+   Solaris 9.  */
+#undef TARGET_SUBTARGET_DEFAULT
+#define TARGET_SUBTARGET_DEFAULT \
+	(MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_VECT8_RETURNS)
+
+/* Old versions of the Solaris assembler can not handle the difference of
+   labels in different sections, so force DW_EH_PE_datarel.  */
+#undef ASM_PREFERRED_EH_DATA_FORMAT
+#define ASM_PREFERRED_EH_DATA_FORMAT(CODE,GLOBAL)			\
+  (flag_pic ? ((GLOBAL ? DW_EH_PE_indirect : 0)				\
+	       | (TARGET_64BIT ? DW_EH_PE_pcrel | DW_EH_PE_sdata4	\
+		  : DW_EH_PE_datarel))					\
+   : DW_EH_PE_absptr)
+
+/* The Solaris linker will not merge a read-only .eh_frame section
+   with a read-write .eh_frame section.  None of the encodings used
+   with non-PIC code require runtime relocations.  In 64-bit mode,
+   since there is no backwards compatibility issue, we use a read-only
+   section for .eh_frame.  In 32-bit mode, we use a writable .eh_frame
+   section in order to be compatible with G++ for Solaris x86.  */
+#undef EH_TABLES_CAN_BE_READ_ONLY
+#define EH_TABLES_CAN_BE_READ_ONLY (TARGET_64BIT)
+
+/* Follow Sun requirements for TLS code sequences and use Sun assembler TLS
+   syntax.  */
+#undef TARGET_SUN_TLS
+#define TARGET_SUN_TLS 1
+
+#undef  SIZE_TYPE
+#define SIZE_TYPE "unsigned int"
+
+#undef  PTRDIFF_TYPE
+#define PTRDIFF_TYPE "int"
+
+/* Solaris 2/Intel as chokes on #line directives before Solaris 10.  */
+#undef CPP_SPEC
+#define CPP_SPEC "%{,assembler-with-cpp:-P} %(cpp_subtarget)"
+
+#define ASM_CPU_DEFAULT_SPEC ""
+
+#define ASM_CPU_SPEC ""
+ 
+/* Don't include ASM_PIC_SPEC.  While the Solaris 9 assembler accepts
+   -K PIC, it gives many warnings:
+	R_386_32 relocation is used for symbol "<symbol>"
+   GNU as doesn't recognize -K at all.  */
+#undef ASM_SPEC
+#define ASM_SPEC ASM_SPEC_BASE
+
+#undef  ENDFILE_SPEC
+#define ENDFILE_SPEC \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+   %{mpc32:crtprec32.o%s} \
+   %{mpc64:crtprec64.o%s} \
+   %{mpc80:crtprec80.o%s} \
+   crtend.o%s crtn.o%s"
+
+#define SUBTARGET_CPU_EXTRA_SPECS \
+  { "cpp_subtarget",	 CPP_SUBTARGET_SPEC },		\
+  { "asm_cpu",		 ASM_CPU_SPEC },		\
+  { "asm_cpu_default",	 ASM_CPU_DEFAULT_SPEC },	\
+
+#undef SUBTARGET_EXTRA_SPECS
+#define SUBTARGET_EXTRA_SPECS \
+  { "startfile_arch",	STARTFILE_ARCH_SPEC },		\
+  { "link_arch",	LINK_ARCH_SPEC },		\
+  SUBTARGET_CPU_EXTRA_SPECS
+
+/* Register the Solaris-specific #pragma directives.  */
+#define REGISTER_SUBTARGET_PRAGMAS() solaris_register_pragmas ()
+
+#undef LOCAL_LABEL_PREFIX
+#define LOCAL_LABEL_PREFIX "."
+
+/* The 32-bit Solaris assembler does not support .quad.  Do not use it.  */
+#ifndef HAVE_AS_IX86_QUAD
+#undef ASM_QUAD
+#endif
+
+/* The Solaris assembler wants a .local for non-exported aliases.  */
+#define ASM_OUTPUT_DEF_FROM_DECLS(FILE, DECL, TARGET)	\
+  do {							\
+    const char *declname =				\
+      IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (DECL));	\
+    ASM_OUTPUT_DEF ((FILE), declname,			\
+		    IDENTIFIER_POINTER (TARGET));	\
+    if (! TREE_PUBLIC (DECL))				\
+      {							\
+	fprintf ((FILE), "%s", LOCAL_ASM_OP);		\
+	assemble_name ((FILE), declname);		\
+	fprintf ((FILE), "\n");				\
+      }							\
+  } while (0)
+
+#ifndef USE_GAS
+/* The Sun assembler uses .tcomm for TLS common sections.  */
+#define TLS_COMMON_ASM_OP ".tcomm"
+
+/* Similar to the Sun assembler on SPARC, the native assembler requires
+   TLS objects to be declared as @tls_obj (not @tls_object).  Unlike SPARC,
+   gas doesn't understand this variant.  */
+#undef  ASM_DECLARE_OBJECT_NAME
+#define ASM_DECLARE_OBJECT_NAME(FILE, NAME, DECL)		\
+  do								\
+    {								\
+      HOST_WIDE_INT size;					\
+								\
+      if (targetm.have_tls && DECL_THREAD_LOCAL_P (DECL))	\
+	ASM_OUTPUT_TYPE_DIRECTIVE (FILE, NAME, "tls_obj");	\
+      else							\
+	ASM_OUTPUT_TYPE_DIRECTIVE (FILE, NAME, "object");	\
+								\
+      size_directive_output = 0;				\
+      if (!flag_inhibit_size_directive				\
+	  && (DECL) && DECL_SIZE (DECL))			\
+	{							\
+	  size_directive_output = 1;				\
+	  size = int_size_in_bytes (TREE_TYPE (DECL));		\
+	  ASM_OUTPUT_SIZE_DIRECTIVE (FILE, NAME, size);		\
+	}							\
+								\
+      ASM_OUTPUT_LABEL (FILE, NAME);				\
+    }								\
+  while (0)
+#endif /* !USE_GAS */
+
+/* Output a simple call for .init/.fini.  */
+#define ASM_OUTPUT_CALL(FILE, FN)				\
+  do								\
+    {								\
+      fprintf (FILE, "\tcall\t");				\
+      ix86_print_operand (FILE, XEXP (DECL_RTL (FN), 0), 'P');	\
+      fprintf (FILE, "\n");					\
+    }								\
+  while (0)
+
+#undef TARGET_ASM_NAMED_SECTION
+#define TARGET_ASM_NAMED_SECTION i386_solaris_elf_named_section
+
+#ifndef USE_GAS
+/* Emit COMDAT group signature symbols for Sun as.  */
+#undef TARGET_ASM_FILE_END
+#define TARGET_ASM_FILE_END solaris_file_end
+#endif
+
+/* Unlike GNU ld, Sun ld doesn't coalesce .ctors.N/.dtors.N sections, so
+   inhibit their creation.  Also cf. sparc/sysv4.h.  */
+#ifndef USE_GLD
+#define CTORS_SECTION_ASM_OP	"\t.section\t.ctors, \"aw\""
+#define DTORS_SECTION_ASM_OP	"\t.section\t.dtors, \"aw\""
+#endif
+
+/* We do not need NT_VERSION notes.  */
+#undef X86_FILE_START_VERSION_DIRECTIVE
+#define X86_FILE_START_VERSION_DIRECTIVE false
+
+/* Only recent versions of Solaris 11 ld properly support hidden .gnu.linkonce
+   sections, so don't use them.  */
+#ifndef USE_GLD
+#define USE_HIDDEN_LINKONCE 0
+#endif
+
+/* Put all *tf routines in libgcc.  */
+#undef LIBGCC2_HAS_TF_MODE
+#define LIBGCC2_HAS_TF_MODE 1
+#define LIBGCC2_TF_CEXT q
+#define TF_SIZE 113
diff --git a/gcc-4.9/gcc/config/i386/sse.md b/gcc-4.9/gcc/config/i386/sse.md
new file mode 100644
index 000000000..f30b27e86
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/sse.md
@@ -0,0 +1,15507 @@
+;; GCC machine description for SSE instructions
+;; Copyright (C) 2005-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_c_enum "unspec" [
+  ;; SSE
+  UNSPEC_MOVNT
+  UNSPEC_LOADU
+  UNSPEC_STOREU
+
+  ;; SSE3
+  UNSPEC_LDDQU
+
+  ;; SSSE3
+  UNSPEC_PSHUFB
+  UNSPEC_PSIGN
+  UNSPEC_PALIGNR
+
+  ;; For SSE4A support
+  UNSPEC_EXTRQI
+  UNSPEC_EXTRQ
+  UNSPEC_INSERTQI
+  UNSPEC_INSERTQ
+
+  ;; For SSE4.1 support
+  UNSPEC_BLENDV
+  UNSPEC_INSERTPS
+  UNSPEC_DP
+  UNSPEC_MOVNTDQA
+  UNSPEC_MPSADBW
+  UNSPEC_PHMINPOSUW
+  UNSPEC_PTEST
+
+  ;; For SSE4.2 support
+  UNSPEC_PCMPESTR
+  UNSPEC_PCMPISTR
+
+  ;; For FMA4 support
+  UNSPEC_FMADDSUB
+  UNSPEC_XOP_UNSIGNED_CMP
+  UNSPEC_XOP_TRUEFALSE
+  UNSPEC_XOP_PERMUTE
+  UNSPEC_FRCZ
+
+  ;; For AES support
+  UNSPEC_AESENC
+  UNSPEC_AESENCLAST
+  UNSPEC_AESDEC
+  UNSPEC_AESDECLAST
+  UNSPEC_AESIMC
+  UNSPEC_AESKEYGENASSIST
+
+  ;; For PCLMUL support
+  UNSPEC_PCLMUL
+
+  ;; For AVX support
+  UNSPEC_PCMP
+  UNSPEC_VPERMIL
+  UNSPEC_VPERMIL2
+  UNSPEC_VPERMIL2F128
+  UNSPEC_CAST
+  UNSPEC_VTESTP
+  UNSPEC_VCVTPH2PS
+  UNSPEC_VCVTPS2PH
+
+  ;; For AVX2 support
+  UNSPEC_VPERMVAR
+  UNSPEC_VPERMTI
+  UNSPEC_GATHER
+  UNSPEC_VSIBADDR
+
+  ;; For AVX512F support
+  UNSPEC_VPERMI2
+  UNSPEC_VPERMT2
+  UNSPEC_VPERMI2_MASK
+  UNSPEC_UNSIGNED_FIX_NOTRUNC
+  UNSPEC_UNSIGNED_PCMP
+  UNSPEC_TESTM
+  UNSPEC_TESTNM
+  UNSPEC_SCATTER
+  UNSPEC_RCP14
+  UNSPEC_RSQRT14
+  UNSPEC_FIXUPIMM
+  UNSPEC_SCALEF
+  UNSPEC_VTERNLOG
+  UNSPEC_GETEXP
+  UNSPEC_GETMANT
+  UNSPEC_ALIGN
+  UNSPEC_CONFLICT
+  UNSPEC_COMPRESS
+  UNSPEC_COMPRESS_STORE
+  UNSPEC_EXPAND
+  UNSPEC_MASKED_EQ
+  UNSPEC_MASKED_GT
+
+  ;; For embed. rounding feature
+  UNSPEC_EMBEDDED_ROUNDING
+
+  ;; For AVX512PF support
+  UNSPEC_GATHER_PREFETCH
+  UNSPEC_SCATTER_PREFETCH
+
+  ;; For AVX512ER support
+  UNSPEC_EXP2
+  UNSPEC_RCP28
+  UNSPEC_RSQRT28
+
+  ;; For SHA support
+  UNSPEC_SHA1MSG1
+  UNSPEC_SHA1MSG2
+  UNSPEC_SHA1NEXTE
+  UNSPEC_SHA1RNDS4
+  UNSPEC_SHA256MSG1
+  UNSPEC_SHA256MSG2
+  UNSPEC_SHA256RNDS2
+])
+
+(define_c_enum "unspecv" [
+  UNSPECV_LDMXCSR
+  UNSPECV_STMXCSR
+  UNSPECV_CLFLUSH
+  UNSPECV_MONITOR
+  UNSPECV_MWAIT
+  UNSPECV_VZEROALL
+  UNSPECV_VZEROUPPER
+])
+
+;; All vector modes including V?TImode, used in move patterns.
+(define_mode_iterator VMOVE
+  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
+   (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
+   (V8DI "TARGET_AVX512F")  (V4DI "TARGET_AVX") V2DI
+   (V2TI "TARGET_AVX") V1TI
+   (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F")  (V4DF "TARGET_AVX") V2DF])
+
+;; All vector modes
+(define_mode_iterator V
+  [(V32QI "TARGET_AVX") V16QI
+   (V16HI "TARGET_AVX") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
+   (V8DI "TARGET_AVX512F")  (V4DI "TARGET_AVX") V2DI
+   (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F")  (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
+
+;; All 128bit vector modes
+(define_mode_iterator V_128
+  [V16QI V8HI V4SI V2DI V4SF (V2DF "TARGET_SSE2")])
+
+;; All 256bit vector modes
+(define_mode_iterator V_256
+  [V32QI V16HI V8SI V4DI V8SF V4DF])
+
+;; All 512bit vector modes
+(define_mode_iterator V_512 [V64QI V32HI V16SI V8DI V16SF V8DF])
+
+;; All 256bit and 512bit vector modes
+(define_mode_iterator V_256_512
+  [V32QI V16HI V8SI V4DI V8SF V4DF
+   (V64QI "TARGET_AVX512F") (V32HI "TARGET_AVX512F") (V16SI "TARGET_AVX512F")
+   (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+
+;; All vector float modes
+(define_mode_iterator VF
+  [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
+
+;; 128- and 256-bit float vector modes
+(define_mode_iterator VF_128_256
+  [(V8SF "TARGET_AVX") V4SF
+   (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
+
+;; All SFmode vector float modes
+(define_mode_iterator VF1
+  [(V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF])
+
+;; 128- and 256-bit SF vector modes
+(define_mode_iterator VF1_128_256
+  [(V8SF "TARGET_AVX") V4SF])
+
+;; All DFmode vector float modes
+(define_mode_iterator VF2
+  [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF])
+
+;; 128- and 256-bit DF vector modes
+(define_mode_iterator VF2_128_256
+  [(V4DF "TARGET_AVX") V2DF])
+
+(define_mode_iterator VF2_512_256
+  [(V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX")])
+
+;; All 128bit vector float modes
+(define_mode_iterator VF_128
+  [V4SF (V2DF "TARGET_SSE2")])
+
+;; All 256bit vector float modes
+(define_mode_iterator VF_256
+  [V8SF V4DF])
+
+;; All 512bit vector float modes
+(define_mode_iterator VF_512
+  [V16SF V8DF])
+
+;; All vector integer modes
+(define_mode_iterator VI
+  [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+   (V32QI "TARGET_AVX") V16QI
+   (V16HI "TARGET_AVX") V8HI
+   (V8SI "TARGET_AVX") V4SI
+   (V4DI "TARGET_AVX") V2DI])
+
+(define_mode_iterator VI_AVX2
+  [(V32QI "TARGET_AVX2") V16QI
+   (V16HI "TARGET_AVX2") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
+
+;; All QImode vector integer modes
+(define_mode_iterator VI1
+  [(V32QI "TARGET_AVX") V16QI])
+
+(define_mode_iterator VI_UNALIGNED_LOADSTORE
+  [(V32QI "TARGET_AVX") V16QI
+   (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+
+;; All DImode vector integer modes
+(define_mode_iterator VI8
+  [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI])
+
+(define_mode_iterator VI1_AVX2
+  [(V32QI "TARGET_AVX2") V16QI])
+
+(define_mode_iterator VI2_AVX2
+  [(V16HI "TARGET_AVX2") V8HI])
+
+(define_mode_iterator VI2_AVX512F
+  [(V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI])
+
+(define_mode_iterator VI4_AVX
+  [(V8SI "TARGET_AVX") V4SI])
+
+(define_mode_iterator VI4_AVX2
+  [(V8SI "TARGET_AVX2") V4SI])
+
+(define_mode_iterator VI4_AVX512F
+  [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI])
+
+(define_mode_iterator VI48_AVX512F
+  [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
+   (V8DI "TARGET_AVX512F")])
+
+(define_mode_iterator VI8_AVX2
+  [(V4DI "TARGET_AVX2") V2DI])
+
+(define_mode_iterator VI8_AVX2_AVX512F
+  [(V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
+
+;; All V8D* modes
+(define_mode_iterator V8FI
+  [V8DF V8DI])
+
+;; All V16S* modes
+(define_mode_iterator V16FI
+  [V16SF V16SI])
+
+;; ??? We should probably use TImode instead.
+(define_mode_iterator VIMAX_AVX2
+  [(V2TI "TARGET_AVX2") V1TI])
+
+;; ??? This should probably be dropped in favor of VIMAX_AVX2.
+(define_mode_iterator SSESCALARMODE
+  [(V2TI "TARGET_AVX2") TI])
+
+(define_mode_iterator VI12_AVX2
+  [(V32QI "TARGET_AVX2") V16QI
+   (V16HI "TARGET_AVX2") V8HI])
+
+(define_mode_iterator VI24_AVX2
+  [(V16HI "TARGET_AVX2") V8HI
+   (V8SI "TARGET_AVX2") V4SI])
+
+(define_mode_iterator VI124_AVX2_48_AVX512F
+  [(V32QI "TARGET_AVX2") V16QI
+   (V16HI "TARGET_AVX2") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
+   (V8DI "TARGET_AVX512F")])
+
+(define_mode_iterator VI124_AVX512F
+  [(V32QI "TARGET_AVX2") V16QI
+   (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX2") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI])
+
+(define_mode_iterator VI124_AVX2
+  [(V32QI "TARGET_AVX2") V16QI
+   (V16HI "TARGET_AVX2") V8HI
+   (V8SI "TARGET_AVX2") V4SI])
+
+(define_mode_iterator VI248_AVX2
+  [(V16HI "TARGET_AVX2") V8HI
+   (V8SI "TARGET_AVX2") V4SI
+   (V4DI "TARGET_AVX2") V2DI])
+
+(define_mode_iterator VI248_AVX2_8_AVX512F
+  [(V16HI "TARGET_AVX2") V8HI
+   (V8SI "TARGET_AVX2") V4SI
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
+
+(define_mode_iterator VI48_AVX2_48_AVX512F
+  [(V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX2") V4SI
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX2") V2DI])
+
+(define_mode_iterator V48_AVX2
+  [V4SF V2DF
+   V8SF V4DF
+   (V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2")
+   (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")])
+
+(define_mode_attr sse2_avx_avx512f
+  [(V16QI "sse2") (V32QI "avx") (V64QI "avx512f")
+   (V4SI  "sse2") (V8SI  "avx") (V16SI "avx512f")
+   (V8DI "avx512f")
+   (V16SF "avx512f") (V8SF "avx") (V4SF "avx")
+   (V8DF "avx512f") (V4DF "avx") (V2DF "avx")])
+
+(define_mode_attr sse2_avx2
+  [(V16QI "sse2") (V32QI "avx2")
+   (V8HI "sse2") (V16HI "avx2")
+   (V4SI "sse2") (V8SI "avx2") (V16SI "avx512f")
+   (V2DI "sse2") (V4DI "avx2") (V8DI "avx512f")
+   (V1TI "sse2") (V2TI "avx2")])
+
+(define_mode_attr ssse3_avx2
+   [(V16QI "ssse3") (V32QI "avx2")
+    (V4HI "ssse3") (V8HI "ssse3") (V16HI "avx2")
+    (V4SI "ssse3") (V8SI "avx2")
+    (V2DI "ssse3") (V4DI "avx2")
+    (TI "ssse3") (V2TI "avx2")])
+
+(define_mode_attr sse4_1_avx2
+   [(V16QI "sse4_1") (V32QI "avx2")
+    (V8HI "sse4_1") (V16HI "avx2")
+    (V4SI "sse4_1") (V8SI "avx2") (V16SI "avx512f")
+    (V2DI "sse4_1") (V4DI "avx2") (V8DI "avx512f")])
+
+(define_mode_attr avx_avx2
+  [(V4SF "avx") (V2DF "avx")
+   (V8SF "avx") (V4DF "avx")
+   (V4SI "avx2") (V2DI "avx2")
+   (V8SI "avx2") (V4DI "avx2")])
+
+(define_mode_attr vec_avx2
+  [(V16QI "vec") (V32QI "avx2")
+   (V8HI "vec") (V16HI "avx2")
+   (V4SI "vec") (V8SI "avx2")
+   (V2DI "vec") (V4DI "avx2")])
+
+(define_mode_attr avx2_avx512f
+  [(V4SI "avx2") (V8SI "avx2") (V16SI "avx512f")
+   (V2DI "avx2") (V4DI "avx2") (V8DI "avx512f")
+   (V8SF "avx2") (V16SF "avx512f")
+   (V4DF "avx2") (V8DF "avx512f")])
+
+(define_mode_attr shuffletype
+  [(V16SF "f") (V16SI "i") (V8DF "f") (V8DI "i")
+  (V8SF "f") (V8SI "i") (V4DF "f") (V4DI "i")
+  (V4SF "f") (V4SI "i") (V2DF "f") (V2DI "i")
+  (V32QI "i") (V16HI "u") (V16QI "i") (V8HI "i")
+  (V64QI "i") (V1TI "i") (V2TI "i")])
+
+(define_mode_attr ssequartermode
+  [(V16SF "V4SF") (V8DF "V2DF") (V16SI "V4SI") (V8DI "V2DI")])
+
+(define_mode_attr ssedoublemode
+  [(V16SF "V32SF") (V16SI "V32SI") (V8DI "V16DI") (V8DF "V16DF")
+   (V16HI "V16SI") (V8HI "V8SI") (V4HI "V4SI")
+   (V32QI "V32HI") (V16QI "V16HI")])
+
+(define_mode_attr ssebytemode
+  [(V4DI "V32QI") (V2DI "V16QI")])
+
+;; All 128bit vector integer modes
+(define_mode_iterator VI_128 [V16QI V8HI V4SI V2DI])
+
+;; All 256bit vector integer modes
+(define_mode_iterator VI_256 [V32QI V16HI V8SI V4DI])
+
+;; All 512bit vector integer modes
+(define_mode_iterator VI_512 [V64QI V32HI V16SI V8DI])
+
+;; Various 128bit vector integer mode combinations
+(define_mode_iterator VI12_128 [V16QI V8HI])
+(define_mode_iterator VI14_128 [V16QI V4SI])
+(define_mode_iterator VI124_128 [V16QI V8HI V4SI])
+(define_mode_iterator VI128_128 [V16QI V8HI V2DI])
+(define_mode_iterator VI24_128 [V8HI V4SI])
+(define_mode_iterator VI248_128 [V8HI V4SI V2DI])
+(define_mode_iterator VI48_128 [V4SI V2DI])
+
+;; Various 256bit and 512 vector integer mode combinations
+(define_mode_iterator VI124_256_48_512
+  [V32QI V16HI V8SI (V8DI "TARGET_AVX512F") (V16SI "TARGET_AVX512F")])
+(define_mode_iterator VI48_256 [V8SI V4DI])
+(define_mode_iterator VI48_512 [V16SI V8DI])
+(define_mode_iterator VI4_256_8_512 [V8SI V8DI])
+
+;; Int-float size matches
+(define_mode_iterator VI4F_128 [V4SI V4SF])
+(define_mode_iterator VI8F_128 [V2DI V2DF])
+(define_mode_iterator VI4F_256 [V8SI V8SF])
+(define_mode_iterator VI8F_256 [V4DI V4DF])
+(define_mode_iterator VI8F_256_512
+  [V4DI V4DF (V8DI "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+(define_mode_iterator VI48F_256_512
+  [V8SI V8SF
+  (V16SI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
+  (V8DI  "TARGET_AVX512F") (V8DF  "TARGET_AVX512F")])
+(define_mode_iterator VI48F_512 [V16SI V16SF V8DI V8DF])
+
+;; Mapping from float mode to required SSE level
+(define_mode_attr sse
+  [(SF "sse") (DF "sse2")
+   (V4SF "sse") (V2DF "sse2")
+   (V16SF "avx512f") (V8SF "avx")
+   (V8DF "avx512f") (V4DF "avx")])
+
+(define_mode_attr sse2
+  [(V16QI "sse2") (V32QI "avx") (V64QI "avx512f")
+   (V2DI "sse2") (V4DI "avx") (V8DI "avx512f")])
+
+(define_mode_attr sse3
+  [(V16QI "sse3") (V32QI "avx")])
+
+(define_mode_attr sse4_1
+  [(V4SF "sse4_1") (V2DF "sse4_1")
+   (V8SF "avx") (V4DF "avx")
+   (V8DF "avx512f")])
+
+(define_mode_attr avxsizesuffix
+  [(V64QI "512") (V32HI "512") (V16SI "512") (V8DI "512")
+   (V32QI "256") (V16HI "256") (V8SI "256") (V4DI "256")
+   (V16QI "") (V8HI "") (V4SI "") (V2DI "")
+   (V16SF "512") (V8DF "512")
+   (V8SF "256") (V4DF "256")
+   (V4SF "") (V2DF "")])
+
+;; SSE instruction mode
+(define_mode_attr sseinsnmode
+  [(V64QI "XI") (V32HI "XI") (V16SI "XI") (V8DI "XI")
+   (V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI") (V2TI "OI")
+   (V16QI "TI") (V8HI "TI") (V4SI "TI") (V2DI "TI") (V1TI "TI")
+   (V16SF "V16SF") (V8DF "V8DF")
+   (V8SF "V8SF") (V4DF "V4DF")
+   (V4SF "V4SF") (V2DF "V2DF")
+   (TI "TI")])
+
+;; Mapping of vector modes to corresponding mask size
+(define_mode_attr avx512fmaskmode
+  [(V16QI "HI")
+   (V16HI "HI") (V8HI  "QI")
+   (V16SI "HI") (V8SI  "QI") (V4SI  "QI")
+   (V8DI  "QI") (V4DI  "QI") (V2DI  "QI")
+   (V16SF "HI") (V8SF  "QI") (V4SF  "QI")
+   (V8DF  "QI") (V4DF  "QI") (V2DF  "QI")])
+
+;; Mapping of vector float modes to an integer mode of the same size
+(define_mode_attr sseintvecmode
+  [(V16SF "V16SI") (V8DF  "V8DI")
+   (V8SF  "V8SI")  (V4DF  "V4DI")
+   (V4SF  "V4SI")  (V2DF  "V2DI")
+   (V16SI "V16SI") (V8DI  "V8DI")
+   (V8SI  "V8SI")  (V4DI  "V4DI")
+   (V4SI  "V4SI")  (V2DI  "V2DI")
+   (V16HI "V16HI") (V8HI  "V8HI")
+   (V32QI "V32QI") (V16QI "V16QI")])
+
+(define_mode_attr sseintvecmodelower
+  [(V16SF "v16si")
+   (V8SF "v8si") (V4DF "v4di")
+   (V4SF "v4si") (V2DF "v2di")
+   (V8SI "v8si") (V4DI "v4di")
+   (V4SI "v4si") (V2DI "v2di")
+   (V16HI "v16hi") (V8HI "v8hi")
+   (V32QI "v32qi") (V16QI "v16qi")])
+
+;; Mapping of vector modes to a vector mode of double size
+(define_mode_attr ssedoublevecmode
+  [(V32QI "V64QI") (V16HI "V32HI") (V8SI "V16SI") (V4DI "V8DI")
+   (V16QI "V32QI") (V8HI "V16HI") (V4SI "V8SI") (V2DI "V4DI")
+   (V8SF "V16SF") (V4DF "V8DF")
+   (V4SF "V8SF") (V2DF "V4DF")])
+
+;; Mapping of vector modes to a vector mode of half size
+(define_mode_attr ssehalfvecmode
+  [(V64QI "V32QI") (V32HI "V16HI") (V16SI "V8SI") (V8DI "V4DI")
+   (V32QI "V16QI") (V16HI  "V8HI") (V8SI  "V4SI") (V4DI "V2DI")
+   (V16QI  "V8QI") (V8HI   "V4HI") (V4SI  "V2SI")
+   (V16SF "V8SF") (V8DF "V4DF")
+   (V8SF  "V4SF") (V4DF "V2DF")
+   (V4SF  "V2SF")])
+
+;; Mapping of vector modes ti packed single mode of the same size
+(define_mode_attr ssePSmode
+  [(V16SI "V16SF") (V8DF "V16SF")
+   (V16SF "V16SF") (V8DI "V16SF")
+   (V64QI "V16SF") (V32QI "V8SF") (V16QI "V4SF")
+   (V32HI "V16SF") (V16HI "V8SF") (V8HI "V4SF")
+   (V8SI "V8SF") (V4SI "V4SF")
+   (V4DI "V8SF") (V2DI "V4SF")
+   (V2TI "V8SF") (V1TI "V4SF")
+   (V8SF "V8SF") (V4SF "V4SF")
+   (V4DF "V8SF") (V2DF "V4SF")])
+
+;; Mapping of vector modes back to the scalar modes
+(define_mode_attr ssescalarmode
+  [(V64QI "QI") (V32QI "QI") (V16QI "QI")
+   (V32HI "HI") (V16HI "HI") (V8HI "HI")
+   (V16SI "SI") (V8SI "SI")  (V4SI "SI")
+   (V8DI "DI")  (V4DI "DI")  (V2DI "DI")
+   (V16SF "SF") (V8SF "SF")  (V4SF "SF")
+   (V8DF "DF")  (V4DF "DF")  (V2DF "DF")])
+
+;; Mapping of vector modes to the 128bit modes
+(define_mode_attr ssexmmmode
+  [(V64QI "V16QI") (V32QI "V16QI") (V16QI "V16QI")
+   (V32HI "V8HI")  (V16HI "V8HI") (V8HI "V8HI")
+   (V16SI "V4SI")  (V8SI "V4SI")  (V4SI "V4SI")
+   (V8DI "V2DI")   (V4DI "V2DI")  (V2DI "V2DI")
+   (V16SF "V4SF")  (V8SF "V4SF")  (V4SF "V4SF")
+   (V8DF "V2DF")   (V4DF "V2DF")  (V2DF "V2DF")])
+
+;; Pointer size override for scalar modes (Intel asm dialect)
+(define_mode_attr iptr
+  [(V32QI "b") (V16HI "w") (V8SI "k") (V4DI "q")
+   (V16QI "b") (V8HI "w") (V4SI "k") (V2DI "q")
+   (V8SF "k") (V4DF "q")
+   (V4SF "k") (V2DF "q")
+   (SF "k") (DF "q")])
+
+;; Number of scalar elements in each vector type
+(define_mode_attr ssescalarnum
+  [(V64QI "64") (V16SI "16") (V8DI "8")
+   (V32QI "32") (V16HI "16") (V8SI "8") (V4DI "4")
+   (V16QI "16") (V8HI "8") (V4SI "4") (V2DI "2")
+   (V16SF "16") (V8DF "8")
+   (V8SF "8") (V4DF "4")
+   (V4SF "4") (V2DF "2")])
+
+;; Mask of scalar elements in each vector type
+(define_mode_attr ssescalarnummask
+  [(V32QI "31") (V16HI "15") (V8SI "7") (V4DI "3")
+   (V16QI "15") (V8HI "7") (V4SI "3") (V2DI "1")
+   (V8SF "7") (V4DF "3")
+   (V4SF "3") (V2DF "1")])
+
+(define_mode_attr ssescalarsize
+  [(V8DI  "64") (V4DI  "64") (V2DI  "64")
+   (V32HI "16") (V16HI "16") (V8HI "16")
+   (V16SI "32") (V8SI "32") (V4SI "32")
+   (V16SF "32") (V8DF "64")])
+
+;; SSE prefix for integer vector modes
+(define_mode_attr sseintprefix
+  [(V2DI  "p") (V2DF  "")
+   (V4DI  "p") (V4DF  "")
+   (V8DI  "p") (V8DF  "")
+   (V4SI  "p") (V4SF  "")
+   (V8SI  "p") (V8SF  "")
+   (V16SI "p") (V16SF "")])
+
+;; SSE scalar suffix for vector modes
+(define_mode_attr ssescalarmodesuffix
+  [(SF "ss") (DF "sd")
+   (V8SF "ss") (V4DF "sd")
+   (V4SF "ss") (V2DF "sd")
+   (V8SI "ss") (V4DI "sd")
+   (V4SI "d")])
+
+;; Pack/unpack vector modes
+(define_mode_attr sseunpackmode
+  [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI")
+   (V32QI "V16HI") (V16HI "V8SI") (V8SI "V4DI")
+   (V32HI "V16SI") (V64QI "V32HI") (V16SI "V8DI")])
+
+(define_mode_attr ssepackmode
+  [(V8HI "V16QI") (V4SI "V8HI") (V2DI "V4SI")
+   (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
+   (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
+
+;; Mapping of the max integer size for xop rotate immediate constraint
+(define_mode_attr sserotatemax
+  [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
+
+;; Mapping of mode to cast intrinsic name
+(define_mode_attr castmode [(V8SI "si") (V8SF "ps") (V4DF "pd")])
+
+;; Instruction suffix for sign and zero extensions.
+(define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")])
+
+;; i128 for integer vectors and TARGET_AVX2, f128 otherwise.
+;; i64x4 or f64x4 for 512bit modes.
+(define_mode_attr i128
+  [(V16SF "f64x4") (V8SF "f128") (V8DF "f64x4") (V4DF "f128")
+   (V64QI "i64x4") (V32QI "%~128") (V32HI "i64x4") (V16HI "%~128")
+   (V16SI "i64x4") (V8SI "%~128") (V8DI "i64x4") (V4DI "%~128")])
+
+;; Mix-n-match
+(define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
+
+;; Mapping of immediate bits for blend instructions
+(define_mode_attr blendbits
+  [(V8SF "255") (V4SF "15") (V4DF "15") (V2DF "3")])
+
+;; Mapping suffixes for broadcast
+(define_mode_attr bcstscalarsuff
+  [(V16SI "d") (V16SF "ss") (V8DI "q") (V8DF "sd")])
+
+;; Include define_subst patterns for instructions with mask
+(include "subst.md")
+
+;; Patterns whose name begins with "sse{,2,3}_" are invoked by intrinsics.
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Move patterns
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; All of these patterns are enabled for SSE1 as well as SSE2.
+;; This is essential for maintaining stable calling conventions.
+
+(define_expand "mov<mode>"
+  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
+	(match_operand:VMOVE 1 "nonimmediate_operand"))]
+  "TARGET_SSE"
+{
+  ix86_expand_vector_move (<MODE>mode, operands);
+  DONE;
+})
+
+(define_insn "*mov<mode>_internal"
+  [(set (match_operand:VMOVE 0 "nonimmediate_operand"               "=v,v ,m")
+	(match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"  "C ,vm,v"))]
+  "TARGET_SSE
+   && (register_operand (operands[0], <MODE>mode)
+       || register_operand (operands[1], <MODE>mode))"
+{
+  int mode = get_attr_mode (insn);
+  switch (which_alternative)
+    {
+    case 0:
+      return standard_sse_constant_opcode (insn, operands[1]);
+    case 1:
+    case 2:
+      /* There is no evex-encoded vmov* for sizes smaller than 64-bytes
+	 in avx512f, so we need to use workarounds, to access sse registers
+	 16-31, which are evex-only.  */
+      if (TARGET_AVX512F && <MODE_SIZE> < 64
+	  && ((REG_P (operands[0])
+	       && EXT_REX_SSE_REGNO_P (REGNO (operands[0])))
+	      || (REG_P (operands[1])
+		  && EXT_REX_SSE_REGNO_P (REGNO (operands[1])))))
+	{
+	  if (memory_operand (operands[0], <MODE>mode))
+	    {
+	      if (<MODE_SIZE> == 32)
+		return "vextract<shuffletype>64x4\t{$0x0, %g1, %0|%0, %g1, 0x0}";
+	      else if (<MODE_SIZE> == 16)
+		return "vextract<shuffletype>32x4\t{$0x0, %g1, %0|%0, %g1, 0x0}";
+	      else
+		gcc_unreachable ();
+	    }
+	  else if (memory_operand (operands[1], <MODE>mode))
+	    {
+	      if (<MODE_SIZE> == 32)
+		return "vbroadcast<shuffletype>64x4\t{%1, %g0|%g0, %1}";
+	      else if (<MODE_SIZE> == 16)
+		return "vbroadcast<shuffletype>32x4\t{%1, %g0|%g0, %1}";
+	      else
+		gcc_unreachable ();
+	    }
+	  else
+	    /* Reg -> reg move is always aligned.  Just use wider move.  */
+	    switch (mode)
+	      {
+	      case MODE_V8SF:
+	      case MODE_V4SF:
+		return "vmovaps\t{%g1, %g0|%g0, %g1}";
+	      case MODE_V4DF:
+	      case MODE_V2DF:
+		return "vmovapd\t{%g1, %g0|%g0, %g1}";
+	      case MODE_OI:
+	      case MODE_TI:
+		return "vmovdqa64\t{%g1, %g0|%g0, %g1}";
+	      default:
+		gcc_unreachable ();
+	      }
+	}
+      switch (mode)
+	{
+	case MODE_V16SF:
+	case MODE_V8SF:
+	case MODE_V4SF:
+	  if (TARGET_AVX
+	      && (misaligned_operand (operands[0], <MODE>mode)
+		  || misaligned_operand (operands[1], <MODE>mode)))
+	    return "vmovups\t{%1, %0|%0, %1}";
+	  else
+	    return "%vmovaps\t{%1, %0|%0, %1}";
+
+	case MODE_V8DF:
+	case MODE_V4DF:
+	case MODE_V2DF:
+	  if (TARGET_AVX
+	      && (misaligned_operand (operands[0], <MODE>mode)
+		  || misaligned_operand (operands[1], <MODE>mode)))
+	    return "vmovupd\t{%1, %0|%0, %1}";
+	  else
+	    return "%vmovapd\t{%1, %0|%0, %1}";
+
+	case MODE_OI:
+	case MODE_TI:
+	  if (TARGET_AVX
+	      && (misaligned_operand (operands[0], <MODE>mode)
+		  || misaligned_operand (operands[1], <MODE>mode)))
+	    return "vmovdqu\t{%1, %0|%0, %1}";
+	  else
+	    return "%vmovdqa\t{%1, %0|%0, %1}";
+	case MODE_XI:
+	  if (misaligned_operand (operands[0], <MODE>mode)
+	      || misaligned_operand (operands[1], <MODE>mode))
+	    return "vmovdqu64\t{%1, %0|%0, %1}";
+	  else
+	    return "vmovdqa64\t{%1, %0|%0, %1}";
+
+	default:
+	  gcc_unreachable ();
+	}
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sselog1,ssemov,ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "<ssePSmode>")
+	       (and (match_test "<MODE_SIZE> == 16")
+		    (and (eq_attr "alternative" "2")
+			 (match_test "TARGET_SSE_TYPELESS_STORES")))
+		 (const_string "<ssePSmode>")
+	       (match_test "TARGET_AVX")
+		 (const_string "<sseinsnmode>")
+	       (ior (not (match_test "TARGET_SSE2"))
+		    (match_test "optimize_function_for_size_p (cfun)"))
+		 (const_string "V4SF")
+	       (and (eq_attr "alternative" "0")
+		    (match_test "TARGET_SSE_LOAD0_BY_PXOR"))
+		 (const_string "TI")
+	      ]
+	      (const_string "<sseinsnmode>")))])
+
+(define_insn "avx512f_load<mode>_mask"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v,v")
+	(vec_merge:VI48F_512
+	  (match_operand:VI48F_512 1 "nonimmediate_operand" "v,m")
+	  (match_operand:VI48F_512 2 "vector_move_operand" "0C,0C")
+	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512F"
+{
+  switch (MODE_<sseinsnmode>)
+    {
+    case MODE_V8DF:
+    case MODE_V16SF:
+      if (misaligned_operand (operands[1], <MODE>mode))
+	return "vmovu<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
+      return "vmova<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
+    default:
+      if (misaligned_operand (operands[1], <MODE>mode))
+	return "vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
+      return "vmovdqa<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "none,load")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_blendm<mode>"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v")
+	(vec_merge:VI48F_512
+	  (match_operand:VI48F_512 2 "nonimmediate_operand" "vm")
+	  (match_operand:VI48F_512 1 "register_operand" "v")
+	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>blendm<ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_store<mode>_mask"
+  [(set (match_operand:VI48F_512 0 "memory_operand" "=m")
+	(vec_merge:VI48F_512
+	  (match_operand:VI48F_512 1 "register_operand" "v")
+	  (match_dup 0)
+	  (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+{
+  switch (MODE_<sseinsnmode>)
+    {
+    case MODE_V8DF:
+    case MODE_V16SF:
+      return "vmova<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+    default:
+      return "vmovdqa<ssescalarsize>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "store")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "sse2_movq128"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(vec_concat:V2DI
+	  (vec_select:DI
+	    (match_operand:V2DI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0)]))
+	  (const_int 0)))]
+  "TARGET_SSE2"
+  "%vmovq\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+;; Move a DI from a 32-bit register pair (e.g. %edx:%eax) to an xmm.
+;; We'd rather avoid this entirely; if the 32-bit reg pair was loaded
+;; from memory, we'd prefer to load the memory directly into the %xmm
+;; register.  To facilitate this happy circumstance, this pattern won't
+;; split until after register allocation.  If the 64-bit value didn't
+;; come from memory, this is the best we can do.  This is much better
+;; than storing %edx:%eax into a stack temporary and loading an %xmm
+;; from there.
+
+(define_insn_and_split "movdi_to_sse"
+  [(parallel
+    [(set (match_operand:V4SI 0 "register_operand" "=?x,x")
+	  (subreg:V4SI (match_operand:DI 1 "nonimmediate_operand" "r,m") 0))
+     (clobber (match_scratch:V4SI 2 "=&x,X"))])]
+  "!TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+ if (register_operand (operands[1], DImode))
+   {
+      /* The DImode arrived in a pair of integral registers (e.g. %edx:%eax).
+	 Assemble the 64-bit DImode value in an xmm register.  */
+      emit_insn (gen_sse2_loadld (operands[0], CONST0_RTX (V4SImode),
+				  gen_rtx_SUBREG (SImode, operands[1], 0)));
+      emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode),
+				  gen_rtx_SUBREG (SImode, operands[1], 4)));
+      emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0],
+					     operands[2]));
+   }
+ else if (memory_operand (operands[1], DImode))
+   {
+     rtx tmp = gen_reg_rtx (V2DImode);
+     emit_insn (gen_vec_concatv2di (tmp, operands[1], const0_rtx));
+     emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp));
+   }
+ else
+   gcc_unreachable ();
+})
+
+(define_split
+  [(set (match_operand:V4SF 0 "register_operand")
+	(match_operand:V4SF 1 "zero_extended_scalar_load_operand"))]
+  "TARGET_SSE && reload_completed"
+  [(set (match_dup 0)
+	(vec_merge:V4SF
+	  (vec_duplicate:V4SF (match_dup 1))
+	  (match_dup 2)
+	  (const_int 1)))]
+{
+  operands[1] = simplify_gen_subreg (SFmode, operands[1], V4SFmode, 0);
+  operands[2] = CONST0_RTX (V4SFmode);
+})
+
+(define_split
+  [(set (match_operand:V2DF 0 "register_operand")
+	(match_operand:V2DF 1 "zero_extended_scalar_load_operand"))]
+  "TARGET_SSE2 && reload_completed"
+  [(set (match_dup 0) (vec_concat:V2DF (match_dup 1) (match_dup 2)))]
+{
+  operands[1] = simplify_gen_subreg (DFmode, operands[1], V2DFmode, 0);
+  operands[2] = CONST0_RTX (DFmode);
+})
+
+(define_expand "movmisalign<mode>"
+  [(set (match_operand:VMOVE 0 "nonimmediate_operand")
+	(match_operand:VMOVE 1 "nonimmediate_operand"))]
+  "TARGET_SSE"
+{
+  ix86_expand_vector_move_misalign (<MODE>mode, operands);
+  DONE;
+})
+
+(define_expand "<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>"
+  [(set (match_operand:VF 0 "register_operand")
+	(unspec:VF [(match_operand:VF 1 "nonimmediate_operand")]
+	  UNSPEC_LOADU))]
+  "TARGET_SSE && <mask_mode512bit_condition>"
+{
+  /* For AVX, normal *mov<mode>_internal pattern will handle unaligned loads
+     just fine if misaligned_operand is true, and without the UNSPEC it can
+     be combined with arithmetic instructions.  If misaligned_operand is
+     false, still emit UNSPEC_LOADU insn to honor user's request for
+     misaligned load.  */
+  if (TARGET_AVX
+      && misaligned_operand (operands[1], <MODE>mode))
+    {
+      rtx src = operands[1];
+      if (<mask_applied>)
+	src = gen_rtx_VEC_MERGE (<MODE>mode, operands[1],
+				 operands[2 * <mask_applied>],
+				 operands[3 * <mask_applied>]);
+      emit_insn (gen_rtx_SET (VOIDmode, operands[0], src));
+      DONE;
+    }
+})
+
+(define_insn "*<sse>_loadu<ssemodesuffix><avxsizesuffix><mask_name>"
+  [(set (match_operand:VF 0 "register_operand" "=v")
+	(unspec:VF
+	  [(match_operand:VF 1 "nonimmediate_operand" "vm")]
+	  UNSPEC_LOADU))]
+  "TARGET_SSE && <mask_mode512bit_condition>"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V16SF:
+    case MODE_V8SF:
+    case MODE_V4SF:
+      return "%vmovups\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}";
+    default:
+      return "%vmovu<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "prefix" "maybe_vex")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "<ssePSmode>")
+	       (match_test "TARGET_AVX")
+		 (const_string "<MODE>")
+	       (match_test "optimize_function_for_size_p (cfun)")
+		 (const_string "V4SF")
+	      ]
+	      (const_string "<MODE>")))])
+
+(define_insn "<sse>_storeu<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF 0 "memory_operand" "=m")
+	(unspec:VF
+	  [(match_operand:VF 1 "register_operand" "v")]
+	  UNSPEC_STOREU))]
+  "TARGET_SSE"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V16SF:
+    case MODE_V8SF:
+    case MODE_V4SF:
+      return "%vmovups\t{%1, %0|%0, %1}";
+    default:
+      return "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "prefix" "maybe_vex")
+   (set (attr "mode")
+	(cond [(and (match_test "<MODE_SIZE> == 16")
+                    (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                         (match_test "TARGET_SSE_TYPELESS_STORES")))
+		 (const_string "<ssePSmode>")
+	       (match_test "TARGET_AVX")
+		 (const_string "<MODE>")
+	       (match_test "optimize_function_for_size_p (cfun)")
+		 (const_string "V4SF")
+	      ]
+	      (const_string "<MODE>")))])
+
+(define_insn "avx512f_storeu<ssemodesuffix>512_mask"
+  [(set (match_operand:VF_512 0 "memory_operand" "=m")
+	(vec_merge:VF_512
+	  (unspec:VF_512
+	    [(match_operand:VF_512 1 "register_operand" "v")]
+	    UNSPEC_STOREU)
+	  (match_dup 0)
+	  (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V16SF:
+      return "vmovups\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+    default:
+      return "vmovu<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "memory" "store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "<sse2_avx_avx512f>_loaddqu<mode><mask_name>"
+  [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "register_operand")
+	(unspec:VI_UNALIGNED_LOADSTORE
+	  [(match_operand:VI_UNALIGNED_LOADSTORE 1 "nonimmediate_operand")]
+	  UNSPEC_LOADU))]
+  "TARGET_SSE2 && <mask_mode512bit_condition>"
+{
+  /* For AVX, normal *mov<mode>_internal pattern will handle unaligned loads
+     just fine if misaligned_operand is true, and without the UNSPEC it can
+     be combined with arithmetic instructions.  If misaligned_operand is
+     false, still emit UNSPEC_LOADU insn to honor user's request for
+     misaligned load.  */
+  if (TARGET_AVX
+      && misaligned_operand (operands[1], <MODE>mode))
+    {
+      rtx src = operands[1];
+      if (<mask_applied>)
+	src = gen_rtx_VEC_MERGE (<MODE>mode, operands[1],
+				 operands[2 * <mask_applied>],
+				 operands[3 * <mask_applied>]);
+      emit_insn (gen_rtx_SET (VOIDmode, operands[0], src));
+      DONE;
+    }
+})
+
+(define_insn "*<sse2_avx_avx512f>_loaddqu<mode><mask_name>"
+  [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "register_operand" "=v")
+	(unspec:VI_UNALIGNED_LOADSTORE
+	  [(match_operand:VI_UNALIGNED_LOADSTORE 1 "nonimmediate_operand" "vm")]
+	  UNSPEC_LOADU))]
+  "TARGET_SSE2 && <mask_mode512bit_condition>"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V8SF:
+    case MODE_V4SF:
+      return "%vmovups\t{%1, %0|%0, %1}";
+    case MODE_XI:
+      if (<MODE>mode == V8DImode)
+	return "vmovdqu64\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}";
+      else
+	return "vmovdqu32\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}";
+    default:
+      return "%vmovdqu\t{%1, %0|%0, %1}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "ssememalign" "8")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "<ssePSmode>")
+	       (match_test "TARGET_AVX")
+		 (const_string "<sseinsnmode>")
+	       (match_test "optimize_function_for_size_p (cfun)")
+	         (const_string "V4SF")
+	      ]
+	      (const_string "<sseinsnmode>")))])
+
+(define_insn "<sse2_avx_avx512f>_storedqu<mode>"
+  [(set (match_operand:VI_UNALIGNED_LOADSTORE 0 "memory_operand" "=m")
+	(unspec:VI_UNALIGNED_LOADSTORE
+	  [(match_operand:VI_UNALIGNED_LOADSTORE 1 "register_operand" "v")]
+	  UNSPEC_STOREU))]
+  "TARGET_SSE2"
+{
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V16SF:
+    case MODE_V8SF:
+    case MODE_V4SF:
+      return "%vmovups\t{%1, %0|%0, %1}";
+    case MODE_XI:
+      if (<MODE>mode == V8DImode)
+	return "vmovdqu64\t{%1, %0|%0, %1}";
+      else
+	return "vmovdqu32\t{%1, %0|%0, %1}";
+    default:
+      return "%vmovdqu\t{%1, %0|%0, %1}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "ssememalign" "8")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set (attr "mode")
+	(cond [(and (match_test "<MODE_SIZE> == 16")
+		    (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+			 (match_test "TARGET_SSE_TYPELESS_STORES")))
+		 (const_string "<ssePSmode>")
+	       (match_test "TARGET_AVX")
+		 (const_string "<sseinsnmode>")
+	       (match_test "optimize_function_for_size_p (cfun)")
+	         (const_string "V4SF")
+	      ]
+	      (const_string "<sseinsnmode>")))])
+
+(define_insn "avx512f_storedqu<mode>_mask"
+  [(set (match_operand:VI48_512 0 "memory_operand" "=m")
+	(vec_merge:VI48_512
+	  (unspec:VI48_512
+	    [(match_operand:VI48_512 1 "register_operand" "v")]
+	    UNSPEC_STOREU)
+	  (match_dup 0)
+	  (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+{
+  if (<MODE>mode == V8DImode)
+    return "vmovdqu64\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+  else
+    return "vmovdqu32\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "memory" "store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<sse3>_lddqu<avxsizesuffix>"
+  [(set (match_operand:VI1 0 "register_operand" "=x")
+	(unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
+		    UNSPEC_LDDQU))]
+  "TARGET_SSE3"
+  "%vlddqu\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "ssememalign" "8")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "0")))
+   (set (attr "prefix_rep")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "sse2_movnti<mode>"
+  [(set (match_operand:SWI48 0 "memory_operand" "=m")
+	(unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r")]
+		      UNSPEC_MOVNT))]
+  "TARGET_SSE2"
+  "movnti\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_data16" "0")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse>_movnt<mode>"
+  [(set (match_operand:VF 0 "memory_operand" "=m")
+	(unspec:VF
+	  [(match_operand:VF 1 "register_operand" "v")]
+	  UNSPEC_MOVNT))]
+  "TARGET_SSE"
+  "%vmovnt<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse2>_movnt<mode>"
+  [(set (match_operand:VI8 0 "memory_operand" "=m")
+	(unspec:VI8 [(match_operand:VI8 1 "register_operand" "v")]
+		    UNSPEC_MOVNT))]
+  "TARGET_SSE2"
+  "%vmovntdq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+; Expand patterns for non-temporal stores.  At the moment, only those
+; that directly map to insns are defined; it would be possible to
+; define patterns for other modes that would expand to several insns.
+
+;; Modes handled by storent patterns.
+(define_mode_iterator STORENT_MODE
+  [(DI "TARGET_SSE2 && TARGET_64BIT") (SI "TARGET_SSE2")
+   (SF "TARGET_SSE4A") (DF "TARGET_SSE4A")
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2")
+   (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
+
+(define_expand "storent<mode>"
+  [(set (match_operand:STORENT_MODE 0 "memory_operand")
+	(unspec:STORENT_MODE
+	  [(match_operand:STORENT_MODE 1 "register_operand")]
+	  UNSPEC_MOVNT))]
+  "TARGET_SSE")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel floating point arithmetic
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "<code><mode>2"
+  [(set (match_operand:VF 0 "register_operand")
+	(absneg:VF
+	  (match_operand:VF 1 "register_operand")))]
+  "TARGET_SSE"
+  "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;")
+
+(define_insn_and_split "*absneg<mode>2"
+  [(set (match_operand:VF 0 "register_operand" "=x,x,v,v")
+	(match_operator:VF 3 "absneg_operator"
+	  [(match_operand:VF 1 "nonimmediate_operand" "0, xm, v, m")]))
+   (use (match_operand:VF 2 "nonimmediate_operand"    "xm, 0, vm,v"))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  enum rtx_code absneg_op;
+  rtx op1, op2;
+  rtx t;
+
+  if (TARGET_AVX)
+    {
+      if (MEM_P (operands[1]))
+	op1 = operands[2], op2 = operands[1];
+      else
+	op1 = operands[1], op2 = operands[2];
+    }
+  else
+    {
+      op1 = operands[0];
+      if (rtx_equal_p (operands[0], operands[1]))
+	op2 = operands[2];
+      else
+	op2 = operands[1];
+    }
+
+  absneg_op = GET_CODE (operands[3]) == NEG ? XOR : AND;
+  t = gen_rtx_fmt_ee (absneg_op, <MODE>mode, op1, op2);
+  t = gen_rtx_SET (VOIDmode, operands[0], t);
+  emit_insn (t);
+  DONE;
+}
+  [(set_attr "isa" "noavx,noavx,avx,avx")])
+
+(define_expand "<plusminus_insn><mode>3<mask_name><round_name>"
+  [(set (match_operand:VF 0 "register_operand")
+	(plusminus:VF
+	  (match_operand:VF 1 "<round_nimm_predicate>")
+	  (match_operand:VF 2 "<round_nimm_predicate>")))]
+  "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+(define_insn "*<plusminus_insn><mode>3<mask_name><round_name>"
+  [(set (match_operand:VF 0 "register_operand" "=x,v")
+	(plusminus:VF
+	  (match_operand:VF 1 "<round_nimm_predicate>" "<comm>0,v")
+	  (match_operand:VF 2 "<round_nimm_predicate>" "xm,<round_constraint>")))]
+  "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && <mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "@
+   <plusminus_mnemonic><ssemodesuffix>\t{%2, %0|%0, %2}
+   v<plusminus_mnemonic><ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "prefix" "<mask_prefix3>")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse>_vm<plusminus_insn><mode>3<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,v")
+	(vec_merge:VF_128
+	  (plusminus:VF_128
+	    (match_operand:VF_128 1 "register_operand" "0,v")
+	    (match_operand:VF_128 2 "nonimmediate_operand" "xm,<round_constraint>"))
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   <plusminus_mnemonic><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
+   v<plusminus_mnemonic><ssescalarmodesuffix>\t{<round_op3>%2, %1, %0|%0, %1, %<iptr>2<round_op3>}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "prefix" "<round_prefix>")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_expand "mul<mode>3<mask_name><round_name>"
+  [(set (match_operand:VF 0 "register_operand")
+	(mult:VF
+	  (match_operand:VF 1 "<round_nimm_predicate>")
+	  (match_operand:VF 2 "<round_nimm_predicate>")))]
+  "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
+
+(define_insn "*mul<mode>3<mask_name><round_name>"
+  [(set (match_operand:VF 0 "register_operand" "=x,v")
+	(mult:VF
+	  (match_operand:VF 1 "<round_nimm_predicate>" "%0,v")
+	  (match_operand:VF 2 "<round_nimm_predicate>" "xm,<round_constraint>")))]
+  "TARGET_SSE && ix86_binary_operator_ok (MULT, <MODE>mode, operands) && <mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "@
+   mul<ssemodesuffix>\t{%2, %0|%0, %2}
+   vmul<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemul")
+   (set_attr "prefix" "<mask_prefix3>")
+   (set_attr "btver2_decode" "direct,double")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse>_vm<multdiv_mnemonic><mode>3<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,v")
+	(vec_merge:VF_128
+	  (multdiv:VF_128
+	    (match_operand:VF_128 1 "register_operand" "0,v")
+	    (match_operand:VF_128 2 "nonimmediate_operand" "xm,<round_constraint>"))
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   <multdiv_mnemonic><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
+   v<multdiv_mnemonic><ssescalarmodesuffix>\t{<round_op3>%2, %1, %0|%0, %1, %<iptr>2<round_op3>}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sse<multdiv_mnemonic>")
+   (set_attr "prefix" "<round_prefix>")
+   (set_attr "btver2_decode" "direct,double")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_expand "div<mode>3"
+  [(set (match_operand:VF2 0 "register_operand")
+	(div:VF2 (match_operand:VF2 1 "register_operand")
+		 (match_operand:VF2 2 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+  "ix86_fixup_binary_operands_no_copy (DIV, <MODE>mode, operands);")
+
+(define_expand "div<mode>3"
+  [(set (match_operand:VF1 0 "register_operand")
+	(div:VF1 (match_operand:VF1 1 "register_operand")
+		 (match_operand:VF1 2 "nonimmediate_operand")))]
+  "TARGET_SSE"
+{
+  ix86_fixup_binary_operands_no_copy (DIV, <MODE>mode, operands);
+
+  if (TARGET_SSE_MATH
+      && TARGET_RECIP_VEC_DIV
+      && !optimize_insn_for_size_p ()
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      ix86_emit_swdivsf (operands[0], operands[1], operands[2], <MODE>mode);
+      DONE;
+    }
+})
+
+(define_insn "<sse>_div<mode>3<mask_name><round_name>"
+  [(set (match_operand:VF 0 "register_operand" "=x,v")
+	(div:VF
+	  (match_operand:VF 1 "register_operand" "0,v")
+	  (match_operand:VF 2 "<round_nimm_predicate>" "xm,<round_constraint>")))]
+  "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "@
+   div<ssemodesuffix>\t{%2, %0|%0, %2}
+   vdiv<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssediv")
+   (set_attr "prefix" "<mask_prefix3>")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse>_rcp<mode>2"
+  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
+	(unspec:VF1_128_256
+	  [(match_operand:VF1_128_256 1 "nonimmediate_operand" "xm")] UNSPEC_RCP))]
+  "TARGET_SSE"
+  "%vrcpps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "atom_sse_attr" "rcp")
+   (set_attr "btver2_sse_attr" "rcp")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "sse_vmrcpv4sf2"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(vec_merge:V4SF
+	  (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm,xm")]
+		       UNSPEC_RCP)
+	  (match_operand:V4SF 2 "register_operand" "0,x")
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   rcpss\t{%1, %0|%0, %k1}
+   vrcpss\t{%1, %2, %0|%0, %2, %k1}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sse")
+   (set_attr "ssememalign" "32")
+   (set_attr "atom_sse_attr" "rcp")
+   (set_attr "btver2_sse_attr" "rcp")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "SF")])
+
+(define_insn "<mask_codefor>rcp14<mode><mask_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(unspec:VF_512
+	  [(match_operand:VF_512 1 "nonimmediate_operand" "vm")]
+	  UNSPEC_RCP14))]
+  "TARGET_AVX512F"
+  "vrcp14<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "srcp14<mode>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "nonimmediate_operand" "vm")]
+	    UNSPEC_RCP14)
+	  (match_operand:VF_128 2 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "vrcp14<ssescalarmodesuffix>\t{%1, %2, %0|%0, %2, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VF2 0 "register_operand")
+	(sqrt:VF2 (match_operand:VF2 1 "nonimmediate_operand")))]
+  "TARGET_SSE2")
+
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VF1 0 "register_operand")
+	(sqrt:VF1 (match_operand:VF1 1 "nonimmediate_operand")))]
+  "TARGET_SSE"
+{
+  if (TARGET_SSE_MATH
+      && TARGET_RECIP_VEC_SQRT
+      && !optimize_insn_for_size_p ()
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      ix86_emit_swsqrtsf (operands[0], operands[1], <MODE>mode, false);
+      DONE;
+    }
+})
+
+(define_insn "<sse>_sqrt<mode>2<mask_name><round_name>"
+  [(set (match_operand:VF 0 "register_operand" "=v")
+	(sqrt:VF (match_operand:VF 1 "<round_nimm_predicate>" "<round_constraint>")))]
+  "TARGET_SSE && <mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "%vsqrt<ssemodesuffix>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  [(set_attr "type" "sse")
+   (set_attr "atom_sse_attr" "sqrt")
+   (set_attr "btver2_sse_attr" "sqrt")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse>_vmsqrt<mode>2<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,v")
+	(vec_merge:VF_128
+	  (sqrt:VF_128
+	    (match_operand:VF_128 1 "nonimmediate_operand" "xm,<round_constraint>"))
+	  (match_operand:VF_128 2 "register_operand" "0,v")
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   sqrt<ssescalarmodesuffix>\t{%1, %0|%0, %<iptr>1}
+   vsqrt<ssescalarmodesuffix>\t{<round_op3>%1, %2, %0|%0, %2, %<iptr>1<round_op3>}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sse")
+   (set_attr "atom_sse_attr" "sqrt")
+   (set_attr "prefix" "<round_prefix>")
+   (set_attr "btver2_sse_attr" "sqrt")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:VF1_128_256 0 "register_operand")
+	(unspec:VF1_128_256
+	  [(match_operand:VF1_128_256 1 "nonimmediate_operand")] UNSPEC_RSQRT))]
+  "TARGET_SSE_MATH"
+{
+  ix86_emit_swsqrtsf (operands[0], operands[1], <MODE>mode, true);
+  DONE;
+})
+
+(define_insn "<sse>_rsqrt<mode>2"
+  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
+	(unspec:VF1_128_256
+	  [(match_operand:VF1_128_256 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))]
+  "TARGET_SSE"
+  "%vrsqrtps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<mask_codefor>rsqrt14<mode><mask_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(unspec:VF_512
+	  [(match_operand:VF_512 1 "nonimmediate_operand" "vm")]
+	  UNSPEC_RSQRT14))]
+  "TARGET_AVX512F"
+  "vrsqrt14<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "rsqrt14<mode>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "nonimmediate_operand" "vm")]
+	    UNSPEC_RSQRT14)
+	  (match_operand:VF_128 2 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "vrsqrt14<ssescalarmodesuffix>\t{%1, %2, %0|%0, %2, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "sse_vmrsqrtv4sf2"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(vec_merge:V4SF
+	  (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm,xm")]
+		       UNSPEC_RSQRT)
+	  (match_operand:V4SF 2 "register_operand" "0,x")
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   rsqrtss\t{%1, %0|%0, %k1}
+   vrsqrtss\t{%1, %2, %0|%0, %2, %k1}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sse")
+   (set_attr "ssememalign" "32")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "SF")])
+
+;; ??? For !flag_finite_math_only, the representation with SMIN/SMAX
+;; isn't really correct, as those rtl operators aren't defined when
+;; applied to NaNs.  Hopefully the optimizers won't get too smart on us.
+
+(define_expand "<code><mode>3<mask_name><round_saeonly_name>"
+  [(set (match_operand:VF 0 "register_operand")
+	(smaxmin:VF
+	  (match_operand:VF 1 "<round_saeonly_nimm_predicate>")
+	  (match_operand:VF 2 "<round_saeonly_nimm_predicate>")))]
+  "TARGET_SSE && <mask_mode512bit_condition> && <round_saeonly_mode512bit_condition>"
+{
+  if (!flag_finite_math_only)
+    operands[1] = force_reg (<MODE>mode, operands[1]);
+  ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
+})
+
+(define_insn "*<code><mode>3_finite<mask_name><round_saeonly_name>"
+  [(set (match_operand:VF 0 "register_operand" "=x,v")
+	(smaxmin:VF
+	  (match_operand:VF 1 "<round_saeonly_nimm_predicate>" "%0,v")
+	  (match_operand:VF 2 "<round_saeonly_nimm_predicate>" "xm,<round_saeonly_constraint>")))]
+  "TARGET_SSE && flag_finite_math_only
+   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
+   && <mask_mode512bit_condition> && <round_saeonly_mode512bit_condition>"
+  "@
+   <maxmin_float><ssemodesuffix>\t{%2, %0|%0, %2}
+   v<maxmin_float><ssemodesuffix>\t{<round_saeonly_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_saeonly_mask_op3>}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "btver2_sse_attr" "maxmin")
+   (set_attr "prefix" "<mask_prefix3>")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*<code><mode>3<mask_name><round_saeonly_name>"
+  [(set (match_operand:VF 0 "register_operand" "=x,v")
+	(smaxmin:VF
+	  (match_operand:VF 1 "register_operand" "0,v")
+	  (match_operand:VF 2 "<round_saeonly_nimm_predicate>" "xm,<round_saeonly_constraint>")))]
+  "TARGET_SSE && !flag_finite_math_only
+   && <mask_mode512bit_condition> && <round_saeonly_mode512bit_condition>"
+  "@
+   <maxmin_float><ssemodesuffix>\t{%2, %0|%0, %2}
+   v<maxmin_float><ssemodesuffix>\t{<round_saeonly_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_saeonly_mask_op3>}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "btver2_sse_attr" "maxmin")
+   (set_attr "prefix" "<mask_prefix3>")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse>_vm<code><mode>3<round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,v")
+	(vec_merge:VF_128
+	  (smaxmin:VF_128
+	    (match_operand:VF_128 1 "register_operand" "0,v")
+	    (match_operand:VF_128 2 "nonimmediate_operand" "xm,<round_saeonly_constraint>"))
+	 (match_dup 1)
+	 (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   <maxmin_float><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
+   v<maxmin_float><ssescalarmodesuffix>\t{<round_saeonly_op3>%2, %1, %0|%0, %1, %<iptr>2<round_saeonly_op3>}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sse")
+   (set_attr "btver2_sse_attr" "maxmin")
+   (set_attr "prefix" "<round_saeonly_prefix>")
+   (set_attr "mode" "<ssescalarmode>")])
+
+;; These versions of the min/max patterns implement exactly the operations
+;;   min = (op1 < op2 ? op1 : op2)
+;;   max = (!(op1 < op2) ? op1 : op2)
+;; Their operands are not commutative, and thus they may be used in the
+;; presence of -0.0 and NaN.
+
+(define_insn "*ieee_smin<mode>3"
+  [(set (match_operand:VF 0 "register_operand" "=v,v")
+	(unspec:VF
+	  [(match_operand:VF 1 "register_operand" "0,v")
+	   (match_operand:VF 2 "nonimmediate_operand" "vm,vm")]
+	 UNSPEC_IEEE_MIN))]
+  "TARGET_SSE"
+  "@
+   min<ssemodesuffix>\t{%2, %0|%0, %2}
+   vmin<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*ieee_smax<mode>3"
+  [(set (match_operand:VF 0 "register_operand" "=v,v")
+	(unspec:VF
+	  [(match_operand:VF 1 "register_operand" "0,v")
+	   (match_operand:VF 2 "nonimmediate_operand" "vm,vm")]
+	 UNSPEC_IEEE_MAX))]
+  "TARGET_SSE"
+  "@
+   max<ssemodesuffix>\t{%2, %0|%0, %2}
+   vmax<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx_addsubv4df3"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+	(vec_merge:V4DF
+	  (plus:V4DF
+	    (match_operand:V4DF 1 "register_operand" "x")
+	    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+	  (minus:V4DF (match_dup 1) (match_dup 2))
+	  (const_int 10)))]
+  "TARGET_AVX"
+  "vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
+(define_insn "sse3_addsubv2df3"
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+	(vec_merge:V2DF
+	  (plus:V2DF
+	    (match_operand:V2DF 1 "register_operand" "0,x")
+	    (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm"))
+	  (minus:V2DF (match_dup 1) (match_dup 2))
+	  (const_int 2)))]
+  "TARGET_SSE3"
+  "@
+   addsubpd\t{%2, %0|%0, %2}
+   vaddsubpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "atom_unit" "complex")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "avx_addsubv8sf3"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_merge:V8SF
+	  (plus:V8SF
+	    (match_operand:V8SF 1 "register_operand" "x")
+	    (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
+	  (minus:V8SF (match_dup 1) (match_dup 2))
+	  (const_int 170)))]
+  "TARGET_AVX"
+  "vaddsubps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "sse3_addsubv4sf3"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(vec_merge:V4SF
+	  (plus:V4SF
+	    (match_operand:V4SF 1 "register_operand" "0,x")
+	    (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
+	  (minus:V4SF (match_dup 1) (match_dup 2))
+	  (const_int 10)))]
+  "TARGET_SSE3"
+  "@
+   addsubps\t{%2, %0|%0, %2}
+   vaddsubps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "prefix_rep" "1,*")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "avx_h<plusminus_insn>v4df3"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+	(vec_concat:V4DF
+	  (vec_concat:V2DF
+	    (plusminus:DF
+	      (vec_select:DF
+		(match_operand:V4DF 1 "register_operand" "x")
+		(parallel [(const_int 0)]))
+	      (vec_select:DF (match_dup 1) (parallel [(const_int 1)])))
+	    (plusminus:DF
+	      (vec_select:DF
+		(match_operand:V4DF 2 "nonimmediate_operand" "xm")
+		(parallel [(const_int 0)]))
+	      (vec_select:DF (match_dup 2) (parallel [(const_int 1)]))))
+	  (vec_concat:V2DF
+	    (plusminus:DF
+	      (vec_select:DF (match_dup 1) (parallel [(const_int 2)]))
+	      (vec_select:DF (match_dup 1) (parallel [(const_int 3)])))
+	    (plusminus:DF
+	      (vec_select:DF (match_dup 2) (parallel [(const_int 2)]))
+	      (vec_select:DF (match_dup 2) (parallel [(const_int 3)]))))))]
+  "TARGET_AVX"
+  "vh<plusminus_mnemonic>pd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
+(define_expand "sse3_haddv2df3"
+  [(set (match_operand:V2DF 0 "register_operand")
+	(vec_concat:V2DF
+	  (plus:DF
+	    (vec_select:DF
+	      (match_operand:V2DF 1 "register_operand")
+	      (parallel [(const_int 0)]))
+	    (vec_select:DF (match_dup 1) (parallel [(const_int 1)])))
+	  (plus:DF
+	    (vec_select:DF
+	      (match_operand:V2DF 2 "nonimmediate_operand")
+	      (parallel [(const_int 0)]))
+	    (vec_select:DF (match_dup 2) (parallel [(const_int 1)])))))]
+  "TARGET_SSE3")
+
+(define_insn "*sse3_haddv2df3"
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+	(vec_concat:V2DF
+	  (plus:DF
+	    (vec_select:DF
+	      (match_operand:V2DF 1 "register_operand" "0,x")
+	      (parallel [(match_operand:SI 3 "const_0_to_1_operand")]))
+	    (vec_select:DF
+	      (match_dup 1)
+	      (parallel [(match_operand:SI 4 "const_0_to_1_operand")])))
+	  (plus:DF
+	    (vec_select:DF
+	      (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm")
+	      (parallel [(match_operand:SI 5 "const_0_to_1_operand")]))
+	    (vec_select:DF
+	      (match_dup 2)
+	      (parallel [(match_operand:SI 6 "const_0_to_1_operand")])))))]
+  "TARGET_SSE3
+   && INTVAL (operands[3]) != INTVAL (operands[4])
+   && INTVAL (operands[5]) != INTVAL (operands[6])"
+  "@
+   haddpd\t{%2, %0|%0, %2}
+   vhaddpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse3_hsubv2df3"
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+	(vec_concat:V2DF
+	  (minus:DF
+	    (vec_select:DF
+	      (match_operand:V2DF 1 "register_operand" "0,x")
+	      (parallel [(const_int 0)]))
+	    (vec_select:DF (match_dup 1) (parallel [(const_int 1)])))
+	  (minus:DF
+	    (vec_select:DF
+	      (match_operand:V2DF 2 "nonimmediate_operand" "xm,xm")
+	      (parallel [(const_int 0)]))
+	    (vec_select:DF (match_dup 2) (parallel [(const_int 1)])))))]
+  "TARGET_SSE3"
+  "@
+   hsubpd\t{%2, %0|%0, %2}
+   vhsubpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "*sse3_haddv2df3_low"
+  [(set (match_operand:DF 0 "register_operand" "=x,x")
+	(plus:DF
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "register_operand" "0,x")
+	    (parallel [(match_operand:SI 2 "const_0_to_1_operand")]))
+	  (vec_select:DF
+	    (match_dup 1)
+	    (parallel [(match_operand:SI 3 "const_0_to_1_operand")]))))]
+  "TARGET_SSE3
+   && INTVAL (operands[2]) != INTVAL (operands[3])"
+  "@
+   haddpd\t{%0, %0|%0, %0}
+   vhaddpd\t{%1, %1, %0|%0, %1, %1}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "*sse3_hsubv2df3_low"
+  [(set (match_operand:DF 0 "register_operand" "=x,x")
+	(minus:DF
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "register_operand" "0,x")
+	    (parallel [(const_int 0)]))
+	  (vec_select:DF
+	    (match_dup 1)
+	    (parallel [(const_int 1)]))))]
+  "TARGET_SSE3"
+  "@
+   hsubpd\t{%0, %0|%0, %0}
+   vhsubpd\t{%1, %1, %0|%0, %1, %1}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "avx_h<plusminus_insn>v8sf3"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_concat:V8SF
+	  (vec_concat:V4SF
+	    (vec_concat:V2SF
+	      (plusminus:SF
+		(vec_select:SF
+		  (match_operand:V8SF 1 "register_operand" "x")
+		  (parallel [(const_int 0)]))
+		(vec_select:SF (match_dup 1) (parallel [(const_int 1)])))
+	      (plusminus:SF
+		(vec_select:SF (match_dup 1) (parallel [(const_int 2)]))
+		(vec_select:SF (match_dup 1) (parallel [(const_int 3)]))))
+	    (vec_concat:V2SF
+	      (plusminus:SF
+		(vec_select:SF
+		  (match_operand:V8SF 2 "nonimmediate_operand" "xm")
+		  (parallel [(const_int 0)]))
+		(vec_select:SF (match_dup 2) (parallel [(const_int 1)])))
+	      (plusminus:SF
+		(vec_select:SF (match_dup 2) (parallel [(const_int 2)]))
+		(vec_select:SF (match_dup 2) (parallel [(const_int 3)])))))
+	  (vec_concat:V4SF
+	    (vec_concat:V2SF
+	      (plusminus:SF
+		(vec_select:SF (match_dup 1) (parallel [(const_int 4)]))
+		(vec_select:SF (match_dup 1) (parallel [(const_int 5)])))
+	      (plusminus:SF
+		(vec_select:SF (match_dup 1) (parallel [(const_int 6)]))
+		(vec_select:SF (match_dup 1) (parallel [(const_int 7)]))))
+	    (vec_concat:V2SF
+	      (plusminus:SF
+		(vec_select:SF (match_dup 2) (parallel [(const_int 4)]))
+		(vec_select:SF (match_dup 2) (parallel [(const_int 5)])))
+	      (plusminus:SF
+		(vec_select:SF (match_dup 2) (parallel [(const_int 6)]))
+		(vec_select:SF (match_dup 2) (parallel [(const_int 7)])))))))]
+  "TARGET_AVX"
+  "vh<plusminus_mnemonic>ps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "sse3_h<plusminus_insn>v4sf3"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(vec_concat:V4SF
+	  (vec_concat:V2SF
+	    (plusminus:SF
+	      (vec_select:SF
+		(match_operand:V4SF 1 "register_operand" "0,x")
+		(parallel [(const_int 0)]))
+	      (vec_select:SF (match_dup 1) (parallel [(const_int 1)])))
+	    (plusminus:SF
+	      (vec_select:SF (match_dup 1) (parallel [(const_int 2)]))
+	      (vec_select:SF (match_dup 1) (parallel [(const_int 3)]))))
+	  (vec_concat:V2SF
+	    (plusminus:SF
+	      (vec_select:SF
+		(match_operand:V4SF 2 "nonimmediate_operand" "xm,xm")
+		(parallel [(const_int 0)]))
+	      (vec_select:SF (match_dup 2) (parallel [(const_int 1)])))
+	    (plusminus:SF
+	      (vec_select:SF (match_dup 2) (parallel [(const_int 2)]))
+	      (vec_select:SF (match_dup 2) (parallel [(const_int 3)]))))))]
+  "TARGET_SSE3"
+  "@
+   h<plusminus_mnemonic>ps\t{%2, %0|%0, %2}
+   vh<plusminus_mnemonic>ps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "atom_unit" "complex")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "prefix_rep" "1,*")
+   (set_attr "mode" "V4SF")])
+
+(define_expand "reduc_splus_v8df"
+  [(match_operand:V8DF 0 "register_operand")
+   (match_operand:V8DF 1 "register_operand")]
+  "TARGET_AVX512F"
+{
+  ix86_expand_reduc (gen_addv8df3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_splus_v4df"
+  [(match_operand:V4DF 0 "register_operand")
+   (match_operand:V4DF 1 "register_operand")]
+  "TARGET_AVX"
+{
+  rtx tmp = gen_reg_rtx (V4DFmode);
+  rtx tmp2 = gen_reg_rtx (V4DFmode);
+  emit_insn (gen_avx_haddv4df3 (tmp, operands[1], operands[1]));
+  emit_insn (gen_avx_vperm2f128v4df3 (tmp2, tmp, tmp, GEN_INT (1)));
+  emit_insn (gen_addv4df3 (operands[0], tmp, tmp2));
+  DONE;
+})
+
+(define_expand "reduc_splus_v2df"
+  [(match_operand:V2DF 0 "register_operand")
+   (match_operand:V2DF 1 "register_operand")]
+  "TARGET_SSE3"
+{
+  emit_insn (gen_sse3_haddv2df3 (operands[0], operands[1], operands[1]));
+  DONE;
+})
+
+(define_expand "reduc_splus_v16sf"
+  [(match_operand:V16SF 0 "register_operand")
+   (match_operand:V16SF 1 "register_operand")]
+  "TARGET_AVX512F"
+{
+  ix86_expand_reduc (gen_addv16sf3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_splus_v8sf"
+  [(match_operand:V8SF 0 "register_operand")
+   (match_operand:V8SF 1 "register_operand")]
+  "TARGET_AVX"
+{
+  rtx tmp = gen_reg_rtx (V8SFmode);
+  rtx tmp2 = gen_reg_rtx (V8SFmode);
+  emit_insn (gen_avx_haddv8sf3 (tmp, operands[1], operands[1]));
+  emit_insn (gen_avx_haddv8sf3 (tmp2, tmp, tmp));
+  emit_insn (gen_avx_vperm2f128v8sf3 (tmp, tmp2, tmp2, GEN_INT (1)));
+  emit_insn (gen_addv8sf3 (operands[0], tmp, tmp2));
+  DONE;
+})
+
+(define_expand "reduc_splus_v4sf"
+  [(match_operand:V4SF 0 "register_operand")
+   (match_operand:V4SF 1 "register_operand")]
+  "TARGET_SSE"
+{
+  if (TARGET_SSE3)
+    {
+      rtx tmp = gen_reg_rtx (V4SFmode);
+      emit_insn (gen_sse3_haddv4sf3 (tmp, operands[1], operands[1]));
+      emit_insn (gen_sse3_haddv4sf3 (operands[0], tmp, tmp));
+    }
+  else
+    ix86_expand_reduc (gen_addv4sf3, operands[0], operands[1]);
+  DONE;
+})
+
+;; Modes handled by reduc_sm{in,ax}* patterns.
+(define_mode_iterator REDUC_SMINMAX_MODE
+  [(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
+   (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
+   (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")
+   (V4SF "TARGET_SSE") (V16SI "TARGET_AVX512F")
+   (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
+   (V8DF "TARGET_AVX512F")])
+
+(define_expand "reduc_<code>_<mode>"
+  [(smaxmin:REDUC_SMINMAX_MODE
+     (match_operand:REDUC_SMINMAX_MODE 0 "register_operand")
+     (match_operand:REDUC_SMINMAX_MODE 1 "register_operand"))]
+  ""
+{
+  ix86_expand_reduc (gen_<code><mode>3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_<code>_<mode>"
+  [(umaxmin:VI48_512
+     (match_operand:VI48_512 0 "register_operand")
+     (match_operand:VI48_512 1 "register_operand"))]
+  "TARGET_AVX512F"
+{
+  ix86_expand_reduc (gen_<code><mode>3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_<code>_<mode>"
+  [(umaxmin:VI_256
+     (match_operand:VI_256 0 "register_operand")
+     (match_operand:VI_256 1 "register_operand"))]
+  "TARGET_AVX2"
+{
+  ix86_expand_reduc (gen_<code><mode>3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_umin_v8hi"
+  [(umin:V8HI
+     (match_operand:V8HI 0 "register_operand")
+     (match_operand:V8HI 1 "register_operand"))]
+  "TARGET_SSE4_1"
+{
+  ix86_expand_reduc (gen_uminv8hi3, operands[0], operands[1]);
+  DONE;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel floating point comparisons
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "avx_cmp<mode>3"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x")
+	(unspec:VF_128_256
+	  [(match_operand:VF_128_256 1 "register_operand" "x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "xm")
+	   (match_operand:SI 3 "const_0_to_31_operand" "n")]
+	  UNSPEC_PCMP))]
+  "TARGET_AVX"
+  "vcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx_vmcmp<mode>3"
+  [(set (match_operand:VF_128 0 "register_operand" "=x")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "register_operand" "x")
+	     (match_operand:VF_128 2 "nonimmediate_operand" "xm")
+	     (match_operand:SI 3 "const_0_to_31_operand" "n")]
+	    UNSPEC_PCMP)
+	 (match_dup 1)
+	 (const_int 1)))]
+  "TARGET_AVX"
+  "vcmp<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %<iptr>2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "*<sse>_maskcmp<mode>3_comm"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x,x")
+	(match_operator:VF_128_256 3 "sse_comparison_operator"
+	  [(match_operand:VF_128_256 1 "register_operand" "%0,x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")]))]
+  "TARGET_SSE
+   && GET_RTX_CLASS (GET_CODE (operands[3])) == RTX_COMM_COMPARE"
+  "@
+   cmp%D3<ssemodesuffix>\t{%2, %0|%0, %2}
+   vcmp%D3<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse>_maskcmp<mode>3"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x,x")
+	(match_operator:VF_128_256 3 "sse_comparison_operator"
+	  [(match_operand:VF_128_256 1 "register_operand" "0,x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")]))]
+  "TARGET_SSE"
+  "@
+   cmp%D3<ssemodesuffix>\t{%2, %0|%0, %2}
+   vcmp%D3<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse>_vmmaskcmp<mode>3"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,x")
+	(vec_merge:VF_128
+	 (match_operator:VF_128 3 "sse_comparison_operator"
+	   [(match_operand:VF_128 1 "register_operand" "0,x")
+	    (match_operand:VF_128 2 "nonimmediate_operand" "xm,xm")])
+	 (match_dup 1)
+	 (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   cmp%D3<ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
+   vcmp%D3<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %<iptr>2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_mode_attr cmp_imm_predicate
+  [(V16SF "const_0_to_31_operand") (V8DF "const_0_to_31_operand")
+  (V16SI "const_0_to_7_operand") (V8DI "const_0_to_7_operand")])
+
+(define_insn "avx512f_cmp<mode>3<mask_scalar_merge_name><round_saeonly_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI48F_512 1 "register_operand" "v")
+	   (match_operand:VI48F_512 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	   (match_operand:SI 3 "<cmp_imm_predicate>" "n")]
+	  UNSPEC_PCMP))]
+  "TARGET_AVX512F && <round_saeonly_mode512bit_condition>"
+  "v<sseintprefix>cmp<ssemodesuffix>\t{%3, <round_saeonly_mask_scalar_merge_op4>%2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2<round_saeonly_mask_scalar_merge_op4>, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_ucmp<mode>3<mask_scalar_merge_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI48_512 1 "register_operand" "v")
+	   (match_operand:VI48_512 2 "nonimmediate_operand" "vm")
+	   (match_operand:SI 3 "const_0_to_7_operand" "n")]
+	  UNSPEC_UNSIGNED_PCMP))]
+  "TARGET_AVX512F"
+  "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0<mask_scalar_merge_operand4>|%0<mask_scalar_merge_operand4>, %1, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_vmcmp<mode>3<round_saeonly_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+	(and:<avx512fmaskmode>
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:VF_128 1 "register_operand" "v")
+	     (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	     (match_operand:SI 3 "const_0_to_31_operand" "n")]
+	    UNSPEC_PCMP)
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "vcmp<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %2<round_saeonly_op4>, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "avx512f_vmcmp<mode>3_mask<round_saeonly_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+	(and:<avx512fmaskmode>
+	  (unspec:<avx512fmaskmode>
+	    [(match_operand:VF_128 1 "register_operand" "v")
+	     (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	     (match_operand:SI 3 "const_0_to_31_operand" "n")]
+	    UNSPEC_PCMP)
+	  (and:<avx512fmaskmode>
+	    (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")
+	    (const_int 1))))]
+  "TARGET_AVX512F"
+  "vcmp<ssescalarmodesuffix>\t{%3, <round_saeonly_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_saeonly_op5>, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "avx512f_maskcmp<mode>3"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+	(match_operator:<avx512fmaskmode> 3 "sse_comparison_operator"
+	  [(match_operand:VF 1 "register_operand" "v")
+	   (match_operand:VF 2 "nonimmediate_operand" "vm")]))]
+  "TARGET_SSE"
+  "vcmp%D3<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<sse>_comi<round_saeonly_name>"
+  [(set (reg:CCFP FLAGS_REG)
+	(compare:CCFP
+	  (vec_select:MODEF
+	    (match_operand:<ssevecmode> 0 "register_operand" "v")
+	    (parallel [(const_int 0)]))
+	  (vec_select:MODEF
+	    (match_operand:<ssevecmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	    (parallel [(const_int 0)]))))]
+  "SSE_FLOAT_MODE_P (<MODE>mode)"
+  "%vcomi<ssemodesuffix>\t{<round_saeonly_op2>%1, %0|%0, %<iptr>1<round_saeonly_op2>}"
+  [(set_attr "type" "ssecomi")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "prefix_rep" "0")
+   (set (attr "prefix_data16")
+	(if_then_else (eq_attr "mode" "DF")
+		      (const_string "1")
+		      (const_string "0")))
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse>_ucomi<round_saeonly_name>"
+  [(set (reg:CCFPU FLAGS_REG)
+	(compare:CCFPU
+	  (vec_select:MODEF
+	    (match_operand:<ssevecmode> 0 "register_operand" "v")
+	    (parallel [(const_int 0)]))
+	  (vec_select:MODEF
+	    (match_operand:<ssevecmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	    (parallel [(const_int 0)]))))]
+  "SSE_FLOAT_MODE_P (<MODE>mode)"
+  "%vucomi<ssemodesuffix>\t{<round_saeonly_op2>%1, %0|%0, %<iptr>1<round_saeonly_op2>}"
+  [(set_attr "type" "ssecomi")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "prefix_rep" "0")
+   (set (attr "prefix_data16")
+	(if_then_else (eq_attr "mode" "DF")
+		      (const_string "1")
+		      (const_string "0")))
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "vcond<V_512:mode><VF_512:mode>"
+  [(set (match_operand:V_512 0 "register_operand")
+	(if_then_else:V_512
+	  (match_operator 3 ""
+	    [(match_operand:VF_512 4 "nonimmediate_operand")
+	     (match_operand:VF_512 5 "nonimmediate_operand")])
+	  (match_operand:V_512 1 "general_operand")
+	  (match_operand:V_512 2 "general_operand")))]
+  "TARGET_AVX512F
+   && (GET_MODE_NUNITS (<V_512:MODE>mode)
+       == GET_MODE_NUNITS (<VF_512:MODE>mode))"
+{
+  bool ok = ix86_expand_fp_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "vcond<V_256:mode><VF_256:mode>"
+  [(set (match_operand:V_256 0 "register_operand")
+	(if_then_else:V_256
+	  (match_operator 3 ""
+	    [(match_operand:VF_256 4 "nonimmediate_operand")
+	     (match_operand:VF_256 5 "nonimmediate_operand")])
+	  (match_operand:V_256 1 "general_operand")
+	  (match_operand:V_256 2 "general_operand")))]
+  "TARGET_AVX
+   && (GET_MODE_NUNITS (<V_256:MODE>mode)
+       == GET_MODE_NUNITS (<VF_256:MODE>mode))"
+{
+  bool ok = ix86_expand_fp_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "vcond<V_128:mode><VF_128:mode>"
+  [(set (match_operand:V_128 0 "register_operand")
+	(if_then_else:V_128
+	  (match_operator 3 ""
+	    [(match_operand:VF_128 4 "nonimmediate_operand")
+	     (match_operand:VF_128 5 "nonimmediate_operand")])
+	  (match_operand:V_128 1 "general_operand")
+	  (match_operand:V_128 2 "general_operand")))]
+  "TARGET_SSE
+   && (GET_MODE_NUNITS (<V_128:MODE>mode)
+       == GET_MODE_NUNITS (<VF_128:MODE>mode))"
+{
+  bool ok = ix86_expand_fp_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel floating point logical operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "<sse>_andnot<mode>3"
+  [(set (match_operand:VF 0 "register_operand" "=x,v")
+	(and:VF
+	  (not:VF
+	    (match_operand:VF 1 "register_operand" "0,v"))
+	  (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))]
+  "TARGET_SSE"
+{
+  static char buf[32];
+  const char *ops;
+  const char *suffix;
+
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V8SF:
+    case MODE_V4SF:
+      suffix = "ps";
+      break;
+    default:
+      suffix = "<ssemodesuffix>";
+    }
+
+  switch (which_alternative)
+    {
+    case 0:
+      ops = "andn%s\t{%%2, %%0|%%0, %%2}";
+      break;
+    case 1:
+      ops = "vandn%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* There is no vandnp[sd].  Use vpandnq.  */
+  if (<MODE_SIZE> == 64)
+    {
+      suffix = "q";
+      ops = "vpandn%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+    }
+
+  snprintf (buf, sizeof (buf), ops, suffix);
+  return buf;
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,maybe_evex")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "<ssePSmode>")
+	       (match_test "TARGET_AVX")
+		 (const_string "<MODE>")
+	       (match_test "optimize_function_for_size_p (cfun)")
+		 (const_string "V4SF")
+	       ]
+	       (const_string "<MODE>")))])
+
+(define_expand "<code><mode>3"
+  [(set (match_operand:VF_128_256 0 "register_operand")
+	(any_logic:VF_128_256
+	  (match_operand:VF_128_256 1 "nonimmediate_operand")
+	  (match_operand:VF_128_256 2 "nonimmediate_operand")))]
+  "TARGET_SSE"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+(define_expand "<code><mode>3"
+  [(set (match_operand:VF_512 0 "register_operand")
+       (fpint_logic:VF_512
+         (match_operand:VF_512 1 "nonimmediate_operand")
+         (match_operand:VF_512 2 "nonimmediate_operand")))]
+  "TARGET_AVX512F"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+(define_insn "*<code><mode>3"
+  [(set (match_operand:VF 0 "register_operand" "=x,v")
+	(any_logic:VF
+	  (match_operand:VF 1 "nonimmediate_operand" "%0,v")
+	  (match_operand:VF 2 "nonimmediate_operand" "xm,vm")))]
+  "TARGET_SSE && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
+  static char buf[32];
+  const char *ops;
+  const char *suffix;
+
+  switch (get_attr_mode (insn))
+    {
+    case MODE_V8SF:
+    case MODE_V4SF:
+      suffix = "ps";
+      break;
+    default:
+      suffix = "<ssemodesuffix>";
+    }
+
+  switch (which_alternative)
+    {
+    case 0:
+      ops = "<logic>%s\t{%%2, %%0|%%0, %%2}";
+      break;
+    case 1:
+      ops = "v<logic>%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* There is no v<logic>p[sd].  Use vp<logic>q.  */
+  if (<MODE_SIZE> == 64)
+    {
+      suffix = "q";
+      ops = "vp<logic>%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+    }
+
+  snprintf (buf, sizeof (buf), ops, suffix);
+  return buf;
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,maybe_evex")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "<ssePSmode>")
+	       (match_test "TARGET_AVX")
+		 (const_string "<MODE>")
+	       (match_test "optimize_function_for_size_p (cfun)")
+		 (const_string "V4SF")
+	       ]
+	       (const_string "<MODE>")))])
+
+(define_expand "copysign<mode>3"
+  [(set (match_dup 4)
+	(and:VF
+	  (not:VF (match_dup 3))
+	  (match_operand:VF 1 "nonimmediate_operand")))
+   (set (match_dup 5)
+	(and:VF (match_dup 3)
+		(match_operand:VF 2 "nonimmediate_operand")))
+   (set (match_operand:VF 0 "register_operand")
+	(ior:VF (match_dup 4) (match_dup 5)))]
+  "TARGET_SSE"
+{
+  operands[3] = ix86_build_signbit_mask (<MODE>mode, 1, 0);
+
+  operands[4] = gen_reg_rtx (<MODE>mode);
+  operands[5] = gen_reg_rtx (<MODE>mode);
+})
+
+;; Also define scalar versions.  These are used for abs, neg, and
+;; conditional move.  Using subregs into vector modes causes register
+;; allocation lossage.  These patterns do not allow memory operands
+;; because the native instructions read the full 128-bits.
+
+(define_insn "*andnot<mode>3"
+  [(set (match_operand:MODEF 0 "register_operand" "=x,x")
+	(and:MODEF
+	  (not:MODEF
+	    (match_operand:MODEF 1 "register_operand" "0,x"))
+	    (match_operand:MODEF 2 "register_operand" "x,x")))]
+  "SSE_FLOAT_MODE_P (<MODE>mode)"
+{
+  static char buf[32];
+  const char *ops;
+  const char *suffix
+    = (get_attr_mode (insn) == MODE_V4SF) ? "ps" : "<ssevecmodesuffix>";
+
+  switch (which_alternative)
+    {
+    case 0:
+      ops = "andn%s\t{%%2, %%0|%%0, %%2}";
+      break;
+    case 1:
+      ops = "vandn%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  snprintf (buf, sizeof (buf), ops, suffix);
+  return buf;
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,vex")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "V4SF")
+	       (match_test "TARGET_AVX")
+		 (const_string "<ssevecmode>")
+	       (match_test "optimize_function_for_size_p (cfun)")
+		 (const_string "V4SF")
+	       ]
+	       (const_string "<ssevecmode>")))])
+
+(define_insn "*andnottf3"
+  [(set (match_operand:TF 0 "register_operand" "=x,x")
+	(and:TF
+	  (not:TF (match_operand:TF 1 "register_operand" "0,x"))
+	  (match_operand:TF 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE"
+{
+  static char buf[32];
+  const char *ops;
+  const char *tmp
+    = (get_attr_mode (insn) == MODE_V4SF) ? "andnps" : "pandn";
+
+  switch (which_alternative)
+    {
+    case 0:
+      ops = "%s\t{%%2, %%0|%%0, %%2}";
+      break;
+    case 1:
+      ops = "v%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  snprintf (buf, sizeof (buf), ops, tmp);
+  return buf;
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (and (eq_attr "alternative" "0")
+	    (eq_attr "mode" "TI"))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "prefix" "orig,vex")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "V4SF")
+	       (match_test "TARGET_AVX")
+		 (const_string "TI")
+	       (ior (not (match_test "TARGET_SSE2"))
+		    (match_test "optimize_function_for_size_p (cfun)"))
+		 (const_string "V4SF")
+	       ]
+	       (const_string "TI")))])
+
+(define_insn "*<code><mode>3"
+  [(set (match_operand:MODEF 0 "register_operand" "=x,x")
+	(any_logic:MODEF
+	  (match_operand:MODEF 1 "register_operand" "%0,x")
+	  (match_operand:MODEF 2 "register_operand" "x,x")))]
+  "SSE_FLOAT_MODE_P (<MODE>mode)"
+{
+  static char buf[32];
+  const char *ops;
+  const char *suffix
+    = (get_attr_mode (insn) == MODE_V4SF) ? "ps" : "<ssevecmodesuffix>";
+
+  switch (which_alternative)
+    {
+    case 0:
+      ops = "<logic>%s\t{%%2, %%0|%%0, %%2}";
+      break;
+    case 1:
+      ops = "v<logic>%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  snprintf (buf, sizeof (buf), ops, suffix);
+  return buf;
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,vex")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "V4SF")
+	       (match_test "TARGET_AVX")
+		 (const_string "<ssevecmode>")
+	       (match_test "optimize_function_for_size_p (cfun)")
+		 (const_string "V4SF")
+	       ]
+	       (const_string "<ssevecmode>")))])
+
+(define_expand "<code>tf3"
+  [(set (match_operand:TF 0 "register_operand")
+	(any_logic:TF
+	  (match_operand:TF 1 "nonimmediate_operand")
+	  (match_operand:TF 2 "nonimmediate_operand")))]
+  "TARGET_SSE"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, TFmode, operands);")
+
+(define_insn "*<code>tf3"
+  [(set (match_operand:TF 0 "register_operand" "=x,x")
+	(any_logic:TF
+	  (match_operand:TF 1 "nonimmediate_operand" "%0,x")
+	  (match_operand:TF 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE
+   && ix86_binary_operator_ok (<CODE>, TFmode, operands)"
+{
+  static char buf[32];
+  const char *ops;
+  const char *tmp
+    = (get_attr_mode (insn) == MODE_V4SF) ? "<logic>ps" : "p<logic>";
+
+  switch (which_alternative)
+    {
+    case 0:
+      ops = "%s\t{%%2, %%0|%%0, %%2}";
+      break;
+    case 1:
+      ops = "v%s\t{%%2, %%1, %%0|%%0, %%1, %%2}";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  snprintf (buf, sizeof (buf), ops, tmp);
+  return buf;
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (and (eq_attr "alternative" "0")
+	    (eq_attr "mode" "TI"))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "prefix" "orig,vex")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "V4SF")
+	       (match_test "TARGET_AVX")
+		 (const_string "TI")
+	       (ior (not (match_test "TARGET_SSE2"))
+		    (match_test "optimize_function_for_size_p (cfun)"))
+		 (const_string "V4SF")
+	       ]
+	       (const_string "TI")))])
+
+;; There are no floating point xor for V16SF and V8DF in avx512f
+;; but we need them for negation.  Instead we use int versions of
+;; xor.  Maybe there could be a better way to do that.
+
+(define_mode_attr avx512flogicsuff
+  [(V16SF "d") (V8DF "q")])
+
+(define_insn "avx512f_<logic><mode>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(fpint_logic:VF_512
+	  (match_operand:VF_512 1 "register_operand" "v")
+	  (match_operand:VF_512 2 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512F"
+  "vp<logic><avx512flogicsuff>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; FMA floating point multiply/accumulate instructions.  These include
+;; scalar versions of the instructions as well as vector versions.
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; The standard names for scalar FMA are only available with SSE math enabled.
+;; CPUID bit AVX512F enables evex encoded scalar and 512-bit fma.  It doesn't
+;; care about FMA bit, so we enable fma for TARGET_AVX512F even when TARGET_FMA
+;; and TARGET_FMA4 are both false.
+;; TODO: In theory AVX512F does not automatically imply FMA, and without FMA
+;; one must force the EVEX encoding of the fma insns.  Ideally we'd improve
+;; GAS to allow proper prefix selection.  However, for the moment all hardware
+;; that supports AVX512F also supports FMA so we can ignore this for now.
+(define_mode_iterator FMAMODEM
+  [(SF "TARGET_SSE_MATH && (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F)")
+   (DF "TARGET_SSE_MATH && (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F)")
+   (V4SF "TARGET_FMA || TARGET_FMA4")
+   (V2DF "TARGET_FMA || TARGET_FMA4")
+   (V8SF "TARGET_FMA || TARGET_FMA4")
+   (V4DF "TARGET_FMA || TARGET_FMA4")
+   (V16SF "TARGET_AVX512F")
+   (V8DF "TARGET_AVX512F")])
+
+(define_expand "fma<mode>4"
+  [(set (match_operand:FMAMODEM 0 "register_operand")
+	(fma:FMAMODEM
+	  (match_operand:FMAMODEM 1 "nonimmediate_operand")
+	  (match_operand:FMAMODEM 2 "nonimmediate_operand")
+	  (match_operand:FMAMODEM 3 "nonimmediate_operand")))]
+  "")
+
+(define_expand "fms<mode>4"
+  [(set (match_operand:FMAMODEM 0 "register_operand")
+	(fma:FMAMODEM
+	  (match_operand:FMAMODEM 1 "nonimmediate_operand")
+	  (match_operand:FMAMODEM 2 "nonimmediate_operand")
+	  (neg:FMAMODEM (match_operand:FMAMODEM 3 "nonimmediate_operand"))))]
+  "")
+
+(define_expand "fnma<mode>4"
+  [(set (match_operand:FMAMODEM 0 "register_operand")
+	(fma:FMAMODEM
+	  (neg:FMAMODEM (match_operand:FMAMODEM 1 "nonimmediate_operand"))
+	  (match_operand:FMAMODEM 2 "nonimmediate_operand")
+	  (match_operand:FMAMODEM 3 "nonimmediate_operand")))]
+  "")
+
+(define_expand "fnms<mode>4"
+  [(set (match_operand:FMAMODEM 0 "register_operand")
+	(fma:FMAMODEM
+	  (neg:FMAMODEM (match_operand:FMAMODEM 1 "nonimmediate_operand"))
+	  (match_operand:FMAMODEM 2 "nonimmediate_operand")
+	  (neg:FMAMODEM (match_operand:FMAMODEM 3 "nonimmediate_operand"))))]
+  "")
+
+;; The builtins for intrinsics are not constrained by SSE math enabled.
+(define_mode_iterator FMAMODE [(SF "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F")
+			       (DF "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F")
+			       (V4SF "TARGET_FMA || TARGET_FMA4")
+			       (V2DF "TARGET_FMA || TARGET_FMA4")
+			       (V8SF "TARGET_FMA || TARGET_FMA4")
+			       (V4DF "TARGET_FMA || TARGET_FMA4")
+			       (V16SF "TARGET_AVX512F")
+			       (V8DF "TARGET_AVX512F")])
+
+(define_expand "fma4i_fmadd_<mode>"
+  [(set (match_operand:FMAMODE 0 "register_operand")
+	(fma:FMAMODE
+	  (match_operand:FMAMODE 1 "nonimmediate_operand")
+	  (match_operand:FMAMODE 2 "nonimmediate_operand")
+	  (match_operand:FMAMODE 3 "nonimmediate_operand")))]
+  "")
+
+(define_expand "avx512f_fmadd_<mode>_maskz<round_expand_name>"
+  [(match_operand:VF_512 0 "register_operand")
+   (match_operand:VF_512 1 "<round_expand_nimm_predicate>")
+   (match_operand:VF_512 2 "<round_expand_nimm_predicate>")
+   (match_operand:VF_512 3 "<round_expand_nimm_predicate>")
+   (match_operand:<avx512fmaskmode> 4 "register_operand")]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_fma_fmadd_<mode>_maskz_1<round_expand_name> (
+    operands[0], operands[1], operands[2], operands[3],
+    CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>));
+  DONE;
+})
+
+(define_insn "<sd_mask_codefor>fma_fmadd_<mode><sd_maskz_name><round_name>"
+  [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x")
+	(fma:FMAMODE
+	  (match_operand:FMAMODE 1 "<round_nimm_predicate>" "%0,0,v,x,x")
+	  (match_operand:FMAMODE 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m")
+	  (match_operand:FMAMODE 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x")))]
+  "<sd_mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "@
+   vfmadd132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>}
+   vfmadd213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>}
+   vfmadd231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>}
+   vfmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fmadd_<mode>_mask<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v,v")
+	(vec_merge:VF_512
+	  (fma:VF_512
+	    (match_operand:VF_512 1 "register_operand" "0,0")
+	    (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v")
+	    (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>"))
+	  (match_dup 1)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512F"
+  "@
+   vfmadd132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>}
+   vfmadd213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fmadd_<mode>_mask3<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=x")
+	(vec_merge:VF_512
+	  (fma:VF_512
+	    (match_operand:VF_512 1 "register_operand" "x")
+	    (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>")
+	    (match_operand:VF_512 3 "register_operand" "0"))
+	  (match_dup 3)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vfmadd231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}"
+  [(set_attr "isa" "fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sd_mask_codefor>fma_fmsub_<mode><sd_maskz_name><round_name>"
+  [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x")
+	(fma:FMAMODE
+	  (match_operand:FMAMODE   1 "<round_nimm_predicate>" "%0, 0, v, x,x")
+	  (match_operand:FMAMODE   2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m")
+	  (neg:FMAMODE
+	    (match_operand:FMAMODE 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x"))))]
+  "<sd_mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "@
+   vfmsub132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>}
+   vfmsub213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>}
+   vfmsub231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>}
+   vfmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fmsub_<mode>_mask<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v,v")
+	(vec_merge:VF_512
+	  (fma:VF_512
+	    (match_operand:VF_512 1 "register_operand" "0,0")
+	    (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v")
+	    (neg:VF_512
+	      (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>")))
+	  (match_dup 1)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512F"
+  "@
+   vfmsub132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>}
+   vfmsub213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fmsub_<mode>_mask3<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(vec_merge:VF_512
+	  (fma:VF_512
+	    (match_operand:VF_512 1 "register_operand" "v")
+	    (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>")
+	    (neg:VF_512
+	      (match_operand:VF_512 3 "register_operand" "0")))
+	  (match_dup 3)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vfmsub231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}"
+  [(set_attr "isa" "fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sd_mask_codefor>fma_fnmadd_<mode><sd_maskz_name><round_name>"
+  [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x")
+	(fma:FMAMODE
+	  (neg:FMAMODE
+	    (match_operand:FMAMODE 1 "<round_nimm_predicate>" "%0,0,v,x,x"))
+	  (match_operand:FMAMODE   2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m")
+	  (match_operand:FMAMODE   3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x")))]
+  "<sd_mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "@
+   vfnmadd132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>}
+   vfnmadd213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>}
+   vfnmadd231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>}
+   vfnmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfnmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fnmadd_<mode>_mask<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v,v")
+	(vec_merge:VF_512
+	  (fma:VF_512
+	    (neg:VF_512
+	      (match_operand:VF_512 1 "register_operand" "0,0"))
+	    (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v")
+	    (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>"))
+	  (match_dup 1)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512F"
+  "@
+   vfnmadd132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>}
+   vfnmadd213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fnmadd_<mode>_mask3<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(vec_merge:VF_512
+	  (fma:VF_512
+	    (neg:VF_512
+	      (match_operand:VF_512 1 "register_operand" "v"))
+	    (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>")
+	    (match_operand:VF_512 3 "register_operand" "0"))
+	  (match_dup 3)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vfnmadd231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}"
+  [(set_attr "isa" "fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sd_mask_codefor>fma_fnmsub_<mode><sd_maskz_name><round_name>"
+  [(set (match_operand:FMAMODE 0 "register_operand" "=v,v,v,x,x")
+	(fma:FMAMODE
+	  (neg:FMAMODE
+	    (match_operand:FMAMODE 1 "<round_nimm_predicate>" "%0,0,v,x,x"))
+	  (match_operand:FMAMODE   2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m")
+	  (neg:FMAMODE
+	    (match_operand:FMAMODE 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x"))))]
+  "<sd_mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "@
+   vfnmsub132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>}
+   vfnmsub213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>}
+   vfnmsub231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>}
+   vfnmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfnmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fnmsub_<mode>_mask<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v,v")
+	(vec_merge:VF_512
+	  (fma:VF_512
+	    (neg:VF_512
+	      (match_operand:VF_512 1 "register_operand" "0,0"))
+	    (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v")
+	    (neg:VF_512
+	      (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>")))
+	  (match_dup 1)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512F"
+  "@
+   vfnmsub132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>}
+   vfnmsub213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fnmsub_<mode>_mask3<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(vec_merge:VF_512
+	  (fma:VF_512
+	    (neg:VF_512
+	      (match_operand:VF_512 1 "register_operand" "v"))
+	    (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>")
+	    (neg:VF_512
+	      (match_operand:VF_512 3 "register_operand" "0")))
+	  (match_dup 3)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vfnmsub231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}"
+  [(set_attr "isa" "fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; FMA parallel floating point multiply addsub and subadd operations.
+
+;; It would be possible to represent these without the UNSPEC as
+;;
+;; (vec_merge
+;;   (fma op1 op2 op3)
+;;   (fma op1 op2 (neg op3))
+;;   (merge-const))
+;;
+;; But this doesn't seem useful in practice.
+
+(define_expand "fmaddsub_<mode>"
+  [(set (match_operand:VF 0 "register_operand")
+	(unspec:VF
+	  [(match_operand:VF 1 "nonimmediate_operand")
+	   (match_operand:VF 2 "nonimmediate_operand")
+	   (match_operand:VF 3 "nonimmediate_operand")]
+	  UNSPEC_FMADDSUB))]
+  "TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F")
+
+(define_expand "avx512f_fmaddsub_<mode>_maskz<round_expand_name>"
+  [(match_operand:VF_512 0 "register_operand")
+   (match_operand:VF_512 1 "<round_expand_nimm_predicate>")
+   (match_operand:VF_512 2 "<round_expand_nimm_predicate>")
+   (match_operand:VF_512 3 "<round_expand_nimm_predicate>")
+   (match_operand:<avx512fmaskmode> 4 "register_operand")]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_fma_fmaddsub_<mode>_maskz_1<round_expand_name> (
+    operands[0], operands[1], operands[2], operands[3],
+    CONST0_RTX (<MODE>mode), operands[4]<round_expand_operand>));
+  DONE;
+})
+
+(define_insn "<sd_mask_codefor>fma_fmaddsub_<mode><sd_maskz_name><round_name>"
+  [(set (match_operand:VF 0 "register_operand" "=v,v,v,x,x")
+	(unspec:VF
+	  [(match_operand:VF 1 "<round_nimm_predicate>" "%0,0,v,x,x")
+	   (match_operand:VF 2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m")
+	   (match_operand:VF 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x")]
+	  UNSPEC_FMADDSUB))]
+  "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F) && <sd_mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "@
+   vfmaddsub132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>}
+   vfmaddsub213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>}
+   vfmaddsub231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>}
+   vfmaddsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfmaddsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fmaddsub_<mode>_mask<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v,v")
+	(vec_merge:VF_512
+	  (unspec:VF_512
+	    [(match_operand:VF_512 1 "register_operand" "0,0")
+	     (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v")
+	     (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>")]
+	    UNSPEC_FMADDSUB)
+	  (match_dup 1)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512F"
+  "@
+   vfmaddsub132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>}
+   vfmaddsub213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fmaddsub_<mode>_mask3<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(vec_merge:VF_512
+	  (unspec:VF_512
+	    [(match_operand:VF_512 1 "register_operand" "v")
+	     (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>")
+	     (match_operand:VF_512 3 "register_operand" "0")]
+	    UNSPEC_FMADDSUB)
+	  (match_dup 3)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vfmaddsub231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}"
+  [(set_attr "isa" "fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sd_mask_codefor>fma_fmsubadd_<mode><sd_maskz_name><round_name>"
+  [(set (match_operand:VF 0 "register_operand" "=v,v,v,x,x")
+	(unspec:VF
+	  [(match_operand:VF   1 "<round_nimm_predicate>" "%0,0,v,x,x")
+	   (match_operand:VF   2 "<round_nimm_predicate>" "<round_constraint>,v,<round_constraint>,x,m")
+	   (neg:VF
+	     (match_operand:VF 3 "<round_nimm_predicate>" "v,<round_constraint>,0,xm,x"))]
+	  UNSPEC_FMADDSUB))]
+  "(TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F) && <sd_mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "@
+   vfmsubadd132<ssemodesuffix>\t{<round_sd_mask_op4>%2, %3, %0<sd_mask_op4>|%0<sd_mask_op4>, %3, %2<round_sd_mask_op4>}
+   vfmsubadd213<ssemodesuffix>\t{<round_sd_mask_op4>%3, %2, %0<sd_mask_op4>|%0<sd_mask_op4>, %2, %3<round_sd_mask_op4>}
+   vfmsubadd231<ssemodesuffix>\t{<round_sd_mask_op4>%2, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %2<round_sd_mask_op4>}
+   vfmsubadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfmsubadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f,fma_avx512f,fma4,fma4")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fmsubadd_<mode>_mask<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v,v")
+	(vec_merge:VF_512
+	  (unspec:VF_512
+	    [(match_operand:VF_512 1 "register_operand" "0,0")
+	     (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>,v")
+	     (neg:VF_512
+	       (match_operand:VF_512 3 "<round_nimm_predicate>" "v,<round_constraint>"))]
+	    UNSPEC_FMADDSUB)
+	  (match_dup 1)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512F"
+  "@
+   vfmsubadd132<ssemodesuffix>\t{<round_op5>%2, %3, %0%{%4%}|%0%{%4%}, %3, %2<round_op5>}
+   vfmsubadd213<ssemodesuffix>\t{<round_op5>%3, %2, %0%{%4%}|%0%{%4%}, %2, %3<round_op5>}"
+  [(set_attr "isa" "fma_avx512f,fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fmsubadd_<mode>_mask3<round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(vec_merge:VF_512
+	  (unspec:VF_512
+	    [(match_operand:VF_512 1 "register_operand" "v")
+	     (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>")
+	     (neg:VF_512
+	       (match_operand:VF_512 3 "register_operand" "0"))]
+	    UNSPEC_FMADDSUB)
+	  (match_dup 3)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vfmsubadd231<ssemodesuffix>\t{<round_op5>%2, %1, %0%{%4%}|%0%{%4%}, %1, %2<round_op5>}"
+  [(set_attr "isa" "fma_avx512f")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; FMA3 floating point scalar intrinsics. These merge result with
+;; high-order elements from the destination register.
+
+(define_expand "fmai_vmfmadd_<mode><round_name>"
+  [(set (match_operand:VF_128 0 "register_operand")
+	(vec_merge:VF_128
+	  (fma:VF_128
+	    (match_operand:VF_128 1 "<round_nimm_predicate>")
+	    (match_operand:VF_128 2 "<round_nimm_predicate>")
+	    (match_operand:VF_128 3 "<round_nimm_predicate>"))
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_FMA")
+
+(define_insn "*fmai_fmadd_<mode>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+        (vec_merge:VF_128
+	  (fma:VF_128
+	    (match_operand:VF_128 1 "<round_nimm_predicate>" " 0, 0")
+	    (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>, v")
+	    (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>"))
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_FMA || TARGET_AVX512F"
+  "@
+   vfmadd132<ssescalarmodesuffix>\t{<round_op4>%2, %3, %0|%0, %<iptr>3, %<iptr>2<round_op4>}
+   vfmadd213<ssescalarmodesuffix>\t{<round_op4>%3, %2, %0|%0, %<iptr>2, %<iptr>3<round_op4>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fmai_fmsub_<mode>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+        (vec_merge:VF_128
+	  (fma:VF_128
+	    (match_operand:VF_128   1 "<round_nimm_predicate>" "0,0")
+	    (match_operand:VF_128   2 "<round_nimm_predicate>" "<round_constraint>,v")
+	    (neg:VF_128
+	      (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>")))
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_FMA || TARGET_AVX512F"
+  "@
+   vfmsub132<ssescalarmodesuffix>\t{<round_op4>%2, %3, %0|%0, %<iptr>3, %<iptr>2<round_op4>}
+   vfmsub213<ssescalarmodesuffix>\t{<round_op4>%3, %2, %0|%0, %<iptr>2, %<iptr>3<round_op4>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fmai_fnmadd_<mode><round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+        (vec_merge:VF_128
+	  (fma:VF_128
+	    (neg:VF_128
+	      (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>,v"))
+	    (match_operand:VF_128   1 "<round_nimm_predicate>" "0,0")
+	    (match_operand:VF_128   3 "<round_nimm_predicate>" "v,<round_constraint>"))
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_FMA || TARGET_AVX512F"
+  "@
+   vfnmadd132<ssescalarmodesuffix>\t{<round_op4>%2, %3, %0|%0, %<iptr>3, %<iptr>2<round_op4>}
+   vfnmadd213<ssescalarmodesuffix>\t{<round_op4>%3, %2, %0|%0, %<iptr>2, %<iptr>3<round_op4>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fmai_fnmsub_<mode><round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v,v")
+        (vec_merge:VF_128
+	  (fma:VF_128
+	    (neg:VF_128
+	      (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>, v"))
+	    (match_operand:VF_128   1 "<round_nimm_predicate>" " 0, 0")
+	    (neg:VF_128
+	      (match_operand:VF_128 3 "<round_nimm_predicate>" " v,<round_constraint>")))
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_FMA || TARGET_AVX512F"
+  "@
+   vfnmsub132<ssescalarmodesuffix>\t{<round_op4>%2, %3, %0|%0, %<iptr>3, %<iptr>2<round_op4>}
+   vfnmsub213<ssescalarmodesuffix>\t{<round_op4>%3, %2, %0|%0, %<iptr>2, %<iptr>3<round_op4>}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; FMA4 floating point scalar intrinsics.  These write the
+;; entire destination register, with the high-order elements zeroed.
+
+(define_expand "fma4i_vmfmadd_<mode>"
+  [(set (match_operand:VF_128 0 "register_operand")
+	(vec_merge:VF_128
+	  (fma:VF_128
+	    (match_operand:VF_128 1 "nonimmediate_operand")
+	    (match_operand:VF_128 2 "nonimmediate_operand")
+	    (match_operand:VF_128 3 "nonimmediate_operand"))
+	  (match_dup 4)
+	  (const_int 1)))]
+  "TARGET_FMA4"
+  "operands[4] = CONST0_RTX (<MODE>mode);")
+
+(define_insn "*fma4i_vmfmadd_<mode>"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,x")
+	(vec_merge:VF_128
+	  (fma:VF_128
+	    (match_operand:VF_128 1 "nonimmediate_operand" "%x,x")
+	    (match_operand:VF_128 2 "nonimmediate_operand" " x,m")
+	    (match_operand:VF_128 3 "nonimmediate_operand" "xm,x"))
+	  (match_operand:VF_128 4 "const0_operand")
+	  (const_int 1)))]
+  "TARGET_FMA4"
+  "vfmadd<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %<iptr>2, %<iptr>3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fma4i_vmfmsub_<mode>"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,x")
+	(vec_merge:VF_128
+	  (fma:VF_128
+	    (match_operand:VF_128 1 "nonimmediate_operand" "%x,x")
+	    (match_operand:VF_128 2 "nonimmediate_operand" " x,m")
+	    (neg:VF_128
+	      (match_operand:VF_128 3 "nonimmediate_operand" "xm,x")))
+	  (match_operand:VF_128 4 "const0_operand")
+	  (const_int 1)))]
+  "TARGET_FMA4"
+  "vfmsub<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %<iptr>2, %<iptr>3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fma4i_vmfnmadd_<mode>"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,x")
+	(vec_merge:VF_128
+	  (fma:VF_128
+	    (neg:VF_128
+	      (match_operand:VF_128 1 "nonimmediate_operand" "%x,x"))
+	    (match_operand:VF_128   2 "nonimmediate_operand" " x,m")
+	    (match_operand:VF_128   3 "nonimmediate_operand" "xm,x"))
+	  (match_operand:VF_128 4 "const0_operand")
+	  (const_int 1)))]
+  "TARGET_FMA4"
+  "vfnmadd<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %<iptr>2, %<iptr>3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fma4i_vmfnmsub_<mode>"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,x")
+	(vec_merge:VF_128
+	  (fma:VF_128
+	    (neg:VF_128
+	      (match_operand:VF_128 1 "nonimmediate_operand" "%x,x"))
+	    (match_operand:VF_128   2 "nonimmediate_operand" " x,m")
+	    (neg:VF_128
+	      (match_operand:VF_128   3 "nonimmediate_operand" "xm,x")))
+	  (match_operand:VF_128 4 "const0_operand")
+	  (const_int 1)))]
+  "TARGET_FMA4"
+  "vfnmsub<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %<iptr>2, %<iptr>3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel single-precision floating point conversion operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "sse_cvtpi2ps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(vec_merge:V4SF
+	  (vec_duplicate:V4SF
+	    (float:V2SF (match_operand:V2SI 2 "nonimmediate_operand" "ym")))
+	  (match_operand:V4SF 1 "register_operand" "0")
+	  (const_int 3)))]
+  "TARGET_SSE"
+  "cvtpi2ps\t{%2, %0|%0, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse_cvtps2pi"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(vec_select:V2SI
+	  (unspec:V4SI [(match_operand:V4SF 1 "nonimmediate_operand" "xm")]
+		       UNSPEC_FIX_NOTRUNC)
+	  (parallel [(const_int 0) (const_int 1)])))]
+  "TARGET_SSE"
+  "cvtps2pi\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "unit" "mmx")
+   (set_attr "mode" "DI")])
+
+(define_insn "sse_cvttps2pi"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(vec_select:V2SI
+	  (fix:V4SI (match_operand:V4SF 1 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 1)])))]
+  "TARGET_SSE"
+  "cvttps2pi\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "unit" "mmx")
+   (set_attr "prefix_rep" "0")
+   (set_attr "mode" "SF")])
+
+(define_insn "sse_cvtsi2ss<round_name>"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x,v")
+	(vec_merge:V4SF
+	  (vec_duplicate:V4SF
+	    (float:SF (match_operand:SI 2 "<round_nimm_predicate>" "r,m,<round_constraint3>")))
+	  (match_operand:V4SF 1 "register_operand" "0,0,v")
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   cvtsi2ss\t{%2, %0|%0, %2}
+   cvtsi2ss\t{%2, %0|%0, %2}
+   vcvtsi2ss\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "vector,double,*")
+   (set_attr "amdfam10_decode" "vector,double,*")
+   (set_attr "bdver1_decode" "double,direct,*")
+   (set_attr "btver2_decode" "double,double,double")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "SF")])
+
+(define_insn "sse_cvtsi2ssq<round_name>"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x,v")
+	(vec_merge:V4SF
+	  (vec_duplicate:V4SF
+	    (float:SF (match_operand:DI 2 "<round_nimm_predicate>" "r,m,<round_constraint3>")))
+	  (match_operand:V4SF 1 "register_operand" "0,0,v")
+	  (const_int 1)))]
+  "TARGET_SSE && TARGET_64BIT"
+  "@
+   cvtsi2ssq\t{%2, %0|%0, %2}
+   cvtsi2ssq\t{%2, %0|%0, %2}
+   vcvtsi2ssq\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "vector,double,*")
+   (set_attr "amdfam10_decode" "vector,double,*")
+   (set_attr "bdver1_decode" "double,direct,*")
+   (set_attr "btver2_decode" "double,double,double")
+   (set_attr "length_vex" "*,*,4")
+   (set_attr "prefix_rex" "1,1,*")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "SF")])
+
+(define_insn "sse_cvtss2si<round_name>"
+  [(set (match_operand:SI 0 "register_operand" "=r,r")
+	(unspec:SI
+	  [(vec_select:SF
+	     (match_operand:V4SF 1 "<round_nimm_predicate>" "v,<round_constraint2>")
+	     (parallel [(const_int 0)]))]
+	  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_SSE"
+  "%vcvtss2si\t{<round_op2>%1, %0|%0, %k1<round_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
+(define_insn "sse_cvtss2si_2"
+  [(set (match_operand:SI 0 "register_operand" "=r,r")
+	(unspec:SI [(match_operand:SF 1 "nonimmediate_operand" "v,m")]
+		   UNSPEC_FIX_NOTRUNC))]
+  "TARGET_SSE"
+  "%vcvtss2si\t{%1, %0|%0, %k1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
+(define_insn "sse_cvtss2siq<round_name>"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(unspec:DI
+	  [(vec_select:SF
+	     (match_operand:V4SF 1 "<round_nimm_predicate>" "v,<round_constraint2>")
+	     (parallel [(const_int 0)]))]
+	  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_SSE && TARGET_64BIT"
+  "%vcvtss2si{q}\t{<round_op2>%1, %0|%0, %k1<round_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DI")])
+
+(define_insn "sse_cvtss2siq_2"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(unspec:DI [(match_operand:SF 1 "nonimmediate_operand" "v,m")]
+		   UNSPEC_FIX_NOTRUNC))]
+  "TARGET_SSE && TARGET_64BIT"
+  "%vcvtss2si{q}\t{%1, %0|%0, %k1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DI")])
+
+(define_insn "sse_cvttss2si<round_saeonly_name>"
+  [(set (match_operand:SI 0 "register_operand" "=r,r")
+	(fix:SI
+	  (vec_select:SF
+	    (match_operand:V4SF 1 "<round_saeonly_nimm_predicate>" "v,<round_saeonly_constraint2>")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_SSE"
+  "%vcvttss2si\t{<round_saeonly_op2>%1, %0|%0, %k1<round_saeonly_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
+(define_insn "sse_cvttss2siq<round_saeonly_name>"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(fix:DI
+	  (vec_select:SF
+	    (match_operand:V4SF 1 "<round_saeonly_nimm_predicate>" "v,<round_saeonly_constraint>")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_SSE && TARGET_64BIT"
+  "%vcvttss2si{q}\t{<round_saeonly_op2>%1, %0|%0, %k1<round_saeonly_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DI")])
+
+(define_insn "cvtusi2<ssescalarmodesuffix>32<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (vec_duplicate:VF_128
+	    (unsigned_float:<ssescalarmode>
+	      (match_operand:SI 2 "<round_nimm_predicate>" "<round_constraint3>")))
+	  (match_operand:VF_128 1 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512F && <round_modev4sf_condition>"
+  "vcvtusi2<ssescalarmodesuffix>\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "cvtusi2<ssescalarmodesuffix>64<round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (vec_duplicate:VF_128
+	    (unsigned_float:<ssescalarmode>
+	      (match_operand:DI 2 "<round_nimm_predicate>" "<round_constraint3>")))
+	  (match_operand:VF_128 1 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512F && TARGET_64BIT"
+  "vcvtusi2<ssescalarmodesuffix>\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "float<sseintvecmodelower><mode>2<mask_name><round_name>"
+  [(set (match_operand:VF1 0 "register_operand" "=v")
+	(float:VF1
+	  (match_operand:<sseintvecmode> 1 "<round_nimm_predicate>" "<round_constraint>")))]
+  "TARGET_SSE2 && <mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "%vcvtdq2ps\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "ufloatv16siv16sf2<mask_name><round_name>"
+  [(set (match_operand:V16SF 0 "register_operand" "=v")
+	(unsigned_float:V16SF
+	  (match_operand:V16SI 1 "<round_nimm_predicate>" "<round_constraint>")))]
+  "TARGET_AVX512F"
+  "vcvtudq2ps\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V16SF")])
+
+(define_expand "floatuns<sseintvecmodelower><mode>2"
+  [(match_operand:VF1 0 "register_operand")
+   (match_operand:<sseintvecmode> 1 "register_operand")]
+  "TARGET_SSE2 && (<MODE>mode == V4SFmode || TARGET_AVX2)"
+{
+  if (<MODE>mode == V16SFmode)
+    emit_insn (gen_ufloatv16siv16sf2 (operands[0], operands[1]));
+  else
+    ix86_expand_vector_convert_uns_vsivsf (operands[0], operands[1]);
+
+  DONE;
+})
+
+
+;; For <sse2_avx_avx512f>_fix_notrunc<sf2simodelower><mode> insn pattern
+(define_mode_attr sf2simodelower
+  [(V16SI "v16sf") (V8SI "v8sf") (V4SI "v4sf")])
+
+(define_insn "<sse2_avx_avx512f>_fix_notrunc<sf2simodelower><mode>"
+  [(set (match_operand:VI4_AVX 0 "register_operand" "=v")
+	(unspec:VI4_AVX
+	  [(match_operand:<ssePSmode> 1 "nonimmediate_operand" "vm")]
+	  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_SSE2"
+  "%vcvtps2dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<mask_codefor>avx512f_fix_notruncv16sfv16si<mask_name><round_name>"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(unspec:V16SI
+	  [(match_operand:V16SF 1 "<round_nimm_predicate>" "<round_constraint>")]
+	  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_AVX512F"
+  "vcvtps2dq\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "<mask_codefor>avx512f_ufix_notruncv16sfv16si<mask_name><round_name>"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(unspec:V16SI
+	  [(match_operand:V16SF 1 "<round_nimm_predicate>" "<round_constraint>")]
+	  UNSPEC_UNSIGNED_FIX_NOTRUNC))]
+  "TARGET_AVX512F"
+  "vcvtps2udq\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "<fixsuffix>fix_truncv16sfv16si2<mask_name><round_saeonly_name>"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(any_fix:V16SI
+	  (match_operand:V16SF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))]
+  "TARGET_AVX512F"
+  "vcvttps2<fixsuffix>dq\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "fix_truncv8sfv8si2"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+	(fix:V8SI (match_operand:V8SF 1 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX"
+  "vcvttps2dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "fix_truncv4sfv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(fix:V4SI (match_operand:V4SF 1 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE2"
+  "%vcvttps2dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set (attr "prefix_rep")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "0")))
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "fixuns_trunc<mode><sseintvecmodelower>2"
+  [(match_operand:<sseintvecmode> 0 "register_operand")
+   (match_operand:VF1 1 "register_operand")]
+  "TARGET_SSE2"
+{
+  if (<MODE>mode == V16SFmode)
+    emit_insn (gen_ufix_truncv16sfv16si2 (operands[0],
+					  operands[1]));
+  else
+    {
+      rtx tmp[3];
+      tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
+      tmp[1] = gen_reg_rtx (<sseintvecmode>mode);
+      emit_insn (gen_fix_trunc<mode><sseintvecmodelower>2 (tmp[1], tmp[0]));
+      emit_insn (gen_xor<sseintvecmodelower>3 (operands[0], tmp[1], tmp[2]));
+    }
+  DONE;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel double-precision floating point conversion operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "sse2_cvtpi2pd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
+	(float:V2DF (match_operand:V2SI 1 "nonimmediate_operand" "y,m")))]
+  "TARGET_SSE2"
+  "cvtpi2pd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "unit" "mmx,*")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse2_cvtpd2pi"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "xm")]
+		     UNSPEC_FIX_NOTRUNC))]
+  "TARGET_SSE2"
+  "cvtpd2pi\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "unit" "mmx")
+   (set_attr "bdver1_decode" "double")
+   (set_attr "btver2_decode" "direct")
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "DI")])
+
+(define_insn "sse2_cvttpd2pi"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE2"
+  "cvttpd2pi\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "unit" "mmx")
+   (set_attr "bdver1_decode" "double")
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse2_cvtsi2sd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x,x")
+	(vec_merge:V2DF
+	  (vec_duplicate:V2DF
+	    (float:DF (match_operand:SI 2 "nonimmediate_operand" "r,m,rm")))
+	  (match_operand:V2DF 1 "register_operand" "0,0,x")
+	  (const_int 1)))]
+  "TARGET_SSE2"
+  "@
+   cvtsi2sd\t{%2, %0|%0, %2}
+   cvtsi2sd\t{%2, %0|%0, %2}
+   vcvtsi2sd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,direct,*")
+   (set_attr "amdfam10_decode" "vector,double,*")
+   (set_attr "bdver1_decode" "double,direct,*")
+   (set_attr "btver2_decode" "double,double,double")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "mode" "DF")])
+
+(define_insn "sse2_cvtsi2sdq<round_name>"
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x,v")
+	(vec_merge:V2DF
+	  (vec_duplicate:V2DF
+	    (float:DF (match_operand:DI 2 "<round_nimm_predicate>" "r,m,<round_constraint3>")))
+	  (match_operand:V2DF 1 "register_operand" "0,0,v")
+	  (const_int 1)))]
+  "TARGET_SSE2 && TARGET_64BIT"
+  "@
+   cvtsi2sdq\t{%2, %0|%0, %2}
+   cvtsi2sdq\t{%2, %0|%0, %2}
+   vcvtsi2sdq\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,direct,*")
+   (set_attr "amdfam10_decode" "vector,double,*")
+   (set_attr "bdver1_decode" "double,direct,*")
+   (set_attr "length_vex" "*,*,4")
+   (set_attr "prefix_rex" "1,1,*")
+   (set_attr "prefix" "orig,orig,maybe_evex")
+   (set_attr "mode" "DF")])
+
+(define_insn "avx512f_vcvtss2usi<round_name>"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI
+	  [(vec_select:SF
+	     (match_operand:V4SF 1 "<round_nimm_predicate>" "<round_constraint>")
+	     (parallel [(const_int 0)]))]
+	  UNSPEC_UNSIGNED_FIX_NOTRUNC))]
+  "TARGET_AVX512F"
+  "vcvtss2usi\t{<round_op2>%1, %0|%0, %1<round_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "SI")])
+
+(define_insn "avx512f_vcvtss2usiq<round_name>"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec:DI
+	  [(vec_select:SF
+	     (match_operand:V4SF 1 "<round_nimm_predicate>" "<round_constraint>")
+	     (parallel [(const_int 0)]))]
+	  UNSPEC_UNSIGNED_FIX_NOTRUNC))]
+  "TARGET_AVX512F && TARGET_64BIT"
+  "vcvtss2usi\t{<round_op2>%1, %0|%0, %1<round_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "DI")])
+
+(define_insn "avx512f_vcvttss2usi<round_saeonly_name>"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unsigned_fix:SI
+	  (vec_select:SF
+	    (match_operand:V4SF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX512F"
+  "vcvttss2usi\t{<round_saeonly_op2>%1, %0|%0, %1<round_saeonly_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "SI")])
+
+(define_insn "avx512f_vcvttss2usiq<round_saeonly_name>"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unsigned_fix:DI
+	  (vec_select:SF
+	    (match_operand:V4SF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX512F && TARGET_64BIT"
+  "vcvttss2usi\t{<round_saeonly_op2>%1, %0|%0, %1<round_saeonly_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "DI")])
+
+(define_insn "avx512f_vcvtsd2usi<round_name>"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI
+	  [(vec_select:DF
+	     (match_operand:V2DF 1 "<round_nimm_predicate>" "<round_constraint>")
+	     (parallel [(const_int 0)]))]
+	  UNSPEC_UNSIGNED_FIX_NOTRUNC))]
+  "TARGET_AVX512F"
+  "vcvtsd2usi\t{<round_op2>%1, %0|%0, %1<round_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "SI")])
+
+(define_insn "avx512f_vcvtsd2usiq<round_name>"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec:DI
+	  [(vec_select:DF
+	     (match_operand:V2DF 1 "<round_nimm_predicate>" "<round_constraint>")
+	     (parallel [(const_int 0)]))]
+	  UNSPEC_UNSIGNED_FIX_NOTRUNC))]
+  "TARGET_AVX512F && TARGET_64BIT"
+  "vcvtsd2usi\t{<round_op2>%1, %0|%0, %1<round_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "DI")])
+
+(define_insn "avx512f_vcvttsd2usi<round_saeonly_name>"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unsigned_fix:SI
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX512F"
+  "vcvttsd2usi\t{<round_saeonly_op2>%1, %0|%0, %1<round_saeonly_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "SI")])
+
+(define_insn "avx512f_vcvttsd2usiq<round_saeonly_name>"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unsigned_fix:DI
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX512F && TARGET_64BIT"
+  "vcvttsd2usi\t{<round_saeonly_op2>%1, %0|%0, %1<round_saeonly_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "DI")])
+
+(define_insn "sse2_cvtsd2si<round_name>"
+  [(set (match_operand:SI 0 "register_operand" "=r,r")
+	(unspec:SI
+	  [(vec_select:DF
+	     (match_operand:V2DF 1 "<round_nimm_predicate>" "v,<round_constraint2>")
+	     (parallel [(const_int 0)]))]
+	  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_SSE2"
+  "%vcvtsd2si\t{<round_op2>%1, %0|%0, %q1<round_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "btver2_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
+(define_insn "sse2_cvtsd2si_2"
+  [(set (match_operand:SI 0 "register_operand" "=r,r")
+	(unspec:SI [(match_operand:DF 1 "nonimmediate_operand" "v,m")]
+		   UNSPEC_FIX_NOTRUNC))]
+  "TARGET_SSE2"
+  "%vcvtsd2si\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
+(define_insn "sse2_cvtsd2siq<round_name>"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(unspec:DI
+	  [(vec_select:DF
+	     (match_operand:V2DF 1 "<round_nimm_predicate>" "v,<round_constraint2>")
+	     (parallel [(const_int 0)]))]
+	  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_SSE2 && TARGET_64BIT"
+  "%vcvtsd2si{q}\t{<round_op2>%1, %0|%0, %q1<round_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DI")])
+
+(define_insn "sse2_cvtsd2siq_2"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(unspec:DI [(match_operand:DF 1 "nonimmediate_operand" "v,m")]
+		   UNSPEC_FIX_NOTRUNC))]
+  "TARGET_SSE2 && TARGET_64BIT"
+  "%vcvtsd2si{q}\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DI")])
+
+(define_insn "sse2_cvttsd2si<round_saeonly_name>"
+  [(set (match_operand:SI 0 "register_operand" "=r,r")
+	(fix:SI
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "<round_saeonly_nimm_predicate>" "v,<round_saeonly_constraint2>")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_SSE2"
+  "%vcvttsd2si\t{<round_saeonly_op2>%1, %0|%0, %q1<round_saeonly_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "btver2_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
+(define_insn "sse2_cvttsd2siq<round_saeonly_name>"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(fix:DI
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "<round_saeonly_nimm_predicate>" "v,<round_saeonly_constraint2>")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_SSE2 && TARGET_64BIT"
+  "%vcvttsd2si{q}\t{<round_saeonly_op2>%1, %0|%0, %q1<round_saeonly_op2>}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "bdver1_decode" "double,double")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DI")])
+
+;; For float<si2dfmode><mode>2 insn pattern
+(define_mode_attr si2dfmode
+  [(V8DF "V8SI") (V4DF "V4SI")])
+(define_mode_attr si2dfmodelower
+  [(V8DF "v8si") (V4DF "v4si")])
+
+(define_insn "float<si2dfmodelower><mode>2<mask_name>"
+  [(set (match_operand:VF2_512_256 0 "register_operand" "=v")
+	(float:VF2_512_256 (match_operand:<si2dfmode> 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX && <mask_mode512bit_condition>"
+  "vcvtdq2pd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "ufloatv8siv8df<mask_name>"
+  [(set (match_operand:V8DF 0 "register_operand" "=v")
+	(unsigned_float:V8DF
+	  (match_operand:V8SI 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512F"
+  "vcvtudq2pd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V8DF")])
+
+(define_insn "avx512f_cvtdq2pd512_2"
+  [(set (match_operand:V8DF 0 "register_operand" "=v")
+	(float:V8DF
+	  (vec_select:V8SI
+	    (match_operand:V16SI 1 "nonimmediate_operand" "vm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)
+		       (const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)]))))]
+  "TARGET_AVX"
+  "vcvtdq2pd\t{%t1, %0|%0, %t1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V8DF")])
+
+(define_insn "avx_cvtdq2pd256_2"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+	(float:V4DF
+	  (vec_select:V4SI
+	    (match_operand:V8SI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX"
+  "vcvtdq2pd\t{%x1, %0|%0, %x1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
+(define_insn "sse2_cvtdq2pd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+	(float:V2DF
+	  (vec_select:V2SI
+	    (match_operand:V4SI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE2"
+  "%vcvtdq2pd\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "ssememalign" "64")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "<mask_codefor>avx512f_cvtpd2dq512<mask_name><round_name>"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+	(unspec:V8SI
+	  [(match_operand:V8DF 1 "<round_nimm_predicate>" "<round_constraint>")]
+	  UNSPEC_FIX_NOTRUNC))]
+  "TARGET_AVX512F"
+  "vcvtpd2dq\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "OI")])
+
+(define_insn "avx_cvtpd2dq256"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(unspec:V4SI [(match_operand:V4DF 1 "nonimmediate_operand" "xm")]
+		     UNSPEC_FIX_NOTRUNC))]
+  "TARGET_AVX"
+  "vcvtpd2dq{y}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_expand "avx_cvtpd2dq256_2"
+  [(set (match_operand:V8SI 0 "register_operand")
+	(vec_concat:V8SI
+	  (unspec:V4SI [(match_operand:V4DF 1 "nonimmediate_operand")]
+		       UNSPEC_FIX_NOTRUNC)
+	  (match_dup 2)))]
+  "TARGET_AVX"
+  "operands[2] = CONST0_RTX (V4SImode);")
+
+(define_insn "*avx_cvtpd2dq256_2"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+	(vec_concat:V8SI
+	  (unspec:V4SI [(match_operand:V4DF 1 "nonimmediate_operand" "xm")]
+		       UNSPEC_FIX_NOTRUNC)
+	  (match_operand:V4SI 2 "const0_operand")))]
+  "TARGET_AVX"
+  "vcvtpd2dq{y}\t{%1, %x0|%x0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "OI")])
+
+(define_expand "sse2_cvtpd2dq"
+  [(set (match_operand:V4SI 0 "register_operand")
+	(vec_concat:V4SI
+	  (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand")]
+		       UNSPEC_FIX_NOTRUNC)
+	  (match_dup 2)))]
+  "TARGET_SSE2"
+  "operands[2] = CONST0_RTX (V2SImode);")
+
+(define_insn "*sse2_cvtpd2dq"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(vec_concat:V4SI
+	  (unspec:V2SI [(match_operand:V2DF 1 "nonimmediate_operand" "xm")]
+		       UNSPEC_FIX_NOTRUNC)
+	  (match_operand:V2SI 2 "const0_operand")))]
+  "TARGET_SSE2"
+{
+  if (TARGET_AVX)
+    return "vcvtpd2dq{x}\t{%1, %0|%0, %1}";
+  else
+    return "cvtpd2dq\t{%1, %0|%0, %1}";
+}
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "bdver1_decode" "double")])
+
+(define_insn "avx512f_ufix_notruncv8dfv8si<mask_name><round_name>"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+	(unspec:V8SI
+	  [(match_operand:V8DF 1 "<round_nimm_predicate>" "<round_constraint>")]
+	  UNSPEC_UNSIGNED_FIX_NOTRUNC))]
+  "TARGET_AVX512F"
+  "vcvtpd2udq\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "OI")])
+
+(define_insn "<fixsuffix>fix_truncv8dfv8si2<mask_name><round_saeonly_name>"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+	(any_fix:V8SI
+	  (match_operand:V8DF 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))]
+  "TARGET_AVX512F"
+  "vcvttpd2<fixsuffix>dq\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "OI")])
+
+(define_insn "fix_truncv4dfv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX"
+  "vcvttpd2dq{y}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_expand "avx_cvttpd2dq256_2"
+  [(set (match_operand:V8SI 0 "register_operand")
+	(vec_concat:V8SI
+	  (fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand"))
+	  (match_dup 2)))]
+  "TARGET_AVX"
+  "operands[2] = CONST0_RTX (V4SImode);")
+
+(define_insn "*avx_cvttpd2dq256_2"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+	(vec_concat:V8SI
+	  (fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "xm"))
+	  (match_operand:V4SI 2 "const0_operand")))]
+  "TARGET_AVX"
+  "vcvttpd2dq{y}\t{%1, %x0|%x0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "OI")])
+
+(define_expand "sse2_cvttpd2dq"
+  [(set (match_operand:V4SI 0 "register_operand")
+	(vec_concat:V4SI
+	  (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand"))
+	  (match_dup 2)))]
+  "TARGET_SSE2"
+  "operands[2] = CONST0_RTX (V2SImode);")
+
+(define_insn "*sse2_cvttpd2dq"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(vec_concat:V4SI
+	  (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "xm"))
+	  (match_operand:V2SI 2 "const0_operand")))]
+  "TARGET_SSE2"
+{
+  if (TARGET_AVX)
+    return "vcvttpd2dq{x}\t{%1, %0|%0, %1}";
+  else
+    return "cvttpd2dq\t{%1, %0|%0, %1}";
+}
+  [(set_attr "type" "ssecvt")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "bdver1_decode" "double")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse2_cvtsd2ss<round_name>"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x,v")
+	(vec_merge:V4SF
+	  (vec_duplicate:V4SF
+	    (float_truncate:V2SF
+	      (match_operand:V2DF 2 "nonimmediate_operand" "x,m,<round_constraint>")))
+	  (match_operand:V4SF 1 "register_operand" "0,0,v")
+	  (const_int 1)))]
+  "TARGET_SSE2"
+  "@
+   cvtsd2ss\t{%2, %0|%0, %2}
+   cvtsd2ss\t{%2, %0|%0, %q2}
+   vcvtsd2ss\t{<round_op3>%2, %1, %0|%0, %1, %q2<round_op3>}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssecvt")
+   (set_attr "athlon_decode" "vector,double,*")
+   (set_attr "amdfam10_decode" "vector,double,*")
+   (set_attr "bdver1_decode" "direct,direct,*")
+   (set_attr "btver2_decode" "double,double,double")
+   (set_attr "prefix" "orig,orig,<round_prefix>")
+   (set_attr "mode" "SF")])
+
+(define_insn "sse2_cvtss2sd<round_saeonly_name>"
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x,v")
+	(vec_merge:V2DF
+	  (float_extend:V2DF
+	    (vec_select:V2SF
+	      (match_operand:V4SF 2 "nonimmediate_operand" "x,m,<round_saeonly_constraint>")
+	      (parallel [(const_int 0) (const_int 1)])))
+	  (match_operand:V2DF 1 "register_operand" "0,0,v")
+	  (const_int 1)))]
+  "TARGET_SSE2"
+  "@
+   cvtss2sd\t{%2, %0|%0, %2}
+   cvtss2sd\t{%2, %0|%0, %k2}
+   vcvtss2sd\t{<round_saeonly_op3>%2, %1, %0|%0, %1, %k2<round_saeonly_op3>}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssecvt")
+   (set_attr "amdfam10_decode" "vector,double,*")
+   (set_attr "athlon_decode" "direct,direct,*")
+   (set_attr "bdver1_decode" "direct,direct,*")
+   (set_attr "btver2_decode" "double,double,double")
+   (set_attr "prefix" "orig,orig,<round_saeonly_prefix>")
+   (set_attr "mode" "DF")])
+
+(define_insn "<mask_codefor>avx512f_cvtpd2ps512<mask_name><round_name>"
+  [(set (match_operand:V8SF 0 "register_operand" "=v")
+	(float_truncate:V8SF
+	  (match_operand:V8DF 1 "<round_nimm_predicate>" "<round_constraint>")))]
+  "TARGET_AVX512F"
+  "vcvtpd2ps\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "avx_cvtpd2ps256"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(float_truncate:V4SF
+	  (match_operand:V4DF 1 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX"
+  "vcvtpd2ps{y}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "V4SF")])
+
+(define_expand "sse2_cvtpd2ps"
+  [(set (match_operand:V4SF 0 "register_operand")
+	(vec_concat:V4SF
+	  (float_truncate:V2SF
+	    (match_operand:V2DF 1 "nonimmediate_operand"))
+	  (match_dup 2)))]
+  "TARGET_SSE2"
+  "operands[2] = CONST0_RTX (V2SFmode);")
+
+(define_insn "*sse2_cvtpd2ps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(vec_concat:V4SF
+	  (float_truncate:V2SF
+	    (match_operand:V2DF 1 "nonimmediate_operand" "xm"))
+	  (match_operand:V2SF 2 "const0_operand")))]
+  "TARGET_SSE2"
+{
+  if (TARGET_AVX)
+    return "vcvtpd2ps{x}\t{%1, %0|%0, %1}";
+  else
+    return "cvtpd2ps\t{%1, %0|%0, %1}";
+}
+  [(set_attr "type" "ssecvt")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "bdver1_decode" "double")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "V4SF")])
+
+;; For <sse2_avx_avx512f>_cvtps2pd<avxsizesuffix> insn pattern
+(define_mode_attr sf2dfmode
+  [(V8DF "V8SF") (V4DF "V4SF")])
+
+(define_insn "<sse2_avx_avx512f>_cvtps2pd<avxsizesuffix><mask_name><round_saeonly_name>"
+  [(set (match_operand:VF2_512_256 0 "register_operand" "=v")
+	(float_extend:VF2_512_256
+	  (match_operand:<sf2dfmode> 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")))]
+  "TARGET_AVX && <mask_mode512bit_condition> && <round_saeonly_mode512bit_condition>"
+  "vcvtps2pd\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*avx_cvtps2pd256_2"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+	(float_extend:V4DF
+	  (vec_select:V4SF
+	    (match_operand:V8SF 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX"
+  "vcvtps2pd\t{%x1, %0|%0, %x1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
+(define_insn "vec_unpacks_lo_v16sf"
+  [(set (match_operand:V8DF 0 "register_operand" "=v")
+	(float_extend:V8DF
+	  (vec_select:V8SF
+	    (match_operand:V16SF 1 "nonimmediate_operand" "vm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)
+		       (const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)]))))]
+  "TARGET_AVX512F"
+  "vcvtps2pd\t{%t1, %0|%0, %t1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V8DF")])
+
+(define_insn "sse2_cvtps2pd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+	(float_extend:V2DF
+	  (vec_select:V2SF
+	    (match_operand:V4SF 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE2"
+  "%vcvtps2pd\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "athlon_decode" "double")
+   (set_attr "bdver1_decode" "double")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "V2DF")])
+
+(define_expand "vec_unpacks_hi_v4sf"
+  [(set (match_dup 2)
+   (vec_select:V4SF
+     (vec_concat:V8SF
+       (match_dup 2)
+       (match_operand:V4SF 1 "nonimmediate_operand"))
+     (parallel [(const_int 6) (const_int 7)
+		(const_int 2) (const_int 3)])))
+  (set (match_operand:V2DF 0 "register_operand")
+   (float_extend:V2DF
+     (vec_select:V2SF
+       (match_dup 2)
+       (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE2"
+  "operands[2] = gen_reg_rtx (V4SFmode);")
+
+(define_expand "vec_unpacks_hi_v8sf"
+  [(set (match_dup 2)
+	(vec_select:V4SF
+	  (match_operand:V8SF 1 "nonimmediate_operand")
+	  (parallel [(const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand")
+	(float_extend:V4DF
+	  (match_dup 2)))]
+  "TARGET_AVX"
+  "operands[2] = gen_reg_rtx (V4SFmode);")
+
+(define_expand "vec_unpacks_hi_v16sf"
+  [(set (match_dup 2)
+	(vec_select:V8SF
+	  (match_operand:V16SF 1 "nonimmediate_operand")
+	  (parallel [(const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))
+   (set (match_operand:V8DF 0 "register_operand")
+	(float_extend:V8DF
+	  (match_dup 2)))]
+"TARGET_AVX512F"
+"operands[2] = gen_reg_rtx (V8SFmode);")
+
+(define_expand "vec_unpacks_lo_v4sf"
+  [(set (match_operand:V2DF 0 "register_operand")
+	(float_extend:V2DF
+	  (vec_select:V2SF
+	    (match_operand:V4SF 1 "nonimmediate_operand")
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE2")
+
+(define_expand "vec_unpacks_lo_v8sf"
+  [(set (match_operand:V4DF 0 "register_operand")
+	(float_extend:V4DF
+	  (vec_select:V4SF
+	    (match_operand:V8SF 1 "nonimmediate_operand")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX")
+
+(define_mode_attr sseunpackfltmode
+  [(V8HI "V4SF") (V4SI "V2DF") (V16HI "V8SF")
+  (V8SI "V4DF") (V32HI "V16SF") (V16SI "V8DF")])
+
+(define_expand "vec_unpacks_float_hi_<mode>"
+  [(match_operand:<sseunpackfltmode> 0 "register_operand")
+   (match_operand:VI2_AVX512F 1 "register_operand")]
+  "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (<sseunpackmode>mode);
+
+  emit_insn (gen_vec_unpacks_hi_<mode> (tmp, operands[1]));
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_FLOAT (<sseunpackfltmode>mode, tmp)));
+  DONE;
+})
+
+(define_expand "vec_unpacks_float_lo_<mode>"
+  [(match_operand:<sseunpackfltmode> 0 "register_operand")
+   (match_operand:VI2_AVX512F 1 "register_operand")]
+  "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (<sseunpackmode>mode);
+
+  emit_insn (gen_vec_unpacks_lo_<mode> (tmp, operands[1]));
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_FLOAT (<sseunpackfltmode>mode, tmp)));
+  DONE;
+})
+
+(define_expand "vec_unpacku_float_hi_<mode>"
+  [(match_operand:<sseunpackfltmode> 0 "register_operand")
+   (match_operand:VI2_AVX512F 1 "register_operand")]
+  "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (<sseunpackmode>mode);
+
+  emit_insn (gen_vec_unpacku_hi_<mode> (tmp, operands[1]));
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_FLOAT (<sseunpackfltmode>mode, tmp)));
+  DONE;
+})
+
+(define_expand "vec_unpacku_float_lo_<mode>"
+  [(match_operand:<sseunpackfltmode> 0 "register_operand")
+   (match_operand:VI2_AVX512F 1 "register_operand")]
+  "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (<sseunpackmode>mode);
+
+  emit_insn (gen_vec_unpacku_lo_<mode> (tmp, operands[1]));
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_FLOAT (<sseunpackfltmode>mode, tmp)));
+  DONE;
+})
+
+(define_expand "vec_unpacks_float_hi_v4si"
+  [(set (match_dup 2)
+	(vec_select:V4SI
+	  (match_operand:V4SI 1 "nonimmediate_operand")
+	  (parallel [(const_int 2) (const_int 3)
+		     (const_int 2) (const_int 3)])))
+   (set (match_operand:V2DF 0 "register_operand")
+	(float:V2DF
+	  (vec_select:V2SI
+	  (match_dup 2)
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE2"
+  "operands[2] = gen_reg_rtx (V4SImode);")
+
+(define_expand "vec_unpacks_float_lo_v4si"
+  [(set (match_operand:V2DF 0 "register_operand")
+	(float:V2DF
+	  (vec_select:V2SI
+	    (match_operand:V4SI 1 "nonimmediate_operand")
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE2")
+
+(define_expand "vec_unpacks_float_hi_v8si"
+  [(set (match_dup 2)
+	(vec_select:V4SI
+	  (match_operand:V8SI 1 "nonimmediate_operand")
+	  (parallel [(const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand")
+	(float:V4DF
+	  (match_dup 2)))]
+  "TARGET_AVX"
+  "operands[2] = gen_reg_rtx (V4SImode);")
+
+(define_expand "vec_unpacks_float_lo_v8si"
+  [(set (match_operand:V4DF 0 "register_operand")
+	(float:V4DF
+	  (vec_select:V4SI
+	    (match_operand:V8SI 1 "nonimmediate_operand")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX")
+
+(define_expand "vec_unpacks_float_hi_v16si"
+  [(set (match_dup 2)
+	(vec_select:V8SI
+	  (match_operand:V16SI 1 "nonimmediate_operand")
+	  (parallel [(const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))
+   (set (match_operand:V8DF 0 "register_operand")
+	(float:V8DF
+	  (match_dup 2)))]
+  "TARGET_AVX512F"
+  "operands[2] = gen_reg_rtx (V8SImode);")
+
+(define_expand "vec_unpacks_float_lo_v16si"
+  [(set (match_operand:V8DF 0 "register_operand")
+	(float:V8DF
+	  (vec_select:V8SI
+	    (match_operand:V16SI 1 "nonimmediate_operand")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)
+		       (const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)]))))]
+  "TARGET_AVX512F")
+
+(define_expand "vec_unpacku_float_hi_v4si"
+  [(set (match_dup 5)
+	(vec_select:V4SI
+	  (match_operand:V4SI 1 "nonimmediate_operand")
+	  (parallel [(const_int 2) (const_int 3)
+		     (const_int 2) (const_int 3)])))
+   (set (match_dup 6)
+	(float:V2DF
+	  (vec_select:V2SI
+	  (match_dup 5)
+	    (parallel [(const_int 0) (const_int 1)]))))
+   (set (match_dup 7)
+	(lt:V2DF (match_dup 6) (match_dup 3)))
+   (set (match_dup 8)
+	(and:V2DF (match_dup 7) (match_dup 4)))
+   (set (match_operand:V2DF 0 "register_operand")
+	(plus:V2DF (match_dup 6) (match_dup 8)))]
+  "TARGET_SSE2"
+{
+  REAL_VALUE_TYPE TWO32r;
+  rtx x;
+  int i;
+
+  real_ldexp (&TWO32r, &dconst1, 32);
+  x = const_double_from_real_value (TWO32r, DFmode);
+
+  operands[3] = force_reg (V2DFmode, CONST0_RTX (V2DFmode));
+  operands[4] = force_reg (V2DFmode,
+			   ix86_build_const_vector (V2DFmode, 1, x));
+
+  operands[5] = gen_reg_rtx (V4SImode);
+
+  for (i = 6; i < 9; i++)
+    operands[i] = gen_reg_rtx (V2DFmode);
+})
+
+(define_expand "vec_unpacku_float_lo_v4si"
+  [(set (match_dup 5)
+	(float:V2DF
+	  (vec_select:V2SI
+	    (match_operand:V4SI 1 "nonimmediate_operand")
+	    (parallel [(const_int 0) (const_int 1)]))))
+   (set (match_dup 6)
+	(lt:V2DF (match_dup 5) (match_dup 3)))
+   (set (match_dup 7)
+	(and:V2DF (match_dup 6) (match_dup 4)))
+   (set (match_operand:V2DF 0 "register_operand")
+	(plus:V2DF (match_dup 5) (match_dup 7)))]
+  "TARGET_SSE2"
+{
+  REAL_VALUE_TYPE TWO32r;
+  rtx x;
+  int i;
+
+  real_ldexp (&TWO32r, &dconst1, 32);
+  x = const_double_from_real_value (TWO32r, DFmode);
+
+  operands[3] = force_reg (V2DFmode, CONST0_RTX (V2DFmode));
+  operands[4] = force_reg (V2DFmode,
+			   ix86_build_const_vector (V2DFmode, 1, x));
+
+  for (i = 5; i < 8; i++)
+    operands[i] = gen_reg_rtx (V2DFmode);
+})
+
+(define_expand "vec_unpacku_float_hi_v8si"
+  [(match_operand:V4DF 0 "register_operand")
+   (match_operand:V8SI 1 "register_operand")]
+  "TARGET_AVX"
+{
+  REAL_VALUE_TYPE TWO32r;
+  rtx x, tmp[6];
+  int i;
+
+  real_ldexp (&TWO32r, &dconst1, 32);
+  x = const_double_from_real_value (TWO32r, DFmode);
+
+  tmp[0] = force_reg (V4DFmode, CONST0_RTX (V4DFmode));
+  tmp[1] = force_reg (V4DFmode, ix86_build_const_vector (V4DFmode, 1, x));
+  tmp[5] = gen_reg_rtx (V4SImode);
+
+  for (i = 2; i < 5; i++)
+    tmp[i] = gen_reg_rtx (V4DFmode);
+  emit_insn (gen_vec_extract_hi_v8si (tmp[5], operands[1]));
+  emit_insn (gen_floatv4siv4df2 (tmp[2], tmp[5]));
+  emit_insn (gen_rtx_SET (VOIDmode, tmp[3],
+			  gen_rtx_LT (V4DFmode, tmp[2], tmp[0])));
+  emit_insn (gen_andv4df3 (tmp[4], tmp[3], tmp[1]));
+  emit_insn (gen_addv4df3 (operands[0], tmp[2], tmp[4]));
+  DONE;
+})
+
+(define_expand "vec_unpacku_float_hi_v16si"
+  [(match_operand:V8DF 0 "register_operand")
+   (match_operand:V16SI 1 "register_operand")]
+  "TARGET_AVX512F"
+{
+  REAL_VALUE_TYPE TWO32r;
+  rtx k, x, tmp[4];
+
+  real_ldexp (&TWO32r, &dconst1, 32);
+  x = const_double_from_real_value (TWO32r, DFmode);
+
+  tmp[0] = force_reg (V8DFmode, CONST0_RTX (V8DFmode));
+  tmp[1] = force_reg (V8DFmode, ix86_build_const_vector (V8DFmode, 1, x));
+  tmp[2] = gen_reg_rtx (V8DFmode);
+  tmp[3] = gen_reg_rtx (V8SImode);
+  k = gen_reg_rtx (QImode);
+
+  emit_insn (gen_vec_extract_hi_v16si (tmp[3], operands[1]));
+  emit_insn (gen_floatv8siv8df2 (tmp[2], tmp[3]));
+  emit_insn (gen_rtx_SET (VOIDmode, k,
+			  gen_rtx_LT (QImode, tmp[2], tmp[0])));
+  emit_insn (gen_addv8df3_mask (tmp[2], tmp[2], tmp[1], tmp[2], k));
+  emit_move_insn (operands[0], tmp[2]);
+  DONE;
+})
+
+(define_expand "vec_unpacku_float_lo_v8si"
+  [(match_operand:V4DF 0 "register_operand")
+   (match_operand:V8SI 1 "nonimmediate_operand")]
+  "TARGET_AVX"
+{
+  REAL_VALUE_TYPE TWO32r;
+  rtx x, tmp[5];
+  int i;
+
+  real_ldexp (&TWO32r, &dconst1, 32);
+  x = const_double_from_real_value (TWO32r, DFmode);
+
+  tmp[0] = force_reg (V4DFmode, CONST0_RTX (V4DFmode));
+  tmp[1] = force_reg (V4DFmode, ix86_build_const_vector (V4DFmode, 1, x));
+
+  for (i = 2; i < 5; i++)
+    tmp[i] = gen_reg_rtx (V4DFmode);
+  emit_insn (gen_avx_cvtdq2pd256_2 (tmp[2], operands[1]));
+  emit_insn (gen_rtx_SET (VOIDmode, tmp[3],
+			  gen_rtx_LT (V4DFmode, tmp[2], tmp[0])));
+  emit_insn (gen_andv4df3 (tmp[4], tmp[3], tmp[1]));
+  emit_insn (gen_addv4df3 (operands[0], tmp[2], tmp[4]));
+  DONE;
+})
+
+(define_expand "vec_unpacku_float_lo_v16si"
+  [(match_operand:V8DF 0 "register_operand")
+   (match_operand:V16SI 1 "nonimmediate_operand")]
+  "TARGET_AVX512F"
+{
+  REAL_VALUE_TYPE TWO32r;
+  rtx k, x, tmp[3];
+
+  real_ldexp (&TWO32r, &dconst1, 32);
+  x = const_double_from_real_value (TWO32r, DFmode);
+
+  tmp[0] = force_reg (V8DFmode, CONST0_RTX (V8DFmode));
+  tmp[1] = force_reg (V8DFmode, ix86_build_const_vector (V8DFmode, 1, x));
+  tmp[2] = gen_reg_rtx (V8DFmode);
+  k = gen_reg_rtx (QImode);
+
+  emit_insn (gen_avx512f_cvtdq2pd512_2 (tmp[2], operands[1]));
+  emit_insn (gen_rtx_SET (VOIDmode, k,
+			  gen_rtx_LT (QImode, tmp[2], tmp[0])));
+  emit_insn (gen_addv8df3_mask (tmp[2], tmp[2], tmp[1], tmp[2], k));
+  emit_move_insn (operands[0], tmp[2]);
+  DONE;
+})
+
+(define_expand "vec_pack_trunc_<mode>"
+  [(set (match_dup 3)
+	(float_truncate:<sf2dfmode>
+	  (match_operand:VF2_512_256 1 "nonimmediate_operand")))
+   (set (match_dup 4)
+	(float_truncate:<sf2dfmode>
+	  (match_operand:VF2_512_256 2 "nonimmediate_operand")))
+   (set (match_operand:<ssePSmode> 0 "register_operand")
+	(vec_concat:<ssePSmode>
+	  (match_dup 3)
+	  (match_dup 4)))]
+  "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (<sf2dfmode>mode);
+  operands[4] = gen_reg_rtx (<sf2dfmode>mode);
+})
+
+(define_expand "vec_pack_trunc_v2df"
+  [(match_operand:V4SF 0 "register_operand")
+   (match_operand:V2DF 1 "nonimmediate_operand")
+   (match_operand:V2DF 2 "nonimmediate_operand")]
+  "TARGET_SSE2"
+{
+  rtx tmp0, tmp1;
+
+  if (TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
+
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_avx_cvtpd2ps256 (operands[0], tmp0));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (V4SFmode);
+      tmp1 = gen_reg_rtx (V4SFmode);
+
+      emit_insn (gen_sse2_cvtpd2ps (tmp0, operands[1]));
+      emit_insn (gen_sse2_cvtpd2ps (tmp1, operands[2]));
+      emit_insn (gen_sse_movlhps (operands[0], tmp0, tmp1));
+    }
+  DONE;
+})
+
+(define_expand "vec_pack_sfix_trunc_v8df"
+  [(match_operand:V16SI 0 "register_operand")
+   (match_operand:V8DF 1 "nonimmediate_operand")
+   (match_operand:V8DF 2 "nonimmediate_operand")]
+  "TARGET_AVX512F"
+{
+  rtx r1, r2;
+
+  r1 = gen_reg_rtx (V8SImode);
+  r2 = gen_reg_rtx (V8SImode);
+
+  emit_insn (gen_fix_truncv8dfv8si2 (r1, operands[1]));
+  emit_insn (gen_fix_truncv8dfv8si2 (r2, operands[2]));
+  emit_insn (gen_avx_vec_concatv16si (operands[0], r1, r2));
+  DONE;
+})
+
+(define_expand "vec_pack_sfix_trunc_v4df"
+  [(match_operand:V8SI 0 "register_operand")
+   (match_operand:V4DF 1 "nonimmediate_operand")
+   (match_operand:V4DF 2 "nonimmediate_operand")]
+  "TARGET_AVX"
+{
+  rtx r1, r2;
+
+  r1 = gen_reg_rtx (V4SImode);
+  r2 = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_fix_truncv4dfv4si2 (r1, operands[1]));
+  emit_insn (gen_fix_truncv4dfv4si2 (r2, operands[2]));
+  emit_insn (gen_avx_vec_concatv8si (operands[0], r1, r2));
+  DONE;
+})
+
+(define_expand "vec_pack_sfix_trunc_v2df"
+  [(match_operand:V4SI 0 "register_operand")
+   (match_operand:V2DF 1 "nonimmediate_operand")
+   (match_operand:V2DF 2 "nonimmediate_operand")]
+  "TARGET_SSE2"
+{
+  rtx tmp0, tmp1, tmp2;
+
+  if (TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
+
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp0));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (V4SImode);
+      tmp1 = gen_reg_rtx (V4SImode);
+      tmp2 = gen_reg_rtx (V2DImode);
+
+      emit_insn (gen_sse2_cvttpd2dq (tmp0, operands[1]));
+      emit_insn (gen_sse2_cvttpd2dq (tmp1, operands[2]));
+      emit_insn (gen_vec_interleave_lowv2di (tmp2,
+					     gen_lowpart (V2DImode, tmp0),
+					     gen_lowpart (V2DImode, tmp1)));
+      emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp2));
+    }
+  DONE;
+})
+
+(define_mode_attr ssepackfltmode
+  [(V8DF "V16SI") (V4DF "V8SI") (V2DF "V4SI")])
+
+(define_expand "vec_pack_ufix_trunc_<mode>"
+  [(match_operand:<ssepackfltmode> 0 "register_operand")
+   (match_operand:VF2 1 "register_operand")
+   (match_operand:VF2 2 "register_operand")]
+  "TARGET_SSE2"
+{
+  if (<MODE>mode == V8DFmode)
+    {
+      rtx r1, r2;
+
+      r1 = gen_reg_rtx (V8SImode);
+      r2 = gen_reg_rtx (V8SImode);
+
+      emit_insn (gen_ufix_truncv8dfv8si2 (r1, operands[1]));
+      emit_insn (gen_ufix_truncv8dfv8si2 (r2, operands[2]));
+      emit_insn (gen_avx_vec_concatv16si (operands[0], r1, r2));
+    }
+  else
+    {
+      rtx tmp[7];
+      tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]);
+      tmp[1] = ix86_expand_adjust_ufix_to_sfix_si (operands[2], &tmp[3]);
+      tmp[4] = gen_reg_rtx (<ssepackfltmode>mode);
+      emit_insn (gen_vec_pack_sfix_trunc_<mode> (tmp[4], tmp[0], tmp[1]));
+      if (<ssepackfltmode>mode == V4SImode || TARGET_AVX2)
+	{
+	  tmp[5] = gen_reg_rtx (<ssepackfltmode>mode);
+	  ix86_expand_vec_extract_even_odd (tmp[5], tmp[2], tmp[3], 0);
+	}
+      else
+	{
+	  tmp[5] = gen_reg_rtx (V8SFmode);
+	  ix86_expand_vec_extract_even_odd (tmp[5], gen_lowpart (V8SFmode, tmp[2]),
+					    gen_lowpart (V8SFmode, tmp[3]), 0);
+	  tmp[5] = gen_lowpart (V8SImode, tmp[5]);
+	}
+      tmp[6] = expand_simple_binop (<ssepackfltmode>mode, XOR, tmp[4], tmp[5],
+				    operands[0], 0, OPTAB_DIRECT);
+      if (tmp[6] != operands[0])
+	emit_move_insn (operands[0], tmp[6]);
+    }
+
+  DONE;
+})
+
+(define_expand "vec_pack_sfix_v4df"
+  [(match_operand:V8SI 0 "register_operand")
+   (match_operand:V4DF 1 "nonimmediate_operand")
+   (match_operand:V4DF 2 "nonimmediate_operand")]
+  "TARGET_AVX"
+{
+  rtx r1, r2;
+
+  r1 = gen_reg_rtx (V4SImode);
+  r2 = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_avx_cvtpd2dq256 (r1, operands[1]));
+  emit_insn (gen_avx_cvtpd2dq256 (r2, operands[2]));
+  emit_insn (gen_avx_vec_concatv8si (operands[0], r1, r2));
+  DONE;
+})
+
+(define_expand "vec_pack_sfix_v2df"
+  [(match_operand:V4SI 0 "register_operand")
+   (match_operand:V2DF 1 "nonimmediate_operand")
+   (match_operand:V2DF 2 "nonimmediate_operand")]
+  "TARGET_SSE2"
+{
+  rtx tmp0, tmp1, tmp2;
+
+  if (TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
+
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_avx_cvtpd2dq256 (operands[0], tmp0));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (V4SImode);
+      tmp1 = gen_reg_rtx (V4SImode);
+      tmp2 = gen_reg_rtx (V2DImode);
+
+      emit_insn (gen_sse2_cvtpd2dq (tmp0, operands[1]));
+      emit_insn (gen_sse2_cvtpd2dq (tmp1, operands[2]));
+      emit_insn (gen_vec_interleave_lowv2di (tmp2,
+					     gen_lowpart (V2DImode, tmp0),
+					     gen_lowpart (V2DImode, tmp1)));
+      emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp2));
+    }
+  DONE;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel single-precision floating point element swizzling
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "sse_movhlps_exp"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand")
+	(vec_select:V4SF
+	  (vec_concat:V8SF
+	    (match_operand:V4SF 1 "nonimmediate_operand")
+	    (match_operand:V4SF 2 "nonimmediate_operand"))
+	  (parallel [(const_int 6)
+		     (const_int 7)
+		     (const_int 2)
+		     (const_int 3)])))]
+  "TARGET_SSE"
+{
+  rtx dst = ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands);
+
+  emit_insn (gen_sse_movhlps (dst, operands[1], operands[2]));
+
+  /* Fix up the destination if needed.  */
+  if (dst != operands[0])
+    emit_move_insn (operands[0], dst);
+
+  DONE;
+})
+
+(define_insn "sse_movhlps"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand"     "=x,x,x,x,m")
+	(vec_select:V4SF
+	  (vec_concat:V8SF
+	    (match_operand:V4SF 1 "nonimmediate_operand" " 0,x,0,x,0")
+	    (match_operand:V4SF 2 "nonimmediate_operand" " x,x,o,o,x"))
+	  (parallel [(const_int 6)
+		     (const_int 7)
+		     (const_int 2)
+		     (const_int 3)])))]
+  "TARGET_SSE && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   movhlps\t{%2, %0|%0, %2}
+   vmovhlps\t{%2, %1, %0|%0, %1, %2}
+   movlps\t{%H2, %0|%0, %H2}
+   vmovlps\t{%H2, %1, %0|%0, %1, %H2}
+   %vmovhps\t{%2, %0|%q0, %2}"
+  [(set_attr "isa" "noavx,avx,noavx,avx,*")
+   (set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
+   (set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
+
+(define_expand "sse_movlhps_exp"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand")
+	(vec_select:V4SF
+	  (vec_concat:V8SF
+	    (match_operand:V4SF 1 "nonimmediate_operand")
+	    (match_operand:V4SF 2 "nonimmediate_operand"))
+	  (parallel [(const_int 0)
+		     (const_int 1)
+		     (const_int 4)
+		     (const_int 5)])))]
+  "TARGET_SSE"
+{
+  rtx dst = ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands);
+
+  emit_insn (gen_sse_movlhps (dst, operands[1], operands[2]));
+
+  /* Fix up the destination if needed.  */
+  if (dst != operands[0])
+    emit_move_insn (operands[0], dst);
+
+  DONE;
+})
+
+(define_insn "sse_movlhps"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand"     "=x,x,x,x,o")
+	(vec_select:V4SF
+	  (vec_concat:V8SF
+	    (match_operand:V4SF 1 "nonimmediate_operand" " 0,x,0,x,0")
+	    (match_operand:V4SF 2 "nonimmediate_operand" " x,x,m,m,x"))
+	  (parallel [(const_int 0)
+		     (const_int 1)
+		     (const_int 4)
+		     (const_int 5)])))]
+  "TARGET_SSE && ix86_binary_operator_ok (UNKNOWN, V4SFmode, operands)"
+  "@
+   movlhps\t{%2, %0|%0, %2}
+   vmovlhps\t{%2, %1, %0|%0, %1, %2}
+   movhps\t{%2, %0|%0, %q2}
+   vmovhps\t{%2, %1, %0|%0, %1, %q2}
+   %vmovlps\t{%2, %H0|%H0, %2}"
+  [(set_attr "isa" "noavx,avx,noavx,avx,*")
+   (set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
+   (set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
+
+(define_insn "<mask_codefor>avx512f_unpckhps512<mask_name>"
+  [(set (match_operand:V16SF 0 "register_operand" "=v")
+	(vec_select:V16SF
+	  (vec_concat:V32SF
+	    (match_operand:V16SF 1 "register_operand" "v")
+	    (match_operand:V16SF 2 "nonimmediate_operand" "vm"))
+	  (parallel [(const_int 2) (const_int 18)
+		     (const_int 3) (const_int 19)
+		     (const_int 6) (const_int 22)
+		     (const_int 7) (const_int 23)
+		     (const_int 10) (const_int 26)
+		     (const_int 11) (const_int 27)
+		     (const_int 14) (const_int 30)
+		     (const_int 15) (const_int 31)])))]
+  "TARGET_AVX512F"
+  "vunpckhps\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V16SF")])
+
+;; Recall that the 256-bit unpck insns only shuffle within their lanes.
+(define_insn "avx_unpckhps256"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_operand:V8SF 1 "register_operand" "x")
+	    (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 2) (const_int 10)
+		     (const_int 3) (const_int 11)
+		     (const_int 6) (const_int 14)
+		     (const_int 7) (const_int 15)])))]
+  "TARGET_AVX"
+  "vunpckhps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
+(define_expand "vec_interleave_highv8sf"
+  [(set (match_dup 3)
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_operand:V8SF 1 "register_operand" "x")
+	    (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 1) (const_int 9)
+		     (const_int 4) (const_int 12)
+		     (const_int 5) (const_int 13)])))
+   (set (match_dup 4)
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_dup 1)
+	    (match_dup 2))
+	  (parallel [(const_int 2) (const_int 10)
+		     (const_int 3) (const_int 11)
+		     (const_int 6) (const_int 14)
+		     (const_int 7) (const_int 15)])))
+   (set (match_operand:V8SF 0 "register_operand")
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_dup 3)
+	    (match_dup 4))
+	  (parallel [(const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+ "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V8SFmode);
+  operands[4] = gen_reg_rtx (V8SFmode);
+})
+
+(define_insn "vec_interleave_highv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(vec_select:V4SF
+	  (vec_concat:V8SF
+	    (match_operand:V4SF 1 "register_operand" "0,x")
+	    (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(const_int 2) (const_int 6)
+		     (const_int 3) (const_int 7)])))]
+  "TARGET_SSE"
+  "@
+   unpckhps\t{%2, %0|%0, %2}
+   vunpckhps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "<mask_codefor>avx512f_unpcklps512<mask_name>"
+  [(set (match_operand:V16SF 0 "register_operand" "=v")
+	(vec_select:V16SF
+	  (vec_concat:V32SF
+	    (match_operand:V16SF 1 "register_operand" "v")
+	    (match_operand:V16SF 2 "nonimmediate_operand" "vm"))
+	  (parallel [(const_int 0) (const_int 16)
+		     (const_int 1) (const_int 17)
+		     (const_int 4) (const_int 20)
+		     (const_int 5) (const_int 21)
+		     (const_int 8) (const_int 24)
+		     (const_int 9) (const_int 25)
+		     (const_int 12) (const_int 28)
+		     (const_int 13) (const_int 29)])))]
+  "TARGET_AVX512F"
+  "vunpcklps\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V16SF")])
+
+;; Recall that the 256-bit unpck insns only shuffle within their lanes.
+(define_insn "avx_unpcklps256"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_operand:V8SF 1 "register_operand" "x")
+	    (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 1) (const_int 9)
+		     (const_int 4) (const_int 12)
+		     (const_int 5) (const_int 13)])))]
+  "TARGET_AVX"
+  "vunpcklps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
+(define_expand "vec_interleave_lowv8sf"
+  [(set (match_dup 3)
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_operand:V8SF 1 "register_operand" "x")
+	    (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 1) (const_int 9)
+		     (const_int 4) (const_int 12)
+		     (const_int 5) (const_int 13)])))
+   (set (match_dup 4)
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_dup 1)
+	    (match_dup 2))
+	  (parallel [(const_int 2) (const_int 10)
+		     (const_int 3) (const_int 11)
+		     (const_int 6) (const_int 14)
+		     (const_int 7) (const_int 15)])))
+   (set (match_operand:V8SF 0 "register_operand")
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_dup 3)
+	    (match_dup 4))
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)])))]
+ "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V8SFmode);
+  operands[4] = gen_reg_rtx (V8SFmode);
+})
+
+(define_insn "vec_interleave_lowv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(vec_select:V4SF
+	  (vec_concat:V8SF
+	    (match_operand:V4SF 1 "register_operand" "0,x")
+	    (match_operand:V4SF 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 1) (const_int 5)])))]
+  "TARGET_SSE"
+  "@
+   unpcklps\t{%2, %0|%0, %2}
+   vunpcklps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V4SF")])
+
+;; These are modeled with the same vec_concat as the others so that we
+;; capture users of shufps that can use the new instructions
+(define_insn "avx_movshdup256"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_operand:V8SF 1 "nonimmediate_operand" "xm")
+	    (match_dup 1))
+	  (parallel [(const_int 1) (const_int 1)
+		     (const_int 3) (const_int 3)
+		     (const_int 5) (const_int 5)
+		     (const_int 7) (const_int 7)])))]
+  "TARGET_AVX"
+  "vmovshdup\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "sse3_movshdup"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(vec_select:V4SF
+	  (vec_concat:V8SF
+	    (match_operand:V4SF 1 "nonimmediate_operand" "xm")
+	    (match_dup 1))
+	  (parallel [(const_int 1)
+		     (const_int 1)
+		     (const_int 7)
+		     (const_int 7)])))]
+  "TARGET_SSE3"
+  "%vmovshdup\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "<mask_codefor>avx512f_movshdup512<mask_name>"
+  [(set (match_operand:V16SF 0 "register_operand" "=v")
+	(vec_select:V16SF
+	  (vec_concat:V32SF
+	    (match_operand:V16SF 1 "nonimmediate_operand" "vm")
+	    (match_dup 1))
+	  (parallel [(const_int 1) (const_int 1)
+		     (const_int 3) (const_int 3)
+		     (const_int 5) (const_int 5)
+		     (const_int 7) (const_int 7)
+		     (const_int 9) (const_int 9)
+		     (const_int 11) (const_int 11)
+		     (const_int 13) (const_int 13)
+		     (const_int 15) (const_int 15)])))]
+  "TARGET_AVX512F"
+  "vmovshdup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V16SF")])
+
+(define_insn "avx_movsldup256"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_operand:V8SF 1 "nonimmediate_operand" "xm")
+	    (match_dup 1))
+	  (parallel [(const_int 0) (const_int 0)
+		     (const_int 2) (const_int 2)
+		     (const_int 4) (const_int 4)
+		     (const_int 6) (const_int 6)])))]
+  "TARGET_AVX"
+  "vmovsldup\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "sse3_movsldup"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(vec_select:V4SF
+	  (vec_concat:V8SF
+	    (match_operand:V4SF 1 "nonimmediate_operand" "xm")
+	    (match_dup 1))
+	  (parallel [(const_int 0)
+		     (const_int 0)
+		     (const_int 6)
+		     (const_int 6)])))]
+  "TARGET_SSE3"
+  "%vmovsldup\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "<mask_codefor>avx512f_movsldup512<mask_name>"
+  [(set (match_operand:V16SF 0 "register_operand" "=v")
+	(vec_select:V16SF
+	  (vec_concat:V32SF
+	    (match_operand:V16SF 1 "nonimmediate_operand" "vm")
+	    (match_dup 1))
+	  (parallel [(const_int 0) (const_int 0)
+		     (const_int 2) (const_int 2)
+		     (const_int 4) (const_int 4)
+		     (const_int 6) (const_int 6)
+		     (const_int 8) (const_int 8)
+		     (const_int 10) (const_int 10)
+		     (const_int 12) (const_int 12)
+		     (const_int 14) (const_int 14)])))]
+  "TARGET_AVX512F"
+  "vmovsldup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V16SF")])
+
+(define_expand "avx_shufps256"
+  [(match_operand:V8SF 0 "register_operand")
+   (match_operand:V8SF 1 "register_operand")
+   (match_operand:V8SF 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_AVX"
+{
+  int mask = INTVAL (operands[3]);
+  emit_insn (gen_avx_shufps256_1 (operands[0], operands[1], operands[2],
+				  GEN_INT ((mask >> 0) & 3),
+				  GEN_INT ((mask >> 2) & 3),
+				  GEN_INT (((mask >> 4) & 3) + 8),
+				  GEN_INT (((mask >> 6) & 3) + 8),
+				  GEN_INT (((mask >> 0) & 3) + 4),
+				  GEN_INT (((mask >> 2) & 3) + 4),
+				  GEN_INT (((mask >> 4) & 3) + 12),
+				  GEN_INT (((mask >> 6) & 3) + 12)));
+  DONE;
+})
+
+;; One bit in mask selects 2 elements.
+(define_insn "avx_shufps256_1"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_select:V8SF
+	  (vec_concat:V16SF
+	    (match_operand:V8SF 1 "register_operand" "x")
+	    (match_operand:V8SF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(match_operand 3  "const_0_to_3_operand"  )
+		     (match_operand 4  "const_0_to_3_operand"  )
+		     (match_operand 5  "const_8_to_11_operand" )
+		     (match_operand 6  "const_8_to_11_operand" )
+		     (match_operand 7  "const_4_to_7_operand"  )
+		     (match_operand 8  "const_4_to_7_operand"  )
+		     (match_operand 9  "const_12_to_15_operand")
+		     (match_operand 10 "const_12_to_15_operand")])))]
+  "TARGET_AVX
+   && (INTVAL (operands[3]) == (INTVAL (operands[7]) - 4)
+       && INTVAL (operands[4]) == (INTVAL (operands[8]) - 4)
+       && INTVAL (operands[5]) == (INTVAL (operands[9]) - 4)
+       && INTVAL (operands[6]) == (INTVAL (operands[10]) - 4))"
+{
+  int mask;
+  mask = INTVAL (operands[3]);
+  mask |= INTVAL (operands[4]) << 2;
+  mask |= (INTVAL (operands[5]) - 8) << 4;
+  mask |= (INTVAL (operands[6]) - 8) << 6;
+  operands[3] = GEN_INT (mask);
+
+  return "vshufps\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+}
+  [(set_attr "type" "sseshuf")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
+(define_expand "sse_shufps"
+  [(match_operand:V4SF 0 "register_operand")
+   (match_operand:V4SF 1 "register_operand")
+   (match_operand:V4SF 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_SSE"
+{
+  int mask = INTVAL (operands[3]);
+  emit_insn (gen_sse_shufps_v4sf (operands[0], operands[1], operands[2],
+			       GEN_INT ((mask >> 0) & 3),
+			       GEN_INT ((mask >> 2) & 3),
+			       GEN_INT (((mask >> 4) & 3) + 4),
+			       GEN_INT (((mask >> 6) & 3) + 4)));
+  DONE;
+})
+
+(define_insn "sse_shufps_<mode>"
+  [(set (match_operand:VI4F_128 0 "register_operand" "=x,x")
+	(vec_select:VI4F_128
+	  (vec_concat:<ssedoublevecmode>
+	    (match_operand:VI4F_128 1 "register_operand" "0,x")
+	    (match_operand:VI4F_128 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(match_operand 3 "const_0_to_3_operand")
+		     (match_operand 4 "const_0_to_3_operand")
+		     (match_operand 5 "const_4_to_7_operand")
+		     (match_operand 6 "const_4_to_7_operand")])))]
+  "TARGET_SSE"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[3]) << 0;
+  mask |= INTVAL (operands[4]) << 2;
+  mask |= (INTVAL (operands[5]) - 4) << 4;
+  mask |= (INTVAL (operands[6]) - 4) << 6;
+  operands[3] = GEN_INT (mask);
+
+  switch (which_alternative)
+    {
+    case 0:
+      return "shufps\t{%3, %2, %0|%0, %2, %3}";
+    case 1:
+      return "vshufps\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseshuf")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse_storehps"
+  [(set (match_operand:V2SF 0 "nonimmediate_operand" "=m,x,x")
+	(vec_select:V2SF
+	  (match_operand:V4SF 1 "nonimmediate_operand" "x,x,o")
+	  (parallel [(const_int 2) (const_int 3)])))]
+  "TARGET_SSE"
+  "@
+   %vmovhps\t{%1, %0|%q0, %1}
+   %vmovhlps\t{%1, %d0|%d0, %1}
+   %vmovlps\t{%H1, %d0|%d0, %H1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "V2SF,V4SF,V2SF")])
+
+(define_expand "sse_loadhps_exp"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand")
+	(vec_concat:V4SF
+	  (vec_select:V2SF
+	    (match_operand:V4SF 1 "nonimmediate_operand")
+	    (parallel [(const_int 0) (const_int 1)]))
+	  (match_operand:V2SF 2 "nonimmediate_operand")))]
+  "TARGET_SSE"
+{
+  rtx dst = ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands);
+
+  emit_insn (gen_sse_loadhps (dst, operands[1], operands[2]));
+
+  /* Fix up the destination if needed.  */
+  if (dst != operands[0])
+    emit_move_insn (operands[0], dst);
+
+  DONE;
+})
+
+(define_insn "sse_loadhps"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand"     "=x,x,x,x,o")
+	(vec_concat:V4SF
+	  (vec_select:V2SF
+	    (match_operand:V4SF 1 "nonimmediate_operand" " 0,x,0,x,0")
+	    (parallel [(const_int 0) (const_int 1)]))
+	  (match_operand:V2SF 2 "nonimmediate_operand"   " m,m,x,x,x")))]
+  "TARGET_SSE"
+  "@
+   movhps\t{%2, %0|%0, %q2}
+   vmovhps\t{%2, %1, %0|%0, %1, %q2}
+   movlhps\t{%2, %0|%0, %2}
+   vmovlhps\t{%2, %1, %0|%0, %1, %2}
+   %vmovlps\t{%2, %H0|%H0, %2}"
+  [(set_attr "isa" "noavx,avx,noavx,avx,*")
+   (set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
+   (set_attr "mode" "V2SF,V2SF,V4SF,V4SF,V2SF")])
+
+(define_insn "sse_storelps"
+  [(set (match_operand:V2SF 0 "nonimmediate_operand"   "=m,x,x")
+	(vec_select:V2SF
+	  (match_operand:V4SF 1 "nonimmediate_operand" " x,x,m")
+	  (parallel [(const_int 0) (const_int 1)])))]
+  "TARGET_SSE"
+  "@
+   %vmovlps\t{%1, %0|%q0, %1}
+   %vmovaps\t{%1, %0|%0, %1}
+   %vmovlps\t{%1, %d0|%d0, %q1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "V2SF,V4SF,V2SF")])
+
+(define_expand "sse_loadlps_exp"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand")
+	(vec_concat:V4SF
+	  (match_operand:V2SF 2 "nonimmediate_operand")
+	  (vec_select:V2SF
+	    (match_operand:V4SF 1 "nonimmediate_operand")
+	    (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_SSE"
+{
+  rtx dst = ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands);
+
+  emit_insn (gen_sse_loadlps (dst, operands[1], operands[2]));
+
+  /* Fix up the destination if needed.  */
+  if (dst != operands[0])
+    emit_move_insn (operands[0], dst);
+
+  DONE;
+})
+
+(define_insn "sse_loadlps"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand"     "=x,x,x,x,m")
+	(vec_concat:V4SF
+	  (match_operand:V2SF 2 "nonimmediate_operand"   " 0,x,m,m,x")
+	  (vec_select:V2SF
+	    (match_operand:V4SF 1 "nonimmediate_operand" " x,x,0,x,0")
+	    (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_SSE"
+  "@
+   shufps\t{$0xe4, %1, %0|%0, %1, 0xe4}
+   vshufps\t{$0xe4, %1, %2, %0|%0, %2, %1, 0xe4}
+   movlps\t{%2, %0|%0, %q2}
+   vmovlps\t{%2, %1, %0|%0, %1, %q2}
+   %vmovlps\t{%2, %0|%q0, %2}"
+  [(set_attr "isa" "noavx,avx,noavx,avx,*")
+   (set_attr "type" "sseshuf,sseshuf,ssemov,ssemov,ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "length_immediate" "1,1,*,*,*")
+   (set_attr "prefix" "orig,vex,orig,vex,maybe_vex")
+   (set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
+
+(define_insn "sse_movss"
+  [(set (match_operand:V4SF 0 "register_operand"   "=x,x")
+	(vec_merge:V4SF
+	  (match_operand:V4SF 2 "register_operand" " x,x")
+	  (match_operand:V4SF 1 "register_operand" " 0,x")
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   movss\t{%2, %0|%0, %2}
+   vmovss\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "SF")])
+
+(define_insn "avx2_vec_dup<mode>"
+  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
+	(vec_duplicate:VF1_128_256
+	  (vec_select:SF
+	    (match_operand:V4SF 1 "register_operand" "x")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "vbroadcastss\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1")
+    (set_attr "prefix" "vex")
+    (set_attr "mode" "<MODE>")])
+
+(define_insn "avx2_vec_dupv8sf_1"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_duplicate:V8SF
+	  (vec_select:SF
+	    (match_operand:V8SF 1 "register_operand" "x")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "vbroadcastss\t{%x1, %0|%0, %x1}"
+  [(set_attr "type" "sselog1")
+    (set_attr "prefix" "vex")
+    (set_attr "mode" "V8SF")])
+
+(define_insn "vec_dupv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x,x")
+	(vec_duplicate:V4SF
+	  (match_operand:SF 1 "nonimmediate_operand" "x,m,0")))]
+  "TARGET_SSE"
+  "@
+   vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0}
+   vbroadcastss\t{%1, %0|%0, %1}
+   shufps\t{$0, %0, %0|%0, %0, 0}"
+  [(set_attr "isa" "avx,avx,noavx")
+   (set_attr "type" "sseshuf1,ssemov,sseshuf1")
+   (set_attr "length_immediate" "1,0,1")
+   (set_attr "prefix_extra" "0,1,*")
+   (set_attr "prefix" "vex,vex,orig")
+   (set_attr "mode" "V4SF")])
+
+;; Although insertps takes register source, we prefer
+;; unpcklps with register source since it is shorter.
+(define_insn "*vec_concatv2sf_sse4_1"
+  [(set (match_operand:V2SF 0 "register_operand"     "=x,x,x,x,x,*y ,*y")
+	(vec_concat:V2SF
+	  (match_operand:SF 1 "nonimmediate_operand" " 0,x,0,x,m, 0 , m")
+	  (match_operand:SF 2 "vector_move_operand"  " x,x,m,m,C,*ym, C")))]
+  "TARGET_SSE4_1"
+  "@
+   unpcklps\t{%2, %0|%0, %2}
+   vunpcklps\t{%2, %1, %0|%0, %1, %2}
+   insertps\t{$0x10, %2, %0|%0, %2, 0x10}
+   vinsertps\t{$0x10, %2, %1, %0|%0, %1, %2, 0x10}
+   %vmovss\t{%1, %0|%0, %1}
+   punpckldq\t{%2, %0|%0, %2}
+   movd\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "noavx,avx,noavx,avx,*,*,*")
+   (set_attr "type" "sselog,sselog,sselog,sselog,ssemov,mmxcvt,mmxmov")
+   (set_attr "prefix_data16" "*,*,1,*,*,*,*")
+   (set_attr "prefix_extra" "*,*,1,1,*,*,*")
+   (set_attr "length_immediate" "*,*,1,1,*,*,*")
+   (set_attr "prefix" "orig,vex,orig,vex,maybe_vex,orig,orig")
+   (set_attr "mode" "V4SF,V4SF,V4SF,V4SF,SF,DI,DI")])
+
+;; ??? In theory we can match memory for the MMX alternative, but allowing
+;; nonimmediate_operand for operand 2 and *not* allowing memory for the SSE
+;; alternatives pretty much forces the MMX alternative to be chosen.
+(define_insn "*vec_concatv2sf_sse"
+  [(set (match_operand:V2SF 0 "register_operand"     "=x,x,*y,*y")
+	(vec_concat:V2SF
+	  (match_operand:SF 1 "nonimmediate_operand" " 0,m, 0, m")
+	  (match_operand:SF 2 "reg_or_0_operand"     " x,C,*y, C")))]
+  "TARGET_SSE"
+  "@
+   unpcklps\t{%2, %0|%0, %2}
+   movss\t{%1, %0|%0, %1}
+   punpckldq\t{%2, %0|%0, %2}
+   movd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog,ssemov,mmxcvt,mmxmov")
+   (set_attr "mode" "V4SF,SF,DI,DI")])
+
+(define_insn "*vec_concatv4sf"
+  [(set (match_operand:V4SF 0 "register_operand"       "=x,x,x,x")
+	(vec_concat:V4SF
+	  (match_operand:V2SF 1 "register_operand"     " 0,x,0,x")
+	  (match_operand:V2SF 2 "nonimmediate_operand" " x,x,m,m")))]
+  "TARGET_SSE"
+  "@
+   movlhps\t{%2, %0|%0, %2}
+   vmovlhps\t{%2, %1, %0|%0, %1, %2}
+   movhps\t{%2, %0|%0, %q2}
+   vmovhps\t{%2, %1, %0|%0, %1, %q2}"
+  [(set_attr "isa" "noavx,avx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "orig,vex,orig,vex")
+   (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
+
+(define_expand "vec_init<mode>"
+  [(match_operand:V_128 0 "register_operand")
+   (match_operand 1)]
+  "TARGET_SSE"
+{
+  ix86_expand_vector_init (false, operands[0], operands[1]);
+  DONE;
+})
+
+;; Avoid combining registers from different units in a single alternative,
+;; see comment above inline_secondary_memory_needed function in i386.c
+(define_insn "vec_set<mode>_0"
+  [(set (match_operand:VI4F_128 0 "nonimmediate_operand"
+	  "=x,x,x ,x,x,x,x  ,x  ,m ,m   ,m")
+	(vec_merge:VI4F_128
+	  (vec_duplicate:VI4F_128
+	    (match_operand:<ssescalarmode> 2 "general_operand"
+	  " x,m,*r,m,x,x,*rm,*rm,!x,!*re,!*fF"))
+	  (match_operand:VI4F_128 1 "vector_move_operand"
+	  " C,C,C ,C,0,x,0  ,x  ,0 ,0   ,0")
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   %vinsertps\t{$0xe, %d2, %0|%0, %d2, 0xe}
+   %vmov<ssescalarmodesuffix>\t{%2, %0|%0, %2}
+   %vmovd\t{%2, %0|%0, %2}
+   movss\t{%2, %0|%0, %2}
+   movss\t{%2, %0|%0, %2}
+   vmovss\t{%2, %1, %0|%0, %1, %2}
+   pinsrd\t{$0, %2, %0|%0, %2, 0}
+   vpinsrd\t{$0, %2, %1, %0|%0, %1, %2, 0}
+   #
+   #
+   #"
+  [(set_attr "isa" "sse4,sse2,sse2,noavx,noavx,avx,sse4_noavx,avx,*,*,*")
+   (set (attr "type")
+     (cond [(eq_attr "alternative" "0,6,7")
+	      (const_string "sselog")
+	    (eq_attr "alternative" "9")
+	      (const_string "imov")
+	    (eq_attr "alternative" "10")
+	      (const_string "fmov")
+	   ]
+	   (const_string "ssemov")))
+   (set_attr "prefix_extra" "*,*,*,*,*,*,1,1,*,*,*")
+   (set_attr "length_immediate" "*,*,*,*,*,*,1,1,*,*,*")
+   (set_attr "prefix" "maybe_vex,maybe_vex,maybe_vex,orig,orig,vex,orig,vex,*,*,*")
+   (set_attr "mode" "SF,<ssescalarmode>,SI,SF,SF,SF,TI,TI,*,*,*")])
+
+;; A subset is vec_setv4sf.
+(define_insn "*vec_setv4sf_sse4_1"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(vec_merge:V4SF
+	  (vec_duplicate:V4SF
+	    (match_operand:SF 2 "nonimmediate_operand" "xm,xm"))
+	  (match_operand:V4SF 1 "register_operand" "0,x")
+	  (match_operand:SI 3 "const_int_operand")))]
+  "TARGET_SSE4_1
+   && ((unsigned) exact_log2 (INTVAL (operands[3]))
+       < GET_MODE_NUNITS (V4SFmode))"
+{
+  operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])) << 4);
+  switch (which_alternative)
+    {
+    case 0:
+      return "insertps\t{%3, %2, %0|%0, %2, %3}";
+    case 1:
+      return "vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse4_1_insertps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
+	(unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "xm,xm")
+		      (match_operand:V4SF 1 "register_operand" "0,x")
+		      (match_operand:SI 3 "const_0_to_255_operand" "n,n")]
+		     UNSPEC_INSERTPS))]
+  "TARGET_SSE4_1"
+{
+  if (MEM_P (operands[2]))
+    {
+      unsigned count_s = INTVAL (operands[3]) >> 6;
+      if (count_s)
+	operands[3] = GEN_INT (INTVAL (operands[3]) & 0x3f);
+      operands[2] = adjust_address_nv (operands[2], SFmode, count_s * 4);
+    }
+  switch (which_alternative)
+    {
+    case 0:
+      return "insertps\t{%3, %2, %0|%0, %2, %3}";
+    case 1:
+      return "vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V4SF")])
+
+(define_split
+  [(set (match_operand:VI4F_128 0 "memory_operand")
+	(vec_merge:VI4F_128
+	  (vec_duplicate:VI4F_128
+	    (match_operand:<ssescalarmode> 1 "nonmemory_operand"))
+	  (match_dup 0)
+	  (const_int 1)))]
+  "TARGET_SSE && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  "operands[0] = adjust_address (operands[0], <ssescalarmode>mode, 0);")
+
+(define_expand "vec_set<mode>"
+  [(match_operand:V 0 "register_operand")
+   (match_operand:<ssescalarmode> 1 "register_operand")
+   (match_operand 2 "const_int_operand")]
+  "TARGET_SSE"
+{
+  ix86_expand_vector_set (false, operands[0], operands[1],
+			  INTVAL (operands[2]));
+  DONE;
+})
+
+(define_insn_and_split "*vec_extractv4sf_0"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=x,m,f,r")
+	(vec_select:SF
+	  (match_operand:V4SF 1 "nonimmediate_operand" "xm,x,m,m")
+	  (parallel [(const_int 0)])))]
+  "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  if (REG_P (operands[1]))
+    operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));
+  else
+    operands[1] = adjust_address (operands[1], SFmode, 0);
+})
+
+(define_insn_and_split "*sse4_1_extractps"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "=rm,x,x")
+	(vec_select:SF
+	  (match_operand:V4SF 1 "register_operand" "x,0,x")
+	  (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n,n,n")])))]
+  "TARGET_SSE4_1"
+  "@
+   %vextractps\t{%2, %1, %0|%0, %1, %2}
+   #
+   #"
+  "&& reload_completed && SSE_REG_P (operands[0])"
+  [(const_int 0)]
+{
+  rtx dest = gen_rtx_REG (V4SFmode, REGNO (operands[0]));
+  switch (INTVAL (operands[2]))
+    {
+    case 1:
+    case 3:
+      emit_insn (gen_sse_shufps_v4sf (dest, operands[1], operands[1],
+				      operands[2], operands[2],
+				      GEN_INT (INTVAL (operands[2]) + 4),
+				      GEN_INT (INTVAL (operands[2]) + 4)));
+      break;
+    case 2:
+      emit_insn (gen_vec_interleave_highv4sf (dest, operands[1], operands[1]));
+      break;
+    default:
+      /* 0 should be handled by the *vec_extractv4sf_0 pattern above.  */
+      gcc_unreachable ();
+    }
+  DONE;
+}
+  [(set_attr "isa" "*,noavx,avx")
+   (set_attr "type" "sselog,*,*")
+   (set_attr "prefix_data16" "1,*,*")
+   (set_attr "prefix_extra" "1,*,*")
+   (set_attr "length_immediate" "1,*,*")
+   (set_attr "prefix" "maybe_vex,*,*")
+   (set_attr "mode" "V4SF,*,*")])
+
+(define_insn_and_split "*vec_extractv4sf_mem"
+  [(set (match_operand:SF 0 "register_operand" "=x,*r,f")
+	(vec_select:SF
+	  (match_operand:V4SF 1 "memory_operand" "o,o,o")
+	  (parallel [(match_operand 2 "const_0_to_3_operand" "n,n,n")])))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  operands[1] = adjust_address (operands[1], SFmode, INTVAL (operands[2]) * 4);
+})
+
+(define_expand "avx512f_vextract<shuffletype>32x4_mask"
+  [(match_operand:<ssequartermode> 0 "nonimmediate_operand")
+   (match_operand:V16FI 1 "register_operand")
+   (match_operand:SI 2 "const_0_to_3_operand")
+   (match_operand:<ssequartermode> 3 "nonimmediate_operand")
+   (match_operand:QI 4 "register_operand")]
+  "TARGET_AVX512F"
+{
+  if (MEM_P (operands[0]) && GET_CODE (operands[3]) == CONST_VECTOR)
+    operands[0] = force_reg (<ssequartermode>mode, operands[0]);
+  switch (INTVAL (operands[2]))
+    {
+    case 0:
+      emit_insn (gen_avx512f_vextract<shuffletype>32x4_1_mask (operands[0],
+          operands[1], GEN_INT (0), GEN_INT (1), GEN_INT (2),
+          GEN_INT (3), operands[3], operands[4]));
+      break;
+    case 1:
+      emit_insn (gen_avx512f_vextract<shuffletype>32x4_1_mask (operands[0],
+          operands[1], GEN_INT (4), GEN_INT (5), GEN_INT (6),
+          GEN_INT (7), operands[3], operands[4]));
+      break;
+    case 2:
+      emit_insn (gen_avx512f_vextract<shuffletype>32x4_1_mask (operands[0],
+          operands[1], GEN_INT (8), GEN_INT (9), GEN_INT (10),
+          GEN_INT (11), operands[3], operands[4]));
+      break;
+    case 3:
+      emit_insn (gen_avx512f_vextract<shuffletype>32x4_1_mask (operands[0],
+          operands[1], GEN_INT (12), GEN_INT (13), GEN_INT (14),
+          GEN_INT (15), operands[3], operands[4]));
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  DONE;
+})
+
+(define_insn "avx512f_vextract<shuffletype>32x4_1_maskm"
+  [(set (match_operand:<ssequartermode> 0 "memory_operand" "=m")
+	(vec_merge:<ssequartermode>
+	  (vec_select:<ssequartermode>
+	    (match_operand:V16FI 1 "register_operand" "v")
+	    (parallel [(match_operand 2  "const_0_to_15_operand")
+	      (match_operand 3  "const_0_to_15_operand")
+	      (match_operand 4  "const_0_to_15_operand")
+	      (match_operand 5  "const_0_to_15_operand")]))
+	  (match_operand:<ssequartermode> 6 "memory_operand" "0")
+	  (match_operand:QI 7 "register_operand" "Yk")))]
+  "TARGET_AVX512F && (INTVAL (operands[2]) = INTVAL (operands[3]) - 1)
+  && (INTVAL (operands[3]) = INTVAL (operands[4]) - 1)
+  && (INTVAL (operands[4]) = INTVAL (operands[5]) - 1)"
+{
+  operands[2] = GEN_INT ((INTVAL (operands[2])) >> 2);
+  return "vextract<shuffletype>32x4\t{%2, %1, %0%{%7%}|%0%{%7%}, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<mask_codefor>avx512f_vextract<shuffletype>32x4_1<mask_name>"
+  [(set (match_operand:<ssequartermode> 0 "<store_mask_predicate>" "=<store_mask_constraint>")
+	(vec_select:<ssequartermode>
+	  (match_operand:V16FI 1 "register_operand" "v")
+	  (parallel [(match_operand 2  "const_0_to_15_operand")
+            (match_operand 3  "const_0_to_15_operand")
+            (match_operand 4  "const_0_to_15_operand")
+            (match_operand 5  "const_0_to_15_operand")])))]
+  "TARGET_AVX512F && (INTVAL (operands[2]) = INTVAL (operands[3]) - 1)
+  && (INTVAL (operands[3]) = INTVAL (operands[4]) - 1)
+  && (INTVAL (operands[4]) = INTVAL (operands[5]) - 1)"
+{
+  operands[2] = GEN_INT ((INTVAL (operands[2])) >> 2);
+  return "vextract<shuffletype>32x4\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set (attr "memory")
+      (if_then_else (match_test "MEM_P (operands[0])")
+	(const_string "store")
+	(const_string "none")))
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_vextract<shuffletype>64x4_mask"
+  [(match_operand:<ssehalfvecmode> 0 "nonimmediate_operand")
+   (match_operand:V8FI 1 "register_operand")
+   (match_operand:SI 2 "const_0_to_1_operand")
+   (match_operand:<ssehalfvecmode> 3 "nonimmediate_operand")
+   (match_operand:QI 4 "register_operand")]
+  "TARGET_AVX512F"
+{
+  rtx (*insn)(rtx, rtx, rtx, rtx);
+
+  if (MEM_P (operands[0]) && GET_CODE (operands[3]) == CONST_VECTOR)
+    operands[0] = force_reg (<ssequartermode>mode, operands[0]);
+
+  switch (INTVAL (operands[2]))
+    {
+    case 0:
+      insn = gen_vec_extract_lo_<mode>_mask;
+      break;
+    case 1:
+      insn = gen_vec_extract_hi_<mode>_mask;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (insn (operands[0], operands[1], operands[3], operands[4]));
+  DONE;
+})
+
+(define_split
+  [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand")
+	(vec_select:<ssehalfvecmode>
+	  (match_operand:V8FI 1 "nonimmediate_operand")
+	  (parallel [(const_int 0) (const_int 1)
+            (const_int 2) (const_int 3)])))]
+  "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))
+  && reload_completed"
+  [(const_int 0)]
+{
+  rtx op1 = operands[1];
+  if (REG_P (op1))
+    op1 = gen_rtx_REG (<ssehalfvecmode>mode, REGNO (op1));
+  else
+    op1 = gen_lowpart (<ssehalfvecmode>mode, op1);
+  emit_move_insn (operands[0], op1);
+  DONE;
+})
+
+(define_insn "vec_extract_lo_<mode>_maskm"
+  [(set (match_operand:<ssehalfvecmode> 0 "memory_operand" "=m")
+	(vec_merge:<ssehalfvecmode>
+	  (vec_select:<ssehalfvecmode>
+	    (match_operand:V8FI 1 "register_operand" "v")
+	    (parallel [(const_int 0) (const_int 1)
+	      (const_int 2) (const_int 3)]))
+	  (match_operand:<ssehalfvecmode> 2 "memory_operand" "0")
+	  (match_operand:QI 3 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+"vextract<shuffletype>64x4\t{$0x0, %1, %0%{%3%}|%0%{%3%}, %1, 0x0}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "vec_extract_lo_<mode><mask_name>"
+  [(set (match_operand:<ssehalfvecmode> 0 "<store_mask_predicate>" "=<store_mask_constraint>")
+	(vec_select:<ssehalfvecmode>
+	  (match_operand:V8FI 1 "nonimmediate_operand" "vm")
+	  (parallel [(const_int 0) (const_int 1)
+            (const_int 2) (const_int 3)])))]
+  "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  if (<mask_applied>)
+    return "vextract<shuffletype>64x4\t{$0x0, %1, %0<mask_operand2>|%0<mask_operand2>, %1, 0x0}";
+  else
+    return "#";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set (attr "memory")
+      (if_then_else (match_test "MEM_P (operands[0])")
+	(const_string "store")
+	(const_string "none")))
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "vec_extract_hi_<mode>_maskm"
+  [(set (match_operand:<ssehalfvecmode> 0 "memory_operand" "=m")
+	(vec_merge:<ssehalfvecmode>
+	  (vec_select:<ssehalfvecmode>
+	    (match_operand:V8FI 1 "register_operand" "v")
+	    (parallel [(const_int 4) (const_int 5)
+	      (const_int 6) (const_int 7)]))
+	  (match_operand:<ssehalfvecmode> 2 "memory_operand" "0")
+	  (match_operand:QI 3 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vextract<shuffletype>64x4\t{$0x1, %1, %0%{%3%}|%0%{%3%}, %1, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "vec_extract_hi_<mode><mask_name>"
+  [(set (match_operand:<ssehalfvecmode> 0 "<store_mask_predicate>" "=<store_mask_constraint>")
+	(vec_select:<ssehalfvecmode>
+	  (match_operand:V8FI 1 "register_operand" "v")
+	  (parallel [(const_int 4) (const_int 5)
+            (const_int 6) (const_int 7)])))]
+  "TARGET_AVX512F"
+  "vextract<shuffletype>64x4\t{$0x1, %1, %0<mask_operand2>|%0<mask_operand2>, %1, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set (attr "memory")
+      (if_then_else (match_test "MEM_P (operands[0])")
+	(const_string "store")
+	(const_string "none")))
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx_vextractf128<mode>"
+  [(match_operand:<ssehalfvecmode> 0 "nonimmediate_operand")
+   (match_operand:V_256 1 "register_operand")
+   (match_operand:SI 2 "const_0_to_1_operand")]
+  "TARGET_AVX"
+{
+  rtx (*insn)(rtx, rtx);
+
+  switch (INTVAL (operands[2]))
+    {
+    case 0:
+      insn = gen_vec_extract_lo_<mode>;
+      break;
+    case 1:
+      insn = gen_vec_extract_hi_<mode>;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (insn (operands[0], operands[1]));
+  DONE;
+})
+
+(define_insn_and_split "vec_extract_lo_<mode>"
+  [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=v,m")
+	(vec_select:<ssehalfvecmode>
+	  (match_operand:V16FI 1 "nonimmediate_operand" "vm,v")
+	  (parallel [(const_int 0) (const_int 1)
+                     (const_int 2) (const_int 3)
+                     (const_int 4) (const_int 5)
+                     (const_int 6) (const_int 7)])))]
+  "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op1 = operands[1];
+  if (REG_P (op1))
+    op1 = gen_rtx_REG (<ssehalfvecmode>mode, REGNO (op1));
+  else
+    op1 = gen_lowpart (<ssehalfvecmode>mode, op1);
+  emit_move_insn (operands[0], op1);
+  DONE;
+})
+
+(define_insn "vec_extract_hi_<mode>"
+  [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=v,m")
+	(vec_select:<ssehalfvecmode>
+	  (match_operand:V16FI 1 "nonimmediate_operand" "v,v")
+	  (parallel [(const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX512F"
+  "vextracti64x4\t{$0x1, %1, %0|%0, %1, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn_and_split "vec_extract_lo_<mode>"
+  [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=x,m")
+	(vec_select:<ssehalfvecmode>
+	  (match_operand:VI8F_256 1 "nonimmediate_operand" "xm,x")
+	  (parallel [(const_int 0) (const_int 1)])))]
+  "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  if (REG_P (operands[1]))
+    operands[1] = gen_rtx_REG (<ssehalfvecmode>mode, REGNO (operands[1]));
+  else
+    operands[1] = adjust_address (operands[1], <ssehalfvecmode>mode, 0);
+})
+
+(define_insn "vec_extract_hi_<mode>"
+  [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=x,m")
+	(vec_select:<ssehalfvecmode>
+	  (match_operand:VI8F_256 1 "register_operand" "x,x")
+	  (parallel [(const_int 2) (const_int 3)])))]
+  "TARGET_AVX"
+  "vextract<i128>\t{$0x1, %1, %0|%0, %1, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,store")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn_and_split "vec_extract_lo_<mode>"
+  [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=x,m")
+	(vec_select:<ssehalfvecmode>
+	  (match_operand:VI4F_256 1 "nonimmediate_operand" "xm,x")
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)])))]
+  "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  if (REG_P (operands[1]))
+    operands[1] = gen_rtx_REG (<ssehalfvecmode>mode, REGNO (operands[1]));
+  else
+    operands[1] = adjust_address (operands[1], <ssehalfvecmode>mode, 0);
+})
+
+(define_insn "vec_extract_hi_<mode>"
+  [(set (match_operand:<ssehalfvecmode> 0 "nonimmediate_operand" "=x,m")
+	(vec_select:<ssehalfvecmode>
+	  (match_operand:VI4F_256 1 "register_operand" "x,x")
+	  (parallel [(const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))]
+  "TARGET_AVX"
+  "vextract<i128>\t{$0x1, %1, %0|%0, %1, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,store")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn_and_split "vec_extract_lo_v32hi"
+  [(set (match_operand:V16HI 0 "nonimmediate_operand" "=v,m")
+	(vec_select:V16HI
+	  (match_operand:V32HI 1 "nonimmediate_operand" "vm,v")
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  if (REG_P (operands[1]))
+    operands[1] = gen_rtx_REG (V16HImode, REGNO (operands[1]));
+  else
+    operands[1] = adjust_address (operands[1], V16HImode, 0);
+})
+
+(define_insn "vec_extract_hi_v32hi"
+  [(set (match_operand:V16HI 0 "nonimmediate_operand" "=v,m")
+	(vec_select:V16HI
+	  (match_operand:V32HI 1 "nonimmediate_operand" "v,v")
+	  (parallel [(const_int 16) (const_int 17)
+		     (const_int 18) (const_int 19)
+		     (const_int 20) (const_int 21)
+		     (const_int 22) (const_int 23)
+		     (const_int 24) (const_int 25)
+		     (const_int 26) (const_int 27)
+		     (const_int 28) (const_int 29)
+		     (const_int 30) (const_int 31)])))]
+  "TARGET_AVX512F"
+  "vextracti64x4\t{$0x1, %1, %0|%0, %1, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn_and_split "vec_extract_lo_v16hi"
+  [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,m")
+	(vec_select:V8HI
+	  (match_operand:V16HI 1 "nonimmediate_operand" "xm,x")
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))]
+  "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  if (REG_P (operands[1]))
+    operands[1] = gen_rtx_REG (V8HImode, REGNO (operands[1]));
+  else
+    operands[1] = adjust_address (operands[1], V8HImode, 0);
+})
+
+(define_insn "vec_extract_hi_v16hi"
+  [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,m")
+	(vec_select:V8HI
+	  (match_operand:V16HI 1 "register_operand" "x,x")
+	  (parallel [(const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX"
+  "vextract%~128\t{$0x1, %1, %0|%0, %1, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,store")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn_and_split "vec_extract_lo_v64qi"
+  [(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m")
+	(vec_select:V32QI
+	  (match_operand:V64QI 1 "nonimmediate_operand" "vm,v")
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)
+		     (const_int 16) (const_int 17)
+		     (const_int 18) (const_int 19)
+		     (const_int 20) (const_int 21)
+		     (const_int 22) (const_int 23)
+		     (const_int 24) (const_int 25)
+		     (const_int 26) (const_int 27)
+		     (const_int 28) (const_int 29)
+		     (const_int 30) (const_int 31)])))]
+  "TARGET_AVX512F && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  if (REG_P (operands[1]))
+    operands[1] = gen_rtx_REG (V32QImode, REGNO (operands[1]));
+  else
+    operands[1] = adjust_address (operands[1], V32QImode, 0);
+})
+
+(define_insn "vec_extract_hi_v64qi"
+  [(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m")
+	(vec_select:V32QI
+	  (match_operand:V64QI 1 "nonimmediate_operand" "v,v")
+	  (parallel [(const_int 32) (const_int 33)
+		     (const_int 34) (const_int 35)
+		     (const_int 36) (const_int 37)
+		     (const_int 38) (const_int 39)
+		     (const_int 40) (const_int 41)
+		     (const_int 42) (const_int 43)
+		     (const_int 44) (const_int 45)
+		     (const_int 46) (const_int 47)
+		     (const_int 48) (const_int 49)
+		     (const_int 50) (const_int 51)
+		     (const_int 52) (const_int 53)
+		     (const_int 54) (const_int 55)
+		     (const_int 56) (const_int 57)
+		     (const_int 58) (const_int 59)
+		     (const_int 60) (const_int 61)
+		     (const_int 62) (const_int 63)])))]
+  "TARGET_AVX512F"
+  "vextracti64x4\t{$0x1, %1, %0|%0, %1, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn_and_split "vec_extract_lo_v32qi"
+  [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m")
+	(vec_select:V16QI
+	  (match_operand:V32QI 1 "nonimmediate_operand" "xm,x")
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  if (REG_P (operands[1]))
+    operands[1] = gen_rtx_REG (V16QImode, REGNO (operands[1]));
+  else
+    operands[1] = adjust_address (operands[1], V16QImode, 0);
+})
+
+(define_insn "vec_extract_hi_v32qi"
+  [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m")
+	(vec_select:V16QI
+	  (match_operand:V32QI 1 "register_operand" "x,x")
+	  (parallel [(const_int 16) (const_int 17)
+		     (const_int 18) (const_int 19)
+		     (const_int 20) (const_int 21)
+		     (const_int 22) (const_int 23)
+		     (const_int 24) (const_int 25)
+		     (const_int 26) (const_int 27)
+		     (const_int 28) (const_int 29)
+		     (const_int 30) (const_int 31)])))]
+  "TARGET_AVX"
+  "vextract%~128\t{$0x1, %1, %0|%0, %1, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,store")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+;; Modes handled by vec_extract patterns.
+(define_mode_iterator VEC_EXTRACT_MODE
+  [(V32QI "TARGET_AVX") V16QI
+   (V16HI "TARGET_AVX") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
+   (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI
+   (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF
+   (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF])
+
+(define_expand "vec_extract<mode>"
+  [(match_operand:<ssescalarmode> 0 "register_operand")
+   (match_operand:VEC_EXTRACT_MODE 1 "register_operand")
+   (match_operand 2 "const_int_operand")]
+  "TARGET_SSE"
+{
+  ix86_expand_vector_extract (false, operands[0], operands[1],
+			      INTVAL (operands[2]));
+  DONE;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel double-precision floating point element swizzling
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "<mask_codefor>avx512f_unpckhpd512<mask_name>"
+  [(set (match_operand:V8DF 0 "register_operand" "=v")
+	(vec_select:V8DF
+	  (vec_concat:V16DF
+	    (match_operand:V8DF 1 "nonimmediate_operand" "v")
+	    (match_operand:V8DF 2 "nonimmediate_operand" "vm"))
+	  (parallel [(const_int 1) (const_int 9)
+		     (const_int 3) (const_int 11)
+		     (const_int 5) (const_int 13)
+		     (const_int 7) (const_int 15)])))]
+  "TARGET_AVX512F"
+  "vunpckhpd\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V8DF")])
+
+;; Recall that the 256-bit unpck insns only shuffle within their lanes.
+(define_insn "avx_unpckhpd256"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_operand:V4DF 1 "register_operand" "x")
+	    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 1) (const_int 5)
+		     (const_int 3) (const_int 7)])))]
+  "TARGET_AVX"
+  "vunpckhpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
+(define_expand "vec_interleave_highv4df"
+  [(set (match_dup 3)
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_operand:V4DF 1 "register_operand" "x")
+	    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)])))
+   (set (match_dup 4)
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_dup 1)
+	    (match_dup 2))
+	  (parallel [(const_int 1) (const_int 5)
+		     (const_int 3) (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand")
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_dup 3)
+	    (match_dup 4))
+	  (parallel [(const_int 2) (const_int 3)
+		     (const_int 6) (const_int 7)])))]
+ "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V4DFmode);
+  operands[4] = gen_reg_rtx (V4DFmode);
+})
+
+
+(define_expand "vec_interleave_highv2df"
+  [(set (match_operand:V2DF 0 "register_operand")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (match_operand:V2DF 1 "nonimmediate_operand")
+	    (match_operand:V2DF 2 "nonimmediate_operand"))
+	  (parallel [(const_int 1)
+		     (const_int 3)])))]
+  "TARGET_SSE2"
+{
+  if (!ix86_vec_interleave_v2df_operator_ok (operands, 1))
+    operands[2] = force_reg (V2DFmode, operands[2]);
+})
+
+(define_insn "*vec_interleave_highv2df"
+  [(set (match_operand:V2DF 0 "nonimmediate_operand"     "=x,x,x,x,x,m")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (match_operand:V2DF 1 "nonimmediate_operand" " 0,x,o,o,o,x")
+	    (match_operand:V2DF 2 "nonimmediate_operand" " x,x,1,0,x,0"))
+	  (parallel [(const_int 1)
+		     (const_int 3)])))]
+  "TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
+  "@
+   unpckhpd\t{%2, %0|%0, %2}
+   vunpckhpd\t{%2, %1, %0|%0, %1, %2}
+   %vmovddup\t{%H1, %0|%0, %H1}
+   movlpd\t{%H1, %0|%0, %H1}
+   vmovlpd\t{%H1, %2, %0|%0, %2, %H1}
+   %vmovhpd\t{%1, %0|%q0, %1}"
+  [(set_attr "isa" "noavx,avx,sse3,noavx,avx,*")
+   (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix_data16" "*,*,*,1,*,1")
+   (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex")
+   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
+
+(define_expand "avx512f_movddup512<mask_name>"
+  [(set (match_operand:V8DF 0 "register_operand")
+	(vec_select:V8DF
+	  (vec_concat:V16DF
+	    (match_operand:V8DF 1 "nonimmediate_operand")
+	    (match_dup 1))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 2) (const_int 10)
+		     (const_int 4) (const_int 12)
+		     (const_int 6) (const_int 14)])))]
+  "TARGET_AVX512F")
+
+(define_expand "avx512f_unpcklpd512<mask_name>"
+  [(set (match_operand:V8DF 0 "register_operand")
+	(vec_select:V8DF
+	  (vec_concat:V16DF
+	    (match_operand:V8DF 1 "register_operand")
+	    (match_operand:V8DF 2 "nonimmediate_operand"))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 2) (const_int 10)
+		     (const_int 4) (const_int 12)
+		     (const_int 6) (const_int 14)])))]
+  "TARGET_AVX512F")
+
+(define_insn "*avx512f_unpcklpd512<mask_name>"
+  [(set (match_operand:V8DF 0 "register_operand" "=v,v")
+	(vec_select:V8DF
+	  (vec_concat:V16DF
+	    (match_operand:V8DF 1 "nonimmediate_operand" "vm, v")
+	    (match_operand:V8DF 2 "nonimmediate_operand" "1 ,vm"))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 2) (const_int 10)
+		     (const_int 4) (const_int 12)
+		     (const_int 6) (const_int 14)])))]
+  "TARGET_AVX512F"
+  "@
+   vmovddup\t{%1, %0<mask_operand3>|%0<mask_operand3>, %1}
+   vunpcklpd\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V8DF")])
+
+;; Recall that the 256-bit unpck insns only shuffle within their lanes.
+(define_expand "avx_movddup256"
+  [(set (match_operand:V4DF 0 "register_operand")
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_operand:V4DF 1 "nonimmediate_operand")
+	    (match_dup 1))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)])))]
+  "TARGET_AVX")
+
+(define_expand "avx_unpcklpd256"
+  [(set (match_operand:V4DF 0 "register_operand")
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_operand:V4DF 1 "register_operand")
+	    (match_operand:V4DF 2 "nonimmediate_operand"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)])))]
+  "TARGET_AVX")
+
+(define_insn "*avx_unpcklpd256"
+  [(set (match_operand:V4DF 0 "register_operand"         "=x,x")
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_operand:V4DF 1 "nonimmediate_operand" " x,m")
+	    (match_operand:V4DF 2 "nonimmediate_operand" "xm,1"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)])))]
+  "TARGET_AVX"
+  "@
+   vunpcklpd\t{%2, %1, %0|%0, %1, %2}
+   vmovddup\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
+(define_expand "vec_interleave_lowv4df"
+  [(set (match_dup 3)
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_operand:V4DF 1 "register_operand" "x")
+	    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 2) (const_int 6)])))
+   (set (match_dup 4)
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_dup 1)
+	    (match_dup 2))
+	  (parallel [(const_int 1) (const_int 5)
+		     (const_int 3) (const_int 7)])))
+   (set (match_operand:V4DF 0 "register_operand")
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_dup 3)
+	    (match_dup 4))
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 4) (const_int 5)])))]
+ "TARGET_AVX"
+{
+  operands[3] = gen_reg_rtx (V4DFmode);
+  operands[4] = gen_reg_rtx (V4DFmode);
+})
+
+(define_expand "vec_interleave_lowv2df"
+  [(set (match_operand:V2DF 0 "register_operand")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (match_operand:V2DF 1 "nonimmediate_operand")
+	    (match_operand:V2DF 2 "nonimmediate_operand"))
+	  (parallel [(const_int 0)
+		     (const_int 2)])))]
+  "TARGET_SSE2"
+{
+  if (!ix86_vec_interleave_v2df_operator_ok (operands, 0))
+    operands[1] = force_reg (V2DFmode, operands[1]);
+})
+
+(define_insn "*vec_interleave_lowv2df"
+  [(set (match_operand:V2DF 0 "nonimmediate_operand"     "=x,x,x,x,x,o")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (match_operand:V2DF 1 "nonimmediate_operand" " 0,x,m,0,x,0")
+	    (match_operand:V2DF 2 "nonimmediate_operand" " x,x,1,m,m,x"))
+	  (parallel [(const_int 0)
+		     (const_int 2)])))]
+  "TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 0)"
+  "@
+   unpcklpd\t{%2, %0|%0, %2}
+   vunpcklpd\t{%2, %1, %0|%0, %1, %2}
+   %vmovddup\t{%1, %0|%0, %q1}
+   movhpd\t{%2, %0|%0, %q2}
+   vmovhpd\t{%2, %1, %0|%0, %1, %q2}
+   %vmovlpd\t{%2, %H0|%H0, %2}"
+  [(set_attr "isa" "noavx,avx,sse3,noavx,avx,*")
+   (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix_data16" "*,*,*,1,*,1")
+   (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex")
+   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
+
+(define_split
+  [(set (match_operand:V2DF 0 "memory_operand")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (match_operand:V2DF 1 "register_operand")
+	    (match_dup 1))
+	  (parallel [(const_int 0)
+		     (const_int 2)])))]
+  "TARGET_SSE3 && reload_completed"
+  [(const_int 0)]
+{
+  rtx low = gen_rtx_REG (DFmode, REGNO (operands[1]));
+  emit_move_insn (adjust_address (operands[0], DFmode, 0), low);
+  emit_move_insn (adjust_address (operands[0], DFmode, 8), low);
+  DONE;
+})
+
+(define_split
+  [(set (match_operand:V2DF 0 "register_operand")
+	(vec_select:V2DF
+	  (vec_concat:V4DF
+	    (match_operand:V2DF 1 "memory_operand")
+	    (match_dup 1))
+	  (parallel [(match_operand:SI 2 "const_0_to_1_operand")
+		     (match_operand:SI 3 "const_int_operand")])))]
+  "TARGET_SSE3 && INTVAL (operands[2]) + 2 == INTVAL (operands[3])"
+  [(set (match_dup 0) (vec_duplicate:V2DF (match_dup 1)))]
+{
+  operands[1] = adjust_address (operands[1], DFmode, INTVAL (operands[2]) * 8);
+})
+
+(define_insn "avx512f_vmscalef<mode><round_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "register_operand" "v")
+	     (match_operand:VF_128 2 "<round_nimm_predicate>" "<round_constraint>")]
+	    UNSPEC_SCALEF)
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "vscalef<ssescalarmodesuffix>\t{<round_op3>%2, %1, %0|%0, %1, %2<round_op3>}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode"  "<ssescalarmode>")])
+
+(define_insn "avx512f_scalef<mode><mask_name><round_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(unspec:VF_512
+	  [(match_operand:VF_512 1 "register_operand" "v")
+	   (match_operand:VF_512 2 "<round_nimm_predicate>" "<round_constraint>")]
+	  UNSPEC_SCALEF))]
+  "TARGET_AVX512F"
+  "vscalef<ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode"  "<MODE>")])
+
+(define_expand "avx512f_vternlog<mode>_maskz"
+  [(match_operand:VI48_512 0 "register_operand")
+   (match_operand:VI48_512 1 "register_operand")
+   (match_operand:VI48_512 2 "register_operand")
+   (match_operand:VI48_512 3 "nonimmediate_operand")
+   (match_operand:SI 4 "const_0_to_255_operand")
+   (match_operand:<avx512fmaskmode> 5 "register_operand")]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_avx512f_vternlog<mode>_maskz_1 (
+    operands[0], operands[1], operands[2], operands[3],
+    operands[4], CONST0_RTX (<MODE>mode), operands[5]));
+  DONE;
+})
+
+(define_insn "avx512f_vternlog<mode><sd_maskz_name>"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+	(unspec:VI48_512
+	  [(match_operand:VI48_512 1 "register_operand" "0")
+	   (match_operand:VI48_512 2 "register_operand" "v")
+	   (match_operand:VI48_512 3 "nonimmediate_operand" "vm")
+	   (match_operand:SI 4 "const_0_to_255_operand")]
+	  UNSPEC_VTERNLOG))]
+  "TARGET_AVX512F"
+  "vpternlog<ssemodesuffix>\t{%4, %3, %2, %0<sd_mask_op5>|%0<sd_mask_op5>, %2, %3, %4}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_vternlog<mode>_mask"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+	(vec_merge:VI48_512
+	  (unspec:VI48_512
+	    [(match_operand:VI48_512 1 "register_operand" "0")
+	     (match_operand:VI48_512 2 "register_operand" "v")
+	     (match_operand:VI48_512 3 "nonimmediate_operand" "vm")
+	     (match_operand:SI 4 "const_0_to_255_operand")]
+	    UNSPEC_VTERNLOG)
+	  (match_dup 1)
+	  (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vpternlog<ssemodesuffix>\t{%4, %3, %2, %0%{%5%}|%0%{%5%}, %2, %3, %4}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_getexp<mode><mask_name><round_saeonly_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+        (unspec:VF_512 [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+                        UNSPEC_GETEXP))]
+   "TARGET_AVX512F"
+   "vgetexp<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}";
+    [(set_attr "prefix" "evex")
+     (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_sgetexp<mode><round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "register_operand" "v")
+	     (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+	    UNSPEC_GETEXP)
+	  (match_dup 1)
+	  (const_int 1)))]
+   "TARGET_AVX512F"
+   "vgetexp<ssescalarmodesuffix>\t{<round_saeonly_op3>%2, %1, %0|%0, %1, %2<round_saeonly_op3>}";
+    [(set_attr "prefix" "evex")
+     (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "<mask_codefor>avx512f_align<mode><mask_name>"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+        (unspec:VI48_512 [(match_operand:VI48_512 1 "register_operand" "v")
+			  (match_operand:VI48_512 2 "nonimmediate_operand" "vm")
+			  (match_operand:SI 3 "const_0_to_255_operand")]
+			 UNSPEC_ALIGN))]
+  "TARGET_AVX512F"
+  "valign<ssemodesuffix>\t{%3, %2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2, %3}";
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_shufps512_mask"
+  [(match_operand:V16SF 0 "register_operand")
+   (match_operand:V16SF 1 "register_operand")
+   (match_operand:V16SF 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_255_operand")
+   (match_operand:V16SF 4 "register_operand")
+   (match_operand:HI 5 "register_operand")]
+  "TARGET_AVX512F"
+{
+  int mask = INTVAL (operands[3]);
+  emit_insn (gen_avx512f_shufps512_1_mask (operands[0], operands[1], operands[2],
+					  GEN_INT ((mask >> 0) & 3),
+					  GEN_INT ((mask >> 2) & 3),
+					  GEN_INT (((mask >> 4) & 3) + 16),
+					  GEN_INT (((mask >> 6) & 3) + 16),
+					  GEN_INT (((mask >> 0) & 3) + 4),
+					  GEN_INT (((mask >> 2) & 3) + 4),
+					  GEN_INT (((mask >> 4) & 3) + 20),
+					  GEN_INT (((mask >> 6) & 3) + 20),
+					  GEN_INT (((mask >> 0) & 3) + 8),
+					  GEN_INT (((mask >> 2) & 3) + 8),
+					  GEN_INT (((mask >> 4) & 3) + 24),
+					  GEN_INT (((mask >> 6) & 3) + 24),
+					  GEN_INT (((mask >> 0) & 3) + 12),
+					  GEN_INT (((mask >> 2) & 3) + 12),
+					  GEN_INT (((mask >> 4) & 3) + 28),
+					  GEN_INT (((mask >> 6) & 3) + 28),
+					  operands[4], operands[5]));
+  DONE;
+})
+
+
+(define_expand "avx512f_fixupimm<mode>_maskz<round_saeonly_expand_name>"
+  [(match_operand:VF_512 0 "register_operand")
+   (match_operand:VF_512 1 "register_operand")
+   (match_operand:VF_512 2 "register_operand")
+   (match_operand:<sseintvecmode> 3 "<round_saeonly_expand_nimm_predicate>")
+   (match_operand:SI 4 "const_0_to_255_operand")
+   (match_operand:<avx512fmaskmode> 5 "register_operand")]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_avx512f_fixupimm<mode>_maskz_1<round_saeonly_expand_name> (
+	operands[0], operands[1], operands[2], operands[3],
+	operands[4], CONST0_RTX (<MODE>mode), operands[5]
+	<round_saeonly_expand_operand6>));
+  DONE;
+})
+
+(define_insn "avx512f_fixupimm<mode><sd_maskz_name><round_saeonly_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+        (unspec:VF_512
+          [(match_operand:VF_512 1 "register_operand" "0")
+	   (match_operand:VF_512 2 "register_operand" "v")
+           (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+           (match_operand:SI 4 "const_0_to_255_operand")]
+           UNSPEC_FIXUPIMM))]
+  "TARGET_AVX512F"
+  "vfixupimm<ssemodesuffix>\t{%4, <round_saeonly_sd_mask_op5>%3, %2, %0<sd_mask_op5>|%0<sd_mask_op5>, %2, %3<round_saeonly_sd_mask_op5>, %4}";
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_fixupimm<mode>_mask<round_saeonly_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(vec_merge:VF_512
+          (unspec:VF_512
+            [(match_operand:VF_512 1 "register_operand" "0")
+	     (match_operand:VF_512 2 "register_operand" "v")
+             (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+             (match_operand:SI 4 "const_0_to_255_operand")]
+             UNSPEC_FIXUPIMM)
+	  (match_dup 1)
+	  (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vfixupimm<ssemodesuffix>\t{%4, <round_saeonly_op6>%3, %2, %0%{%5%}|%0%{%5%}, %2, %3<round_saeonly_op6>, %4}";
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "avx512f_sfixupimm<mode>_maskz<round_saeonly_expand_name>"
+  [(match_operand:VF_128 0 "register_operand")
+   (match_operand:VF_128 1 "register_operand")
+   (match_operand:VF_128 2 "register_operand")
+   (match_operand:<sseintvecmode> 3 "<round_saeonly_expand_nimm_predicate>")
+   (match_operand:SI 4 "const_0_to_255_operand")
+   (match_operand:<avx512fmaskmode> 5 "register_operand")]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_avx512f_sfixupimm<mode>_maskz_1<round_saeonly_expand_name> (
+	operands[0], operands[1], operands[2], operands[3],
+	operands[4], CONST0_RTX (<MODE>mode), operands[5]
+	<round_saeonly_expand_operand6>));
+  DONE;
+})
+
+(define_insn "avx512f_sfixupimm<mode><sd_maskz_name><round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+          (unspec:VF_128
+            [(match_operand:VF_128 1 "register_operand" "0")
+	     (match_operand:VF_128 2 "register_operand" "v")
+	     (match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	     (match_operand:SI 4 "const_0_to_255_operand")]
+	    UNSPEC_FIXUPIMM)
+	  (match_dup 1)
+	  (const_int 1)))]
+   "TARGET_AVX512F"
+   "vfixupimm<ssescalarmodesuffix>\t{%4, <round_saeonly_sd_mask_op5>%3, %2, %0<sd_mask_op5>|%0<sd_mask_op5>, %2, %3<round_saeonly_sd_mask_op5>, %4}";
+   [(set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "avx512f_sfixupimm<mode>_mask<round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (vec_merge:VF_128
+	    (unspec:VF_128
+	       [(match_operand:VF_128 1 "register_operand" "0")
+		(match_operand:VF_128 2 "register_operand" "v")
+		(match_operand:<sseintvecmode> 3 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+		(match_operand:SI 4 "const_0_to_255_operand")]
+	       UNSPEC_FIXUPIMM)
+	    (match_dup 1)
+	    (const_int 1))
+	  (match_dup 1)
+	  (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vfixupimm<ssescalarmodesuffix>\t{%4, <round_saeonly_op6>%3, %2, %0%{%5%}|%0%{%5%}, %2, %3<round_saeonly_op6>, %4}";
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "avx512f_rndscale<mode><mask_name><round_saeonly_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(unspec:VF_512
+	  [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	   (match_operand:SI 2 "const_0_to_255_operand")]
+	  UNSPEC_ROUND))]
+  "TARGET_AVX512F"
+  "vrndscale<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}"
+  [(set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_rndscale<mode><round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "register_operand" "v")
+	     (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	     (match_operand:SI 3 "const_0_to_255_operand")]
+	    UNSPEC_ROUND)
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_AVX512F"
+  "vrndscale<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %2<round_saeonly_op4>, %3}"
+  [(set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+;; One bit in mask selects 2 elements.
+(define_insn "avx512f_shufps512_1<mask_name>"
+  [(set (match_operand:V16SF 0 "register_operand" "=v")
+	(vec_select:V16SF
+	  (vec_concat:V32SF
+	    (match_operand:V16SF 1 "register_operand" "v")
+	    (match_operand:V16SF 2 "nonimmediate_operand" "vm"))
+	  (parallel [(match_operand 3  "const_0_to_3_operand")
+		     (match_operand 4  "const_0_to_3_operand")
+		     (match_operand 5  "const_16_to_19_operand")
+		     (match_operand 6  "const_16_to_19_operand")
+		     (match_operand 7  "const_4_to_7_operand")
+		     (match_operand 8  "const_4_to_7_operand")
+		     (match_operand 9  "const_20_to_23_operand")
+		     (match_operand 10  "const_20_to_23_operand")
+		     (match_operand 11  "const_8_to_11_operand")
+		     (match_operand 12  "const_8_to_11_operand")
+		     (match_operand 13  "const_24_to_27_operand")
+		     (match_operand 14  "const_24_to_27_operand")
+		     (match_operand 15  "const_12_to_15_operand")
+		     (match_operand 16  "const_12_to_15_operand")
+		     (match_operand 17  "const_28_to_31_operand")
+		     (match_operand 18  "const_28_to_31_operand")])))]
+  "TARGET_AVX512F
+   && (INTVAL (operands[3]) == (INTVAL (operands[7]) - 4)
+       && INTVAL (operands[4]) == (INTVAL (operands[8]) - 4)
+       && INTVAL (operands[5]) == (INTVAL (operands[9]) - 4)
+       && INTVAL (operands[6]) == (INTVAL (operands[10]) - 4)
+       && INTVAL (operands[3]) == (INTVAL (operands[11]) - 8)
+       && INTVAL (operands[4]) == (INTVAL (operands[12]) - 8)
+       && INTVAL (operands[5]) == (INTVAL (operands[13]) - 8)
+       && INTVAL (operands[6]) == (INTVAL (operands[14]) - 8)
+       && INTVAL (operands[3]) == (INTVAL (operands[15]) - 12)
+       && INTVAL (operands[4]) == (INTVAL (operands[16]) - 12)
+       && INTVAL (operands[5]) == (INTVAL (operands[17]) - 12)
+       && INTVAL (operands[6]) == (INTVAL (operands[18]) - 12))"
+{
+  int mask;
+  mask = INTVAL (operands[3]);
+  mask |= INTVAL (operands[4]) << 2;
+  mask |= (INTVAL (operands[5]) - 16) << 4;
+  mask |= (INTVAL (operands[6]) - 16) << 6;
+  operands[3] = GEN_INT (mask);
+
+  return "vshufps\t{%3, %2, %1, %0<mask_operand19>|%0<mask_operand19>, %1, %2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V16SF")])
+
+(define_expand "avx512f_shufpd512_mask"
+  [(match_operand:V8DF 0 "register_operand")
+   (match_operand:V8DF 1 "register_operand")
+   (match_operand:V8DF 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_255_operand")
+   (match_operand:V8DF 4 "register_operand")
+   (match_operand:QI 5 "register_operand")]
+  "TARGET_AVX512F"
+{
+  int mask = INTVAL (operands[3]);
+  emit_insn (gen_avx512f_shufpd512_1_mask (operands[0], operands[1], operands[2],
+					GEN_INT (mask & 1),
+					GEN_INT (mask & 2 ? 9 : 8),
+					GEN_INT (mask & 4 ? 3 : 2),
+					GEN_INT (mask & 8 ? 11 : 10),
+					GEN_INT (mask & 16 ? 5 : 4),
+					GEN_INT (mask & 32 ? 13 : 12),
+					GEN_INT (mask & 64 ? 7 : 6),
+					GEN_INT (mask & 128 ? 15 : 14),
+					operands[4], operands[5]));
+  DONE;
+})
+
+(define_insn "avx512f_shufpd512_1<mask_name>"
+  [(set (match_operand:V8DF 0 "register_operand" "=v")
+	(vec_select:V8DF
+	  (vec_concat:V16DF
+	    (match_operand:V8DF 1 "register_operand" "v")
+	    (match_operand:V8DF 2 "nonimmediate_operand" "vm"))
+	  (parallel [(match_operand 3 "const_0_to_1_operand")
+		     (match_operand 4 "const_8_to_9_operand")
+		     (match_operand 5 "const_2_to_3_operand")
+		     (match_operand 6 "const_10_to_11_operand")
+		     (match_operand 7 "const_4_to_5_operand")
+		     (match_operand 8 "const_12_to_13_operand")
+		     (match_operand 9 "const_6_to_7_operand")
+		     (match_operand 10 "const_14_to_15_operand")])))]
+  "TARGET_AVX512F"
+{
+  int mask;
+  mask = INTVAL (operands[3]);
+  mask |= (INTVAL (operands[4]) - 8) << 1;
+  mask |= (INTVAL (operands[5]) - 2) << 2;
+  mask |= (INTVAL (operands[6]) - 10) << 3;
+  mask |= (INTVAL (operands[7]) - 4) << 4;
+  mask |= (INTVAL (operands[8]) - 12) << 5;
+  mask |= (INTVAL (operands[9]) - 6) << 6;
+  mask |= (INTVAL (operands[10]) - 14) << 7;
+  operands[3] = GEN_INT (mask);
+
+  return "vshufpd\t{%3, %2, %1, %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V8DF")])
+
+(define_expand "avx_shufpd256"
+  [(match_operand:V4DF 0 "register_operand")
+   (match_operand:V4DF 1 "register_operand")
+   (match_operand:V4DF 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_AVX"
+{
+  int mask = INTVAL (operands[3]);
+  emit_insn (gen_avx_shufpd256_1 (operands[0], operands[1], operands[2],
+				   GEN_INT (mask & 1),
+				   GEN_INT (mask & 2 ? 5 : 4),
+				   GEN_INT (mask & 4 ? 3 : 2),
+				   GEN_INT (mask & 8 ? 7 : 6)));
+  DONE;
+})
+
+(define_insn "avx_shufpd256_1"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+	(vec_select:V4DF
+	  (vec_concat:V8DF
+	    (match_operand:V4DF 1 "register_operand" "x")
+	    (match_operand:V4DF 2 "nonimmediate_operand" "xm"))
+	  (parallel [(match_operand 3 "const_0_to_1_operand")
+		     (match_operand 4 "const_4_to_5_operand")
+		     (match_operand 5 "const_2_to_3_operand")
+		     (match_operand 6 "const_6_to_7_operand")])))]
+  "TARGET_AVX"
+{
+  int mask;
+  mask = INTVAL (operands[3]);
+  mask |= (INTVAL (operands[4]) - 4) << 1;
+  mask |= (INTVAL (operands[5]) - 2) << 2;
+  mask |= (INTVAL (operands[6]) - 6) << 3;
+  operands[3] = GEN_INT (mask);
+
+  return "vshufpd\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+}
+  [(set_attr "type" "sseshuf")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
+(define_expand "sse2_shufpd"
+  [(match_operand:V2DF 0 "register_operand")
+   (match_operand:V2DF 1 "register_operand")
+   (match_operand:V2DF 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_int_operand")]
+  "TARGET_SSE2"
+{
+  int mask = INTVAL (operands[3]);
+  emit_insn (gen_sse2_shufpd_v2df (operands[0], operands[1], operands[2],
+				GEN_INT (mask & 1),
+				GEN_INT (mask & 2 ? 3 : 2)));
+  DONE;
+})
+
+;; punpcklqdq and punpckhqdq are shorter than shufpd.
+(define_insn "avx2_interleave_highv4di"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(vec_select:V4DI
+	  (vec_concat:V8DI
+	    (match_operand:V4DI 1 "register_operand" "x")
+	    (match_operand:V4DI 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 1)
+		     (const_int 5)
+		     (const_int 3)
+		     (const_int 7)])))]
+  "TARGET_AVX2"
+  "vpunpckhqdq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "<mask_codefor>avx512f_interleave_highv8di<mask_name>"
+  [(set (match_operand:V8DI 0 "register_operand" "=v")
+	(vec_select:V8DI
+	  (vec_concat:V16DI
+	    (match_operand:V8DI 1 "register_operand" "v")
+	    (match_operand:V8DI 2 "nonimmediate_operand" "vm"))
+	  (parallel [(const_int 1) (const_int 9)
+		     (const_int 3) (const_int 11)
+		     (const_int 5) (const_int 13)
+		     (const_int 7) (const_int 15)])))]
+  "TARGET_AVX512F"
+  "vpunpckhqdq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "vec_interleave_highv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(vec_select:V2DI
+	  (vec_concat:V4DI
+	    (match_operand:V2DI 1 "register_operand" "0,x")
+	    (match_operand:V2DI 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(const_int 1)
+		     (const_int 3)])))]
+  "TARGET_SSE2"
+  "@
+   punpckhqdq\t{%2, %0|%0, %2}
+   vpunpckhqdq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx2_interleave_lowv4di"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(vec_select:V4DI
+	  (vec_concat:V8DI
+	    (match_operand:V4DI 1 "register_operand" "x")
+	    (match_operand:V4DI 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0)
+		     (const_int 4)
+		     (const_int 2)
+		     (const_int 6)])))]
+  "TARGET_AVX2"
+  "vpunpcklqdq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "<mask_codefor>avx512f_interleave_lowv8di<mask_name>"
+  [(set (match_operand:V8DI 0 "register_operand" "=v")
+	(vec_select:V8DI
+	  (vec_concat:V16DI
+	    (match_operand:V8DI 1 "register_operand" "v")
+	    (match_operand:V8DI 2 "nonimmediate_operand" "vm"))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 2) (const_int 10)
+		     (const_int 4) (const_int 12)
+		     (const_int 6) (const_int 14)])))]
+  "TARGET_AVX512F"
+  "vpunpcklqdq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "vec_interleave_lowv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(vec_select:V2DI
+	  (vec_concat:V4DI
+	    (match_operand:V2DI 1 "register_operand" "0,x")
+	    (match_operand:V2DI 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(const_int 0)
+		     (const_int 2)])))]
+  "TARGET_SSE2"
+  "@
+   punpcklqdq\t{%2, %0|%0, %2}
+   vpunpcklqdq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse2_shufpd_<mode>"
+  [(set (match_operand:VI8F_128 0 "register_operand" "=x,x")
+	(vec_select:VI8F_128
+	  (vec_concat:<ssedoublevecmode>
+	    (match_operand:VI8F_128 1 "register_operand" "0,x")
+	    (match_operand:VI8F_128 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(match_operand 3 "const_0_to_1_operand")
+		     (match_operand 4 "const_2_to_3_operand")])))]
+  "TARGET_SSE2"
+{
+  int mask;
+  mask = INTVAL (operands[3]);
+  mask |= (INTVAL (operands[4]) - 2) << 1;
+  operands[3] = GEN_INT (mask);
+
+  switch (which_alternative)
+    {
+    case 0:
+      return "shufpd\t{%3, %2, %0|%0, %2, %3}";
+    case 1:
+      return "vshufpd\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseshuf")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V2DF")])
+
+;; Avoid combining registers from different units in a single alternative,
+;; see comment above inline_secondary_memory_needed function in i386.c
+(define_insn "sse2_storehpd"
+  [(set (match_operand:DF 0 "nonimmediate_operand"     "=m,x,x,x,*f,r")
+	(vec_select:DF
+	  (match_operand:V2DF 1 "nonimmediate_operand" " x,0,x,o,o,o")
+	  (parallel [(const_int 1)])))]
+  "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "@
+   %vmovhpd\t{%1, %0|%0, %1}
+   unpckhpd\t%0, %0
+   vunpckhpd\t{%d1, %0|%0, %d1}
+   #
+   #
+   #"
+  [(set_attr "isa" "*,noavx,avx,*,*,*")
+   (set_attr "type" "ssemov,sselog1,sselog1,ssemov,fmov,imov")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (and (eq_attr "alternative" "0")
+	    (not (match_test "TARGET_AVX")))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "prefix" "maybe_vex,orig,vex,*,*,*")
+   (set_attr "mode" "V1DF,V1DF,V2DF,DF,DF,DF")])
+
+(define_split
+  [(set (match_operand:DF 0 "register_operand")
+	(vec_select:DF
+	  (match_operand:V2DF 1 "memory_operand")
+	  (parallel [(const_int 1)])))]
+  "TARGET_SSE2 && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  "operands[1] = adjust_address (operands[1], DFmode, 8);")
+
+(define_insn "*vec_extractv2df_1_sse"
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=m,x,x")
+	(vec_select:DF
+	  (match_operand:V2DF 1 "nonimmediate_operand" "x,x,o")
+	  (parallel [(const_int 1)])))]
+  "!TARGET_SSE2 && TARGET_SSE
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "@
+   movhps\t{%1, %0|%q0, %1}
+   movhlps\t{%1, %0|%0, %1}
+   movlps\t{%H1, %0|%0, %H1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "mode" "V2SF,V4SF,V2SF")])
+
+;; Avoid combining registers from different units in a single alternative,
+;; see comment above inline_secondary_memory_needed function in i386.c
+(define_insn "sse2_storelpd"
+  [(set (match_operand:DF 0 "nonimmediate_operand"     "=m,x,x,*f,r")
+	(vec_select:DF
+	  (match_operand:V2DF 1 "nonimmediate_operand" " x,x,m,m,m")
+	  (parallel [(const_int 0)])))]
+  "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "@
+   %vmovlpd\t{%1, %0|%0, %1}
+   #
+   #
+   #
+   #"
+  [(set_attr "type" "ssemov,ssemov,ssemov,fmov,imov")
+   (set_attr "prefix_data16" "1,*,*,*,*")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "V1DF,DF,DF,DF,DF")])
+
+(define_split
+  [(set (match_operand:DF 0 "register_operand")
+	(vec_select:DF
+	  (match_operand:V2DF 1 "nonimmediate_operand")
+	  (parallel [(const_int 0)])))]
+  "TARGET_SSE2 && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  if (REG_P (operands[1]))
+    operands[1] = gen_rtx_REG (DFmode, REGNO (operands[1]));
+  else
+    operands[1] = adjust_address (operands[1], DFmode, 0);
+})
+
+(define_insn "*vec_extractv2df_0_sse"
+  [(set (match_operand:DF 0 "nonimmediate_operand" "=m,x,x")
+	(vec_select:DF
+	  (match_operand:V2DF 1 "nonimmediate_operand" "x,x,m")
+	  (parallel [(const_int 0)])))]
+  "!TARGET_SSE2 && TARGET_SSE
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "@
+   movlps\t{%1, %0|%0, %1}
+   movaps\t{%1, %0|%0, %1}
+   movlps\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "mode" "V2SF,V4SF,V2SF")])
+
+(define_expand "sse2_loadhpd_exp"
+  [(set (match_operand:V2DF 0 "nonimmediate_operand")
+	(vec_concat:V2DF
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "nonimmediate_operand")
+	    (parallel [(const_int 0)]))
+	  (match_operand:DF 2 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+{
+  rtx dst = ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);
+
+  emit_insn (gen_sse2_loadhpd (dst, operands[1], operands[2]));
+
+  /* Fix up the destination if needed.  */
+  if (dst != operands[0])
+    emit_move_insn (operands[0], dst);
+
+  DONE;
+})
+
+;; Avoid combining registers from different units in a single alternative,
+;; see comment above inline_secondary_memory_needed function in i386.c
+(define_insn "sse2_loadhpd"
+  [(set (match_operand:V2DF 0 "nonimmediate_operand"
+	  "=x,x,x,x,o,o ,o")
+	(vec_concat:V2DF
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "nonimmediate_operand"
+	  " 0,x,0,x,0,0 ,0")
+	    (parallel [(const_int 0)]))
+	  (match_operand:DF 2 "nonimmediate_operand"
+	  " m,m,x,x,x,*f,r")))]
+  "TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   movhpd\t{%2, %0|%0, %2}
+   vmovhpd\t{%2, %1, %0|%0, %1, %2}
+   unpcklpd\t{%2, %0|%0, %2}
+   vunpcklpd\t{%2, %1, %0|%0, %1, %2}
+   #
+   #
+   #"
+  [(set_attr "isa" "noavx,avx,noavx,avx,*,*,*")
+   (set_attr "type" "ssemov,ssemov,sselog,sselog,ssemov,fmov,imov")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix_data16" "1,*,*,*,*,*,*")
+   (set_attr "prefix" "orig,vex,orig,vex,*,*,*")
+   (set_attr "mode" "V1DF,V1DF,V2DF,V2DF,DF,DF,DF")])
+
+(define_split
+  [(set (match_operand:V2DF 0 "memory_operand")
+	(vec_concat:V2DF
+	  (vec_select:DF (match_dup 0) (parallel [(const_int 0)]))
+	  (match_operand:DF 1 "register_operand")))]
+  "TARGET_SSE2 && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  "operands[0] = adjust_address (operands[0], DFmode, 8);")
+
+(define_expand "sse2_loadlpd_exp"
+  [(set (match_operand:V2DF 0 "nonimmediate_operand")
+	(vec_concat:V2DF
+	  (match_operand:DF 2 "nonimmediate_operand")
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "nonimmediate_operand")
+	    (parallel [(const_int 1)]))))]
+  "TARGET_SSE2"
+{
+  rtx dst = ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);
+
+  emit_insn (gen_sse2_loadlpd (dst, operands[1], operands[2]));
+
+  /* Fix up the destination if needed.  */
+  if (dst != operands[0])
+    emit_move_insn (operands[0], dst);
+
+  DONE;
+})
+
+;; Avoid combining registers from different units in a single alternative,
+;; see comment above inline_secondary_memory_needed function in i386.c
+(define_insn "sse2_loadlpd"
+  [(set (match_operand:V2DF 0 "nonimmediate_operand"
+	  "=x,x,x,x,x,x,x,x,m,m ,m")
+	(vec_concat:V2DF
+	  (match_operand:DF 2 "nonimmediate_operand"
+	  " m,m,m,x,x,0,0,x,x,*f,r")
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "vector_move_operand"
+	  " C,0,x,0,x,x,o,o,0,0 ,0")
+	    (parallel [(const_int 1)]))))]
+  "TARGET_SSE2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   %vmovsd\t{%2, %0|%0, %2}
+   movlpd\t{%2, %0|%0, %2}
+   vmovlpd\t{%2, %1, %0|%0, %1, %2}
+   movsd\t{%2, %0|%0, %2}
+   vmovsd\t{%2, %1, %0|%0, %1, %2}
+   shufpd\t{$2, %1, %0|%0, %1, 2}
+   movhpd\t{%H1, %0|%0, %H1}
+   vmovhpd\t{%H1, %2, %0|%0, %2, %H1}
+   #
+   #
+   #"
+  [(set_attr "isa" "*,noavx,avx,noavx,avx,noavx,noavx,avx,*,*,*")
+   (set (attr "type")
+     (cond [(eq_attr "alternative" "5")
+	      (const_string "sselog")
+	    (eq_attr "alternative" "9")
+	      (const_string "fmov")
+	    (eq_attr "alternative" "10")
+	      (const_string "imov")
+	   ]
+	   (const_string "ssemov")))
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix_data16" "*,1,*,*,*,*,1,*,*,*,*")
+   (set_attr "length_immediate" "*,*,*,*,*,1,*,*,*,*,*")
+   (set_attr "prefix" "maybe_vex,orig,vex,orig,vex,orig,orig,vex,*,*,*")
+   (set_attr "mode" "DF,V1DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,DF,DF,DF")])
+
+(define_split
+  [(set (match_operand:V2DF 0 "memory_operand")
+	(vec_concat:V2DF
+	  (match_operand:DF 1 "register_operand")
+	  (vec_select:DF (match_dup 0) (parallel [(const_int 1)]))))]
+  "TARGET_SSE2 && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  "operands[0] = adjust_address (operands[0], DFmode, 0);")
+
+(define_insn "sse2_movsd"
+  [(set (match_operand:V2DF 0 "nonimmediate_operand"   "=x,x,x,x,m,x,x,x,o")
+	(vec_merge:V2DF
+	  (match_operand:V2DF 2 "nonimmediate_operand" " x,x,m,m,x,0,0,x,0")
+	  (match_operand:V2DF 1 "nonimmediate_operand" " 0,x,0,x,0,x,o,o,x")
+	  (const_int 1)))]
+  "TARGET_SSE2"
+  "@
+   movsd\t{%2, %0|%0, %2}
+   vmovsd\t{%2, %1, %0|%0, %1, %2}
+   movlpd\t{%2, %0|%0, %q2}
+   vmovlpd\t{%2, %1, %0|%0, %1, %q2}
+   %vmovlpd\t{%2, %0|%q0, %2}
+   shufpd\t{$2, %1, %0|%0, %1, 2}
+   movhps\t{%H1, %0|%0, %H1}
+   vmovhps\t{%H1, %2, %0|%0, %2, %H1}
+   %vmovhps\t{%1, %H0|%H0, %1}"
+  [(set_attr "isa" "noavx,avx,noavx,avx,*,noavx,noavx,avx,*")
+   (set (attr "type")
+     (if_then_else
+       (eq_attr "alternative" "5")
+       (const_string "sselog")
+       (const_string "ssemov")))
+   (set (attr "prefix_data16")
+     (if_then_else
+       (and (eq_attr "alternative" "2,4")
+	    (not (match_test "TARGET_AVX")))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "length_immediate" "*,*,*,*,*,1,*,*,*")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix" "orig,vex,orig,vex,maybe_vex,orig,orig,vex,maybe_vex")
+   (set_attr "mode" "DF,DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,V1DF")])
+
+(define_insn "vec_dupv2df"
+  [(set (match_operand:V2DF 0 "register_operand"     "=x,x")
+	(vec_duplicate:V2DF
+	  (match_operand:DF 1 "nonimmediate_operand" " 0,xm")))]
+  "TARGET_SSE2"
+  "@
+   unpcklpd\t%0, %0
+   %vmovddup\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "noavx,sse3")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix" "orig,maybe_vex")
+   (set_attr "mode" "V2DF,DF")])
+
+(define_insn "*vec_concatv2df"
+  [(set (match_operand:V2DF 0 "register_operand"     "=x,x,x,x,x,x,x,x")
+	(vec_concat:V2DF
+	  (match_operand:DF 1 "nonimmediate_operand" " 0,x,m,0,x,m,0,0")
+	  (match_operand:DF 2 "vector_move_operand"  " x,x,1,m,m,C,x,m")))]
+  "TARGET_SSE"
+  "@
+   unpcklpd\t{%2, %0|%0, %2}
+   vunpcklpd\t{%2, %1, %0|%0, %1, %2}
+   %vmovddup\t{%1, %0|%0, %1}
+   movhpd\t{%2, %0|%0, %2}
+   vmovhpd\t{%2, %1, %0|%0, %1, %2}
+   %vmovsd\t{%1, %0|%0, %1}
+   movlhps\t{%2, %0|%0, %2}
+   movhps\t{%2, %0|%0, %2}"
+  [(set_attr "isa" "sse2_noavx,avx,sse3,sse2_noavx,avx,sse2,noavx,noavx")
+   (set (attr "type")
+     (if_then_else
+       (eq_attr "alternative" "0,1,2")
+       (const_string "sselog")
+       (const_string "ssemov")))
+   (set_attr "prefix_data16" "*,*,*,1,*,*,*,*")
+   (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex,orig,orig")
+   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,DF,V4SF,V2SF")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel integer down-conversion operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_mode_iterator PMOV_DST_MODE [V16QI V16HI V8SI V8HI])
+(define_mode_attr pmov_src_mode
+  [(V16QI "V16SI") (V16HI "V16SI") (V8SI "V8DI") (V8HI "V8DI")])
+(define_mode_attr pmov_src_lower
+  [(V16QI "v16si") (V16HI "v16si") (V8SI "v8di") (V8HI "v8di")])
+(define_mode_attr pmov_suff
+  [(V16QI "db") (V16HI "dw") (V8SI "qd") (V8HI "qw")])
+
+(define_insn "*avx512f_<code><pmov_src_lower><mode>2"
+  [(set (match_operand:PMOV_DST_MODE 0 "nonimmediate_operand" "=v,m")
+	(any_truncate:PMOV_DST_MODE
+	  (match_operand:<pmov_src_mode> 1 "register_operand" "v,v")))]
+  "TARGET_AVX512F"
+  "vpmov<trunsuffix><pmov_suff>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "memory" "none,store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_<code><pmov_src_lower><mode>2_mask"
+  [(set (match_operand:PMOV_DST_MODE 0 "nonimmediate_operand" "=v,m")
+    (vec_merge:PMOV_DST_MODE
+      (any_truncate:PMOV_DST_MODE
+        (match_operand:<pmov_src_mode> 1 "register_operand" "v,v"))
+      (match_operand:PMOV_DST_MODE 2 "vector_move_operand" "0C,0")
+      (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512F"
+  "vpmov<trunsuffix><pmov_suff>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "memory" "none,store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_<code><pmov_src_lower><mode>2_mask_store"
+  [(set (match_operand:PMOV_DST_MODE 0 "memory_operand")
+    (vec_merge:PMOV_DST_MODE
+      (any_truncate:PMOV_DST_MODE
+        (match_operand:<pmov_src_mode> 1 "register_operand"))
+      (match_dup 0)
+      (match_operand:<avx512fmaskmode> 2 "register_operand")))]
+  "TARGET_AVX512F")
+
+(define_insn "*avx512f_<code>v8div16qi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=v")
+	(vec_concat:V16QI
+	  (any_truncate:V8QI
+	    (match_operand:V8DI 1 "register_operand" "v"))
+	  (const_vector:V8QI [(const_int 0) (const_int 0)
+			      (const_int 0) (const_int 0)
+			      (const_int 0) (const_int 0)
+			      (const_int 0) (const_int 0)])))]
+  "TARGET_AVX512F"
+  "vpmov<trunsuffix>qb\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*avx512f_<code>v8div16qi2_store"
+  [(set (match_operand:V16QI 0 "memory_operand" "=m")
+	(vec_concat:V16QI
+	  (any_truncate:V8QI
+	    (match_operand:V8DI 1 "register_operand" "v"))
+	  (vec_select:V8QI
+	    (match_dup 0)
+	    (parallel [(const_int 8) (const_int 9)
+		       (const_int 10) (const_int 11)
+		       (const_int 12) (const_int 13)
+		       (const_int 14) (const_int 15)]))))]
+  "TARGET_AVX512F"
+  "vpmov<trunsuffix>qb\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "memory" "store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx512f_<code>v8div16qi2_mask"
+  [(set (match_operand:V16QI 0 "register_operand" "=v")
+    (vec_concat:V16QI
+      (vec_merge:V8QI
+        (any_truncate:V8QI
+          (match_operand:V8DI 1 "register_operand" "v"))
+        (vec_select:V8QI
+          (match_operand:V16QI 2 "vector_move_operand" "0C")
+          (parallel [(const_int 0) (const_int 1)
+                     (const_int 2) (const_int 3)
+                     (const_int 4) (const_int 5)
+                     (const_int 6) (const_int 7)]))
+        (match_operand:QI 3 "register_operand" "Yk"))
+      (const_vector:V8QI [(const_int 0) (const_int 0)
+                          (const_int 0) (const_int 0)
+                          (const_int 0) (const_int 0)
+                          (const_int 0) (const_int 0)])))]
+  "TARGET_AVX512F"
+  "vpmov<trunsuffix>qb\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx512f_<code>v8div16qi2_mask_store"
+  [(set (match_operand:V16QI 0 "memory_operand" "=m")
+    (vec_concat:V16QI
+      (vec_merge:V8QI
+        (any_truncate:V8QI
+          (match_operand:V8DI 1 "register_operand" "v"))
+        (vec_select:V8QI
+          (match_dup 0)
+          (parallel [(const_int 0) (const_int 1)
+                     (const_int 2) (const_int 3)
+                     (const_int 4) (const_int 5)
+                     (const_int 6) (const_int 7)]))
+        (match_operand:QI 2 "register_operand" "Yk"))
+      (vec_select:V8QI
+        (match_dup 0)
+        (parallel [(const_int 8) (const_int 9)
+                   (const_int 10) (const_int 11)
+                   (const_int 12) (const_int 13)
+                   (const_int 14) (const_int 15)]))))]
+  "TARGET_AVX512F"
+  "vpmov<trunsuffix>qb\t{%1, %0%{%2%}|%0%{%2%}, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "memory" "store")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel integral arithmetic
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "neg<mode>2"
+  [(set (match_operand:VI_AVX2 0 "register_operand")
+	(minus:VI_AVX2
+	  (match_dup 2)
+	  (match_operand:VI_AVX2 1 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+  "operands[2] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));")
+
+(define_expand "<plusminus_insn><mode>3<mask_name>"
+  [(set (match_operand:VI_AVX2 0 "register_operand")
+	(plusminus:VI_AVX2
+	  (match_operand:VI_AVX2 1 "nonimmediate_operand")
+	  (match_operand:VI_AVX2 2 "nonimmediate_operand")))]
+  "TARGET_SSE2 && <mask_mode512bit_condition>"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+(define_insn "*<plusminus_insn><mode>3<mask_name>"
+  [(set (match_operand:VI_AVX2 0 "register_operand" "=x,v")
+	(plusminus:VI_AVX2
+	  (match_operand:VI_AVX2 1 "nonimmediate_operand" "<comm>0,v")
+	  (match_operand:VI_AVX2 2 "nonimmediate_operand" "xm,vm")))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands) && <mask_mode512bit_condition>"
+  "@
+   p<plusminus_mnemonic><ssemodesuffix>\t{%2, %0|%0, %2}
+   vp<plusminus_mnemonic><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "<mask_prefix3>")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "<sse2_avx2>_<plusminus_insn><mode>3"
+  [(set (match_operand:VI12_AVX2 0 "register_operand")
+	(sat_plusminus:VI12_AVX2
+	  (match_operand:VI12_AVX2 1 "nonimmediate_operand")
+	  (match_operand:VI12_AVX2 2 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+(define_insn "*<sse2_avx2>_<plusminus_insn><mode>3"
+  [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,v")
+	(sat_plusminus:VI12_AVX2
+	  (match_operand:VI12_AVX2 1 "nonimmediate_operand" "<comm>0,v")
+	  (match_operand:VI12_AVX2 2 "nonimmediate_operand" "xm,vm")))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "@
+   p<plusminus_mnemonic><ssemodesuffix>\t{%2, %0|%0, %2}
+   vp<plusminus_mnemonic><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "mul<mode>3"
+  [(set (match_operand:VI1_AVX2 0 "register_operand")
+	(mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand")
+		       (match_operand:VI1_AVX2 2 "register_operand")))]
+  "TARGET_SSE2"
+{
+  ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+(define_expand "mul<mode>3"
+  [(set (match_operand:VI2_AVX2 0 "register_operand")
+	(mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "nonimmediate_operand")
+		       (match_operand:VI2_AVX2 2 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+  "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
+
+(define_insn "*mul<mode>3"
+  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x")
+	(mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "nonimmediate_operand" "%0,x")
+		       (match_operand:VI2_AVX2 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
+  "@
+   pmullw\t{%2, %0|%0, %2}
+   vpmullw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseimul")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "<s>mul<mode>3_highpart"
+  [(set (match_operand:VI2_AVX2 0 "register_operand")
+	(truncate:VI2_AVX2
+	  (lshiftrt:<ssedoublemode>
+	    (mult:<ssedoublemode>
+	      (any_extend:<ssedoublemode>
+		(match_operand:VI2_AVX2 1 "nonimmediate_operand"))
+	      (any_extend:<ssedoublemode>
+		(match_operand:VI2_AVX2 2 "nonimmediate_operand")))
+	    (const_int 16))))]
+  "TARGET_SSE2"
+  "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
+
+(define_insn "*<s>mul<mode>3_highpart"
+  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x")
+	(truncate:VI2_AVX2
+	  (lshiftrt:<ssedoublemode>
+	    (mult:<ssedoublemode>
+	      (any_extend:<ssedoublemode>
+		(match_operand:VI2_AVX2 1 "nonimmediate_operand" "%0,x"))
+	      (any_extend:<ssedoublemode>
+		(match_operand:VI2_AVX2 2 "nonimmediate_operand" "xm,xm")))
+	    (const_int 16))))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
+  "@
+   pmulh<u>w\t{%2, %0|%0, %2}
+   vpmulh<u>w\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseimul")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "vec_widen_umult_even_v16si<mask_name>"
+  [(set (match_operand:V8DI 0 "register_operand")
+        (mult:V8DI
+          (zero_extend:V8DI
+            (vec_select:V8SI
+              (match_operand:V16SI 1 "nonimmediate_operand")
+              (parallel [(const_int 0) (const_int 2)
+                         (const_int 4) (const_int 6)
+                         (const_int 8) (const_int 10)
+                         (const_int 12) (const_int 14)])))
+          (zero_extend:V8DI
+            (vec_select:V8SI
+              (match_operand:V16SI 2 "nonimmediate_operand")
+              (parallel [(const_int 0) (const_int 2)
+                         (const_int 4) (const_int 6)
+                         (const_int 8) (const_int 10)
+                         (const_int 12) (const_int 14)])))))]
+  "TARGET_AVX512F"
+  "ix86_fixup_binary_operands_no_copy (MULT, V16SImode, operands);")
+
+(define_insn "*vec_widen_umult_even_v16si<mask_name>"
+  [(set (match_operand:V8DI 0 "register_operand" "=v")
+        (mult:V8DI
+          (zero_extend:V8DI
+            (vec_select:V8SI
+              (match_operand:V16SI 1 "nonimmediate_operand" "%v")
+              (parallel [(const_int 0) (const_int 2)
+                         (const_int 4) (const_int 6)
+                         (const_int 8) (const_int 10)
+                         (const_int 12) (const_int 14)])))
+          (zero_extend:V8DI
+            (vec_select:V8SI
+              (match_operand:V16SI 2 "nonimmediate_operand" "vm")
+              (parallel [(const_int 0) (const_int 2)
+                         (const_int 4) (const_int 6)
+                         (const_int 8) (const_int 10)
+                         (const_int 12) (const_int 14)])))))]
+  "TARGET_AVX512F && ix86_binary_operator_ok (MULT, V16SImode, operands)"
+  "vpmuludq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "isa" "avx512f")
+   (set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_expand "vec_widen_umult_even_v8si"
+  [(set (match_operand:V4DI 0 "register_operand")
+	(mult:V4DI
+	  (zero_extend:V4DI
+	    (vec_select:V4SI
+	      (match_operand:V8SI 1 "nonimmediate_operand")
+	      (parallel [(const_int 0) (const_int 2)
+			 (const_int 4) (const_int 6)])))
+	  (zero_extend:V4DI
+	    (vec_select:V4SI
+	      (match_operand:V8SI 2 "nonimmediate_operand")
+	      (parallel [(const_int 0) (const_int 2)
+			 (const_int 4) (const_int 6)])))))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (MULT, V8SImode, operands);")
+
+(define_insn "*vec_widen_umult_even_v8si"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(mult:V4DI
+	  (zero_extend:V4DI
+	    (vec_select:V4SI
+	      (match_operand:V8SI 1 "nonimmediate_operand" "%x")
+	      (parallel [(const_int 0) (const_int 2)
+			 (const_int 4) (const_int 6)])))
+	  (zero_extend:V4DI
+	    (vec_select:V4SI
+	      (match_operand:V8SI 2 "nonimmediate_operand" "xm")
+	      (parallel [(const_int 0) (const_int 2)
+			 (const_int 4) (const_int 6)])))))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V8SImode, operands)"
+  "vpmuludq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_expand "vec_widen_umult_even_v4si"
+  [(set (match_operand:V2DI 0 "register_operand")
+	(mult:V2DI
+	  (zero_extend:V2DI
+	    (vec_select:V2SI
+	      (match_operand:V4SI 1 "nonimmediate_operand")
+	      (parallel [(const_int 0) (const_int 2)])))
+	  (zero_extend:V2DI
+	    (vec_select:V2SI
+	      (match_operand:V4SI 2 "nonimmediate_operand")
+	      (parallel [(const_int 0) (const_int 2)])))))]
+  "TARGET_SSE2"
+  "ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);")
+
+(define_insn "*vec_widen_umult_even_v4si"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(mult:V2DI
+	  (zero_extend:V2DI
+	    (vec_select:V2SI
+	      (match_operand:V4SI 1 "nonimmediate_operand" "%0,x")
+	      (parallel [(const_int 0) (const_int 2)])))
+	  (zero_extend:V2DI
+	    (vec_select:V2SI
+	      (match_operand:V4SI 2 "nonimmediate_operand" "xm,xm")
+	      (parallel [(const_int 0) (const_int 2)])))))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V4SImode, operands)"
+  "@
+   pmuludq\t{%2, %0|%0, %2}
+   vpmuludq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseimul")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "vec_widen_smult_even_v16si<mask_name>"
+  [(set (match_operand:V8DI 0 "register_operand")
+        (mult:V8DI
+          (sign_extend:V8DI
+            (vec_select:V8SI
+              (match_operand:V16SI 1 "nonimmediate_operand")
+              (parallel [(const_int 0) (const_int 2)
+                         (const_int 4) (const_int 6)
+                         (const_int 8) (const_int 10)
+                         (const_int 12) (const_int 14)])))
+          (sign_extend:V8DI
+            (vec_select:V8SI
+              (match_operand:V16SI 2 "nonimmediate_operand")
+              (parallel [(const_int 0) (const_int 2)
+                         (const_int 4) (const_int 6)
+                         (const_int 8) (const_int 10)
+                         (const_int 12) (const_int 14)])))))]
+  "TARGET_AVX512F"
+  "ix86_fixup_binary_operands_no_copy (MULT, V16SImode, operands);")
+
+(define_insn "*vec_widen_smult_even_v16si<mask_name>"
+  [(set (match_operand:V8DI 0 "register_operand" "=v")
+        (mult:V8DI
+          (sign_extend:V8DI
+            (vec_select:V8SI
+              (match_operand:V16SI 1 "nonimmediate_operand" "%v")
+              (parallel [(const_int 0) (const_int 2)
+                         (const_int 4) (const_int 6)
+                         (const_int 8) (const_int 10)
+                         (const_int 12) (const_int 14)])))
+          (sign_extend:V8DI
+            (vec_select:V8SI
+              (match_operand:V16SI 2 "nonimmediate_operand" "vm")
+              (parallel [(const_int 0) (const_int 2)
+                         (const_int 4) (const_int 6)
+                         (const_int 8) (const_int 10)
+                         (const_int 12) (const_int 14)])))))]
+  "TARGET_AVX512F && ix86_binary_operator_ok (MULT, V16SImode, operands)"
+  "vpmuldq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "isa" "avx512f")
+   (set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_expand "vec_widen_smult_even_v8si"
+  [(set (match_operand:V4DI 0 "register_operand")
+	(mult:V4DI
+	  (sign_extend:V4DI
+	    (vec_select:V4SI
+	      (match_operand:V8SI 1 "nonimmediate_operand")
+	      (parallel [(const_int 0) (const_int 2)
+			 (const_int 4) (const_int 6)])))
+	  (sign_extend:V4DI
+	    (vec_select:V4SI
+	      (match_operand:V8SI 2 "nonimmediate_operand")
+	      (parallel [(const_int 0) (const_int 2)
+			 (const_int 4) (const_int 6)])))))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (MULT, V8SImode, operands);")
+
+(define_insn "*vec_widen_smult_even_v8si"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(mult:V4DI
+	  (sign_extend:V4DI
+	    (vec_select:V4SI
+	      (match_operand:V8SI 1 "nonimmediate_operand" "x")
+	      (parallel [(const_int 0) (const_int 2)
+			 (const_int 4) (const_int 6)])))
+	  (sign_extend:V4DI
+	    (vec_select:V4SI
+	      (match_operand:V8SI 2 "nonimmediate_operand" "xm")
+	      (parallel [(const_int 0) (const_int 2)
+			 (const_int 4) (const_int 6)])))))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V8SImode, operands)"
+  "vpmuldq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_expand "sse4_1_mulv2siv2di3"
+  [(set (match_operand:V2DI 0 "register_operand")
+	(mult:V2DI
+	  (sign_extend:V2DI
+	    (vec_select:V2SI
+	      (match_operand:V4SI 1 "nonimmediate_operand")
+	      (parallel [(const_int 0) (const_int 2)])))
+	  (sign_extend:V2DI
+	    (vec_select:V2SI
+	      (match_operand:V4SI 2 "nonimmediate_operand")
+	      (parallel [(const_int 0) (const_int 2)])))))]
+  "TARGET_SSE4_1"
+  "ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);")
+
+(define_insn "*sse4_1_mulv2siv2di3"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(mult:V2DI
+	  (sign_extend:V2DI
+	    (vec_select:V2SI
+	      (match_operand:V4SI 1 "nonimmediate_operand" "%0,x")
+	      (parallel [(const_int 0) (const_int 2)])))
+	  (sign_extend:V2DI
+	    (vec_select:V2SI
+	      (match_operand:V4SI 2 "nonimmediate_operand" "xm,xm")
+	      (parallel [(const_int 0) (const_int 2)])))))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, V4SImode, operands)"
+  "@
+   pmuldq\t{%2, %0|%0, %2}
+   vpmuldq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseimul")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "avx2_pmaddwd"
+  [(set (match_operand:V8SI 0 "register_operand")
+	(plus:V8SI
+	  (mult:V8SI
+	    (sign_extend:V8SI
+	      (vec_select:V8HI
+		(match_operand:V16HI 1 "nonimmediate_operand")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)
+			   (const_int 8) (const_int 10)
+			   (const_int 12) (const_int 14)])))
+	    (sign_extend:V8SI
+	      (vec_select:V8HI
+		(match_operand:V16HI 2 "nonimmediate_operand")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)
+			   (const_int 8) (const_int 10)
+			   (const_int 12) (const_int 14)]))))
+	  (mult:V8SI
+	    (sign_extend:V8SI
+	      (vec_select:V8HI (match_dup 1)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)
+			   (const_int 9) (const_int 11)
+			   (const_int 13) (const_int 15)])))
+	    (sign_extend:V8SI
+	      (vec_select:V8HI (match_dup 2)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)
+			   (const_int 9) (const_int 11)
+			   (const_int 13) (const_int 15)]))))))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (MULT, V16HImode, operands);")
+
+(define_insn "*avx2_pmaddwd"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+	(plus:V8SI
+	  (mult:V8SI
+	    (sign_extend:V8SI
+	      (vec_select:V8HI
+		(match_operand:V16HI 1 "nonimmediate_operand" "%x")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)
+			   (const_int 8) (const_int 10)
+			   (const_int 12) (const_int 14)])))
+	    (sign_extend:V8SI
+	      (vec_select:V8HI
+		(match_operand:V16HI 2 "nonimmediate_operand" "xm")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)
+			   (const_int 8) (const_int 10)
+			   (const_int 12) (const_int 14)]))))
+	  (mult:V8SI
+	    (sign_extend:V8SI
+	      (vec_select:V8HI (match_dup 1)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)
+			   (const_int 9) (const_int 11)
+			   (const_int 13) (const_int 15)])))
+	    (sign_extend:V8SI
+	      (vec_select:V8HI (match_dup 2)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)
+			   (const_int 9) (const_int 11)
+			   (const_int 13) (const_int 15)]))))))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V16HImode, operands)"
+  "vpmaddwd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_expand "sse2_pmaddwd"
+  [(set (match_operand:V4SI 0 "register_operand")
+	(plus:V4SI
+	  (mult:V4SI
+	    (sign_extend:V4SI
+	      (vec_select:V4HI
+		(match_operand:V8HI 1 "nonimmediate_operand")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)])))
+	    (sign_extend:V4SI
+	      (vec_select:V4HI
+		(match_operand:V8HI 2 "nonimmediate_operand")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)]))))
+	  (mult:V4SI
+	    (sign_extend:V4SI
+	      (vec_select:V4HI (match_dup 1)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)])))
+	    (sign_extend:V4SI
+	      (vec_select:V4HI (match_dup 2)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)]))))))]
+  "TARGET_SSE2"
+  "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);")
+
+(define_insn "*sse2_pmaddwd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x")
+	(plus:V4SI
+	  (mult:V4SI
+	    (sign_extend:V4SI
+	      (vec_select:V4HI
+		(match_operand:V8HI 1 "nonimmediate_operand" "%0,x")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)])))
+	    (sign_extend:V4SI
+	      (vec_select:V4HI
+		(match_operand:V8HI 2 "nonimmediate_operand" "xm,xm")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)]))))
+	  (mult:V4SI
+	    (sign_extend:V4SI
+	      (vec_select:V4HI (match_dup 1)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)])))
+	    (sign_extend:V4SI
+	      (vec_select:V4HI (match_dup 2)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)]))))))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)"
+  "@
+   pmaddwd\t{%2, %0|%0, %2}
+   vpmaddwd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "atom_unit" "simul")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "mul<mode>3<mask_name>"
+  [(set (match_operand:VI4_AVX512F 0 "register_operand")
+	(mult:VI4_AVX512F
+	  (match_operand:VI4_AVX512F 1 "general_vector_operand")
+	  (match_operand:VI4_AVX512F 2 "general_vector_operand")))]
+  "TARGET_SSE2 && <mask_mode512bit_condition>"
+{
+  if (TARGET_SSE4_1)
+    {
+      if (!nonimmediate_operand (operands[1], <MODE>mode))
+	operands[1] = force_reg (<MODE>mode, operands[1]);
+      if (!nonimmediate_operand (operands[2], <MODE>mode))
+	operands[2] = force_reg (<MODE>mode, operands[2]);
+      ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+    }
+  else
+    {
+      ix86_expand_sse2_mulv4si3 (operands[0], operands[1], operands[2]);
+      DONE;
+    }
+})
+
+(define_insn "*<sse4_1_avx2>_mul<mode>3<mask_name>"
+  [(set (match_operand:VI4_AVX512F 0 "register_operand" "=x,v")
+	(mult:VI4_AVX512F
+	  (match_operand:VI4_AVX512F 1 "nonimmediate_operand" "%0,v")
+	  (match_operand:VI4_AVX512F 2 "nonimmediate_operand" "xm,vm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, <MODE>mode, operands) && <mask_mode512bit_condition>"
+  "@
+   pmulld\t{%2, %0|%0, %2}
+   vpmulld\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "<mask_prefix3>")
+   (set_attr "btver2_decode" "vector,vector")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "mul<mode>3"
+  [(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand")
+	(mult:VI8_AVX2_AVX512F
+	  (match_operand:VI8_AVX2_AVX512F 1 "register_operand")
+	  (match_operand:VI8_AVX2_AVX512F 2 "register_operand")))]
+  "TARGET_SSE2"
+{
+  ix86_expand_sse2_mulvxdi3 (operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+(define_expand "vec_widen_<s>mult_hi_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI124_AVX2 1 "register_operand"))
+   (match_operand:VI124_AVX2 2 "register_operand")]
+  "TARGET_SSE2"
+{
+  ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
+			      <u_bool>, true);
+  DONE;
+})
+
+(define_expand "vec_widen_<s>mult_lo_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI124_AVX2 1 "register_operand"))
+   (match_operand:VI124_AVX2 2 "register_operand")]
+  "TARGET_SSE2"
+{
+  ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
+			      <u_bool>, false);
+  DONE;
+})
+
+;; Most widen_<s>mult_even_<mode> can be handled directly from other
+;; named patterns, but signed V4SI needs special help for plain SSE2.
+(define_expand "vec_widen_smult_even_v4si"
+  [(match_operand:V2DI 0 "register_operand")
+   (match_operand:V4SI 1 "nonimmediate_operand")
+   (match_operand:V4SI 2 "nonimmediate_operand")]
+  "TARGET_SSE2"
+{
+  ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
+				 false, false);
+  DONE;
+})
+
+(define_expand "vec_widen_<s>mult_odd_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI4_AVX512F 1 "general_vector_operand"))
+   (match_operand:VI4_AVX512F 2 "general_vector_operand")]
+  "TARGET_SSE2"
+{
+  ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
+				 <u_bool>, true);
+  DONE;
+})
+
+(define_expand "sdot_prod<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand")
+   (match_operand:VI2_AVX2 1 "register_operand")
+   (match_operand:VI2_AVX2 2 "register_operand")
+   (match_operand:<sseunpackmode> 3 "register_operand")]
+  "TARGET_SSE2"
+{
+  rtx t = gen_reg_rtx (<sseunpackmode>mode);
+  emit_insn (gen_<sse2_avx2>_pmaddwd (t, operands[1], operands[2]));
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_PLUS (<sseunpackmode>mode,
+					operands[3], t)));
+  DONE;
+})
+
+;; Normally we use widen_mul_even/odd, but combine can't quite get it all
+;; back together when madd is available.
+(define_expand "sdot_prodv4si"
+  [(match_operand:V2DI 0 "register_operand")
+   (match_operand:V4SI 1 "register_operand")
+   (match_operand:V4SI 2 "register_operand")
+   (match_operand:V2DI 3 "register_operand")]
+  "TARGET_XOP"
+{
+  rtx t = gen_reg_rtx (V2DImode);
+  emit_insn (gen_xop_pmacsdqh (t, operands[1], operands[2], operands[3]));
+  emit_insn (gen_xop_pmacsdql (operands[0], operands[1], operands[2], t));
+  DONE;
+})
+
+(define_insn "ashr<mode>3"
+  [(set (match_operand:VI24_AVX2 0 "register_operand" "=x,x")
+	(ashiftrt:VI24_AVX2
+	  (match_operand:VI24_AVX2 1 "register_operand" "0,x")
+	  (match_operand:SI 2 "nonmemory_operand" "xN,xN")))]
+  "TARGET_SSE2"
+  "@
+   psra<ssemodesuffix>\t{%2, %0|%0, %2}
+   vpsra<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseishft")
+   (set (attr "length_immediate")
+     (if_then_else (match_operand 2 "const_int_operand")
+       (const_string "1")
+       (const_string "0")))
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "ashr<mode>3<mask_name>"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v,v")
+	(ashiftrt:VI48_512
+	  (match_operand:VI48_512 1 "nonimmediate_operand" "v,vm")
+	  (match_operand:SI 2 "nonmemory_operand" "v,N")))]
+  "TARGET_AVX512F && <mask_mode512bit_condition>"
+  "vpsra<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set (attr "length_immediate")
+     (if_then_else (match_operand 2 "const_int_operand")
+       (const_string "1")
+       (const_string "0")))
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<shift_insn><mode>3"
+  [(set (match_operand:VI248_AVX2 0 "register_operand" "=x,x")
+	(any_lshift:VI248_AVX2
+	  (match_operand:VI248_AVX2 1 "register_operand" "0,x")
+	  (match_operand:SI 2 "nonmemory_operand" "xN,xN")))]
+  "TARGET_SSE2"
+  "@
+   p<vshift><ssemodesuffix>\t{%2, %0|%0, %2}
+   vp<vshift><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseishft")
+   (set (attr "length_immediate")
+     (if_then_else (match_operand 2 "const_int_operand")
+       (const_string "1")
+       (const_string "0")))
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<shift_insn><mode>3<mask_name>"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v,v")
+	(any_lshift:VI48_512
+	  (match_operand:VI48_512 1 "nonimmediate_operand" "v,m")
+	  (match_operand:SI 2 "nonmemory_operand" "vN,N")))]
+  "TARGET_AVX512F && <mask_mode512bit_condition>"
+  "vp<vshift><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "isa" "avx512f")
+   (set_attr "type" "sseishft")
+   (set (attr "length_immediate")
+     (if_then_else (match_operand 2 "const_int_operand")
+       (const_string "1")
+       (const_string "0")))
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+
+(define_expand "vec_shl_<mode>"
+  [(set (match_dup 3)
+	(ashift:V1TI
+	 (match_operand:VI_128 1 "register_operand")
+	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
+   (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
+  "TARGET_SSE2"
+{
+  operands[1] = gen_lowpart (V1TImode, operands[1]);
+  operands[3] = gen_reg_rtx (V1TImode);
+  operands[4] = gen_lowpart (<MODE>mode, operands[3]);
+})
+
+(define_insn "<sse2_avx2>_ashl<mode>3"
+  [(set (match_operand:VIMAX_AVX2 0 "register_operand" "=x,x")
+	(ashift:VIMAX_AVX2
+	 (match_operand:VIMAX_AVX2 1 "register_operand" "0,x")
+	 (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))]
+  "TARGET_SSE2"
+{
+  operands[2] = GEN_INT (INTVAL (operands[2]) / 8);
+
+  switch (which_alternative)
+    {
+    case 0:
+      return "pslldq\t{%2, %0|%0, %2}";
+    case 1:
+      return "vpslldq\t{%2, %1, %0|%0, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseishft")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "vec_shr_<mode>"
+  [(set (match_dup 3)
+	(lshiftrt:V1TI
+	 (match_operand:VI_128 1 "register_operand")
+	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
+   (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
+  "TARGET_SSE2"
+{
+  operands[1] = gen_lowpart (V1TImode, operands[1]);
+  operands[3] = gen_reg_rtx (V1TImode);
+  operands[4] = gen_lowpart (<MODE>mode, operands[3]);
+})
+
+(define_insn "<sse2_avx2>_lshr<mode>3"
+  [(set (match_operand:VIMAX_AVX2 0 "register_operand" "=x,x")
+	(lshiftrt:VIMAX_AVX2
+	 (match_operand:VIMAX_AVX2 1 "register_operand" "0,x")
+	 (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))]
+  "TARGET_SSE2"
+{
+  operands[2] = GEN_INT (INTVAL (operands[2]) / 8);
+
+  switch (which_alternative)
+    {
+    case 0:
+      return "psrldq\t{%2, %0|%0, %2}";
+    case 1:
+      return "vpsrldq\t{%2, %1, %0|%0, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseishft")
+   (set_attr "length_immediate" "1")
+   (set_attr "atom_unit" "sishuf")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_<rotate>v<mode><mask_name>"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+	(any_rotate:VI48_512
+	  (match_operand:VI48_512 1 "register_operand" "v")
+	  (match_operand:VI48_512 2 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512F"
+  "vp<rotate>v<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_<rotate><mode><mask_name>"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+	(any_rotate:VI48_512
+	  (match_operand:VI48_512 1 "nonimmediate_operand" "vm")
+	  (match_operand:SI 2 "const_0_to_255_operand")))]
+  "TARGET_AVX512F"
+  "vp<rotate><ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "<code><mode>3<mask_name><round_name>"
+  [(set (match_operand:VI124_256_48_512 0 "register_operand")
+	(maxmin:VI124_256_48_512
+	  (match_operand:VI124_256_48_512 1 "<round_nimm_predicate>")
+	  (match_operand:VI124_256_48_512 2 "<round_nimm_predicate>")))]
+  "TARGET_AVX2 && <mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+(define_insn "*avx2_<code><mode>3<mask_name><round_name>"
+  [(set (match_operand:VI124_256_48_512 0 "register_operand" "=v")
+	(maxmin:VI124_256_48_512
+	  (match_operand:VI124_256_48_512 1 "<round_nimm_predicate>" "%v")
+	  (match_operand:VI124_256_48_512 2 "<round_nimm_predicate>" "<round_constraint>")))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
+   && <mask_mode512bit_condition> && <round_mode512bit_condition>"
+  "vp<maxmin_int><ssemodesuffix>\t{<round_mask_op3>%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2<round_mask_op3>}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "OI")])
+
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI8_AVX2 0 "register_operand")
+	(maxmin:VI8_AVX2
+	  (match_operand:VI8_AVX2 1 "register_operand")
+	  (match_operand:VI8_AVX2 2 "register_operand")))]
+  "TARGET_SSE4_2"
+{
+  enum rtx_code code;
+  rtx xops[6];
+  bool ok;
+
+  xops[0] = operands[0];
+
+  if (<CODE> == SMAX || <CODE> == UMAX)
+    {
+      xops[1] = operands[1];
+      xops[2] = operands[2];
+    }
+  else
+    {
+      xops[1] = operands[2];
+      xops[2] = operands[1];
+    }
+
+  code = (<CODE> == UMAX || <CODE> == UMIN) ? GTU : GT;
+
+  xops[3] = gen_rtx_fmt_ee (code, VOIDmode, operands[1], operands[2]);
+  xops[4] = operands[1];
+  xops[5] = operands[2];
+
+  ok = ix86_expand_int_vcond (xops);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI124_128 0 "register_operand")
+	(smaxmin:VI124_128
+	  (match_operand:VI124_128 1 "nonimmediate_operand")
+	  (match_operand:VI124_128 2 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+{
+  if (TARGET_SSE4_1 || <MODE>mode == V8HImode)
+    ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
+  else
+    {
+      rtx xops[6];
+      bool ok;
+
+      xops[0] = operands[0];
+      operands[1] = force_reg (<MODE>mode, operands[1]);
+      operands[2] = force_reg (<MODE>mode, operands[2]);
+
+      if (<CODE> == SMAX)
+	{
+	  xops[1] = operands[1];
+	  xops[2] = operands[2];
+	}
+      else
+	{
+	  xops[1] = operands[2];
+	  xops[2] = operands[1];
+	}
+
+      xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
+      xops[4] = operands[1];
+      xops[5] = operands[2];
+
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
+      DONE;
+    }
+})
+
+(define_insn "*sse4_1_<code><mode>3"
+  [(set (match_operand:VI14_128 0 "register_operand" "=x,x")
+	(smaxmin:VI14_128
+	  (match_operand:VI14_128 1 "nonimmediate_operand" "%0,x")
+	  (match_operand:VI14_128 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "@
+   p<maxmin_int><ssemodesuffix>\t{%2, %0|%0, %2}
+   vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*<code>v8hi3"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
+	(smaxmin:V8HI
+	  (match_operand:V8HI 1 "nonimmediate_operand" "%0,x")
+	  (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, V8HImode, operands)"
+  "@
+   p<maxmin_int>w\t{%2, %0|%0, %2}
+   vp<maxmin_int>w\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "*,1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI124_128 0 "register_operand")
+	(umaxmin:VI124_128
+	  (match_operand:VI124_128 1 "nonimmediate_operand")
+	  (match_operand:VI124_128 2 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+{
+  if (TARGET_SSE4_1 || <MODE>mode == V16QImode)
+    ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
+  else if (<CODE> == UMAX && <MODE>mode == V8HImode)
+    {
+      rtx op0 = operands[0], op2 = operands[2], op3 = op0;
+      operands[1] = force_reg (<MODE>mode, operands[1]);
+      if (rtx_equal_p (op3, op2))
+	op3 = gen_reg_rtx (V8HImode);
+      emit_insn (gen_sse2_ussubv8hi3 (op3, operands[1], op2));
+      emit_insn (gen_addv8hi3 (op0, op3, op2));
+      DONE;
+    }
+  else
+    {
+      rtx xops[6];
+      bool ok;
+
+      operands[1] = force_reg (<MODE>mode, operands[1]);
+      operands[2] = force_reg (<MODE>mode, operands[2]);
+
+      xops[0] = operands[0];
+
+      if (<CODE> == UMAX)
+	{
+	  xops[1] = operands[1];
+	  xops[2] = operands[2];
+	}
+      else
+	{
+	  xops[1] = operands[2];
+	  xops[2] = operands[1];
+	}
+
+      xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
+      xops[4] = operands[1];
+      xops[5] = operands[2];
+
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
+      DONE;
+    }
+})
+
+(define_insn "*sse4_1_<code><mode>3"
+  [(set (match_operand:VI24_128 0 "register_operand" "=x,x")
+	(umaxmin:VI24_128
+	  (match_operand:VI24_128 1 "nonimmediate_operand" "%0,x")
+	  (match_operand:VI24_128 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "@
+   p<maxmin_int><ssemodesuffix>\t{%2, %0|%0, %2}
+   vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*<code>v16qi3"
+  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
+	(umaxmin:V16QI
+	  (match_operand:V16QI 1 "nonimmediate_operand" "%0,x")
+	  (match_operand:V16QI 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, V16QImode, operands)"
+  "@
+   p<maxmin_int>b\t{%2, %0|%0, %2}
+   vp<maxmin_int>b\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "*,1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel integral comparisons
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "avx2_eq<mode>3"
+  [(set (match_operand:VI_256 0 "register_operand")
+	(eq:VI_256
+	  (match_operand:VI_256 1 "nonimmediate_operand")
+	  (match_operand:VI_256 2 "nonimmediate_operand")))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (EQ, <MODE>mode, operands);")
+
+(define_insn "*avx2_eq<mode>3"
+  [(set (match_operand:VI_256 0 "register_operand" "=x")
+	(eq:VI_256
+	  (match_operand:VI_256 1 "nonimmediate_operand" "%x")
+	  (match_operand:VI_256 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (EQ, <MODE>mode, operands)"
+  "vpcmpeq<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_expand "avx512f_eq<mode>3<mask_scalar_merge_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI48_512 1 "register_operand")
+	   (match_operand:VI48_512 2 "nonimmediate_operand")]
+	  UNSPEC_MASKED_EQ))]
+  "TARGET_AVX512F"
+  "ix86_fixup_binary_operands_no_copy (EQ, <MODE>mode, operands);")
+
+(define_insn "avx512f_eq<mode>3<mask_scalar_merge_name>_1"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI48_512 1 "register_operand" "%v")
+	   (match_operand:VI48_512 2 "nonimmediate_operand" "vm")]
+	  UNSPEC_MASKED_EQ))]
+  "TARGET_AVX512F && ix86_binary_operator_ok (EQ, <MODE>mode, operands)"
+  "vpcmpeq<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*sse4_1_eqv2di3"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(eq:V2DI
+	  (match_operand:V2DI 1 "nonimmediate_operand" "%0,x")
+	  (match_operand:V2DI 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (EQ, V2DImode, operands)"
+  "@
+   pcmpeqq\t{%2, %0|%0, %2}
+   vpcmpeqq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssecmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse2_eq<mode>3"
+  [(set (match_operand:VI124_128 0 "register_operand" "=x,x")
+	(eq:VI124_128
+	  (match_operand:VI124_128 1 "nonimmediate_operand" "%0,x")
+	  (match_operand:VI124_128 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE2 && !TARGET_XOP
+   && ix86_binary_operator_ok (EQ, <MODE>mode, operands)"
+  "@
+   pcmpeq<ssemodesuffix>\t{%2, %0|%0, %2}
+   vpcmpeq<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssecmp")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "sse2_eq<mode>3"
+  [(set (match_operand:VI124_128 0 "register_operand")
+	(eq:VI124_128
+	  (match_operand:VI124_128 1 "nonimmediate_operand")
+	  (match_operand:VI124_128 2 "nonimmediate_operand")))]
+  "TARGET_SSE2 && !TARGET_XOP "
+  "ix86_fixup_binary_operands_no_copy (EQ, <MODE>mode, operands);")
+
+(define_expand "sse4_1_eqv2di3"
+  [(set (match_operand:V2DI 0 "register_operand")
+	(eq:V2DI
+	  (match_operand:V2DI 1 "nonimmediate_operand")
+	  (match_operand:V2DI 2 "nonimmediate_operand")))]
+  "TARGET_SSE4_1"
+  "ix86_fixup_binary_operands_no_copy (EQ, V2DImode, operands);")
+
+(define_insn "sse4_2_gtv2di3"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(gt:V2DI
+	  (match_operand:V2DI 1 "register_operand" "0,x")
+	  (match_operand:V2DI 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE4_2"
+  "@
+   pcmpgtq\t{%2, %0|%0, %2}
+   vpcmpgtq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssecmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx2_gt<mode>3"
+  [(set (match_operand:VI_256 0 "register_operand" "=x")
+	(gt:VI_256
+	  (match_operand:VI_256 1 "register_operand" "x")
+	  (match_operand:VI_256 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2"
+  "vpcmpgt<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "avx512f_gt<mode>3<mask_scalar_merge_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+	(unspec:<avx512fmaskmode>
+	  [(match_operand:VI48_512 1 "register_operand" "v")
+	   (match_operand:VI48_512 2 "nonimmediate_operand" "vm")] UNSPEC_MASKED_GT))]
+  "TARGET_AVX512F"
+  "vpcmpgt<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "sse2_gt<mode>3"
+  [(set (match_operand:VI124_128 0 "register_operand" "=x,x")
+	(gt:VI124_128
+	  (match_operand:VI124_128 1 "register_operand" "0,x")
+	  (match_operand:VI124_128 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE2 && !TARGET_XOP"
+  "@
+   pcmpgt<ssemodesuffix>\t{%2, %0|%0, %2}
+   vpcmpgt<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssecmp")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "vcond<V_512:mode><VI_512:mode>"
+  [(set (match_operand:V_512 0 "register_operand")
+	(if_then_else:V_512
+	  (match_operator 3 ""
+	    [(match_operand:VI_512 4 "nonimmediate_operand")
+	     (match_operand:VI_512 5 "general_operand")])
+	  (match_operand:V_512 1)
+	  (match_operand:V_512 2)))]
+  "TARGET_AVX512F
+   && (GET_MODE_NUNITS (<V_512:MODE>mode)
+       == GET_MODE_NUNITS (<VI_512:MODE>mode))"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "vcond<V_256:mode><VI_256:mode>"
+  [(set (match_operand:V_256 0 "register_operand")
+	(if_then_else:V_256
+	  (match_operator 3 ""
+	    [(match_operand:VI_256 4 "nonimmediate_operand")
+	     (match_operand:VI_256 5 "general_operand")])
+	  (match_operand:V_256 1)
+	  (match_operand:V_256 2)))]
+  "TARGET_AVX2
+   && (GET_MODE_NUNITS (<V_256:MODE>mode)
+       == GET_MODE_NUNITS (<VI_256:MODE>mode))"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "vcond<V_128:mode><VI124_128:mode>"
+  [(set (match_operand:V_128 0 "register_operand")
+	(if_then_else:V_128
+	  (match_operator 3 ""
+	    [(match_operand:VI124_128 4 "nonimmediate_operand")
+	     (match_operand:VI124_128 5 "general_operand")])
+	  (match_operand:V_128 1)
+	  (match_operand:V_128 2)))]
+  "TARGET_SSE2
+   && (GET_MODE_NUNITS (<V_128:MODE>mode)
+       == GET_MODE_NUNITS (<VI124_128:MODE>mode))"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "vcond<VI8F_128:mode>v2di"
+  [(set (match_operand:VI8F_128 0 "register_operand")
+	(if_then_else:VI8F_128
+	  (match_operator 3 ""
+	    [(match_operand:V2DI 4 "nonimmediate_operand")
+	     (match_operand:V2DI 5 "general_operand")])
+	  (match_operand:VI8F_128 1)
+	  (match_operand:VI8F_128 2)))]
+  "TARGET_SSE4_2"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "vcondu<V_512:mode><VI_512:mode>"
+  [(set (match_operand:V_512 0 "register_operand")
+	(if_then_else:V_512
+	  (match_operator 3 ""
+	    [(match_operand:VI_512 4 "nonimmediate_operand")
+	     (match_operand:VI_512 5 "nonimmediate_operand")])
+	  (match_operand:V_512 1 "general_operand")
+	  (match_operand:V_512 2 "general_operand")))]
+  "TARGET_AVX512F
+   && (GET_MODE_NUNITS (<V_512:MODE>mode)
+       == GET_MODE_NUNITS (<VI_512:MODE>mode))"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "vcondu<V_256:mode><VI_256:mode>"
+  [(set (match_operand:V_256 0 "register_operand")
+	(if_then_else:V_256
+	  (match_operator 3 ""
+	    [(match_operand:VI_256 4 "nonimmediate_operand")
+	     (match_operand:VI_256 5 "nonimmediate_operand")])
+	  (match_operand:V_256 1 "general_operand")
+	  (match_operand:V_256 2 "general_operand")))]
+  "TARGET_AVX2
+   && (GET_MODE_NUNITS (<V_256:MODE>mode)
+       == GET_MODE_NUNITS (<VI_256:MODE>mode))"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "vcondu<V_128:mode><VI124_128:mode>"
+  [(set (match_operand:V_128 0 "register_operand")
+	(if_then_else:V_128
+	  (match_operator 3 ""
+	    [(match_operand:VI124_128 4 "nonimmediate_operand")
+	     (match_operand:VI124_128 5 "nonimmediate_operand")])
+	  (match_operand:V_128 1 "general_operand")
+	  (match_operand:V_128 2 "general_operand")))]
+  "TARGET_SSE2
+   && (GET_MODE_NUNITS (<V_128:MODE>mode)
+       == GET_MODE_NUNITS (<VI124_128:MODE>mode))"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "vcondu<VI8F_128:mode>v2di"
+  [(set (match_operand:VI8F_128 0 "register_operand")
+	(if_then_else:VI8F_128
+	  (match_operator 3 ""
+	    [(match_operand:V2DI 4 "nonimmediate_operand")
+	     (match_operand:V2DI 5 "nonimmediate_operand")])
+	  (match_operand:VI8F_128 1 "general_operand")
+	  (match_operand:VI8F_128 2 "general_operand")))]
+  "TARGET_SSE4_2"
+{
+  bool ok = ix86_expand_int_vcond (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_mode_iterator VEC_PERM_AVX2
+  [V16QI V8HI V4SI V2DI V4SF V2DF
+   (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
+   (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
+   (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
+   (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+   (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+
+(define_expand "vec_perm<mode>"
+  [(match_operand:VEC_PERM_AVX2 0 "register_operand")
+   (match_operand:VEC_PERM_AVX2 1 "register_operand")
+   (match_operand:VEC_PERM_AVX2 2 "register_operand")
+   (match_operand:<sseintvecmode> 3 "register_operand")]
+  "TARGET_SSSE3 || TARGET_AVX || TARGET_XOP"
+{
+  ix86_expand_vec_perm (operands);
+  DONE;
+})
+
+(define_mode_iterator VEC_PERM_CONST
+  [(V4SF "TARGET_SSE") (V4SI "TARGET_SSE")
+   (V2DF "TARGET_SSE") (V2DI "TARGET_SSE")
+   (V16QI "TARGET_SSE2") (V8HI "TARGET_SSE2")
+   (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")
+   (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
+   (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
+   (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+   (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+
+(define_expand "vec_perm_const<mode>"
+  [(match_operand:VEC_PERM_CONST 0 "register_operand")
+   (match_operand:VEC_PERM_CONST 1 "register_operand")
+   (match_operand:VEC_PERM_CONST 2 "register_operand")
+   (match_operand:<sseintvecmode> 3)]
+  ""
+{
+  if (ix86_expand_vec_perm_const (operands))
+    DONE;
+  else
+    FAIL;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel bitwise logical operations
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "one_cmpl<mode>2"
+  [(set (match_operand:VI 0 "register_operand")
+	(xor:VI (match_operand:VI 1 "nonimmediate_operand")
+		(match_dup 2)))]
+  "TARGET_SSE"
+{
+  int i, n = GET_MODE_NUNITS (<MODE>mode);
+  rtvec v = rtvec_alloc (n);
+
+  for (i = 0; i < n; ++i)
+    RTVEC_ELT (v, i) = constm1_rtx;
+
+  operands[2] = force_reg (<MODE>mode, gen_rtx_CONST_VECTOR (<MODE>mode, v));
+})
+
+(define_expand "<sse2_avx2>_andnot<mode>3<mask_name>"
+  [(set (match_operand:VI_AVX2 0 "register_operand")
+	(and:VI_AVX2
+	  (not:VI_AVX2 (match_operand:VI_AVX2 1 "register_operand"))
+	  (match_operand:VI_AVX2 2 "nonimmediate_operand")))]
+  "TARGET_SSE2 && <mask_mode512bit_condition>")
+
+(define_insn "*andnot<mode>3<mask_name>"
+  [(set (match_operand:VI 0 "register_operand" "=x,v")
+	(and:VI
+	  (not:VI (match_operand:VI 1 "register_operand" "0,v"))
+	  (match_operand:VI 2 "nonimmediate_operand" "xm,vm")))]
+  "TARGET_SSE && <mask_mode512bit_condition>"
+{
+  static char buf[64];
+  const char *ops;
+  const char *tmp;
+
+  switch (get_attr_mode (insn))
+    {
+    case MODE_XI:
+      gcc_assert (TARGET_AVX512F);
+
+      tmp = "pandn<ssemodesuffix>";
+      break;
+
+    case MODE_OI:
+      gcc_assert (TARGET_AVX2);
+    case MODE_TI:
+      gcc_assert (TARGET_SSE2);
+
+      tmp = "pandn";
+      break;
+
+   case MODE_V16SF:
+      gcc_assert (TARGET_AVX512F);
+   case MODE_V8SF:
+      gcc_assert (TARGET_AVX);
+   case MODE_V4SF:
+      gcc_assert (TARGET_SSE);
+
+      tmp = "andnps";
+      break;
+
+   default:
+      gcc_unreachable ();
+   }
+
+  switch (which_alternative)
+    {
+    case 0:
+      ops = "%s\t{%%2, %%0|%%0, %%2}";
+      break;
+    case 1:
+      ops = "v%s\t{%%2, %%1, %%0<mask_operand3_1>|%%0<mask_operand3_1>, %%1, %%2}";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  snprintf (buf, sizeof (buf), ops, tmp);
+  return buf;
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (and (eq_attr "alternative" "0")
+	    (eq_attr "mode" "TI"))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "prefix" "<mask_prefix3>")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "<ssePSmode>")
+	       (match_test "TARGET_AVX2")
+		 (const_string "<sseinsnmode>")
+	       (match_test "TARGET_AVX")
+		 (if_then_else
+		   (match_test "<MODE_SIZE> > 16")
+		   (const_string "V8SF")
+		   (const_string "<sseinsnmode>"))
+	       (ior (not (match_test "TARGET_SSE2"))
+		    (match_test "optimize_function_for_size_p (cfun)"))
+		 (const_string "V4SF")
+	      ]
+	      (const_string "<sseinsnmode>")))])
+
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI 0 "register_operand")
+	(any_logic:VI
+	  (match_operand:VI 1 "nonimmediate_or_const_vector_operand")
+	  (match_operand:VI 2 "nonimmediate_or_const_vector_operand")))]
+  "TARGET_SSE"
+{
+  ix86_expand_vector_logical_operator (<CODE>, <MODE>mode, operands);
+  DONE;
+})
+
+(define_insn "<mask_codefor><code><mode>3<mask_name>"
+  [(set (match_operand:VI 0 "register_operand" "=x,v")
+	(any_logic:VI
+	  (match_operand:VI 1 "nonimmediate_operand" "%0,v")
+	  (match_operand:VI 2 "nonimmediate_operand" "xm,vm")))]
+  "TARGET_SSE && <mask_mode512bit_condition>
+   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+{
+  static char buf[64];
+  const char *ops;
+  const char *tmp;
+
+  switch (get_attr_mode (insn))
+    {
+    case MODE_XI:
+      gcc_assert (TARGET_AVX512F);
+
+      tmp = "p<logic><ssemodesuffix>";
+      break;
+
+    case MODE_OI:
+      gcc_assert (TARGET_AVX2);
+    case MODE_TI:
+      gcc_assert (TARGET_SSE2);
+
+      tmp = "p<logic>";
+      break;
+
+   case MODE_V16SF:
+      gcc_assert (TARGET_AVX512F);
+   case MODE_V8SF:
+      gcc_assert (TARGET_AVX);
+   case MODE_V4SF:
+      gcc_assert (TARGET_SSE);
+
+      tmp = "<logic>ps";
+      break;
+
+   default:
+      gcc_unreachable ();
+   }
+
+  switch (which_alternative)
+    {
+    case 0:
+      ops = "%s\t{%%2, %%0|%%0, %%2}";
+      break;
+    case 1:
+      ops = "v%s\t{%%2, %%1, %%0<mask_operand3_1>|%%0<mask_operand3_1>, %%1, %%2}";
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  snprintf (buf, sizeof (buf), ops, tmp);
+  return buf;
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (and (eq_attr "alternative" "0")
+	    (eq_attr "mode" "TI"))
+       (const_string "1")
+       (const_string "*")))
+   (set_attr "prefix" "<mask_prefix3>")
+   (set (attr "mode")
+	(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+		 (const_string "<ssePSmode>")
+	       (match_test "TARGET_AVX2")
+		 (const_string "<sseinsnmode>")
+	       (match_test "TARGET_AVX")
+		 (if_then_else
+		   (match_test "<MODE_SIZE> > 16")
+		   (const_string "V8SF")
+		   (const_string "<sseinsnmode>"))
+	       (ior (not (match_test "TARGET_SSE2"))
+		    (match_test "optimize_function_for_size_p (cfun)"))
+		 (const_string "V4SF")
+	      ]
+	      (const_string "<sseinsnmode>")))])
+
+(define_insn "avx512f_testm<mode>3<mask_scalar_merge_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+	(unspec:<avx512fmaskmode>
+	 [(match_operand:VI48_512 1 "register_operand" "v")
+	  (match_operand:VI48_512 2 "nonimmediate_operand" "vm")]
+	 UNSPEC_TESTM))]
+  "TARGET_AVX512F"
+  "vptestm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode"  "<sseinsnmode>")])
+
+(define_insn "avx512f_testnm<mode>3<mask_scalar_merge_name>"
+  [(set (match_operand:<avx512fmaskmode> 0 "register_operand" "=Yk")
+	(unspec:<avx512fmaskmode>
+	 [(match_operand:VI48_512 1 "register_operand" "v")
+	  (match_operand:VI48_512 2 "nonimmediate_operand" "vm")]
+	 UNSPEC_TESTNM))]
+  "TARGET_AVX512F"
+  "vptestnm<ssemodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"
+  [(set_attr "prefix" "evex")
+   (set_attr "mode"  "<sseinsnmode>")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel integral element swizzling
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "vec_pack_trunc_<mode>"
+  [(match_operand:<ssepackmode> 0 "register_operand")
+   (match_operand:VI248_AVX2_8_AVX512F 1 "register_operand")
+   (match_operand:VI248_AVX2_8_AVX512F 2 "register_operand")]
+  "TARGET_SSE2"
+{
+  rtx op1 = gen_lowpart (<ssepackmode>mode, operands[1]);
+  rtx op2 = gen_lowpart (<ssepackmode>mode, operands[2]);
+  ix86_expand_vec_extract_even_odd (operands[0], op1, op2, 0);
+  DONE;
+})
+
+(define_insn "<sse2_avx2>_packsswb"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+	(vec_concat:VI1_AVX2
+	  (ss_truncate:<ssehalfvecmode>
+	    (match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
+	  (ss_truncate:<ssehalfvecmode>
+	    (match_operand:<sseunpackmode> 2 "nonimmediate_operand" "xm,xm"))))]
+  "TARGET_SSE2"
+  "@
+   packsswb\t{%2, %0|%0, %2}
+   vpacksswb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<sse2_avx2>_packssdw"
+  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x")
+	(vec_concat:VI2_AVX2
+	  (ss_truncate:<ssehalfvecmode>
+	    (match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
+	  (ss_truncate:<ssehalfvecmode>
+	    (match_operand:<sseunpackmode> 2 "nonimmediate_operand" "xm,xm"))))]
+  "TARGET_SSE2"
+  "@
+   packssdw\t{%2, %0|%0, %2}
+   vpackssdw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<sse2_avx2>_packuswb"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+	(vec_concat:VI1_AVX2
+	  (us_truncate:<ssehalfvecmode>
+	    (match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
+	  (us_truncate:<ssehalfvecmode>
+	    (match_operand:<sseunpackmode> 2 "nonimmediate_operand" "xm,xm"))))]
+  "TARGET_SSE2"
+  "@
+   packuswb\t{%2, %0|%0, %2}
+   vpackuswb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx2_interleave_highv32qi"
+  [(set (match_operand:V32QI 0 "register_operand" "=x")
+	(vec_select:V32QI
+	  (vec_concat:V64QI
+	    (match_operand:V32QI 1 "register_operand" "x")
+	    (match_operand:V32QI 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 8)  (const_int 40)
+		     (const_int 9)  (const_int 41)
+		     (const_int 10) (const_int 42)
+		     (const_int 11) (const_int 43)
+		     (const_int 12) (const_int 44)
+		     (const_int 13) (const_int 45)
+		     (const_int 14) (const_int 46)
+		     (const_int 15) (const_int 47)
+		     (const_int 24) (const_int 56)
+		     (const_int 25) (const_int 57)
+		     (const_int 26) (const_int 58)
+		     (const_int 27) (const_int 59)
+		     (const_int 28) (const_int 60)
+		     (const_int 29) (const_int 61)
+		     (const_int 30) (const_int 62)
+		     (const_int 31) (const_int 63)])))]
+  "TARGET_AVX2"
+  "vpunpckhbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "vec_interleave_highv16qi"
+  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
+	(vec_select:V16QI
+	  (vec_concat:V32QI
+	    (match_operand:V16QI 1 "register_operand" "0,x")
+	    (match_operand:V16QI 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(const_int 8)  (const_int 24)
+		     (const_int 9)  (const_int 25)
+		     (const_int 10) (const_int 26)
+		     (const_int 11) (const_int 27)
+		     (const_int 12) (const_int 28)
+		     (const_int 13) (const_int 29)
+		     (const_int 14) (const_int 30)
+		     (const_int 15) (const_int 31)])))]
+  "TARGET_SSE2"
+  "@
+   punpckhbw\t{%2, %0|%0, %2}
+   vpunpckhbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx2_interleave_lowv32qi"
+  [(set (match_operand:V32QI 0 "register_operand" "=x")
+	(vec_select:V32QI
+	  (vec_concat:V64QI
+	    (match_operand:V32QI 1 "register_operand" "x")
+	    (match_operand:V32QI 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 32)
+		     (const_int 1) (const_int 33)
+		     (const_int 2) (const_int 34)
+		     (const_int 3) (const_int 35)
+		     (const_int 4) (const_int 36)
+		     (const_int 5) (const_int 37)
+		     (const_int 6) (const_int 38)
+		     (const_int 7) (const_int 39)
+		     (const_int 16) (const_int 48)
+		     (const_int 17) (const_int 49)
+		     (const_int 18) (const_int 50)
+		     (const_int 19) (const_int 51)
+		     (const_int 20) (const_int 52)
+		     (const_int 21) (const_int 53)
+		     (const_int 22) (const_int 54)
+		     (const_int 23) (const_int 55)])))]
+  "TARGET_AVX2"
+  "vpunpcklbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "vec_interleave_lowv16qi"
+  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
+	(vec_select:V16QI
+	  (vec_concat:V32QI
+	    (match_operand:V16QI 1 "register_operand" "0,x")
+	    (match_operand:V16QI 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(const_int 0) (const_int 16)
+		     (const_int 1) (const_int 17)
+		     (const_int 2) (const_int 18)
+		     (const_int 3) (const_int 19)
+		     (const_int 4) (const_int 20)
+		     (const_int 5) (const_int 21)
+		     (const_int 6) (const_int 22)
+		     (const_int 7) (const_int 23)])))]
+  "TARGET_SSE2"
+  "@
+   punpcklbw\t{%2, %0|%0, %2}
+   vpunpcklbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx2_interleave_highv16hi"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(vec_select:V16HI
+	  (vec_concat:V32HI
+	    (match_operand:V16HI 1 "register_operand" "x")
+	    (match_operand:V16HI 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 4) (const_int 20)
+		     (const_int 5) (const_int 21)
+		     (const_int 6) (const_int 22)
+		     (const_int 7) (const_int 23)
+		     (const_int 12) (const_int 28)
+		     (const_int 13) (const_int 29)
+		     (const_int 14) (const_int 30)
+		     (const_int 15) (const_int 31)])))]
+  "TARGET_AVX2"
+  "vpunpckhwd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "vec_interleave_highv8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
+	(vec_select:V8HI
+	  (vec_concat:V16HI
+	    (match_operand:V8HI 1 "register_operand" "0,x")
+	    (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(const_int 4) (const_int 12)
+		     (const_int 5) (const_int 13)
+		     (const_int 6) (const_int 14)
+		     (const_int 7) (const_int 15)])))]
+  "TARGET_SSE2"
+  "@
+   punpckhwd\t{%2, %0|%0, %2}
+   vpunpckhwd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx2_interleave_lowv16hi"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(vec_select:V16HI
+	  (vec_concat:V32HI
+	    (match_operand:V16HI 1 "register_operand" "x")
+	    (match_operand:V16HI 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 16)
+		     (const_int 1) (const_int 17)
+		     (const_int 2) (const_int 18)
+		     (const_int 3) (const_int 19)
+		     (const_int 8) (const_int 24)
+		     (const_int 9) (const_int 25)
+		     (const_int 10) (const_int 26)
+		     (const_int 11) (const_int 27)])))]
+  "TARGET_AVX2"
+  "vpunpcklwd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "vec_interleave_lowv8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
+	(vec_select:V8HI
+	  (vec_concat:V16HI
+	    (match_operand:V8HI 1 "register_operand" "0,x")
+	    (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 1) (const_int 9)
+		     (const_int 2) (const_int 10)
+		     (const_int 3) (const_int 11)])))]
+  "TARGET_SSE2"
+  "@
+   punpcklwd\t{%2, %0|%0, %2}
+   vpunpcklwd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx2_interleave_highv8si"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+	(vec_select:V8SI
+	  (vec_concat:V16SI
+	    (match_operand:V8SI 1 "register_operand" "x")
+	    (match_operand:V8SI 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 2) (const_int 10)
+		     (const_int 3) (const_int 11)
+		     (const_int 6) (const_int 14)
+		     (const_int 7) (const_int 15)])))]
+  "TARGET_AVX2"
+  "vpunpckhdq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "<mask_codefor>avx512f_interleave_highv16si<mask_name>"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(vec_select:V16SI
+	  (vec_concat:V32SI
+	    (match_operand:V16SI 1 "register_operand" "v")
+	    (match_operand:V16SI 2 "nonimmediate_operand" "vm"))
+	  (parallel [(const_int 2) (const_int 18)
+		     (const_int 3) (const_int 19)
+		     (const_int 6) (const_int 22)
+		     (const_int 7) (const_int 23)
+		     (const_int 10) (const_int 26)
+		     (const_int 11) (const_int 27)
+		     (const_int 14) (const_int 30)
+		     (const_int 15) (const_int 31)])))]
+  "TARGET_AVX512F"
+  "vpunpckhdq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+
+(define_insn "vec_interleave_highv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x")
+	(vec_select:V4SI
+	  (vec_concat:V8SI
+	    (match_operand:V4SI 1 "register_operand" "0,x")
+	    (match_operand:V4SI 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(const_int 2) (const_int 6)
+		     (const_int 3) (const_int 7)])))]
+  "TARGET_SSE2"
+  "@
+   punpckhdq\t{%2, %0|%0, %2}
+   vpunpckhdq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx2_interleave_lowv8si"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+	(vec_select:V8SI
+	  (vec_concat:V16SI
+	    (match_operand:V8SI 1 "register_operand" "x")
+	    (match_operand:V8SI 2 "nonimmediate_operand" "xm"))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 1) (const_int 9)
+		     (const_int 4) (const_int 12)
+		     (const_int 5) (const_int 13)])))]
+  "TARGET_AVX2"
+  "vpunpckldq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "<mask_codefor>avx512f_interleave_lowv16si<mask_name>"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(vec_select:V16SI
+	  (vec_concat:V32SI
+	    (match_operand:V16SI 1 "register_operand" "v")
+	    (match_operand:V16SI 2 "nonimmediate_operand" "vm"))
+	  (parallel [(const_int 0) (const_int 16)
+		     (const_int 1) (const_int 17)
+		     (const_int 4) (const_int 20)
+		     (const_int 5) (const_int 21)
+		     (const_int 8) (const_int 24)
+		     (const_int 9) (const_int 25)
+		     (const_int 12) (const_int 28)
+		     (const_int 13) (const_int 29)])))]
+  "TARGET_AVX512F"
+  "vpunpckldq\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "vec_interleave_lowv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x")
+	(vec_select:V4SI
+	  (vec_concat:V8SI
+	    (match_operand:V4SI 1 "register_operand" "0,x")
+	    (match_operand:V4SI 2 "nonimmediate_operand" "xm,xm"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 1) (const_int 5)])))]
+  "TARGET_SSE2"
+  "@
+   punpckldq\t{%2, %0|%0, %2}
+   vpunpckldq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "vec_interleave_high<mode>"
+  [(match_operand:VI_256 0 "register_operand" "=x")
+   (match_operand:VI_256 1 "register_operand" "x")
+   (match_operand:VI_256 2 "nonimmediate_operand" "xm")]
+ "TARGET_AVX2"
+{
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx t3 = gen_reg_rtx (V4DImode);
+  emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2]));
+  emit_insn (gen_avx2_interleave_high<mode> (t2,  operands[1], operands[2]));
+  emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, t1),
+				gen_lowpart (V4DImode, t2),
+				GEN_INT (1 + (3 << 4))));
+  emit_move_insn (operands[0], gen_lowpart (<MODE>mode, t3));
+  DONE;
+})
+
+(define_expand "vec_interleave_low<mode>"
+  [(match_operand:VI_256 0 "register_operand" "=x")
+   (match_operand:VI_256 1 "register_operand" "x")
+   (match_operand:VI_256 2 "nonimmediate_operand" "xm")]
+ "TARGET_AVX2"
+{
+  rtx t1 = gen_reg_rtx (<MODE>mode);
+  rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx t3 = gen_reg_rtx (V4DImode);
+  emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2]));
+  emit_insn (gen_avx2_interleave_high<mode> (t2, operands[1], operands[2]));
+  emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, t1),
+				gen_lowpart (V4DImode, t2),
+				GEN_INT (0 + (2 << 4))));
+  emit_move_insn (operands[0], gen_lowpart (<MODE>mode, t3));
+  DONE;
+})
+
+;; Modes handled by pinsr patterns.
+(define_mode_iterator PINSR_MODE
+  [(V16QI "TARGET_SSE4_1") V8HI
+   (V4SI "TARGET_SSE4_1")
+   (V2DI "TARGET_SSE4_1 && TARGET_64BIT")])
+
+(define_mode_attr sse2p4_1
+  [(V16QI "sse4_1") (V8HI "sse2")
+   (V4SI "sse4_1") (V2DI "sse4_1")])
+
+;; sse4_1_pinsrd must come before sse2_loadld since it is preferred.
+(define_insn "<sse2p4_1>_pinsr<ssemodesuffix>"
+  [(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x")
+	(vec_merge:PINSR_MODE
+	  (vec_duplicate:PINSR_MODE
+	    (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "r,m,r,m"))
+	  (match_operand:PINSR_MODE 1 "register_operand" "0,0,x,x")
+	  (match_operand:SI 3 "const_int_operand")))]
+  "TARGET_SSE2
+   && ((unsigned) exact_log2 (INTVAL (operands[3]))
+       < GET_MODE_NUNITS (<MODE>mode))"
+{
+  operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])));
+
+  switch (which_alternative)
+    {
+    case 0:
+      if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode))
+	return "pinsr<ssemodesuffix>\t{%3, %k2, %0|%0, %k2, %3}";
+      /* FALLTHRU */
+    case 1:
+      return "pinsr<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}";
+    case 2:
+      if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode))
+	return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
+      /* FALLTHRU */
+    case 3:
+      return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,noavx,avx,avx")
+   (set_attr "type" "sselog")
+   (set (attr "prefix_rex")
+     (if_then_else
+       (and (not (match_test "TARGET_AVX"))
+	    (eq (const_string "<MODE>mode") (const_string "V2DImode")))
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "prefix_data16")
+     (if_then_else
+       (and (not (match_test "TARGET_AVX"))
+	    (eq (const_string "<MODE>mode") (const_string "V8HImode")))
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "prefix_extra")
+     (if_then_else
+       (and (not (match_test "TARGET_AVX"))
+	    (eq (const_string "<MODE>mode") (const_string "V8HImode")))
+       (const_string "*")
+       (const_string "1")))
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,orig,vex,vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "avx512f_vinsert<shuffletype>32x4_mask"
+  [(match_operand:V16FI 0 "register_operand")
+   (match_operand:V16FI 1 "register_operand")
+   (match_operand:<ssequartermode> 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_3_operand")
+   (match_operand:V16FI 4 "register_operand")
+   (match_operand:<avx512fmaskmode> 5 "register_operand")]
+  "TARGET_AVX512F"
+{
+  switch (INTVAL (operands[3]))
+    {
+    case 0:
+      emit_insn (gen_avx512f_vinsert<shuffletype>32x4_1_mask (operands[0],
+          operands[1], operands[2], GEN_INT (0xFFF), operands[4],
+	  operands[5]));
+      break;
+    case 1:
+      emit_insn (gen_avx512f_vinsert<shuffletype>32x4_1_mask (operands[0],
+          operands[1], operands[2], GEN_INT (0xF0FF), operands[4],
+	  operands[5]));
+      break;
+    case 2:
+      emit_insn (gen_avx512f_vinsert<shuffletype>32x4_1_mask (operands[0],
+          operands[1], operands[2], GEN_INT (0xFF0F), operands[4],
+	  operands[5]));
+      break;
+    case 3:
+      emit_insn (gen_avx512f_vinsert<shuffletype>32x4_1_mask (operands[0],
+          operands[1], operands[2], GEN_INT (0xFFF0), operands[4],
+	  operands[5]));
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  DONE;
+
+})
+
+(define_insn "<mask_codefor>avx512f_vinsert<shuffletype>32x4_1<mask_name>"
+  [(set (match_operand:V16FI 0 "register_operand" "=v")
+	(vec_merge:V16FI
+	  (match_operand:V16FI 1 "register_operand" "v")
+	  (vec_duplicate:V16FI
+		(match_operand:<ssequartermode> 2 "nonimmediate_operand" "vm"))
+	  (match_operand:SI 3 "const_int_operand" "n")))]
+  "TARGET_AVX512F"
+{
+  int mask;
+  if (INTVAL (operands[3]) == 0xFFF)
+      mask = 0;
+  else if ( INTVAL (operands[3]) == 0xF0FF)
+      mask = 1;
+  else if ( INTVAL (operands[3]) == 0xFF0F)
+      mask = 2;
+  else if ( INTVAL (operands[3]) == 0xFFF0)
+      mask = 3;
+  else
+      gcc_unreachable ();
+
+  operands[3] = GEN_INT (mask);
+
+  return "vinsert<shuffletype>32x4\t{%3, %2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_vinsert<shuffletype>64x4_mask"
+  [(match_operand:V8FI 0 "register_operand")
+   (match_operand:V8FI 1 "register_operand")
+   (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_1_operand")
+   (match_operand:V8FI 4 "register_operand")
+   (match_operand:<avx512fmaskmode> 5 "register_operand")]
+  "TARGET_AVX512F"
+{
+  int mask = INTVAL (operands[3]);
+  if (mask == 0)
+    emit_insn (gen_vec_set_lo_<mode>_mask
+      (operands[0], operands[1], operands[2],
+       operands[4], operands[5]));
+  else
+    emit_insn (gen_vec_set_hi_<mode>_mask
+      (operands[0], operands[1], operands[2],
+       operands[4], operands[5]));
+  DONE;
+})
+
+(define_insn "vec_set_lo_<mode><mask_name>"
+  [(set (match_operand:V8FI 0 "register_operand" "=v")
+	(vec_concat:V8FI
+	  (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" "vm")
+	  (vec_select:<ssehalfvecmode>
+	    (match_operand:V8FI 1 "register_operand" "v")
+	    (parallel [(const_int 4) (const_int 5)
+              (const_int 6) (const_int 7)]))))]
+  "TARGET_AVX512F"
+  "vinsert<shuffletype>64x4\t{$0x0, %2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2, $0x0}"
+  [(set_attr "type" "sselog")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "vec_set_hi_<mode><mask_name>"
+  [(set (match_operand:V8FI 0 "register_operand" "=v")
+	(vec_concat:V8FI
+	  (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" "vm")
+	  (vec_select:<ssehalfvecmode>
+	    (match_operand:V8FI 1 "register_operand" "v")
+	    (parallel [(const_int 0) (const_int 1)
+              (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX512F"
+  "vinsert<shuffletype>64x4\t{$0x1, %2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2, $0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_expand "avx512f_shuf_<shuffletype>64x2_mask"
+  [(match_operand:V8FI 0 "register_operand")
+   (match_operand:V8FI 1 "register_operand")
+   (match_operand:V8FI 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_255_operand")
+   (match_operand:V8FI 4 "register_operand")
+   (match_operand:QI 5 "register_operand")]
+  "TARGET_AVX512F"
+{
+  int mask = INTVAL (operands[3]);
+  emit_insn (gen_avx512f_shuf_<shuffletype>64x2_1_mask
+      (operands[0], operands[1], operands[2],
+       GEN_INT (((mask >> 0) & 3) * 2),
+       GEN_INT (((mask >> 0) & 3) * 2 + 1),
+       GEN_INT (((mask >> 2) & 3) * 2),
+       GEN_INT (((mask >> 2) & 3) * 2 + 1),
+       GEN_INT (((mask >> 4) & 3) * 2 + 8),
+       GEN_INT (((mask >> 4) & 3) * 2 + 9),
+       GEN_INT (((mask >> 6) & 3) * 2 + 8),
+       GEN_INT (((mask >> 6) & 3) * 2 + 9),
+       operands[4], operands[5]));
+  DONE;
+})
+
+(define_insn "avx512f_shuf_<shuffletype>64x2_1<mask_name>"
+  [(set (match_operand:V8FI 0 "register_operand" "=v")
+	(vec_select:V8FI
+	  (vec_concat:<ssedoublemode>
+	    (match_operand:V8FI 1 "register_operand" "v")
+	    (match_operand:V8FI 2 "nonimmediate_operand" "vm"))
+	  (parallel [(match_operand 3  "const_0_to_7_operand")
+		     (match_operand 4  "const_0_to_7_operand")
+		     (match_operand 5  "const_0_to_7_operand")
+		     (match_operand 6  "const_0_to_7_operand")
+		     (match_operand 7  "const_8_to_15_operand")
+		     (match_operand 8  "const_8_to_15_operand")
+		     (match_operand 9  "const_8_to_15_operand")
+		     (match_operand 10  "const_8_to_15_operand")])))]
+  "TARGET_AVX512F
+   && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1)
+       && INTVAL (operands[5]) == (INTVAL (operands[6]) - 1)
+       && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1)
+       && INTVAL (operands[9]) == (INTVAL (operands[10]) - 1))"
+{
+  int mask;
+  mask = INTVAL (operands[3]) / 2;
+  mask |= INTVAL (operands[5]) / 2 << 2;
+  mask |= (INTVAL (operands[7]) - 8) / 2 << 4;
+  mask |= (INTVAL (operands[9]) - 8) / 2 << 6;
+  operands[3] = GEN_INT (mask);
+
+  return "vshuf<shuffletype>64x2\t{%3, %2, %1, %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_shuf_<shuffletype>32x4_mask"
+  [(match_operand:V16FI 0 "register_operand")
+   (match_operand:V16FI 1 "register_operand")
+   (match_operand:V16FI 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_255_operand")
+   (match_operand:V16FI 4 "register_operand")
+   (match_operand:HI 5 "register_operand")]
+  "TARGET_AVX512F"
+{
+  int mask = INTVAL (operands[3]);
+  emit_insn (gen_avx512f_shuf_<shuffletype>32x4_1_mask
+      (operands[0], operands[1], operands[2],
+       GEN_INT (((mask >> 0) & 3) * 4),
+       GEN_INT (((mask >> 0) & 3) * 4 + 1),
+       GEN_INT (((mask >> 0) & 3) * 4 + 2),
+       GEN_INT (((mask >> 0) & 3) * 4 + 3),
+       GEN_INT (((mask >> 2) & 3) * 4),
+       GEN_INT (((mask >> 2) & 3) * 4 + 1),
+       GEN_INT (((mask >> 2) & 3) * 4 + 2),
+       GEN_INT (((mask >> 2) & 3) * 4 + 3),
+       GEN_INT (((mask >> 4) & 3) * 4 + 16),
+       GEN_INT (((mask >> 4) & 3) * 4 + 17),
+       GEN_INT (((mask >> 4) & 3) * 4 + 18),
+       GEN_INT (((mask >> 4) & 3) * 4 + 19),
+       GEN_INT (((mask >> 6) & 3) * 4 + 16),
+       GEN_INT (((mask >> 6) & 3) * 4 + 17),
+       GEN_INT (((mask >> 6) & 3) * 4 + 18),
+       GEN_INT (((mask >> 6) & 3) * 4 + 19),
+       operands[4], operands[5]));
+  DONE;
+})
+
+(define_insn "avx512f_shuf_<shuffletype>32x4_1<mask_name>"
+  [(set (match_operand:V16FI 0 "register_operand" "=v")
+	(vec_select:V16FI
+	  (vec_concat:<ssedoublemode>
+	    (match_operand:V16FI 1 "register_operand" "v")
+	    (match_operand:V16FI 2 "nonimmediate_operand" "vm"))
+	  (parallel [(match_operand 3  "const_0_to_15_operand")
+		     (match_operand 4  "const_0_to_15_operand")
+		     (match_operand 5  "const_0_to_15_operand")
+		     (match_operand 6  "const_0_to_15_operand")
+		     (match_operand 7  "const_0_to_15_operand")
+		     (match_operand 8  "const_0_to_15_operand")
+		     (match_operand 9  "const_0_to_15_operand")
+		     (match_operand 10  "const_0_to_15_operand")
+		     (match_operand 11  "const_16_to_31_operand")
+		     (match_operand 12  "const_16_to_31_operand")
+		     (match_operand 13  "const_16_to_31_operand")
+		     (match_operand 14  "const_16_to_31_operand")
+		     (match_operand 15  "const_16_to_31_operand")
+		     (match_operand 16  "const_16_to_31_operand")
+		     (match_operand 17  "const_16_to_31_operand")
+		     (match_operand 18  "const_16_to_31_operand")])))]
+  "TARGET_AVX512F
+   && (INTVAL (operands[3]) == (INTVAL (operands[4]) - 1)
+       && INTVAL (operands[3]) == (INTVAL (operands[5]) - 2)
+       && INTVAL (operands[3]) == (INTVAL (operands[6]) - 3)
+       && INTVAL (operands[7]) == (INTVAL (operands[8]) - 1)
+       && INTVAL (operands[7]) == (INTVAL (operands[9]) - 2)
+       && INTVAL (operands[7]) == (INTVAL (operands[10]) - 3)
+       && INTVAL (operands[11]) == (INTVAL (operands[12]) - 1)
+       && INTVAL (operands[11]) == (INTVAL (operands[13]) - 2)
+       && INTVAL (operands[11]) == (INTVAL (operands[14]) - 3)
+       && INTVAL (operands[15]) == (INTVAL (operands[16]) - 1)
+       && INTVAL (operands[15]) == (INTVAL (operands[17]) - 2)
+       && INTVAL (operands[15]) == (INTVAL (operands[18]) - 3))"
+{
+  int mask;
+  mask = INTVAL (operands[3]) / 4;
+  mask |= INTVAL (operands[7]) / 4 << 2;
+  mask |= (INTVAL (operands[11]) - 16) / 4 << 4;
+  mask |= (INTVAL (operands[15]) - 16) / 4 << 6;
+  operands[3] = GEN_INT (mask);
+
+  return "vshuf<shuffletype>32x4\t{%3, %2, %1, %0<mask_operand19>|%0<mask_operand19>, %1, %2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_pshufdv3_mask"
+  [(match_operand:V16SI 0 "register_operand")
+   (match_operand:V16SI 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_0_to_255_operand")
+   (match_operand:V16SI 3 "register_operand")
+   (match_operand:HI 4 "register_operand")]
+  "TARGET_AVX512F"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_avx512f_pshufd_1_mask (operands[0], operands[1],
+				       GEN_INT ((mask >> 0) & 3),
+				       GEN_INT ((mask >> 2) & 3),
+				       GEN_INT ((mask >> 4) & 3),
+				       GEN_INT ((mask >> 6) & 3),
+				       GEN_INT (((mask >> 0) & 3) + 4),
+				       GEN_INT (((mask >> 2) & 3) + 4),
+				       GEN_INT (((mask >> 4) & 3) + 4),
+				       GEN_INT (((mask >> 6) & 3) + 4),
+				       GEN_INT (((mask >> 0) & 3) + 8),
+				       GEN_INT (((mask >> 2) & 3) + 8),
+				       GEN_INT (((mask >> 4) & 3) + 8),
+				       GEN_INT (((mask >> 6) & 3) + 8),
+				       GEN_INT (((mask >> 0) & 3) + 12),
+				       GEN_INT (((mask >> 2) & 3) + 12),
+				       GEN_INT (((mask >> 4) & 3) + 12),
+				       GEN_INT (((mask >> 6) & 3) + 12),
+				       operands[3], operands[4]));
+  DONE;
+})
+
+(define_insn "avx512f_pshufd_1<mask_name>"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(vec_select:V16SI
+	  (match_operand:V16SI 1 "nonimmediate_operand" "vm")
+	  (parallel [(match_operand 2 "const_0_to_3_operand")
+		     (match_operand 3 "const_0_to_3_operand")
+		     (match_operand 4 "const_0_to_3_operand")
+		     (match_operand 5 "const_0_to_3_operand")
+		     (match_operand 6 "const_4_to_7_operand")
+		     (match_operand 7 "const_4_to_7_operand")
+		     (match_operand 8 "const_4_to_7_operand")
+		     (match_operand 9 "const_4_to_7_operand")
+		     (match_operand 10 "const_8_to_11_operand")
+		     (match_operand 11 "const_8_to_11_operand")
+		     (match_operand 12 "const_8_to_11_operand")
+		     (match_operand 13 "const_8_to_11_operand")
+		     (match_operand 14 "const_12_to_15_operand")
+		     (match_operand 15 "const_12_to_15_operand")
+		     (match_operand 16 "const_12_to_15_operand")
+		     (match_operand 17 "const_12_to_15_operand")])))]
+  "TARGET_AVX512F
+   && INTVAL (operands[2]) + 4 == INTVAL (operands[6])
+   && INTVAL (operands[3]) + 4 == INTVAL (operands[7])
+   && INTVAL (operands[4]) + 4 == INTVAL (operands[8])
+   && INTVAL (operands[5]) + 4 == INTVAL (operands[9])
+   && INTVAL (operands[2]) + 8 == INTVAL (operands[10])
+   && INTVAL (operands[3]) + 8 == INTVAL (operands[11])
+   && INTVAL (operands[4]) + 8 == INTVAL (operands[12])
+   && INTVAL (operands[5]) + 8 == INTVAL (operands[13])
+   && INTVAL (operands[2]) + 12 == INTVAL (operands[14])
+   && INTVAL (operands[3]) + 12 == INTVAL (operands[15])
+   && INTVAL (operands[4]) + 12 == INTVAL (operands[16])
+   && INTVAL (operands[5]) + 12 == INTVAL (operands[17])"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= INTVAL (operands[4]) << 4;
+  mask |= INTVAL (operands[5]) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "vpshufd\t{%2, %1, %0<mask_operand18>|%0<mask_operand18>, %1, %2}";
+}
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "XI")])
+
+(define_expand "avx2_pshufdv3"
+  [(match_operand:V8SI 0 "register_operand")
+   (match_operand:V8SI 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_0_to_255_operand")]
+  "TARGET_AVX2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_avx2_pshufd_1 (operands[0], operands[1],
+				GEN_INT ((mask >> 0) & 3),
+				GEN_INT ((mask >> 2) & 3),
+				GEN_INT ((mask >> 4) & 3),
+				GEN_INT ((mask >> 6) & 3),
+				GEN_INT (((mask >> 0) & 3) + 4),
+				GEN_INT (((mask >> 2) & 3) + 4),
+				GEN_INT (((mask >> 4) & 3) + 4),
+				GEN_INT (((mask >> 6) & 3) + 4)));
+  DONE;
+})
+
+(define_insn "avx2_pshufd_1"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+	(vec_select:V8SI
+	  (match_operand:V8SI 1 "nonimmediate_operand" "xm")
+	  (parallel [(match_operand 2 "const_0_to_3_operand")
+		     (match_operand 3 "const_0_to_3_operand")
+		     (match_operand 4 "const_0_to_3_operand")
+		     (match_operand 5 "const_0_to_3_operand")
+		     (match_operand 6 "const_4_to_7_operand")
+		     (match_operand 7 "const_4_to_7_operand")
+		     (match_operand 8 "const_4_to_7_operand")
+		     (match_operand 9 "const_4_to_7_operand")])))]
+  "TARGET_AVX2
+   && INTVAL (operands[2]) + 4 == INTVAL (operands[6])
+   && INTVAL (operands[3]) + 4 == INTVAL (operands[7])
+   && INTVAL (operands[4]) + 4 == INTVAL (operands[8])
+   && INTVAL (operands[5]) + 4 == INTVAL (operands[9])"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= INTVAL (operands[4]) << 4;
+  mask |= INTVAL (operands[5]) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "vpshufd\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "OI")])
+
+(define_expand "sse2_pshufd"
+  [(match_operand:V4SI 0 "register_operand")
+   (match_operand:V4SI 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_int_operand")]
+  "TARGET_SSE2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_sse2_pshufd_1 (operands[0], operands[1],
+				GEN_INT ((mask >> 0) & 3),
+				GEN_INT ((mask >> 2) & 3),
+				GEN_INT ((mask >> 4) & 3),
+				GEN_INT ((mask >> 6) & 3)));
+  DONE;
+})
+
+(define_insn "sse2_pshufd_1"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(vec_select:V4SI
+	  (match_operand:V4SI 1 "nonimmediate_operand" "xm")
+	  (parallel [(match_operand 2 "const_0_to_3_operand")
+		     (match_operand 3 "const_0_to_3_operand")
+		     (match_operand 4 "const_0_to_3_operand")
+		     (match_operand 5 "const_0_to_3_operand")])))]
+  "TARGET_SSE2"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= INTVAL (operands[4]) << 4;
+  mask |= INTVAL (operands[5]) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "%vpshufd\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_expand "avx2_pshuflwv3"
+  [(match_operand:V16HI 0 "register_operand")
+   (match_operand:V16HI 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_0_to_255_operand")]
+  "TARGET_AVX2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_avx2_pshuflw_1 (operands[0], operands[1],
+				 GEN_INT ((mask >> 0) & 3),
+				 GEN_INT ((mask >> 2) & 3),
+				 GEN_INT ((mask >> 4) & 3),
+				 GEN_INT ((mask >> 6) & 3),
+				 GEN_INT (((mask >> 0) & 3) + 8),
+				 GEN_INT (((mask >> 2) & 3) + 8),
+				 GEN_INT (((mask >> 4) & 3) + 8),
+				 GEN_INT (((mask >> 6) & 3) + 8)));
+  DONE;
+})
+
+(define_insn "avx2_pshuflw_1"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(vec_select:V16HI
+	  (match_operand:V16HI 1 "nonimmediate_operand" "xm")
+	  (parallel [(match_operand 2 "const_0_to_3_operand")
+		     (match_operand 3 "const_0_to_3_operand")
+		     (match_operand 4 "const_0_to_3_operand")
+		     (match_operand 5 "const_0_to_3_operand")
+		     (const_int 4)
+		     (const_int 5)
+		     (const_int 6)
+		     (const_int 7)
+		     (match_operand 6 "const_8_to_11_operand")
+		     (match_operand 7 "const_8_to_11_operand")
+		     (match_operand 8 "const_8_to_11_operand")
+		     (match_operand 9 "const_8_to_11_operand")
+		     (const_int 12)
+		     (const_int 13)
+		     (const_int 14)
+		     (const_int 15)])))]
+  "TARGET_AVX2
+   && INTVAL (operands[2]) + 8 == INTVAL (operands[6])
+   && INTVAL (operands[3]) + 8 == INTVAL (operands[7])
+   && INTVAL (operands[4]) + 8 == INTVAL (operands[8])
+   && INTVAL (operands[5]) + 8 == INTVAL (operands[9])"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= INTVAL (operands[4]) << 4;
+  mask |= INTVAL (operands[5]) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "vpshuflw\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "OI")])
+
+(define_expand "sse2_pshuflw"
+  [(match_operand:V8HI 0 "register_operand")
+   (match_operand:V8HI 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_int_operand")]
+  "TARGET_SSE2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_sse2_pshuflw_1 (operands[0], operands[1],
+				 GEN_INT ((mask >> 0) & 3),
+				 GEN_INT ((mask >> 2) & 3),
+				 GEN_INT ((mask >> 4) & 3),
+				 GEN_INT ((mask >> 6) & 3)));
+  DONE;
+})
+
+(define_insn "sse2_pshuflw_1"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(vec_select:V8HI
+	  (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	  (parallel [(match_operand 2 "const_0_to_3_operand")
+		     (match_operand 3 "const_0_to_3_operand")
+		     (match_operand 4 "const_0_to_3_operand")
+		     (match_operand 5 "const_0_to_3_operand")
+		     (const_int 4)
+		     (const_int 5)
+		     (const_int 6)
+		     (const_int 7)])))]
+  "TARGET_SSE2"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= INTVAL (operands[4]) << 4;
+  mask |= INTVAL (operands[5]) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "%vpshuflw\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_expand "avx2_pshufhwv3"
+  [(match_operand:V16HI 0 "register_operand")
+   (match_operand:V16HI 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_0_to_255_operand")]
+  "TARGET_AVX2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_avx2_pshufhw_1 (operands[0], operands[1],
+				 GEN_INT (((mask >> 0) & 3) + 4),
+				 GEN_INT (((mask >> 2) & 3) + 4),
+				 GEN_INT (((mask >> 4) & 3) + 4),
+				 GEN_INT (((mask >> 6) & 3) + 4),
+				 GEN_INT (((mask >> 0) & 3) + 12),
+				 GEN_INT (((mask >> 2) & 3) + 12),
+				 GEN_INT (((mask >> 4) & 3) + 12),
+				 GEN_INT (((mask >> 6) & 3) + 12)));
+  DONE;
+})
+
+(define_insn "avx2_pshufhw_1"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(vec_select:V16HI
+	  (match_operand:V16HI 1 "nonimmediate_operand" "xm")
+	  (parallel [(const_int 0)
+		     (const_int 1)
+		     (const_int 2)
+		     (const_int 3)
+		     (match_operand 2 "const_4_to_7_operand")
+		     (match_operand 3 "const_4_to_7_operand")
+		     (match_operand 4 "const_4_to_7_operand")
+		     (match_operand 5 "const_4_to_7_operand")
+		     (const_int 8)
+		     (const_int 9)
+		     (const_int 10)
+		     (const_int 11)
+		     (match_operand 6 "const_12_to_15_operand")
+		     (match_operand 7 "const_12_to_15_operand")
+		     (match_operand 8 "const_12_to_15_operand")
+		     (match_operand 9 "const_12_to_15_operand")])))]
+  "TARGET_AVX2
+   && INTVAL (operands[2]) + 8 == INTVAL (operands[6])
+   && INTVAL (operands[3]) + 8 == INTVAL (operands[7])
+   && INTVAL (operands[4]) + 8 == INTVAL (operands[8])
+   && INTVAL (operands[5]) + 8 == INTVAL (operands[9])"
+{
+  int mask = 0;
+  mask |= (INTVAL (operands[2]) - 4) << 0;
+  mask |= (INTVAL (operands[3]) - 4) << 2;
+  mask |= (INTVAL (operands[4]) - 4) << 4;
+  mask |= (INTVAL (operands[5]) - 4) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "vpshufhw\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "OI")])
+
+(define_expand "sse2_pshufhw"
+  [(match_operand:V8HI 0 "register_operand")
+   (match_operand:V8HI 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_int_operand")]
+  "TARGET_SSE2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_sse2_pshufhw_1 (operands[0], operands[1],
+				 GEN_INT (((mask >> 0) & 3) + 4),
+				 GEN_INT (((mask >> 2) & 3) + 4),
+				 GEN_INT (((mask >> 4) & 3) + 4),
+				 GEN_INT (((mask >> 6) & 3) + 4)));
+  DONE;
+})
+
+(define_insn "sse2_pshufhw_1"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(vec_select:V8HI
+	  (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	  (parallel [(const_int 0)
+		     (const_int 1)
+		     (const_int 2)
+		     (const_int 3)
+		     (match_operand 2 "const_4_to_7_operand")
+		     (match_operand 3 "const_4_to_7_operand")
+		     (match_operand 4 "const_4_to_7_operand")
+		     (match_operand 5 "const_4_to_7_operand")])))]
+  "TARGET_SSE2"
+{
+  int mask = 0;
+  mask |= (INTVAL (operands[2]) - 4) << 0;
+  mask |= (INTVAL (operands[3]) - 4) << 2;
+  mask |= (INTVAL (operands[4]) - 4) << 4;
+  mask |= (INTVAL (operands[5]) - 4) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "%vpshufhw\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_rep" "1")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_expand "sse2_loadd"
+  [(set (match_operand:V4SI 0 "register_operand")
+	(vec_merge:V4SI
+	  (vec_duplicate:V4SI
+	    (match_operand:SI 1 "nonimmediate_operand"))
+	  (match_dup 2)
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "operands[2] = CONST0_RTX (V4SImode);")
+
+(define_insn "sse2_loadld"
+  [(set (match_operand:V4SI 0 "register_operand"       "=x,Yi,x,x,x")
+	(vec_merge:V4SI
+	  (vec_duplicate:V4SI
+	    (match_operand:SI 2 "nonimmediate_operand" "m ,r ,m,x,x"))
+	  (match_operand:V4SI 1 "reg_or_0_operand"     "C ,C ,C,0,x")
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   %vmovd\t{%2, %0|%0, %2}
+   %vmovd\t{%2, %0|%0, %2}
+   movss\t{%2, %0|%0, %2}
+   movss\t{%2, %0|%0, %2}
+   vmovss\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "sse2,*,noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex,maybe_vex,orig,orig,vex")
+   (set_attr "mode" "TI,TI,V4SF,SF,SF")])
+
+(define_insn "*vec_extract<mode>"
+  [(set (match_operand:<ssescalarmode> 0 "nonimmediate_operand" "=r,m")
+	(vec_select:<ssescalarmode>
+	  (match_operand:VI12_128 1 "register_operand" "x,x")
+	  (parallel
+	    [(match_operand:SI 2 "const_0_to_<ssescalarnummask>_operand")])))]
+  "TARGET_SSE4_1"
+  "@
+   %vpextr<ssemodesuffix>\t{%2, %1, %k0|%k0, %1, %2}
+   %vpextr<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog1")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (and (eq_attr "alternative" "0")
+	    (eq (const_string "<MODE>mode") (const_string "V8HImode")))
+       (const_string "1")
+       (const_string "*")))
+   (set (attr "prefix_extra")
+     (if_then_else
+       (and (eq_attr "alternative" "0")
+	    (eq (const_string "<MODE>mode") (const_string "V8HImode")))
+       (const_string "*")
+       (const_string "1")))
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*vec_extractv8hi_sse2"
+  [(set (match_operand:HI 0 "register_operand" "=r")
+	(vec_select:HI
+	  (match_operand:V8HI 1 "register_operand" "x")
+	  (parallel
+	    [(match_operand:SI 2 "const_0_to_7_operand")])))]
+  "TARGET_SSE2 && !TARGET_SSE4_1"
+  "pextrw\t{%2, %1, %k0|%k0, %1, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*vec_extractv16qi_zext"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(zero_extend:SWI48
+	  (vec_select:QI
+	    (match_operand:V16QI 1 "register_operand" "x")
+	    (parallel
+	      [(match_operand:SI 2 "const_0_to_15_operand")]))))]
+  "TARGET_SSE4_1"
+  "%vpextrb\t{%2, %1, %k0|%k0, %1, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*vec_extractv8hi_zext"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+	(zero_extend:SWI48
+	  (vec_select:HI
+	    (match_operand:V8HI 1 "register_operand" "x")
+	    (parallel
+	      [(match_operand:SI 2 "const_0_to_7_operand")]))))]
+  "TARGET_SSE2"
+  "%vpextrw\t{%2, %1, %k0|%k0, %1, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*vec_extract<mode>_mem"
+  [(set (match_operand:<ssescalarmode> 0 "register_operand" "=r")
+	(vec_select:<ssescalarmode>
+	  (match_operand:VI12_128 1 "memory_operand" "o")
+	  (parallel
+	    [(match_operand 2 "const_0_to_<ssescalarnummask>_operand")])))]
+  "TARGET_SSE"
+  "#")
+
+(define_insn "*vec_extract<ssevecmodelower>_0"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand"	       "=r ,r,x ,m")
+	(vec_select:SWI48
+	  (match_operand:<ssevecmode> 1 "nonimmediate_operand" "mYj,x,xm,x")
+	  (parallel [(const_int 0)])))]
+  "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#"
+  [(set_attr "isa" "*,sse4,*,*")])
+
+(define_insn_and_split "*vec_extractv4si_0_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "register_operand" "x")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
+  "operands[1] = gen_rtx_REG (SImode, REGNO (operands[1]));")
+
+(define_insn "*vec_extractv2di_0_sse"
+  [(set (match_operand:DI 0 "nonimmediate_operand"     "=x,m")
+	(vec_select:DI
+	  (match_operand:V2DI 1 "nonimmediate_operand" "xm,x")
+	  (parallel [(const_int 0)])))]
+  "TARGET_SSE && !TARGET_64BIT
+   && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "#")
+
+(define_split
+  [(set (match_operand:SWI48x 0 "nonimmediate_operand")
+	(vec_select:SWI48x
+	  (match_operand:<ssevecmode> 1 "register_operand")
+	  (parallel [(const_int 0)])))]
+  "TARGET_SSE && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  "operands[1] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));")
+
+(define_insn "*vec_extractv4si"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,x,x")
+	(vec_select:SI
+	  (match_operand:V4SI 1 "register_operand" "x,0,x")
+	  (parallel [(match_operand:SI 2 "const_0_to_3_operand")])))]
+  "TARGET_SSE4_1"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "%vpextrd\t{%2, %1, %0|%0, %1, %2}";
+
+    case 1:
+      operands [2] = GEN_INT (INTVAL (operands[2]) * 4);
+      return "psrldq\t{%2, %0|%0, %2}";
+
+    case 2:
+      operands [2] = GEN_INT (INTVAL (operands[2]) * 4);
+      return "vpsrldq\t{%2, %1, %0|%0, %1, %2}";
+
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "*,noavx,avx")
+   (set_attr "type" "sselog1,sseishft1,sseishft1")
+   (set_attr "prefix_extra" "1,*,*")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex,orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*vec_extractv4si_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "register_operand" "x")
+	    (parallel [(match_operand:SI 2 "const_0_to_3_operand")]))))]
+  "TARGET_64BIT && TARGET_SSE4_1"
+  "%vpextrd\t{%2, %1, %k0|%k0, %1, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "*vec_extractv4si_mem"
+  [(set (match_operand:SI 0 "register_operand" "=x,r")
+	(vec_select:SI
+	  (match_operand:V4SI 1 "memory_operand" "o,o")
+	  (parallel [(match_operand 2 "const_0_to_3_operand")])))]
+  "TARGET_SSE"
+  "#")
+
+(define_insn_and_split "*vec_extractv4si_zext_mem"
+  [(set (match_operand:DI 0 "register_operand" "=x,r")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "memory_operand" "o,o")
+	    (parallel [(match_operand:SI 2 "const_0_to_3_operand")]))))]
+  "TARGET_64BIT && TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
+{
+  operands[1] = adjust_address (operands[1], SImode, INTVAL (operands[2]) * 4);
+})
+
+(define_insn "*vec_extractv2di_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand"     "=rm,m,x,x,x,x,r")
+	(vec_select:DI
+	  (match_operand:V2DI 1 "nonimmediate_operand"  "x ,x,0,x,x,o,o")
+	  (parallel [(const_int 1)])))]
+  "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "@
+   %vpextrq\t{$1, %1, %0|%0, %1, 1}
+   %vmovhps\t{%1, %0|%0, %1}
+   psrldq\t{$8, %0|%0, 8}
+   vpsrldq\t{$8, %1, %0|%0, %1, 8}
+   movhlps\t{%1, %0|%0, %1}
+   #
+   #"
+  [(set_attr "isa" "x64_sse4,*,sse2_noavx,avx,noavx,*,x64")
+   (set_attr "type" "sselog1,ssemov,sseishft1,sseishft1,ssemov,ssemov,imov")
+   (set_attr "length_immediate" "1,*,1,1,*,*,*")
+   (set_attr "prefix_rex" "1,*,*,*,*,*,*")
+   (set_attr "prefix_extra" "1,*,*,*,*,*,*")
+   (set_attr "prefix" "maybe_vex,maybe_vex,orig,vex,orig,*,*")
+   (set_attr "mode" "TI,V2SF,TI,TI,V4SF,DI,DI")])
+
+(define_split
+  [(set (match_operand:<ssescalarmode> 0 "register_operand")
+	(vec_select:<ssescalarmode>
+	  (match_operand:VI_128 1 "memory_operand")
+	  (parallel
+	    [(match_operand 2 "const_0_to_<ssescalarnummask>_operand")])))]
+  "TARGET_SSE && reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+{
+  int offs = INTVAL (operands[2]) * GET_MODE_SIZE (<ssescalarmode>mode);
+
+  operands[1] = adjust_address (operands[1], <ssescalarmode>mode, offs);
+})
+
+(define_insn "*vec_dupv4si"
+  [(set (match_operand:V4SI 0 "register_operand"     "=x,x,x")
+	(vec_duplicate:V4SI
+	  (match_operand:SI 1 "nonimmediate_operand" " x,m,0")))]
+  "TARGET_SSE"
+  "@
+   %vpshufd\t{$0, %1, %0|%0, %1, 0}
+   vbroadcastss\t{%1, %0|%0, %1}
+   shufps\t{$0, %0, %0|%0, %0, 0}"
+  [(set_attr "isa" "sse2,avx,noavx")
+   (set_attr "type" "sselog1,ssemov,sselog1")
+   (set_attr "length_immediate" "1,0,1")
+   (set_attr "prefix_extra" "0,1,*")
+   (set_attr "prefix" "maybe_vex,vex,orig")
+   (set_attr "mode" "TI,V4SF,V4SF")])
+
+(define_insn "*vec_dupv2di"
+  [(set (match_operand:V2DI 0 "register_operand"     "=x,x,x,x")
+	(vec_duplicate:V2DI
+	  (match_operand:DI 1 "nonimmediate_operand" " 0,x,m,0")))]
+  "TARGET_SSE"
+  "@
+   punpcklqdq\t%0, %0
+   vpunpcklqdq\t{%d1, %0|%0, %d1}
+   %vmovddup\t{%1, %0|%0, %1}
+   movlhps\t%0, %0"
+  [(set_attr "isa" "sse2_noavx,avx,sse3,noavx")
+   (set_attr "type" "sselog1,sselog1,sselog1,ssemov")
+   (set_attr "prefix" "orig,vex,maybe_vex,orig")
+   (set_attr "mode" "TI,TI,DF,V4SF")])
+
+(define_insn "*vec_concatv2si_sse4_1"
+  [(set (match_operand:V2SI 0 "register_operand"     "=x, x,x,x, x, *y,*y")
+	(vec_concat:V2SI
+	  (match_operand:SI 1 "nonimmediate_operand" " 0, x,0,x,rm,  0,rm")
+	  (match_operand:SI 2 "vector_move_operand"  "rm,rm,x,x, C,*ym, C")))]
+  "TARGET_SSE4_1"
+  "@
+   pinsrd\t{$1, %2, %0|%0, %2, 1}
+   vpinsrd\t{$1, %2, %1, %0|%0, %1, %2, 1}
+   punpckldq\t{%2, %0|%0, %2}
+   vpunpckldq\t{%2, %1, %0|%0, %1, %2}
+   %vmovd\t{%1, %0|%0, %1}
+   punpckldq\t{%2, %0|%0, %2}
+   movd\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "noavx,avx,noavx,avx,*,*,*")
+   (set_attr "type" "sselog,sselog,sselog,sselog,ssemov,mmxcvt,mmxmov")
+   (set_attr "prefix_extra" "1,1,*,*,*,*,*")
+   (set_attr "length_immediate" "1,1,*,*,*,*,*")
+   (set_attr "prefix" "orig,vex,orig,vex,maybe_vex,orig,orig")
+   (set_attr "mode" "TI,TI,TI,TI,TI,DI,DI")])
+
+;; ??? In theory we can match memory for the MMX alternative, but allowing
+;; nonimmediate_operand for operand 2 and *not* allowing memory for the SSE
+;; alternatives pretty much forces the MMX alternative to be chosen.
+(define_insn "*vec_concatv2si"
+  [(set (match_operand:V2SI 0 "register_operand"     "=x,x ,*y,x,x,*y,*y")
+	(vec_concat:V2SI
+	  (match_operand:SI 1 "nonimmediate_operand" " 0,rm,rm,0,m, 0,*rm")
+	  (match_operand:SI 2 "reg_or_0_operand"     " x,C ,C, x,C,*y,C")))]
+  "TARGET_SSE && !TARGET_SSE4_1"
+  "@
+   punpckldq\t{%2, %0|%0, %2}
+   movd\t{%1, %0|%0, %1}
+   movd\t{%1, %0|%0, %1}
+   unpcklps\t{%2, %0|%0, %2}
+   movss\t{%1, %0|%0, %1}
+   punpckldq\t{%2, %0|%0, %2}
+   movd\t{%1, %0|%0, %1}"
+  [(set_attr "isa" "sse2,sse2,sse2,*,*,*,*")
+   (set_attr "type" "sselog,ssemov,mmxmov,sselog,ssemov,mmxcvt,mmxmov")
+   (set_attr "mode" "TI,TI,DI,V4SF,SF,DI,DI")])
+
+(define_insn "*vec_concatv4si"
+  [(set (match_operand:V4SI 0 "register_operand"       "=x,x,x,x,x")
+	(vec_concat:V4SI
+	  (match_operand:V2SI 1 "register_operand"     " 0,x,0,0,x")
+	  (match_operand:V2SI 2 "nonimmediate_operand" " x,x,x,m,m")))]
+  "TARGET_SSE"
+  "@
+   punpcklqdq\t{%2, %0|%0, %2}
+   vpunpcklqdq\t{%2, %1, %0|%0, %1, %2}
+   movlhps\t{%2, %0|%0, %2}
+   movhps\t{%2, %0|%0, %q2}
+   vmovhps\t{%2, %1, %0|%0, %1, %q2}"
+  [(set_attr "isa" "sse2_noavx,avx,noavx,noavx,avx")
+   (set_attr "type" "sselog,sselog,ssemov,ssemov,ssemov")
+   (set_attr "prefix" "orig,vex,orig,orig,vex")
+   (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
+
+;; movd instead of movq is required to handle broken assemblers.
+(define_insn "vec_concatv2di"
+  [(set (match_operand:V2DI 0 "register_operand"
+	  "=x,x ,Yi,x ,!x,x,x,x,x,x")
+	(vec_concat:V2DI
+	  (match_operand:DI 1 "nonimmediate_operand"
+	  " 0,x ,r ,xm,*y,0,x,0,0,x")
+	  (match_operand:DI 2 "vector_move_operand"
+	  "rm,rm,C ,C ,C ,x,x,x,m,m")))]
+  "TARGET_SSE"
+  "@
+   pinsrq\t{$1, %2, %0|%0, %2, 1}
+   vpinsrq\t{$1, %2, %1, %0|%0, %1, %2, 1}
+   * return HAVE_AS_IX86_INTERUNIT_MOVQ ? \"%vmovq\t{%1, %0|%0, %1}\" : \"%vmovd\t{%1, %0|%0, %1}\";
+   %vmovq\t{%1, %0|%0, %1}
+   movq2dq\t{%1, %0|%0, %1}
+   punpcklqdq\t{%2, %0|%0, %2}
+   vpunpcklqdq\t{%2, %1, %0|%0, %1, %2}
+   movlhps\t{%2, %0|%0, %2}
+   movhps\t{%2, %0|%0, %2}
+   vmovhps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "x64_sse4_noavx,x64_avx,x64,sse2,sse2,sse2_noavx,avx,noavx,noavx,avx")
+   (set (attr "type")
+     (if_then_else
+       (eq_attr "alternative" "0,1,5,6")
+       (const_string "sselog")
+       (const_string "ssemov")))
+   (set_attr "prefix_rex" "1,1,1,*,*,*,*,*,*,*")
+   (set_attr "prefix_extra" "1,1,*,*,*,*,*,*,*,*")
+   (set_attr "length_immediate" "1,1,*,*,*,*,*,*,*,*")
+   (set_attr "prefix" "orig,vex,maybe_vex,maybe_vex,orig,orig,vex,orig,orig,vex")
+   (set_attr "mode" "TI,TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
+
+(define_expand "vec_unpacks_lo_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand")
+   (match_operand:VI124_AVX512F 1 "register_operand")]
+  "TARGET_SSE2"
+  "ix86_expand_sse_unpack (operands[0], operands[1], false, false); DONE;")
+
+(define_expand "vec_unpacks_hi_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand")
+   (match_operand:VI124_AVX512F 1 "register_operand")]
+  "TARGET_SSE2"
+  "ix86_expand_sse_unpack (operands[0], operands[1], false, true); DONE;")
+
+(define_expand "vec_unpacku_lo_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand")
+   (match_operand:VI124_AVX512F 1 "register_operand")]
+  "TARGET_SSE2"
+  "ix86_expand_sse_unpack (operands[0], operands[1], true, false); DONE;")
+
+(define_expand "vec_unpacku_hi_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand")
+   (match_operand:VI124_AVX512F 1 "register_operand")]
+  "TARGET_SSE2"
+  "ix86_expand_sse_unpack (operands[0], operands[1], true, true); DONE;")
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Miscellaneous
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "<sse2_avx2>_uavg<mode>3"
+  [(set (match_operand:VI12_AVX2 0 "register_operand")
+	(truncate:VI12_AVX2
+	  (lshiftrt:<ssedoublemode>
+	    (plus:<ssedoublemode>
+	      (plus:<ssedoublemode>
+		(zero_extend:<ssedoublemode>
+		  (match_operand:VI12_AVX2 1 "nonimmediate_operand"))
+		(zero_extend:<ssedoublemode>
+		  (match_operand:VI12_AVX2 2 "nonimmediate_operand")))
+	      (match_dup 3))
+	    (const_int 1))))]
+  "TARGET_SSE2"
+{
+  operands[3] = CONST1_RTX(<MODE>mode);
+  ix86_fixup_binary_operands_no_copy (PLUS, <MODE>mode, operands);
+})
+
+(define_insn "*<sse2_avx2>_uavg<mode>3"
+  [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,x")
+	(truncate:VI12_AVX2
+	  (lshiftrt:<ssedoublemode>
+	    (plus:<ssedoublemode>
+	      (plus:<ssedoublemode>
+		(zero_extend:<ssedoublemode>
+		  (match_operand:VI12_AVX2 1 "nonimmediate_operand" "%0,x"))
+		(zero_extend:<ssedoublemode>
+		  (match_operand:VI12_AVX2 2 "nonimmediate_operand" "xm,xm")))
+	      (match_operand:VI12_AVX2 3 "const1_operand"))
+	    (const_int 1))))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+  "@
+   pavg<ssemodesuffix>\t{%2, %0|%0, %2}
+   vpavg<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; The correct representation for this is absolutely enormous, and
+;; surely not generally useful.
+(define_insn "<sse2_avx2>_psadbw"
+  [(set (match_operand:VI8_AVX2 0 "register_operand" "=x,x")
+	(unspec:VI8_AVX2
+	  [(match_operand:<ssebytemode> 1 "register_operand" "0,x")
+	   (match_operand:<ssebytemode> 2 "nonimmediate_operand" "xm,xm")]
+	  UNSPEC_PSADBW))]
+  "TARGET_SSE2"
+  "@
+   psadbw\t{%2, %0|%0, %2}
+   vpsadbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "atom_unit" "simul")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<sse>_movmsk<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI
+	  [(match_operand:VF_128_256 1 "register_operand" "x")]
+	  UNSPEC_MOVMSK))]
+  "TARGET_SSE"
+  "%vmovmsk<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx2_pmovmskb"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:V32QI 1 "register_operand" "x")]
+		   UNSPEC_MOVMSK))]
+  "TARGET_AVX2"
+  "vpmovmskb\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "DI")])
+
+(define_insn "sse2_pmovmskb"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(unspec:SI [(match_operand:V16QI 1 "register_operand" "x")]
+		   UNSPEC_MOVMSK))]
+  "TARGET_SSE2"
+  "%vpmovmskb\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "SI")])
+
+(define_expand "sse2_maskmovdqu"
+  [(set (match_operand:V16QI 0 "memory_operand")
+	(unspec:V16QI [(match_operand:V16QI 1 "register_operand")
+		       (match_operand:V16QI 2 "register_operand")
+		       (match_dup 0)]
+		      UNSPEC_MASKMOV))]
+  "TARGET_SSE2")
+
+(define_insn "*sse2_maskmovdqu"
+  [(set (mem:V16QI (match_operand:P 0 "register_operand" "D"))
+	(unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x")
+		       (match_operand:V16QI 2 "register_operand" "x")
+		       (mem:V16QI (match_dup 0))]
+		      UNSPEC_MASKMOV))]
+  "TARGET_SSE2"
+{
+  /* We can't use %^ here due to ASM_OUTPUT_OPCODE processing
+     that requires %v to be at the beginning of the opcode name.  */
+  if (Pmode != word_mode)
+    fputs ("\taddr32", asm_out_file);
+  return "%vmaskmovdqu\t{%2, %1|%1, %2}";
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_data16" "1")
+   (set (attr "length_address")
+     (symbol_ref ("Pmode != word_mode")))
+   ;; The implicit %rdi operand confuses default length_vex computation.
+   (set (attr "length_vex")
+     (symbol_ref ("3 + REX_SSE_REGNO_P (REGNO (operands[2]))")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse_ldmxcsr"
+  [(unspec_volatile [(match_operand:SI 0 "memory_operand" "m")]
+		    UNSPECV_LDMXCSR)]
+  "TARGET_SSE"
+  "%vldmxcsr\t%0"
+  [(set_attr "type" "sse")
+   (set_attr "atom_sse_attr" "mxcsr")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "memory" "load")])
+
+(define_insn "sse_stmxcsr"
+  [(set (match_operand:SI 0 "memory_operand" "=m")
+	(unspec_volatile:SI [(const_int 0)] UNSPECV_STMXCSR))]
+  "TARGET_SSE"
+  "%vstmxcsr\t%0"
+  [(set_attr "type" "sse")
+   (set_attr "atom_sse_attr" "mxcsr")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "memory" "store")])
+
+(define_insn "sse2_clflush"
+  [(unspec_volatile [(match_operand 0 "address_operand" "p")]
+		    UNSPECV_CLFLUSH)]
+  "TARGET_SSE2"
+  "clflush\t%a0"
+  [(set_attr "type" "sse")
+   (set_attr "atom_sse_attr" "fence")
+   (set_attr "memory" "unknown")])
+
+
+(define_insn "sse3_mwait"
+  [(unspec_volatile [(match_operand:SI 0 "register_operand" "a")
+		     (match_operand:SI 1 "register_operand" "c")]
+		    UNSPECV_MWAIT)]
+  "TARGET_SSE3"
+;; 64bit version is "mwait %rax,%rcx". But only lower 32bits are used.
+;; Since 32bit register operands are implicitly zero extended to 64bit,
+;; we only need to set up 32bit registers.
+  "mwait"
+  [(set_attr "length" "3")])
+
+(define_insn "sse3_monitor_<mode>"
+  [(unspec_volatile [(match_operand:P 0 "register_operand" "a")
+		     (match_operand:SI 1 "register_operand" "c")
+		     (match_operand:SI 2 "register_operand" "d")]
+		    UNSPECV_MONITOR)]
+  "TARGET_SSE3"
+;; 64bit version is "monitor %rax,%rcx,%rdx". But only lower 32bits in
+;; RCX and RDX are used.  Since 32bit register operands are implicitly
+;; zero extended to 64bit, we only need to set up 32bit registers.
+  "%^monitor"
+  [(set (attr "length")
+     (symbol_ref ("(Pmode != word_mode) + 3")))])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; SSSE3 instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_code_iterator ssse3_plusminus [plus ss_plus minus ss_minus])
+
+(define_insn "avx2_ph<plusminus_mnemonic>wv16hi3"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(vec_concat:V16HI
+	  (vec_concat:V8HI
+	    (vec_concat:V4HI
+	      (vec_concat:V2HI
+		(ssse3_plusminus:HI
+		  (vec_select:HI
+		    (match_operand:V16HI 1 "register_operand" "x")
+		    (parallel [(const_int 0)]))
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 1)])))
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 2)]))
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 3)]))))
+	      (vec_concat:V2HI
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 4)]))
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 5)])))
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 6)]))
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 7)])))))
+	    (vec_concat:V4HI
+	      (vec_concat:V2HI
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 8)]))
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 9)])))
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 10)]))
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 11)]))))
+	      (vec_concat:V2HI
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 12)]))
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 13)])))
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 14)]))
+		  (vec_select:HI (match_dup 1) (parallel [(const_int 15)]))))))
+	  (vec_concat:V8HI
+	    (vec_concat:V4HI
+	      (vec_concat:V2HI
+		(ssse3_plusminus:HI
+		  (vec_select:HI
+		    (match_operand:V16HI 2 "nonimmediate_operand" "xm")
+		    (parallel [(const_int 0)]))
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 1)])))
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 2)]))
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))
+	      (vec_concat:V2HI
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 4)]))
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 5)])))
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 6)]))
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))
+	    (vec_concat:V4HI
+	      (vec_concat:V2HI
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 8)]))
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 9)])))
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 10)]))
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 11)]))))
+	      (vec_concat:V2HI
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 12)]))
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 13)])))
+		(ssse3_plusminus:HI
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 14)]))
+		  (vec_select:HI (match_dup 2) (parallel [(const_int 15)]))))))))]
+  "TARGET_AVX2"
+  "vph<plusminus_mnemonic>w\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "ssse3_ph<plusminus_mnemonic>wv8hi3"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
+	(vec_concat:V8HI
+	  (vec_concat:V4HI
+	    (vec_concat:V2HI
+	      (ssse3_plusminus:HI
+		(vec_select:HI
+		  (match_operand:V8HI 1 "register_operand" "0,x")
+		  (parallel [(const_int 0)]))
+		(vec_select:HI (match_dup 1) (parallel [(const_int 1)])))
+	      (ssse3_plusminus:HI
+		(vec_select:HI (match_dup 1) (parallel [(const_int 2)]))
+		(vec_select:HI (match_dup 1) (parallel [(const_int 3)]))))
+	    (vec_concat:V2HI
+	      (ssse3_plusminus:HI
+		(vec_select:HI (match_dup 1) (parallel [(const_int 4)]))
+		(vec_select:HI (match_dup 1) (parallel [(const_int 5)])))
+	      (ssse3_plusminus:HI
+		(vec_select:HI (match_dup 1) (parallel [(const_int 6)]))
+		(vec_select:HI (match_dup 1) (parallel [(const_int 7)])))))
+	  (vec_concat:V4HI
+	    (vec_concat:V2HI
+	      (ssse3_plusminus:HI
+		(vec_select:HI
+		  (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm")
+		  (parallel [(const_int 0)]))
+		(vec_select:HI (match_dup 2) (parallel [(const_int 1)])))
+	      (ssse3_plusminus:HI
+		(vec_select:HI (match_dup 2) (parallel [(const_int 2)]))
+		(vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))
+	    (vec_concat:V2HI
+	      (ssse3_plusminus:HI
+		(vec_select:HI (match_dup 2) (parallel [(const_int 4)]))
+		(vec_select:HI (match_dup 2) (parallel [(const_int 5)])))
+	      (ssse3_plusminus:HI
+		(vec_select:HI (match_dup 2) (parallel [(const_int 6)]))
+		(vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))]
+  "TARGET_SSSE3"
+  "@
+   ph<plusminus_mnemonic>w\t{%2, %0|%0, %2}
+   vph<plusminus_mnemonic>w\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "atom_unit" "complex")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "ssse3_ph<plusminus_mnemonic>wv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(vec_concat:V4HI
+	  (vec_concat:V2HI
+	    (ssse3_plusminus:HI
+	      (vec_select:HI
+		(match_operand:V4HI 1 "register_operand" "0")
+		(parallel [(const_int 0)]))
+	      (vec_select:HI (match_dup 1) (parallel [(const_int 1)])))
+	    (ssse3_plusminus:HI
+	      (vec_select:HI (match_dup 1) (parallel [(const_int 2)]))
+	      (vec_select:HI (match_dup 1) (parallel [(const_int 3)]))))
+	  (vec_concat:V2HI
+	    (ssse3_plusminus:HI
+	      (vec_select:HI
+		(match_operand:V4HI 2 "nonimmediate_operand" "ym")
+		(parallel [(const_int 0)]))
+	      (vec_select:HI (match_dup 2) (parallel [(const_int 1)])))
+	    (ssse3_plusminus:HI
+	      (vec_select:HI (match_dup 2) (parallel [(const_int 2)]))
+	      (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))))]
+  "TARGET_SSSE3"
+  "ph<plusminus_mnemonic>w\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "atom_unit" "complex")
+   (set_attr "prefix_extra" "1")
+   (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
+   (set_attr "mode" "DI")])
+
+(define_insn "avx2_ph<plusminus_mnemonic>dv8si3"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+	(vec_concat:V8SI
+	  (vec_concat:V4SI
+	    (vec_concat:V2SI
+	      (plusminus:SI
+		(vec_select:SI
+		  (match_operand:V8SI 1 "register_operand" "x")
+		  (parallel [(const_int 0)]))
+		(vec_select:SI (match_dup 1) (parallel [(const_int 1)])))
+	      (plusminus:SI
+		(vec_select:SI (match_dup 1) (parallel [(const_int 2)]))
+		(vec_select:SI (match_dup 1) (parallel [(const_int 3)]))))
+	    (vec_concat:V2SI
+	      (plusminus:SI
+		(vec_select:SI (match_dup 1) (parallel [(const_int 4)]))
+		(vec_select:SI (match_dup 1) (parallel [(const_int 5)])))
+	      (plusminus:SI
+		(vec_select:SI (match_dup 1) (parallel [(const_int 6)]))
+		(vec_select:SI (match_dup 1) (parallel [(const_int 7)])))))
+	  (vec_concat:V4SI
+	    (vec_concat:V2SI
+	      (plusminus:SI
+		(vec_select:SI
+		  (match_operand:V8SI 2 "nonimmediate_operand" "xm")
+		  (parallel [(const_int 0)]))
+		(vec_select:SI (match_dup 2) (parallel [(const_int 1)])))
+	      (plusminus:SI
+		(vec_select:SI (match_dup 2) (parallel [(const_int 2)]))
+		(vec_select:SI (match_dup 2) (parallel [(const_int 3)]))))
+	    (vec_concat:V2SI
+	      (plusminus:SI
+		(vec_select:SI (match_dup 2) (parallel [(const_int 4)]))
+		(vec_select:SI (match_dup 2) (parallel [(const_int 5)])))
+	      (plusminus:SI
+		(vec_select:SI (match_dup 2) (parallel [(const_int 6)]))
+		(vec_select:SI (match_dup 2) (parallel [(const_int 7)])))))))]
+  "TARGET_AVX2"
+  "vph<plusminus_mnemonic>d\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "ssse3_ph<plusminus_mnemonic>dv4si3"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x")
+	(vec_concat:V4SI
+	  (vec_concat:V2SI
+	    (plusminus:SI
+	      (vec_select:SI
+		(match_operand:V4SI 1 "register_operand" "0,x")
+		(parallel [(const_int 0)]))
+	      (vec_select:SI (match_dup 1) (parallel [(const_int 1)])))
+	    (plusminus:SI
+	      (vec_select:SI (match_dup 1) (parallel [(const_int 2)]))
+	      (vec_select:SI (match_dup 1) (parallel [(const_int 3)]))))
+	  (vec_concat:V2SI
+	    (plusminus:SI
+	      (vec_select:SI
+		(match_operand:V4SI 2 "nonimmediate_operand" "xm,xm")
+		(parallel [(const_int 0)]))
+	      (vec_select:SI (match_dup 2) (parallel [(const_int 1)])))
+	    (plusminus:SI
+	      (vec_select:SI (match_dup 2) (parallel [(const_int 2)]))
+	      (vec_select:SI (match_dup 2) (parallel [(const_int 3)]))))))]
+  "TARGET_SSSE3"
+  "@
+   ph<plusminus_mnemonic>d\t{%2, %0|%0, %2}
+   vph<plusminus_mnemonic>d\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "atom_unit" "complex")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "ssse3_ph<plusminus_mnemonic>dv2si3"
+  [(set (match_operand:V2SI 0 "register_operand" "=y")
+	(vec_concat:V2SI
+	  (plusminus:SI
+	    (vec_select:SI
+	      (match_operand:V2SI 1 "register_operand" "0")
+	      (parallel [(const_int 0)]))
+	    (vec_select:SI (match_dup 1) (parallel [(const_int 1)])))
+	  (plusminus:SI
+	    (vec_select:SI
+	      (match_operand:V2SI 2 "nonimmediate_operand" "ym")
+	      (parallel [(const_int 0)]))
+	    (vec_select:SI (match_dup 2) (parallel [(const_int 1)])))))]
+  "TARGET_SSSE3"
+  "ph<plusminus_mnemonic>d\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "atom_unit" "complex")
+   (set_attr "prefix_extra" "1")
+   (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
+   (set_attr "mode" "DI")])
+
+(define_insn "avx2_pmaddubsw256"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(ss_plus:V16HI
+	  (mult:V16HI
+	    (zero_extend:V16HI
+	      (vec_select:V16QI
+		(match_operand:V32QI 1 "register_operand" "x")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)
+			   (const_int 8) (const_int 10)
+			   (const_int 12) (const_int 14)
+			   (const_int 16) (const_int 18)
+			   (const_int 20) (const_int 22)
+			   (const_int 24) (const_int 26)
+			   (const_int 28) (const_int 30)])))
+	    (sign_extend:V16HI
+	      (vec_select:V16QI
+		(match_operand:V32QI 2 "nonimmediate_operand" "xm")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)
+			   (const_int 8) (const_int 10)
+			   (const_int 12) (const_int 14)
+			   (const_int 16) (const_int 18)
+			   (const_int 20) (const_int 22)
+			   (const_int 24) (const_int 26)
+			   (const_int 28) (const_int 30)]))))
+	  (mult:V16HI
+	    (zero_extend:V16HI
+	      (vec_select:V16QI (match_dup 1)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)
+			   (const_int 9) (const_int 11)
+			   (const_int 13) (const_int 15)
+			   (const_int 17) (const_int 19)
+			   (const_int 21) (const_int 23)
+			   (const_int 25) (const_int 27)
+			   (const_int 29) (const_int 31)])))
+	    (sign_extend:V16HI
+	      (vec_select:V16QI (match_dup 2)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)
+			   (const_int 9) (const_int 11)
+			   (const_int 13) (const_int 15)
+			   (const_int 17) (const_int 19)
+			   (const_int 21) (const_int 23)
+			   (const_int 25) (const_int 27)
+			   (const_int 29) (const_int 31)]))))))]
+  "TARGET_AVX2"
+  "vpmaddubsw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "ssse3_pmaddubsw128"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
+	(ss_plus:V8HI
+	  (mult:V8HI
+	    (zero_extend:V8HI
+	      (vec_select:V8QI
+		(match_operand:V16QI 1 "register_operand" "0,x")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)
+			   (const_int 8) (const_int 10)
+			   (const_int 12) (const_int 14)])))
+	    (sign_extend:V8HI
+	      (vec_select:V8QI
+		(match_operand:V16QI 2 "nonimmediate_operand" "xm,xm")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)
+			   (const_int 8) (const_int 10)
+			   (const_int 12) (const_int 14)]))))
+	  (mult:V8HI
+	    (zero_extend:V8HI
+	      (vec_select:V8QI (match_dup 1)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)
+			   (const_int 9) (const_int 11)
+			   (const_int 13) (const_int 15)])))
+	    (sign_extend:V8HI
+	      (vec_select:V8QI (match_dup 2)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)
+			   (const_int 9) (const_int 11)
+			   (const_int 13) (const_int 15)]))))))]
+  "TARGET_SSSE3"
+  "@
+   pmaddubsw\t{%2, %0|%0, %2}
+   vpmaddubsw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseiadd")
+   (set_attr "atom_unit" "simul")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "ssse3_pmaddubsw"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(ss_plus:V4HI
+	  (mult:V4HI
+	    (zero_extend:V4HI
+	      (vec_select:V4QI
+		(match_operand:V8QI 1 "register_operand" "0")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)])))
+	    (sign_extend:V4HI
+	      (vec_select:V4QI
+		(match_operand:V8QI 2 "nonimmediate_operand" "ym")
+		(parallel [(const_int 0) (const_int 2)
+			   (const_int 4) (const_int 6)]))))
+	  (mult:V4HI
+	    (zero_extend:V4HI
+	      (vec_select:V4QI (match_dup 1)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)])))
+	    (sign_extend:V4HI
+	      (vec_select:V4QI (match_dup 2)
+		(parallel [(const_int 1) (const_int 3)
+			   (const_int 5) (const_int 7)]))))))]
+  "TARGET_SSSE3"
+  "pmaddubsw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "atom_unit" "simul")
+   (set_attr "prefix_extra" "1")
+   (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
+   (set_attr "mode" "DI")])
+
+(define_mode_iterator PMULHRSW
+  [V4HI V8HI (V16HI "TARGET_AVX2")])
+
+(define_expand "<ssse3_avx2>_pmulhrsw<mode>3"
+  [(set (match_operand:PMULHRSW 0 "register_operand")
+	(truncate:PMULHRSW
+	  (lshiftrt:<ssedoublemode>
+	    (plus:<ssedoublemode>
+	      (lshiftrt:<ssedoublemode>
+		(mult:<ssedoublemode>
+		  (sign_extend:<ssedoublemode>
+		    (match_operand:PMULHRSW 1 "nonimmediate_operand"))
+		  (sign_extend:<ssedoublemode>
+		    (match_operand:PMULHRSW 2 "nonimmediate_operand")))
+		(const_int 14))
+	      (match_dup 3))
+	    (const_int 1))))]
+  "TARGET_AVX2"
+{
+  operands[3] = CONST1_RTX(<MODE>mode);
+  ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
+})
+
+(define_insn "*<ssse3_avx2>_pmulhrsw<mode>3"
+  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x")
+	(truncate:VI2_AVX2
+	  (lshiftrt:<ssedoublemode>
+	    (plus:<ssedoublemode>
+	      (lshiftrt:<ssedoublemode>
+		(mult:<ssedoublemode>
+		  (sign_extend:<ssedoublemode>
+		    (match_operand:VI2_AVX2 1 "nonimmediate_operand" "%0,x"))
+		  (sign_extend:<ssedoublemode>
+		    (match_operand:VI2_AVX2 2 "nonimmediate_operand" "xm,xm")))
+		(const_int 14))
+	      (match_operand:VI2_AVX2 3 "const1_operand"))
+	    (const_int 1))))]
+  "TARGET_SSSE3 && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
+  "@
+   pmulhrsw\t{%2, %0|%0, %2}
+   vpmulhrsw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseimul")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*ssse3_pmulhrswv4hi3"
+  [(set (match_operand:V4HI 0 "register_operand" "=y")
+	(truncate:V4HI
+	  (lshiftrt:V4SI
+	    (plus:V4SI
+	      (lshiftrt:V4SI
+		(mult:V4SI
+		  (sign_extend:V4SI
+		    (match_operand:V4HI 1 "nonimmediate_operand" "%0"))
+		  (sign_extend:V4SI
+		    (match_operand:V4HI 2 "nonimmediate_operand" "ym")))
+		(const_int 14))
+	      (match_operand:V4HI 3 "const1_operand"))
+	    (const_int 1))))]
+  "TARGET_SSSE3 && ix86_binary_operator_ok (MULT, V4HImode, operands)"
+  "pmulhrsw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
+   (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
+   (set_attr "mode" "DI")])
+
+(define_insn "<ssse3_avx2>_pshufb<mode>3"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+	(unspec:VI1_AVX2
+	  [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
+	   (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+	  UNSPEC_PSHUFB))]
+  "TARGET_SSSE3"
+  "@
+   pshufb\t{%2, %0|%0, %2}
+   vpshufb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "btver2_decode" "vector,vector")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "ssse3_pshufbv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand" "=y")
+	(unspec:V8QI [(match_operand:V8QI 1 "register_operand" "0")
+		      (match_operand:V8QI 2 "nonimmediate_operand" "ym")]
+		     UNSPEC_PSHUFB))]
+  "TARGET_SSSE3"
+  "pshufb\t{%2, %0|%0, %2}";
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
+   (set_attr "mode" "DI")])
+
+(define_insn "<ssse3_avx2>_psign<mode>3"
+  [(set (match_operand:VI124_AVX2 0 "register_operand" "=x,x")
+	(unspec:VI124_AVX2
+	  [(match_operand:VI124_AVX2 1 "register_operand" "0,x")
+	   (match_operand:VI124_AVX2 2 "nonimmediate_operand" "xm,xm")]
+	  UNSPEC_PSIGN))]
+  "TARGET_SSSE3"
+  "@
+   psign<ssemodesuffix>\t{%2, %0|%0, %2}
+   vpsign<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "ssse3_psign<mode>3"
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
+	(unspec:MMXMODEI
+	  [(match_operand:MMXMODEI 1 "register_operand" "0")
+	   (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")]
+	  UNSPEC_PSIGN))]
+  "TARGET_SSSE3"
+  "psign<mmxvecsize>\t{%2, %0|%0, %2}";
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
+   (set_attr "mode" "DI")])
+
+(define_insn "<ssse3_avx2>_palignr<mode>"
+  [(set (match_operand:SSESCALARMODE 0 "register_operand" "=x,x")
+	(unspec:SSESCALARMODE
+	  [(match_operand:SSESCALARMODE 1 "register_operand" "0,x")
+	   (match_operand:SSESCALARMODE 2 "nonimmediate_operand" "xm,xm")
+	   (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n,n")]
+	  UNSPEC_PALIGNR))]
+  "TARGET_SSSE3"
+{
+  operands[3] = GEN_INT (INTVAL (operands[3]) / 8);
+
+  switch (which_alternative)
+    {
+    case 0:
+      return "palignr\t{%3, %2, %0|%0, %2, %3}";
+    case 1:
+      return "vpalignr\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseishft")
+   (set_attr "atom_unit" "sishuf")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "ssse3_palignrdi"
+  [(set (match_operand:DI 0 "register_operand" "=y")
+	(unspec:DI [(match_operand:DI 1 "register_operand" "0")
+		    (match_operand:DI 2 "nonimmediate_operand" "ym")
+		    (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")]
+		   UNSPEC_PALIGNR))]
+  "TARGET_SSSE3"
+{
+  operands[3] = GEN_INT (INTVAL (operands[3]) / 8);
+  return "palignr\t{%3, %2, %0|%0, %2, %3}";
+}
+  [(set_attr "type" "sseishft")
+   (set_attr "atom_unit" "sishuf")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
+   (set_attr "mode" "DI")])
+
+(define_insn "<mask_codefor>abs<mode>2<mask_name>"
+  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v")
+	(abs:VI124_AVX2_48_AVX512F
+	  (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))]
+  "TARGET_SSSE3 && <mask_mode512bit_condition>"
+  "%vpabs<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "abs<mode>2"
+  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand")
+	(abs:VI124_AVX2_48_AVX512F
+	  (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+{
+  if (!TARGET_SSSE3)
+    {
+      ix86_expand_sse2_abs (operands[0], operands[1]);
+      DONE;
+    }
+})
+
+(define_insn "abs<mode>2"
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
+	(abs:MMXMODEI
+	  (match_operand:MMXMODEI 1 "nonimmediate_operand" "ym")))]
+  "TARGET_SSSE3"
+  "pabs<mmxvecsize>\t{%1, %0|%0, %1}";
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_rep" "0")
+   (set_attr "prefix_extra" "1")
+   (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
+   (set_attr "mode" "DI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; AMD SSE4A instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "sse4a_movnt<mode>"
+  [(set (match_operand:MODEF 0 "memory_operand" "=m")
+	(unspec:MODEF
+	  [(match_operand:MODEF 1 "register_operand" "x")]
+	  UNSPEC_MOVNT))]
+  "TARGET_SSE4A"
+  "movnt<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "sse4a_vmmovnt<mode>"
+  [(set (match_operand:<ssescalarmode> 0 "memory_operand" "=m")
+	(unspec:<ssescalarmode>
+	  [(vec_select:<ssescalarmode>
+	     (match_operand:VF_128 1 "register_operand" "x")
+	     (parallel [(const_int 0)]))]
+	  UNSPEC_MOVNT))]
+  "TARGET_SSE4A"
+  "movnt<ssescalarmodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "sse4a_extrqi"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+		      (match_operand 2 "const_0_to_255_operand")
+		      (match_operand 3 "const_0_to_255_operand")]
+		     UNSPEC_EXTRQI))]
+  "TARGET_SSE4A"
+  "extrq\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix_data16" "1")
+   (set_attr "length_immediate" "2")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4a_extrq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+		      (match_operand:V16QI 2 "register_operand" "x")]
+		     UNSPEC_EXTRQ))]
+  "TARGET_SSE4A"
+  "extrq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4a_insertqi"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+		      (match_operand:V2DI 2 "register_operand" "x")
+		      (match_operand 3 "const_0_to_255_operand")
+		      (match_operand 4 "const_0_to_255_operand")]
+		     UNSPEC_INSERTQI))]
+  "TARGET_SSE4A"
+  "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}"
+  [(set_attr "type" "sseins")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix_rep" "1")
+   (set_attr "length_immediate" "2")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4a_insertq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+		      (match_operand:V2DI 2 "register_operand" "x")]
+		     UNSPEC_INSERTQ))]
+  "TARGET_SSE4A"
+  "insertq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseins")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "TI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Intel SSE4.1 instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "<sse4_1>_blend<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x,x")
+	(vec_merge:VF_128_256
+	  (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")
+	  (match_operand:VF_128_256 1 "register_operand" "0,x")
+	  (match_operand:SI 3 "const_0_to_<blendbits>_operand")))]
+  "TARGET_SSE4_1"
+  "@
+   blend<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vblend<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x,x")
+	(unspec:VF_128_256
+	  [(match_operand:VF_128_256 1 "register_operand" "0,x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")
+	   (match_operand:VF_128_256 3 "register_operand" "Yz,x")]
+	  UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "@
+   blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vblendv<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "btver2_decode" "vector,vector") 
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x,x")
+	(unspec:VF_128_256
+	  [(match_operand:VF_128_256 1 "nonimmediate_operand" "%0,x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "xm,xm")
+	   (match_operand:SI 3 "const_0_to_255_operand" "n,n")]
+	  UNSPEC_DP))]
+  "TARGET_SSE4_1"
+  "@
+   dp<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vdp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemul")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "btver2_decode" "vector,vector")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse4_1_avx2>_movntdqa"
+  [(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand" "=x, v")
+	(unspec:VI8_AVX2_AVX512F [(match_operand:VI8_AVX2_AVX512F 1 "memory_operand" "m, m")]
+		     UNSPEC_MOVNTDQA))]
+  "TARGET_SSE4_1"
+  "%vmovntdqa\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1, *")
+   (set_attr "prefix" "maybe_vex, evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<sse4_1_avx2>_mpsadbw"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+	(unspec:VI1_AVX2
+	  [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
+	   (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")
+	   (match_operand:SI 3 "const_0_to_255_operand" "n,n")]
+	  UNSPEC_MPSADBW))]
+  "TARGET_SSE4_1"
+  "@
+   mpsadbw\t{%3, %2, %0|%0, %2, %3}
+   vmpsadbw\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "btver2_decode" "vector,vector")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx2_packusdw"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(vec_concat:V16HI
+	  (us_truncate:V8HI
+	    (match_operand:V8SI 1 "register_operand" "x"))
+	  (us_truncate:V8HI
+	    (match_operand:V8SI 2 "nonimmediate_operand" "xm"))))]
+  "TARGET_AVX2"
+  "vpackusdw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "sse4_1_packusdw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
+	(vec_concat:V8HI
+	  (us_truncate:V4HI
+	    (match_operand:V4SI 1 "register_operand" "0,x"))
+	  (us_truncate:V4HI
+	    (match_operand:V4SI 2 "nonimmediate_operand" "xm,xm"))))]
+  "TARGET_SSE4_1"
+  "@
+   packusdw\t{%2, %0|%0, %2}
+   vpackusdw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "<sse4_1_avx2>_pblendvb"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+	(unspec:VI1_AVX2
+	  [(match_operand:VI1_AVX2 1 "register_operand"  "0,x")
+	   (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")
+	   (match_operand:VI1_AVX2 3 "register_operand" "Yz,x")]
+	  UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "@
+   pblendvb\t{%3, %2, %0|%0, %2, %3}
+   vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "*,1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "btver2_decode" "vector,vector")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "sse4_1_pblendw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
+	(vec_merge:V8HI
+	  (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm")
+	  (match_operand:V8HI 1 "register_operand" "0,x")
+	  (match_operand:SI 3 "const_0_to_255_operand" "n,n")))]
+  "TARGET_SSE4_1"
+  "@
+   pblendw\t{%3, %2, %0|%0, %2, %3}
+   vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+;; The builtin uses an 8-bit immediate.  Expand that.
+(define_expand "avx2_pblendw"
+  [(set (match_operand:V16HI 0 "register_operand")
+	(vec_merge:V16HI
+	  (match_operand:V16HI 2 "nonimmediate_operand")
+	  (match_operand:V16HI 1 "register_operand")
+	  (match_operand:SI 3 "const_0_to_255_operand")))]
+  "TARGET_AVX2"
+{
+  HOST_WIDE_INT val = INTVAL (operands[3]) & 0xff;
+  operands[3] = GEN_INT (val << 8 | val);
+})
+
+(define_insn "*avx2_pblendw"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(vec_merge:V16HI
+	  (match_operand:V16HI 2 "nonimmediate_operand" "xm")
+	  (match_operand:V16HI 1 "register_operand" "x")
+	  (match_operand:SI 3 "avx2_pblendw_operand" "n")))]
+  "TARGET_AVX2"
+{
+  operands[3] = GEN_INT (INTVAL (operands[3]) & 0xff);
+  return "vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "avx2_pblendd<mode>"
+  [(set (match_operand:VI4_AVX2 0 "register_operand" "=x")
+	(vec_merge:VI4_AVX2
+	  (match_operand:VI4_AVX2 2 "nonimmediate_operand" "xm")
+	  (match_operand:VI4_AVX2 1 "register_operand" "x")
+	  (match_operand:SI 3 "const_0_to_255_operand" "n")))]
+  "TARGET_AVX2"
+  "vpblendd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "sse4_1_phminposuw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(unspec:V8HI [(match_operand:V8HI 1 "nonimmediate_operand" "xm")]
+		     UNSPEC_PHMINPOSUW))]
+  "TARGET_SSE4_1"
+  "%vphminposuw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx2_<code>v16qiv16hi2"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(any_extend:V16HI
+	  (match_operand:V16QI 1 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>bw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "sse4_1_<code>v8qiv8hi2"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(any_extend:V8HI
+	  (vec_select:V8QI
+	    (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)
+		       (const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)]))))]
+  "TARGET_SSE4_1"
+  "%vpmov<extsuffix>bw\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "<mask_codefor>avx512f_<code>v16qiv16si2<mask_name>"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(any_extend:V16SI
+	  (match_operand:V16QI 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512F"
+  "vpmov<extsuffix>bd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "avx2_<code>v8qiv8si2"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+	(any_extend:V8SI
+	  (vec_select:V8QI
+	    (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)
+		       (const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)]))))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>bd\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "sse4_1_<code>v4qiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(any_extend:V4SI
+	  (vec_select:V4QI
+	    (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "%vpmov<extsuffix>bd\t{%1, %0|%0, %k1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "32")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx512f_<code>v16hiv16si2<mask_name>"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(any_extend:V16SI
+	  (match_operand:V16HI 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512F"
+  "vpmov<extsuffix>wd\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "avx2_<code>v8hiv8si2"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+	(any_extend:V8SI
+	    (match_operand:V8HI 1 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>wd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "sse4_1_<code>v4hiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(any_extend:V4SI
+	  (vec_select:V4HI
+	    (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "%vpmov<extsuffix>wd\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx512f_<code>v8qiv8di2<mask_name>"
+  [(set (match_operand:V8DI 0 "register_operand" "=v")
+	(any_extend:V8DI
+	  (vec_select:V8QI
+	    (match_operand:V16QI 1 "nonimmediate_operand" "vm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)
+		       (const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)]))))]
+  "TARGET_AVX512F"
+  "vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %k1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "avx2_<code>v4qiv4di2"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(any_extend:V4DI
+	  (vec_select:V4QI
+	    (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>bq\t{%1, %0|%0, %k1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "sse4_1_<code>v2qiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(any_extend:V2DI
+	  (vec_select:V2QI
+	    (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "%vpmov<extsuffix>bq\t{%1, %0|%0, %w1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "16")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx512f_<code>v8hiv8di2<mask_name>"
+  [(set (match_operand:V8DI 0 "register_operand" "=v")
+	(any_extend:V8DI
+	  (match_operand:V8HI 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512F"
+  "vpmov<extsuffix>wq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %q1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "avx2_<code>v4hiv4di2"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(any_extend:V4DI
+	  (vec_select:V4HI
+	    (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>wq\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "sse4_1_<code>v2hiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(any_extend:V2DI
+	  (vec_select:V2HI
+	    (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "%vpmov<extsuffix>wq\t{%1, %0|%0, %k1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "32")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "avx512f_<code>v8siv8di2<mask_name>"
+  [(set (match_operand:V8DI 0 "register_operand" "=v")
+	(any_extend:V8DI
+	  (match_operand:V8SI 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512F"
+  "vpmov<extsuffix>dq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "avx2_<code>v4siv4di2"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(any_extend:V4DI
+	    (match_operand:V4SI 1 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "OI")])
+
+(define_insn "sse4_1_<code>v2siv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(any_extend:V2DI
+	  (vec_select:V2SI
+	    (match_operand:V4SI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "%vpmov<extsuffix>dq\t{%1, %0|%0, %q1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "ssememalign" "64")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+;; ptestps/ptestpd are very similar to comiss and ucomiss when
+;; setting FLAGS_REG. But it is not a really compare instruction.
+(define_insn "avx_vtest<ssemodesuffix><avxsizesuffix>"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_operand:VF_128_256 0 "register_operand" "x")
+		    (match_operand:VF_128_256 1 "nonimmediate_operand" "xm")]
+		   UNSPEC_VTESTP))]
+  "TARGET_AVX"
+  "vtest<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecomi")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<MODE>")])
+
+;; ptest is very similar to comiss and ucomiss when setting FLAGS_REG.
+;; But it is not a really compare instruction.
+(define_insn "avx_ptest256"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_operand:V4DI 0 "register_operand" "x")
+		    (match_operand:V4DI 1 "nonimmediate_operand" "xm")]
+		   UNSPEC_PTEST))]
+  "TARGET_AVX"
+  "vptest\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecomi")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "OI")])
+
+(define_insn "sse4_1_ptest"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_operand:V2DI 0 "register_operand" "x")
+		    (match_operand:V2DI 1 "nonimmediate_operand" "xm")]
+		   UNSPEC_PTEST))]
+  "TARGET_SSE4_1"
+  "%vptest\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecomi")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "<sse4_1>_round<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x")
+	(unspec:VF_128_256
+	  [(match_operand:VF_128_256 1 "nonimmediate_operand" "xm")
+	   (match_operand:SI 2 "const_0_to_15_operand" "n")]
+	  UNSPEC_ROUND))]
+  "TARGET_ROUND"
+  "%vround<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "<sse4_1>_round<ssemodesuffix>_sfix<avxsizesuffix>"
+  [(match_operand:<sseintvecmode> 0 "register_operand")
+   (match_operand:VF1_128_256 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_0_to_15_operand")]
+  "TARGET_ROUND"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+
+  emit_insn
+    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp, operands[1],
+						       operands[2]));
+  emit_insn
+    (gen_fix_trunc<mode><sseintvecmodelower>2 (operands[0], tmp));
+  DONE;
+})
+
+(define_expand "avx512f_roundpd512"
+  [(match_operand:V8DF 0 "register_operand")
+   (match_operand:V8DF 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_0_to_15_operand")]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_avx512f_rndscalev8df (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_expand "<sse4_1>_round<ssemodesuffix>_vec_pack_sfix<avxsizesuffix>"
+  [(match_operand:<ssepackfltmode> 0 "register_operand")
+   (match_operand:VF2 1 "nonimmediate_operand")
+   (match_operand:VF2 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_15_operand")]
+  "TARGET_ROUND"
+{
+  rtx tmp0, tmp1;
+
+  if (<MODE>mode == V2DFmode
+      && TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      rtx tmp2 = gen_reg_rtx (V4DFmode);
+
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
+
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_avx_roundpd256 (tmp2, tmp0, operands[3]));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (<MODE>mode);
+      tmp1 = gen_reg_rtx (<MODE>mode);
+
+      emit_insn
+       (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
+							  operands[3]));
+      emit_insn
+       (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
+							  operands[3]));
+      emit_insn
+       (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+    }
+  DONE;
+})
+
+(define_insn "sse4_1_round<ssescalarmodesuffix>"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,x")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 2 "register_operand" "x,x")
+	     (match_operand:SI 3 "const_0_to_15_operand" "n,n")]
+	    UNSPEC_ROUND)
+	  (match_operand:VF_128 1 "register_operand" "0,x")
+	  (const_int 1)))]
+  "TARGET_ROUND"
+  "@
+   round<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vround<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssecvt")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "round<mode>2"
+  [(set (match_dup 4)
+	(plus:VF
+	  (match_operand:VF 1 "register_operand")
+	  (match_dup 3)))
+   (set (match_operand:VF 0 "register_operand")
+	(unspec:VF
+	  [(match_dup 4) (match_dup 5)]
+	  UNSPEC_ROUND))]
+  "TARGET_ROUND && !flag_trapping_math"
+{
+  enum machine_mode scalar_mode;
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+  rtx half, vec_half;
+
+  scalar_mode = GET_MODE_INNER (<MODE>mode);
+
+  /* load nextafter (0.5, 0.0) */
+  fmt = REAL_MODE_FORMAT (scalar_mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, scalar_mode);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+  half = const_double_from_real_value (pred_half, scalar_mode);
+
+  vec_half = ix86_build_const_vector (<MODE>mode, true, half);
+  vec_half = force_reg (<MODE>mode, vec_half);
+
+  operands[3] = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_copysign<mode>3 (operands[3], vec_half, operands[1]));
+
+  operands[4] = gen_reg_rtx (<MODE>mode);
+  operands[5] = GEN_INT (ROUND_TRUNC);
+})
+
+(define_expand "round<mode>2_sfix"
+  [(match_operand:<sseintvecmode> 0 "register_operand")
+   (match_operand:VF1_128_256 1 "register_operand")]
+  "TARGET_ROUND && !flag_trapping_math"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+
+  emit_insn (gen_round<mode>2 (tmp, operands[1]));
+
+  emit_insn
+    (gen_fix_trunc<mode><sseintvecmodelower>2 (operands[0], tmp));
+  DONE;
+})
+
+(define_expand "round<mode>2_vec_pack_sfix"
+  [(match_operand:<ssepackfltmode> 0 "register_operand")
+   (match_operand:VF2 1 "register_operand")
+   (match_operand:VF2 2 "register_operand")]
+  "TARGET_ROUND && !flag_trapping_math"
+{
+  rtx tmp0, tmp1;
+
+  if (<MODE>mode == V2DFmode
+      && TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      rtx tmp2 = gen_reg_rtx (V4DFmode);
+
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
+
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_roundv4df2 (tmp2, tmp0));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (<MODE>mode);
+      tmp1 = gen_reg_rtx (<MODE>mode);
+
+      emit_insn (gen_round<mode>2 (tmp0, operands[1]));
+      emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+
+      emit_insn
+       (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+    }
+  DONE;
+})
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Intel SSE4.2 string/text processing instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn_and_split "sse4_2_pcmpestr"
+  [(set (match_operand:SI 0 "register_operand" "=c,c")
+	(unspec:SI
+	  [(match_operand:V16QI 2 "register_operand" "x,x")
+	   (match_operand:SI 3 "register_operand" "a,a")
+	   (match_operand:V16QI 4 "nonimmediate_operand" "x,m")
+	   (match_operand:SI 5 "register_operand" "d,d")
+	   (match_operand:SI 6 "const_0_to_255_operand" "n,n")]
+	  UNSPEC_PCMPESTR))
+   (set (match_operand:V16QI 1 "register_operand" "=Yz,Yz")
+	(unspec:V16QI
+	  [(match_dup 2)
+	   (match_dup 3)
+	   (match_dup 4)
+	   (match_dup 5)
+	   (match_dup 6)]
+	  UNSPEC_PCMPESTR))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_dup 2)
+	   (match_dup 3)
+	   (match_dup 4)
+	   (match_dup 5)
+	   (match_dup 6)]
+	  UNSPEC_PCMPESTR))]
+  "TARGET_SSE4_2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  int ecx = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[0]));
+  int xmm0 = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[1]));
+  int flags = !find_regno_note (curr_insn, REG_UNUSED, FLAGS_REG);
+
+  if (ecx)
+    emit_insn (gen_sse4_2_pcmpestri (operands[0], operands[2],
+				     operands[3], operands[4],
+				     operands[5], operands[6]));
+  if (xmm0)
+    emit_insn (gen_sse4_2_pcmpestrm (operands[1], operands[2],
+				     operands[3], operands[4],
+				     operands[5], operands[6]));
+  if (flags && !(ecx || xmm0))
+    emit_insn (gen_sse4_2_pcmpestr_cconly (NULL, NULL,
+					   operands[2], operands[3],
+					   operands[4], operands[5],
+					   operands[6]));
+  if (!(flags || ecx || xmm0))
+    emit_note (NOTE_INSN_DELETED);
+
+  DONE;
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,load")
+   (set_attr "mode" "TI")])
+
+(define_insn_and_split "*sse4_2_pcmpestr_unaligned"
+  [(set (match_operand:SI 0 "register_operand" "=c")
+	(unspec:SI
+	  [(match_operand:V16QI 2 "register_operand" "x")
+	   (match_operand:SI 3 "register_operand" "a")
+	   (unspec:V16QI
+	     [(match_operand:V16QI 4 "memory_operand" "m")]
+	     UNSPEC_LOADU)
+	   (match_operand:SI 5 "register_operand" "d")
+	   (match_operand:SI 6 "const_0_to_255_operand" "n")]
+	  UNSPEC_PCMPESTR))
+   (set (match_operand:V16QI 1 "register_operand" "=Yz")
+	(unspec:V16QI
+	  [(match_dup 2)
+	   (match_dup 3)
+	   (unspec:V16QI [(match_dup 4)] UNSPEC_LOADU)
+	   (match_dup 5)
+	   (match_dup 6)]
+	  UNSPEC_PCMPESTR))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_dup 2)
+	   (match_dup 3)
+	   (unspec:V16QI [(match_dup 4)] UNSPEC_LOADU)
+	   (match_dup 5)
+	   (match_dup 6)]
+	  UNSPEC_PCMPESTR))]
+  "TARGET_SSE4_2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  int ecx = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[0]));
+  int xmm0 = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[1]));
+  int flags = !find_regno_note (curr_insn, REG_UNUSED, FLAGS_REG);
+
+  if (ecx)
+    emit_insn (gen_sse4_2_pcmpestri (operands[0], operands[2],
+				     operands[3], operands[4],
+				     operands[5], operands[6]));
+  if (xmm0)
+    emit_insn (gen_sse4_2_pcmpestrm (operands[1], operands[2],
+				     operands[3], operands[4],
+				     operands[5], operands[6]));
+  if (flags && !(ecx || xmm0))
+    emit_insn (gen_sse4_2_pcmpestr_cconly (NULL, NULL,
+					   operands[2], operands[3],
+					   operands[4], operands[5],
+					   operands[6]));
+  if (!(flags || ecx || xmm0))
+    emit_note (NOTE_INSN_DELETED);
+
+  DONE;
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "load")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_2_pcmpestri"
+  [(set (match_operand:SI 0 "register_operand" "=c,c")
+	(unspec:SI
+	  [(match_operand:V16QI 1 "register_operand" "x,x")
+	   (match_operand:SI 2 "register_operand" "a,a")
+	   (match_operand:V16QI 3 "nonimmediate_operand" "x,m")
+	   (match_operand:SI 4 "register_operand" "d,d")
+	   (match_operand:SI 5 "const_0_to_255_operand" "n,n")]
+	  UNSPEC_PCMPESTR))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)
+	   (match_dup 4)
+	   (match_dup 5)]
+	  UNSPEC_PCMPESTR))]
+  "TARGET_SSE4_2"
+  "%vpcmpestri\t{%5, %3, %1|%1, %3, %5}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "ssememalign" "8")
+   (set_attr "length_immediate" "1")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "memory" "none,load")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_2_pcmpestrm"
+  [(set (match_operand:V16QI 0 "register_operand" "=Yz,Yz")
+	(unspec:V16QI
+	  [(match_operand:V16QI 1 "register_operand" "x,x")
+	   (match_operand:SI 2 "register_operand" "a,a")
+	   (match_operand:V16QI 3 "nonimmediate_operand" "x,m")
+	   (match_operand:SI 4 "register_operand" "d,d")
+	   (match_operand:SI 5 "const_0_to_255_operand" "n,n")]
+	  UNSPEC_PCMPESTR))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)
+	   (match_dup 4)
+	   (match_dup 5)]
+	  UNSPEC_PCMPESTR))]
+  "TARGET_SSE4_2"
+  "%vpcmpestrm\t{%5, %3, %1|%1, %3, %5}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "memory" "none,load")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_2_pcmpestr_cconly"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_operand:V16QI 2 "register_operand" "x,x,x,x")
+	   (match_operand:SI 3 "register_operand" "a,a,a,a")
+	   (match_operand:V16QI 4 "nonimmediate_operand" "x,m,x,m")
+	   (match_operand:SI 5 "register_operand" "d,d,d,d")
+	   (match_operand:SI 6 "const_0_to_255_operand" "n,n,n,n")]
+	  UNSPEC_PCMPESTR))
+   (clobber (match_scratch:V16QI 0 "=Yz,Yz,X,X"))
+   (clobber (match_scratch:SI    1 "= X, X,c,c"))]
+  "TARGET_SSE4_2"
+  "@
+   %vpcmpestrm\t{%6, %4, %2|%2, %4, %6}
+   %vpcmpestrm\t{%6, %4, %2|%2, %4, %6}
+   %vpcmpestri\t{%6, %4, %2|%2, %4, %6}
+   %vpcmpestri\t{%6, %4, %2|%2, %4, %6}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,load,none,load")
+   (set_attr "btver2_decode" "vector,vector,vector,vector") 
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn_and_split "sse4_2_pcmpistr"
+  [(set (match_operand:SI 0 "register_operand" "=c,c")
+	(unspec:SI
+	  [(match_operand:V16QI 2 "register_operand" "x,x")
+	   (match_operand:V16QI 3 "nonimmediate_operand" "x,m")
+	   (match_operand:SI 4 "const_0_to_255_operand" "n,n")]
+	  UNSPEC_PCMPISTR))
+   (set (match_operand:V16QI 1 "register_operand" "=Yz,Yz")
+	(unspec:V16QI
+	  [(match_dup 2)
+	   (match_dup 3)
+	   (match_dup 4)]
+	  UNSPEC_PCMPISTR))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_dup 2)
+	   (match_dup 3)
+	   (match_dup 4)]
+	  UNSPEC_PCMPISTR))]
+  "TARGET_SSE4_2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  int ecx = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[0]));
+  int xmm0 = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[1]));
+  int flags = !find_regno_note (curr_insn, REG_UNUSED, FLAGS_REG);
+
+  if (ecx)
+    emit_insn (gen_sse4_2_pcmpistri (operands[0], operands[2],
+				     operands[3], operands[4]));
+  if (xmm0)
+    emit_insn (gen_sse4_2_pcmpistrm (operands[1], operands[2],
+				     operands[3], operands[4]));
+  if (flags && !(ecx || xmm0))
+    emit_insn (gen_sse4_2_pcmpistr_cconly (NULL, NULL,
+					   operands[2], operands[3],
+					   operands[4]));
+  if (!(flags || ecx || xmm0))
+    emit_note (NOTE_INSN_DELETED);
+
+  DONE;
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,load")
+   (set_attr "mode" "TI")])
+
+(define_insn_and_split "*sse4_2_pcmpistr_unaligned"
+  [(set (match_operand:SI 0 "register_operand" "=c")
+	(unspec:SI
+	  [(match_operand:V16QI 2 "register_operand" "x")
+	   (unspec:V16QI
+	     [(match_operand:V16QI 3 "memory_operand" "m")]
+	     UNSPEC_LOADU)
+	   (match_operand:SI 4 "const_0_to_255_operand" "n")]
+	  UNSPEC_PCMPISTR))
+   (set (match_operand:V16QI 1 "register_operand" "=Yz")
+	(unspec:V16QI
+	  [(match_dup 2)
+	   (unspec:V16QI [(match_dup 3)] UNSPEC_LOADU)
+	   (match_dup 4)]
+	  UNSPEC_PCMPISTR))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_dup 2)
+	   (unspec:V16QI [(match_dup 3)] UNSPEC_LOADU)
+	   (match_dup 4)]
+	  UNSPEC_PCMPISTR))]
+  "TARGET_SSE4_2
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  int ecx = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[0]));
+  int xmm0 = !find_regno_note (curr_insn, REG_UNUSED, REGNO (operands[1]));
+  int flags = !find_regno_note (curr_insn, REG_UNUSED, FLAGS_REG);
+
+  if (ecx)
+    emit_insn (gen_sse4_2_pcmpistri (operands[0], operands[2],
+				     operands[3], operands[4]));
+  if (xmm0)
+    emit_insn (gen_sse4_2_pcmpistrm (operands[1], operands[2],
+				     operands[3], operands[4]));
+  if (flags && !(ecx || xmm0))
+    emit_insn (gen_sse4_2_pcmpistr_cconly (NULL, NULL,
+					   operands[2], operands[3],
+					   operands[4]));
+  if (!(flags || ecx || xmm0))
+    emit_note (NOTE_INSN_DELETED);
+
+  DONE;
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "load")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_2_pcmpistri"
+  [(set (match_operand:SI 0 "register_operand" "=c,c")
+	(unspec:SI
+	  [(match_operand:V16QI 1 "register_operand" "x,x")
+	   (match_operand:V16QI 2 "nonimmediate_operand" "x,m")
+	   (match_operand:SI 3 "const_0_to_255_operand" "n,n")]
+	  UNSPEC_PCMPISTR))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_PCMPISTR))]
+  "TARGET_SSE4_2"
+  "%vpcmpistri\t{%3, %2, %1|%1, %2, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "memory" "none,load")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_2_pcmpistrm"
+  [(set (match_operand:V16QI 0 "register_operand" "=Yz,Yz")
+	(unspec:V16QI
+	  [(match_operand:V16QI 1 "register_operand" "x,x")
+	   (match_operand:V16QI 2 "nonimmediate_operand" "x,m")
+	   (match_operand:SI 3 "const_0_to_255_operand" "n,n")]
+	  UNSPEC_PCMPISTR))
+   (set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_dup 1)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_PCMPISTR))]
+  "TARGET_SSE4_2"
+  "%vpcmpistrm\t{%3, %2, %1|%1, %2, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "memory" "none,load")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_2_pcmpistr_cconly"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC
+	  [(match_operand:V16QI 2 "register_operand" "x,x,x,x")
+	   (match_operand:V16QI 3 "nonimmediate_operand" "x,m,x,m")
+	   (match_operand:SI 4 "const_0_to_255_operand" "n,n,n,n")]
+	  UNSPEC_PCMPISTR))
+   (clobber (match_scratch:V16QI 0 "=Yz,Yz,X,X"))
+   (clobber (match_scratch:SI    1 "= X, X,c,c"))]
+  "TARGET_SSE4_2"
+  "@
+   %vpcmpistrm\t{%4, %3, %2|%2, %3, %4}
+   %vpcmpistrm\t{%4, %3, %2|%2, %3, %4}
+   %vpcmpistri\t{%4, %3, %2|%2, %3, %4}
+   %vpcmpistri\t{%4, %3, %2|%2, %3, %4}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "ssememalign" "8")
+   (set_attr "length_immediate" "1")
+   (set_attr "memory" "none,load,none,load")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "btver2_decode" "vector,vector,vector,vector")
+   (set_attr "mode" "TI")])
+
+;; Packed float variants
+(define_mode_attr GATHER_SCATTER_SF_MEM_MODE
+		      [(V8DI "V8SF") (V16SI "V16SF")])
+
+(define_expand "avx512pf_gatherpf<mode>sf"
+  [(unspec
+     [(match_operand:<avx512fmaskmode> 0 "register_or_constm1_operand")
+      (mem:<GATHER_SCATTER_SF_MEM_MODE>
+	(match_par_dup 5
+	  [(match_operand 2 "vsib_address_operand")
+	   (match_operand:VI48_512 1 "register_operand")
+	   (match_operand:SI 3 "const1248_operand")]))
+      (match_operand:SI 4 "const_2_to_3_operand")]
+     UNSPEC_GATHER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  operands[5]
+    = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[1],
+					operands[3]), UNSPEC_VSIBADDR);
+})
+
+(define_insn "*avx512pf_gatherpf<mode>sf_mask"
+  [(unspec
+     [(match_operand:<avx512fmaskmode> 0 "register_operand" "Yk")
+      (match_operator:<GATHER_SCATTER_SF_MEM_MODE> 5 "vsib_mem_operator"
+	[(unspec:P
+	   [(match_operand:P 2 "vsib_address_operand" "Tv")
+	    (match_operand:VI48_512 1 "register_operand" "v")
+	    (match_operand:SI 3 "const1248_operand" "n")]
+	   UNSPEC_VSIBADDR)])
+      (match_operand:SI 4 "const_2_to_3_operand" "n")]
+     UNSPEC_GATHER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  switch (INTVAL (operands[4]))
+    {
+    case 3:
+      return "vgatherpf0<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+    case 2:
+      return "vgatherpf1<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "*avx512pf_gatherpf<mode>sf"
+  [(unspec
+     [(const_int -1)
+      (match_operator:<GATHER_SCATTER_SF_MEM_MODE> 4 "vsib_mem_operator"
+	[(unspec:P
+	   [(match_operand:P 1 "vsib_address_operand" "Tv")
+	    (match_operand:VI48_512 0 "register_operand" "v")
+	    (match_operand:SI 2 "const1248_operand" "n")]
+	   UNSPEC_VSIBADDR)])
+      (match_operand:SI 3 "const_2_to_3_operand" "n")]
+     UNSPEC_GATHER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  switch (INTVAL (operands[3]))
+    {
+    case 3:
+      return "vgatherpf0<ssemodesuffix>ps\t{%4|%4}";
+    case 2:
+      return "vgatherpf1<ssemodesuffix>ps\t{%4|%4}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+;; Packed double variants
+(define_expand "avx512pf_gatherpf<mode>df"
+  [(unspec
+     [(match_operand:<avx512fmaskmode> 0 "register_or_constm1_operand")
+      (mem:V8DF
+	(match_par_dup 5
+	  [(match_operand 2 "vsib_address_operand")
+	   (match_operand:VI4_256_8_512 1 "register_operand")
+	   (match_operand:SI 3 "const1248_operand")]))
+      (match_operand:SI 4 "const_2_to_3_operand")]
+     UNSPEC_GATHER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  operands[5]
+    = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[1],
+					operands[3]), UNSPEC_VSIBADDR);
+})
+
+(define_insn "*avx512pf_gatherpf<mode>df_mask"
+  [(unspec
+     [(match_operand:<avx512fmaskmode> 0 "register_operand" "Yk")
+      (match_operator:V8DF 5 "vsib_mem_operator"
+	[(unspec:P
+	   [(match_operand:P 2 "vsib_address_operand" "Tv")
+	    (match_operand:VI4_256_8_512 1 "register_operand" "v")
+	    (match_operand:SI 3 "const1248_operand" "n")]
+	   UNSPEC_VSIBADDR)])
+      (match_operand:SI 4 "const_2_to_3_operand" "n")]
+     UNSPEC_GATHER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  switch (INTVAL (operands[4]))
+    {
+    case 3:
+      return "vgatherpf0<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+    case 2:
+      return "vgatherpf1<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "*avx512pf_gatherpf<mode>df"
+  [(unspec
+     [(const_int -1)
+      (match_operator:V8DF 4 "vsib_mem_operator"
+	[(unspec:P
+	   [(match_operand:P 1 "vsib_address_operand" "Tv")
+	    (match_operand:VI4_256_8_512 0 "register_operand" "v")
+	    (match_operand:SI 2 "const1248_operand" "n")]
+	   UNSPEC_VSIBADDR)])
+      (match_operand:SI 3 "const_2_to_3_operand" "n")]
+     UNSPEC_GATHER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  switch (INTVAL (operands[3]))
+    {
+    case 3:
+      return "vgatherpf0<ssemodesuffix>pd\t{%4|%4}";
+    case 2:
+      return "vgatherpf1<ssemodesuffix>pd\t{%4|%4}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+;; Packed float variants
+(define_expand "avx512pf_scatterpf<mode>sf"
+  [(unspec
+     [(match_operand:<avx512fmaskmode> 0 "register_or_constm1_operand")
+      (mem:<GATHER_SCATTER_SF_MEM_MODE>
+	(match_par_dup 5
+	  [(match_operand 2 "vsib_address_operand")
+	   (match_operand:VI48_512 1 "register_operand")
+	   (match_operand:SI 3 "const1248_operand")]))
+      (match_operand:SI 4 "const2367_operand")]
+     UNSPEC_SCATTER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  operands[5]
+    = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[1],
+					operands[3]), UNSPEC_VSIBADDR);
+})
+
+(define_insn "*avx512pf_scatterpf<mode>sf_mask"
+  [(unspec
+     [(match_operand:<avx512fmaskmode> 0 "register_operand" "Yk")
+      (match_operator:<GATHER_SCATTER_SF_MEM_MODE> 5 "vsib_mem_operator"
+	[(unspec:P
+	   [(match_operand:P 2 "vsib_address_operand" "Tv")
+	    (match_operand:VI48_512 1 "register_operand" "v")
+	    (match_operand:SI 3 "const1248_operand" "n")]
+	   UNSPEC_VSIBADDR)])
+      (match_operand:SI 4 "const2367_operand" "n")]
+     UNSPEC_SCATTER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  switch (INTVAL (operands[4]))
+    {
+    case 3:
+    case 7:
+      return "vscatterpf0<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+    case 2:
+    case 6:
+      return "vscatterpf1<ssemodesuffix>ps\t{%5%{%0%}|%5%{%0%}}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "*avx512pf_scatterpf<mode>sf"
+  [(unspec
+     [(const_int -1)
+      (match_operator:<GATHER_SCATTER_SF_MEM_MODE> 4 "vsib_mem_operator"
+	[(unspec:P
+	   [(match_operand:P 1 "vsib_address_operand" "Tv")
+	    (match_operand:VI48_512 0 "register_operand" "v")
+	    (match_operand:SI 2 "const1248_operand" "n")]
+	   UNSPEC_VSIBADDR)])
+      (match_operand:SI 3 "const2367_operand" "n")]
+     UNSPEC_SCATTER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  switch (INTVAL (operands[3]))
+    {
+    case 3:
+    case 7:
+      return "vscatterpf0<ssemodesuffix>ps\t{%4|%4}";
+    case 2:
+    case 6:
+      return "vscatterpf1<ssemodesuffix>ps\t{%4|%4}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+;; Packed double variants
+(define_expand "avx512pf_scatterpf<mode>df"
+  [(unspec
+     [(match_operand:<avx512fmaskmode> 0 "register_or_constm1_operand")
+      (mem:V8DF
+	(match_par_dup 5
+	  [(match_operand 2 "vsib_address_operand")
+	   (match_operand:VI4_256_8_512 1 "register_operand")
+	   (match_operand:SI 3 "const1248_operand")]))
+      (match_operand:SI 4 "const2367_operand")]
+     UNSPEC_SCATTER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  operands[5]
+    = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[1],
+					operands[3]), UNSPEC_VSIBADDR);
+})
+
+(define_insn "*avx512pf_scatterpf<mode>df_mask"
+  [(unspec
+     [(match_operand:<avx512fmaskmode> 0 "register_operand" "Yk")
+      (match_operator:V8DF 5 "vsib_mem_operator"
+	[(unspec:P
+	   [(match_operand:P 2 "vsib_address_operand" "Tv")
+	    (match_operand:VI4_256_8_512 1 "register_operand" "v")
+	    (match_operand:SI 3 "const1248_operand" "n")]
+	   UNSPEC_VSIBADDR)])
+      (match_operand:SI 4 "const2367_operand" "n")]
+     UNSPEC_SCATTER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  switch (INTVAL (operands[4]))
+    {
+    case 3:
+    case 7:
+      return "vscatterpf0<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+    case 2:
+    case 6:
+      return "vscatterpf1<ssemodesuffix>pd\t{%5%{%0%}|%5%{%0%}}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "*avx512pf_scatterpf<mode>df"
+  [(unspec
+     [(const_int -1)
+      (match_operator:V8DF 4 "vsib_mem_operator"
+	[(unspec:P
+	   [(match_operand:P 1 "vsib_address_operand" "Tv")
+	    (match_operand:VI4_256_8_512 0 "register_operand" "v")
+	    (match_operand:SI 2 "const1248_operand" "n")]
+	   UNSPEC_VSIBADDR)])
+      (match_operand:SI 3 "const2367_operand" "n")]
+     UNSPEC_SCATTER_PREFETCH)]
+  "TARGET_AVX512PF"
+{
+  switch (INTVAL (operands[3]))
+    {
+    case 3:
+    case 7:
+      return "vscatterpf0<ssemodesuffix>pd\t{%4|%4}";
+    case 2:
+    case 6:
+      return "vscatterpf1<ssemodesuffix>pd\t{%4|%4}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "avx512er_exp2<mode><mask_name><round_saeonly_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(unspec:VF_512
+	  [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+	  UNSPEC_EXP2))]
+  "TARGET_AVX512ER"
+  "vexp2<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
+  [(set_attr "prefix" "evex")
+   (set_attr "type" "sse")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<mask_codefor>avx512er_rcp28<mode><mask_name><round_saeonly_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(unspec:VF_512
+	  [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+	  UNSPEC_RCP28))]
+  "TARGET_AVX512ER"
+  "vrcp28<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
+  [(set_attr "prefix" "evex")
+   (set_attr "type" "sse")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512er_vmrcp28<mode><round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+	    UNSPEC_RCP28)
+	  (match_operand:VF_128 2 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512ER"
+  "vrcp28<ssescalarmodesuffix>\t{<round_saeonly_op3>%1, %2, %0|%0, %2, %1<round_saeonly_op3>}"
+  [(set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set_attr "type" "sse")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<mask_codefor>avx512er_rsqrt28<mode><mask_name><round_saeonly_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(unspec:VF_512
+	  [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+	  UNSPEC_RSQRT28))]
+  "TARGET_AVX512ER"
+  "vrsqrt28<ssemodesuffix>\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
+  [(set_attr "prefix" "evex")
+   (set_attr "type" "sse")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512er_vmrsqrt28<mode><round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+	    UNSPEC_RSQRT28)
+	  (match_operand:VF_128 2 "register_operand" "v")
+	  (const_int 1)))]
+  "TARGET_AVX512ER"
+  "vrsqrt28<ssescalarmodesuffix>\t{<round_saeonly_op3>%1, %2, %0|%0, %2, %1<round_saeonly_op3>}"
+  [(set_attr "length_immediate" "1")
+   (set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; XOP instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_code_iterator xop_plus [plus ss_plus])
+
+(define_code_attr macs [(plus "macs") (ss_plus "macss")])
+(define_code_attr madcs [(plus "madcs") (ss_plus "madcss")])
+
+;; XOP parallel integer multiply/add instructions.
+
+(define_insn "xop_p<macs><ssemodesuffix><ssemodesuffix>"
+  [(set (match_operand:VI24_128 0 "register_operand" "=x")
+	(xop_plus:VI24_128
+	 (mult:VI24_128
+	  (match_operand:VI24_128 1 "nonimmediate_operand" "%x")
+	  (match_operand:VI24_128 2 "nonimmediate_operand" "xm"))
+	 (match_operand:VI24_128 3 "register_operand" "x")))]
+  "TARGET_XOP"
+  "vp<macs><ssemodesuffix><ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "xop_p<macs>dql"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(xop_plus:V2DI
+	 (mult:V2DI
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 1 "nonimmediate_operand" "%x")
+	    (parallel [(const_int 0) (const_int 2)])))
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 2 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 2)]))))
+	 (match_operand:V2DI 3 "register_operand" "x")))]
+  "TARGET_XOP"
+  "vp<macs>dql\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "xop_p<macs>dqh"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(xop_plus:V2DI
+	 (mult:V2DI
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 1 "nonimmediate_operand" "%x")
+	    (parallel [(const_int 1) (const_int 3)])))
+	  (sign_extend:V2DI
+	   (vec_select:V2SI
+	    (match_operand:V4SI 2 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 1) (const_int 3)]))))
+	 (match_operand:V2DI 3 "register_operand" "x")))]
+  "TARGET_XOP"
+  "vp<macs>dqh\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+;; XOP parallel integer multiply/add instructions for the intrinisics
+(define_insn "xop_p<macs>wd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(xop_plus:V4SI
+	 (mult:V4SI
+	  (sign_extend:V4SI
+	   (vec_select:V4HI
+	    (match_operand:V8HI 1 "nonimmediate_operand" "%x")
+	    (parallel [(const_int 1) (const_int 3)
+		       (const_int 5) (const_int 7)])))
+	  (sign_extend:V4SI
+	   (vec_select:V4HI
+	    (match_operand:V8HI 2 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 1) (const_int 3)
+		       (const_int 5) (const_int 7)]))))
+	 (match_operand:V4SI 3 "register_operand" "x")))]
+  "TARGET_XOP"
+  "vp<macs>wd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+(define_insn "xop_p<madcs>wd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(xop_plus:V4SI
+	 (plus:V4SI
+	  (mult:V4SI
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_operand:V8HI 1 "nonimmediate_operand" "%x")
+	     (parallel [(const_int 0) (const_int 2)
+			(const_int 4) (const_int 6)])))
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_operand:V8HI 2 "nonimmediate_operand" "xm")
+	     (parallel [(const_int 0) (const_int 2)
+			(const_int 4) (const_int 6)]))))
+	  (mult:V4SI
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_dup 1)
+	     (parallel [(const_int 1) (const_int 3)
+			(const_int 5) (const_int 7)])))
+	   (sign_extend:V4SI
+	    (vec_select:V4HI
+	     (match_dup 2)
+	     (parallel [(const_int 1) (const_int 3)
+			(const_int 5) (const_int 7)])))))
+	 (match_operand:V4SI 3 "register_operand" "x")))]
+  "TARGET_XOP"
+  "vp<madcs>wd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemuladd")
+   (set_attr "mode" "TI")])
+
+;; XOP parallel XMM conditional moves
+(define_insn "xop_pcmov_<mode><avxsizesuffix>"
+  [(set (match_operand:V 0 "register_operand" "=x,x")
+	(if_then_else:V
+	  (match_operand:V 3 "nonimmediate_operand" "x,m")
+	  (match_operand:V 1 "register_operand" "x,x")
+	  (match_operand:V 2 "nonimmediate_operand" "xm,x")))]
+  "TARGET_XOP"
+  "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")])
+
+;; XOP horizontal add/subtract instructions
+(define_insn "xop_phadd<u>bw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(plus:V8HI
+	 (any_extend:V8HI
+	  (vec_select:V8QI
+	   (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0) (const_int 2)
+		      (const_int 4) (const_int 6)
+		      (const_int 8) (const_int 10)
+		      (const_int 12) (const_int 14)])))
+	 (any_extend:V8HI
+	  (vec_select:V8QI
+	   (match_dup 1)
+	   (parallel [(const_int 1) (const_int 3)
+		      (const_int 5) (const_int 7)
+		      (const_int 9) (const_int 11)
+		      (const_int 13) (const_int 15)])))))]
+  "TARGET_XOP"
+  "vphadd<u>bw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "xop_phadd<u>bd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(plus:V4SI
+	 (plus:V4SI
+	  (any_extend:V4SI
+	   (vec_select:V4QI
+	    (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 4)
+		       (const_int 8) (const_int 12)])))
+	  (any_extend:V4SI
+	   (vec_select:V4QI
+	    (match_dup 1)
+	    (parallel [(const_int 1) (const_int 5)
+		       (const_int 9) (const_int 13)]))))
+	 (plus:V4SI
+	  (any_extend:V4SI
+	   (vec_select:V4QI
+	    (match_dup 1)
+	    (parallel [(const_int 2) (const_int 6)
+		       (const_int 10) (const_int 14)])))
+	  (any_extend:V4SI
+	   (vec_select:V4QI
+	    (match_dup 1)
+	    (parallel [(const_int 3) (const_int 7)
+		       (const_int 11) (const_int 15)]))))))]
+  "TARGET_XOP"
+  "vphadd<u>bd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "xop_phadd<u>bq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(plus:V2DI
+	 (plus:V2DI
+	  (plus:V2DI
+	   (any_extend:V2DI
+	    (vec_select:V2QI
+	     (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	     (parallel [(const_int 0) (const_int 8)])))
+	   (any_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 1) (const_int 9)]))))
+	  (plus:V2DI
+	   (any_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 2) (const_int 10)])))
+	   (any_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 3) (const_int 11)])))))
+	 (plus:V2DI
+	  (plus:V2DI
+	   (any_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 4) (const_int 12)])))
+	   (any_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 5) (const_int 13)]))))
+	  (plus:V2DI
+	   (any_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 6) (const_int 14)])))
+	   (any_extend:V2DI
+	    (vec_select:V2QI
+	     (match_dup 1)
+	     (parallel [(const_int 7) (const_int 15)])))))))]
+  "TARGET_XOP"
+  "vphadd<u>bq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "xop_phadd<u>wd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(plus:V4SI
+	 (any_extend:V4SI
+	  (vec_select:V4HI
+	   (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0) (const_int 2)
+		      (const_int 4) (const_int 6)])))
+	 (any_extend:V4SI
+	  (vec_select:V4HI
+	   (match_dup 1)
+	   (parallel [(const_int 1) (const_int 3)
+		      (const_int 5) (const_int 7)])))))]
+  "TARGET_XOP"
+  "vphadd<u>wd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "xop_phadd<u>wq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(plus:V2DI
+	 (plus:V2DI
+	  (any_extend:V2DI
+	   (vec_select:V2HI
+	    (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0) (const_int 4)])))
+	  (any_extend:V2DI
+	   (vec_select:V2HI
+	    (match_dup 1)
+	    (parallel [(const_int 1) (const_int 5)]))))
+	 (plus:V2DI
+	  (any_extend:V2DI
+	   (vec_select:V2HI
+	    (match_dup 1)
+	    (parallel [(const_int 2) (const_int 6)])))
+	  (any_extend:V2DI
+	   (vec_select:V2HI
+	    (match_dup 1)
+	    (parallel [(const_int 3) (const_int 7)]))))))]
+  "TARGET_XOP"
+  "vphadd<u>wq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "xop_phadd<u>dq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(plus:V2DI
+	 (any_extend:V2DI
+	  (vec_select:V2SI
+	   (match_operand:V4SI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0) (const_int 2)])))
+	 (any_extend:V2DI
+	  (vec_select:V2SI
+	   (match_dup 1)
+	   (parallel [(const_int 1) (const_int 3)])))))]
+  "TARGET_XOP"
+  "vphadd<u>dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "xop_phsubbw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(minus:V8HI
+	 (sign_extend:V8HI
+	  (vec_select:V8QI
+	   (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0) (const_int 2)
+		      (const_int 4) (const_int 6)
+		      (const_int 8) (const_int 10)
+		      (const_int 12) (const_int 14)])))
+	 (sign_extend:V8HI
+	  (vec_select:V8QI
+	   (match_dup 1)
+	   (parallel [(const_int 1) (const_int 3)
+		      (const_int 5) (const_int 7)
+		      (const_int 9) (const_int 11)
+		      (const_int 13) (const_int 15)])))))]
+  "TARGET_XOP"
+  "vphsubbw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "xop_phsubwd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(minus:V4SI
+	 (sign_extend:V4SI
+	  (vec_select:V4HI
+	   (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0) (const_int 2)
+		      (const_int 4) (const_int 6)])))
+	 (sign_extend:V4SI
+	  (vec_select:V4HI
+	   (match_dup 1)
+	   (parallel [(const_int 1) (const_int 3)
+		      (const_int 5) (const_int 7)])))))]
+  "TARGET_XOP"
+  "vphsubwd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+(define_insn "xop_phsubdq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(minus:V2DI
+	 (sign_extend:V2DI
+	  (vec_select:V2SI
+	   (match_operand:V4SI 1 "nonimmediate_operand" "xm")
+	   (parallel [(const_int 0) (const_int 2)])))
+	 (sign_extend:V2DI
+	  (vec_select:V2SI
+	   (match_dup 1)
+	   (parallel [(const_int 1) (const_int 3)])))))]
+  "TARGET_XOP"
+  "vphsubdq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseiadd1")])
+
+;; XOP permute instructions
+(define_insn "xop_pperm"
+  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
+	(unspec:V16QI
+	  [(match_operand:V16QI 1 "register_operand" "x,x")
+	   (match_operand:V16QI 2 "nonimmediate_operand" "x,m")
+	   (match_operand:V16QI 3 "nonimmediate_operand" "xm,x")]
+	  UNSPEC_XOP_PERMUTE))]
+  "TARGET_XOP && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+  "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
+;; XOP pack instructions that combine two vectors into a smaller vector
+(define_insn "xop_pperm_pack_v2di_v4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=x,x")
+	(vec_concat:V4SI
+	 (truncate:V2SI
+	  (match_operand:V2DI 1 "register_operand" "x,x"))
+	 (truncate:V2SI
+	  (match_operand:V2DI 2 "nonimmediate_operand" "x,m"))))
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "xm,x"))]
+  "TARGET_XOP && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+  "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
+(define_insn "xop_pperm_pack_v4si_v8hi"
+  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
+	(vec_concat:V8HI
+	 (truncate:V4HI
+	  (match_operand:V4SI 1 "register_operand" "x,x"))
+	 (truncate:V4HI
+	  (match_operand:V4SI 2 "nonimmediate_operand" "x,m"))))
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "xm,x"))]
+  "TARGET_XOP && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+  "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
+(define_insn "xop_pperm_pack_v8hi_v16qi"
+  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
+	(vec_concat:V16QI
+	 (truncate:V8QI
+	  (match_operand:V8HI 1 "register_operand" "x,x"))
+	 (truncate:V8QI
+	  (match_operand:V8HI 2 "nonimmediate_operand" "x,m"))))
+   (use (match_operand:V16QI 3 "nonimmediate_operand" "xm,x"))]
+  "TARGET_XOP && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+  "vpperm\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "mode" "TI")])
+
+;; XOP packed rotate instructions
+(define_expand "rotl<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand")
+	(rotate:VI_128
+	 (match_operand:VI_128 1 "nonimmediate_operand")
+	 (match_operand:SI 2 "general_operand")))]
+  "TARGET_XOP"
+{
+  /* If we were given a scalar, convert it to parallel */
+  if (! const_0_to_<sserotatemax>_operand (operands[2], SImode))
+    {
+      rtvec vs = rtvec_alloc (<ssescalarnum>);
+      rtx par = gen_rtx_PARALLEL (<MODE>mode, vs);
+      rtx reg = gen_reg_rtx (<MODE>mode);
+      rtx op2 = operands[2];
+      int i;
+
+      if (GET_MODE (op2) != <ssescalarmode>mode)
+	{
+	  op2 = gen_reg_rtx (<ssescalarmode>mode);
+	  convert_move (op2, operands[2], false);
+	}
+
+      for (i = 0; i < <ssescalarnum>; i++)
+	RTVEC_ELT (vs, i) = op2;
+
+      emit_insn (gen_vec_init<mode> (reg, par));
+      emit_insn (gen_xop_vrotl<mode>3 (operands[0], operands[1], reg));
+      DONE;
+    }
+})
+
+(define_expand "rotr<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand")
+	(rotatert:VI_128
+	 (match_operand:VI_128 1 "nonimmediate_operand")
+	 (match_operand:SI 2 "general_operand")))]
+  "TARGET_XOP"
+{
+  /* If we were given a scalar, convert it to parallel */
+  if (! const_0_to_<sserotatemax>_operand (operands[2], SImode))
+    {
+      rtvec vs = rtvec_alloc (<ssescalarnum>);
+      rtx par = gen_rtx_PARALLEL (<MODE>mode, vs);
+      rtx neg = gen_reg_rtx (<MODE>mode);
+      rtx reg = gen_reg_rtx (<MODE>mode);
+      rtx op2 = operands[2];
+      int i;
+
+      if (GET_MODE (op2) != <ssescalarmode>mode)
+	{
+	  op2 = gen_reg_rtx (<ssescalarmode>mode);
+	  convert_move (op2, operands[2], false);
+	}
+
+      for (i = 0; i < <ssescalarnum>; i++)
+	RTVEC_ELT (vs, i) = op2;
+
+      emit_insn (gen_vec_init<mode> (reg, par));
+      emit_insn (gen_neg<mode>2 (neg, reg));
+      emit_insn (gen_xop_vrotl<mode>3 (operands[0], operands[1], neg));
+      DONE;
+    }
+})
+
+(define_insn "xop_rotl<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand" "=x")
+	(rotate:VI_128
+	 (match_operand:VI_128 1 "nonimmediate_operand" "xm")
+	 (match_operand:SI 2 "const_0_to_<sserotatemax>_operand" "n")))]
+  "TARGET_XOP"
+  "vprot<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "xop_rotr<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand" "=x")
+	(rotatert:VI_128
+	 (match_operand:VI_128 1 "nonimmediate_operand" "xm")
+	 (match_operand:SI 2 "const_0_to_<sserotatemax>_operand" "n")))]
+  "TARGET_XOP"
+{
+  operands[3]
+    = GEN_INT (GET_MODE_BITSIZE (<ssescalarmode>mode) - INTVAL (operands[2]));
+  return \"vprot<ssemodesuffix>\t{%3, %1, %0|%0, %1, %3}\";
+}
+  [(set_attr "type" "sseishft")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_expand "vrotr<mode>3"
+  [(match_operand:VI_128 0 "register_operand")
+   (match_operand:VI_128 1 "register_operand")
+   (match_operand:VI_128 2 "register_operand")]
+  "TARGET_XOP"
+{
+  rtx reg = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neg<mode>2 (reg, operands[2]));
+  emit_insn (gen_xop_vrotl<mode>3 (operands[0], operands[1], reg));
+  DONE;
+})
+
+(define_expand "vrotl<mode>3"
+  [(match_operand:VI_128 0 "register_operand")
+   (match_operand:VI_128 1 "register_operand")
+   (match_operand:VI_128 2 "register_operand")]
+  "TARGET_XOP"
+{
+  emit_insn (gen_xop_vrotl<mode>3 (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "xop_vrotl<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand" "=x,x")
+	(if_then_else:VI_128
+	 (ge:VI_128
+	  (match_operand:VI_128 2 "nonimmediate_operand" "x,m")
+	  (const_int 0))
+	 (rotate:VI_128
+	  (match_operand:VI_128 1 "nonimmediate_operand" "xm,x")
+	  (match_dup 2))
+	 (rotatert:VI_128
+	  (match_dup 1)
+	  (neg:VI_128 (match_dup 2)))))]
+  "TARGET_XOP && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "vprot<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix_extra" "2")
+   (set_attr "mode" "TI")])
+
+;; XOP packed shift instructions.
+(define_expand "vlshr<mode>3"
+  [(set (match_operand:VI12_128 0 "register_operand")
+	(lshiftrt:VI12_128
+	  (match_operand:VI12_128 1 "register_operand")
+	  (match_operand:VI12_128 2 "nonimmediate_operand")))]
+  "TARGET_XOP"
+{
+  rtx neg = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neg<mode>2 (neg, operands[2]));
+  emit_insn (gen_xop_shl<mode>3 (operands[0], operands[1], neg));
+  DONE;
+})
+
+(define_expand "vlshr<mode>3"
+  [(set (match_operand:VI48_128 0 "register_operand")
+	(lshiftrt:VI48_128
+	  (match_operand:VI48_128 1 "register_operand")
+	  (match_operand:VI48_128 2 "nonimmediate_operand")))]
+  "TARGET_AVX2 || TARGET_XOP"
+{
+  if (!TARGET_AVX2)
+    {
+      rtx neg = gen_reg_rtx (<MODE>mode);
+      emit_insn (gen_neg<mode>2 (neg, operands[2]));
+      emit_insn (gen_xop_shl<mode>3 (operands[0], operands[1], neg));
+      DONE;
+    }
+})
+
+(define_expand "vlshr<mode>3"
+  [(set (match_operand:VI48_512 0 "register_operand")
+	(lshiftrt:VI48_512
+	  (match_operand:VI48_512 1 "register_operand")
+	  (match_operand:VI48_512 2 "nonimmediate_operand")))]
+  "TARGET_AVX512F")
+
+(define_expand "vlshr<mode>3"
+  [(set (match_operand:VI48_256 0 "register_operand")
+	(lshiftrt:VI48_256
+	  (match_operand:VI48_256 1 "register_operand")
+	  (match_operand:VI48_256 2 "nonimmediate_operand")))]
+  "TARGET_AVX2")
+
+(define_expand "vashr<mode>3"
+  [(set (match_operand:VI128_128 0 "register_operand")
+	(ashiftrt:VI128_128
+	  (match_operand:VI128_128 1 "register_operand")
+	  (match_operand:VI128_128 2 "nonimmediate_operand")))]
+  "TARGET_XOP"
+{
+  rtx neg = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_neg<mode>2 (neg, operands[2]));
+  emit_insn (gen_xop_sha<mode>3 (operands[0], operands[1], neg));
+  DONE;
+})
+
+(define_expand "vashrv4si3"
+  [(set (match_operand:V4SI 0 "register_operand")
+	(ashiftrt:V4SI (match_operand:V4SI 1 "register_operand")
+		       (match_operand:V4SI 2 "nonimmediate_operand")))]
+  "TARGET_AVX2 || TARGET_XOP"
+{
+  if (!TARGET_AVX2)
+    {
+      rtx neg = gen_reg_rtx (V4SImode);
+      emit_insn (gen_negv4si2 (neg, operands[2]));
+      emit_insn (gen_xop_shav4si3 (operands[0], operands[1], neg));
+      DONE;
+    }
+})
+
+(define_expand "vashrv16si3"
+  [(set (match_operand:V16SI 0 "register_operand")
+	(ashiftrt:V16SI (match_operand:V16SI 1 "register_operand")
+		        (match_operand:V16SI 2 "nonimmediate_operand")))]
+  "TARGET_AVX512F")
+
+(define_expand "vashrv8si3"
+  [(set (match_operand:V8SI 0 "register_operand")
+	(ashiftrt:V8SI (match_operand:V8SI 1 "register_operand")
+		       (match_operand:V8SI 2 "nonimmediate_operand")))]
+  "TARGET_AVX2")
+
+(define_expand "vashl<mode>3"
+  [(set (match_operand:VI12_128 0 "register_operand")
+	(ashift:VI12_128
+	  (match_operand:VI12_128 1 "register_operand")
+	  (match_operand:VI12_128 2 "nonimmediate_operand")))]
+  "TARGET_XOP"
+{
+  emit_insn (gen_xop_sha<mode>3 (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_expand "vashl<mode>3"
+  [(set (match_operand:VI48_128 0 "register_operand")
+	(ashift:VI48_128
+	  (match_operand:VI48_128 1 "register_operand")
+	  (match_operand:VI48_128 2 "nonimmediate_operand")))]
+  "TARGET_AVX2 || TARGET_XOP"
+{
+  if (!TARGET_AVX2)
+    {
+      operands[2] = force_reg (<MODE>mode, operands[2]);
+      emit_insn (gen_xop_sha<mode>3 (operands[0], operands[1], operands[2]));
+      DONE;
+    }
+})
+
+(define_expand "vashl<mode>3"
+  [(set (match_operand:VI48_512 0 "register_operand")
+	(ashift:VI48_512
+	  (match_operand:VI48_512 1 "register_operand")
+	  (match_operand:VI48_512 2 "nonimmediate_operand")))]
+  "TARGET_AVX512F")
+
+(define_expand "vashl<mode>3"
+  [(set (match_operand:VI48_256 0 "register_operand")
+	(ashift:VI48_256
+	  (match_operand:VI48_256 1 "register_operand")
+	  (match_operand:VI48_256 2 "nonimmediate_operand")))]
+  "TARGET_AVX2")
+
+(define_insn "xop_sha<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand" "=x,x")
+	(if_then_else:VI_128
+	 (ge:VI_128
+	  (match_operand:VI_128 2 "nonimmediate_operand" "x,m")
+	  (const_int 0))
+	 (ashift:VI_128
+	  (match_operand:VI_128 1 "nonimmediate_operand" "xm,x")
+	  (match_dup 2))
+	 (ashiftrt:VI_128
+	  (match_dup 1)
+	  (neg:VI_128 (match_dup 2)))))]
+  "TARGET_XOP && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "vpsha<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix_extra" "2")
+   (set_attr "mode" "TI")])
+
+(define_insn "xop_shl<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand" "=x,x")
+	(if_then_else:VI_128
+	 (ge:VI_128
+	  (match_operand:VI_128 2 "nonimmediate_operand" "x,m")
+	  (const_int 0))
+	 (ashift:VI_128
+	  (match_operand:VI_128 1 "nonimmediate_operand" "xm,x")
+	  (match_dup 2))
+	 (lshiftrt:VI_128
+	  (match_dup 1)
+	  (neg:VI_128 (match_dup 2)))))]
+  "TARGET_XOP && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "vpshl<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix_extra" "2")
+   (set_attr "mode" "TI")])
+
+(define_expand "<shift_insn><mode>3"
+  [(set (match_operand:VI1_AVX2 0 "register_operand")
+	(any_shift:VI1_AVX2
+	  (match_operand:VI1_AVX2 1 "register_operand")
+	  (match_operand:SI 2 "nonmemory_operand")))]
+  "TARGET_SSE2"
+{
+  if (TARGET_XOP && <MODE>mode == V16QImode)
+    {
+      bool negate = false;
+      rtx (*gen) (rtx, rtx, rtx);
+      rtx tmp, par;
+      int i;
+
+      if (<CODE> != ASHIFT)
+	{
+	  if (CONST_INT_P (operands[2]))
+	    operands[2] = GEN_INT (-INTVAL (operands[2]));
+	  else
+	    negate = true;
+	}
+      par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
+      for (i = 0; i < 16; i++)
+        XVECEXP (par, 0, i) = operands[2];
+
+      tmp = gen_reg_rtx (V16QImode);
+      emit_insn (gen_vec_initv16qi (tmp, par));
+
+      if (negate)
+	emit_insn (gen_negv16qi2 (tmp, tmp));
+
+      gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
+      emit_insn (gen (operands[0], operands[1], tmp));
+    }
+  else
+    ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
+  DONE;
+})
+
+(define_expand "ashrv2di3"
+  [(set (match_operand:V2DI 0 "register_operand")
+	(ashiftrt:V2DI
+	  (match_operand:V2DI 1 "register_operand")
+	  (match_operand:DI 2 "nonmemory_operand")))]
+  "TARGET_XOP"
+{
+  rtx reg = gen_reg_rtx (V2DImode);
+  rtx par;
+  bool negate = false;
+  int i;
+
+  if (CONST_INT_P (operands[2]))
+    operands[2] = GEN_INT (-INTVAL (operands[2]));
+  else
+    negate = true;
+
+  par = gen_rtx_PARALLEL (V2DImode, rtvec_alloc (2));
+  for (i = 0; i < 2; i++)
+    XVECEXP (par, 0, i) = operands[2];
+
+  emit_insn (gen_vec_initv2di (reg, par));
+
+  if (negate)
+    emit_insn (gen_negv2di2 (reg, reg));
+
+  emit_insn (gen_xop_shav2di3 (operands[0], operands[1], reg));
+  DONE;
+})
+
+;; XOP FRCZ support
+(define_insn "xop_frcz<mode>2"
+  [(set (match_operand:FMAMODE 0 "register_operand" "=x")
+	(unspec:FMAMODE
+	 [(match_operand:FMAMODE 1 "nonimmediate_operand" "xm")]
+	 UNSPEC_FRCZ))]
+  "TARGET_XOP"
+  "vfrcz<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt1")
+   (set_attr "mode" "<MODE>")])
+
+(define_expand "xop_vmfrcz<mode>2"
+  [(set (match_operand:VF_128 0 "register_operand")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	   [(match_operand:VF_128 1 "nonimmediate_operand")]
+	   UNSPEC_FRCZ)
+	  (match_dup 2)
+	  (const_int 1)))]
+  "TARGET_XOP"
+  "operands[2] = CONST0_RTX (<MODE>mode);")
+
+(define_insn "*xop_vmfrcz<mode>2"
+  [(set (match_operand:VF_128 0 "register_operand" "=x")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	   [(match_operand:VF_128 1 "nonimmediate_operand" "xm")]
+	   UNSPEC_FRCZ)
+	  (match_operand:VF_128 2 "const0_operand")
+	  (const_int 1)))]
+  "TARGET_XOP"
+  "vfrcz<ssescalarmodesuffix>\t{%1, %0|%0, %<iptr>1}"
+  [(set_attr "type" "ssecvt1")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "xop_maskcmp<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand" "=x")
+	(match_operator:VI_128 1 "ix86_comparison_int_operator"
+	 [(match_operand:VI_128 2 "register_operand" "x")
+	  (match_operand:VI_128 3 "nonimmediate_operand" "xm")]))]
+  "TARGET_XOP"
+  "vpcom%Y1<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix_rep" "0")
+   (set_attr "prefix_extra" "2")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "xop_maskcmp_uns<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand" "=x")
+	(match_operator:VI_128 1 "ix86_comparison_uns_operator"
+	 [(match_operand:VI_128 2 "register_operand" "x")
+	  (match_operand:VI_128 3 "nonimmediate_operand" "xm")]))]
+  "TARGET_XOP"
+  "vpcom%Y1u<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix_rep" "0")
+   (set_attr "prefix_extra" "2")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+;; Version of pcom*u* that is called from the intrinsics that allows pcomequ*
+;; and pcomneu* not to be converted to the signed ones in case somebody needs
+;; the exact instruction generated for the intrinsic.
+(define_insn "xop_maskcmp_uns2<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand" "=x")
+	(unspec:VI_128
+	 [(match_operator:VI_128 1 "ix86_comparison_uns_operator"
+	  [(match_operand:VI_128 2 "register_operand" "x")
+	   (match_operand:VI_128 3 "nonimmediate_operand" "xm")])]
+	 UNSPEC_XOP_UNSIGNED_CMP))]
+  "TARGET_XOP"
+  "vpcom%Y1u<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix_extra" "2")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+;; Pcomtrue and pcomfalse support.  These are useless instructions, but are
+;; being added here to be complete.
+(define_insn "xop_pcom_tf<mode>3"
+  [(set (match_operand:VI_128 0 "register_operand" "=x")
+	(unspec:VI_128
+	  [(match_operand:VI_128 1 "register_operand" "x")
+	   (match_operand:VI_128 2 "nonimmediate_operand" "xm")
+	   (match_operand:SI 3 "const_int_operand" "n")]
+	  UNSPEC_XOP_TRUEFALSE))]
+  "TARGET_XOP"
+{
+  return ((INTVAL (operands[3]) != 0)
+	  ? "vpcomtrue<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+	  : "vpcomfalse<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}");
+}
+  [(set_attr "type" "ssecmp")
+   (set_attr "prefix_data16" "0")
+   (set_attr "prefix_extra" "2")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "xop_vpermil2<mode>3"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=x")
+	(unspec:VF_128_256
+	  [(match_operand:VF_128_256 1 "register_operand" "x")
+	   (match_operand:VF_128_256 2 "nonimmediate_operand" "%x")
+	   (match_operand:<sseintvecmode> 3 "nonimmediate_operand" "xm")
+	   (match_operand:SI 4 "const_0_to_3_operand" "n")]
+	  UNSPEC_VPERMIL2))]
+  "TARGET_XOP"
+  "vpermil2<ssemodesuffix>\t{%4, %3, %2, %1, %0|%0, %1, %2, %3, %4}"
+  [(set_attr "type" "sse4arg")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "<MODE>")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "aesenc"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
+		       (match_operand:V2DI 2 "nonimmediate_operand" "xm,xm")]
+		      UNSPEC_AESENC))]
+  "TARGET_AES"
+  "@
+   aesenc\t{%2, %0|%0, %2}
+   vaesenc\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "btver2_decode" "double,double")
+   (set_attr "mode" "TI")])
+
+(define_insn "aesenclast"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
+		       (match_operand:V2DI 2 "nonimmediate_operand" "xm,xm")]
+		      UNSPEC_AESENCLAST))]
+  "TARGET_AES"
+  "@
+   aesenclast\t{%2, %0|%0, %2}
+   vaesenclast\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "btver2_decode" "double,double") 
+   (set_attr "mode" "TI")])
+
+(define_insn "aesdec"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
+		       (match_operand:V2DI 2 "nonimmediate_operand" "xm,xm")]
+		      UNSPEC_AESDEC))]
+  "TARGET_AES"
+  "@
+   aesdec\t{%2, %0|%0, %2}
+   vaesdec\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "btver2_decode" "double,double") 
+   (set_attr "mode" "TI")])
+
+(define_insn "aesdeclast"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
+		       (match_operand:V2DI 2 "nonimmediate_operand" "xm,xm")]
+		      UNSPEC_AESDECLAST))]
+  "TARGET_AES"
+  "@
+   aesdeclast\t{%2, %0|%0, %2}
+   vaesdeclast\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "btver2_decode" "double,double")
+   (set_attr "mode" "TI")])
+
+(define_insn "aesimc"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(unspec:V2DI [(match_operand:V2DI 1 "nonimmediate_operand" "xm")]
+		      UNSPEC_AESIMC))]
+  "TARGET_AES"
+  "%vaesimc\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "aeskeygenassist"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+	(unspec:V2DI [(match_operand:V2DI 1 "nonimmediate_operand" "xm")
+		      (match_operand:SI 2 "const_0_to_255_operand" "n")]
+		     UNSPEC_AESKEYGENASSIST))]
+  "TARGET_AES"
+  "%vaeskeygenassist\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "TI")])
+
+(define_insn "pclmulqdq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
+	(unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0,x")
+		      (match_operand:V2DI 2 "nonimmediate_operand" "xm,xm")
+		      (match_operand:SI 3 "const_0_to_255_operand" "n,n")]
+		     UNSPEC_PCLMUL))]
+  "TARGET_PCLMUL"
+  "@
+   pclmulqdq\t{%3, %2, %0|%0, %2, %3}
+   vpclmulqdq\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "TI")])
+
+(define_expand "avx_vzeroall"
+  [(match_par_dup 0 [(const_int 0)])]
+  "TARGET_AVX"
+{
+  int nregs = TARGET_64BIT ? 16 : 8;
+  int regno;
+
+  operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1));
+
+  XVECEXP (operands[0], 0, 0)
+    = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx),
+			       UNSPECV_VZEROALL);
+
+  for (regno = 0; regno < nregs; regno++)
+    XVECEXP (operands[0], 0, regno + 1)
+      = gen_rtx_SET (VOIDmode,
+		     gen_rtx_REG (V8SImode, SSE_REGNO (regno)),
+		     CONST0_RTX (V8SImode));
+})
+
+(define_insn "*avx_vzeroall"
+  [(match_parallel 0 "vzeroall_operation"
+    [(unspec_volatile [(const_int 0)] UNSPECV_VZEROALL)])]
+  "TARGET_AVX"
+  "vzeroall"
+  [(set_attr "type" "sse")
+   (set_attr "modrm" "0")
+   (set_attr "memory" "none")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "OI")])
+
+;; Clear the upper 128bits of AVX registers, equivalent to a NOP
+;; if the upper 128bits are unused.
+(define_insn "avx_vzeroupper"
+  [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)]
+  "TARGET_AVX"
+  "vzeroupper"
+  [(set_attr "type" "sse")
+   (set_attr "modrm" "0")
+   (set_attr "memory" "none")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "OI")])
+
+(define_insn "avx2_pbroadcast<mode>"
+  [(set (match_operand:VI 0 "register_operand" "=x")
+	(vec_duplicate:VI
+	  (vec_select:<ssescalarmode>
+	    (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx2_pbroadcast<mode>_1"
+  [(set (match_operand:VI_256 0 "register_operand" "=x,x")
+	(vec_duplicate:VI_256
+	  (vec_select:<ssescalarmode>
+	    (match_operand:VI_256 1 "nonimmediate_operand" "m,x")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "@
+   vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %<iptr>1}
+   vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<avx2_avx512f>_permvar<mode><mask_name>"
+  [(set (match_operand:VI48F_256_512 0 "register_operand" "=v")
+	(unspec:VI48F_256_512
+	  [(match_operand:VI48F_256_512 1 "nonimmediate_operand" "vm")
+	   (match_operand:<sseintvecmode> 2 "register_operand" "v")]
+	  UNSPEC_VPERMVAR))]
+  "TARGET_AVX2 && <mask_mode512bit_condition>"
+  "vperm<ssemodesuffix>\t{%1, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "<mask_prefix2>")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "<avx2_avx512f>_perm<mode>"
+  [(match_operand:VI8F_256_512 0 "register_operand")
+   (match_operand:VI8F_256_512 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_0_to_255_operand")]
+  "TARGET_AVX2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_<avx2_avx512f>_perm<mode>_1 (operands[0], operands[1],
+					      GEN_INT ((mask >> 0) & 3),
+					      GEN_INT ((mask >> 2) & 3),
+					      GEN_INT ((mask >> 4) & 3),
+					      GEN_INT ((mask >> 6) & 3)));
+  DONE;
+})
+
+(define_expand "avx512f_perm<mode>_mask"
+  [(match_operand:V8FI 0 "register_operand")
+   (match_operand:V8FI 1 "nonimmediate_operand")
+   (match_operand:SI 2 "const_0_to_255_operand")
+   (match_operand:V8FI 3 "vector_move_operand")
+   (match_operand:<avx512fmaskmode> 4 "register_operand")]
+  "TARGET_AVX512F"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_<avx2_avx512f>_perm<mode>_1_mask (operands[0], operands[1],
+						   GEN_INT ((mask >> 0) & 3),
+						   GEN_INT ((mask >> 2) & 3),
+						   GEN_INT ((mask >> 4) & 3),
+						   GEN_INT ((mask >> 6) & 3),
+						   operands[3], operands[4]));
+  DONE;
+})
+
+(define_insn "<avx2_avx512f>_perm<mode>_1<mask_name>"
+  [(set (match_operand:VI8F_256_512 0 "register_operand" "=v")
+	(vec_select:VI8F_256_512
+	  (match_operand:VI8F_256_512 1 "nonimmediate_operand" "vm")
+	  (parallel [(match_operand 2 "const_0_to_3_operand")
+		     (match_operand 3 "const_0_to_3_operand")
+		     (match_operand 4 "const_0_to_3_operand")
+		     (match_operand 5 "const_0_to_3_operand")])))]
+  "TARGET_AVX2 && <mask_mode512bit_condition>"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= INTVAL (operands[4]) << 4;
+  mask |= INTVAL (operands[5]) << 6;
+  operands[2] = GEN_INT (mask);
+  return "vperm<ssemodesuffix>\t{%2, %1, %0<mask_operand6>|%0<mask_operand6>, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "<mask_prefix2>")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx2_permv2ti"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(unspec:V4DI
+	  [(match_operand:V4DI 1 "register_operand" "x")
+	   (match_operand:V4DI 2 "nonimmediate_operand" "xm")
+	   (match_operand:SI 3 "const_0_to_255_operand" "n")]
+	  UNSPEC_VPERMTI))]
+  "TARGET_AVX2"
+  "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "avx2_vec_dupv4df"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+	(vec_duplicate:V4DF
+	  (vec_select:DF
+	    (match_operand:V2DF 1 "register_operand" "x")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "vbroadcastsd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
+;; Modes handled by AVX vec_dup patterns.
+(define_mode_iterator AVX_VEC_DUP_MODE
+  [V8SI V8SF V4DI V4DF])
+
+(define_insn "vec_dup<mode>"
+  [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "=x,x,x")
+	(vec_duplicate:AVX_VEC_DUP_MODE
+	  (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "m,x,?x")))]
+  "TARGET_AVX"
+  "@
+   vbroadcast<ssescalarmodesuffix>\t{%1, %0|%0, %1}
+   vbroadcast<ssescalarmodesuffix>\t{%x1, %0|%0, %x1}
+   #"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "isa" "*,avx2,noavx2")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "<mask_codefor>avx512f_vec_dup<mode><mask_name>"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v")
+	(vec_duplicate:VI48F_512
+	  (vec_select:<ssescalarmode>
+	    (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "vm")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<mask_codefor>avx512f_broadcast<mode><mask_name>"
+  [(set (match_operand:V16FI 0 "register_operand" "=v,v")
+	(vec_duplicate:V16FI
+	  (match_operand:<ssexmmmode> 1 "nonimmediate_operand" "v,m")))]
+  "TARGET_AVX512F"
+  "@
+   vshuf<shuffletype>32x4\t{$0x0, %g1, %g1, %0<mask_operand2>|%0<mask_operand2>, %g1, %g1, 0x0}
+   vbroadcast<shuffletype>32x4\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<mask_codefor>avx512f_broadcast<mode><mask_name>"
+  [(set (match_operand:V8FI 0 "register_operand" "=v,v")
+	(vec_duplicate:V8FI
+	  (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "v,m")))]
+  "TARGET_AVX512F"
+  "@
+   vshuf<shuffletype>64x2\t{$0x44, %g1, %g1, %0<mask_operand2>|%0<mask_operand2>, %g1, %g1, 0x44}
+   vbroadcast<shuffletype>64x4\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<mask_codefor>avx512f_vec_dup_gpr<mode><mask_name>"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+	(vec_duplicate:VI48_512
+	  (match_operand:<ssescalarmode> 1 "register_operand" "r")))]
+  "TARGET_AVX512F && (<MODE>mode != V8DImode || TARGET_64BIT)"
+  "vpbroadcast<bcstscalarsuff>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<mask_codefor>avx512f_vec_dup_mem<mode><mask_name>"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v")
+	(vec_duplicate:VI48F_512
+	  (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>broadcast<bcstscalarsuff>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx2_vbroadcasti128_<mode>"
+  [(set (match_operand:VI_256 0 "register_operand" "=x")
+	(vec_concat:VI_256
+	  (match_operand:<ssehalfvecmode> 1 "memory_operand" "m")
+	  (match_dup 1)))]
+  "TARGET_AVX2"
+  "vbroadcasti128\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_split
+  [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand")
+	(vec_duplicate:AVX_VEC_DUP_MODE
+	  (match_operand:<ssescalarmode> 1 "register_operand")))]
+  "TARGET_AVX && !TARGET_AVX2 && reload_completed"
+  [(set (match_dup 2)
+	(vec_duplicate:<ssehalfvecmode> (match_dup 1)))
+   (set (match_dup 0)
+	(vec_concat:AVX_VEC_DUP_MODE (match_dup 2) (match_dup 2)))]
+  "operands[2] = gen_rtx_REG (<ssehalfvecmode>mode, REGNO (operands[0]));")
+
+(define_insn "avx_vbroadcastf128_<mode>"
+  [(set (match_operand:V_256 0 "register_operand" "=x,x,x")
+	(vec_concat:V_256
+	  (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "m,0,?x")
+	  (match_dup 1)))]
+  "TARGET_AVX"
+  "@
+   vbroadcast<i128>\t{%1, %0|%0, %1}
+   vinsert<i128>\t{$1, %1, %0, %0|%0, %0, %1, 1}
+   vperm2<i128>\t{$0, %t1, %t1, %0|%0, %t1, %t1, 0}"
+  [(set_attr "type" "ssemov,sselog1,sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "0,1,1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512cd_maskb_vec_dupv8di"
+  [(set (match_operand:V8DI 0 "register_operand" "=v")
+	(vec_duplicate:V8DI
+	  (zero_extend:DI
+	    (match_operand:QI 1 "register_operand" "Yk"))))]
+  "TARGET_AVX512CD"
+  "vpbroadcastmb2q\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mskmov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+(define_insn "avx512cd_maskw_vec_dupv16si"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(vec_duplicate:V16SI
+	  (zero_extend:SI
+	    (match_operand:HI 1 "register_operand" "Yk"))))]
+  "TARGET_AVX512CD"
+  "vpbroadcastmw2d\t{%1, %0|%0, %1}"
+  [(set_attr "type" "mskmov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "XI")])
+
+;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
+;; If it so happens that the input is in memory, use vbroadcast.
+;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
+(define_insn "*avx_vperm_broadcast_v4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=x,x,x")
+	(vec_select:V4SF
+	  (match_operand:V4SF 1 "nonimmediate_operand" "m,o,x")
+	  (match_parallel 2 "avx_vbroadcast_operand"
+	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
+  "TARGET_AVX"
+{
+  int elt = INTVAL (operands[3]);
+  switch (which_alternative)
+    {
+    case 0:
+    case 1:
+      operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
+      return "vbroadcastss\t{%1, %0|%0, %k1}";
+    case 2:
+      operands[2] = GEN_INT (elt * 0x55);
+      return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "ssemov,ssemov,sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "0,0,1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "SF,SF,V4SF")])
+
+(define_insn_and_split "*avx_vperm_broadcast_<mode>"
+  [(set (match_operand:VF_256 0 "register_operand" "=x,x,x")
+	(vec_select:VF_256
+	  (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?x")
+	  (match_parallel 2 "avx_vbroadcast_operand"
+	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
+  "TARGET_AVX"
+  "#"
+  "&& reload_completed && (<MODE>mode != V4DFmode || !TARGET_AVX2)"
+  [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))]
+{
+  rtx op0 = operands[0], op1 = operands[1];
+  int elt = INTVAL (operands[3]);
+
+  if (REG_P (op1))
+    {
+      int mask;
+
+      if (TARGET_AVX2 && elt == 0)
+	{
+	  emit_insn (gen_vec_dup<mode> (op0, gen_lowpart (<ssescalarmode>mode,
+							  op1)));
+	  DONE;
+	}
+
+      /* Shuffle element we care about into all elements of the 128-bit lane.
+	 The other lane gets shuffled too, but we don't care.  */
+      if (<MODE>mode == V4DFmode)
+	mask = (elt & 1 ? 15 : 0);
+      else
+	mask = (elt & 3) * 0x55;
+      emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));
+
+      /* Shuffle the lane we care about into both lanes of the dest.  */
+      mask = (elt / (<ssescalarnum> / 2)) * 0x11;
+      emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
+      DONE;
+    }
+
+  operands[1] = adjust_address (op1, <ssescalarmode>mode,
+				elt * GET_MODE_SIZE (<ssescalarmode>mode));
+})
+
+(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
+  [(set (match_operand:VF2 0 "register_operand")
+	(vec_select:VF2
+	  (match_operand:VF2 1 "nonimmediate_operand")
+	  (match_operand:SI 2 "const_0_to_255_operand")))]
+  "TARGET_AVX && <mask_mode512bit_condition>"
+{
+  int mask = INTVAL (operands[2]);
+  rtx perm[<ssescalarnum>];
+
+  int i;
+  for (i = 0; i < <ssescalarnum>; i = i + 2)
+    {
+      perm[i]     = GEN_INT (((mask >> i)       & 1) + i);
+      perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i);
+    }
+
+  operands[2]
+    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
+})
+
+(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
+  [(set (match_operand:VF1 0 "register_operand")
+	(vec_select:VF1
+	  (match_operand:VF1 1 "nonimmediate_operand")
+	  (match_operand:SI 2 "const_0_to_255_operand")))]
+  "TARGET_AVX && <mask_mode512bit_condition>"
+{
+  int mask = INTVAL (operands[2]);
+  rtx perm[<ssescalarnum>];
+
+  int i;
+  for (i = 0; i < <ssescalarnum>; i = i + 4)
+    {
+      perm[i]     = GEN_INT (((mask >> 0) & 3) + i);
+      perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i);
+      perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i);
+      perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i);
+    }
+
+  operands[2]
+    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
+})
+
+(define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>"
+  [(set (match_operand:VF 0 "register_operand" "=v")
+	(vec_select:VF
+	  (match_operand:VF 1 "nonimmediate_operand" "vm")
+	  (match_parallel 2 ""
+	    [(match_operand 3 "const_int_operand")])))]
+  "TARGET_AVX && <mask_mode512bit_condition>
+   && avx_vpermilp_parallel (operands[2], <MODE>mode)"
+{
+  int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1;
+  operands[2] = GEN_INT (mask);
+  return "vpermil<ssemodesuffix>\t{%2, %1, %0<mask_operand4>|%0<mask_operand4>, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "<mask_prefix>")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<sse2_avx_avx512f>_vpermilvar<mode>3<mask_name>"
+  [(set (match_operand:VF 0 "register_operand" "=v")
+	(unspec:VF
+	  [(match_operand:VF 1 "register_operand" "v")
+	   (match_operand:<sseintvecmode> 2 "nonimmediate_operand" "vm")]
+	  UNSPEC_VPERMIL))]
+  "TARGET_AVX && <mask_mode512bit_condition>"
+  "vpermil<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "prefix" "<mask_prefix>")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_vpermi2var<mode>3_maskz"
+  [(match_operand:VI48F_512 0 "register_operand" "=v")
+   (match_operand:VI48F_512 1 "register_operand" "v")
+   (match_operand:<sseintvecmode> 2 "register_operand" "0")
+   (match_operand:VI48F_512 3 "nonimmediate_operand" "vm")
+   (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_avx512f_vpermi2var<mode>3_maskz_1 (
+	operands[0], operands[1], operands[2], operands[3],
+	CONST0_RTX (<MODE>mode), operands[4]));
+  DONE;
+})
+
+(define_insn "avx512f_vpermi2var<mode>3<sd_maskz_name>"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v")
+	(unspec:VI48F_512
+	  [(match_operand:VI48F_512 1 "register_operand" "v")
+	   (match_operand:<sseintvecmode> 2 "register_operand" "0")
+	   (match_operand:VI48F_512 3 "nonimmediate_operand" "vm")]
+	  UNSPEC_VPERMI2))]
+  "TARGET_AVX512F"
+  "vpermi2<ssemodesuffix>\t{%3, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_vpermi2var<mode>3_mask"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v")
+	(vec_merge:VI48F_512
+	  (unspec:VI48F_512
+	    [(match_operand:VI48F_512 1 "register_operand" "v")
+	    (match_operand:<sseintvecmode> 2 "register_operand" "0")
+	    (match_operand:VI48F_512 3 "nonimmediate_operand" "vm")]
+	    UNSPEC_VPERMI2_MASK)
+	  (match_dup 0)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vpermi2<ssemodesuffix>\t{%3, %1, %0%{%4%}|%0%{%4%}, %1, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_vpermt2var<mode>3_maskz"
+  [(match_operand:VI48F_512 0 "register_operand" "=v")
+   (match_operand:<sseintvecmode> 1 "register_operand" "v")
+   (match_operand:VI48F_512 2 "register_operand" "0")
+   (match_operand:VI48F_512 3 "nonimmediate_operand" "vm")
+   (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_avx512f_vpermt2var<mode>3_maskz_1 (
+	operands[0], operands[1], operands[2], operands[3],
+	CONST0_RTX (<MODE>mode), operands[4]));
+  DONE;
+})
+
+(define_insn "avx512f_vpermt2var<mode>3<sd_maskz_name>"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v")
+	(unspec:VI48F_512
+	  [(match_operand:<sseintvecmode> 1 "register_operand" "v")
+	   (match_operand:VI48F_512 2 "register_operand" "0")
+	   (match_operand:VI48F_512 3 "nonimmediate_operand" "vm")]
+	  UNSPEC_VPERMT2))]
+  "TARGET_AVX512F"
+  "vpermt2<ssemodesuffix>\t{%3, %1, %0<sd_mask_op4>|%0<sd_mask_op4>, %1, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_vpermt2var<mode>3_mask"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v")
+	(vec_merge:VI48F_512
+	  (unspec:VI48F_512
+	    [(match_operand:<sseintvecmode> 1 "register_operand" "v")
+	    (match_operand:VI48F_512 2 "register_operand" "0")
+	    (match_operand:VI48F_512 3 "nonimmediate_operand" "vm")]
+	    UNSPEC_VPERMT2)
+	  (match_dup 2)
+	  (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+  "vpermt2<ssemodesuffix>\t{%3, %1, %0%{%4%}|%0%{%4%}, %1, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx_vperm2f128<mode>3"
+  [(set (match_operand:AVX256MODE2P 0 "register_operand")
+	(unspec:AVX256MODE2P
+	  [(match_operand:AVX256MODE2P 1 "register_operand")
+	   (match_operand:AVX256MODE2P 2 "nonimmediate_operand")
+	   (match_operand:SI 3 "const_0_to_255_operand")]
+	  UNSPEC_VPERMIL2F128))]
+  "TARGET_AVX"
+{
+  int mask = INTVAL (operands[3]);
+  if ((mask & 0x88) == 0)
+    {
+      rtx perm[<ssescalarnum>], t1, t2;
+      int i, base, nelt = <ssescalarnum>, nelt2 = nelt / 2;
+
+      base = (mask & 3) * nelt2;
+      for (i = 0; i < nelt2; ++i)
+	perm[i] = GEN_INT (base + i);
+
+      base = ((mask >> 4) & 3) * nelt2;
+      for (i = 0; i < nelt2; ++i)
+	perm[i + nelt2] = GEN_INT (base + i);
+
+      t2 = gen_rtx_VEC_CONCAT (<ssedoublevecmode>mode,
+			       operands[1], operands[2]);
+      t1 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, perm));
+      t2 = gen_rtx_VEC_SELECT (<MODE>mode, t2, t1);
+      t2 = gen_rtx_SET (VOIDmode, operands[0], t2);
+      emit_insn (t2);
+      DONE;
+    }
+})
+
+;; Note that bits 7 and 3 of the imm8 allow lanes to be zeroed, which
+;; means that in order to represent this properly in rtl we'd have to
+;; nest *another* vec_concat with a zero operand and do the select from
+;; a 4x wide vector.  That doesn't seem very nice.
+(define_insn "*avx_vperm2f128<mode>_full"
+  [(set (match_operand:AVX256MODE2P 0 "register_operand" "=x")
+	(unspec:AVX256MODE2P
+	  [(match_operand:AVX256MODE2P 1 "register_operand" "x")
+	   (match_operand:AVX256MODE2P 2 "nonimmediate_operand" "xm")
+	   (match_operand:SI 3 "const_0_to_255_operand" "n")]
+	  UNSPEC_VPERMIL2F128))]
+  "TARGET_AVX"
+  "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx_vperm2f128<mode>_nozero"
+  [(set (match_operand:AVX256MODE2P 0 "register_operand" "=x")
+	(vec_select:AVX256MODE2P
+	  (vec_concat:<ssedoublevecmode>
+	    (match_operand:AVX256MODE2P 1 "register_operand" "x")
+	    (match_operand:AVX256MODE2P 2 "nonimmediate_operand" "xm"))
+	  (match_parallel 3 ""
+	    [(match_operand 4 "const_int_operand")])))]
+  "TARGET_AVX
+   && avx_vperm2f128_parallel (operands[3], <MODE>mode)"
+{
+  int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
+  if (mask == 0x12)
+    return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
+  if (mask == 0x20)
+    return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
+  operands[3] = GEN_INT (mask);
+  return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx_vinsertf128<mode>"
+  [(match_operand:V_256 0 "register_operand")
+   (match_operand:V_256 1 "register_operand")
+   (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_1_operand")]
+  "TARGET_AVX"
+{
+  rtx (*insn)(rtx, rtx, rtx);
+
+  switch (INTVAL (operands[3]))
+    {
+    case 0:
+      insn = gen_vec_set_lo_<mode>;
+      break;
+    case 1:
+      insn = gen_vec_set_hi_<mode>;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (insn (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "avx2_vec_set_lo_v4di"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(vec_concat:V4DI
+	  (match_operand:V2DI 2 "nonimmediate_operand" "xm")
+	  (vec_select:V2DI
+	    (match_operand:V4DI 1 "register_operand" "x")
+	    (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_AVX2"
+  "vinserti128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "avx2_vec_set_hi_v4di"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+	(vec_concat:V4DI
+	  (vec_select:V2DI
+	    (match_operand:V4DI 1 "register_operand" "x")
+	    (parallel [(const_int 0) (const_int 1)]))
+	  (match_operand:V2DI 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2"
+  "vinserti128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "vec_set_lo_<mode>"
+  [(set (match_operand:VI8F_256 0 "register_operand" "=x")
+	(vec_concat:VI8F_256
+	  (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" "xm")
+	  (vec_select:<ssehalfvecmode>
+	    (match_operand:VI8F_256 1 "register_operand" "x")
+	    (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_AVX"
+  "vinsert<i128>\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "vec_set_hi_<mode>"
+  [(set (match_operand:VI8F_256 0 "register_operand" "=x")
+	(vec_concat:VI8F_256
+	  (vec_select:<ssehalfvecmode>
+	    (match_operand:VI8F_256 1 "register_operand" "x")
+	    (parallel [(const_int 0) (const_int 1)]))
+	  (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX"
+  "vinsert<i128>\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "vec_set_lo_<mode>"
+  [(set (match_operand:VI4F_256 0 "register_operand" "=x")
+	(vec_concat:VI4F_256
+	  (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" "xm")
+	  (vec_select:<ssehalfvecmode>
+	    (match_operand:VI4F_256 1 "register_operand" "x")
+	    (parallel [(const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)]))))]
+  "TARGET_AVX"
+  "vinsert<i128>\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "vec_set_hi_<mode>"
+  [(set (match_operand:VI4F_256 0 "register_operand" "=x")
+	(vec_concat:VI4F_256
+	  (vec_select:<ssehalfvecmode>
+	    (match_operand:VI4F_256 1 "register_operand" "x")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)]))
+	  (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX"
+  "vinsert<i128>\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "vec_set_lo_v16hi"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(vec_concat:V16HI
+	  (match_operand:V8HI 2 "nonimmediate_operand" "xm")
+	  (vec_select:V8HI
+	    (match_operand:V16HI 1 "register_operand" "x")
+	    (parallel [(const_int 8) (const_int 9)
+		       (const_int 10) (const_int 11)
+		       (const_int 12) (const_int 13)
+		       (const_int 14) (const_int 15)]))))]
+  "TARGET_AVX"
+  "vinsert%~128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "vec_set_hi_v16hi"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+	(vec_concat:V16HI
+	  (vec_select:V8HI
+	    (match_operand:V16HI 1 "register_operand" "x")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)
+		       (const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)]))
+	  (match_operand:V8HI 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX"
+  "vinsert%~128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "vec_set_lo_v32qi"
+  [(set (match_operand:V32QI 0 "register_operand" "=x")
+	(vec_concat:V32QI
+	  (match_operand:V16QI 2 "nonimmediate_operand" "xm")
+	  (vec_select:V16QI
+	    (match_operand:V32QI 1 "register_operand" "x")
+	    (parallel [(const_int 16) (const_int 17)
+		       (const_int 18) (const_int 19)
+		       (const_int 20) (const_int 21)
+		       (const_int 22) (const_int 23)
+		       (const_int 24) (const_int 25)
+		       (const_int 26) (const_int 27)
+		       (const_int 28) (const_int 29)
+		       (const_int 30) (const_int 31)]))))]
+  "TARGET_AVX"
+  "vinsert%~128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "vec_set_hi_v32qi"
+  [(set (match_operand:V32QI 0 "register_operand" "=x")
+	(vec_concat:V32QI
+	  (vec_select:V16QI
+	    (match_operand:V32QI 1 "register_operand" "x")
+	    (parallel [(const_int 0) (const_int 1)
+		       (const_int 2) (const_int 3)
+		       (const_int 4) (const_int 5)
+		       (const_int 6) (const_int 7)
+		       (const_int 8) (const_int 9)
+		       (const_int 10) (const_int 11)
+		       (const_int 12) (const_int 13)
+		       (const_int 14) (const_int 15)]))
+	  (match_operand:V16QI 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX"
+  "vinsert%~128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "<avx_avx2>_maskload<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:V48_AVX2 0 "register_operand" "=x")
+	(unspec:V48_AVX2
+	  [(match_operand:<sseintvecmode> 2 "register_operand" "x")
+	   (match_operand:V48_AVX2 1 "memory_operand" "m")]
+	  UNSPEC_MASKMOV))]
+  "TARGET_AVX"
+  "v<sseintprefix>maskmov<ssemodesuffix>\t{%1, %2, %0|%0, %2, %1}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<avx_avx2>_maskstore<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:V48_AVX2 0 "memory_operand" "+m")
+	(unspec:V48_AVX2
+	  [(match_operand:<sseintvecmode> 1 "register_operand" "x")
+	   (match_operand:V48_AVX2 2 "register_operand" "x")
+	   (match_dup 0)]
+	  UNSPEC_MASKMOV))]
+  "TARGET_AVX"
+  "v<sseintprefix>maskmov<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "vector") 
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "maskload<mode>"
+  [(set (match_operand:V48_AVX2 0 "register_operand")
+	(unspec:V48_AVX2
+	  [(match_operand:<sseintvecmode> 2 "register_operand")
+	   (match_operand:V48_AVX2 1 "memory_operand")]
+	  UNSPEC_MASKMOV))]
+  "TARGET_AVX")
+
+(define_expand "maskstore<mode>"
+  [(set (match_operand:V48_AVX2 0 "memory_operand")
+	(unspec:V48_AVX2
+	  [(match_operand:<sseintvecmode> 2 "register_operand")
+	   (match_operand:V48_AVX2 1 "register_operand")
+	   (match_dup 0)]
+	  UNSPEC_MASKMOV))]
+  "TARGET_AVX")
+
+(define_insn_and_split "avx_<castmode><avxsizesuffix>_<castmode>"
+  [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m")
+	(unspec:AVX256MODE2P
+	  [(match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "xm,x")]
+	  UNSPEC_CAST))]
+  "TARGET_AVX"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  if (REG_P (op0))
+    op0 = gen_rtx_REG (<ssehalfvecmode>mode, REGNO (op0));
+  else
+    op1 = gen_rtx_REG (<MODE>mode, REGNO (op1));
+  emit_move_insn (op0, op1);
+  DONE;
+})
+
+(define_expand "vec_init<mode>"
+  [(match_operand:V_256 0 "register_operand")
+   (match_operand 1)]
+  "TARGET_AVX"
+{
+  ix86_expand_vector_init (false, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "vec_init<mode>"
+  [(match_operand:VI48F_512 0 "register_operand")
+   (match_operand 1)]
+  "TARGET_AVX512F"
+{
+  ix86_expand_vector_init (false, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "avx2_extracti128"
+  [(match_operand:V2DI 0 "nonimmediate_operand")
+   (match_operand:V4DI 1 "register_operand")
+   (match_operand:SI 2 "const_0_to_1_operand")]
+  "TARGET_AVX2"
+{
+  rtx (*insn)(rtx, rtx);
+
+  switch (INTVAL (operands[2]))
+    {
+    case 0:
+      insn = gen_vec_extract_lo_v4di;
+      break;
+    case 1:
+      insn = gen_vec_extract_hi_v4di;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (insn (operands[0], operands[1]));
+  DONE;
+})
+
+(define_expand "avx2_inserti128"
+  [(match_operand:V4DI 0 "register_operand")
+   (match_operand:V4DI 1 "register_operand")
+   (match_operand:V2DI 2 "nonimmediate_operand")
+   (match_operand:SI 3 "const_0_to_1_operand")]
+  "TARGET_AVX2"
+{
+  rtx (*insn)(rtx, rtx, rtx);
+
+  switch (INTVAL (operands[3]))
+    {
+    case 0:
+      insn = gen_avx2_vec_set_lo_v4di;
+      break;
+    case 1:
+      insn = gen_avx2_vec_set_hi_v4di;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (insn (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_insn "<avx2_avx512f>_ashrv<mode><mask_name>"
+  [(set (match_operand:VI48_AVX512F 0 "register_operand" "=v")
+	(ashiftrt:VI48_AVX512F
+	  (match_operand:VI48_AVX512F 1 "register_operand" "v")
+	  (match_operand:VI48_AVX512F 2 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX2 && <mask_mode512bit_condition>"
+  "vpsrav<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<avx2_avx512f>_<shift_insn>v<mode><mask_name>"
+  [(set (match_operand:VI48_AVX2_48_AVX512F 0 "register_operand" "=v")
+	(any_lshift:VI48_AVX2_48_AVX512F
+	  (match_operand:VI48_AVX2_48_AVX512F 1 "register_operand" "v")
+	  (match_operand:VI48_AVX2_48_AVX512F 2 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX2 && <mask_mode512bit_condition>"
+  "vp<vshift>v<ssemodesuffix>\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; For avx_vec_concat<mode> insn pattern
+(define_mode_attr concat_tg_mode
+  [(V32QI "t") (V16HI "t") (V8SI "t") (V4DI "t") (V8SF "t") (V4DF "t")
+   (V64QI "g") (V32HI "g") (V16SI "g") (V8DI "g") (V16SF "g") (V8DF "g")])
+
+(define_insn "avx_vec_concat<mode>"
+  [(set (match_operand:V_256_512 0 "register_operand" "=x,x")
+	(vec_concat:V_256_512
+	  (match_operand:<ssehalfvecmode> 1 "register_operand" "x,x")
+	  (match_operand:<ssehalfvecmode> 2 "vector_move_operand" "xm,C")))]
+  "TARGET_AVX"
+{
+  switch (which_alternative)
+    {
+    case 0:
+      return "vinsert<i128>\t{$0x1, %2, %<concat_tg_mode>1, %0|%0, %<concat_tg_mode>1, %2, 0x1}";
+    case 1:
+      switch (get_attr_mode (insn))
+	{
+	case MODE_V16SF:
+	  return "vmovaps\t{%1, %t0|%t0, %1}";
+	case MODE_V8DF:
+	  return "vmovapd\t{%1, %t0|%t0, %1}";
+	case MODE_V8SF:
+	  return "vmovaps\t{%1, %x0|%x0, %1}";
+	case MODE_V4DF:
+	  return "vmovapd\t{%1, %x0|%x0, %1}";
+	case MODE_XI:
+	  return "vmovdqa\t{%1, %t0|%t0, %1}";
+	case MODE_OI:
+	  return "vmovdqa\t{%1, %x0|%x0, %1}";
+	default:
+	  gcc_unreachable ();
+	}
+    default:
+      gcc_unreachable ();
+    }
+}
+  [(set_attr "type" "sselog,ssemov")
+   (set_attr "prefix_extra" "1,*")
+   (set_attr "length_immediate" "1,*")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "vcvtph2ps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(vec_select:V4SF
+	  (unspec:V8SF [(match_operand:V8HI 1 "register_operand" "x")]
+		       UNSPEC_VCVTPH2PS)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)])))]
+  "TARGET_F16C"
+  "vcvtph2ps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "*vcvtph2ps_load"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+	(unspec:V4SF [(match_operand:V4HI 1 "memory_operand" "m")]
+		     UNSPEC_VCVTPH2PS))]
+  "TARGET_F16C"
+  "vcvtph2ps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "vcvtph2ps256"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(unspec:V8SF [(match_operand:V8HI 1 "nonimmediate_operand" "xm")]
+		     UNSPEC_VCVTPH2PS))]
+  "TARGET_F16C"
+  "vcvtph2ps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "<mask_codefor>avx512f_vcvtph2ps512<mask_name><round_saeonly_name>"
+  [(set (match_operand:V16SF 0 "register_operand" "=v")
+	(unspec:V16SF
+	  [(match_operand:V16HI 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")]
+	  UNSPEC_VCVTPH2PS))]
+  "TARGET_AVX512F"
+  "vcvtph2ps\t{<round_saeonly_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_saeonly_mask_op2>}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V16SF")])
+
+(define_expand "vcvtps2ph"
+  [(set (match_operand:V8HI 0 "register_operand")
+	(vec_concat:V8HI
+	  (unspec:V4HI [(match_operand:V4SF 1 "register_operand")
+			(match_operand:SI 2 "const_0_to_255_operand")]
+		       UNSPEC_VCVTPS2PH)
+	  (match_dup 3)))]
+  "TARGET_F16C"
+  "operands[3] = CONST0_RTX (V4HImode);")
+
+(define_insn "*vcvtps2ph"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+	(vec_concat:V8HI
+	  (unspec:V4HI [(match_operand:V4SF 1 "register_operand" "x")
+			(match_operand:SI 2 "const_0_to_255_operand" "N")]
+		       UNSPEC_VCVTPS2PH)
+	  (match_operand:V4HI 3 "const0_operand")))]
+  "TARGET_F16C"
+  "vcvtps2ph\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "*vcvtps2ph_store"
+  [(set (match_operand:V4HI 0 "memory_operand" "=m")
+	(unspec:V4HI [(match_operand:V4SF 1 "register_operand" "x")
+		      (match_operand:SI 2 "const_0_to_255_operand" "N")]
+		     UNSPEC_VCVTPS2PH))]
+  "TARGET_F16C"
+  "vcvtps2ph\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "vcvtps2ph256"
+  [(set (match_operand:V8HI 0 "nonimmediate_operand" "=xm")
+	(unspec:V8HI [(match_operand:V8SF 1 "register_operand" "x")
+		      (match_operand:SI 2 "const_0_to_255_operand" "N")]
+		     UNSPEC_VCVTPS2PH))]
+  "TARGET_F16C"
+  "vcvtps2ph\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "vex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "V8SF")])
+
+(define_insn "<mask_codefor>avx512f_vcvtps2ph512<mask_name>"
+  [(set (match_operand:V16HI 0 "nonimmediate_operand" "=vm")
+	(unspec:V16HI
+	  [(match_operand:V16SF 1 "register_operand" "v")
+	   (match_operand:SI 2 "const_0_to_255_operand" "N")]
+	  UNSPEC_VCVTPS2PH))]
+  "TARGET_AVX512F"
+  "vcvtps2ph\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "V16SF")])
+
+;; For gather* insn patterns
+(define_mode_iterator VEC_GATHER_MODE
+		      [V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF])
+(define_mode_attr VEC_GATHER_IDXSI
+		      [(V2DI "V4SI") (V4DI "V4SI") (V8DI "V8SI")
+		       (V2DF "V4SI") (V4DF "V4SI") (V8DF "V8SI")
+		       (V4SI "V4SI") (V8SI "V8SI") (V16SI "V16SI")
+		       (V4SF "V4SI") (V8SF "V8SI") (V16SF "V16SI")])
+
+(define_mode_attr VEC_GATHER_IDXDI
+		      [(V2DI "V2DI") (V4DI "V4DI") (V8DI "V8DI")
+		       (V2DF "V2DI") (V4DF "V4DI") (V8DF "V8DI")
+		       (V4SI "V2DI") (V8SI "V4DI") (V16SI "V8DI")
+		       (V4SF "V2DI") (V8SF "V4DI") (V16SF "V8DI")])
+
+(define_mode_attr VEC_GATHER_SRCDI
+		      [(V2DI "V2DI") (V4DI "V4DI") (V8DI "V8DI")
+		       (V2DF "V2DF") (V4DF "V4DF") (V8DF "V8DF")
+		       (V4SI "V4SI") (V8SI "V4SI") (V16SI "V8SI")
+		       (V4SF "V4SF") (V8SF "V4SF") (V16SF "V8SF")])
+
+(define_expand "avx2_gathersi<mode>"
+  [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand")
+		   (unspec:VEC_GATHER_MODE
+		     [(match_operand:VEC_GATHER_MODE 1 "register_operand")
+		      (mem:<ssescalarmode>
+			(match_par_dup 7
+			  [(match_operand 2 "vsib_address_operand")
+			   (match_operand:<VEC_GATHER_IDXSI>
+			      3 "register_operand")
+			   (match_operand:SI 5 "const1248_operand ")]))
+		      (mem:BLK (scratch))
+		      (match_operand:VEC_GATHER_MODE 4 "register_operand")]
+		     UNSPEC_GATHER))
+	      (clobber (match_scratch:VEC_GATHER_MODE 6))])]
+  "TARGET_AVX2"
+{
+  operands[7]
+    = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3],
+					operands[5]), UNSPEC_VSIBADDR);
+})
+
+(define_insn "*avx2_gathersi<mode>"
+  [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
+	(unspec:VEC_GATHER_MODE
+	  [(match_operand:VEC_GATHER_MODE 2 "register_operand" "0")
+	   (match_operator:<ssescalarmode> 7 "vsib_mem_operator"
+	     [(unspec:P
+		[(match_operand:P 3 "vsib_address_operand" "Tv")
+		 (match_operand:<VEC_GATHER_IDXSI> 4 "register_operand" "x")
+		 (match_operand:SI 6 "const1248_operand" "n")]
+		UNSPEC_VSIBADDR)])
+	   (mem:BLK (scratch))
+	   (match_operand:VEC_GATHER_MODE 5 "register_operand" "1")]
+	  UNSPEC_GATHER))
+   (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
+  "TARGET_AVX2"
+  "v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx2_gathersi<mode>_2"
+  [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
+	(unspec:VEC_GATHER_MODE
+	  [(pc)
+	   (match_operator:<ssescalarmode> 6 "vsib_mem_operator"
+	     [(unspec:P
+		[(match_operand:P 2 "vsib_address_operand" "Tv")
+		 (match_operand:<VEC_GATHER_IDXSI> 3 "register_operand" "x")
+		 (match_operand:SI 5 "const1248_operand" "n")]
+		UNSPEC_VSIBADDR)])
+	   (mem:BLK (scratch))
+	   (match_operand:VEC_GATHER_MODE 4 "register_operand" "1")]
+	  UNSPEC_GATHER))
+   (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
+  "TARGET_AVX2"
+  "v<sseintprefix>gatherd<ssemodesuffix>\t{%1, %6, %0|%0, %6, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx2_gatherdi<mode>"
+  [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand")
+		   (unspec:VEC_GATHER_MODE
+		     [(match_operand:<VEC_GATHER_SRCDI> 1 "register_operand")
+		      (mem:<ssescalarmode>
+			(match_par_dup 7
+			  [(match_operand 2 "vsib_address_operand")
+			   (match_operand:<VEC_GATHER_IDXDI>
+			      3 "register_operand")
+			   (match_operand:SI 5 "const1248_operand ")]))
+		      (mem:BLK (scratch))
+		      (match_operand:<VEC_GATHER_SRCDI>
+			4 "register_operand")]
+		     UNSPEC_GATHER))
+	      (clobber (match_scratch:VEC_GATHER_MODE 6))])]
+  "TARGET_AVX2"
+{
+  operands[7]
+    = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3],
+					operands[5]), UNSPEC_VSIBADDR);
+})
+
+(define_insn "*avx2_gatherdi<mode>"
+  [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
+	(unspec:VEC_GATHER_MODE
+	  [(match_operand:<VEC_GATHER_SRCDI> 2 "register_operand" "0")
+	   (match_operator:<ssescalarmode> 7 "vsib_mem_operator"
+	     [(unspec:P
+		[(match_operand:P 3 "vsib_address_operand" "Tv")
+		 (match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x")
+		 (match_operand:SI 6 "const1248_operand" "n")]
+		UNSPEC_VSIBADDR)])
+	   (mem:BLK (scratch))
+	   (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")]
+	  UNSPEC_GATHER))
+   (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
+  "TARGET_AVX2"
+  "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx2_gatherdi<mode>_2"
+  [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
+	(unspec:VEC_GATHER_MODE
+	  [(pc)
+	   (match_operator:<ssescalarmode> 6 "vsib_mem_operator"
+	     [(unspec:P
+		[(match_operand:P 2 "vsib_address_operand" "Tv")
+		 (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand" "x")
+		 (match_operand:SI 5 "const1248_operand" "n")]
+		UNSPEC_VSIBADDR)])
+	   (mem:BLK (scratch))
+	   (match_operand:<VEC_GATHER_SRCDI> 4 "register_operand" "1")]
+	  UNSPEC_GATHER))
+   (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
+  "TARGET_AVX2"
+{
+  if (<MODE>mode != <VEC_GATHER_SRCDI>mode)
+    return "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %x0|%x0, %6, %4}";
+  return "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}";
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx2_gatherdi<mode>_3"
+  [(set (match_operand:<VEC_GATHER_SRCDI> 0 "register_operand" "=&x")
+	(vec_select:<VEC_GATHER_SRCDI>
+	  (unspec:VI4F_256
+	    [(match_operand:<VEC_GATHER_SRCDI> 2 "register_operand" "0")
+	     (match_operator:<ssescalarmode> 7 "vsib_mem_operator"
+	       [(unspec:P
+		  [(match_operand:P 3 "vsib_address_operand" "Tv")
+		   (match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x")
+		   (match_operand:SI 6 "const1248_operand" "n")]
+		  UNSPEC_VSIBADDR)])
+	     (mem:BLK (scratch))
+	     (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")]
+	     UNSPEC_GATHER)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)])))
+   (clobber (match_scratch:VI4F_256 1 "=&x"))]
+  "TARGET_AVX2"
+  "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %0|%0, %7, %5}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx2_gatherdi<mode>_4"
+  [(set (match_operand:<VEC_GATHER_SRCDI> 0 "register_operand" "=&x")
+	(vec_select:<VEC_GATHER_SRCDI>
+	  (unspec:VI4F_256
+	    [(pc)
+	     (match_operator:<ssescalarmode> 6 "vsib_mem_operator"
+	       [(unspec:P
+		  [(match_operand:P 2 "vsib_address_operand" "Tv")
+		   (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand" "x")
+		   (match_operand:SI 5 "const1248_operand" "n")]
+		  UNSPEC_VSIBADDR)])
+	     (mem:BLK (scratch))
+	     (match_operand:<VEC_GATHER_SRCDI> 4 "register_operand" "1")]
+	    UNSPEC_GATHER)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)])))
+   (clobber (match_scratch:VI4F_256 1 "=&x"))]
+  "TARGET_AVX2"
+  "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_gathersi<mode>"
+  [(parallel [(set (match_operand:VI48F_512 0 "register_operand")
+		   (unspec:VI48F_512
+		     [(match_operand:VI48F_512 1 "register_operand")
+		      (match_operand:<avx512fmaskmode> 4 "register_operand")
+		      (mem:<ssescalarmode>
+			(match_par_dup 6
+			  [(match_operand 2 "vsib_address_operand")
+			   (match_operand:<VEC_GATHER_IDXSI> 3 "register_operand")
+			   (match_operand:SI 5 "const1248_operand")]))]
+		     UNSPEC_GATHER))
+	      (clobber (match_scratch:<avx512fmaskmode> 7))])]
+  "TARGET_AVX512F"
+{
+  operands[6]
+    = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3],
+					operands[5]), UNSPEC_VSIBADDR);
+})
+
+(define_insn "*avx512f_gathersi<mode>"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=&v")
+	(unspec:VI48F_512
+	  [(match_operand:VI48F_512 1 "register_operand" "0")
+	   (match_operand:<avx512fmaskmode> 7 "register_operand" "2")
+	   (match_operator:<ssescalarmode> 6 "vsib_mem_operator"
+	     [(unspec:P
+		[(match_operand:P 4 "vsib_address_operand" "Tv")
+		 (match_operand:<VEC_GATHER_IDXSI> 3 "register_operand" "v")
+		 (match_operand:SI 5 "const1248_operand" "n")]
+		UNSPEC_VSIBADDR)])]
+	  UNSPEC_GATHER))
+   (clobber (match_scratch:<avx512fmaskmode> 2 "=&Yk"))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>gatherd<ssemodesuffix>\t{%6, %0%{%2%}|%0%{%2%}, %g6}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx512f_gathersi<mode>_2"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=&v")
+	(unspec:VI48F_512
+	  [(pc)
+	   (match_operand:<avx512fmaskmode> 6 "register_operand" "1")
+	   (match_operator:<ssescalarmode> 5 "vsib_mem_operator"
+	     [(unspec:P
+		[(match_operand:P 3 "vsib_address_operand" "Tv")
+		 (match_operand:<VEC_GATHER_IDXSI> 2 "register_operand" "v")
+		 (match_operand:SI 4 "const1248_operand" "n")]
+		UNSPEC_VSIBADDR)])]
+	  UNSPEC_GATHER))
+   (clobber (match_scratch:<avx512fmaskmode> 1 "=&Yk"))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>gatherd<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %g5}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+
+(define_expand "avx512f_gatherdi<mode>"
+  [(parallel [(set (match_operand:VI48F_512 0 "register_operand")
+		   (unspec:VI48F_512
+		     [(match_operand:<VEC_GATHER_SRCDI> 1 "register_operand")
+		      (match_operand:QI 4 "register_operand")
+		      (mem:<ssescalarmode>
+			(match_par_dup 6
+			  [(match_operand 2 "vsib_address_operand")
+			   (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand")
+			   (match_operand:SI 5 "const1248_operand")]))]
+		     UNSPEC_GATHER))
+	      (clobber (match_scratch:QI 7))])]
+  "TARGET_AVX512F"
+{
+  operands[6]
+    = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3],
+					operands[5]), UNSPEC_VSIBADDR);
+})
+
+(define_insn "*avx512f_gatherdi<mode>"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=&v")
+	(unspec:VI48F_512
+	  [(match_operand:<VEC_GATHER_SRCDI> 1 "register_operand" "0")
+	   (match_operand:QI 7 "register_operand" "2")
+	   (match_operator:<ssescalarmode> 6 "vsib_mem_operator"
+	     [(unspec:P
+		[(match_operand:P 4 "vsib_address_operand" "Tv")
+		 (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand" "v")
+		 (match_operand:SI 5 "const1248_operand" "n")]
+		UNSPEC_VSIBADDR)])]
+	  UNSPEC_GATHER))
+   (clobber (match_scratch:QI 2 "=&Yk"))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>gatherq<ssemodesuffix>\t{%6, %1%{%2%}|%1%{%2%}, %g6}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx512f_gatherdi<mode>_2"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=&v")
+	(unspec:VI48F_512
+	  [(pc)
+	   (match_operand:QI 6 "register_operand" "1")
+	   (match_operator:<ssescalarmode> 5 "vsib_mem_operator"
+	     [(unspec:P
+		[(match_operand:P 3 "vsib_address_operand" "Tv")
+		 (match_operand:<VEC_GATHER_IDXDI> 2 "register_operand" "v")
+		 (match_operand:SI 4 "const1248_operand" "n")]
+		UNSPEC_VSIBADDR)])]
+	  UNSPEC_GATHER))
+   (clobber (match_scratch:QI 1 "=&Yk"))]
+  "TARGET_AVX512F"
+{
+  if (<MODE>mode != <VEC_GATHER_SRCDI>mode)
+    return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %t0%{%1%}|%t0%{%1%}, %g5}";
+  return "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %0%{%1%}|%0%{%1%}, %g5}";
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_scattersi<mode>"
+  [(parallel [(set (mem:VI48F_512
+		     (match_par_dup 5
+		       [(match_operand 0 "vsib_address_operand")
+			(match_operand:<VEC_GATHER_IDXSI> 2 "register_operand")
+			(match_operand:SI 4 "const1248_operand")]))
+		   (unspec:VI48F_512
+		     [(match_operand:<avx512fmaskmode> 1 "register_operand")
+		      (match_operand:VI48F_512 3 "register_operand")]
+		     UNSPEC_SCATTER))
+	      (clobber (match_scratch:<avx512fmaskmode> 6))])]
+  "TARGET_AVX512F"
+{
+  operands[5]
+    = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[0], operands[2],
+					operands[4]), UNSPEC_VSIBADDR);
+})
+
+(define_insn "*avx512f_scattersi<mode>"
+  [(set (match_operator:VI48F_512 5 "vsib_mem_operator"
+	  [(unspec:P
+	     [(match_operand:P 0 "vsib_address_operand" "Tv")
+	      (match_operand:<VEC_GATHER_IDXSI> 2 "register_operand" "v")
+	      (match_operand:SI 4 "const1248_operand" "n")]
+	     UNSPEC_VSIBADDR)])
+	(unspec:VI48F_512
+	  [(match_operand:<avx512fmaskmode> 6 "register_operand" "1")
+	   (match_operand:VI48F_512 3 "register_operand" "v")]
+	  UNSPEC_SCATTER))
+   (clobber (match_scratch:<avx512fmaskmode> 1 "=&Yk"))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>scatterd<ssemodesuffix>\t{%3, %5%{%1%}|%5%{%1%}, %3}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_scatterdi<mode>"
+  [(parallel [(set (mem:VI48F_512
+		     (match_par_dup 5
+		       [(match_operand 0 "vsib_address_operand")
+			(match_operand:V8DI 2 "register_operand")
+			(match_operand:SI 4 "const1248_operand")]))
+		   (unspec:VI48F_512
+		     [(match_operand:QI 1 "register_operand")
+		      (match_operand:<VEC_GATHER_SRCDI> 3 "register_operand")]
+		     UNSPEC_SCATTER))
+	      (clobber (match_scratch:QI 6))])]
+  "TARGET_AVX512F"
+{
+  operands[5]
+    = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[0], operands[2],
+					operands[4]), UNSPEC_VSIBADDR);
+})
+
+(define_insn "*avx512f_scatterdi<mode>"
+  [(set (match_operator:VI48F_512 5 "vsib_mem_operator"
+	  [(unspec:P
+	     [(match_operand:P 0 "vsib_address_operand" "Tv")
+	      (match_operand:V8DI 2 "register_operand" "v")
+	      (match_operand:SI 4 "const1248_operand" "n")]
+	     UNSPEC_VSIBADDR)])
+	(unspec:VI48F_512
+	  [(match_operand:QI 6 "register_operand" "1")
+	   (match_operand:<VEC_GATHER_SRCDI> 3 "register_operand" "v")]
+	  UNSPEC_SCATTER))
+   (clobber (match_scratch:QI 1 "=&Yk"))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>scatterq<ssemodesuffix>\t{%3, %5%{%1%}|%5%{%1%}, %3}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_compress<mode>_mask"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v")
+	(unspec:VI48F_512
+	  [(match_operand:VI48F_512 1 "register_operand" "v")
+	   (match_operand:VI48F_512 2 "vector_move_operand" "0C")
+	   (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")]
+	  UNSPEC_COMPRESS))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>compress<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_compressstore<mode>_mask"
+  [(set (match_operand:VI48F_512 0 "memory_operand" "=m")
+	(unspec:VI48F_512
+	  [(match_operand:VI48F_512 1 "register_operand" "x")
+	   (match_dup 0)
+	   (match_operand:<avx512fmaskmode> 2 "register_operand" "Yk")]
+	  UNSPEC_COMPRESS_STORE))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>compress<ssemodesuffix>\t{%1, %0%{%2%}|%0%{%2%}, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "store")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx512f_expand<mode>_maskz"
+  [(set (match_operand:VI48F_512 0 "register_operand")
+	(unspec:VI48F_512
+	  [(match_operand:VI48F_512 1 "nonimmediate_operand")
+	   (match_operand:VI48F_512 2 "vector_move_operand")
+	   (match_operand:<avx512fmaskmode> 3 "register_operand")]
+	  UNSPEC_EXPAND))]
+  "TARGET_AVX512F"
+  "operands[2] = CONST0_RTX (<MODE>mode);")
+
+(define_insn "avx512f_expand<mode>_mask"
+  [(set (match_operand:VI48F_512 0 "register_operand" "=v,v")
+	(unspec:VI48F_512
+	  [(match_operand:VI48F_512 1 "nonimmediate_operand" "v,m")
+	   (match_operand:VI48F_512 2 "vector_move_operand" "0C,0C")
+	   (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")]
+	  UNSPEC_EXPAND))]
+  "TARGET_AVX512F"
+  "v<sseintprefix>expand<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "none,load")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "avx512f_getmant<mode><mask_name><round_saeonly_name>"
+  [(set (match_operand:VF_512 0 "register_operand" "=v")
+	(unspec:VF_512
+	  [(match_operand:VF_512 1 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	   (match_operand:SI 2 "const_0_to_15_operand")]
+	  UNSPEC_GETMANT))]
+  "TARGET_AVX512F"
+  "vgetmant<ssemodesuffix>\t{%2, <round_saeonly_mask_op3>%1, %0<mask_operand3>|%0<mask_operand3>, %1<round_saeonly_mask_op3>, %2}";
+  [(set_attr "prefix" "evex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "avx512f_getmant<mode><round_saeonly_name>"
+  [(set (match_operand:VF_128 0 "register_operand" "=v")
+	(vec_merge:VF_128
+	  (unspec:VF_128
+	    [(match_operand:VF_128 1 "register_operand" "v")
+	     (match_operand:VF_128 2 "<round_saeonly_nimm_predicate>" "<round_saeonly_constraint>")
+	     (match_operand:SI 3 "const_0_to_15_operand")]
+	    UNSPEC_GETMANT)
+	  (match_dup 1)
+	  (const_int 1)))]
+   "TARGET_AVX512F"
+   "vgetmant<ssescalarmodesuffix>\t{%3, <round_saeonly_op4>%2, %1, %0|%0, %1, %2<round_saeonly_op4>, %3}";
+   [(set_attr "prefix" "evex")
+   (set_attr "mode" "<ssescalarmode>")])
+
+(define_insn "clz<mode>2<mask_name>"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+	(clz:VI48_512
+	  (match_operand:VI48_512 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512CD"
+  "vplzcnt<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<mask_codefor>conflict<mode><mask_name>"
+  [(set (match_operand:VI48_512 0 "register_operand" "=v")
+	(unspec:VI48_512
+	  [(match_operand:VI48_512 1 "nonimmediate_operand" "vm")]
+	  UNSPEC_CONFLICT))]
+  "TARGET_AVX512CD"
+  "vpconflict<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "sha1msg1"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(unspec:V4SI
+	  [(match_operand:V4SI 1 "register_operand" "0")
+	   (match_operand:V4SI 2 "nonimmediate_operand" "xm")]
+	  UNSPEC_SHA1MSG1))]
+  "TARGET_SHA"
+  "sha1msg1\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sha1msg2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(unspec:V4SI
+	  [(match_operand:V4SI 1 "register_operand" "0")
+	   (match_operand:V4SI 2 "nonimmediate_operand" "xm")]
+	  UNSPEC_SHA1MSG2))]
+  "TARGET_SHA"
+  "sha1msg2\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sha1nexte"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(unspec:V4SI
+	  [(match_operand:V4SI 1 "register_operand" "0")
+	   (match_operand:V4SI 2 "nonimmediate_operand" "xm")]
+	  UNSPEC_SHA1NEXTE))]
+  "TARGET_SHA"
+  "sha1nexte\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sha1rnds4"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(unspec:V4SI
+	  [(match_operand:V4SI 1 "register_operand" "0")
+	   (match_operand:V4SI 2 "nonimmediate_operand" "xm")
+	   (match_operand:SI 3 "const_0_to_3_operand" "n")]
+	  UNSPEC_SHA1RNDS4))]
+  "TARGET_SHA"
+  "sha1rnds4\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "sselog1")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sha256msg1"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(unspec:V4SI
+	  [(match_operand:V4SI 1 "register_operand" "0")
+	   (match_operand:V4SI 2 "nonimmediate_operand" "xm")]
+	  UNSPEC_SHA256MSG1))]
+  "TARGET_SHA"
+  "sha256msg1\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sha256msg2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(unspec:V4SI
+	  [(match_operand:V4SI 1 "register_operand" "0")
+	   (match_operand:V4SI 2 "nonimmediate_operand" "xm")]
+	  UNSPEC_SHA256MSG2))]
+  "TARGET_SHA"
+  "sha256msg2\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sha256rnds2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+	(unspec:V4SI
+	  [(match_operand:V4SI 1 "register_operand" "0")
+	   (match_operand:V4SI 2 "nonimmediate_operand" "xm")
+	   (match_operand:V4SI 3 "register_operand" "Yz")]
+	  UNSPEC_SHA256RNDS2))]
+  "TARGET_SHA"
+  "sha256rnds2\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "sselog1")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
diff --git a/gcc-4.9/gcc/config/i386/ssemath.h b/gcc-4.9/gcc/config/i386/ssemath.h
new file mode 100644
index 000000000..ec8d74a62
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/ssemath.h
@@ -0,0 +1,28 @@
+/* Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#undef TARGET_FPMATH_DEFAULT
+#define TARGET_FPMATH_DEFAULT (TARGET_SSE2 ? FPMATH_SSE : FPMATH_387)
+
+#undef TARGET_FPMATH_DEFAULT_P
+#define TARGET_FPMATH_DEFAULT_P(x) \
+  (TARGET_SSE2_P(x) ? FPMATH_SSE : FPMATH_387)
+
+#undef TARGET_SUBTARGET32_ISA_DEFAULT
+#define TARGET_SUBTARGET32_ISA_DEFAULT \
+   (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2)
diff --git a/gcc-4.9/gcc/config/i386/stringop.def b/gcc-4.9/gcc/config/i386/stringop.def
new file mode 100644
index 000000000..279aa1961
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/stringop.def
@@ -0,0 +1,37 @@
+/* Definitions for stringop strategy for IA-32.
+   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License 
+along with GCC; see the files COPYING3.  If not,
+see <http://www.gnu.org/licenses/>.  */
+
+DEF_ENUM
+DEF_ALG (no_stringop, no_stringop)
+DEF_ENUM
+DEF_ALG (libcall, libcall)
+DEF_ENUM
+DEF_ALG (rep_prefix_1_byte, rep_byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_4_byte, rep_4byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_8_byte, rep_8byte)
+DEF_ENUM
+DEF_ALG (loop_1_byte, byte_loop)
+DEF_ENUM
+DEF_ALG (loop, loop)
+DEF_ENUM
+DEF_ALG (unrolled_loop, unrolled_loop)
+DEF_ENUM
+DEF_ALG (vector_loop, vector_loop)
diff --git a/gcc-4.9/gcc/config/i386/stringop.opt b/gcc-4.9/gcc/config/i386/stringop.opt
new file mode 100644
index 000000000..bb8d2d2b7
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/stringop.opt
@@ -0,0 +1,31 @@
+/* Definitions for stringop option handling for IA-32.
+   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the files COPYING3.  If not,
+see <http://www.gnu.org/licenses/>.  */
+
+Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
+
+#undef DEF_ENUM
+#define DEF_ENUM EnumValue
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
+
+#include "stringop.def"
+
+#undef DEF_ENUM
+#undef DEF_ALG
diff --git a/gcc-4.9/gcc/config/i386/subst.md b/gcc-4.9/gcc/config/i386/subst.md
new file mode 100644
index 000000000..1654cbae6
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/subst.md
@@ -0,0 +1,198 @@
+;; GCC machine description for AVX512F instructions
+;; Copyright (C) 2013-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; Some iterators for extending subst as much as possible
+;; All vectors (Use it for destination)
+(define_mode_iterator SUBST_V
+  [V16QI
+   V16HI V8HI
+   V16SI V8SI  V4SI
+   V8DI  V4DI  V2DI
+   V16SF V8SF  V4SF
+   V8DF  V4DF  V2DF])
+
+(define_mode_iterator SUBST_S
+  [QI HI SI DI])
+
+(define_mode_iterator SUBST_A
+  [V16QI
+   V16HI V8HI
+   V16SI V8SI  V4SI
+   V8DI  V4DI  V2DI
+   V16SF V8SF  V4SF
+   V8DF  V4DF  V2DF
+   QI HI SI DI SF DF
+   CCFP CCFPU])
+
+(define_subst_attr "mask_name" "mask" "" "_mask")
+(define_subst_attr "mask_applied" "mask" "false" "true")
+(define_subst_attr "mask_operand2" "mask" "" "%{%3%}%N2")
+(define_subst_attr "mask_operand3" "mask" "" "%{%4%}%N3")
+(define_subst_attr "mask_operand3_1" "mask" "" "%%{%%4%%}%%N3") ;; for sprintf
+(define_subst_attr "mask_operand4" "mask" "" "%{%5%}%N4")
+(define_subst_attr "mask_operand6" "mask" "" "%{%7%}%N6")
+(define_subst_attr "mask_operand11" "mask" "" "%{%12%}%N11")
+(define_subst_attr "mask_operand18" "mask" "" "%{%19%}%N18")
+(define_subst_attr "mask_operand19" "mask" "" "%{%20%}%N19")
+(define_subst_attr "mask_codefor" "mask" "*" "")
+(define_subst_attr "mask_mode512bit_condition" "mask" "1" "(<MODE_SIZE> == 64)")
+(define_subst_attr "store_mask_constraint" "mask" "vm" "v")
+(define_subst_attr "store_mask_predicate" "mask" "nonimmediate_operand" "register_operand")
+(define_subst_attr "mask_prefix" "mask" "vex" "evex")
+(define_subst_attr "mask_prefix2" "mask" "maybe_vex" "evex")
+(define_subst_attr "mask_prefix3" "mask" "orig,vex" "evex")
+
+(define_subst "mask"
+  [(set (match_operand:SUBST_V 0)
+        (match_operand:SUBST_V 1))]
+  "TARGET_AVX512F"
+  [(set (match_dup 0)
+        (vec_merge:SUBST_V
+	  (match_dup 1)
+	  (match_operand:SUBST_V 2 "vector_move_operand" "0C")
+	  (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))])
+
+(define_subst_attr "mask_scalar_merge_name" "mask_scalar_merge" "" "_mask")
+(define_subst_attr "mask_scalar_merge_operand3" "mask_scalar_merge" "" "%{%3%}")
+(define_subst_attr "mask_scalar_merge_operand4" "mask_scalar_merge" "" "%{%4%}")
+
+(define_subst "mask_scalar_merge"
+  [(set (match_operand:SUBST_S 0)
+        (match_operand:SUBST_S 1))]
+  "TARGET_AVX512F"
+  [(set (match_dup 0)
+        (and:SUBST_S
+	  (match_dup 1)
+	  (match_operand:SUBST_S 3 "register_operand" "Yk")))])
+
+(define_subst_attr "sd_maskz_name" "sd" "" "_maskz_1")
+(define_subst_attr "sd_mask_op4" "sd" "" "%{%5%}%N4")
+(define_subst_attr "sd_mask_op5" "sd" "" "%{%6%}%N5")
+(define_subst_attr "sd_mask_codefor" "sd" "*" "")
+(define_subst_attr "sd_mask_mode512bit_condition" "sd" "1" "(<MODE_SIZE> == 64)")
+
+(define_subst "sd"
+ [(set (match_operand:SUBST_V 0)
+       (match_operand:SUBST_V 1))]
+ ""
+ [(set (match_dup 0)
+       (vec_merge:SUBST_V
+	 (match_dup 1)
+	 (match_operand:SUBST_V 2 "const0_operand" "C")
+	 (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))
+])
+
+(define_subst_attr "round_name" "round" "" "_round")
+(define_subst_attr "round_mask_operand2" "mask" "%R2" "%R4")
+(define_subst_attr "round_mask_operand3" "mask" "%R3" "%R5")
+(define_subst_attr "round_sd_mask_operand4" "sd" "%R4" "%R6")
+(define_subst_attr "round_op2" "round" "" "%R2")
+(define_subst_attr "round_op3" "round" "" "%R3")
+(define_subst_attr "round_op4" "round" "" "%R4")
+(define_subst_attr "round_op5" "round" "" "%R5")
+(define_subst_attr "round_op6" "round" "" "%R6")
+(define_subst_attr "round_mask_op2" "round" "" "<round_mask_operand2>")
+(define_subst_attr "round_mask_op3" "round" "" "<round_mask_operand3>")
+(define_subst_attr "round_mask_scalar_op3" "round" "" "<round_mask_scalar_operand3>")
+(define_subst_attr "round_sd_mask_op4" "round" "" "<round_sd_mask_operand4>")
+(define_subst_attr "round_constraint" "round" "vm" "v")
+(define_subst_attr "round_constraint2" "round" "m" "v")
+(define_subst_attr "round_constraint3" "round" "rm" "r")
+(define_subst_attr "round_nimm_predicate" "round" "nonimmediate_operand" "register_operand")
+(define_subst_attr "round_prefix" "round" "vex" "evex")
+(define_subst_attr "round_mode512bit_condition" "round" "1" "(<MODE>mode == V16SFmode || <MODE>mode == V8DFmode)")
+(define_subst_attr "round_modev4sf_condition" "round" "1" "(<MODE>mode == V4SFmode)")
+(define_subst_attr "round_codefor" "round" "*" "")
+(define_subst_attr "round_opnum" "round" "5" "6")
+
+(define_subst "round"
+  [(set (match_operand:SUBST_A 0)
+        (match_operand:SUBST_A 1))]
+  "TARGET_AVX512F"
+  [(parallel[
+     (set (match_dup 0)
+          (match_dup 1))
+     (unspec [(match_operand:SI 2 "const_4_or_8_to_11_operand")] UNSPEC_EMBEDDED_ROUNDING)])])
+
+(define_subst_attr "round_saeonly_name" "round_saeonly" "" "_round")
+(define_subst_attr "round_saeonly_mask_operand2" "mask" "%r2" "%r4")
+(define_subst_attr "round_saeonly_mask_operand3" "mask" "%r3" "%r5")
+(define_subst_attr "round_saeonly_mask_scalar_merge_operand4" "mask_scalar_merge" "%r4" "%r5")
+(define_subst_attr "round_saeonly_sd_mask_operand5" "sd" "%r5" "%r7")
+(define_subst_attr "round_saeonly_op2" "round_saeonly" "" "%r2")
+(define_subst_attr "round_saeonly_op3" "round_saeonly" "" "%r3")
+(define_subst_attr "round_saeonly_op4" "round_saeonly" "" "%r4")
+(define_subst_attr "round_saeonly_op5" "round_saeonly" "" "%r5")
+(define_subst_attr "round_saeonly_op6" "round_saeonly" "" "%r6")
+(define_subst_attr "round_saeonly_prefix" "round_saeonly" "vex" "evex")
+(define_subst_attr "round_saeonly_mask_op2" "round_saeonly" "" "<round_saeonly_mask_operand2>")
+(define_subst_attr "round_saeonly_mask_op3" "round_saeonly" "" "<round_saeonly_mask_operand3>")
+(define_subst_attr "round_saeonly_mask_scalar_merge_op4" "round_saeonly" "" "<round_saeonly_mask_scalar_merge_operand4>")
+(define_subst_attr "round_saeonly_sd_mask_op5" "round_saeonly" "" "<round_saeonly_sd_mask_operand5>")
+(define_subst_attr "round_saeonly_constraint" "round_saeonly" "vm" "v")
+(define_subst_attr "round_saeonly_constraint2" "round_saeonly" "m" "v")
+(define_subst_attr "round_saeonly_nimm_predicate" "round_saeonly" "nonimmediate_operand" "register_operand")
+(define_subst_attr "round_saeonly_mode512bit_condition" "round_saeonly" "1" "(<MODE>mode == V16SFmode || <MODE>mode == V8DFmode)")
+
+(define_subst "round_saeonly"
+  [(set (match_operand:SUBST_A 0)
+        (match_operand:SUBST_A 1))]
+  "TARGET_AVX512F"
+  [(parallel[
+     (set (match_dup 0)
+          (match_dup 1))
+     (unspec [(match_operand:SI 2 "const48_operand")] UNSPEC_EMBEDDED_ROUNDING)])])
+
+(define_subst_attr "round_expand_name" "round_expand" "" "_round")
+(define_subst_attr "round_expand_nimm_predicate" "round_expand" "nonimmediate_operand" "register_operand")
+(define_subst_attr "round_expand_operand" "round_expand" "" ", operands[5]")
+
+(define_subst "round_expand"
+ [(match_operand:SUBST_V 0)
+  (match_operand:SUBST_V 1)
+  (match_operand:SUBST_V 2)
+  (match_operand:SUBST_V 3)
+  (match_operand:SUBST_S 4)]
+  "TARGET_AVX512F"
+  [(match_dup 0)
+   (match_dup 1)
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 4)
+   (unspec [(match_operand:SI 5 "const_4_or_8_to_11_operand")] UNSPEC_EMBEDDED_ROUNDING)])
+
+(define_subst_attr "round_saeonly_expand_name" "round_saeonly_expand" "" "_round")
+(define_subst_attr "round_saeonly_expand_nimm_predicate" "round_saeonly_expand" "nonimmediate_operand" "register_operand")
+(define_subst_attr "round_saeonly_expand_operand6" "round_saeonly_expand" "" ", operands[6]")
+
+(define_subst "round_saeonly_expand"
+ [(match_operand:SUBST_V 0)
+  (match_operand:SUBST_V 1)
+  (match_operand:SUBST_V 2)
+  (match_operand:SUBST_A 3)
+  (match_operand:SI 4)
+  (match_operand:SUBST_S 5)]
+  "TARGET_AVX512F"
+  [(match_dup 0)
+   (match_dup 1)
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 4)
+   (match_dup 5)
+   (unspec [(match_operand:SI 6 "const48_operand")] UNSPEC_EMBEDDED_ROUNDING)])
diff --git a/gcc-4.9/gcc/config/i386/sync.md b/gcc-4.9/gcc/config/i386/sync.md
new file mode 100644
index 000000000..4cd449ebf
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/sync.md
@@ -0,0 +1,606 @@
+;; GCC machine description for i386 synchronization instructions.
+;; Copyright (C) 2005-2014 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_c_enum "unspec" [
+  UNSPEC_LFENCE
+  UNSPEC_SFENCE
+  UNSPEC_MFENCE
+  UNSPEC_MOVA	; For __atomic support
+  UNSPEC_LDA
+  UNSPEC_STA
+])
+
+(define_c_enum "unspecv" [
+  UNSPECV_CMPXCHG
+  UNSPECV_XCHG
+  UNSPECV_LOCK
+])
+
+(define_expand "sse2_lfence"
+  [(set (match_dup 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_LFENCE))]
+  "TARGET_SSE2"
+{
+  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+  MEM_VOLATILE_P (operands[0]) = 1;
+})
+
+(define_insn "*sse2_lfence"
+  [(set (match_operand:BLK 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_LFENCE))]
+  "TARGET_SSE2"
+  "lfence"
+  [(set_attr "type" "sse")
+   (set_attr "length_address" "0")
+   (set_attr "atom_sse_attr" "lfence")
+   (set_attr "memory" "unknown")])
+
+(define_expand "sse_sfence"
+  [(set (match_dup 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_SFENCE))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+{
+  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+  MEM_VOLATILE_P (operands[0]) = 1;
+})
+
+(define_insn "*sse_sfence"
+  [(set (match_operand:BLK 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_SFENCE))]
+  "TARGET_SSE || TARGET_3DNOW_A"
+  "sfence"
+  [(set_attr "type" "sse")
+   (set_attr "length_address" "0")
+   (set_attr "atom_sse_attr" "fence")
+   (set_attr "memory" "unknown")])
+
+(define_expand "sse2_mfence"
+  [(set (match_dup 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))]
+  "TARGET_SSE2"
+{
+  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+  MEM_VOLATILE_P (operands[0]) = 1;
+})
+
+(define_insn "mfence_sse2"
+  [(set (match_operand:BLK 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))]
+  "TARGET_64BIT || TARGET_SSE2"
+  "mfence"
+  [(set_attr "type" "sse")
+   (set_attr "length_address" "0")
+   (set_attr "atom_sse_attr" "fence")
+   (set_attr "memory" "unknown")])
+
+(define_insn "mfence_nosse"
+  [(set (match_operand:BLK 0)
+	(unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))
+   (clobber (reg:CC FLAGS_REG))]
+  "!(TARGET_64BIT || TARGET_SSE2)"
+  "lock{%;} or{l}\t{$0, (%%esp)|DWORD PTR [esp], 0}"
+  [(set_attr "memory" "unknown")])
+
+(define_expand "mem_thread_fence"
+  [(match_operand:SI 0 "const_int_operand")]		;; model
+  ""
+{
+  enum memmodel model = (enum memmodel) (INTVAL (operands[0]) & MEMMODEL_MASK);
+
+  /* Unless this is a SEQ_CST fence, the i386 memory model is strong
+     enough not to require barriers of any kind.  */
+  if (model == MEMMODEL_SEQ_CST)
+    {
+      rtx (*mfence_insn)(rtx);
+      rtx mem;
+
+      if (TARGET_64BIT || TARGET_SSE2)
+	mfence_insn = gen_mfence_sse2;
+      else
+	mfence_insn = gen_mfence_nosse;
+
+      mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+      MEM_VOLATILE_P (mem) = 1;
+
+      emit_insn (mfence_insn (mem));
+    }
+  DONE;
+})
+
+;; ??? From volume 3 section 8.1.1 Guaranteed Atomic Operations,
+;; Only beginning at Pentium family processors do we get any guarantee of
+;; atomicity in aligned 64-bit quantities.  Beginning at P6, we get a
+;; guarantee for 64-bit accesses that do not cross a cacheline boundary.
+;;
+;; Note that the TARGET_CMPXCHG8B test below is a stand-in for "Pentium".
+;;
+;; Importantly, *no* processor makes atomicity guarantees for larger
+;; accesses.  In particular, there's no way to perform an atomic TImode
+;; move, despite the apparent applicability of MOVDQA et al.
+
+(define_mode_iterator ATOMIC
+   [QI HI SI
+    (DI "TARGET_64BIT || (TARGET_CMPXCHG8B && (TARGET_80387 || TARGET_SSE))")
+   ])
+
+(define_expand "atomic_load<mode>"
+  [(set (match_operand:ATOMIC 0 "register_operand")
+	(unspec:ATOMIC [(match_operand:ATOMIC 1 "memory_operand")
+			(match_operand:SI 2 "const_int_operand")]
+		       UNSPEC_MOVA))]
+  ""
+{
+  /* For DImode on 32-bit, we can use the FPU to perform the load.  */
+  if (<MODE>mode == DImode && !TARGET_64BIT)
+    emit_insn (gen_atomic_loaddi_fpu
+	       (operands[0], operands[1],
+	        assign_386_stack_local (DImode, SLOT_TEMP)));
+  else
+    emit_move_insn (operands[0], operands[1]);
+  DONE;
+})
+
+(define_insn_and_split "atomic_loaddi_fpu"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=x,m,?r")
+	(unspec:DI [(match_operand:DI 1 "memory_operand" "m,m,m")]
+		   UNSPEC_MOVA))
+   (clobber (match_operand:DI 2 "memory_operand" "=X,X,m"))
+   (clobber (match_scratch:DF 3 "=X,xf,xf"))]
+  "!TARGET_64BIT && (TARGET_80387 || TARGET_SSE)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx dst = operands[0], src = operands[1];
+  rtx mem = operands[2], tmp = operands[3];
+
+  if (SSE_REG_P (dst))
+    emit_move_insn (dst, src);
+  else
+    {
+      if (MEM_P (dst))
+	mem = dst;
+
+      if (STACK_REG_P (tmp))
+        {
+	  emit_insn (gen_loaddi_via_fpu (tmp, src));
+	  emit_insn (gen_storedi_via_fpu (mem, tmp));
+	}
+      else
+	{
+	  adjust_reg_mode (tmp, DImode);
+	  emit_move_insn (tmp, src);
+	  emit_move_insn (mem, tmp);
+	}
+
+      if (mem != dst)
+	emit_move_insn (dst, mem);
+    }
+  DONE;
+})
+
+(define_expand "atomic_store<mode>"
+  [(set (match_operand:ATOMIC 0 "memory_operand")
+	(unspec:ATOMIC [(match_operand:ATOMIC 1 "register_operand")
+			(match_operand:SI 2 "const_int_operand")]
+		       UNSPEC_MOVA))]
+  ""
+{
+  enum memmodel model = (enum memmodel) (INTVAL (operands[2]) & MEMMODEL_MASK);
+
+  if (<MODE>mode == DImode && !TARGET_64BIT)
+    {
+      /* For DImode on 32-bit, we can use the FPU to perform the store.  */
+      /* Note that while we could perform a cmpxchg8b loop, that turns
+	 out to be significantly larger than this plus a barrier.  */
+      emit_insn (gen_atomic_storedi_fpu
+		 (operands[0], operands[1],
+	          assign_386_stack_local (DImode, SLOT_TEMP)));
+    }
+  else
+    {
+      /* For seq-cst stores, when we lack MFENCE, use XCHG.  */
+      if (model == MEMMODEL_SEQ_CST && !(TARGET_64BIT || TARGET_SSE2))
+	{
+	  emit_insn (gen_atomic_exchange<mode> (gen_reg_rtx (<MODE>mode),
+						operands[0], operands[1],
+						operands[2]));
+	  DONE;
+	}
+
+      /* Otherwise use a store.  */
+      emit_insn (gen_atomic_store<mode>_1 (operands[0], operands[1],
+					   operands[2]));
+    }
+  /* ... followed by an MFENCE, if required.  */
+  if (model == MEMMODEL_SEQ_CST)
+    emit_insn (gen_mem_thread_fence (operands[2]));
+  DONE;
+})
+
+(define_insn "atomic_store<mode>_1"
+  [(set (match_operand:SWI 0 "memory_operand" "=m")
+	(unspec:SWI [(match_operand:SWI 1 "<nonmemory_operand>" "<r><i>")
+		     (match_operand:SI 2 "const_int_operand")]
+		    UNSPEC_MOVA))]
+  ""
+  "%K2mov{<imodesuffix>}\t{%1, %0|%0, %1}")
+
+(define_insn_and_split "atomic_storedi_fpu"
+  [(set (match_operand:DI 0 "memory_operand" "=m,m,m")
+	(unspec:DI [(match_operand:DI 1 "register_operand" "x,m,?r")]
+		   UNSPEC_MOVA))
+   (clobber (match_operand:DI 2 "memory_operand" "=X,X,m"))
+   (clobber (match_scratch:DF 3 "=X,xf,xf"))]
+  "!TARGET_64BIT && (TARGET_80387 || TARGET_SSE)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx dst = operands[0], src = operands[1];
+  rtx mem = operands[2], tmp = operands[3];
+
+  if (!SSE_REG_P (src))
+    {
+      if (REG_P (src))
+	{
+	  emit_move_insn (mem, src);
+	  src = mem;
+	}
+
+      if (STACK_REG_P (tmp))
+	{
+	  emit_insn (gen_loaddi_via_fpu (tmp, src));
+	  emit_insn (gen_storedi_via_fpu (dst, tmp));
+	  DONE;
+	}
+      else
+	{
+	  adjust_reg_mode (tmp, DImode);
+	  emit_move_insn (tmp, mem);
+	  src = tmp;
+	}
+    }
+  emit_move_insn (dst, src);
+  DONE;
+})
+
+;; ??? You'd think that we'd be able to perform this via FLOAT + FIX_TRUNC
+;; operations.  But the fix_trunc patterns want way more setup than we want
+;; to provide.  Note that the scratch is DFmode instead of XFmode in order
+;; to make it easy to allocate a scratch in either SSE or FP_REGs above.
+
+(define_insn "loaddi_via_fpu"
+  [(set (match_operand:DF 0 "register_operand" "=f")
+	(unspec:DF [(match_operand:DI 1 "memory_operand" "m")] UNSPEC_LDA))]
+  "TARGET_80387"
+  "fild%Z1\t%1"
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "DF")
+   (set_attr "fp_int_src" "true")])
+
+(define_insn "storedi_via_fpu"
+  [(set (match_operand:DI 0 "memory_operand" "=m")
+	(unspec:DI [(match_operand:DF 1 "register_operand" "f")] UNSPEC_STA))]
+  "TARGET_80387"
+{
+  gcc_assert (find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != NULL_RTX);
+
+  return "fistp%Z0\t%0";
+}
+  [(set_attr "type" "fmov")
+   (set_attr "mode" "DI")])
+
+(define_expand "atomic_compare_and_swap<mode>"
+  [(match_operand:QI 0 "register_operand")	;; bool success output
+   (match_operand:SWI124 1 "register_operand")	;; oldval output
+   (match_operand:SWI124 2 "memory_operand")	;; memory
+   (match_operand:SWI124 3 "register_operand")	;; expected input
+   (match_operand:SWI124 4 "register_operand")	;; newval input
+   (match_operand:SI 5 "const_int_operand")	;; is_weak
+   (match_operand:SI 6 "const_int_operand")	;; success model
+   (match_operand:SI 7 "const_int_operand")]	;; failure model
+  "TARGET_CMPXCHG"
+{
+  emit_insn
+   (gen_atomic_compare_and_swap<mode>_1
+    (operands[1], operands[2], operands[3], operands[4], operands[6]));
+  ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
+		     const0_rtx);
+  DONE;
+})
+
+(define_mode_iterator CASMODE
+  [(DI "TARGET_64BIT || TARGET_CMPXCHG8B")
+   (TI "TARGET_64BIT && TARGET_CMPXCHG16B")])
+(define_mode_attr CASHMODE [(DI "SI") (TI "DI")])
+
+(define_expand "atomic_compare_and_swap<mode>"
+  [(match_operand:QI 0 "register_operand")	;; bool success output
+   (match_operand:CASMODE 1 "register_operand")	;; oldval output
+   (match_operand:CASMODE 2 "memory_operand")	;; memory
+   (match_operand:CASMODE 3 "register_operand")	;; expected input
+   (match_operand:CASMODE 4 "register_operand")	;; newval input
+   (match_operand:SI 5 "const_int_operand")	;; is_weak
+   (match_operand:SI 6 "const_int_operand")	;; success model
+   (match_operand:SI 7 "const_int_operand")]	;; failure model
+  "TARGET_CMPXCHG"
+{
+  if (<MODE>mode == DImode && TARGET_64BIT)
+    {
+      emit_insn
+       (gen_atomic_compare_and_swapdi_1
+	(operands[1], operands[2], operands[3], operands[4], operands[6]));
+    }
+  else
+    {
+      enum machine_mode hmode = <CASHMODE>mode;
+      rtx lo_o, lo_e, lo_n, hi_o, hi_e, hi_n, mem;
+
+      lo_o = operands[1];
+      mem  = operands[2];
+      lo_e = operands[3];
+      lo_n = operands[4];
+      hi_o = gen_highpart (hmode, lo_o);
+      hi_e = gen_highpart (hmode, lo_e);
+      hi_n = gen_highpart (hmode, lo_n);
+      lo_o = gen_lowpart (hmode, lo_o);
+      lo_e = gen_lowpart (hmode, lo_e);
+      lo_n = gen_lowpart (hmode, lo_n);
+
+      if (!cmpxchg8b_pic_memory_operand (mem, <MODE>mode))
+ 	mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
+
+      emit_insn
+       (gen_atomic_compare_and_swap<mode>_doubleword
+        (lo_o, hi_o, mem, lo_e, hi_e, lo_n, hi_n, operands[6]));
+    }
+
+  ix86_expand_setcc (operands[0], EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
+		     const0_rtx);
+  DONE;
+})
+
+(define_insn "atomic_compare_and_swap<mode>_1"
+  [(set (match_operand:SWI 0 "register_operand" "=a")
+	(unspec_volatile:SWI
+	  [(match_operand:SWI 1 "memory_operand" "+m")
+	   (match_operand:SWI 2 "register_operand" "0")
+	   (match_operand:SWI 3 "register_operand" "<r>")
+	   (match_operand:SI 4 "const_int_operand")]
+	  UNSPECV_CMPXCHG))
+   (set (match_dup 1)
+	(unspec_volatile:SWI [(const_int 0)] UNSPECV_CMPXCHG))
+   (set (reg:CCZ FLAGS_REG)
+        (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG))]
+  "TARGET_CMPXCHG"
+  "lock{%;} %K4cmpxchg{<imodesuffix>}\t{%3, %1|%1, %3}")
+
+;; For double-word compare and swap, we are obliged to play tricks with
+;; the input newval (op5:op6) because the Intel register numbering does
+;; not match the gcc register numbering, so the pair must be CX:BX.
+;; That said, in order to take advantage of possible lower-subreg opts,
+;; treat all of the integral operands in the same way.
+
+;; Operands 5 and 6 really need to be different registers, which in
+;; this case means op5 must not be ecx.  If op5 and op6 are the same
+;; (like when the input is -1LL) GCC might chose to allocate op5 to ecx,
+;; like op6.  This breaks, as the xchg will move the PIC register
+;; contents to %ecx then --> boom.
+
+(define_mode_attr doublemodesuffix [(SI "8") (DI "16")])
+(define_mode_attr regprefix [(SI "e") (DI "r")])
+
+(define_insn "atomic_compare_and_swap<dwi>_doubleword"
+  [(set (match_operand:DWIH 0 "register_operand" "=a,a")
+	(unspec_volatile:DWIH
+	  [(match_operand:<DWI> 2 "cmpxchg8b_pic_memory_operand" "+m,m")
+	   (match_operand:DWIH 3 "register_operand" "0,0")
+	   (match_operand:DWIH 4 "register_operand" "1,1")
+	   (match_operand:DWIH 5 "register_operand" "b,!*r")
+	   (match_operand:DWIH 6 "register_operand" "c,c")
+	   (match_operand:SI 7 "const_int_operand")]
+	  UNSPECV_CMPXCHG))
+   (set (match_operand:DWIH 1 "register_operand" "=d,d")
+	(unspec_volatile:DWIH [(const_int 0)] UNSPECV_CMPXCHG))
+   (set (match_dup 2)
+	(unspec_volatile:<DWI> [(const_int 0)] UNSPECV_CMPXCHG))
+   (set (reg:CCZ FLAGS_REG)
+        (unspec_volatile:CCZ [(const_int 0)] UNSPECV_CMPXCHG))
+   (clobber (match_scratch:DWIH 8 "=X,&5"))]
+  "TARGET_CMPXCHG<doublemodesuffix>B"
+{
+  bool swap = REGNO (operands[5]) != BX_REG;
+  const char *xchg = "xchg{<imodesuffix>}\t%%<regprefix>bx, %5";
+
+  if (swap)
+    {
+      output_asm_insn (xchg, operands);
+      if (ix86_emit_cfi ())
+	{
+	  output_asm_insn (".cfi_remember_state", operands);
+	  output_asm_insn (".cfi_register\t%%<regprefix>bx, %5", operands);
+	}
+    }
+  output_asm_insn ("lock{%;} %K7cmpxchg<doublemodesuffix>b\t%2", operands);
+  if (swap)
+    {
+      output_asm_insn (xchg, operands);
+      if (ix86_emit_cfi ())
+	output_asm_insn (".cfi_restore_state", operands);
+    }
+
+  return "";
+})
+
+;; For operand 2 nonmemory_operand predicate is used instead of
+;; register_operand to allow combiner to better optimize atomic
+;; additions of constants.
+(define_insn "atomic_fetch_add<mode>"
+  [(set (match_operand:SWI 0 "register_operand" "=<r>")
+	(unspec_volatile:SWI
+	  [(match_operand:SWI 1 "memory_operand" "+m")
+	   (match_operand:SI 3 "const_int_operand")]		;; model
+	  UNSPECV_XCHG))
+   (set (match_dup 1)
+	(plus:SWI (match_dup 1)
+		  (match_operand:SWI 2 "nonmemory_operand" "0")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_XADD"
+  "lock{%;} %K3xadd{<imodesuffix>}\t{%0, %1|%1, %0}")
+
+;; This peephole2 and following insn optimize
+;; __sync_fetch_and_add (x, -N) == N into just lock {add,sub,inc,dec}
+;; followed by testing of flags instead of lock xadd and comparisons.
+(define_peephole2
+  [(set (match_operand:SWI 0 "register_operand")
+	(match_operand:SWI 2 "const_int_operand"))
+   (parallel [(set (match_dup 0)
+		   (unspec_volatile:SWI
+		     [(match_operand:SWI 1 "memory_operand")
+		      (match_operand:SI 4 "const_int_operand")]
+		     UNSPECV_XCHG))
+	      (set (match_dup 1)
+		   (plus:SWI (match_dup 1)
+			     (match_dup 0)))
+	      (clobber (reg:CC FLAGS_REG))])
+   (set (reg:CCZ FLAGS_REG)
+	(compare:CCZ (match_dup 0)
+		     (match_operand:SWI 3 "const_int_operand")))]
+  "peep2_reg_dead_p (3, operands[0])
+   && (unsigned HOST_WIDE_INT) INTVAL (operands[2])
+      == -(unsigned HOST_WIDE_INT) INTVAL (operands[3])
+   && !reg_overlap_mentioned_p (operands[0], operands[1])"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+		   (compare:CCZ
+		     (unspec_volatile:SWI [(match_dup 1) (match_dup 4)]
+					  UNSPECV_XCHG)
+		     (match_dup 3)))
+	      (set (match_dup 1)
+		   (plus:SWI (match_dup 1)
+			     (match_dup 2)))])])
+
+(define_insn "*atomic_fetch_add_cmp<mode>"
+  [(set (reg:CCZ FLAGS_REG)
+	(compare:CCZ
+	  (unspec_volatile:SWI
+	    [(match_operand:SWI 0 "memory_operand" "+m")
+	     (match_operand:SI 3 "const_int_operand")]		;; model
+	    UNSPECV_XCHG)
+	  (match_operand:SWI 2 "const_int_operand" "i")))
+   (set (match_dup 0)
+	(plus:SWI (match_dup 0)
+		  (match_operand:SWI 1 "const_int_operand" "i")))]
+  "(unsigned HOST_WIDE_INT) INTVAL (operands[1])
+   == -(unsigned HOST_WIDE_INT) INTVAL (operands[2])"
+{
+  if (incdec_operand (operands[1], <MODE>mode))
+    {
+      if (operands[1] == const1_rtx)
+	return "lock{%;} %K3inc{<imodesuffix>}\t%0";
+      else
+	{
+	  gcc_assert (operands[1] == constm1_rtx);
+	  return "lock{%;} %K3dec{<imodesuffix>}\t%0";
+	}
+    }
+
+  if (x86_maybe_negate_const_int (&operands[1], <MODE>mode))
+    return "lock{%;} %K3sub{<imodesuffix>}\t{%1, %0|%0, %1}";
+
+  return "lock{%;} %K3add{<imodesuffix>}\t{%1, %0|%0, %1}";
+})
+
+;; Recall that xchg implicitly sets LOCK#, so adding it again wastes space.
+;; In addition, it is always a full barrier, so we can ignore the memory model.
+(define_insn "atomic_exchange<mode>"
+  [(set (match_operand:SWI 0 "register_operand" "=<r>")		;; output
+	(unspec_volatile:SWI
+	  [(match_operand:SWI 1 "memory_operand" "+m")		;; memory
+	   (match_operand:SI 3 "const_int_operand")]		;; model
+	  UNSPECV_XCHG))
+   (set (match_dup 1)
+	(match_operand:SWI 2 "register_operand" "0"))]		;; input
+  ""
+  "%K3xchg{<imodesuffix>}\t{%1, %0|%0, %1}")
+
+(define_insn "atomic_add<mode>"
+  [(set (match_operand:SWI 0 "memory_operand" "+m")
+	(unspec_volatile:SWI
+	  [(plus:SWI (match_dup 0)
+		     (match_operand:SWI 1 "nonmemory_operand" "<r><i>"))
+	   (match_operand:SI 2 "const_int_operand")]		;; model
+	  UNSPECV_LOCK))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+{
+  if (incdec_operand (operands[1], <MODE>mode))
+    {
+      if (operands[1] == const1_rtx)
+	return "lock{%;} %K2inc{<imodesuffix>}\t%0";
+      else
+	{
+	  gcc_assert (operands[1] == constm1_rtx);
+	  return "lock{%;} %K2dec{<imodesuffix>}\t%0";
+	}
+    }
+
+  if (x86_maybe_negate_const_int (&operands[1], <MODE>mode))
+    return "lock{%;} %K2sub{<imodesuffix>}\t{%1, %0|%0, %1}";
+
+  return "lock{%;} %K2add{<imodesuffix>}\t{%1, %0|%0, %1}";
+})
+
+(define_insn "atomic_sub<mode>"
+  [(set (match_operand:SWI 0 "memory_operand" "+m")
+	(unspec_volatile:SWI
+	  [(minus:SWI (match_dup 0)
+		      (match_operand:SWI 1 "nonmemory_operand" "<r><i>"))
+	   (match_operand:SI 2 "const_int_operand")]		;; model
+	  UNSPECV_LOCK))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+{
+  if (incdec_operand (operands[1], <MODE>mode))
+    {
+      if (operands[1] == const1_rtx)
+	return "lock{%;} %K2dec{<imodesuffix>}\t%0";
+      else
+	{
+	  gcc_assert (operands[1] == constm1_rtx);
+	  return "lock{%;} %K2inc{<imodesuffix>}\t%0";
+	}
+    }
+
+  if (x86_maybe_negate_const_int (&operands[1], <MODE>mode))
+    return "lock{%;} %K2add{<imodesuffix>}\t{%1, %0|%0, %1}";
+
+  return "lock{%;} %K2sub{<imodesuffix>}\t{%1, %0|%0, %1}";
+})
+
+(define_insn "atomic_<logic><mode>"
+  [(set (match_operand:SWI 0 "memory_operand" "+m")
+	(unspec_volatile:SWI
+	  [(any_logic:SWI (match_dup 0)
+			  (match_operand:SWI 1 "nonmemory_operand" "<r><i>"))
+	   (match_operand:SI 2 "const_int_operand")]		;; model
+	  UNSPECV_LOCK))
+   (clobber (reg:CC FLAGS_REG))]
+  ""
+  "lock{%;} %K2<logic>{<imodesuffix>}\t{%1, %0|%0, %1}")
diff --git a/gcc-4.9/gcc/config/i386/sysv4.h b/gcc-4.9/gcc/config/i386/sysv4.h
new file mode 100644
index 000000000..011b228ca
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/sysv4.h
@@ -0,0 +1,72 @@
+/* Target definitions for GCC for Intel 80386 running System V.4
+   Copyright (C) 1991-2014 Free Software Foundation, Inc.
+
+   Written by Ron Guilmette (rfg@netcom.com).
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Output at beginning of assembler file.  */
+/* The .file command should always begin the output.  */
+
+#define TARGET_ASM_FILE_START_FILE_DIRECTIVE true
+#undef X86_FILE_START_VERSION_DIRECTIVE
+#define X86_FILE_START_VERSION_DIRECTIVE true
+
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n)  svr4_dbx_register_map[n]
+
+/* A C statement (sans semicolon) to output to the stdio stream
+   FILE the assembler definition of uninitialized global DECL named
+   NAME whose size is SIZE bytes and alignment is ALIGN bytes.
+   Try to use asm_output_aligned_bss to implement this macro.  */
+
+#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \
+  asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN)
+
+/* Handle special EH pointer encodings.  Absolute, pc-relative, and
+   indirect are handled automatically.  */
+#define ASM_MAYBE_OUTPUT_ENCODED_ADDR_RTX(FILE, ENCODING, SIZE, ADDR, DONE) \
+  do {									\
+    if ((SIZE) == 4 && ((ENCODING) & 0x70) == DW_EH_PE_datarel)		\
+      {									\
+	fputs (ASM_LONG, (FILE));					\
+	assemble_name (FILE, XSTR (ADDR, 0));				\
+	fputs (((ENCODING) & DW_EH_PE_indirect ? "@GOT" : "@GOTOFF"), (FILE)); \
+	goto DONE;							\
+      }									\
+  } while (0)
+
+/* Used by crtstuff.c to initialize the base of data-relative relocations.
+   These are GOT relative on x86, so return the pic register.  */
+#ifdef __PIC__
+#define CRT_GET_RFIB_DATA(BASE)			\
+  {						\
+    register void *ebx_ __asm__("ebx");		\
+    BASE = ebx_;				\
+  }
+#else
+#define CRT_GET_RFIB_DATA(BASE)						\
+  __asm__ ("call\t.LPR%=\n"						\
+	   ".LPR%=:\n\t"						\
+	   "pop{l}\t%0\n\t"						\
+	   /* Due to a GAS bug, this cannot use EAX.  That encodes	\
+	      smaller than the traditional EBX, which results in the	\
+	      offset being off by one.  */				\
+	   "add{l}\t{$_GLOBAL_OFFSET_TABLE_+[.-.LPR%=],%0"		\
+		   "|%0,_GLOBAL_OFFSET_TABLE_+(.-.LPR%=)}"		\
+	   : "=d"(BASE))
+#endif
diff --git a/gcc-4.9/gcc/config/i386/t-cygming b/gcc-4.9/gcc/config/i386/t-cygming
new file mode 100644
index 000000000..9544e4914
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-cygming
@@ -0,0 +1,48 @@
+# Copyright (C) 2003-2014 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+# cygwin and mingw always have a limits.h, but, depending upon how we are
+# doing the build, it may not be installed yet.
+LIMITS_H_TEST = true
+
+winnt.o: $(srcdir)/config/i386/winnt.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
+  $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \
+  $(TM_P_H) $(HASH_TABLE_H) $(GGC_H) $(LTO_STREAMER_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+	$(srcdir)/config/i386/winnt.c
+
+winnt-cxx.o: $(srcdir)/config/i386/winnt-cxx.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
+  $(TM_H) $(TREE_H) flags.h \
+  $(TM_P_H) $(HASHTAB_H) $(GGC_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+	$(srcdir)/config/i386/winnt-cxx.c
+
+
+winnt-stubs.o: $(srcdir)/config/i386/winnt-stubs.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
+  $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \
+  $(TM_P_H) $(HASHTAB_H) $(GGC_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+	$(srcdir)/config/i386/winnt-stubs.c
+
+msformat-c.o: $(srcdir)/config/i386/msformat-c.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
+  $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \
+  $(TM_P_H) $(HASHTAB_H) $(GGC_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+	$(srcdir)/config/i386/msformat-c.c
+
+STMP_FIXINC=stmp-fixinc
diff --git a/gcc-4.9/gcc/config/i386/t-cygwin-w64 b/gcc-4.9/gcc/config/i386/t-cygwin-w64
new file mode 100644
index 000000000..01968fd88
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-cygwin-w64
@@ -0,0 +1,3 @@
+MULTILIB_OPTIONS = m64/m32
+MULTILIB_DIRNAMES = 64
+MULTILIB_OSDIRNAMES = ../lib ../lib32
diff --git a/gcc-4.9/gcc/config/i386/t-darwin b/gcc-4.9/gcc/config/i386/t-darwin
new file mode 100644
index 000000000..bf44504d4
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-darwin
@@ -0,0 +1,2 @@
+MULTILIB_OPTIONS = m64
+MULTILIB_DIRNAMES = x86_64
diff --git a/gcc-4.9/gcc/config/i386/t-darwin64 b/gcc-4.9/gcc/config/i386/t-darwin64
new file mode 100644
index 000000000..6a6b22f1e
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-darwin64
@@ -0,0 +1,2 @@
+MULTILIB_OPTIONS = m32
+MULTILIB_DIRNAMES = i386
diff --git a/gcc-4.9/gcc/config/i386/t-gmm_malloc b/gcc-4.9/gcc/config/i386/t-gmm_malloc
new file mode 100644
index 000000000..c37f8a759
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-gmm_malloc
@@ -0,0 +1,6 @@
+# Install gmm_malloc.h as mm_malloc.h.
+
+EXTRA_HEADERS += mm_malloc.h
+mm_malloc.h: $(srcdir)/config/i386/gmm_malloc.h
+	rm -f $@
+	cat $^ > $@
diff --git a/gcc-4.9/gcc/config/i386/t-gnu b/gcc-4.9/gcc/config/i386/t-gnu
new file mode 100644
index 000000000..5f946c716
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-gnu
@@ -0,0 +1 @@
+MULTIARCH_DIRNAME = $(call if_multiarch,i386-gnu)
diff --git a/gcc-4.9/gcc/config/i386/t-i386 b/gcc-4.9/gcc/config/i386/t-i386
new file mode 100644
index 000000000..345e84421
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-i386
@@ -0,0 +1,31 @@
+# Copyright (C) 2008-2014 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+OPTIONS_H_EXTRA += $(srcdir)/config/i386/stringop.def
+TM_H += $(srcdir)/config/i386/x86-tune.def
+
+i386-c.o: $(srcdir)/config/i386/i386-c.c i386-builtin-types.inc
+	  $(COMPILE) $<
+	  $(POSTCOMPILE)
+
+i386-builtin-types.inc: s-i386-bt ; @true
+s-i386-bt: $(srcdir)/config/i386/i386-builtin-types.awk \
+  $(srcdir)/config/i386/i386-builtin-types.def
+	$(AWK) -f $^ > tmp-bt.inc
+	$(SHELL) $(srcdir)/../move-if-change tmp-bt.inc i386-builtin-types.inc
+	$(STAMP) $@
diff --git a/gcc-4.9/gcc/config/i386/t-interix b/gcc-4.9/gcc/config/i386/t-interix
new file mode 100644
index 000000000..24f5243f5
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-interix
@@ -0,0 +1,30 @@
+# Copyright (C) 2011-2014 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+winnt.o: $(srcdir)/config/i386/winnt.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
+  $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \
+  $(TM_P_H) $(HASH_TABLE_H) $(GGC_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/i386/winnt.c
+
+winnt-stubs.o: $(srcdir)/config/i386/winnt-stubs.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
+  $(TM_H) $(RTL_H) $(REGS_H) hard-reg-set.h output.h $(TREE_H) flags.h \
+  $(TM_P_H) toplev.h $(HASHTAB_H) $(GGC_H)
+	$(COMPILER) -c $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+	$(srcdir)/config/i386/winnt-stubs.c
+
diff --git a/gcc-4.9/gcc/config/i386/t-kfreebsd b/gcc-4.9/gcc/config/i386/t-kfreebsd
new file mode 100644
index 000000000..762d520fa
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-kfreebsd
@@ -0,0 +1,5 @@
+MULTIARCH_DIRNAME = $(call if_multiarch,i386-kfreebsd-gnu)
+
+# MULTILIB_OSDIRNAMES are set in t-linux64.
+KFREEBSD_OS = $(filter kfreebsd%, $(word 3, $(subst -, ,$(target))))
+MULTILIB_OSDIRNAMES := $(filter-out mx32=%,$(subst linux,$(KFREEBSD_OS),$(MULTILIB_OSDIRNAMES)))
diff --git a/gcc-4.9/gcc/config/i386/t-linux b/gcc-4.9/gcc/config/i386/t-linux
new file mode 100644
index 000000000..155314c08
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-linux
@@ -0,0 +1 @@
+MULTIARCH_DIRNAME = $(call if_multiarch,i386-linux-gnu)
diff --git a/gcc-4.9/gcc/config/i386/t-linux64 b/gcc-4.9/gcc/config/i386/t-linux64
new file mode 100644
index 000000000..5ec8907a9
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-linux64
@@ -0,0 +1,38 @@
+# Copyright (C) 2002-2014 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+# On Debian, Ubuntu and other derivative distributions, the 32bit libraries
+# are found in /lib32 and /usr/lib32, /lib64 and /usr/lib64 are symlinks to
+# /lib and /usr/lib, while other distributions install libraries into /lib64
+# and /usr/lib64.  The LSB does not enforce the use of /lib64 and /usr/lib64,
+# it doesn't tell anything about the 32bit libraries on those systems.  Set
+# MULTILIB_OSDIRNAMES according to what is found on the target.
+
+# To support i386, x86-64 and x32 libraries, the directory structrue
+# should be:
+#
+# 	/lib has i386 libraries.
+# 	/lib64 has x86-64 libraries.
+# 	/libx32 has x32 libraries.
+#
+comma=,
+MULTILIB_OPTIONS    = $(subst $(comma),/,$(TM_MULTILIB_CONFIG))
+MULTILIB_DIRNAMES   = $(patsubst m%, %, $(subst /, ,$(MULTILIB_OPTIONS)))
+MULTILIB_OSDIRNAMES = m64=../lib64$(call if_multiarch,:x86_64-linux-gnu)
+MULTILIB_OSDIRNAMES+= m32=$(if $(wildcard $(shell echo $(SYSTEM_HEADER_DIR))/../../usr/lib32),../lib32,../lib)$(call if_multiarch,:i386-linux-gnu)
+MULTILIB_OSDIRNAMES+= mx32=../libx32$(call if_multiarch,:x86_64-linux-gnux32)
diff --git a/gcc-4.9/gcc/config/i386/t-mingw-w32 b/gcc-4.9/gcc/config/i386/t-mingw-w32
new file mode 100644
index 000000000..4fc8582cf
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-mingw-w32
@@ -0,0 +1,3 @@
+MULTILIB_OPTIONS = m64/m32
+MULTILIB_DIRNAMES = 64 32
+MULTILIB_OSDIRNAMES = ../lib64 ../lib
diff --git a/gcc-4.9/gcc/config/i386/t-mingw-w64 b/gcc-4.9/gcc/config/i386/t-mingw-w64
new file mode 100644
index 000000000..c809ebd7d
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-mingw-w64
@@ -0,0 +1,3 @@
+MULTILIB_OPTIONS = m64/m32
+MULTILIB_DIRNAMES = 64 32
+MULTILIB_OSDIRNAMES = ../lib ../lib32
diff --git a/gcc-4.9/gcc/config/i386/t-openbsd b/gcc-4.9/gcc/config/i386/t-openbsd
new file mode 100644
index 000000000..4f8ff657a
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-openbsd
@@ -0,0 +1,4 @@
+# gdb gets confused if pic code is linked with non pic
+# We cope by building variants of libgcc.
+MULTILIB_OPTIONS = fpic
+MULTILIB_MATCHES=fpic=fPIC
diff --git a/gcc-4.9/gcc/config/i386/t-pmm_malloc b/gcc-4.9/gcc/config/i386/t-pmm_malloc
new file mode 100644
index 000000000..109009fbf
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-pmm_malloc
@@ -0,0 +1,6 @@
+# Install pmm_malloc.h as mm_malloc.h.
+
+EXTRA_HEADERS += mm_malloc.h
+mm_malloc.h: $(srcdir)/config/i386/pmm_malloc.h
+	rm -f $@
+	cat $^ > $@
diff --git a/gcc-4.9/gcc/config/i386/t-rtems b/gcc-4.9/gcc/config/i386/t-rtems
new file mode 100644
index 000000000..e3934179e
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-rtems
@@ -0,0 +1,26 @@
+# Copyright (C) 1999-2014 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+#
+
+MULTILIB_OPTIONS = mtune=i486/mtune=pentium/mtune=pentiumpro msoft-float
+MULTILIB_DIRNAMES= m486 mpentium mpentiumpro soft-float
+MULTILIB_MATCHES = msoft-float=mno-80387
+MULTILIB_MATCHES += mtune?pentium=mtune?k6 mtune?pentiumpro=mtune?athlon
+MULTILIB_EXCEPTIONS = \
+mtune=pentium/*msoft-float* \
+mtune=pentiumpro/*msoft-float*
diff --git a/gcc-4.9/gcc/config/i386/t-sol2-64 b/gcc-4.9/gcc/config/i386/t-sol2-64
new file mode 100644
index 000000000..4e70f0bed
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-sol2-64
@@ -0,0 +1,21 @@
+# Copyright (C) 2004-2014 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+MULTILIB_OPTIONS = m32/m64
+MULTILIB_DIRNAMES = 32 amd64
+MULTILIB_OSDIRNAMES = . amd64
diff --git a/gcc-4.9/gcc/config/i386/t-vxworks b/gcc-4.9/gcc/config/i386/t-vxworks
new file mode 100644
index 000000000..c440b1f90
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-vxworks
@@ -0,0 +1,8 @@
+# Multilibs for VxWorks.
+
+# Build multilibs for normal, -mrtp, and -mrtp -fPIC.
+MULTILIB_OPTIONS = mrtp fPIC
+MULTILIB_DIRNAMES =
+MULTILIB_MATCHES = fPIC=fpic
+MULTILIB_EXCEPTIONS = fPIC
+
diff --git a/gcc-4.9/gcc/config/i386/t-vxworksae b/gcc-4.9/gcc/config/i386/t-vxworksae
new file mode 100644
index 000000000..0cea2bbf3
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/t-vxworksae
@@ -0,0 +1,5 @@
+# Multilibs for VxWorks AE.
+
+MULTILIB_OPTIONS = mvthreads
+MULTILIB_MATCHES =
+MULTILIB_EXCEPTIONS = 
diff --git a/gcc-4.9/gcc/config/i386/tbmintrin.h b/gcc-4.9/gcc/config/i386/tbmintrin.h
new file mode 100644
index 000000000..871f53280
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/tbmintrin.h
@@ -0,0 +1,180 @@
+/* Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _TBMINTRIN_H_INCLUDED
+#define _TBMINTRIN_H_INCLUDED
+
+#ifndef __TBM__
+#pragma GCC push_options
+#pragma GCC target("tbm")
+#define __DISABLE_TBM__
+#endif /* __TBM__ */
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextri_u32 (unsigned int __X, const unsigned int __I)
+{
+	return __builtin_ia32_bextri_u32 (__X, __I);
+}
+#else
+#define __bextri_u32(X, I)                                           \
+        ((unsigned int)__builtin_ia32_bextri_u32 ((unsigned int)(X), \
+	                                          (unsigned int)(I)))
+#endif /*__OPTIMIZE__ */
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcfill_u32 (unsigned int __X)
+{
+  return __X & (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blci_u32 (unsigned int __X)
+{
+  return __X | ~(__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcic_u32 (unsigned int __X)
+{
+  return ~__X & (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcmsk_u32 (unsigned int __X)
+{
+  return __X ^ (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcs_u32 (unsigned int __X)
+{
+  return __X | (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsfill_u32 (unsigned int __X)
+{
+  return __X | (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsic_u32 (unsigned int __X)
+{
+  return ~__X | (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__t1mskc_u32 (unsigned int __X)
+{
+  return ~__X | (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzmsk_u32 (unsigned int __X)
+{
+  return ~__X & (__X - 1);
+}
+
+
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextri_u64 (unsigned long long __X, const unsigned int __I)
+{
+  return __builtin_ia32_bextri_u64 (__X, __I);
+}
+#else
+#define __bextri_u64(X, I)						   \
+  ((unsigned long long)__builtin_ia32_bextri_u64 ((unsigned long long)(X), \
+						  (unsigned long long)(I)))
+#endif /*__OPTIMIZE__ */
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcfill_u64 (unsigned long long __X)
+{
+  return __X & (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blci_u64 (unsigned long long __X)
+{
+  return __X | ~(__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcic_u64 (unsigned long long __X)
+{
+  return ~__X & (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcmsk_u64 (unsigned long long __X)
+{
+  return __X ^ (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcs_u64 (unsigned long long __X)
+{
+  return __X | (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsfill_u64 (unsigned long long __X)
+{
+  return __X | (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsic_u64 (unsigned long long __X)
+{
+  return ~__X | (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__t1mskc_u64 (unsigned long long __X)
+{
+  return ~__X | (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzmsk_u64 (unsigned long long __X)
+{
+  return ~__X & (__X - 1);
+}
+
+
+#endif /* __x86_64__  */
+
+#ifdef __DISABLE_TBM__
+#undef __DISABLE_TBM__
+#pragma GCC pop_options
+#endif /* __DISABLE_TBM__ */
+
+#endif /* _TBMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/tmmintrin.h b/gcc-4.9/gcc/config/i386/tmmintrin.h
new file mode 100644
index 000000000..89556d24b
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/tmmintrin.h
@@ -0,0 +1,249 @@
+/* Copyright (C) 2006-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.1.  */
+
+#ifndef _TMMINTRIN_H_INCLUDED
+#define _TMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE3, SSE2 and SSE header files*/
+#include <pmmintrin.h>
+
+#ifndef __SSSE3__
+#pragma GCC push_options
+#pragma GCC target("ssse3")
+#define __DISABLE_SSSE3__
+#endif /* __SSSE3__ */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phaddw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phaddd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phaddsw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phaddw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi32 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phaddd ((__v2si)__X, (__v2si)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phaddsw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phsubw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phsubd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phsubsw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phsubw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi32 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phsubd ((__v2si)__X, (__v2si)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phsubsw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaddubsw128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_pmaddubsw ((__v8qi)__X, (__v8qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmulhrsw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_pmulhrsw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi8 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_pshufb ((__v8qi)__X, (__v8qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psignb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psignw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psignd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi8 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_psignb ((__v8qi)__X, (__v8qi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_psignw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi32 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_psignd ((__v2si)__X, (__v2si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
+{
+  return (__m128i) __builtin_ia32_palignr128 ((__v2di)__X,
+					      (__v2di)__Y, __N * 8);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N)
+{
+  return (__m64) __builtin_ia32_palignr ((__v1di)__X,
+					 (__v1di)__Y, __N * 8);
+}
+#else
+#define _mm_alignr_epi8(X, Y, N)					\
+  ((__m128i) __builtin_ia32_palignr128 ((__v2di)(__m128i)(X),		\
+					(__v2di)(__m128i)(Y),		\
+					(int)(N) * 8))
+#define _mm_alignr_pi8(X, Y, N)						\
+  ((__m64) __builtin_ia32_palignr ((__v1di)(__m64)(X),			\
+				   (__v1di)(__m64)(Y),			\
+				   (int)(N) * 8))
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi8 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pabsb128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pabsw128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pabsd128 ((__v4si)__X);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi8 (__m64 __X)
+{
+  return (__m64) __builtin_ia32_pabsb ((__v8qi)__X);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi16 (__m64 __X)
+{
+  return (__m64) __builtin_ia32_pabsw ((__v4hi)__X);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi32 (__m64 __X)
+{
+  return (__m64) __builtin_ia32_pabsd ((__v2si)__X);
+}
+
+#ifdef __DISABLE_SSSE3__
+#undef __DISABLE_SSSE3__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSSE3__ */
+
+#endif /* _TMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/unix.h b/gcc-4.9/gcc/config/i386/unix.h
new file mode 100644
index 000000000..d4fdf9b4b
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/unix.h
@@ -0,0 +1,80 @@
+/* Definitions for Unix assembler syntax for the Intel 80386.
+   Copyright (C) 1988-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* This file defines the aspects of assembler syntax
+   that are the same for all the i386 Unix systems
+   (though they may differ in non-Unix systems).  */
+
+/* Define macro used to output shift-double opcodes when the shift
+   count is in %cl.  Some assemblers require %cl as an argument;
+   some don't.  This macro controls what to do: by default, don't
+   print %cl.  */
+#define SHIFT_DOUBLE_OMITS_COUNT 1
+
+/* Define the syntax of pseudo-ops, labels and comments.  */
+
+/* String containing the assembler's comment-starter.
+   Note the trailing space is necessary in case the character
+   that immediately follows the comment is '*'.  If this happens
+   and the space is not there the assembler will interpret this
+   as the start of a C-like slash-star comment and complain when
+   there is no terminator.  */
+
+#define ASM_COMMENT_START "/ "
+
+/* Output to assembler file text saying following lines
+   may contain character constants, extra white space, comments, etc.  */
+
+#define ASM_APP_ON "/APP\n"
+
+/* Output to assembler file text saying following lines
+   no longer contain unusual constructs.  */
+
+#define ASM_APP_OFF "/NO_APP\n"
+
+/* Output before read-only data.  */
+
+#define TEXT_SECTION_ASM_OP "\t.text"
+
+/* Output before writable (initialized) data.  */
+
+#define DATA_SECTION_ASM_OP "\t.data"
+
+/* Output before writable (uninitialized) data.  */
+
+#define BSS_SECTION_ASM_OP "\t.bss"
+
+/* Globalizing directive for a label.  */
+#define GLOBAL_ASM_OP "\t.globl\t"
+
+/* By default, target has a 80387, uses IEEE compatible arithmetic,
+   and returns float values in the 387.  */
+#undef TARGET_SUBTARGET_DEFAULT
+#define TARGET_SUBTARGET_DEFAULT \
+	(MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS)
+
+/* By default, 64-bit mode uses 128-bit long double.  */
+#undef TARGET_SUBTARGET64_DEFAULT
+#define TARGET_SUBTARGET64_DEFAULT \
+	MASK_128BIT_LONG_DOUBLE
diff --git a/gcc-4.9/gcc/config/i386/vx-common.h b/gcc-4.9/gcc/config/i386/vx-common.h
new file mode 100644
index 000000000..136c2d9af
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/vx-common.h
@@ -0,0 +1,33 @@
+/* IA32 VxWorks and VxWorks AE target definitions.
+   Copyright (C) 2007-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \
+  asm_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN)
+
+/* VxWorks uses the same ABI as Solaris 2, so use i386/sol2.h version.  */
+
+#undef TARGET_SUBTARGET_DEFAULT
+#define TARGET_SUBTARGET_DEFAULT \
+	(MASK_80387 | MASK_IEEE_FP | MASK_FLOAT_RETURNS | MASK_VECT8_RETURNS)
+
+/* Provide our target specific DBX_REGISTER_NUMBER.  VxWorks relies on
+   the SVR4 numbering.  */
+
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n)  svr4_dbx_register_map[n]
diff --git a/gcc-4.9/gcc/config/i386/vxworks.h b/gcc-4.9/gcc/config/i386/vxworks.h
new file mode 100644
index 000000000..49206e015
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/vxworks.h
@@ -0,0 +1,73 @@
+/* IA32 VxWorks target definitions for GNU compiler.
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
+   Updated by CodeSourcery, LLC.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#undef  ASM_SPEC
+#define ASM_SPEC ""
+
+#define TARGET_OS_CPP_BUILTINS()			\
+  do							\
+    {							\
+      VXWORKS_OS_CPP_BUILTINS ();			\
+      if (TARGET_386)					\
+        builtin_define ("CPU=I80386");			\
+      else if (TARGET_486)				\
+        builtin_define ("CPU=I80486");			\
+      else if (TARGET_PENTIUM)				\
+        {						\
+          builtin_define ("CPU=PENTIUM");		\
+          builtin_define ("CPU_VARIANT=PENTIUM");	\
+        }						\
+      else if (TARGET_PENTIUMPRO)			\
+        {						\
+          builtin_define ("CPU=PENTIUM2");		\
+          builtin_define ("CPU_VARIANT=PENTIUMPRO");	\
+        }						\
+      else if (TARGET_PENTIUM4)				\
+        {						\
+          builtin_define ("CPU=PENTIUM4");		\
+          builtin_define ("CPU_VARIANT=PENTIUM4");	\
+        }						\
+    }							\
+  while (0)
+
+#undef  CPP_SPEC
+#define CPP_SPEC VXWORKS_ADDITIONAL_CPP_SPEC
+#undef  LIB_SPEC
+#define LIB_SPEC VXWORKS_LIB_SPEC
+#undef  STARTFILE_SPEC
+#define STARTFILE_SPEC VXWORKS_STARTFILE_SPEC
+#undef  ENDFILE_SPEC
+#define ENDFILE_SPEC VXWORKS_ENDFILE_SPEC
+#undef  LINK_SPEC
+#define LINK_SPEC VXWORKS_LINK_SPEC
+
+#undef  SUBTARGET_SWITCHES
+#define SUBTARGET_SWITCHES EXTRA_SUBTARGET_SWITCHES
+
+#undef SUBTARGET_OVERRIDE_OPTIONS
+#define SUBTARGET_OVERRIDE_OPTIONS VXWORKS_OVERRIDE_OPTIONS
+
+/* No _mcount profiling on VxWorks.  */
+#undef FUNCTION_PROFILER
+#define FUNCTION_PROFILER(FILE,LABELNO) VXWORKS_FUNCTION_PROFILER(FILE,LABELNO)
+
+/* We cannot use PC-relative accesses for VxWorks PIC because there is no
+   fixed gap between segments.  */
+#undef ASM_PREFERRED_EH_DATA_FORMAT
diff --git a/gcc-4.9/gcc/config/i386/vxworksae.h b/gcc-4.9/gcc/config/i386/vxworksae.h
new file mode 100644
index 000000000..bb63c079c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/vxworksae.h
@@ -0,0 +1,35 @@
+/* IA32 VxWorks AE target definitions for GNU compiler.
+   Copyright (C) 2005-2014 Free Software Foundation, Inc.
+   Contributed by CodeSourcery, LLC.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* On VxWorks AE, we only want SIMNT.  */
+#undef VXWORKS_CPU_DEFINE
+#define VXWORKS_CPU_DEFINE()			\
+  do						\
+    builtin_define ("CPU=SIMNT");		\
+  while (0)
+
+#undef  ASM_SPEC
+#define ASM_SPEC ""
+
+#undef  SIZE_TYPE
+#define SIZE_TYPE "unsigned int"
+
+#undef  PTRDIFF_TYPE
+#define PTRDIFF_TYPE "int"
diff --git a/gcc-4.9/gcc/config/i386/winnt-cxx.c b/gcc-4.9/gcc/config/i386/winnt-cxx.c
new file mode 100644
index 000000000..aa75f9157
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/winnt-cxx.c
@@ -0,0 +1,184 @@
+/* Target support for C++ classes on Windows.
+   Contributed by Danny Smith (dannysmith@users.sourceforge.net)
+   Copyright (C) 2005-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "stringpool.h"
+#include "attribs.h"
+#include "cp/cp-tree.h" /* This is why we're a separate module.  */
+#include "flags.h"
+#include "tm_p.h"
+#include "diagnostic-core.h"
+#include "hashtab.h"
+
+bool
+i386_pe_type_dllimport_p (tree decl)
+{
+  gcc_assert (TREE_CODE (decl) == VAR_DECL 
+	      || TREE_CODE (decl) == FUNCTION_DECL);
+
+  if (TARGET_NOP_FUN_DLLIMPORT && TREE_CODE (decl) == FUNCTION_DECL)
+    return false;
+
+  /* We ignore the dllimport attribute for inline member functions.
+     This differs from MSVC behavior which treats it like GNUC
+     'extern inline' extension.  Also ignore for template
+     instantiations with linkonce semantics and artificial methods.  */
+  if (TREE_CODE (decl) ==  FUNCTION_DECL
+      && (DECL_DECLARED_INLINE_P (decl)
+	  || DECL_TEMPLATE_INSTANTIATION (decl)
+	  || DECL_ARTIFICIAL (decl)))
+    return false;
+  
+  /* Overrides of the class dllimport decls by out-of-class definitions are 
+     handled by tree.c:merge_dllimport_decl_attributes.   */
+  return true;
+}
+
+bool
+i386_pe_type_dllexport_p (tree decl)
+{
+  gcc_assert (TREE_CODE (decl) == VAR_DECL 
+              || TREE_CODE (decl) == FUNCTION_DECL);
+
+  /* Avoid exporting compiler-generated default dtors and copy ctors.
+     The only artificial methods that need to be exported are virtual
+     and non-virtual thunks.  */
+  if (TREE_CODE (TREE_TYPE (decl)) == METHOD_TYPE
+      && DECL_ARTIFICIAL (decl) && !DECL_THUNK_P (decl))
+    return false;
+  if (TREE_CODE (decl) == FUNCTION_DECL
+      && DECL_DECLARED_INLINE_P (decl))
+    {
+      if (DECL_REALLY_EXTERN (decl)
+	  || !flag_keep_inline_dllexport)
+	return false;
+    }
+  return true;
+}
+
+static inline void maybe_add_dllimport (tree decl) 
+{
+  if (i386_pe_type_dllimport_p (decl))
+    DECL_DLLIMPORT_P (decl) = 1;
+}
+
+static inline void maybe_add_dllexport (tree decl) 
+{
+  if (i386_pe_type_dllexport_p (decl))
+    {   
+      tree decl_attrs = DECL_ATTRIBUTES (decl);
+      if (lookup_attribute ("dllexport", decl_attrs) != NULL_TREE)
+	/* Already done.  */
+	return;
+      DECL_ATTRIBUTES (decl) = tree_cons (get_identifier ("dllexport"),
+					  NULL_TREE, decl_attrs);
+    }
+}
+
+void
+i386_pe_adjust_class_at_definition (tree t)
+{
+  tree member;
+
+  gcc_assert (CLASS_TYPE_P (t));
+ 
+ 
+  if (lookup_attribute ("dllexport", TYPE_ATTRIBUTES (t)) != NULL_TREE)
+    {
+      tree tmv = TYPE_MAIN_VARIANT (t);
+
+      /* Make sure that we set dllexport attribute to typeinfo's
+	 base declaration, as otherwise it would fail to be exported as
+	 it isn't a class-member.  */
+      if (tmv != NULL_TREE
+	  && CLASSTYPE_TYPEINFO_VAR (tmv) != NULL_TREE)
+	{
+	  tree na, ti_decl = CLASSTYPE_TYPEINFO_VAR (tmv);
+	  na = tree_cons (get_identifier ("dllexport"), NULL_TREE,
+			  NULL_TREE);
+	  decl_attributes (&ti_decl, na, 0);
+	}
+
+      /* Check static VAR_DECL's.  */
+      for (member = TYPE_FIELDS (t); member; member = DECL_CHAIN (member))
+	if (TREE_CODE (member) == VAR_DECL)     
+	  maybe_add_dllexport (member);
+    
+      /* Check FUNCTION_DECL's.  */
+      for (member = TYPE_METHODS (t); member;  member = DECL_CHAIN (member))
+	if (TREE_CODE (member) == FUNCTION_DECL)
+	  {
+	    tree thunk;
+	    maybe_add_dllexport (member);
+	  
+	    /* Also add the attribute to its thunks.  */
+	    for (thunk = DECL_THUNKS (member); thunk;
+		 thunk = TREE_CHAIN (thunk))
+	      maybe_add_dllexport (thunk);
+	}
+      /* Check vtables  */
+      for (member = CLASSTYPE_VTABLES (t); member;  member = DECL_CHAIN (member))
+	if (TREE_CODE (member) == VAR_DECL) 
+	  maybe_add_dllexport (member);
+    }
+
+  else if (lookup_attribute ("dllimport", TYPE_ATTRIBUTES (t)) != NULL_TREE)
+    {
+      /* We don't actually add the attribute to the decl, just set the flag
+	 that signals that the address of this symbol is not a compile-time
+	 constant.   Any subsequent out-of-class declaration of members wil
+	 cause the DECL_DLLIMPORT_P flag to be unset.
+	 (See  tree.c: merge_dllimport_decl_attributes).
+	 That is just right since out-of class declarations can only be a
+	 definition.   */
+
+      /* Check static VAR_DECL's.  */
+      for (member = TYPE_FIELDS (t); member; member = DECL_CHAIN (member))
+	if (TREE_CODE (member) == VAR_DECL)     
+	  maybe_add_dllimport (member);
+    
+      /* Check FUNCTION_DECL's.  */
+      for (member = TYPE_METHODS (t); member;  member = DECL_CHAIN (member))
+	if (TREE_CODE (member) == FUNCTION_DECL)
+	  {
+	    tree thunk;
+	    maybe_add_dllimport (member);
+	  
+	    /* Also add the attribute to its thunks.  */
+	    for (thunk = DECL_THUNKS (member); thunk;
+		 thunk = DECL_CHAIN (thunk))
+	      maybe_add_dllimport (thunk);
+	 }
+ 
+      /* Check vtables  */
+      for (member = CLASSTYPE_VTABLES (t); member;  member = DECL_CHAIN (member))
+	if (TREE_CODE (member) == VAR_DECL) 
+	  maybe_add_dllimport (member);
+
+      /* We leave typeinfo tables alone.  We can't mark TI objects as
+	dllimport, since the address of a secondary VTT may be needed
+	for static initialization of a primary VTT.  VTT's  of
+	dllimport'd classes should always be link-once COMDAT.  */ 
+    }
+}
diff --git a/gcc-4.9/gcc/config/i386/winnt-stubs.c b/gcc-4.9/gcc/config/i386/winnt-stubs.c
new file mode 100644
index 000000000..30321d0f7
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/winnt-stubs.c
@@ -0,0 +1,51 @@
+/* Dummy subroutines for language-specific support on Windows.
+   Contributed by Danny Smith (dannysmith@users.sourceforge.net)
+   Copyright (C) 2005-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "rtl.h"
+#include "regs.h"
+#include "hard-reg-set.h"
+#include "output.h"
+#include "tree.h"
+#include "flags.h"
+#include "tm_p.h"
+#include "diagnostic-core.h"
+#include "hashtab.h"
+
+bool
+i386_pe_type_dllimport_p (tree decl ATTRIBUTE_UNUSED)
+{
+  return false;
+}
+
+
+bool
+i386_pe_type_dllexport_p (tree decl ATTRIBUTE_UNUSED)
+{
+  return false;
+}
+
+
+void
+i386_pe_adjust_class_at_definition (tree t ATTRIBUTE_UNUSED)
+{ }
diff --git a/gcc-4.9/gcc/config/i386/winnt.c b/gcc-4.9/gcc/config/i386/winnt.c
new file mode 100644
index 000000000..bcfd48a03
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/winnt.c
@@ -0,0 +1,1304 @@
+/* Subroutines for insn-output.c for Windows NT.
+   Contributed by Douglas Rupp (drupp@cs.washington.edu)
+   Copyright (C) 1995-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "rtl.h"
+#include "regs.h"
+#include "hard-reg-set.h"
+#include "output.h"
+#include "tree.h"
+#include "stringpool.h"
+#include "varasm.h"
+#include "flags.h"
+#include "tm_p.h"
+#include "diagnostic-core.h"
+#include "hash-table.h"
+#include "langhooks.h"
+#include "ggc.h"
+#include "target.h"
+#include "except.h"
+#include "pointer-set.h"
+#include "hash-table.h"
+#include "vec.h"
+#include "basic-block.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-fold.h"
+#include "tree-eh.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "lto-streamer.h"
+
+/* i386/PE specific attribute support.
+
+   i386/PE has two new attributes:
+   dllexport - for exporting a function/variable that will live in a dll
+   dllimport - for importing a function/variable from a dll
+
+   Microsoft allows multiple declspecs in one __declspec, separating
+   them with spaces.  We do NOT support this.  Instead, use __declspec
+   multiple times.
+*/
+
+/* Handle a "shared" attribute;
+   arguments as in struct attribute_spec.handler.  */
+tree
+ix86_handle_shared_attribute (tree *node, tree name,
+			      tree args ATTRIBUTE_UNUSED,
+			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
+{
+  if (TREE_CODE (*node) != VAR_DECL)
+    {
+      warning (OPT_Wattributes, "%qE attribute only applies to variables",
+	       name);
+      *no_add_attrs = true;
+    }
+
+  return NULL_TREE;
+}
+
+/* Handle a "selectany" attribute;
+   arguments as in struct attribute_spec.handler.  */
+tree
+ix86_handle_selectany_attribute (tree *node, tree name,
+			         tree args ATTRIBUTE_UNUSED,
+			         int flags ATTRIBUTE_UNUSED,
+				 bool *no_add_attrs)
+{
+  /* The attribute applies only to objects that are initialized and have
+     external linkage.  However, we may not know about initialization
+     until the language frontend has processed the decl. We'll check for
+     initialization later in encode_section_info.  */
+  if (TREE_CODE (*node) != VAR_DECL || !TREE_PUBLIC (*node))
+    {	
+      error ("%qE attribute applies only to initialized variables"
+       	     " with external linkage", name);
+      *no_add_attrs = true;
+    }
+
+  return NULL_TREE;
+}
+
+
+/* Return the type that we should use to determine if DECL is
+   imported or exported.  */
+
+static tree
+associated_type (tree decl)
+{
+  return (DECL_CONTEXT (decl) && TYPE_P (DECL_CONTEXT (decl))
+          ?  DECL_CONTEXT (decl) : NULL_TREE);
+}
+
+/* Return true if DECL should be a dllexport'd object.  */
+
+static bool
+i386_pe_determine_dllexport_p (tree decl)
+{
+  if (TREE_CODE (decl) != VAR_DECL && TREE_CODE (decl) != FUNCTION_DECL)
+    return false;
+
+  /* Don't export local clones of dllexports.  */
+  if (!TREE_PUBLIC (decl))
+    return false;
+
+  if (TREE_CODE (decl) == FUNCTION_DECL
+      && DECL_DECLARED_INLINE_P (decl)
+      && !flag_keep_inline_dllexport)
+    return false; 
+
+  if (lookup_attribute ("dllexport", DECL_ATTRIBUTES (decl)))
+    return true;
+
+  return false;
+}
+
+/* Return true if DECL should be a dllimport'd object.  */
+
+static bool
+i386_pe_determine_dllimport_p (tree decl)
+{
+  tree assoc;
+
+  if (TREE_CODE (decl) != VAR_DECL && TREE_CODE (decl) != FUNCTION_DECL)
+    return false;
+
+  if (DECL_DLLIMPORT_P (decl))
+    return true;
+
+  /* The DECL_DLLIMPORT_P flag was set for decls in the class definition
+     by  targetm.cxx.adjust_class_at_definition.  Check again to emit
+     error message if the class attribute has been overridden by an
+     out-of-class definition of static data.  */
+  assoc = associated_type (decl);
+  if (assoc && lookup_attribute ("dllimport", TYPE_ATTRIBUTES (assoc))
+      && TREE_CODE (decl) == VAR_DECL
+      && TREE_STATIC (decl) && TREE_PUBLIC (decl)
+      && !DECL_EXTERNAL (decl)
+      /* vtable's are linkonce constants, so defining a vtable is not
+	 an error as long as we don't try to import it too.  */
+      && !DECL_VIRTUAL_P (decl))
+	error ("definition of static data member %q+D of "
+	       "dllimport%'d class", decl);
+
+  return false;
+}
+
+/* Handle the -mno-fun-dllimport target switch.  */
+
+bool
+i386_pe_valid_dllimport_attribute_p (const_tree decl)
+{
+   if (TARGET_NOP_FUN_DLLIMPORT && TREE_CODE (decl) == FUNCTION_DECL)
+     return false;
+   return true;
+}
+
+/* Return string which is the function name, identified by ID, modified
+   with a suffix consisting of an atsign (@) followed by the number of
+   bytes of arguments.  If ID is NULL use the DECL_NAME as base. If
+   FASTCALL is true, also add the FASTCALL_PREFIX.
+   Return NULL if no change required.  */
+
+static tree
+gen_stdcall_or_fastcall_suffix (tree decl, tree id, bool fastcall)
+{
+  HOST_WIDE_INT total = 0;
+  const char *old_str = IDENTIFIER_POINTER (id != NULL_TREE ? id : DECL_NAME (decl));
+  char *new_str, *p;
+  tree type = TREE_TYPE (DECL_ORIGIN (decl));
+  tree arg;
+  function_args_iterator args_iter;
+
+  gcc_assert (TREE_CODE (decl) == FUNCTION_DECL);  
+
+  if (prototype_p (type))
+    {
+      /* This attribute is ignored for variadic functions.  */ 
+      if (stdarg_p (type))
+	return NULL_TREE;
+
+      /* Quit if we hit an incomplete type.  Error is reported
+	 by convert_arguments in c-typeck.c or cp/typeck.c.  */
+      FOREACH_FUNCTION_ARGS(type, arg, args_iter)
+	{
+	  HOST_WIDE_INT parm_size;
+	  HOST_WIDE_INT parm_boundary_bytes = PARM_BOUNDARY / BITS_PER_UNIT;
+
+	  if (! COMPLETE_TYPE_P (arg))
+	    break;
+
+	  parm_size = int_size_in_bytes (arg);
+	  if (parm_size < 0)
+	    break;
+
+	  /* Must round up to include padding.  This is done the same
+	     way as in store_one_arg.  */
+	  parm_size = ((parm_size + parm_boundary_bytes - 1)
+		       / parm_boundary_bytes * parm_boundary_bytes);
+	  total += parm_size;
+	}
+    }
+
+  /* Assume max of 8 base 10 digits in the suffix.  */
+  p = new_str = XALLOCAVEC (char, 1 + strlen (old_str) + 1 + 8 + 1);
+  if (fastcall)
+    *p++ = FASTCALL_PREFIX;
+  sprintf (p, "%s@" HOST_WIDE_INT_PRINT_DEC, old_str, total);
+
+  return get_identifier (new_str);
+}
+
+/* Maybe decorate and get a new identifier for the DECL of a stdcall or
+   fastcall function. The original identifier is supplied in ID. */
+
+static tree
+i386_pe_maybe_mangle_decl_assembler_name (tree decl, tree id)
+{
+  tree new_id = NULL_TREE;
+
+  if (TREE_CODE (decl) == FUNCTION_DECL)
+    { 
+      unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
+      if ((ccvt & IX86_CALLCVT_STDCALL) != 0)
+        {
+	  if (TARGET_RTD)
+	    /* If we are using -mrtd emit undecorated symbol and let linker
+	       do the proper resolving.  */
+	    return NULL_TREE;
+	  new_id = gen_stdcall_or_fastcall_suffix (decl, id, false);
+	}
+      else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
+	new_id = gen_stdcall_or_fastcall_suffix (decl, id, true);
+    }
+
+  return new_id;
+}
+
+/* Emit an assembler directive to set symbol for DECL visibility to
+   the visibility type VIS, which must not be VISIBILITY_DEFAULT.
+   As for PE there is no hidden support in gas, we just warn for
+   user-specified visibility attributes.  */
+
+void
+i386_pe_assemble_visibility (tree decl,
+			     int vis ATTRIBUTE_UNUSED)
+{
+  if (!decl
+      || !lookup_attribute ("visibility", DECL_ATTRIBUTES (decl)))
+    return;
+  if (!DECL_ARTIFICIAL (decl))
+    warning (OPT_Wattributes, "visibility attribute not supported "
+			      "in this configuration; ignored");
+}
+
+/* This is used as a target hook to modify the DECL_ASSEMBLER_NAME
+   in the language-independent default hook
+   langhooks,c:lhd_set_decl_assembler_name ()
+   and in cp/mangle,c:mangle_decl ().  */
+tree
+i386_pe_mangle_decl_assembler_name (tree decl, tree id)
+{
+  tree new_id = i386_pe_maybe_mangle_decl_assembler_name (decl, id);   
+
+  return (new_id ? new_id : id);
+}
+
+/* This hook behaves the same as varasm.c/assemble_name(), but
+   generates the name into memory rather than outputting it to
+   a file stream.  */
+
+tree
+i386_pe_mangle_assembler_name (const char *name ATTRIBUTE_UNUSED)
+{
+  const char *skipped = name + (*name == '*' ? 1 : 0);
+  const char *stripped = targetm.strip_name_encoding (skipped);
+  if (*name != '*' && *user_label_prefix && *stripped != FASTCALL_PREFIX)
+    stripped = ACONCAT ((user_label_prefix, stripped, NULL));
+  return get_identifier (stripped);
+}
+
+void
+i386_pe_encode_section_info (tree decl, rtx rtl, int first)
+{
+  rtx symbol;
+  int flags;
+
+  /* Do this last, due to our frobbing of DECL_DLLIMPORT_P above.  */
+  default_encode_section_info (decl, rtl, first);
+
+  /* Careful not to prod global register variables.  */
+  if (!MEM_P (rtl))
+    return;
+
+  symbol = XEXP (rtl, 0);
+  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
+
+  switch (TREE_CODE (decl))
+    {
+    case FUNCTION_DECL:
+      /* FIXME:  Imported stdcall names are not modified by the Ada frontend.
+	 Check and decorate the RTL name now.  */
+      if  (strcmp (lang_hooks.name, "GNU Ada") == 0)
+	{
+	  tree new_id;
+	  tree old_id = DECL_ASSEMBLER_NAME (decl);
+	  const char* asm_str = IDENTIFIER_POINTER (old_id);
+	  /* Do not change the identifier if a verbatim asmspec
+	     or if stdcall suffix already added. */
+	  if (!(*asm_str == '*' || strchr (asm_str, '@'))
+	      && (new_id = i386_pe_maybe_mangle_decl_assembler_name (decl,
+								     old_id)))
+	    XSTR (symbol, 0) = IDENTIFIER_POINTER (new_id);
+	}
+      break;
+
+    case VAR_DECL:
+      if (lookup_attribute ("selectany", DECL_ATTRIBUTES (decl)))
+	{
+	  if (DECL_INITIAL (decl)
+	      /* If an object is initialized with a ctor, the static
+		 initialization and destruction code for it is present in
+		 each unit defining the object.  The code that calls the
+		 ctor is protected by a link-once guard variable, so that
+		 the object still has link-once semantics,  */
+	      || TYPE_NEEDS_CONSTRUCTING (TREE_TYPE (decl)))
+	    make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
+	  else
+	    error ("%q+D:'selectany' attribute applies only to "
+		   "initialized objects", decl);
+	}
+      break;
+
+    default:
+      return;
+    }
+
+  /* Mark the decl so we can tell from the rtl whether the object is
+     dllexport'd or dllimport'd.  tree.c: merge_dllimport_decl_attributes
+     handles dllexport/dllimport override semantics.  */
+  flags = (SYMBOL_REF_FLAGS (symbol) &
+	   ~(SYMBOL_FLAG_DLLIMPORT | SYMBOL_FLAG_DLLEXPORT));
+  if (i386_pe_determine_dllexport_p (decl))
+    flags |= SYMBOL_FLAG_DLLEXPORT;
+  else if (i386_pe_determine_dllimport_p (decl))
+    flags |= SYMBOL_FLAG_DLLIMPORT;
+ 
+  SYMBOL_REF_FLAGS (symbol) = flags;
+}
+
+
+bool
+i386_pe_binds_local_p (const_tree exp)
+{
+  if ((TREE_CODE (exp) == VAR_DECL || TREE_CODE (exp) == FUNCTION_DECL)
+      && DECL_DLLIMPORT_P (exp))
+    return false;
+
+  /* External public symbols, which aren't weakref-s,
+     have local-binding for PE targets.  */
+  if (DECL_P (exp)
+      && !lookup_attribute ("weakref", DECL_ATTRIBUTES (exp))
+      && TREE_PUBLIC (exp)
+      && DECL_EXTERNAL (exp))
+    return true;
+  return default_binds_local_p_1 (exp, 0);
+}
+
+/* Also strip the fastcall prefix and stdcall suffix.  */
+
+const char *
+i386_pe_strip_name_encoding_full (const char *str)
+{
+  const char *p;
+  const char *name = default_strip_name_encoding (str);
+
+  /* Strip leading '@' on fastcall symbols.  */
+  if (*name == '@')
+    name++;
+
+  /* Strip trailing "@n".  */
+  p = strchr (name, '@');
+  if (p)
+    return ggc_alloc_string (name, p - name);
+
+  return name;
+}
+
+void
+i386_pe_unique_section (tree decl, int reloc)
+{
+  int len;
+  const char *name, *prefix;
+  char *string;
+
+  /* Ignore RELOC, if we are allowed to put relocated
+     const data into read-only section.  */
+  if (!flag_writable_rel_rdata)
+    reloc = 0;
+  name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
+  name = i386_pe_strip_name_encoding_full (name);
+
+  /* The object is put in, for example, section .text$foo.
+     The linker will then ultimately place them in .text
+     (everything from the $ on is stripped). Don't put
+     read-only data in .rdata section to avoid a PE linker
+     bug when .rdata$* grouped sections are used in code
+     without a .rdata section.  */
+  if (TREE_CODE (decl) == FUNCTION_DECL)
+    prefix = ".text$";
+  else if (decl_readonly_section (decl, reloc))
+    prefix = ".rdata$";
+  else
+    prefix = ".data$";
+  len = strlen (name) + strlen (prefix);
+  string = XALLOCAVEC (char, len + 1);
+  sprintf (string, "%s%s", prefix, name);
+
+  DECL_SECTION_NAME (decl) = build_string (len, string);
+}
+
+/* Local and global relocs can be placed always into readonly memory for
+   memory for PE-COFF targets.  */
+int
+i386_pe_reloc_rw_mask (void)
+{
+  return 0;
+}
+
+/* Select a set of attributes for section NAME based on the properties
+   of DECL and whether or not RELOC indicates that DECL's initializer
+   might contain runtime relocations.
+
+   We make the section read-only and executable for a function decl,
+   read-only for a const data decl, and writable for a non-const data decl.
+
+   If the section has already been defined, to not allow it to have
+   different attributes, as (1) this is ambiguous since we're not seeing
+   all the declarations up front and (2) some assemblers (e.g. SVR4)
+   do not recognize section redefinitions.  */
+/* ??? This differs from the "standard" PE implementation in that we
+   handle the SHARED variable attribute.  Should this be done for all
+   PE targets?  */
+
+#define SECTION_PE_SHARED	SECTION_MACH_DEP
+
+unsigned int
+i386_pe_section_type_flags (tree decl, const char *name, int reloc)
+{
+  static hash_table <pointer_hash <unsigned int> > htab;
+  unsigned int flags;
+  unsigned int **slot;
+
+  /* Ignore RELOC, if we are allowed to put relocated
+     const data into read-only section.  */
+  if (!flag_writable_rel_rdata)
+    reloc = 0;
+  /* The names we put in the hashtable will always be the unique
+     versions given to us by the stringtable, so we can just use
+     their addresses as the keys.  */
+  if (!htab.is_created ())
+    htab.create (31);
+
+  if (decl && TREE_CODE (decl) == FUNCTION_DECL)
+    flags = SECTION_CODE;
+  else if (decl && decl_readonly_section (decl, reloc))
+    flags = 0;
+  else
+    {
+      flags = SECTION_WRITE;
+
+      if (decl && TREE_CODE (decl) == VAR_DECL
+	  && lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
+	flags |= SECTION_PE_SHARED;
+    }
+
+  if (decl && DECL_P (decl) && DECL_ONE_ONLY (decl))
+    flags |= SECTION_LINKONCE;
+
+  /* See if we already have an entry for this section.  */
+  slot = htab.find_slot ((const unsigned int *)name, INSERT);
+  if (!*slot)
+    {
+      *slot = (unsigned int *) xmalloc (sizeof (unsigned int));
+      **slot = flags;
+    }
+  else
+    {
+      if (decl && **slot != flags)
+	error ("%q+D causes a section type conflict", decl);
+    }
+
+  return flags;
+}
+
+void
+i386_pe_asm_named_section (const char *name, unsigned int flags, 
+			   tree decl)
+{
+  char flagchars[8], *f = flagchars;
+
+#if defined (HAVE_GAS_SECTION_EXCLUDE) && HAVE_GAS_SECTION_EXCLUDE == 1
+  if ((flags & SECTION_EXCLUDE) != 0)
+    *f++ = 'e';
+#endif
+
+  if ((flags & (SECTION_CODE | SECTION_WRITE)) == 0)
+    /* readonly data */
+    {
+      *f++ ='d';  /* This is necessary for older versions of gas.  */
+      *f++ ='r';
+    }
+  else	
+    {
+      if (flags & SECTION_CODE)
+        *f++ = 'x';
+      if (flags & SECTION_WRITE)
+        *f++ = 'w';
+      if (flags & SECTION_PE_SHARED)
+        *f++ = 's';
+#if !defined (HAVE_GAS_SECTION_EXCLUDE) || HAVE_GAS_SECTION_EXCLUDE == 0
+      /* If attribute "e" isn't supported we mark this section as
+         never-load.  */
+      if ((flags & SECTION_EXCLUDE) != 0)
+	*f++ = 'n';
+#endif
+    }
+
+  /* LTO sections need 1-byte alignment to avoid confusing the
+     zlib decompression algorithm with trailing zero pad bytes.  */
+  if (strncmp (name, LTO_SECTION_NAME_PREFIX,
+			strlen (LTO_SECTION_NAME_PREFIX)) == 0)
+    *f++ = '0';
+
+  *f = '\0';
+
+  fprintf (asm_out_file, "\t.section\t%s,\"%s\"\n", name, flagchars);
+
+  if (flags & SECTION_LINKONCE)
+    {
+      /* Functions may have been compiled at various levels of
+	 optimization so we can't use `same_size' here.
+	 Instead, have the linker pick one, without warning.
+	 If 'selectany' attribute has been specified,  MS compiler
+	 sets 'discard' characteristic, rather than telling linker
+	 to warn of size or content mismatch, so do the same.  */ 
+      bool discard = (flags & SECTION_CODE)
+		      || (TREE_CODE (decl) != IDENTIFIER_NODE
+			  && lookup_attribute ("selectany",
+					       DECL_ATTRIBUTES (decl)));
+      fprintf (asm_out_file, "\t.linkonce %s\n",
+	       (discard  ? "discard" : "same_size"));
+    }
+}
+
+/* Beware, DECL may be NULL if compile_file() is emitting the LTO marker.  */
+
+void
+i386_pe_asm_output_aligned_decl_common (FILE *stream, tree decl,
+					const char *name, HOST_WIDE_INT size,
+					HOST_WIDE_INT align ATTRIBUTE_UNUSED)
+{
+  HOST_WIDE_INT rounded;
+
+  /* Compute as in assemble_noswitch_variable, since we don't have
+     support for aligned common on older binutils.  We must also
+     avoid emitting a common symbol of size zero, as this is the
+     overloaded representation that indicates an undefined external
+     symbol in the PE object file format.  */
+  rounded = size ? size : 1;
+  rounded += (BIGGEST_ALIGNMENT / BITS_PER_UNIT) - 1;
+  rounded = (rounded / (BIGGEST_ALIGNMENT / BITS_PER_UNIT)
+	     * (BIGGEST_ALIGNMENT / BITS_PER_UNIT));
+  
+  i386_pe_maybe_record_exported_symbol (decl, name, 1);
+
+  fprintf (stream, "\t.comm\t");
+  assemble_name (stream, name);
+  if (use_pe_aligned_common)
+    fprintf (stream, ", " HOST_WIDE_INT_PRINT_DEC ", %d\n",
+	   size ? size : (HOST_WIDE_INT) 1,
+	   exact_log2 (align) - exact_log2 (CHAR_BIT));
+  else
+    fprintf (stream, ", " HOST_WIDE_INT_PRINT_DEC "\t" ASM_COMMENT_START
+	   " " HOST_WIDE_INT_PRINT_DEC "\n", rounded, size);
+}
+
+/* The Microsoft linker requires that every function be marked as
+   DT_FCN.  When using gas on cygwin, we must emit appropriate .type
+   directives.  */
+
+#include "gsyms.h"
+
+/* Mark a function appropriately.  This should only be called for
+   functions for which we are not emitting COFF debugging information.
+   FILE is the assembler output file, NAME is the name of the
+   function, and PUB is nonzero if the function is globally
+   visible.  */
+
+void
+i386_pe_declare_function_type (FILE *file, const char *name, int pub)
+{
+  fprintf (file, "\t.def\t");
+  assemble_name (file, name);
+  fprintf (file, ";\t.scl\t%d;\t.type\t%d;\t.endef\n",
+	   pub ? (int) C_EXT : (int) C_STAT,
+	   (int) DT_FCN << N_BTSHFT);
+}
+
+/* Keep a list of external functions.  */
+
+struct GTY(()) extern_list
+{
+  struct extern_list *next;
+  tree decl;
+  const char *name;
+};
+
+static GTY(()) struct extern_list *extern_head;
+
+/* Assemble an external function reference.  We need to keep a list of
+   these, so that we can output the function types at the end of the
+   assembly.  We can't output the types now, because we might see a
+   definition of the function later on and emit debugging information
+   for it then.  */
+
+void
+i386_pe_record_external_function (tree decl, const char *name)
+{
+  struct extern_list *p;
+
+  p = ggc_alloc_extern_list ();
+  p->next = extern_head;
+  p->decl = decl;
+  p->name = name;
+  extern_head = p;
+}
+
+/* Keep a list of exported symbols.  */
+
+struct GTY(()) export_list
+{
+  struct export_list *next;
+  const char *name;
+  int is_data;		/* used to type tag exported symbols.  */
+};
+
+/* Keep a list of stub symbols.  */
+
+struct GTY(()) stub_list
+{
+  struct stub_list *next;
+  const char *name;
+};
+
+static GTY(()) struct export_list *export_head;
+
+static GTY(()) struct stub_list *stub_head;
+
+/* Assemble an export symbol entry.  We need to keep a list of
+   these, so that we can output the export list at the end of the
+   assembly.  We used to output these export symbols in each function,
+   but that causes problems with GNU ld when the sections are
+   linkonce.  Beware, DECL may be NULL if compile_file() is emitting
+   the LTO marker.  */
+
+void
+i386_pe_maybe_record_exported_symbol (tree decl, const char *name, int is_data)
+{
+  rtx symbol;
+  struct export_list *p;
+
+  if (!decl)
+    return;
+
+  symbol = XEXP (DECL_RTL (decl), 0);
+  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
+  if (!SYMBOL_REF_DLLEXPORT_P (symbol))
+    return;
+
+  gcc_assert (TREE_PUBLIC (decl));
+
+  p = ggc_alloc_export_list ();
+  p->next = export_head;
+  p->name = name;
+  p->is_data = is_data;
+  export_head = p;
+}
+
+void
+i386_pe_record_stub (const char *name)
+{
+  struct stub_list *p;
+
+  if (!name || *name == 0)
+    return;
+
+  p = stub_head;
+  while (p != NULL)
+    {
+      if (p->name[0] == *name
+          && !strcmp (p->name, name))
+	return;
+      p = p->next;
+    }
+
+  p = ggc_alloc_stub_list ();
+  p->next = stub_head;
+  p->name = name;
+  stub_head = p;
+}
+
+
+#ifdef CXX_WRAP_SPEC_LIST
+
+/* Hashtable helpers.  */
+
+struct wrapped_symbol_hasher : typed_noop_remove <char>
+{
+  typedef char value_type;
+  typedef char compare_type;
+  static inline hashval_t hash (const value_type *);
+  static inline bool equal (const value_type *, const compare_type *);
+  static inline void remove (value_type *);
+};
+
+inline hashval_t
+wrapped_symbol_hasher::hash (const value_type *v)
+{
+  return htab_hash_string (v);
+}
+
+/*  Hash table equality helper function.  */
+
+inline bool
+wrapped_symbol_hasher::equal (const value_type *x, const compare_type *y)
+{
+  return !strcmp (x, y);
+}
+
+/* Search for a function named TARGET in the list of library wrappers
+   we are using, returning a pointer to it if found or NULL if not.
+   This function might be called on quite a few symbols, and we only
+   have the list of names of wrapped functions available to us as a
+   spec string, so first time round we lazily initialise a hash table
+   to make things quicker.  */
+
+static const char *
+i386_find_on_wrapper_list (const char *target)
+{
+  static char first_time = 1;
+  static hash_table <wrapped_symbol_hasher> wrappers;
+
+  if (first_time)
+    {
+      /* Beware that this is not a complicated parser, it assumes
+         that any sequence of non-whitespace beginning with an
+	 underscore is one of the wrapped symbols.  For now that's
+	 adequate to distinguish symbols from spec substitutions
+	 and command-line options.  */
+      static char wrapper_list_buffer[] = CXX_WRAP_SPEC_LIST;
+      char *bufptr;
+      /* Breaks up the char array into separated strings
+         strings and enter them into the hash table.  */
+      wrappers.create (8);
+      for (bufptr = wrapper_list_buffer; *bufptr; ++bufptr)
+	{
+	  char *found = NULL;
+	  if (ISSPACE (*bufptr))
+	    continue;
+	  if (*bufptr == '_')
+	    found = bufptr;
+	  while (*bufptr && !ISSPACE (*bufptr))
+	    ++bufptr;
+	  if (*bufptr)
+	    *bufptr = 0;
+	  if (found)
+	    *wrappers.find_slot (found, INSERT) = found;
+	}
+      first_time = 0;
+    }
+
+  return wrappers.find (target);
+}
+
+#endif /* CXX_WRAP_SPEC_LIST */
+
+/* This is called at the end of assembly.  For each external function
+   which has not been defined, we output a declaration now.  We also
+   output the .drectve section.  */
+
+void
+i386_pe_file_end (void)
+{
+  struct extern_list *p;
+
+  for (p = extern_head; p != NULL; p = p->next)
+    {
+      tree decl;
+
+      decl = p->decl;
+
+      /* Positively ensure only one declaration for any given symbol.  */
+      if (! TREE_ASM_WRITTEN (decl)
+	  && TREE_SYMBOL_REFERENCED (DECL_ASSEMBLER_NAME (decl)))
+	{
+#ifdef CXX_WRAP_SPEC_LIST
+	  /* To ensure the DLL that provides the corresponding real
+	     functions is still loaded at runtime, we must reference
+	     the real function so that an (unused) import is created.  */
+	  const char *realsym = i386_find_on_wrapper_list (p->name);
+	  if (realsym)
+	    i386_pe_declare_function_type (asm_out_file,
+		concat ("__real_", realsym, NULL), TREE_PUBLIC (decl));
+#endif /* CXX_WRAP_SPEC_LIST */
+	  TREE_ASM_WRITTEN (decl) = 1;
+	  i386_pe_declare_function_type (asm_out_file, p->name,
+					 TREE_PUBLIC (decl));
+	}
+    }
+
+  if (export_head)
+    {
+      struct export_list *q;
+      drectve_section ();
+      for (q = export_head; q != NULL; q = q->next)
+	{
+	  fprintf (asm_out_file, "\t.ascii \" -export:\\\"%s\\\"%s\"\n",
+		   default_strip_name_encoding (q->name),
+		   (q->is_data ? ",data" : ""));
+	}
+    }
+
+  if (stub_head)
+    {
+      struct stub_list *q;
+
+      for (q = stub_head; q != NULL; q = q->next)
+	{
+	  const char *name = q->name;
+	  const char *oname;
+
+	  if (name[0] == '*')
+	    ++name;
+	  oname = name;
+	  if (name[0] == '.')
+	    ++name;
+	  if (strncmp (name, "refptr.", 7) != 0)
+	    continue;
+	  name += 7;
+	  fprintf (asm_out_file, "\t.section\t.rdata$%s, \"dr\"\n"
+	  		   "\t.globl\t%s\n"
+			   "\t.linkonce\tdiscard\n", oname, oname);
+	  fprintf (asm_out_file, "%s:\n\t.quad\t%s\n", oname, name);
+	}
+    }
+}
+
+
+/* x64 Structured Exception Handling unwind info.  */
+
+struct seh_frame_state
+{
+  /* SEH records saves relative to the "current" stack pointer, whether
+     or not there's a frame pointer in place.  This tracks the current
+     stack pointer offset from the CFA.  */
+  HOST_WIDE_INT sp_offset;
+
+  /* The CFA is located at CFA_REG + CFA_OFFSET.  */
+  HOST_WIDE_INT cfa_offset;
+  rtx cfa_reg;
+};
+
+/* Set up data structures beginning output for SEH.  */
+
+void
+i386_pe_seh_init (FILE *f)
+{
+  struct seh_frame_state *seh;
+
+  if (!TARGET_SEH)
+    return;
+  if (cfun->is_thunk)
+    return;
+
+  /* We cannot support DRAP with SEH.  We turned off support for it by
+     re-defining MAX_STACK_ALIGNMENT when SEH is enabled.  */
+  gcc_assert (!stack_realign_drap);
+
+  seh = XCNEW (struct seh_frame_state);
+  cfun->machine->seh = seh;
+
+  seh->sp_offset = INCOMING_FRAME_SP_OFFSET;
+  seh->cfa_offset = INCOMING_FRAME_SP_OFFSET;
+  seh->cfa_reg = stack_pointer_rtx;
+
+  fputs ("\t.seh_proc\t", f);
+  assemble_name (f, IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (cfun->decl)));
+  fputc ('\n', f);
+}
+
+void
+i386_pe_seh_end_prologue (FILE *f)
+{
+  struct seh_frame_state *seh;
+
+  if (!TARGET_SEH)
+    return;
+  if (cfun->is_thunk)
+    return;
+  seh = cfun->machine->seh;
+
+  XDELETE (seh);
+  cfun->machine->seh = NULL;
+
+  fputs ("\t.seh_endprologue\n", f);
+}
+
+static void
+i386_pe_seh_fini (FILE *f)
+{
+  if (!TARGET_SEH)
+    return;
+  if (cfun->is_thunk)
+    return;
+  fputs ("\t.seh_endproc\n", f);
+}
+
+/* Emit an assembler directive to save REG via a PUSH.  */
+
+static void
+seh_emit_push (FILE *f, struct seh_frame_state *seh, rtx reg)
+{
+  unsigned int regno = REGNO (reg);
+
+  gcc_checking_assert (GENERAL_REGNO_P (regno));
+
+  seh->sp_offset += UNITS_PER_WORD;
+  if (seh->cfa_reg == stack_pointer_rtx)
+    seh->cfa_offset += UNITS_PER_WORD;
+
+  fputs ("\t.seh_pushreg\t", f);
+  print_reg (reg, 0, f);
+  fputc ('\n', f);
+}
+
+/* Emit an assembler directive to save REG at CFA - CFA_OFFSET.  */
+
+static void
+seh_emit_save (FILE *f, struct seh_frame_state *seh,
+	       rtx reg, HOST_WIDE_INT cfa_offset)
+{
+  unsigned int regno = REGNO (reg);
+  HOST_WIDE_INT offset;
+
+  /* Negative save offsets are of course not supported, since that
+     would be a store below the stack pointer and thus clobberable.  */
+  gcc_assert (seh->sp_offset >= cfa_offset);
+  offset = seh->sp_offset - cfa_offset;
+
+  fputs ((SSE_REGNO_P (regno) ? "\t.seh_savexmm\t"
+	 : GENERAL_REGNO_P (regno) ?  "\t.seh_savereg\t"
+	 : (gcc_unreachable (), "")), f);
+  print_reg (reg, 0, f);
+  fprintf (f, ", " HOST_WIDE_INT_PRINT_DEC "\n", offset);
+}
+
+/* Emit an assembler directive to adjust RSP by OFFSET.  */
+
+static void
+seh_emit_stackalloc (FILE *f, struct seh_frame_state *seh,
+		     HOST_WIDE_INT offset)
+{
+  /* We're only concerned with prologue stack allocations, which all
+     are subtractions from the stack pointer.  */
+  gcc_assert (offset < 0);
+  offset = -offset;
+
+  if (seh->cfa_reg == stack_pointer_rtx)
+    seh->cfa_offset += offset;
+  seh->sp_offset += offset;
+
+  /* Do not output the stackalloc in that case (it won't work as there is no
+     encoding for very large frame size).  */
+  if (offset < SEH_MAX_FRAME_SIZE)
+    fprintf (f, "\t.seh_stackalloc\t" HOST_WIDE_INT_PRINT_DEC "\n", offset);
+}
+
+/* Process REG_CFA_ADJUST_CFA for SEH.  */
+
+static void
+seh_cfa_adjust_cfa (FILE *f, struct seh_frame_state *seh, rtx pat)
+{
+  rtx dest, src;
+  HOST_WIDE_INT reg_offset = 0;
+  unsigned int dest_regno;
+
+  dest = SET_DEST (pat);
+  src = SET_SRC (pat);
+
+  if (GET_CODE (src) == PLUS)
+    {
+      reg_offset = INTVAL (XEXP (src, 1));
+      src = XEXP (src, 0);
+    }
+  else if (GET_CODE (src) == MINUS)
+    {
+      reg_offset = -INTVAL (XEXP (src, 1));
+      src = XEXP (src, 0);
+    }
+  gcc_assert (src == stack_pointer_rtx);
+  gcc_assert (seh->cfa_reg == stack_pointer_rtx);
+  dest_regno = REGNO (dest);
+
+  if (dest_regno == STACK_POINTER_REGNUM)
+    seh_emit_stackalloc (f, seh, reg_offset);
+  else if (dest_regno == HARD_FRAME_POINTER_REGNUM)
+    {
+      HOST_WIDE_INT offset;
+
+      seh->cfa_reg = dest;
+      seh->cfa_offset -= reg_offset;
+
+      offset = seh->sp_offset - seh->cfa_offset;
+
+      gcc_assert ((offset & 15) == 0);
+      gcc_assert (IN_RANGE (offset, 0, 240));
+
+      fputs ("\t.seh_setframe\t", f);
+      print_reg (seh->cfa_reg, 0, f);
+      fprintf (f, ", " HOST_WIDE_INT_PRINT_DEC "\n", offset);
+    }
+  else
+    gcc_unreachable ();
+}
+
+/* Process REG_CFA_OFFSET for SEH.  */
+
+static void
+seh_cfa_offset (FILE *f, struct seh_frame_state *seh, rtx pat)
+{
+  rtx dest, src;
+  HOST_WIDE_INT reg_offset;
+
+  dest = SET_DEST (pat);
+  src = SET_SRC (pat);
+
+  gcc_assert (MEM_P (dest));
+  dest = XEXP (dest, 0);
+  if (REG_P (dest))
+    reg_offset = 0;
+  else
+    {
+      gcc_assert (GET_CODE (dest) == PLUS);
+      reg_offset = INTVAL (XEXP (dest, 1));
+      dest = XEXP (dest, 0);
+    }
+  gcc_assert (dest == seh->cfa_reg);
+
+  seh_emit_save (f, seh, src, seh->cfa_offset - reg_offset);
+}
+
+/* Process a FRAME_RELATED_EXPR for SEH.  */
+
+static void
+seh_frame_related_expr (FILE *f, struct seh_frame_state *seh, rtx pat)
+{
+  rtx dest, src;
+  HOST_WIDE_INT addend;
+
+  /* See the full loop in dwarf2out_frame_debug_expr.  */
+  if (GET_CODE (pat) == PARALLEL || GET_CODE (pat) == SEQUENCE)
+    {
+      int i, n = XVECLEN (pat, 0), pass, npass;
+
+      npass = (GET_CODE (pat) == PARALLEL ? 2 : 1);
+      for (pass = 0; pass < npass; ++pass)
+	for (i = 0; i < n; ++i)
+	  {
+	    rtx ele = XVECEXP (pat, 0, i);
+
+	    if (GET_CODE (ele) != SET)
+	      continue;
+	    dest = SET_DEST (ele);
+
+	    /* Process each member of the PARALLEL independently.  The first
+	       member is always processed; others only if they are marked.  */
+	    if (i == 0 || RTX_FRAME_RELATED_P (ele))
+	      {
+		/* Evaluate all register saves in the first pass and all
+		   register updates in the second pass.  */
+		if ((MEM_P (dest) ^ pass) || npass == 1)
+		  seh_frame_related_expr (f, seh, ele);
+	      }
+	  }
+      return;
+    }
+
+  dest = SET_DEST (pat);
+  src = SET_SRC (pat);
+
+  switch (GET_CODE (dest))
+    {
+    case REG:
+      switch (GET_CODE (src))
+	{
+	case REG:
+	  /* REG = REG: This should be establishing a frame pointer.  */
+	  gcc_assert (src == stack_pointer_rtx);
+	  gcc_assert (dest == hard_frame_pointer_rtx);
+	  seh_cfa_adjust_cfa (f, seh, pat);
+	  break;
+
+	case PLUS:
+	  addend = INTVAL (XEXP (src, 1));
+	  src = XEXP (src, 0);
+	  if (dest == hard_frame_pointer_rtx)
+	    seh_cfa_adjust_cfa (f, seh, pat);
+	  else if (dest == stack_pointer_rtx)
+	    {
+	      gcc_assert (src == stack_pointer_rtx);
+	      seh_emit_stackalloc (f, seh, addend);
+	    }
+	  else
+	    gcc_unreachable ();
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+      break;
+
+    case MEM:
+      /* A save of some kind.  */
+      dest = XEXP (dest, 0);
+      if (GET_CODE (dest) == PRE_DEC)
+	{
+	  gcc_checking_assert (GET_MODE (src) == Pmode);
+	  gcc_checking_assert (REG_P (src));
+	  seh_emit_push (f, seh, src);
+	}
+      else
+	seh_cfa_offset (f, seh, pat);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* This function looks at a single insn and emits any SEH directives
+   required for unwind of this insn.  */
+
+void
+i386_pe_seh_unwind_emit (FILE *asm_out_file, rtx insn)
+{
+  rtx note, pat;
+  bool handled_one = false;
+  struct seh_frame_state *seh;
+
+  if (!TARGET_SEH)
+    return;
+
+  /* We free the SEH data once done with the prologue.  Ignore those
+     RTX_FRAME_RELATED_P insns that are associated with the epilogue.  */
+  seh = cfun->machine->seh;
+  if (seh == NULL)
+    return;
+
+  if (NOTE_P (insn) || !RTX_FRAME_RELATED_P (insn))
+    return;
+
+  for (note = REG_NOTES (insn); note ; note = XEXP (note, 1))
+    {
+      switch (REG_NOTE_KIND (note))
+	{
+	case REG_FRAME_RELATED_EXPR:
+	  pat = XEXP (note, 0);
+	  goto found;
+
+	case REG_CFA_DEF_CFA:
+	case REG_CFA_EXPRESSION:
+	  /* Only emitted with DRAP, which we disable.  */
+	  gcc_unreachable ();
+	  break;
+
+	case REG_CFA_REGISTER:
+	  /* Only emitted in epilogues, which we skip.  */
+	  gcc_unreachable ();
+
+	case REG_CFA_ADJUST_CFA:
+	  pat = XEXP (note, 0);
+	  if (pat == NULL)
+	    {
+	      pat = PATTERN (insn);
+	      if (GET_CODE (pat) == PARALLEL)
+		pat = XVECEXP (pat, 0, 0);
+	    }
+	  seh_cfa_adjust_cfa (asm_out_file, seh, pat);
+	  handled_one = true;
+	  break;
+
+	case REG_CFA_OFFSET:
+	  pat = XEXP (note, 0);
+	  if (pat == NULL)
+	    pat = single_set (insn);
+	  seh_cfa_offset (asm_out_file, seh, pat);
+	  handled_one = true;
+	  break;
+
+	default:
+	  break;
+	}
+    }
+  if (handled_one)
+    return;
+  pat = PATTERN (insn);
+ found:
+  seh_frame_related_expr (asm_out_file, seh, pat);
+}
+
+void
+i386_pe_seh_emit_except_personality (rtx personality)
+{
+  int flags = 0;
+
+  if (!TARGET_SEH)
+    return;
+
+  fputs ("\t.seh_handler\t", asm_out_file);
+  output_addr_const (asm_out_file, personality);
+
+#if 0
+  /* ??? The current implementation of _GCC_specific_handler requires
+     both except and unwind handling, regardless of which sorts the
+     user-level function requires.  */
+  eh_region r;
+  FOR_ALL_EH_REGION(r)
+    {
+      if (r->type == ERT_CLEANUP)
+	flags |= 1;
+      else
+	flags |= 2;
+    }
+#else
+  flags = 3;
+#endif
+
+  if (flags & 1)
+    fputs (", @unwind", asm_out_file);
+  if (flags & 2)
+    fputs (", @except", asm_out_file);
+  fputc ('\n', asm_out_file);
+}
+
+void
+i386_pe_seh_init_sections (void)
+{
+  if (TARGET_SEH)
+    exception_section = get_unnamed_section (0, output_section_asm_op,
+					     "\t.seh_handlerdata");
+}
+
+void
+i386_pe_start_function (FILE *f, const char *name, tree decl)
+{
+  i386_pe_maybe_record_exported_symbol (decl, name, 0);
+  if (write_symbols != SDB_DEBUG)
+    i386_pe_declare_function_type (f, name, TREE_PUBLIC (decl));
+  /* In case section was altered by debugging output.  */
+  if (decl != NULL_TREE)
+    switch_to_section (function_section (decl));
+  ASM_OUTPUT_FUNCTION_LABEL (f, name, decl);
+}
+
+void
+i386_pe_end_function (FILE *f, const char *name ATTRIBUTE_UNUSED,
+		      tree decl ATTRIBUTE_UNUSED)
+{
+  i386_pe_seh_fini (f);
+}
+
+
+#include "gt-winnt.h"
diff --git a/gcc-4.9/gcc/config/i386/wmmintrin.h b/gcc-4.9/gcc/config/i386/wmmintrin.h
new file mode 100644
index 000000000..2002375c6
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/wmmintrin.h
@@ -0,0 +1,132 @@
+/* Copyright (C) 2008-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 10.1.  */
+
+#ifndef _WMMINTRIN_H_INCLUDED
+#define _WMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE2 header file.  */
+#include <emmintrin.h>
+
+/* AES */
+
+#ifndef __AES__
+#pragma GCC push_options
+#pragma GCC target("aes")
+#define __DISABLE_AES__
+#endif /* __AES__ */
+
+/* Performs 1 round of AES decryption of the first m128i using 
+   the second m128i as a round key.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdec_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesdec128 ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Performs the last round of AES decryption of the first m128i 
+   using the second m128i as a round key.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdeclast_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesdeclast128 ((__v2di)__X,
+						 (__v2di)__Y);
+}
+
+/* Performs 1 round of AES encryption of the first m128i using 
+   the second m128i as a round key.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenc_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesenc128 ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Performs the last round of AES encryption of the first m128i
+   using the second m128i as a round key.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenclast_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesenclast128 ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Performs the InverseMixColumn operation on the source m128i 
+   and stores the result into m128i destination.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesimc_si128 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_aesimc128 ((__v2di)__X);
+}
+
+/* Generates a m128i round key for the input m128i AES cipher key and
+   byte round constant.  The second parameter must be a compile time
+   constant.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aeskeygenassist_si128 (__m128i __X, const int __C)
+{
+  return (__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)__X, __C);
+}
+#else
+#define _mm_aeskeygenassist_si128(X, C)					\
+  ((__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)(__m128i)(X),	\
+						(int)(C)))
+#endif
+
+#ifdef __DISABLE_AES__
+#undef __DISABLE_AES__
+#pragma GCC pop_options
+#endif /* __DISABLE_AES__ */
+
+/* PCLMUL */
+
+#ifndef __PCLMUL__
+#pragma GCC push_options
+#pragma GCC target("pclmul")
+#define __DISABLE_PCLMUL__
+#endif /* __PCLMUL__ */
+
+/* Performs carry-less integer multiplication of 64-bit halves of
+   128-bit input operands.  The third parameter inducates which 64-bit
+   haves of the input parameters v1 and v2 should be used. It must be
+   a compile time constant.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clmulepi64_si128 (__m128i __X, __m128i __Y, const int __I)
+{
+  return (__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)__X,
+						(__v2di)__Y, __I);
+}
+#else
+#define _mm_clmulepi64_si128(X, Y, I)					\
+  ((__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)(__m128i)(X),		\
+					  (__v2di)(__m128i)(Y), (int)(I)))
+#endif
+
+#ifdef __DISABLE_PCLMUL__
+#undef __DISABLE_PCLMUL__
+#pragma GCC pop_options
+#endif /* __DISABLE_PCLMUL__ */
+
+#endif /* _WMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/x-cygwin b/gcc-4.9/gcc/config/i386/x-cygwin
new file mode 100644
index 000000000..752af76ef
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/x-cygwin
@@ -0,0 +1,4 @@
+host-cygwin.o : $(srcdir)/config/i386/host-cygwin.c $(CONFIG_H) $(SYSTEM_H) \
+  coretypes.h hosthooks.h $(HOSTHOOKS_DEF_H) toplev.h diagnostic.h
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/i386/host-cygwin.c
diff --git a/gcc-4.9/gcc/config/i386/x-darwin b/gcc-4.9/gcc/config/i386/x-darwin
new file mode 100644
index 000000000..4967d695c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/x-darwin
@@ -0,0 +1,3 @@
+host-i386-darwin.o : $(srcdir)/config/i386/host-i386-darwin.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
diff --git a/gcc-4.9/gcc/config/i386/x-i386 b/gcc-4.9/gcc/config/i386/x-i386
new file mode 100644
index 000000000..1f3db1d19
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/x-i386
@@ -0,0 +1,3 @@
+driver-i386.o : $(srcdir)/config/i386/driver-i386.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
diff --git a/gcc-4.9/gcc/config/i386/x-mingw32 b/gcc-4.9/gcc/config/i386/x-mingw32
new file mode 100644
index 000000000..333346018
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/x-mingw32
@@ -0,0 +1,31 @@
+# Copyright (C) 2003-2014 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+#
+#
+# Make local_includedir relative to EXEC_PREFIX 
+#
+local_includedir=$(libsubdir)/$(unlibsubdir)/..`echo $(exec_prefix) | sed -e 's|^$(prefix)||' -e 's|/[^/]*|/..|g'`/include
+
+# On MinGW, we use "%IA64d" to print 64-bit integers, and the format-checking
+# code does not handle that, so we have to disable checking here.
+WERROR_FLAGS += -Wno-format
+
+host-mingw32.o : $(srcdir)/config/i386/host-mingw32.c $(CONFIG_H) $(SYSTEM_H) \
+  coretypes.h hosthooks.h hosthooks-def.h toplev.h $(DIAGNOSTIC_H) $(HOOKS_H)
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/i386/host-mingw32.c
diff --git a/gcc-4.9/gcc/config/i386/x86-64.h b/gcc-4.9/gcc/config/i386/x86-64.h
new file mode 100644
index 000000000..16fc68581
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/x86-64.h
@@ -0,0 +1,108 @@
+/* OS independent definitions for AMD x86-64.
+   Copyright (C) 2001-2014 Free Software Foundation, Inc.
+   Contributed by Bo Thorsen <bo@suse.de>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#undef ASM_COMMENT_START
+#define ASM_COMMENT_START "#"
+
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n) \
+  (TARGET_64BIT ? dbx64_register_map[n] : svr4_dbx_register_map[n])
+
+/* Output assembler code to FILE to call the profiler.  */
+#define NO_PROFILE_COUNTERS 1
+
+#undef MCOUNT_NAME
+#define MCOUNT_NAME "mcount"
+
+#undef SIZE_TYPE
+#define SIZE_TYPE (TARGET_LP64 ? "long unsigned int" : "unsigned int")
+
+#undef PTRDIFF_TYPE
+#define PTRDIFF_TYPE (TARGET_LP64 ? "long int" : "int")
+
+#undef WCHAR_TYPE
+#define WCHAR_TYPE "int"
+
+#undef WCHAR_TYPE_SIZE
+#define WCHAR_TYPE_SIZE 32
+
+#undef ASM_SPEC
+#define ASM_SPEC "%{m32:--32} %{m64:--64} %{mx32:--x32}"
+
+#undef ASM_OUTPUT_ALIGNED_BSS
+#define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \
+  x86_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN)
+
+#undef  ASM_OUTPUT_ALIGNED_COMMON
+#define ASM_OUTPUT_ALIGNED_COMMON(FILE, NAME, SIZE, ALIGN)		\
+  x86_elf_aligned_common (FILE, NAME, SIZE, ALIGN);
+
+/* This is used to align code labels according to Intel recommendations.  */
+
+#ifdef HAVE_GAS_MAX_SKIP_P2ALIGN
+#define ASM_OUTPUT_MAX_SKIP_ALIGN(FILE,LOG,MAX_SKIP)			\
+  do {									\
+    if ((LOG) != 0) {							\
+      if ((MAX_SKIP) == 0) fprintf ((FILE), "\t.p2align %d\n", (LOG));	\
+      else {								\
+	fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP));	\
+	/* Make sure that we have at least 8 byte alignment if > 8 byte \
+	   alignment is preferred.  */					\
+	if ((LOG) > 3							\
+	    && (1 << (LOG)) > ((MAX_SKIP) + 1)				\
+	    && (MAX_SKIP) >= 7)						\
+	  fputs ("\t.p2align 3\n", (FILE));				\
+      }									\
+    }									\
+  } while (0)
+#undef  ASM_OUTPUT_MAX_SKIP_PAD
+#define ASM_OUTPUT_MAX_SKIP_PAD(FILE, LOG, MAX_SKIP)			\
+  if ((LOG) != 0)							\
+    {									\
+      if ((MAX_SKIP) == 0)						\
+        fprintf ((FILE), "\t.p2align %d\n", (LOG));			\
+      else								\
+        fprintf ((FILE), "\t.p2align %d,,%d\n", (LOG), (MAX_SKIP));	\
+    }
+#endif
+
+
+/* i386 System V Release 4 uses DWARF debugging info.
+   x86-64 ABI specifies DWARF2.  */
+
+#define DWARF2_DEBUGGING_INFO 1
+#define DWARF2_UNWIND_INFO 1
+
+#undef PREFERRED_DEBUGGING_TYPE
+#define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG
+
+#undef TARGET_ASM_SELECT_SECTION
+#define TARGET_ASM_SELECT_SECTION  x86_64_elf_select_section
+
+#undef TARGET_ASM_UNIQUE_SECTION
+#define TARGET_ASM_UNIQUE_SECTION  x86_64_elf_unique_section
+
+#undef TARGET_SECTION_TYPE_FLAGS
+#define TARGET_SECTION_TYPE_FLAGS  x86_64_elf_section_type_flags
diff --git a/gcc-4.9/gcc/config/i386/x86-tune.def b/gcc-4.9/gcc/config/i386/x86-tune.def
new file mode 100644
index 000000000..839910267
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/x86-tune.def
@@ -0,0 +1,525 @@
+/* Definitions of x86 tunable features.
+   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Tuning for a given CPU XXXX consists of:
+    - adding new CPU into:
+	- adding PROCESSOR_XXX to processor_type (in i386.h)
+	- possibly adding XXX into CPU attribute in i386.md
+	- adding XXX to processor_alias_table (in i386.c)
+    - introducing ix86_XXX_cost in i386.c
+	- Stringop generation table can be build based on test_stringop
+	- script (once rest of tuning is complete)
+    - designing a scheduler model in
+	- XXXX.md file
+	- Updating ix86_issue_rate and ix86_adjust_cost in i386.md
+	- possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
+	  and ix86_sched_init_global if those tricks are needed.
+    - Tunning the flags bellow. Those are split into sections and each
+      section is very roughly ordered by importance.  */
+
+/*****************************************************************************/
+/* Scheduling flags. 					                     */
+/*****************************************************************************/
+
+/* X86_TUNE_SCHEDULE: Enable scheduling.  */
+DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
+          m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL 
+	  | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
+   on modern chips.  Preffer stores affecting whole integer register
+   over partial stores.  For example preffer MOVZBL or MOVQ to load 8bit
+   value over movb.  */
+DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
+          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
+	  | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
+   destinations to be 128bit to allow register renaming on 128bit SSE units,
+   but usually results in one extra microop on 64bit SSE units.
+   Experimental results shows that disabling this option on P4 brings over 20%
+   SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
+   that can be partly masked by careful scheduling of moves.  */
+DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
+          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
+	  | m_INTEL | m_AMDFAM10 | m_BDVER | m_GENERIC)
+
+/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
+   are resolved on SSE register parts instead of whole registers, so we may
+   maintain just lower part of scalar values in proper format leaving the
+   upper part undefined.  */
+DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
+
+/* X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of of flags
+   set by instructions affecting just some flags (in particular shifts).
+   This is because Core2 resolves dependencies on whole flags register
+   and such sequences introduce false dependency on previous instruction
+   setting full flags.
+
+   The flags does not affect generation of INC and DEC that is controlled
+   by X86_TUNE_USE_INCDEC.
+
+   This flag may be dropped from generic once core2-corei5 machines are
+   rare enough.  */
+DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
+          m_CORE2 | m_GENERIC)
+
+/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
+   partial dependencies.  */
+DEF_TUNE (X86_TUNE_MOVX, "movx",
+          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
+	  | m_INTEL | m_GEODE | m_AMD_MULTIPLE  | m_GENERIC)
+
+/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
+   full sized loads.  */
+DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
+          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
+	  | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
+   conditional jump instruction for 32 bit TARGET.
+   FIXME: revisit for generic.  */
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
+          m_CORE_ALL | m_BDVER)
+
+/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
+   conditional jump instruction for TARGET_64BIT.
+   FIXME: revisit for generic.  */
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
+          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER)
+
+/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
+   subsequent conditional jump instruction when the condition jump
+   check sign flag (SF) or overflow flag (OF).  */
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
+          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER)
+
+/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
+   jump instruction when the alu instruction produces the CCFLAG consumed by
+   the conditional jump instruction. */
+DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
+          m_SANDYBRIDGE | m_HASWELL)
+
+/* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
+   during reassociation of integer computation.  */
+DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel",
+          m_BONNELL)
+
+/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
+   during reassociation of fp computation.  */
+DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
+          m_BONNELL | m_SILVERMONT | m_HASWELL | m_INTEL | m_BDVER1
+	  | m_BDVER2 | m_GENERIC)
+
+/*****************************************************************************/
+/* Function prologue, epilogue and function calling sequences.               */
+/*****************************************************************************/
+
+/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
+   arguments in prologue/epilogue instead of separately for each call
+   by push/pop instructions.
+   This increase code size by about 5% in 32bit mode, less so in 64bit mode
+   because parameters are passed in registers.  It is considerable
+   win for targets without stack engine that prevents multple push operations
+   to happen in parallel.
+
+   FIXME: the flags is incorrectly enabled for amdfam10, Bulldozer,
+   Bobcat and Generic.  This is because disabling it causes large
+   regression on mgrid due to IRA limitation leading to unecessary
+   use of the frame pointer in 32bit mode.  */
+DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
+	  m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL
+	  | m_ATHLON_K8)
+
+/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
+   considered on critical path.  */
+DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
+          m_PPRO | m_ATHLON_K8)
+
+/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in epilogues that are
+   considered on critical path.  */
+DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
+          m_PPRO | m_ATHLON_K8)
+
+/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
+DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
+	  m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
+   Some chips, like 486 and Pentium works faster with separate load
+   and push instructions.  */
+DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
+          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
+          | m_GENERIC)
+
+/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
+   over esp subtraction.  */
+DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
+          | m_K6_GEODE)
+
+/* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
+   over esp subtraction.  */
+DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE)
+
+/* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
+   over esp addition.  */
+DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO)
+
+/* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
+   over esp addition.  */
+DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT)
+
+/*****************************************************************************/
+/* Branch predictor tuning  		                                     */
+/*****************************************************************************/
+
+/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4
+   instructions long.  */
+DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)
+
+/* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination
+   of conditional jump or directly preceded by other jump instruction.
+   This is important for AND K8-AMDFAM10 because the branch prediction
+   architecture expect at most one jump per 2 byte window.  Failing to
+   pad returns leads to misaligned return stack.  */
+DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
+          m_ATHLON_K8 | m_AMDFAM10 | m_GENERIC)
+
+/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
+   than 4 branch instructions in the 16 byte window.  */
+DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
+          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL |
+	  m_ATHLON_K8 | m_AMDFAM10)
+
+/*****************************************************************************/
+/* Integer instruction selection tuning                                      */
+/*****************************************************************************/
+
+/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
+   at -O3.  For the moment, the prefetching seems badly tuned for Intel
+   chips.  */
+DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
+          m_K6_GEODE | m_AMD_MULTIPLE)
+
+/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
+   on 16-bit immediate moves into memory on Core2 and Corei7.  */
+DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
+
+/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
+   as "add mem, reg".  */
+DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
+
+/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.   */
+DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
+          ~(m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
+	    | m_GENERIC))
+
+/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
+   for DFmode copies */
+DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
+          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
+	    | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
+
+/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
+   will impact LEA instruction selection. */
+DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_INTEL)
+
+/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
+DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
+	  m_BONNELL | m_SILVERMONT)
+
+/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
+   vector path on AMD machines.
+   FIXME: Do we need to enable this for core? */
+DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
+          m_K8 | m_AMDFAM10)
+
+/* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
+   machines.
+   FIXME: Do we need to enable this for core? */
+DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
+          m_K8 | m_AMDFAM10)
+
+/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
+   a conditional move.  */
+DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
+	  m_BONNELL | m_SILVERMONT | m_INTEL)
+
+/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
+   as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
+DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
+
+/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
+   compact prologues and epilogues by issuing a misaligned moves.  This
+   requires target to handle misaligned moves and partial memory stalls
+   reasonably well.
+   FIXME: This may actualy be a win on more targets than listed here.  */
+DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
+	  "misaligned_move_string_pro_epilogues",
+	  m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
+DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
+          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
+	  | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER
+	  | m_GENERIC)
+
+/* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions.  */
+DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
+	  ~(m_PENT | m_BONNELL | m_SILVERMONT | m_INTEL  | m_K6))
+
+/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
+DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
+          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_AMD_MULTIPLE
+	  | m_GENERIC)
+
+/*****************************************************************************/
+/* 387 instruction selection tuning                                          */
+/*****************************************************************************/
+
+/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
+   integer operand.
+   FIXME: Why this is disabled for modern chips?  */
+DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
+          m_386 | m_486 | m_K6_GEODE)
+
+/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
+   integer operand.  */
+DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
+          ~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
+	    | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
+
+/* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp.  */
+DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
+
+/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
+DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
+          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
+	  | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)
+
+/*****************************************************************************/
+/* SSE instruction selection tuning                                          */
+/*****************************************************************************/
+
+/* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
+   instructions.  */
+DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_BONNELL)
+
+/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
+   regs instead of memory.  */
+DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
+          m_CORE_ALL)
+
+/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
+   of a sequence loading registers by parts.  */
+DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
+          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_AMDFAM10 | m_BDVER
+	  | m_BTVER | m_SILVERMONT | m_INTEL | m_GENERIC)
+
+/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
+   of a sequence loading registers by parts.  */
+DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
+          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_SILVERMONT
+	  | m_INTEL | m_GENERIC)
+
+/* Use packed single precision instructions where posisble.  I.e. movups instead
+   of movupd.  */
+DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
+          m_BDVER)
+
+/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.   */
+DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
+	  m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)
+
+/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
+   xorps/xorpd and other variants.  */
+DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
+	  m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_GENERIC)
+
+/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer
+   to SSE registers.  If disabled, the moves will be done by storing
+   the value to memory and reloading.  */
+DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
+          ~(m_AMD_MULTIPLE | m_GENERIC))
+
+/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from SSE
+   to integer registers.  If disabled, the moves will be done by storing
+   the value to memory and reloading.  */
+DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
+          ~m_ATHLON_K8)
+
+/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
+   to use both SSE and integer registers at a same time.
+   FIXME: revisit importance of this for generic.  */
+DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
+          ~(m_AMDFAM10 | m_BDVER))
+
+/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
+   fp converts to destination register.  */
+DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
+          m_SILVERMONT | m_INTEL)
+
+/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
+   from FP to FP.  This form of instructions avoids partial write to the
+   destination.  */
+DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
+          m_AMDFAM10)
+
+/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
+   from integer to FP. */
+DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
+
+/*****************************************************************************/
+/* AVX instruction selection tuning (some of SSE flags affects AVX, too)     */
+/*****************************************************************************/
+
+/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
+   split.  */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
+          ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC))
+
+/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
+   split.  */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
+          ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_GENERIC))
+
+/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
+   the auto-vectorizer.  */
+DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2)
+
+/*****************************************************************************/
+/* Historical relics: tuning flags that helps a specific old CPU designs     */
+/*****************************************************************************/
+
+/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
+   an integer register.  */
+DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
+
+/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
+   such as fsqrt, fprem, fsin, fcos, fsincos etc.
+   Should be enabled for all targets that always has coprocesor.  */
+DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
+          ~(m_386 | m_486))
+
+/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
+   inline strlen.  This affects only -minline-all-stringops mode. By
+   default we always dispatch to a library since our internal strlen
+   is bad.  */
+DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)
+
+/* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of
+   longer "sal $1, reg".  */
+DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
+
+/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
+   of mozbl/movwl.  */
+DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",  m_486 | m_PENT)
+
+/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
+   and SImode multiply, but 386 and 486 do HImode multiply faster.  */
+DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
+          ~(m_386 | m_486))
+
+/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
+   into 16bit/8bit when resulting sequence is shorter.  For example
+   for "and $-65536, reg" to 16bit store of 0.  */
+DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT))
+
+/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
+   such as "add $1, mem".  */
+DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT)
+
+/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
+   than a MOV.  */
+DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT)
+
+/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
+   but one byte longer.  */
+DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT)
+
+/* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled
+   use of partial registers by renaming.  This improved performance of 16bit
+   code where upper halves of registers are not used.  It also leads to
+   an penalty whenever a 16bit store is followed by 32bit use.  This flag
+   disables production of such sequences in common cases.
+   See also X86_TUNE_HIMODE_MATH.
+
+   In current implementation the partial register stalls are not eliminated
+   very well - they can be introduced via subregs synthesized by combine
+   and can happen in caller/callee saving sequences.  */
+DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
+
+/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to
+   corresponding 32bit arithmetic.  */
+DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
+	  ~m_PPRO)
+
+/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit artihmetic.  Again we avoid
+   partial register stalls on PentiumPro targets. */
+DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
+
+/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
+   On PPro this flag is meant to avoid partial register stalls.  */
+DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
+
+/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
+   directly to memory.  */
+DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
+
+/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
+DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
+
+/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
+   integer register.  */
+DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
+
+/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
+   operand that cannot be represented using a modRM byte.  The XOR
+   replacement is long decoded, so this split helps here as well.  */
+DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
+
+/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
+   forms of instructions on K8 targets.  */
+DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
+          m_K8)
+
+/*****************************************************************************/
+/* This never worked well before.                                            */
+/*****************************************************************************/
+
+/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
+   on simulation result. But after P4 was made, no performance benefit
+   was observed with branch hints.  It also increases the code size.
+   As a result, icc never generates branch hints.  */
+DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0)
+
+/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
+DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0)
+
+/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
+   arithmetic to 32bit via PROMOTE_MODE macro.  This code generation scheme
+   is usually used for RISC targets.  */
+DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0)
+
+/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
+   on hardware capabilities. Bdver3 hardware has a loop buffer which makes
+   unrolling small loop less important. For, such architectures we adjust
+   the unroll factor so that the unrolled loop fits the loop buffer.  */
+DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)
diff --git a/gcc-4.9/gcc/config/i386/x86intrin.h b/gcc-4.9/gcc/config/i386/x86intrin.h
new file mode 100644
index 000000000..80e9e6f33
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/x86intrin.h
@@ -0,0 +1,78 @@
+/* Copyright (C) 2008-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+#define _X86INTRIN_H_INCLUDED
+
+#include <ia32intrin.h>
+
+#include <mmintrin.h>
+
+#include <xmmintrin.h>
+
+#include <emmintrin.h>
+
+#include <pmmintrin.h>
+
+#include <tmmintrin.h>
+
+#include <ammintrin.h>
+
+#include <smmintrin.h>
+
+#include <wmmintrin.h>
+
+/* For including AVX instructions */
+#include <immintrin.h>
+
+#include <mm3dnow.h>
+
+#include <fma4intrin.h>
+
+#include <xopintrin.h>
+
+#include <lwpintrin.h>
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#include <tbmintrin.h>
+
+#include <lzcntintrin.h>
+
+#include <popcntintrin.h>
+
+#include <rdseedintrin.h>
+
+#include <prfchwintrin.h>
+
+#include <fxsrintrin.h>
+
+#include <xsaveintrin.h>
+
+#include <xsaveoptintrin.h>
+
+#include <adxintrin.h>
+
+#endif /* _X86INTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/xm-cygwin.h b/gcc-4.9/gcc/config/i386/xm-cygwin.h
new file mode 100644
index 000000000..d66a46df5
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/xm-cygwin.h
@@ -0,0 +1,21 @@
+/* Configuration for GCC for hosting on Windows NT.
+   using a unix style C library.
+   Copyright (C) 1995-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define HOST_EXECUTABLE_SUFFIX ".exe"
diff --git a/gcc-4.9/gcc/config/i386/xm-djgpp.h b/gcc-4.9/gcc/config/i386/xm-djgpp.h
new file mode 100644
index 000000000..2f7989c16
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/xm-djgpp.h
@@ -0,0 +1,83 @@
+/* Configuration for GCC for Intel 80386 running DJGPP.
+   Copyright (C) 1988-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Use semicolons to separate elements of a path.  */
+#define PATH_SEPARATOR ';'
+
+#define HOST_EXECUTABLE_SUFFIX ".exe"
+
+/* System dependent initialization for collect2
+   to tell system() to act like Unix.  */
+#define COLLECT2_HOST_INITIALIZATION \
+  do { __system_flags |= (__system_allow_multiple_cmds			\
+		          | __system_emulate_chdir); } while (0)
+
+/* Define a version appropriate for DOS.  */
+#undef XREF_FILE_NAME
+#define XREF_FILE_NAME(xref_file, file) \
+  do { \
+    const char xref_ext[] = ".gxref"; \
+    strcpy (xref_file, file); \
+    s = basename (xref_file); \
+    t = strchr (s, '.'); \
+    if (t) \
+      strcpy (t, xref_ext); \
+    else \
+      strcat (xref_file, xref_ext); \
+  } while (0)
+
+#undef GCC_DRIVER_HOST_INITIALIZATION
+#define GCC_DRIVER_HOST_INITIALIZATION \
+  do { \
+    /* If the environment variable DJDIR is not defined, then DJGPP is not \
+       installed correctly and GCC will quickly become confused with the \
+       default prefix settings. Report the problem now so the user doesn't \
+       receive deceptive "file not found" error messages later.  */ \
+    char *djdir = getenv ("DJDIR"); \
+    if (djdir == NULL) \
+      { \
+        /* DJDIR is automatically defined by the DJGPP environment config \
+           file pointed to by the environment variable DJGPP. Examine DJGPP \
+           to try and figure out what's wrong.  */ \
+        char *djgpp = getenv ("DJGPP"); \
+        if (djgpp == NULL) \
+          fatal ("environment variable DJGPP not defined"); \
+        else if (access (djgpp, R_OK) == 0) \
+          fatal ("environment variable DJGPP points to missing file '%s'", \
+                 djgpp); \
+        else \
+          fatal ("environment variable DJGPP points to corrupt file '%s'", \
+                  djgpp); \
+      } \
+  } while (0)
+
+/* Canonicalize paths containing '/dev/env/'; used in prefix.c.
+   _fixpath is a djgpp-specific function to canonicalize a path.
+   "/dev/env/DJDIR" evaluates to "c:/djgpp" if DJDIR is "c:/djgpp" for
+   example.  It removes any trailing '/', so add it back.  */
+/* We cannot free PATH below as it can point to string constant  */
+#define UPDATE_PATH_HOST_CANONICALIZE(PATH) \
+  if (memcmp ((PATH), "/dev/env/", sizeof("/dev/env/") - 1) == 0) \
+    {						\
+      static char fixed_path[FILENAME_MAX + 1];	\
+						\
+      _fixpath ((PATH), fixed_path);		\
+      strcat (fixed_path, "/");			\
+      (PATH) = xstrdup (fixed_path);		\
+    } 
diff --git a/gcc-4.9/gcc/config/i386/xm-mingw32.h b/gcc-4.9/gcc/config/i386/xm-mingw32.h
new file mode 100644
index 000000000..b6d87a42a
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/xm-mingw32.h
@@ -0,0 +1,40 @@
+/* Configuration for GCC for hosting on Windows32.
+   using GNU tools and the Windows32 API Library.
+   Copyright (C) 1997-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define HOST_EXECUTABLE_SUFFIX ".exe"
+
+#undef PATH_SEPARATOR
+#define PATH_SEPARATOR ';'
+
+/* This is the name of the null device on windows.  */
+#define HOST_BIT_BUCKET "nul"
+
+/*  The st_ino field of struct stat is always 0.  */
+#define HOST_LACKS_INODE_NUMBERS
+
+#ifdef __MINGW32__
+#undef __USE_MINGW_ANSI_STDIO
+#define __USE_MINGW_ANSI_STDIO 1
+#else
+/* MSVCRT does not support the "ll" format specifier for printing
+   "long long" values.  Instead, we use "I64".  */
+#define HOST_LONG_LONG_FORMAT "I64"
+#endif
+
diff --git a/gcc-4.9/gcc/config/i386/xmmintrin.h b/gcc-4.9/gcc/config/i386/xmmintrin.h
new file mode 100644
index 000000000..a3824e73f
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/xmmintrin.h
@@ -0,0 +1,1265 @@
+/* Copyright (C) 2002-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef _XMMINTRIN_H_INCLUDED
+#define _XMMINTRIN_H_INCLUDED
+
+/* We need type definitions from the MMX header file.  */
+#include <mmintrin.h>
+
+/* Get _mm_malloc () and _mm_free ().  */
+#include <mm_malloc.h>
+
+/* Constants for use with _mm_prefetch.  */
+enum _mm_hint
+{
+  /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit.  */
+  _MM_HINT_ET0 = 7,
+  _MM_HINT_ET1 = 6,
+  _MM_HINT_T0 = 3,
+  _MM_HINT_T1 = 2,
+  _MM_HINT_T2 = 1,
+  _MM_HINT_NTA = 0
+};
+
+/* Loads one cache line from address P to a location "closer" to the
+   processor.  The selector I specifies the type of prefetch operation.  */
+#ifdef __OPTIMIZE__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_prefetch (const void *__P, enum _mm_hint __I)
+{
+  __builtin_prefetch (__P, (__I & 0x4) >> 2, __I & 0x3);
+}
+#else
+#define _mm_prefetch(P, I) \
+  __builtin_prefetch ((P), ((I & 0x4) >> 2), (I & 0x3))
+#endif
+
+#ifndef __SSE__
+#pragma GCC push_options
+#pragma GCC target("sse")
+#define __DISABLE_SSE__
+#endif /* __SSE__ */
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Internal data types for implementing the intrinsics.  */
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+/* Create a selector for use with the SHUFPS instruction.  */
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
+/* Bits in the MXCSR.  */
+#define _MM_EXCEPT_MASK       0x003f
+#define _MM_EXCEPT_INVALID    0x0001
+#define _MM_EXCEPT_DENORM     0x0002
+#define _MM_EXCEPT_DIV_ZERO   0x0004
+#define _MM_EXCEPT_OVERFLOW   0x0008
+#define _MM_EXCEPT_UNDERFLOW  0x0010
+#define _MM_EXCEPT_INEXACT    0x0020
+
+#define _MM_MASK_MASK         0x1f80
+#define _MM_MASK_INVALID      0x0080
+#define _MM_MASK_DENORM       0x0100
+#define _MM_MASK_DIV_ZERO     0x0200
+#define _MM_MASK_OVERFLOW     0x0400
+#define _MM_MASK_UNDERFLOW    0x0800
+#define _MM_MASK_INEXACT      0x1000
+
+#define _MM_ROUND_MASK        0x6000
+#define _MM_ROUND_NEAREST     0x0000
+#define _MM_ROUND_DOWN        0x2000
+#define _MM_ROUND_UP          0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+
+#define _MM_FLUSH_ZERO_MASK   0x8000
+#define _MM_FLUSH_ZERO_ON     0x8000
+#define _MM_FLUSH_ZERO_OFF    0x0000
+
+/* Create an undefined vector.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_ps (void)
+{
+  __m128 __Y = __Y;
+  return __Y;
+}
+
+/* Create a vector of zeros.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_ps (void)
+{
+  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
+}
+
+/* Perform the respective operation on the lower SPFP (single-precision
+   floating-point) values of A and B; the upper three SPFP values are
+   passed through from A.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ss (__m128 __A)
+{
+  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ss (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ss (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Perform the respective operation on the four SPFP values in A and B.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Perform logical bit-wise operations on 128-bit values.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_ps (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_andps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_ps (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_andnps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_ps (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_orps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_ps (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_xorps (__A, __B);
+}
+
+/* Perform a comparison on the lower SPFP values of A and B.  If the
+   comparison is true, place a mask of all ones in the result, otherwise a
+   mask of zeros.  The upper three SPFP values are passed through from A.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+					(__v4sf)
+					__builtin_ia32_cmpltss ((__v4sf) __B,
+								(__v4sf)
+								__A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+					(__v4sf)
+					__builtin_ia32_cmpless ((__v4sf) __B,
+								(__v4sf)
+								__A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+					(__v4sf)
+					__builtin_ia32_cmpnltss ((__v4sf) __B,
+								 (__v4sf)
+								 __A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+					(__v4sf)
+					__builtin_ia32_cmpnless ((__v4sf) __B,
+								 (__v4sf)
+								 __A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Perform a comparison on the four SPFP values of A and B.  For each
+   element, if the comparison is true, place a mask of all ones in the
+   result, otherwise a mask of zeros.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Compare the lower SPFP values of A and B and return 1 if true
+   and 0 if false.  */
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Convert the lower SPFP value to a 32-bit integer according to the current
+   rounding mode.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si32 (__m128 __A)
+{
+  return __builtin_ia32_cvtss2si ((__v4sf) __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ss2si (__m128 __A)
+{
+  return _mm_cvtss_si32 (__A);
+}
+
+#ifdef __x86_64__
+/* Convert the lower SPFP value to a 32-bit integer according to the
+   current rounding mode.  */
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64 (__m128 __A)
+{
+  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64x (__m128 __A)
+{
+  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
+}
+#endif
+
+/* Convert the two lower SPFP values to 32-bit integers according to the
+   current rounding mode.  Return the integers in packed form.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi32 (__m128 __A)
+{
+  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ps2pi (__m128 __A)
+{
+  return _mm_cvtps_pi32 (__A);
+}
+
+/* Truncate the lower SPFP value to a 32-bit integer.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si32 (__m128 __A)
+{
+  return __builtin_ia32_cvttss2si ((__v4sf) __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ss2si (__m128 __A)
+{
+  return _mm_cvttss_si32 (__A);
+}
+
+#ifdef __x86_64__
+/* Truncate the lower SPFP value to a 32-bit integer.  */
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64 (__m128 __A)
+{
+  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64x (__m128 __A)
+{
+  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
+}
+#endif
+
+/* Truncate the two lower SPFP values to 32-bit integers.  Return the
+   integers in packed form.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_pi32 (__m128 __A)
+{
+  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ps2pi (__m128 __A)
+{
+  return _mm_cvttps_pi32 (__A);
+}
+
+/* Convert B to a SPFP value and insert it as element zero in A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_ss (__m128 __A, int __B)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_si2ss (__m128 __A, int __B)
+{
+  return _mm_cvtsi32_ss (__A, __B);
+}
+
+#ifdef __x86_64__
+/* Convert B to a SPFP value and insert it as element zero in A.  */
+
+/* Intel intrinsic.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_ss (__m128 __A, long long __B)
+{
+  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_ss (__m128 __A, long long __B)
+{
+  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
+}
+#endif
+
+/* Convert the two 32-bit values in B to SPFP form and insert them
+   as the two lower elements in A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_ps (__m128 __A, __m64 __B)
+{
+  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_pi2ps (__m128 __A, __m64 __B)
+{
+  return _mm_cvtpi32_ps (__A, __B);
+}
+
+/* Convert the four signed 16-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi16_ps (__m64 __A)
+{
+  __v4hi __sign;
+  __v2si __hisi, __losi;
+  __v4sf __zero, __ra, __rb;
+
+  /* This comparison against zero gives us a mask that can be used to
+     fill in the missing sign bits in the unpack operations below, so
+     that we get signed values after unpacking.  */
+  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);
+
+  /* Convert the four words to doublewords.  */
+  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
+  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
+
+  /* Convert the doublewords to floating point two at a time.  */
+  __zero = (__v4sf) _mm_setzero_ps ();
+  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
+  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);
+
+  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
+}
+
+/* Convert the four unsigned 16-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpu16_ps (__m64 __A)
+{
+  __v2si __hisi, __losi;
+  __v4sf __zero, __ra, __rb;
+
+  /* Convert the four words to doublewords.  */
+  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
+  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
+
+  /* Convert the doublewords to floating point two at a time.  */
+  __zero = (__v4sf) _mm_setzero_ps ();
+  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
+  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);
+
+  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
+}
+
+/* Convert the low four signed 8-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi8_ps (__m64 __A)
+{
+  __v8qi __sign;
+
+  /* This comparison against zero gives us a mask that can be used to
+     fill in the missing sign bits in the unpack operations below, so
+     that we get signed values after unpacking.  */
+  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);
+
+  /* Convert the four low bytes to words.  */
+  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);
+
+  return _mm_cvtpi16_ps(__A);
+}
+
+/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpu8_ps(__m64 __A)
+{
+  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
+  return _mm_cvtpu16_ps(__A);
+}
+
+/* Convert the four signed 32-bit values in A and B to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
+{
+  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
+  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
+  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
+  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
+}
+
+/* Convert the four SPFP values in A to four signed 16-bit integers.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi16(__m128 __A)
+{
+  __v4sf __hisf = (__v4sf)__A;
+  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
+  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
+  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
+  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
+}
+
+/* Convert the four SPFP values in A to four signed 8-bit integers.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi8(__m128 __A)
+{
+  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
+  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
+}
+
+/* Selects four specific SPFP values from A and B based on MASK.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
+{
+  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
+}
+#else
+#define _mm_shuffle_ps(A, B, MASK)					\
+  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A),			\
+				   (__v4sf)(__m128)(B), (int)(MASK)))
+#endif
+
+/* Selects and interleaves the upper two SPFP values from A and B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Selects and interleaves the lower two SPFP values from A and B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Sets the upper two SPFP values with 64-bits of data loaded from P;
+   the lower two values are passed through from A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadh_pi (__m128 __A, __m64 const *__P)
+{
+  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
+}
+
+/* Stores the upper two SPFP values of A into P.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeh_pi (__m64 *__P, __m128 __A)
+{
+  __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
+}
+
+/* Moves the upper two values of B into the lower two values of A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehl_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Moves the lower two values of B into the upper two values of A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movelh_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Sets the lower two SPFP values with 64-bits of data loaded from P;
+   the upper two values are passed through from A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_pi (__m128 __A, __m64 const *__P)
+{
+  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
+}
+
+/* Stores the lower two SPFP values of A into P.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_pi (__m64 *__P, __m128 __A)
+{
+  __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
+}
+
+/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_ps (__m128 __A)
+{
+  return __builtin_ia32_movmskps ((__v4sf)__A);
+}
+
+/* Return the contents of the control register.  */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getcsr (void)
+{
+  return __builtin_ia32_stmxcsr ();
+}
+
+/* Read exception bits from the control register.  */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_EXCEPTION_STATE (void)
+{
+  return _mm_getcsr() & _MM_EXCEPT_MASK;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_EXCEPTION_MASK (void)
+{
+  return _mm_getcsr() & _MM_MASK_MASK;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_ROUNDING_MODE (void)
+{
+  return _mm_getcsr() & _MM_ROUND_MASK;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_FLUSH_ZERO_MODE (void)
+{
+  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
+}
+
+/* Set the control register to I.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setcsr (unsigned int __I)
+{
+  __builtin_ia32_ldmxcsr (__I);
+}
+
+/* Set exception bits in the control register.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_EXCEPTION_STATE(unsigned int __mask)
+{
+  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_EXCEPTION_MASK (unsigned int __mask)
+{
+  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_ROUNDING_MODE (unsigned int __mode)
+{
+  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
+{
+  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
+}
+
+/* Create a vector with element 0 as F and the rest zero.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ss (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
+}
+
+/* Create a vector with all four elements equal to F.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_ps (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps1 (float __F)
+{
+  return _mm_set1_ps (__F);
+}
+
+/* Create a vector with element 0 as *P and the rest zero.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ss (float const *__P)
+{
+  return _mm_set_ss (*__P);
+}
+
+/* Create a vector with all four elements equal to *P.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load1_ps (float const *__P)
+{
+  return _mm_set1_ps (*__P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps1 (float const *__P)
+{
+  return _mm_load1_ps (__P);
+}
+
+/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps (float const *__P)
+{
+  return (__m128) *(__v4sf *)__P;
+}
+
+/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ps (float const *__P)
+{
+  return (__m128) __builtin_ia32_loadups (__P);
+}
+
+/* Load four SPFP values in reverse order.  The address must be aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_ps (float const *__P)
+{
+  __v4sf __tmp = *(__v4sf *)__P;
+  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
+}
+
+/* Create the vector [Z Y X W].  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
+}
+
+/* Create the vector [W X Y Z].  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_ps (float __Z, float __Y, float __X, float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
+}
+
+/* Stores the lower SPFP value.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ss (float *__P, __m128 __A)
+{
+  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+}
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_f32 (__m128 __A)
+{
+  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+}
+
+/* Store four SPFP values.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps (float *__P, __m128 __A)
+{
+  *(__v4sf *)__P = (__v4sf)__A;
+}
+
+/* Store four SPFP values.  The address need not be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ps (float *__P, __m128 __A)
+{
+  __builtin_ia32_storeups (__P, (__v4sf)__A);
+}
+
+/* Store the lower SPFP value across four words.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store1_ps (float *__P, __m128 __A)
+{
+  __v4sf __va = (__v4sf)__A;
+  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
+  _mm_storeu_ps (__P, __tmp);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps1 (float *__P, __m128 __A)
+{
+  _mm_store1_ps (__P, __A);
+}
+
+/* Store four SPFP values in reverse order.  The address must be aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storer_ps (float *__P, __m128 __A)
+{
+  __v4sf __va = (__v4sf)__A;
+  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
+  _mm_store_ps (__P, __tmp);
+}
+
+/* Sets the low SPFP value of A from the low value of B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Extracts one of the four words of A.  The selector N must be immediate.  */
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_pi16 (__m64 const __A, int const __N)
+{
+  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pextrw (__m64 const __A, int const __N)
+{
+  return _mm_extract_pi16 (__A, __N);
+}
+#else
+#define _mm_extract_pi16(A, N)	\
+  ((int) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))
+
+#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
+#endif
+
+/* Inserts word D into one of four words of A.  The selector N must be
+   immediate.  */
+#ifdef __OPTIMIZE__
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
+{
+  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pinsrw (__m64 const __A, int const __D, int const __N)
+{
+  return _mm_insert_pi16 (__A, __D, __N);
+}
+#else
+#define _mm_insert_pi16(A, D, N)				\
+  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A),	\
+					(int)(D), (int)(N)))
+
+#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
+#endif
+
+/* Compute the element-wise maximum of signed 16-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pi16 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaxsw (__m64 __A, __m64 __B)
+{
+  return _mm_max_pi16 (__A, __B);
+}
+
+/* Compute the element-wise maximum of unsigned 8-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pu8 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaxub (__m64 __A, __m64 __B)
+{
+  return _mm_max_pu8 (__A, __B);
+}
+
+/* Compute the element-wise minimum of signed 16-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pi16 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pminsw (__m64 __A, __m64 __B)
+{
+  return _mm_min_pi16 (__A, __B);
+}
+
+/* Compute the element-wise minimum of unsigned 8-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pu8 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pminub (__m64 __A, __m64 __B)
+{
+  return _mm_min_pu8 (__A, __B);
+}
+
+/* Create an 8-bit mask of the signs of 8-bit values.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pi8 (__m64 __A)
+{
+  return __builtin_ia32_pmovmskb ((__v8qi)__A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmovmskb (__m64 __A)
+{
+  return _mm_movemask_pi8 (__A);
+}
+
+/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
+   in B and produce the high 16 bits of the 32-bit results.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhuw (__m64 __A, __m64 __B)
+{
+  return _mm_mulhi_pu16 (__A, __B);
+}
+
+/* Return a combination of the four 16-bit values in A.  The selector
+   must be an immediate.  */
+#ifdef __OPTIMIZE__
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __A, int const __N)
+{
+  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pshufw (__m64 __A, int const __N)
+{
+  return _mm_shuffle_pi16 (__A, __N);
+}
+#else
+#define _mm_shuffle_pi16(A, N) \
+  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))
+
+#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
+#endif
+
+/* Conditionally store byte elements of A into P.  The high bit of each
+   byte in the selector N determines whether the corresponding byte from
+   A is stored.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
+{
+  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_maskmovq (__m64 __A, __m64 __N, char *__P)
+{
+  _mm_maskmove_si64 (__A, __N, __P);
+}
+
+/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu8 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgb (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu8 (__A, __B);
+}
+
+/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu16 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgw (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu16 (__A, __B);
+}
+
+/* Compute the sum of the absolute differences of the unsigned 8-bit
+   values in A and B.  Return the value in the lower 16-bit word; the
+   upper words are cleared.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_pu8 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psadbw (__m64 __A, __m64 __B)
+{
+  return _mm_sad_pu8 (__A, __B);
+}
+
+/* Stores the data in A to the address P without polluting the caches.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pi (__m64 *__P, __m64 __A)
+{
+  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
+}
+
+/* Likewise.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ps (float *__P, __m128 __A)
+{
+  __builtin_ia32_movntps (__P, (__v4sf)__A);
+}
+
+/* Guarantees that every preceding store is globally visible before
+   any subsequent store.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sfence (void)
+{
+  __builtin_ia32_sfence ();
+}
+
+/* The execution of the next instruction is delayed by an implementation
+   specific amount of time.  The instruction does not modify the
+   architectural state.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_pause (void)
+{
+  __builtin_ia32_pause ();
+}
+
+/* Transpose the 4x4 matrix composed of row[0-3].  */
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
+do {									\
+  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
+  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);			\
+  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3);			\
+  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1);			\
+  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);			\
+  (row0) = __builtin_ia32_movlhps (__t0, __t1);				\
+  (row1) = __builtin_ia32_movhlps (__t1, __t0);				\
+  (row2) = __builtin_ia32_movlhps (__t2, __t3);				\
+  (row3) = __builtin_ia32_movhlps (__t3, __t2);				\
+} while (0)
+
+/* For backward source compatibility.  */
+# include <emmintrin.h>
+
+#ifdef __DISABLE_SSE__
+#undef __DISABLE_SSE__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE__ */
+
+#endif /* _XMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/xopintrin.h b/gcc-4.9/gcc/config/i386/xopintrin.h
new file mode 100644
index 000000000..cc82bc5fa
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/xopintrin.h
@@ -0,0 +1,844 @@
+/* Copyright (C) 2007-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _XOPMMINTRIN_H_INCLUDED
+#define _XOPMMINTRIN_H_INCLUDED
+
+#include <fma4intrin.h>
+
+#ifndef __XOP__
+#pragma GCC push_options
+#pragma GCC target("xop")
+#define __DISABLE_XOP__
+#endif /* __XOP__ */
+
+/* Integer multiply/add intructions. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacssww ((__v8hi)__A,(__v8hi)__B, (__v8hi)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsww ((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacsswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacssdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacsdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacssdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacsdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacssdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacsdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmadcsswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmadcswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
+}
+
+/* Packed Integer Horizontal Add and Subtract */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddw_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddbw ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddbd ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddbq ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epi16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddwd ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epi16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddwq ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epi32(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphadddq ((__v4si)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddw_epu8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddubw ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epu8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddubd ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epu8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddubq ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epu16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphadduwd ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epu16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphadduwq ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epu32(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddudq ((__v4si)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubw_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphsubbw ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubd_epi16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphsubwd ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubq_epi32(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphsubdq ((__v4si)__A);
+}
+
+/* Vector conditional move and permute */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpcmov (__A, __B, __C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpperm ((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+/* Packed Integer Rotates and Shifts
+   Rotates - Non-Immediate form */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi8(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vprotb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi16(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vprotw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi32(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vprotd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi64(__m128i __A,  __m128i __B)
+{
+  return (__m128i)  __builtin_ia32_vprotq ((__v2di)__A, (__v2di)__B);
+}
+
+/* Rotates - Immediate form */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi8(__m128i __A, const int __B)
+{
+  return  (__m128i) __builtin_ia32_vprotbi ((__v16qi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi16(__m128i __A, const int __B)
+{
+  return  (__m128i) __builtin_ia32_vprotwi ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi32(__m128i __A, const int __B)
+{
+  return  (__m128i) __builtin_ia32_vprotdi ((__v4si)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi64(__m128i __A, const int __B)
+{
+  return  (__m128i) __builtin_ia32_vprotqi ((__v2di)__A, __B);
+}
+#else
+#define _mm_roti_epi8(A, N) \
+  ((__m128i) __builtin_ia32_vprotbi ((__v16qi)(__m128i)(A), (int)(N)))
+#define _mm_roti_epi16(A, N) \
+  ((__m128i) __builtin_ia32_vprotwi ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_roti_epi32(A, N) \
+  ((__m128i) __builtin_ia32_vprotdi ((__v4si)(__m128i)(A), (int)(N)))
+#define _mm_roti_epi64(A, N) \
+  ((__m128i) __builtin_ia32_vprotqi ((__v2di)(__m128i)(A), (int)(N)))
+#endif
+
+/* Shifts */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi8(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshlb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi16(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshlw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi32(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshld ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi64(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshlq ((__v2di)__A, (__v2di)__B);
+}
+
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi8(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshab ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi16(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshaw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi32(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshad ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi64(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshaq ((__v2di)__A, (__v2di)__B);
+}
+
+/* Compare and Predicate Generation
+   pcom (integer, unsinged bytes) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomequb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomnequb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueub ((__v16qi)__A, (__v16qi)__B);
+}
+
+/*pcom (integer, unsinged words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomequw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomnequw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+/*pcom (integer, unsinged double words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomequd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomnequd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueud ((__v4si)__A, (__v4si)__B);
+}
+
+/*pcom (integer, unsinged quad words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomequq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomnequq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueuq ((__v2di)__A, (__v2di)__B);
+}
+
+/*pcom (integer, signed bytes) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomeqb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomneqb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueb ((__v16qi)__A, (__v16qi)__B);
+}
+
+/*pcom (integer, signed words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomlew ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgew ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomeqw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomneqw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalsew ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtruew ((__v8hi)__A, (__v8hi)__B);
+}
+
+/*pcom (integer, signed double words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomled ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomged ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomeqd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomneqd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalsed ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrued ((__v4si)__A, (__v4si)__B);
+}
+
+/*pcom (integer, signed quad words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomeqq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomneqq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueq ((__v2di)__A, (__v2di)__B);
+}
+
+/* FRCZ */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_vfrczps ((__v4sf)__A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_vfrczpd ((__v2df)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf)__A,
+					(__v4sf)
+					__builtin_ia32_vfrczss ((__v4sf)__B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df)__A,
+					 (__v2df)
+					 __builtin_ia32_vfrczsd ((__v2df)__B));
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_frcz_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_vfrczps256 ((__v8sf)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_frcz_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A);
+}
+
+/* PERMIL2 */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I)
+{
+  return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X,
+					      (__v2df)__Y,
+					      (__v2di)__C,
+					      __I);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I)
+{
+  return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X,
+						 (__v4df)__Y,
+						 (__v4di)__C,
+						 __I);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I)
+{
+  return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X,
+					     (__v4sf)__Y,
+					     (__v4si)__C,
+					     __I);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I)
+{
+  return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X,
+						(__v8sf)__Y,
+						(__v8si)__C,
+						__I);
+}
+#else
+#define _mm_permute2_pd(X, Y, C, I)					\
+  ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X),		\
+					(__v2df)(__m128d)(Y),		\
+					(__v2di)(__m128d)(C),		\
+					(int)(I)))
+
+#define _mm256_permute2_pd(X, Y, C, I)					\
+  ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X),	\
+					   (__v4df)(__m256d)(Y),	\
+					   (__v4di)(__m256d)(C),	\
+					   (int)(I)))
+
+#define _mm_permute2_ps(X, Y, C, I)					\
+  ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X),		\
+				       (__v4sf)(__m128)(Y),		\
+				       (__v4si)(__m128)(C),		\
+				       (int)(I)))
+
+#define _mm256_permute2_ps(X, Y, C, I)					\
+  ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X),		\
+					  (__v8sf)(__m256)(Y),  	\
+					  (__v8si)(__m256)(C),		\
+ 					  (int)(I)))
+#endif /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_XOP__
+#undef __DISABLE_XOP__
+#pragma GCC pop_options
+#endif /* __DISABLE_XOP__ */
+
+#endif /* _XOPMMINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/xsaveintrin.h b/gcc-4.9/gcc/config/i386/xsaveintrin.h
new file mode 100644
index 000000000..47be25f0c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/xsaveintrin.h
@@ -0,0 +1,72 @@
+/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* #if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED */
+/* # error "Never use <xsaveintrin.h> directly; include <x86intrin.h> instead." */
+/* #endif */
+
+#ifndef _XSAVEINTRIN_H_INCLUDED
+#define _XSAVEINTRIN_H_INCLUDED
+
+#ifndef __XSAVE__
+#pragma GCC push_options
+#pragma GCC target("xsave")
+#define __DISABLE_XSAVE__
+#endif /* __XSAVE__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsave (void *__P, long long __M)
+{
+  return __builtin_ia32_xsave (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstor (void *__P, long long __M)
+{
+  return __builtin_ia32_xrstor (__P, __M);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsave64 (void *__P, long long __M)
+{
+  return __builtin_ia32_xsave64 (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstor64 (void *__P, long long __M)
+{
+  return __builtin_ia32_xrstor64 (__P, __M);
+}
+#endif
+
+#ifdef __DISABLE_XSAVE__
+#undef __DISABLE_XSAVE__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVE__ */
+
+#endif /* _XSAVEINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/xsaveoptintrin.h b/gcc-4.9/gcc/config/i386/xsaveoptintrin.h
new file mode 100644
index 000000000..d7534b41c
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/xsaveoptintrin.h
@@ -0,0 +1,58 @@
+/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* #if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED */
+/* # error "Never use <xsaveoptintrin.h> directly; include <x86intrin.h> instead." */
+/* #endif */
+
+#ifndef _XSAVEOPTINTRIN_H_INCLUDED
+#define _XSAVEOPTINTRIN_H_INCLUDED
+
+#ifndef __XSAVEOPT__
+#pragma GCC push_options
+#pragma GCC target("xsaveopt")
+#define __DISABLE_XSAVEOPT__
+#endif /* __XSAVEOPT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaveopt (void *__P, long long __M)
+{
+  return __builtin_ia32_xsaveopt (__P, __M);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaveopt64 (void *__P, long long __M)
+{
+  return __builtin_ia32_xsaveopt64 (__P, __M);
+}
+#endif
+
+#ifdef __DISABLE_XSAVEOPT__
+#undef __DISABLE_XSAVEOPT__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVEOPT__ */
+
+#endif /* _XSAVEOPTINTRIN_H_INCLUDED */
diff --git a/gcc-4.9/gcc/config/i386/xtestintrin.h b/gcc-4.9/gcc/config/i386/xtestintrin.h
new file mode 100644
index 000000000..ba79e5c5e
--- /dev/null
+++ b/gcc-4.9/gcc/config/i386/xtestintrin.h
@@ -0,0 +1,51 @@
+/* Copyright (C) 2012-2014 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <xtestintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _XTESTINTRIN_H_INCLUDED
+#define _XTESTINTRIN_H_INCLUDED
+
+#ifndef __RTM__
+#pragma GCC push_options
+#pragma GCC target("rtm")
+#define __DISABLE_RTM__
+#endif /* __RTM__ */
+
+/* Return non-zero if the instruction executes inside an RTM or HLE code
+   region.  Return zero otherwise.   */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xtest (void)
+{
+  return __builtin_ia32_xtest ();
+}
+
+#ifdef __DISABLE_RTM__
+#undef __DISABLE_RTM__
+#pragma GCC pop_options
+#endif /* __DISABLE_RTM__ */
+
+#endif /* _XTESTINTRIN_H_INCLUDED */
author	Ben Cheng <bccheng@google.com>	2014-03-25 22:37:19 -0700
committer	Ben Cheng <bccheng@google.com>	2014-03-25 22:37:19 -0700
commit	1bc5aee63eb72b341f506ad058502cd0361f0d10 (patch)
tree	c607e8252f3405424ff15bc2d00aa38dadbb2518 /gcc-4.9/gcc/config/i386
parent	283a0bf58fcf333c58a2a92c3ebbc41fb9eb1fdb (diff)
download	toolchain_gcc-1bc5aee63eb72b341f506ad058502cd0361f0d10.tar.gz toolchain_gcc-1bc5aee63eb72b341f506ad058502cd0361f0d10.tar.bz2 toolchain_gcc-1bc5aee63eb72b341f506ad058502cd0361f0d10.zip