Diffstat (limited to 'gcc-4.9/libgcc/config/tilepro')
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/atomic.c         397
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/atomic.h         435
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/linux-unwind.h    99
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/sfp-machine.h     59
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/softdivide.c     353
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/softmpy.S         94
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/t-crtstuff         4
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/t-tilepro         33
8 files changed, 1474 insertions, 0 deletions
diff --git a/gcc-4.9/libgcc/config/tilepro/atomic.c b/gcc-4.9/libgcc/config/tilepro/atomic.c
new file mode 100644
index 000000000..66ef8fd7d
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/atomic.c
@@ -0,0 +1,397 @@
+/* TILE atomics.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Contributed by Walter Lee (walt@tilera.com)
+
+ This file is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 3, or (at your option) any
+ later version.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "tconfig.h"
+#include "coretypes.h"
+#include "atomic.h"
+
+#define bool unsigned char
+
+/* This code should be inlined by the compiler, but for now support
+ it as out-of-line methods in libgcc. */
+
+static inline void
+pre_atomic_barrier (int model)
+{
+ switch ((enum memmodel) model)
+ {
+ case MEMMODEL_RELEASE:
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ __atomic_thread_fence (model);
+ break;
+ default:
+ break;
+ }
+ return;
+}
+
+static inline void
+post_atomic_barrier (int model)
+{
+ switch ((enum memmodel) model)
+ {
+ case MEMMODEL_ACQUIRE:
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ __atomic_thread_fence (model);
+ break;
+ default:
+ break;
+ }
+ return;
+}
+
+#define __unused __attribute__((unused))
+
+#define __fetch_and_do(proto, type, size, opname, top, bottom) \
+proto \
+{ \
+ top; \
+ type rv = arch_atomic_##opname(p, i); \
+ bottom; \
+ return rv; \
+}
+
+#define __atomic_fetch_and_do(type, size, opname) \
+ __fetch_and_do(type __atomic_fetch_##opname##_##size(type* p, type i, int model), \
+ type, size, opname, \
+ pre_atomic_barrier(model), \
+ post_atomic_barrier(model)) \
+
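+/* For example, __atomic_fetch_and_do (int, 4, add) expands to roughly
+ the following out-of-line function (illustrative expansion only):
+
+ int __atomic_fetch_add_4 (int *p, int i, int model)
+ {
+ pre_atomic_barrier (model);
+ int rv = arch_atomic_add (p, i);
+ post_atomic_barrier (model);
+ return rv;
+ }
+*/
+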
+__atomic_fetch_and_do (int, 4, add)
+__atomic_fetch_and_do (int, 4, sub)
+__atomic_fetch_and_do (int, 4, or)
+__atomic_fetch_and_do (int, 4, and)
+__atomic_fetch_and_do (int, 4, xor)
+__atomic_fetch_and_do (int, 4, nand)
+__atomic_fetch_and_do (long long, 8, add)
+__atomic_fetch_and_do (long long, 8, sub)
+__atomic_fetch_and_do (long long, 8, or)
+__atomic_fetch_and_do (long long, 8, and)
+__atomic_fetch_and_do (long long, 8, xor)
+__atomic_fetch_and_do (long long, 8, nand)
+
+#define __sync_fetch_and_do(type, size, opname) \
+ __fetch_and_do(type __sync_fetch_and_##opname##_##size(type* p, type i), \
+ type, size, opname, \
+ arch_atomic_write_barrier(), \
+ arch_atomic_read_barrier())
+
+__sync_fetch_and_do (int, 4, add)
+__sync_fetch_and_do (int, 4, sub)
+__sync_fetch_and_do (int, 4, or)
+__sync_fetch_and_do (int, 4, and)
+__sync_fetch_and_do (int, 4, xor)
+__sync_fetch_and_do (int, 4, nand)
+__sync_fetch_and_do (long long, 8, add)
+__sync_fetch_and_do (long long, 8, sub)
+__sync_fetch_and_do (long long, 8, or)
+__sync_fetch_and_do (long long, 8, and)
+__sync_fetch_and_do (long long, 8, xor)
+__sync_fetch_and_do (long long, 8, nand)
+
+#define __do_and_fetch(proto, type, size, opname, op, op2, top, bottom) \
+proto \
+{ \
+ top; \
+ type rv = op2 (arch_atomic_##opname(p, i) op i); \
+ bottom; \
+ return rv; \
+}
+
+#define __atomic_do_and_fetch(type, size, opname, op, op2) \
+ __do_and_fetch(type __atomic_##opname##_fetch_##size(type* p, type i, int model), \
+ type, size, opname, op, op2, \
+ pre_atomic_barrier(model), \
+ post_atomic_barrier(model)) \
+
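+/* For example, __atomic_do_and_fetch (int, 4, add, +, ) expands to
+ roughly the following, returning the updated value rather than the
+ old one (illustrative expansion only):
+
+ int __atomic_add_fetch_4 (int *p, int i, int model)
+ {
+ pre_atomic_barrier (model);
+ int rv = (arch_atomic_add (p, i) + i);
+ post_atomic_barrier (model);
+ return rv;
+ }
+*/
+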
+__atomic_do_and_fetch (int, 4, add, +, )
+__atomic_do_and_fetch (int, 4, sub, -, )
+__atomic_do_and_fetch (int, 4, or, |, )
+__atomic_do_and_fetch (int, 4, and, &, )
+__atomic_do_and_fetch (int, 4, xor, ^, )
+__atomic_do_and_fetch (int, 4, nand, &, ~)
+__atomic_do_and_fetch (long long, 8, add, +, )
+__atomic_do_and_fetch (long long, 8, sub, -, )
+__atomic_do_and_fetch (long long, 8, or, |, )
+__atomic_do_and_fetch (long long, 8, and, &, )
+__atomic_do_and_fetch (long long, 8, xor, ^, )
+__atomic_do_and_fetch (long long, 8, nand, &, ~)
+
+#define __sync_do_and_fetch(type, size, opname, op, op2) \
+ __do_and_fetch(type __sync_##opname##_and_fetch_##size(type* p, type i), \
+ type, size, opname, op, op2, \
+ arch_atomic_write_barrier(), \
+ arch_atomic_read_barrier()) \
+
+__sync_do_and_fetch (int, 4, add, +, )
+__sync_do_and_fetch (int, 4, sub, -, )
+__sync_do_and_fetch (int, 4, or, |, )
+__sync_do_and_fetch (int, 4, and, &, )
+__sync_do_and_fetch (int, 4, xor, ^, )
+__sync_do_and_fetch (int, 4, nand, &, ~)
+__sync_do_and_fetch (long long, 8, add, +, )
+__sync_do_and_fetch (long long, 8, sub, -, )
+__sync_do_and_fetch (long long, 8, or, |, )
+__sync_do_and_fetch (long long, 8, and, &, )
+__sync_do_and_fetch (long long, 8, xor, ^, )
+__sync_do_and_fetch (long long, 8, nand, &, ~)
+
+#define __atomic_exchange_methods(type, size) \
+bool \
+__atomic_compare_exchange_##size(volatile type* ptr, type* oldvalp, \
+ type newval, bool weak __unused, \
+ int models, int modelf __unused) \
+{ \
+ type oldval = *oldvalp; \
+ pre_atomic_barrier(models); \
+ type retval = arch_atomic_val_compare_and_exchange(ptr, oldval, newval); \
+ post_atomic_barrier(models); \
+ bool success = (retval == oldval); \
+ *oldvalp = retval; \
+ return success; \
+} \
+ \
+type \
+__atomic_exchange_##size(volatile type* ptr, type val, int model) \
+{ \
+ pre_atomic_barrier(model); \
+ type retval = arch_atomic_exchange(ptr, val); \
+ post_atomic_barrier(model); \
+ return retval; \
+}
+
+__atomic_exchange_methods (int, 4)
+__atomic_exchange_methods (long long, 8)
+
+#define __sync_exchange_methods(type, size) \
+type \
+__sync_val_compare_and_swap_##size(type* ptr, type oldval, type newval) \
+{ \
+ arch_atomic_write_barrier(); \
+ type retval = arch_atomic_val_compare_and_exchange(ptr, oldval, newval); \
+ arch_atomic_read_barrier(); \
+ return retval; \
+} \
+ \
+bool \
+__sync_bool_compare_and_swap_##size(type* ptr, type oldval, type newval) \
+{ \
+ arch_atomic_write_barrier(); \
+ bool retval = arch_atomic_bool_compare_and_exchange(ptr, oldval, newval); \
+ arch_atomic_read_barrier(); \
+ return retval; \
+} \
+ \
+type \
+__sync_lock_test_and_set_##size(type* ptr, type val) \
+{ \
+ type retval = arch_atomic_exchange(ptr, val); \
+ arch_atomic_acquire_barrier_value(retval); \
+ return retval; \
+}
+
+__sync_exchange_methods (int, 4)
+__sync_exchange_methods (long long, 8)
+
+#ifdef __LITTLE_ENDIAN__
+#define BIT_OFFSET(n, type) ((n) * 8)
+#else
+#define BIT_OFFSET(n, type) ((4 - sizeof(type) - (n)) * 8)
+#endif
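+
+/* For example, a 2-byte value at byte offset 2 within its aligned
+ 4-byte word gets BIT_OFFSET 16 on a little-endian chip (2 * 8) and 0
+ on a big-endian chip ((4 - 2 - 2) * 8). */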
+
+/* Subword methods require the same approach for both TILEPro and
+ TILE-Gx. We load the background data for the word, insert the
+ desired subword piece, then compare-and-exchange it into place. */
+#define u8 unsigned char
+#define u16 unsigned short
+
+#define __subword_cmpxchg_body(type, size, ptr, guess, val) \
+ ({ \
+ unsigned int *p = (unsigned int *)((unsigned long)ptr & ~3UL); \
+ const int shift = BIT_OFFSET((unsigned long)ptr & 3UL, type); \
+ const unsigned int valmask = (1 << (sizeof(type) * 8)) - 1; \
+ const unsigned int bgmask = ~(valmask << shift); \
+ unsigned int oldword = *p; \
+ type oldval = (oldword >> shift) & valmask; \
+ if (__builtin_expect((oldval == guess), 1)) { \
+ unsigned int word = (oldword & bgmask) | ((val & valmask) << shift); \
+ oldword = arch_atomic_val_compare_and_exchange(p, oldword, word); \
+ oldval = (oldword >> shift) & valmask; \
+ } \
+ oldval; \
+ }) \
+
+#define __atomic_subword_cmpxchg(type, size) \
+ \
+bool \
+__atomic_compare_exchange_##size(volatile type* ptr, type* guess_ptr, \
+ type val, bool weak __unused, int models, \
+ int modelf __unused) \
+{ \
+ pre_atomic_barrier(models); \
+ type guess = *guess_ptr; \
+ type oldval = __subword_cmpxchg_body(type, size, ptr, guess, val); \
+ post_atomic_barrier(models); \
+ bool success = (oldval == guess); \
+ *guess_ptr = oldval; \
+ return success; \
+}
+
+__atomic_subword_cmpxchg (u8, 1)
+__atomic_subword_cmpxchg (u16, 2)
+
+#define __sync_subword_cmpxchg(type, size) \
+ \
+type \
+__sync_val_compare_and_swap_##size(type* ptr, type guess, type val) \
+{ \
+ arch_atomic_write_barrier(); \
+ type oldval = __subword_cmpxchg_body(type, size, ptr, guess, val); \
+ arch_atomic_read_barrier(); \
+ return oldval; \
+} \
+ \
+bool \
+__sync_bool_compare_and_swap_##size(type* ptr, type guess, type val) \
+{ \
+ type oldval = __sync_val_compare_and_swap_##size(ptr, guess, val); \
+ return oldval == guess; \
+}
+
+__sync_subword_cmpxchg (u8, 1)
+__sync_subword_cmpxchg (u16, 2)
+
+/* For the atomic-update subword methods, we use the same approach as
+ above, but we retry until we succeed if the compare-and-exchange
+ fails. */
+#define __subword(type, proto, top, expr, bottom) \
+proto \
+{ \
+ top \
+ unsigned int *p = (unsigned int *)((unsigned long)ptr & ~3UL); \
+ const int shift = BIT_OFFSET((unsigned long)ptr & 3UL, type); \
+ const unsigned int valmask = (1 << (sizeof(type) * 8)) - 1; \
+ const unsigned int bgmask = ~(valmask << shift); \
+ unsigned int oldword, xword = *p; \
+ type val, oldval; \
+ do { \
+ oldword = xword; \
+ oldval = (oldword >> shift) & valmask; \
+ val = expr; \
+ unsigned int word = (oldword & bgmask) | ((val & valmask) << shift); \
+ xword = arch_atomic_val_compare_and_exchange(p, oldword, word); \
+ } while (__builtin_expect(xword != oldword, 0)); \
+ bottom \
+}
+
+#define __atomic_subword_fetch(type, funcname, expr, retval) \
+ __subword(type, \
+ type __atomic_ ## funcname(volatile type *ptr, type i, int model), \
+ pre_atomic_barrier(model);, \
+ expr, \
+ post_atomic_barrier(model); return retval;)
+
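+/* For example, __atomic_subword_fetch (u8, fetch_add_1, oldval + i, oldval)
+ defines __atomic_fetch_add_1 (), which updates one byte within its
+ containing word via compare-and-exchange and returns the old byte. */
+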
+__atomic_subword_fetch (u8, fetch_add_1, oldval + i, oldval)
+__atomic_subword_fetch (u8, fetch_sub_1, oldval - i, oldval)
+__atomic_subword_fetch (u8, fetch_or_1, oldval | i, oldval)
+__atomic_subword_fetch (u8, fetch_and_1, oldval & i, oldval)
+__atomic_subword_fetch (u8, fetch_xor_1, oldval ^ i, oldval)
+__atomic_subword_fetch (u8, fetch_nand_1, ~(oldval & i), oldval)
+
+__atomic_subword_fetch (u16, fetch_add_2, oldval + i, oldval)
+__atomic_subword_fetch (u16, fetch_sub_2, oldval - i, oldval)
+__atomic_subword_fetch (u16, fetch_or_2, oldval | i, oldval)
+__atomic_subword_fetch (u16, fetch_and_2, oldval & i, oldval)
+__atomic_subword_fetch (u16, fetch_xor_2, oldval ^ i, oldval)
+__atomic_subword_fetch (u16, fetch_nand_2, ~(oldval & i), oldval)
+
+__atomic_subword_fetch (u8, add_fetch_1, oldval + i, val)
+__atomic_subword_fetch (u8, sub_fetch_1, oldval - i, val)
+__atomic_subword_fetch (u8, or_fetch_1, oldval | i, val)
+__atomic_subword_fetch (u8, and_fetch_1, oldval & i, val)
+__atomic_subword_fetch (u8, xor_fetch_1, oldval ^ i, val)
+__atomic_subword_fetch (u8, nand_fetch_1, ~(oldval & i), val)
+
+__atomic_subword_fetch (u16, add_fetch_2, oldval + i, val)
+__atomic_subword_fetch (u16, sub_fetch_2, oldval - i, val)
+__atomic_subword_fetch (u16, or_fetch_2, oldval | i, val)
+__atomic_subword_fetch (u16, and_fetch_2, oldval & i, val)
+__atomic_subword_fetch (u16, xor_fetch_2, oldval ^ i, val)
+__atomic_subword_fetch (u16, nand_fetch_2, ~(oldval & i), val)
+
+#define __sync_subword_fetch(type, funcname, expr, retval) \
+ __subword(type, \
+ type __sync_ ## funcname(type *ptr, type i), \
+ arch_atomic_read_barrier();, \
+ expr, \
+ arch_atomic_write_barrier(); return retval;)
+
+__sync_subword_fetch (u8, fetch_and_add_1, oldval + i, oldval)
+__sync_subword_fetch (u8, fetch_and_sub_1, oldval - i, oldval)
+__sync_subword_fetch (u8, fetch_and_or_1, oldval | i, oldval)
+__sync_subword_fetch (u8, fetch_and_and_1, oldval & i, oldval)
+__sync_subword_fetch (u8, fetch_and_xor_1, oldval ^ i, oldval)
+__sync_subword_fetch (u8, fetch_and_nand_1, ~(oldval & i), oldval)
+
+__sync_subword_fetch (u16, fetch_and_add_2, oldval + i, oldval)
+__sync_subword_fetch (u16, fetch_and_sub_2, oldval - i, oldval)
+__sync_subword_fetch (u16, fetch_and_or_2, oldval | i, oldval)
+__sync_subword_fetch (u16, fetch_and_and_2, oldval & i, oldval)
+__sync_subword_fetch (u16, fetch_and_xor_2, oldval ^ i, oldval)
+__sync_subword_fetch (u16, fetch_and_nand_2, ~(oldval & i), oldval)
+
+__sync_subword_fetch (u8, add_and_fetch_1, oldval + i, val)
+__sync_subword_fetch (u8, sub_and_fetch_1, oldval - i, val)
+__sync_subword_fetch (u8, or_and_fetch_1, oldval | i, val)
+__sync_subword_fetch (u8, and_and_fetch_1, oldval & i, val)
+__sync_subword_fetch (u8, xor_and_fetch_1, oldval ^ i, val)
+__sync_subword_fetch (u8, nand_and_fetch_1, ~(oldval & i), val)
+
+__sync_subword_fetch (u16, add_and_fetch_2, oldval + i, val)
+__sync_subword_fetch (u16, sub_and_fetch_2, oldval - i, val)
+__sync_subword_fetch (u16, or_and_fetch_2, oldval | i, val)
+__sync_subword_fetch (u16, and_and_fetch_2, oldval & i, val)
+__sync_subword_fetch (u16, xor_and_fetch_2, oldval ^ i, val)
+__sync_subword_fetch (u16, nand_and_fetch_2, ~(oldval & i), val)
+
+#define __atomic_subword_lock(type, size) \
+ __subword(type, \
+ type __atomic_exchange_##size(volatile type* ptr, type nval, int model), \
+ pre_atomic_barrier(model);, \
+ nval, \
+ post_atomic_barrier(model); return oldval;)
+
+__atomic_subword_lock (u8, 1)
+__atomic_subword_lock (u16, 2)
+
+#define __sync_subword_lock(type, size) \
+ __subword(type, \
+ type __sync_lock_test_and_set_##size(type* ptr, type nval), \
+ , \
+ nval, \
+ arch_atomic_acquire_barrier_value(oldval); return oldval;)
+
+__sync_subword_lock (u8, 1)
+__sync_subword_lock (u16, 2)
diff --git a/gcc-4.9/libgcc/config/tilepro/atomic.h b/gcc-4.9/libgcc/config/tilepro/atomic.h
new file mode 100644
index 000000000..404e15ee2
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/atomic.h
@@ -0,0 +1,435 @@
+/* Macros for atomic functionality for tile.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Contributed by Walter Lee (walt@tilera.com)
+
+ This file is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 3, or (at your option) any
+ later version.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+/* Provides macros for common atomic functionality. */
+
+#ifndef _ATOMIC_H_
+#define _ATOMIC_H_
+
+#ifdef __tilegx__
+/* Atomic instruction macros
+
+ The macros provided by atomic.h simplify access to the TILE-Gx
+ architecture's atomic instructions. The architecture provides a
+ variety of atomic instructions, including "exchange", "compare and
+ exchange", "fetch and ADD", "fetch and AND", "fetch and OR", and
+ "fetch and ADD if greater than or equal to zero".
+
+ No barrier or fence semantics are implied by any of the atomic
+ instructions for manipulating memory; you must specify the barriers
+ that you wish explicitly, using the provided macros.
+
+ Any integral 32- or 64-bit value can be used as the argument
+ to these macros, such as "int", "long long", "unsigned long", etc.
+ The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
+ The "exchange" and "compare and exchange" macros may also take
+ pointer values. We use the pseudo-type "VAL" in the documentation
+ to indicate the use of an appropriate type. */
+#else
+/* Atomic instruction macros
+
+ The macros provided by atomic.h simplify access to the Tile
+ architecture's atomic instructions. Since the architecture
+ supports test-and-set as its only in-silicon atomic operation, many
+ of the operations provided by this header are implemented as
+ fast-path calls to Linux emulation routines.
+
+ Using the kernel for atomic operations allows userspace to take
+ advantage of the kernel's existing atomic-integer support (managed
+ by a distributed array of locks). The kernel provides proper
+ ordering among simultaneous atomic operations on different cores,
+ and guarantees a process can not be context-switched part way
+ through an atomic operation. By virtue of sharing the kernel
+ atomic implementation, the userspace atomic operations
+ are compatible with the atomic methods provided by the kernel's
+ futex() syscall API. Note that these operations never cause Linux
+ kernel scheduling, and are in fact invisible to the kernel; they
+ simply act as regular function calls but with an elevated privilege
+ level. Note that the kernel's distributed lock array is hashed by
+ using only VA bits from the atomic value's address (to avoid the
+ performance hit of page table locking and multiple page-table
+ lookups to get the PA) and only the VA bits that are below page
+ granularity (to properly lock simultaneous accesses to the same
+ page mapped at different VAs). As a result, simultaneous atomic
+ operations on values whose addresses are at the same offset on a
+ page will contend in the kernel for the same lock array element.
+
+ No barrier or fence semantics are implied by any of the atomic
+ instructions for manipulating memory; you must specify the barriers
+ that you wish explicitly, using the provided macros.
+
+ Any integral 32- or 64-bit value can be used as the argument
+ to these macros, such as "int", "long long", "unsigned long", etc.
+ The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
+ The "exchange" and "compare and exchange" macros may also take
+ pointer values. We use the pseudo-type "VAL" in the documentation
+ to indicate the use of an appropriate type.
+
+ The 32-bit routines are implemented using a single kernel fast
+ syscall, as is the 64-bit compare-and-exchange. The other 64-bit
+ routines are implemented by looping over the 64-bit
+ compare-and-exchange routine, so may be potentially less efficient. */
+#endif
+
+#ifdef __tilegx__
+#include <arch/spr_def.h>
+#else
+#include <asm/unistd.h>
+#endif
+
+
+/* 32-bit integer compare-and-exchange. */
+static __inline __attribute__ ((always_inline))
+ int arch_atomic_val_compare_and_exchange_4 (volatile int *mem,
+ int oldval, int newval)
+{
+#ifdef __tilegx__
+ __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
+ return __insn_cmpexch4 (mem, newval);
+#else
+ int result;
+ __asm__ __volatile__ ("swint1":"=R00" (result),
+ "=m" (*mem):"R10" (__NR_FAST_cmpxchg), "R00" (mem),
+ "R01" (oldval), "R02" (newval), "m" (*mem):"r20",
+ "r21", "r22", "r23", "r24", "r25", "r26", "r27",
+ "r28", "r29", "memory");
+ return result;
+#endif
+}
+
+/* 64-bit integer compare-and-exchange. */
+static __inline __attribute__ ((always_inline))
+ long long arch_atomic_val_compare_and_exchange_8 (volatile long long
+ *mem, long long oldval,
+ long long newval)
+{
+#ifdef __tilegx__
+ __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
+ return __insn_cmpexch (mem, newval);
+#else
+ unsigned int result_lo, result_hi;
+ unsigned int oldval_lo = oldval & 0xffffffffu, oldval_hi = oldval >> 32;
+ unsigned int newval_lo = newval & 0xffffffffu, newval_hi = newval >> 32;
+ __asm__ __volatile__ ("swint1":"=R00" (result_lo), "=R01" (result_hi),
+ "=m" (*mem):"R10" (__NR_FAST_cmpxchg64), "R00" (mem),
+ "R02" (oldval_lo), "R03" (oldval_hi),
+ "R04" (newval_lo), "R05" (newval_hi),
+ "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
+ "r26", "r27", "r28", "r29", "memory");
+ return ((long long) result_hi) << 32 | result_lo;
+#endif
+}
+
+/* This non-existent symbol is called for sizes other than "4" and "8",
+ indicating a bug in the caller. */
+extern int __arch_atomic_error_bad_argument_size (void)
+ __attribute__ ((warning ("sizeof atomic argument not 4 or 8")));
+
+
+#define arch_atomic_val_compare_and_exchange(mem, o, n) \
+ __extension__ ({ \
+ (__typeof(*(mem)))(__typeof(*(mem)-*(mem))) \
+ ((sizeof(*(mem)) == 8) ? \
+ arch_atomic_val_compare_and_exchange_8( \
+ (volatile long long*)(mem), (__typeof((o)-(o)))(o), \
+ (__typeof((n)-(n)))(n)) : \
+ (sizeof(*(mem)) == 4) ? \
+ arch_atomic_val_compare_and_exchange_4( \
+ (volatile int*)(mem), (__typeof((o)-(o)))(o), \
+ (__typeof((n)-(n)))(n)) : \
+ __arch_atomic_error_bad_argument_size()); \
+ })
+
+#define arch_atomic_bool_compare_and_exchange(mem, o, n) \
+ __extension__ ({ \
+ __typeof(o) __o = (o); \
+ __builtin_expect( \
+ __o == arch_atomic_val_compare_and_exchange((mem), __o, (n)), 1); \
+ })
+
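+/* Illustrative use (not part of this header): a simple test-and-set
+ style lock acquire might spin with
+
+ while (!arch_atomic_bool_compare_and_exchange (&lock, 0, 1))
+ ;
+ arch_atomic_acquire_barrier ();
+
+ assuming "lock" is a suitably aligned int that is 0 when free. */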
+
+/* Loop with compare_and_exchange until we guess the correct value.
+ Normally "expr" will be an expression using __old and __value. */
+#define __arch_atomic_update_cmpxchg(mem, value, expr) \
+ __extension__ ({ \
+ __typeof(value) __value = (value); \
+ __typeof(*(mem)) *__mem = (mem), __old = *__mem, __guess; \
+ do { \
+ __guess = __old; \
+ __old = arch_atomic_val_compare_and_exchange(__mem, __old, (expr)); \
+ } while (__builtin_expect(__old != __guess, 0)); \
+ __old; \
+ })
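+
+/* For example, __arch_atomic_update_cmpxchg (p, m, __old ^ __value)
+ atomically xors *p with m and evaluates to the previous value of *p;
+ arch_atomic_xor() below is implemented exactly this way. */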
+
+#ifdef __tilegx__
+
+/* Generic atomic op with 8- or 4-byte variant.
+ The _mask, _addend, and _expr arguments are ignored on tilegx. */
+#define __arch_atomic_update(mem, value, op, _mask, _addend, _expr) \
+ __extension__ ({ \
+ ((__typeof(*(mem))) \
+ ((sizeof(*(mem)) == 8) ? (__typeof(*(mem)-*(mem)))__insn_##op( \
+ (volatile void *)(mem), \
+ (long long)(__typeof((value)-(value)))(value)) : \
+ (sizeof(*(mem)) == 4) ? (int)__insn_##op##4( \
+ (volatile void *)(mem), \
+ (int)(__typeof((value)-(value)))(value)) : \
+ __arch_atomic_error_bad_argument_size())); \
+ })
+
+#else
+
+/* This uses TILEPro's fast syscall support to atomically compute:
+
+ int old = *ptr;
+ *ptr = (old & mask) + addend;
+ return old;
+
+ This primitive can be used for atomic exchange, add, or, and.
+ Only 32-bit support is provided. */
+static __inline __attribute__ ((always_inline))
+ int
+ __arch_atomic_update_4 (volatile int *mem, int mask, int addend)
+{
+ int result;
+ __asm__ __volatile__ ("swint1":"=R00" (result),
+ "=m" (*mem):"R10" (__NR_FAST_atomic_update),
+ "R00" (mem), "R01" (mask), "R02" (addend),
+ "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
+ "r26", "r27", "r28", "r29", "memory");
+ return result;
+}
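+
+/* The (mask, addend) pairs used below encode the common operations:
+ (0, n) stores n (exchange), (-1, n) adds n, (m, 0) ands with m, and
+ (~m, m) ors with m, since (old & ~m) + m equals old | m when the bits
+ being added are known to be clear. */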
+
+/* Generic atomic op with 8- or 4-byte variant.
+ The _op argument is ignored on tilepro. */
+#define __arch_atomic_update(mem, value, _op, mask, addend, expr) \
+ __extension__ ({ \
+ (__typeof(*(mem)))(__typeof(*(mem)-*(mem))) \
+ ((sizeof(*(mem)) == 8) ? \
+ __arch_atomic_update_cmpxchg((mem), (value), (expr)) : \
+ (sizeof(*(mem)) == 4) ? \
+ __arch_atomic_update_4((volatile int*)(mem), \
+ (__typeof((mask)-(mask)))(mask), \
+ (__typeof((addend)-(addend)))(addend)) : \
+ __arch_atomic_error_bad_argument_size()); \
+ })
+
+#endif /* __tilegx__ */
+
+
+#define arch_atomic_exchange(mem, newvalue) \
+ __arch_atomic_update(mem, newvalue, exch, 0, newvalue, __value)
+
+#define arch_atomic_add(mem, value) \
+ __arch_atomic_update(mem, value, fetchadd, -1, value, __old + __value)
+
+#define arch_atomic_sub(mem, value) arch_atomic_add((mem), -(value))
+
+#define arch_atomic_increment(mem) arch_atomic_add((mem), 1)
+
+#define arch_atomic_decrement(mem) arch_atomic_add((mem), -1)
+
+#define arch_atomic_and(mem, mask) \
+ __arch_atomic_update(mem, mask, fetchand, mask, 0, __old & __value)
+
+#define arch_atomic_or(mem, mask) \
+ __arch_atomic_update(mem, mask, fetchor, ~mask, mask, __old | __value)
+
+#define arch_atomic_xor(mem, mask) \
+ __arch_atomic_update_cmpxchg(mem, mask, __old ^ __value)
+
+#define arch_atomic_nand(mem, mask) \
+ __arch_atomic_update_cmpxchg(mem, mask, ~(__old & __value))
+
+#define arch_atomic_bit_set(mem, bit) \
+ __extension__ ({ \
+ __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit); \
+ __mask & arch_atomic_or((mem), __mask); \
+ })
+
+#define arch_atomic_bit_clear(mem, bit) \
+ __extension__ ({ \
+ __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit); \
+ __mask & arch_atomic_and((mem), ~__mask); \
+ })
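+
+/* For example, arch_atomic_bit_set (&flags, 3) atomically sets bit 3
+ of a hypothetical variable "flags" and evaluates to nonzero iff that
+ bit was already set. */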
+
+#ifdef __tilegx__
+/* Atomically store a new value to memory.
+ Note that you can freely use types of any size here, unlike the
+ other atomic routines, which require 32- or 64-bit types.
+ This accessor is provided for compatibility with TILEPro, which
+ required an explicit atomic operation for stores that needed
+ to be atomic with respect to other atomic methods in this header. */
+#define arch_atomic_write(mem, value) ((void) (*(mem) = (value)))
+#else
+#define arch_atomic_write(mem, value) \
+ do { \
+ __typeof(mem) __aw_mem = (mem); \
+ __typeof(value) __aw_val = (value); \
+ unsigned int *__aw_mem32, __aw_intval, __aw_val32, __aw_off, __aw_mask; \
+ __aw_intval = (__typeof((value) - (value)))__aw_val; \
+ switch (sizeof(*__aw_mem)) { \
+ case 8: \
+ __arch_atomic_update_cmpxchg(__aw_mem, __aw_val, __value); \
+ break; \
+ case 4: \
+ __arch_atomic_update_4((int *)__aw_mem, 0, __aw_intval); \
+ break; \
+ case 2: \
+ __aw_off = 8 * ((long)__aw_mem & 0x2); \
+ __aw_mask = 0xffffU << __aw_off; \
+ __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x2); \
+ __aw_val32 = (__aw_intval << __aw_off) & __aw_mask; \
+ __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32, \
+ (__old & ~__aw_mask) | __value); \
+ break; \
+ case 1: \
+ __aw_off = 8 * ((long)__aw_mem & 0x3); \
+ __aw_mask = 0xffU << __aw_off; \
+ __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x3); \
+ __aw_val32 = (__aw_intval << __aw_off) & __aw_mask; \
+ __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32, \
+ (__old & ~__aw_mask) | __value); \
+ break; \
+ } \
+ } while (0)
+#endif
+
+/* Compiler barrier.
+
+ This macro prevents loads or stores from being moved by the compiler
+ across the macro. Any loaded value that was loaded before this
+ macro must then be reloaded by the compiler. */
+#define arch_atomic_compiler_barrier() __asm__ __volatile__("" ::: "memory")
+
+/* Full memory barrier.
+
+ This macro has the semantics of arch_atomic_compiler_barrier(), but also
+ ensures that previous stores are visible to other cores, and that
+ all previous loaded values have been placed into their target
+ register on this core. */
+#define arch_atomic_full_barrier() __insn_mf()
+
+/* Read memory barrier.
+
+ Ensure that all reads by this processor that occurred prior to the
+ read memory barrier have completed, and that no reads that occur
+ after the read memory barrier on this processor are initiated
+ before the barrier.
+
+ On current TILE chips a read barrier is implemented as a full barrier,
+ but this may not be true in later versions of the architecture.
+
+ See also arch_atomic_acquire_barrier() for the appropriate idiom to use
+ to ensure no reads are lifted above an atomic lock instruction. */
+#define arch_atomic_read_barrier() arch_atomic_full_barrier()
+
+/* Write memory barrier.
+
+ Ensure that all writes by this processor that occurred prior to the
+ write memory barrier have completed, and that no writes that occur
+ after the write memory barrier on this processor are initiated
+ before the barrier.
+
+ On current TILE chips a write barrier is implemented as a full barrier,
+ but this may not be true in later versions of the architecture.
+
+ See also arch_atomic_release_barrier() for the appropriate idiom to use
+ to ensure all writes are complete prior to an atomic unlock instruction. */
+#define arch_atomic_write_barrier() arch_atomic_full_barrier()
+
+/* Lock acquisition barrier.
+
+ Ensure that no load operations that follow this macro in the
+ program can issue prior to the barrier. Without such a barrier,
+ the compiler can reorder them to issue earlier, or the hardware can
+ issue them speculatively. The latter is not currently done in the
+ Tile microarchitecture, but using this operation improves
+ portability to future implementations.
+
+ This operation is intended to be used as part of the "acquire"
+ path for locking, that is, when entering a critical section.
+ This should be done after the atomic operation that actually
+ acquires the lock, and in conjunction with a "control dependency"
+ that checks the atomic operation result to see if the lock was
+ in fact acquired. See the arch_atomic_read_barrier() macro
+ for a heavier-weight barrier to use in certain unusual constructs,
+ or arch_atomic_acquire_barrier_value() if no control dependency exists. */
+#define arch_atomic_acquire_barrier() arch_atomic_compiler_barrier()
+
+/* Lock release barrier.
+
+ Ensure that no store operations that precede this macro in the
+ program complete subsequent to the barrier. Without such a
+ barrier, the compiler can reorder stores to issue later, or stores
+ can be still outstanding in the memory network.
+
+ This operation is intended to be used as part of the "release" path
+ for locking, that is, when leaving a critical section. This should
+ be done before the operation (such as a store of zero) that
+ actually releases the lock. */
+#define arch_atomic_release_barrier() arch_atomic_write_barrier()
+
+/* Barrier until the read of a particular value is complete.
+
+ This is occasionally useful when constructing certain locking
+ scenarios. For example, you might write a routine that issues an
+ atomic instruction to enter a critical section, then reads one or
+ more values within the critical section without checking to see if
+ the critical section was in fact acquired, and only later checks
+ the atomic instruction result to see if the lock was acquired. If
+ so the routine could properly release the lock and know that the
+ values that were read were valid.
+
+ In this scenario, it is required to wait for the result of the
+ atomic instruction, even if the value itself is not checked. This
+ guarantees that if the atomic instruction succeeded in taking the lock,
+ the lock was held before any reads in the critical section issued. */
+#define arch_atomic_acquire_barrier_value(val) \
+ __asm__ __volatile__("move %0, %0" :: "r"(val))
+
+/* Access the given variable in memory exactly once.
+
+ In some contexts, an algorithm may need to force access to memory,
+ since otherwise the compiler may think it can optimize away a
+ memory load or store; for example, in a loop when polling memory to
+ see if another cpu has updated it yet. Generally this is only
+ required for certain very carefully hand-tuned algorithms; using it
+ unnecessarily may result in performance losses.
+
+ A related use of this macro is to ensure that the compiler does not
+ rematerialize the value of "x" by reloading it from memory
+ unexpectedly; the "volatile" marking will prevent the compiler from
+ being able to rematerialize. This is helpful if an algorithm needs
+ to read a variable without locking, but needs it to have the same
+ value if it ends up being used several times within the algorithm.
+
+ Note that multiple uses of this macro are guaranteed to be ordered,
+ i.e. the compiler will not reorder stores or loads that are wrapped
+ in arch_atomic_access_once(). */
+#define arch_atomic_access_once(x) (*(volatile __typeof(x) *)&(x))
+
+
+
+#endif /* !_ATOMIC_H_ */
diff --git a/gcc-4.9/libgcc/config/tilepro/linux-unwind.h b/gcc-4.9/libgcc/config/tilepro/linux-unwind.h
new file mode 100644
index 000000000..27481cfcd
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/linux-unwind.h
@@ -0,0 +1,99 @@
+/* DWARF2 EH unwinding support for TILEPro.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Contributed by Walter Lee (walt@tilera.com)
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#ifndef inhibit_libc
+
+#include <arch/abi.h>
+#include <signal.h>
+#include <sys/ucontext.h>
+#include <linux/unistd.h>
+
+/* Macro to define a copy of the kernel's __rt_sigreturn function
+ (in arch/tile/kernel/entry.S). If that function is changed,
+ this one needs to be changed to match it. */
+#define _sigreturn_asm(REG, NR) asm( \
+ ".pushsection .text.__rt_sigreturn,\"a\"\n" \
+ ".global __rt_sigreturn\n" \
+ ".type __rt_sigreturn,@function\n" \
+ "__rt_sigreturn:\n" \
+ "moveli " #REG ", " #NR "\n" \
+ "swint1\n" \
+ ".size __rt_sigreturn, . - __rt_sigreturn\n" \
+ ".popsection")
+#define sigreturn_asm(REG, NR) _sigreturn_asm(REG, NR)
+sigreturn_asm (TREG_SYSCALL_NR_NAME, __NR_rt_sigreturn);
+#define SIGRETURN_LEN 16
+extern char __rt_sigreturn[];
+
+#define MD_FALLBACK_FRAME_STATE_FOR tile_fallback_frame_state
+
+static _Unwind_Reason_Code
+tile_fallback_frame_state (struct _Unwind_Context *context,
+ _Unwind_FrameState *fs)
+{
+ unsigned char *pc = context->ra;
+ struct sigcontext *sc;
+ long new_cfa;
+ int i;
+
+ struct rt_sigframe {
+ unsigned char save_area[C_ABI_SAVE_AREA_SIZE];
+ siginfo_t info;
+ struct ucontext uc;
+ } *rt_;
+
+ /* Return if this is not a signal handler. */
+ if (memcmp (pc, __rt_sigreturn, SIGRETURN_LEN) != 0)
+ return _URC_END_OF_STACK;
+
+ /* It was a signal handler; update the reported PC to point to our
+ copy, since that will be findable with dladdr() and therefore
+ makes it somewhat easier to understand what actually happened. */
+ context->ra = __rt_sigreturn;
+
+ rt_ = context->cfa;
+ sc = &rt_->uc.uc_mcontext;
+
+ new_cfa = sc->sp;
+ fs->regs.cfa_how = CFA_REG_OFFSET;
+ fs->regs.cfa_reg = STACK_POINTER_REGNUM;
+ fs->regs.cfa_offset = new_cfa - (long) context->cfa;
+
+ for (i = 0; i < 56; ++i)
+ {
+ fs->regs.reg[i].how = REG_SAVED_OFFSET;
+ fs->regs.reg[i].loc.offset
+ = (long)&sc->gregs[i] - new_cfa;
+ }
+
+ fs->regs.reg[56].how = REG_SAVED_OFFSET;
+ fs->regs.reg[56].loc.offset = (long)&sc->pc - new_cfa;
+ fs->retaddr_column = 56;
+ fs->signal_frame = 1;
+
+ return _URC_NO_REASON;
+}
+
+#endif /* ifndef inhibit_libc */
diff --git a/gcc-4.9/libgcc/config/tilepro/sfp-machine.h b/gcc-4.9/libgcc/config/tilepro/sfp-machine.h
new file mode 100644
index 000000000..6953d8d8d
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/sfp-machine.h
@@ -0,0 +1,59 @@
+#define _FP_W_TYPE_SIZE 32
+#define _FP_W_TYPE unsigned long
+#define _FP_WS_TYPE signed long
+#define _FP_I_TYPE long
+
+/* The type of the result of a floating point comparison. This must
+ match `__libgcc_cmp_return__' in GCC for the target. */
+typedef int __gcc_CMPtype __attribute__ ((mode (__libgcc_cmp_return__)));
+#define CMPtype __gcc_CMPtype
+
+#define _FP_MUL_MEAT_S(R,X,Y) \
+ _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_S,R,X,Y,umul_ppmm)
+#define _FP_MUL_MEAT_D(R,X,Y) \
+ _FP_MUL_MEAT_2_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm)
+#define _FP_MUL_MEAT_Q(R,X,Y) \
+ _FP_MUL_MEAT_4_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm)
+
+#define _FP_DIV_MEAT_S(R,X,Y) _FP_DIV_MEAT_1_loop(S,R,X,Y)
+#define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_2_udiv(D,R,X,Y)
+#define _FP_DIV_MEAT_Q(R,X,Y) _FP_DIV_MEAT_4_udiv(Q,R,X,Y)
+
+#define _FP_NANFRAC_S _FP_QNANBIT_S
+#define _FP_NANFRAC_D _FP_QNANBIT_D, 0
+#define _FP_NANFRAC_Q _FP_QNANBIT_Q, 0, 0, 0
+#define _FP_NANSIGN_S 1
+#define _FP_NANSIGN_D 1
+#define _FP_NANSIGN_Q 1
+
+#define _FP_KEEPNANFRACP 1
+#define _FP_QNANNEGATEDP 0
+
+#define _FP_CHOOSENAN(fs, wc, R, X, Y, OP) \
+ do { \
+ if ((_FP_FRAC_HIGH_RAW_##fs(X) & _FP_QNANBIT_##fs) \
+ && !(_FP_FRAC_HIGH_RAW_##fs(Y) & _FP_QNANBIT_##fs)) \
+ { \
+ R##_s = Y##_s; \
+ _FP_FRAC_COPY_##wc(R,Y); \
+ } \
+ else \
+ { \
+ R##_s = X##_s; \
+ _FP_FRAC_COPY_##wc(R,X); \
+ } \
+ R##_c = FP_CLS_NAN; \
+ } while (0)
+
+#define _FP_TININESS_AFTER_ROUNDING 0
+
+#define __LITTLE_ENDIAN 1234
+#define __BIG_ENDIAN 4321
+
+#define __BYTE_ORDER __LITTLE_ENDIAN
+
+/* Define ALIASNAME as a strong alias for NAME. */
+# define strong_alias(name, aliasname) _strong_alias(name, aliasname)
+# define _strong_alias(name, aliasname) \
+ extern __typeof (name) aliasname __attribute__ ((alias (#name)));
+
diff --git a/gcc-4.9/libgcc/config/tilepro/softdivide.c b/gcc-4.9/libgcc/config/tilepro/softdivide.c
new file mode 100644
index 000000000..8a539f467
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/softdivide.c
@@ -0,0 +1,353 @@
+/* Division and remainder routines for Tile.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Contributed by Walter Lee (walt@tilera.com)
+
+ This file is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 3, or (at your option) any
+ later version.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+typedef int int32_t;
+typedef unsigned uint32_t;
+typedef long long int64_t;
+typedef unsigned long long uint64_t;
+
+/* Raise signal 8 (SIGFPE) with code 1 (FPE_INTDIV). */
+static inline void
+raise_intdiv (void)
+{
+ asm ("{ raise; moveli zero, 8 + (1 << 6) }");
+}
+
+
+#ifndef __tilegx__
+/* __udivsi3 - 32 bit integer unsigned divide */
+static inline uint32_t __attribute__ ((always_inline))
+__udivsi3_inline (uint32_t dividend, uint32_t divisor)
+{
+ /* Divide out any power of two factor from dividend and divisor.
+ Note that when dividing by zero the divisor will remain zero,
+ which is all we need to detect that case below. */
+ const int power_of_two_factor = __insn_ctz (divisor);
+ divisor >>= power_of_two_factor;
+ dividend >>= power_of_two_factor;
+
+ /* Checks for division by power of two or division by zero. */
+ if (divisor <= 1)
+ {
+ if (divisor == 0)
+ {
+ raise_intdiv ();
+ return 0;
+ }
+ return dividend;
+ }
+
+ /* Compute (a / b) by repeatedly finding the largest N
+ such that (b << N) <= a. For each such N, set bit N in the
+ quotient, subtract (b << N) from a, and keep going. Think of this as
+ the reverse of the "shift-and-add" that a multiply does. The values
+ of N are precisely those shift counts.
+
+ Finding N is easy. First, use clz(b) - clz(a) to find the N
+ that lines up the high bit of (b << N) with the high bit of a.
+ Any larger value of N would definitely make (b << N) > a,
+ which is too big.
+
+ Then, if (b << N) > a (because it has larger low bits), decrement
+ N by one. This adjustment will definitely make (b << N) less
+ than a, because a's high bit is now one higher than b's. */
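+
+ /* Worked example (illustrative): for 100 / 7, clz(7) - clz(100) is
+ 29 - 25 = 4, but 7 << 4 = 112 > 100, so N becomes 3 and 56 is
+ subtracted (quotient bit 3, remainder 44); then N = 2 subtracts 28
+ (remainder 16); then N = 1 subtracts 14 (remainder 2 < 7), giving
+ quotient 0b1110 = 14, remainder 2. */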
+
+ /* Precomputing the max_ values allows us to avoid a subtract
+ in the inner loop and just right shift by clz(remainder). */
+ const int divisor_clz = __insn_clz (divisor);
+ const uint32_t max_divisor = divisor << divisor_clz;
+ const uint32_t max_qbit = 1 << divisor_clz;
+
+ uint32_t quotient = 0;
+ uint32_t remainder = dividend;
+
+ while (remainder >= divisor)
+ {
+ int shift = __insn_clz (remainder);
+ uint32_t scaled_divisor = max_divisor >> shift;
+ uint32_t quotient_bit = max_qbit >> shift;
+
+ int too_big = (scaled_divisor > remainder);
+ scaled_divisor >>= too_big;
+ quotient_bit >>= too_big;
+ remainder -= scaled_divisor;
+ quotient |= quotient_bit;
+ }
+ return quotient;
+}
+#endif /* !__tilegx__ */
+
+
+/* __udivdi3 - 64 bit integer unsigned divide */
+static inline uint64_t __attribute__ ((always_inline))
+__udivdi3_inline (uint64_t dividend, uint64_t divisor)
+{
+ /* Divide out any power of two factor from dividend and divisor.
+ Note that when dividing by zero the divisor will remain zero,
+ which is all we need to detect that case below. */
+ const int power_of_two_factor = __builtin_ctzll (divisor);
+ divisor >>= power_of_two_factor;
+ dividend >>= power_of_two_factor;
+
+ /* Checks for division by power of two or division by zero. */
+ if (divisor <= 1)
+ {
+ if (divisor == 0)
+ {
+ raise_intdiv ();
+ return 0;
+ }
+ return dividend;
+ }
+
+#ifndef __tilegx__
+ if (((uint32_t) (dividend >> 32) | ((uint32_t) (divisor >> 32))) == 0)
+ {
+ /* Operands both fit in 32 bits, so use faster 32 bit algorithm. */
+ return __udivsi3_inline ((uint32_t) dividend, (uint32_t) divisor);
+ }
+#endif /* !__tilegx__ */
+
+ /* See algorithm description in __udivsi3 */
+
+ const int divisor_clz = __builtin_clzll (divisor);
+ const uint64_t max_divisor = divisor << divisor_clz;
+ const uint64_t max_qbit = 1ULL << divisor_clz;
+
+ uint64_t quotient = 0;
+ uint64_t remainder = dividend;
+
+ while (remainder >= divisor)
+ {
+ int shift = __builtin_clzll (remainder);
+ uint64_t scaled_divisor = max_divisor >> shift;
+ uint64_t quotient_bit = max_qbit >> shift;
+
+ int too_big = (scaled_divisor > remainder);
+ scaled_divisor >>= too_big;
+ quotient_bit >>= too_big;
+ remainder -= scaled_divisor;
+ quotient |= quotient_bit;
+ }
+ return quotient;
+}
+
+
+#ifndef __tilegx__
+/* __umodsi3 - 32 bit integer unsigned modulo */
+static inline uint32_t __attribute__ ((always_inline))
+__umodsi3_inline (uint32_t dividend, uint32_t divisor)
+{
+ /* Shortcircuit mod by a power of two (and catch mod by zero). */
+ const uint32_t mask = divisor - 1;
+ if ((divisor & mask) == 0)
+ {
+ if (divisor == 0)
+ {
+ raise_intdiv ();
+ return 0;
+ }
+ return dividend & mask;
+ }
+
+ /* We compute the remainder (a % b) by repeatedly subtracting off
+ multiples of b from a until a < b. The key is that subtracting
+ off a multiple of b does not affect the result mod b.
+
+ To make the algorithm run efficiently, we need to subtract
+ off a large multiple of b at each step. We subtract the largest
+ (b << N) that is <= a.
+
+ Finding N is easy. First, use clz(b) - clz(a) to find the N
+ that lines up the high bit of (b << N) with the high bit of a.
+ Any larger value of N would definitely make (b << N) > a,
+ which is too big.
+
+ Then, if (b << N) > a (because it has larger low bits), decrement
+ N by one. This adjustment will definitely make (b << N) less
+ than a, because a's high bit is now one higher than b's. */
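+
+ /* Worked example (illustrative): 100 % 7 subtracts 56, then 28, then
+ 14, leaving remainder 2, without ever forming quotient bits. */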
+ const uint32_t max_divisor = divisor << __insn_clz (divisor);
+
+ uint32_t remainder = dividend;
+ while (remainder >= divisor)
+ {
+ const int shift = __insn_clz (remainder);
+ uint32_t scaled_divisor = max_divisor >> shift;
+ scaled_divisor >>= (scaled_divisor > remainder);
+ remainder -= scaled_divisor;
+ }
+
+ return remainder;
+}
+#endif /* !__tilegx__ */
+
+
+/* __umoddi3 - 64 bit integer unsigned modulo */
+static inline uint64_t __attribute__ ((always_inline))
+__umoddi3_inline (uint64_t dividend, uint64_t divisor)
+{
+#ifndef __tilegx__
+ if (((uint32_t) (dividend >> 32) | ((uint32_t) (divisor >> 32))) == 0)
+ {
+ /* Operands both fit in 32 bits, so use faster 32 bit algorithm. */
+ return __umodsi3_inline ((uint32_t) dividend, (uint32_t) divisor);
+ }
+#endif /* !__tilegx__ */
+
+ /* Shortcircuit mod by a power of two (and catch mod by zero). */
+ const uint64_t mask = divisor - 1;
+ if ((divisor & mask) == 0)
+ {
+ if (divisor == 0)
+ {
+ raise_intdiv ();
+ return 0;
+ }
+ return dividend & mask;
+ }
+
+ /* See algorithm description in __umodsi3 */
+ const uint64_t max_divisor = divisor << __builtin_clzll (divisor);
+
+ uint64_t remainder = dividend;
+ while (remainder >= divisor)
+ {
+ const int shift = __builtin_clzll (remainder);
+ uint64_t scaled_divisor = max_divisor >> shift;
+ scaled_divisor >>= (scaled_divisor > remainder);
+ remainder -= scaled_divisor;
+ }
+
+ return remainder;
+}
+
+
+uint32_t __udivsi3 (uint32_t dividend, uint32_t divisor);
+#ifdef L_tile_udivsi3
+uint32_t
+__udivsi3 (uint32_t dividend, uint32_t divisor)
+{
+#ifndef __tilegx__
+ return __udivsi3_inline (dividend, divisor);
+#else /* !__tilegx__ */
+ uint64_t n = __udivdi3_inline (((uint64_t) dividend), ((uint64_t) divisor));
+ return (uint32_t) n;
+#endif /* !__tilegx__ */
+}
+#endif
+
+#define ABS(x) ((x) >= 0 ? (x) : -(x))
+
+int32_t __divsi3 (int32_t dividend, int32_t divisor);
+#ifdef L_tile_divsi3
+/* __divsi3 - 32 bit integer signed divide */
+int32_t
+__divsi3 (int32_t dividend, int32_t divisor)
+{
+#ifndef __tilegx__
+ uint32_t n = __udivsi3_inline (ABS (dividend), ABS (divisor));
+#else /* !__tilegx__ */
+ uint64_t n =
+ __udivdi3_inline (ABS ((int64_t) dividend), ABS ((int64_t) divisor));
+#endif /* !__tilegx__ */
+ if ((dividend ^ divisor) < 0)
+ n = -n;
+ return (int32_t) n;
+}
+#endif
+
+
+uint64_t __udivdi3 (uint64_t dividend, uint64_t divisor);
+#ifdef L_tile_udivdi3
+uint64_t
+__udivdi3 (uint64_t dividend, uint64_t divisor)
+{
+ return __udivdi3_inline (dividend, divisor);
+}
+#endif
+
+/* __divdi3 - 64 bit integer signed divide */
+int64_t __divdi3 (int64_t dividend, int64_t divisor);
+#ifdef L_tile_divdi3
+int64_t
+__divdi3 (int64_t dividend, int64_t divisor)
+{
+ uint64_t n = __udivdi3_inline (ABS (dividend), ABS (divisor));
+ if ((dividend ^ divisor) < 0)
+ n = -n;
+ return (int64_t) n;
+}
+#endif
+
+
+uint32_t __umodsi3 (uint32_t dividend, uint32_t divisor);
+#ifdef L_tile_umodsi3
+uint32_t
+__umodsi3 (uint32_t dividend, uint32_t divisor)
+{
+#ifndef __tilegx__
+ return __umodsi3_inline (dividend, divisor);
+#else /* !__tilegx__ */
+ return __umoddi3_inline ((uint64_t) dividend, (uint64_t) divisor);
+#endif /* !__tilegx__ */
+}
+#endif
+
+
+/* __modsi3 - 32 bit integer signed modulo */
+int32_t __modsi3 (int32_t dividend, int32_t divisor);
+#ifdef L_tile_modsi3
+int32_t
+__modsi3 (int32_t dividend, int32_t divisor)
+{
+#ifndef __tilegx__
+ uint32_t remainder = __umodsi3_inline (ABS (dividend), ABS (divisor));
+#else /* !__tilegx__ */
+ uint64_t remainder =
+ __umoddi3_inline (ABS ((int64_t) dividend), ABS ((int64_t) divisor));
+#endif /* !__tilegx__ */
+ return (int32_t) ((dividend >= 0) ? remainder : -remainder);
+}
+#endif
+
+
+uint64_t __umoddi3 (uint64_t dividend, uint64_t divisor);
+#ifdef L_tile_umoddi3
+uint64_t
+__umoddi3 (uint64_t dividend, uint64_t divisor)
+{
+ return __umoddi3_inline (dividend, divisor);
+}
+#endif
+
+
+/* __moddi3 - 64 bit integer signed modulo */
+int64_t __moddi3 (int64_t dividend, int64_t divisor);
+#ifdef L_tile_moddi3
+int64_t
+__moddi3 (int64_t dividend, int64_t divisor)
+{
+ uint64_t remainder = __umoddi3_inline (ABS (dividend), ABS (divisor));
+ return (int64_t) ((dividend >= 0) ? remainder : -remainder);
+}
+#endif
diff --git a/gcc-4.9/libgcc/config/tilepro/softmpy.S b/gcc-4.9/libgcc/config/tilepro/softmpy.S
new file mode 100644
index 000000000..4922dc764
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/softmpy.S
@@ -0,0 +1,94 @@
+/* 64-bit multiplication support for TILEPro.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Contributed by Walter Lee (walt@tilera.com)
+
+ This file is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 3, or (at your option) any
+ later version.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* 64-bit multiplication support. */
+
+ .file "softmpy.S"
+
+/* Parameters */
+#define lo0 r9 /* low 32 bits of n0 */
+#define hi0 r1 /* high 32 bits of n0 */
+#define lo1 r2 /* low 32 bits of n1 */
+#define hi1 r3 /* high 32 bits of n1 */
+
+/* temps */
+#define result1_a r4
+#define result1_b r5
+
+#define tmp0 r6
+#define tmp0_left_16 r7
+#define tmp1 r8
+
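+/* Roughly, with n0 = hi0:lo0 and n1 = hi1:lo1, the low 64 bits of the
+ product are lo0*lo1 + ((lo0*hi1 + hi0*lo1) << 32). TILEPro multiplies
+ operate on 16-bit halves, so each 32x32 piece is itself assembled from
+ the mulll/mulhl/mulhh partial products accumulated below. */
+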
+ .section .text.__muldi3, "ax"
+ .align 8
+ .globl __muldi3
+ .type __muldi3, @function
+__muldi3:
+ {
+ move lo0, r0 /* so we can write "out r0" while "in r0" alive */
+ mulhl_uu tmp0, lo1, r0
+ }
+ {
+ mulll_uu result1_a, lo1, hi0
+ }
+ {
+ move tmp1, tmp0
+ mulhla_uu tmp0, lo0, lo1
+ }
+ {
+ mulhlsa_uu result1_a, lo1, hi0
+ }
+ {
+ mulll_uu result1_b, lo0, hi1
+ slt_u tmp1, tmp0, tmp1
+ }
+ {
+ mulhlsa_uu result1_a, lo0, hi1
+ shli r0, tmp0, 16
+ }
+ {
+ move tmp0_left_16, r0
+ mulhha_uu result1_b, lo0, lo1
+ }
+ {
+ mullla_uu r0, lo1, lo0
+ shli tmp1, tmp1, 16
+ }
+ {
+ mulhlsa_uu result1_b, hi0, lo1
+ inthh tmp1, tmp1, tmp0
+ }
+ {
+ mulhlsa_uu result1_a, hi1, lo0
+ slt_u tmp0, r0, tmp0_left_16
+ }
+ /* NOTE: this will stall for a cycle here. Oh well. */
+ {
+ add r1, tmp0, tmp1
+ add result1_a, result1_a, result1_b
+ }
+ {
+ add r1, r1, result1_a
+ jrp lr
+ }
+ .size __muldi3,.-__muldi3
diff --git a/gcc-4.9/libgcc/config/tilepro/t-crtstuff b/gcc-4.9/libgcc/config/tilepro/t-crtstuff
new file mode 100644
index 000000000..eddc45ce9
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/t-crtstuff
@@ -0,0 +1,4 @@
+# crtend*.o must be compiled with -fno-asynchronous-unwind-tables;
+# otherwise __FRAME_END__ might not be the last thing in the .eh_frame
+# section.
+CRTSTUFF_T_CFLAGS += -fno-asynchronous-unwind-tables
diff --git a/gcc-4.9/libgcc/config/tilepro/t-tilepro b/gcc-4.9/libgcc/config/tilepro/t-tilepro
new file mode 100644
index 000000000..eb6894ce1
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/t-tilepro
@@ -0,0 +1,33 @@
+LIB2ADD += \
+ $(srcdir)/config/tilepro/softmpy.S \
+ $(srcdir)/config/tilepro/atomic.c
+
+LIB2FUNCS_EXCLUDE += \
+ _divdi3 \
+ _moddi3 \
+ _muldi3 \
+ _udivdi3 \
+ _umoddi3
+
+SOFTDIVIDE_FUNCS := \
+ _tile_udivsi3 \
+ _tile_divsi3 \
+ _tile_udivdi3 \
+ _tile_divdi3 \
+ _tile_umodsi3 \
+ _tile_modsi3 \
+ _tile_umoddi3 \
+ _tile_moddi3
+
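+# Each routine is built from softdivide.c into its own object by passing
+# -DL<name> on the command line, matching the #ifdef L<name> guards in
+# that file, so programs pull in only the routines they actually use.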
+softdivide-o = $(patsubst %,%$(objext),$(SOFTDIVIDE_FUNCS))
+$(softdivide-o): %$(objext): $(srcdir)/config/tilepro/softdivide.c
+ $(gcc_compile) -ffunction-sections -DMAYBE_STATIC= -DL$* -c $< \
+ $(vis_hide)
+libgcc-objects += $(softdivide-o)
+
+ifeq ($(enable_shared),yes)
+softdivide-s-o = $(patsubst %,%_s$(objext),$(SOFTDIVIDE_FUNCS))
+$(softdivide-s-o): %_s$(objext): $(srcdir)/config/tilepro/softdivide.c
+ $(gcc_s_compile) -ffunction-sections -DMAYBE_STATIC= -DL$* -c $<
+libgcc-s-objects += $(softdivide-s-o)
+endif