Diffstat (limited to 'gcc-4.9/libgcc/config/tilepro')
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/atomic.c         397
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/atomic.h         435
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/linux-unwind.h    99
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/sfp-machine.h     59
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/softdivide.c     353
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/softmpy.S         94
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/t-crtstuff         4
-rw-r--r--  gcc-4.9/libgcc/config/tilepro/t-tilepro         33
8 files changed, 1474 insertions, 0 deletions
diff --git a/gcc-4.9/libgcc/config/tilepro/atomic.c b/gcc-4.9/libgcc/config/tilepro/atomic.c
new file mode 100644
index 000000000..66ef8fd7d
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/atomic.c
@@ -0,0 +1,397 @@
+/* TILE atomics.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Contributed by Walter Lee (walt@tilera.com)
+
+ This file is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 3, or (at your option) any
+ later version.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "tconfig.h"
+#include "coretypes.h"
+#include "atomic.h"
+
+#define bool unsigned char
+
+/* This code should be inlined by the compiler, but for now support
+ it as out-of-line methods in libgcc. */
+
+static inline void
+pre_atomic_barrier (int model)
+{
+ switch ((enum memmodel) model)
+ {
+ case MEMMODEL_RELEASE:
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ __atomic_thread_fence (model);
+ break;
+ default:
+ break;
+ }
+ return;
+}
+
+static inline void
+post_atomic_barrier (int model)
+{
+ switch ((enum memmodel) model)
+ {
+ case MEMMODEL_ACQUIRE:
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ __atomic_thread_fence (model);
+ break;
+ default:
+ break;
+ }
+ return;
+}
+
+#define __unused __attribute__((unused))
+
+#define __fetch_and_do(proto, type, size, opname, top, bottom) \
+proto \
+{ \
+ top; \
+ type rv = arch_atomic_##opname(p, i); \
+ bottom; \
+ return rv; \
+}
+
+#define __atomic_fetch_and_do(type, size, opname) \
+ __fetch_and_do(type __atomic_fetch_##opname##_##size(type* p, type i, int model), \
+ type, size, opname, \
+ pre_atomic_barrier(model), \
+ post_atomic_barrier(model)) \
+
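+/* For example, __atomic_fetch_and_do (int, 4, add) expands to roughly
+ the following out-of-line function (illustrative expansion only):
+
+ int __atomic_fetch_add_4 (int *p, int i, int model)
+ {
+ pre_atomic_barrier (model);
+ int rv = arch_atomic_add (p, i);
+ post_atomic_barrier (model);
+ return rv;
+ }
+*/
+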
+__atomic_fetch_and_do (int, 4, add)
+__atomic_fetch_and_do (int, 4, sub)
+__atomic_fetch_and_do (int, 4, or)
+__atomic_fetch_and_do (int, 4, and)
+__atomic_fetch_and_do (int, 4, xor)
+__atomic_fetch_and_do (int, 4, nand)
+__atomic_fetch_and_do (long long, 8, add)
+__atomic_fetch_and_do (long long, 8, sub)
+__atomic_fetch_and_do (long long, 8, or)
+__atomic_fetch_and_do (long long, 8, and)
+__atomic_fetch_and_do (long long, 8, xor)
+__atomic_fetch_and_do (long long, 8, nand)
+
+#define __sync_fetch_and_do(type, size, opname) \
+ __fetch_and_do(type __sync_fetch_and_##opname##_##size(type* p, type i), \
+ type, size, opname, \
+ arch_atomic_write_barrier(), \
+ arch_atomic_read_barrier())
+
+__sync_fetch_and_do (int, 4, add)
+__sync_fetch_and_do (int, 4, sub)
+__sync_fetch_and_do (int, 4, or)
+__sync_fetch_and_do (int, 4, and)
+__sync_fetch_and_do (int, 4, xor)
+__sync_fetch_and_do (int, 4, nand)
+__sync_fetch_and_do (long long, 8, add)
+__sync_fetch_and_do (long long, 8, sub)
+__sync_fetch_and_do (long long, 8, or)
+__sync_fetch_and_do (long long, 8, and)
+__sync_fetch_and_do (long long, 8, xor)
+__sync_fetch_and_do (long long, 8, nand)
+
+#define __do_and_fetch(proto, type, size, opname, op, op2, top, bottom) \
+proto \
+{ \
+ top; \
+ type rv = op2 (arch_atomic_##opname(p, i) op i); \
+ bottom; \
+ return rv; \
+}
+
+#define __atomic_do_and_fetch(type, size, opname, op, op2) \
+ __do_and_fetch(type __atomic_##opname##_fetch_##size(type* p, type i, int model), \
+ type, size, opname, op, op2, \
+ pre_atomic_barrier(model), \
+ post_atomic_barrier(model)) \
+
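+/* For example, __atomic_do_and_fetch (int, 4, add, +, ) expands to
+ roughly the following, returning the updated value rather than the
+ old one (illustrative expansion only):
+
+ int __atomic_add_fetch_4 (int *p, int i, int model)
+ {
+ pre_atomic_barrier (model);
+ int rv = (arch_atomic_add (p, i) + i);
+ post_atomic_barrier (model);
+ return rv;
+ }
+*/
+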
+__atomic_do_and_fetch (int, 4, add, +, )
+__atomic_do_and_fetch (int, 4, sub, -, )
+__atomic_do_and_fetch (int, 4, or, |, )
+__atomic_do_and_fetch (int, 4, and, &, )
+__atomic_do_and_fetch (int, 4, xor, ^, )
+__atomic_do_and_fetch (int, 4, nand, &, ~)
+__atomic_do_and_fetch (long long, 8, add, +, )
+__atomic_do_and_fetch (long long, 8, sub, -, )
+__atomic_do_and_fetch (long long, 8, or, |, )
+__atomic_do_and_fetch (long long, 8, and, &, )
+__atomic_do_and_fetch (long long, 8, xor, ^, )
+__atomic_do_and_fetch (long long, 8, nand, &, ~)
+
+#define __sync_do_and_fetch(type, size, opname, op, op2) \
+ __do_and_fetch(type __sync_##opname##_and_fetch_##size(type* p, type i), \
+ type, size, opname, op, op2, \
+ arch_atomic_write_barrier(), \
+ arch_atomic_read_barrier()) \
+
+__sync_do_and_fetch (int, 4, add, +, )
+__sync_do_and_fetch (int, 4, sub, -, )
+__sync_do_and_fetch (int, 4, or, |, )
+__sync_do_and_fetch (int, 4, and, &, )
+__sync_do_and_fetch (int, 4, xor, ^, )
+__sync_do_and_fetch (int, 4, nand, &, ~)
+__sync_do_and_fetch (long long, 8, add, +, )
+__sync_do_and_fetch (long long, 8, sub, -, )
+__sync_do_and_fetch (long long, 8, or, |, )
+__sync_do_and_fetch (long long, 8, and, &, )
+__sync_do_and_fetch (long long, 8, xor, ^, )
+__sync_do_and_fetch (long long, 8, nand, &, ~)
+
+#define __atomic_exchange_methods(type, size) \
+bool \
+__atomic_compare_exchange_##size(volatile type* ptr, type* oldvalp, \
+ type newval, bool weak __unused, \
+ int models, int modelf __unused) \
+{ \
+ type oldval = *oldvalp; \
+ pre_atomic_barrier(models); \
+ type retval = arch_atomic_val_compare_and_exchange(ptr, oldval, newval); \
+ post_atomic_barrier(models); \
+ bool success = (retval == oldval); \
+ *oldvalp = retval; \
+ return success; \
+} \
+ \
+type \
+__atomic_exchange_##size(volatile type* ptr, type val, int model) \
+{ \
+ pre_atomic_barrier(model); \
+ type retval = arch_atomic_exchange(ptr, val); \
+ post_atomic_barrier(model); \
+ return retval; \
+}
+
+__atomic_exchange_methods (int, 4)
+__atomic_exchange_methods (long long, 8)
+
+#define __sync_exchange_methods(type, size) \
+type \
+__sync_val_compare_and_swap_##size(type* ptr, type oldval, type newval) \
+{ \
+ arch_atomic_write_barrier(); \
+ type retval = arch_atomic_val_compare_and_exchange(ptr, oldval, newval); \
+ arch_atomic_read_barrier(); \
+ return retval; \
+} \
+ \
+bool \
+__sync_bool_compare_and_swap_##size(type* ptr, type oldval, type newval) \
+{ \
+ arch_atomic_write_barrier(); \
+ bool retval = arch_atomic_bool_compare_and_exchange(ptr, oldval, newval); \
+ arch_atomic_read_barrier(); \
+ return retval; \
+} \
+ \
+type \
+__sync_lock_test_and_set_##size(type* ptr, type val) \
+{ \
+ type retval = arch_atomic_exchange(ptr, val); \
+ arch_atomic_acquire_barrier_value(retval); \
+ return retval; \
+}
+
+__sync_exchange_methods (int, 4)
+__sync_exchange_methods (long long, 8)
+
+#ifdef __LITTLE_ENDIAN__
+#define BIT_OFFSET(n, type) ((n) * 8)
+#else
+#define BIT_OFFSET(n, type) ((4 - sizeof(type) - (n)) * 8)
+#endif
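+
+/* For example, a 2-byte value at byte offset 2 within its aligned
+ 4-byte word gets BIT_OFFSET 16 on a little-endian chip (2 * 8) and 0
+ on a big-endian chip ((4 - 2 - 2) * 8). */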
+
+/* Subword methods require the same approach for both TILEPro and
+ TILE-Gx. We load the background data for the word, insert the
+ desired subword piece, then compare-and-exchange it into place. */
+#define u8 unsigned char
+#define u16 unsigned short
+
+#define __subword_cmpxchg_body(type, size, ptr, guess, val) \
+ ({ \
+ unsigned int *p = (unsigned int *)((unsigned long)ptr & ~3UL); \
+ const int shift = BIT_OFFSET((unsigned long)ptr & 3UL, type); \
+ const unsigned int valmask = (1 << (sizeof(type) * 8)) - 1; \
+ const unsigned int bgmask = ~(valmask << shift); \
+ unsigned int oldword = *p; \
+ type oldval = (oldword >> shift) & valmask; \
+ if (__builtin_expect((oldval == guess), 1)) { \
+ unsigned int word = (oldword & bgmask) | ((val & valmask) << shift); \
+ oldword = arch_atomic_val_compare_and_exchange(p, oldword, word); \
+ oldval = (oldword >> shift) & valmask; \
+ } \
+ oldval; \
+ }) \
+
+#define __atomic_subword_cmpxchg(type, size) \
+ \
+bool \
+__atomic_compare_exchange_##size(volatile type* ptr, type* guess_ptr, \
+ type val, bool weak __unused, int models, \
+ int modelf __unused) \
+{ \
+ pre_atomic_barrier(models); \
+ type guess = *guess_ptr; \
+ type oldval = __subword_cmpxchg_body(type, size, ptr, guess, val); \
+ post_atomic_barrier(models); \
+ bool success = (oldval == guess); \
+ *guess_ptr = oldval; \
+ return success; \
+}
+
+__atomic_subword_cmpxchg (u8, 1)
+__atomic_subword_cmpxchg (u16, 2)
+
+#define __sync_subword_cmpxchg(type, size) \
+ \
+type \
+__sync_val_compare_and_swap_##size(type* ptr, type guess, type val) \
+{ \
+ arch_atomic_write_barrier(); \
+ type oldval = __subword_cmpxchg_body(type, size, ptr, guess, val); \
+ arch_atomic_read_barrier(); \
+ return oldval; \
+} \
+ \
+bool \
+__sync_bool_compare_and_swap_##size(type* ptr, type guess, type val) \
+{ \
+ type oldval = __sync_val_compare_and_swap_##size(ptr, guess, val); \
+ return oldval == guess; \
+}
+
+__sync_subword_cmpxchg (u8, 1)
+__sync_subword_cmpxchg (u16, 2)
+
+/* For the atomic-update subword methods, we use the same approach as
+ above, but we retry until we succeed if the compare-and-exchange
+ fails. */
+#define __subword(type, proto, top, expr, bottom) \
+proto \
+{ \
+ top \
+ unsigned int *p = (unsigned int *)((unsigned long)ptr & ~3UL); \
+ const int shift = BIT_OFFSET((unsigned long)ptr & 3UL, type); \
+ const unsigned int valmask = (1 << (sizeof(type) * 8)) - 1; \
+ const unsigned int bgmask = ~(valmask << shift); \
+ unsigned int oldword, xword = *p; \
+ type val, oldval; \
+ do { \
+ oldword = xword; \
+ oldval = (oldword >> shift) & valmask; \
+ val = expr; \
+ unsigned int word = (oldword & bgmask) | ((val & valmask) << shift); \
+ xword = arch_atomic_val_compare_and_exchange(p, oldword, word); \
+ } while (__builtin_expect(xword != oldword, 0)); \
+ bottom \
+}
+
+#define __atomic_subword_fetch(type, funcname, expr, retval) \
+ __subword(type, \
+ type __atomic_ ## funcname(volatile type *ptr, type i, int model), \
+ pre_atomic_barrier(model);, \
+ expr, \
+ post_atomic_barrier(model); return retval;)
+
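+/* For example, __atomic_subword_fetch (u8, fetch_add_1, oldval + i, oldval)
+ defines __atomic_fetch_add_1 (), which updates one byte within its
+ containing word via compare-and-exchange and returns the old byte. */
+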
+__atomic_subword_fetch (u8, fetch_add_1, oldval + i, oldval)
+__atomic_subword_fetch (u8, fetch_sub_1, oldval - i, oldval)
+__atomic_subword_fetch (u8, fetch_or_1, oldval | i, oldval)
+__atomic_subword_fetch (u8, fetch_and_1, oldval & i, oldval)
+__atomic_subword_fetch (u8, fetch_xor_1, oldval ^ i, oldval)
+__atomic_subword_fetch (u8, fetch_nand_1, ~(oldval & i), oldval)
+
+__atomic_subword_fetch (u16, fetch_add_2, oldval + i, oldval)
+__atomic_subword_fetch (u16, fetch_sub_2, oldval - i, oldval)
+__atomic_subword_fetch (u16, fetch_or_2, oldval | i, oldval)
+__atomic_subword_fetch (u16, fetch_and_2, oldval & i, oldval)
+__atomic_subword_fetch (u16, fetch_xor_2, oldval ^ i, oldval)
+__atomic_subword_fetch (u16, fetch_nand_2, ~(oldval & i), oldval)
+
+__atomic_subword_fetch (u8, add_fetch_1, oldval + i, val)
+__atomic_subword_fetch (u8, sub_fetch_1, oldval - i, val)
+__atomic_subword_fetch (u8, or_fetch_1, oldval | i, val)
+__atomic_subword_fetch (u8, and_fetch_1, oldval & i, val)
+__atomic_subword_fetch (u8, xor_fetch_1, oldval ^ i, val)
+__atomic_subword_fetch (u8, nand_fetch_1, ~(oldval & i), val)
+
+__atomic_subword_fetch (u16, add_fetch_2, oldval + i, val)
+__atomic_subword_fetch (u16, sub_fetch_2, oldval - i, val)
+__atomic_subword_fetch (u16, or_fetch_2, oldval | i, val)
+__atomic_subword_fetch (u16, and_fetch_2, oldval & i, val)
+__atomic_subword_fetch (u16, xor_fetch_2, oldval ^ i, val)
+__atomic_subword_fetch (u16, nand_fetch_2, ~(oldval & i), val)
+
+#define __sync_subword_fetch(type, funcname, expr, retval) \
+ __subword(type, \
+ type __sync_ ## funcname(type *ptr, type i), \
+ arch_atomic_read_barrier();, \
+ expr, \
+ arch_atomic_write_barrier(); return retval;)
+
+__sync_subword_fetch (u8, fetch_and_add_1, oldval + i, oldval)
+__sync_subword_fetch (u8, fetch_and_sub_1, oldval - i, oldval)
+__sync_subword_fetch (u8, fetch_and_or_1, oldval | i, oldval)
+__sync_subword_fetch (u8, fetch_and_and_1, oldval & i, oldval)
+__sync_subword_fetch (u8, fetch_and_xor_1, oldval ^ i, oldval)
+__sync_subword_fetch (u8, fetch_and_nand_1, ~(oldval & i), oldval)
+
+__sync_subword_fetch (u16, fetch_and_add_2, oldval + i, oldval)
+__sync_subword_fetch (u16, fetch_and_sub_2, oldval - i, oldval)
+__sync_subword_fetch (u16, fetch_and_or_2, oldval | i, oldval)
+__sync_subword_fetch (u16, fetch_and_and_2, oldval & i, oldval)
+__sync_subword_fetch (u16, fetch_and_xor_2, oldval ^ i, oldval)
+__sync_subword_fetch (u16, fetch_and_nand_2, ~(oldval & i), oldval)
+
+__sync_subword_fetch (u8, add_and_fetch_1, oldval + i, val)
+__sync_subword_fetch (u8, sub_and_fetch_1, oldval - i, val)
+__sync_subword_fetch (u8, or_and_fetch_1, oldval | i, val)
+__sync_subword_fetch (u8, and_and_fetch_1, oldval & i, val)
+__sync_subword_fetch (u8, xor_and_fetch_1, oldval ^ i, val)
+__sync_subword_fetch (u8, nand_and_fetch_1, ~(oldval & i), val)
+
+__sync_subword_fetch (u16, add_and_fetch_2, oldval + i, val)
+__sync_subword_fetch (u16, sub_and_fetch_2, oldval - i, val)
+__sync_subword_fetch (u16, or_and_fetch_2, oldval | i, val)
+__sync_subword_fetch (u16, and_and_fetch_2, oldval & i, val)
+__sync_subword_fetch (u16, xor_and_fetch_2, oldval ^ i, val)
+__sync_subword_fetch (u16, nand_and_fetch_2, ~(oldval & i), val)
+
+#define __atomic_subword_lock(type, size) \
+ __subword(type, \
+ type __atomic_exchange_##size(volatile type* ptr, type nval, int model), \
+ pre_atomic_barrier(model);, \
+ nval, \
+ post_atomic_barrier(model); return oldval;)
+
+__atomic_subword_lock (u8, 1)
+__atomic_subword_lock (u16, 2)
+
+#define __sync_subword_lock(type, size) \
+ __subword(type, \
+ type __sync_lock_test_and_set_##size(type* ptr, type nval), \
+ , \
+ nval, \
+ arch_atomic_acquire_barrier_value(oldval); return oldval;)
+
+__sync_subword_lock (u8, 1)
+__sync_subword_lock (u16, 2)
diff --git a/gcc-4.9/libgcc/config/tilepro/atomic.h b/gcc-4.9/libgcc/config/tilepro/atomic.h
new file mode 100644
index 000000000..404e15ee2
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/atomic.h
@@ -0,0 +1,435 @@
+/* Macros for atomic functionality for tile.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Contributed by Walter Lee (walt@tilera.com)
+
+ This file is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 3, or (at your option) any
+ later version.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+/* Provides macros for common atomic functionality. */
+
+#ifndef _ATOMIC_H_
+#define _ATOMIC_H_
+
+#ifdef __tilegx__
+/* Atomic instruction macros
+
+ The macros provided by atomic.h simplify access to the TILE-Gx
+ architecture's atomic instructions. The architecture provides a
+ variety of atomic instructions, including "exchange", "compare and
+ exchange", "fetch and ADD", "fetch and AND", "fetch and OR", and
+ "fetch and ADD if greater than or equal to zero".
+
+ No barrier or fence semantics are implied by any of the atomic
+ instructions for manipulating memory; you must specify the barriers
+ that you wish explicitly, using the provided macros.
+
+ Any integral 32- or 64-bit value can be used as the argument
+ to these macros, such as "int", "long long", "unsigned long", etc.
+ The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
+ The "exchange" and "compare and exchange" macros may also take
+ pointer values. We use the pseudo-type "VAL" in the documentation
+ to indicate the use of an appropriate type. */
+#else
+/* Atomic instruction macros
+
+ The macros provided by atomic.h simplify access to the Tile
+ architecture's atomic instructions. Since the architecture
+ supports test-and-set as its only in-silicon atomic operation, many
+ of the operations provided by this header are implemented as
+ fast-path calls to Linux emulation routines.
+
+ Using the kernel for atomic operations allows userspace to take
+ advantage of the kernel's existing atomic-integer support (managed
+ by a distributed array of locks). The kernel provides proper
+ ordering among simultaneous atomic operations on different cores,
+ and guarantees a process can not be context-switched part way
+ through an atomic operation. By virtue of sharing the kernel
+ atomic implementation, the userspace atomic operations
+ are compatible with the atomic methods provided by the kernel's
+ futex() syscall API. Note that these operations never cause Linux
+ kernel scheduling, and are in fact invisible to the kernel; they
+ simply act as regular function calls but with an elevated privilege
+ level. Note that the kernel's distributed lock array is hashed by
+ using only VA bits from the atomic value's address (to avoid the
+ performance hit of page table locking and multiple page-table
+ lookups to get the PA) and only the VA bits that are below page
+ granularity (to properly lock simultaneous accesses to the same
+ page mapped at different VAs). As a result, simultaneous atomic
+ operations on values whose addresses are at the same offset on a
+ page will contend in the kernel for the same lock array element.
+
+ No barrier or fence semantics are implied by any of the atomic
+ instructions for manipulating memory; you must specify the barriers
+ that you wish explicitly, using the provided macros.
+
+ Any integral 32- or 64-bit value can be used as the argument
+ to these macros, such as "int", "long long", "unsigned long", etc.
+ The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
+ The "exchange" and "compare and exchange" macros may also take
+ pointer values. We use the pseudo-type "VAL" in the documentation
+ to indicate the use of an appropriate type.
+
+ The 32-bit routines are implemented using a single kernel fast
+ syscall, as is the 64-bit compare-and-exchange. The other 64-bit
+ routines are implemented by looping over the 64-bit
+ compare-and-exchange routine, so may be potentially less efficient. */
+#endif
+
+#ifdef __tilegx__
+#include <arch/spr_def.h>
+#else
+#include <asm/unistd.h>
+#endif
+
+
+/* 32-bit integer compare-and-exchange. */
+static __inline __attribute__ ((always_inline))
+ int arch_atomic_val_compare_and_exchange_4 (volatile int *mem,
+ int oldval, int newval)
+{
+#ifdef __tilegx__
+ __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
+ return __insn_cmpexch4 (mem, newval);
+#else
+ int result;
+ __asm__ __volatile__ ("swint1":"=R00" (result),
+ "=m" (*mem):"R10" (__NR_FAST_cmpxchg), "R00" (mem),
+ "R01" (oldval), "R02" (newval), "m" (*mem):"r20",
+ "r21", "r22", "r23", "r24", "r25", "r26", "r27",
+ "r28", "r29", "memory");
+ return result;
+#endif
+}
+
+/* 64-bit integer compare-and-exchange. */
+static __inline __attribute__ ((always_inline))
+ long long arch_atomic_val_compare_and_exchange_8 (volatile long long
+ *mem, long long oldval,
+ long long newval)
+{
+#ifdef __tilegx__
+ __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
+ return __insn_cmpexch (mem, newval);
+#else
+ unsigned int result_lo, result_hi;
+ unsigned int oldval_lo = oldval & 0xffffffffu, oldval_hi = oldval >> 32;
+ unsigned int newval_lo = newval & 0xffffffffu, newval_hi = newval >> 32;
+ __asm__ __volatile__ ("swint1":"=R00" (result_lo), "=R01" (result_hi),
+ "=m" (*mem):"R10" (__NR_FAST_cmpxchg64), "R00" (mem),
+ "R02" (oldval_lo), "R03" (oldval_hi),
+ "R04" (newval_lo), "R05" (newval_hi),
+ "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
+ "r26", "r27", "r28", "r29", "memory");
+ return ((long long) result_hi) << 32 | result_lo;
+#endif
+}
+
+/* This non-existent symbol is called for sizes other than "4" and "8",
+ indicating a bug in the caller. */
+extern int __arch_atomic_error_bad_argument_size (void)
+ __attribute__ ((warning ("sizeof atomic argument not 4 or 8")));
+
+
+#define arch_atomic_val_compare_and_exchange(mem, o, n) \
+ __extension__ ({ \
+ (__typeof(*(mem)))(__typeof(*(mem)-*(mem))) \
+ ((sizeof(*(mem)) == 8) ? \
+ arch_atomic_val_compare_and_exchange_8( \
+ (volatile long long*)(mem), (__typeof((o)-(o)))(o), \
+ (__typeof((n)-(n)))(n)) : \
+ (sizeof(*(mem)) == 4) ? \
+ arch_atomic_val_compare_and_exchange_4( \
+ (volatile int*)(mem), (__typeof((o)-(o)))(o), \
+ (__typeof((n)-(n)))(n)) : \
+ __arch_atomic_error_bad_argument_size()); \
+ })
+
+#define arch_atomic_bool_compare_and_exchange(mem, o, n) \
+ __extension__ ({ \
+ __typeof(o) __o = (o); \
+ __builtin_expect( \
+ __o == arch_atomic_val_compare_and_exchange((mem), __o, (n)), 1); \
+ })
+
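+/* Illustrative use (not part of this header): a simple test-and-set
+ style lock acquire might spin with
+
+ while (!arch_atomic_bool_compare_and_exchange (&lock, 0, 1))
+ ;
+ arch_atomic_acquire_barrier ();
+
+ assuming "lock" is a suitably aligned int that is 0 when free. */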
+
+/* Loop with compare_and_exchange until we guess the correct value.
+ Normally "expr" will be an expression using __old and __value. */
+#define __arch_atomic_update_cmpxchg(mem, value, expr) \
+ __extension__ ({ \
+ __typeof(value) __value = (value); \
+ __typeof(*(mem)) *__mem = (mem), __old = *__mem, __guess; \
+ do { \
+ __guess = __old; \
+ __old = arch_atomic_val_compare_and_exchange(__mem, __old, (expr)); \
+ } while (__builtin_expect(__old != __guess, 0)); \
+ __old; \
+ })
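+
+/* For example, __arch_atomic_update_cmpxchg (p, m, __old ^ __value)
+ atomically xors *p with m and evaluates to the previous value of *p;
+ arch_atomic_xor() below is implemented exactly this way. */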
+
+#ifdef __tilegx__
+
+/* Generic atomic op with 8- or 4-byte variant.
+ The _mask, _addend, and _expr arguments are ignored on tilegx. */
+#define __arch_atomic_update(mem, value, op, _mask, _addend, _expr) \
+ __extension__ ({ \
+ ((__typeof(*(mem))) \
+ ((sizeof(*(mem)) == 8) ? (__typeof(*(mem)-*(mem)))__insn_##op( \
+ (volatile void *)(mem), \
+ (long long)(__typeof((value)-(value)))(value)) : \
+ (sizeof(*(mem)) == 4) ? (int)__insn_##op##4( \
+ (volatile void *)(mem), \
+ (int)(__typeof((value)-(value)))(value)) : \
+ __arch_atomic_error_bad_argument_size())); \
+ })
+
+#else
+
+/* This uses TILEPro's fast syscall support to atomically compute:
+
+ int old = *ptr;
+ *ptr = (old & mask) + addend;
+ return old;
+
+ This primitive can be used for atomic exchange, add, or, and.
+ Only 32-bit support is provided. */
+static __inline __attribute__ ((always_inline))
+ int
+ __arch_atomic_update_4 (volatile int *mem, int mask, int addend)
+{
+ int result;
+ __asm__ __volatile__ ("swint1":"=R00" (result),
+ "=m" (*mem):"R10" (__NR_FAST_atomic_update),
+ "R00" (mem), "R01" (mask), "R02" (addend),
+ "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
+ "r26", "r27", "r28", "r29", "memory");
+ return result;
+}
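+
+/* The (mask, addend) pairs used below encode the common operations:
+ (0, n) stores n (exchange), (-1, n) adds n, (m, 0) ands with m, and
+ (~m, m) ors with m, since (old & ~m) + m equals old | m when the bits
+ being added are known to be clear. */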
+
+/* Generic atomic op with 8- or 4-byte variant.
+ The _op argument is ignored on tilepro. */
+#define __arch_atomic_update(mem, value, _op, mask, addend, expr) \
+ __extension__ ({ \
+ (__typeof(*(mem)))(__typeof(*(mem)-*(mem))) \
+ ((sizeof(*(mem)) == 8) ? \
+ __arch_atomic_update_cmpxchg((mem), (value), (expr)) : \
+ (sizeof(*(mem)) == 4) ? \
+ __arch_atomic_update_4((volatile int*)(mem), \
+ (__typeof((mask)-(mask)))(mask), \
+ (__typeof((addend)-(addend)))(addend)) : \
+ __arch_atomic_error_bad_argument_size()); \
+ })
+
+#endif /* __tilegx__ */
+
+
+#define arch_atomic_exchange(mem, newvalue) \
+ __arch_atomic_update(mem, newvalue, exch, 0, newvalue, __value)
+
+#define arch_atomic_add(mem, value) \
+ __arch_atomic_update(mem, value, fetchadd, -1, value, __old + __value)
+
+#define arch_atomic_sub(mem, value) arch_atomic_add((mem), -(value))
+
+#define arch_atomic_increment(mem) arch_atomic_add((mem), 1)
+
+#define arch_atomic_decrement(mem) arch_atomic_add((mem), -1)
+
+#define arch_atomic_and(mem, mask) \
+ __arch_atomic_update(mem, mask, fetchand, mask, 0, __old & __value)
+
+#define arch_atomic_or(mem, mask) \
+ __arch_atomic_update(mem, mask, fetchor, ~mask, mask, __old | __value)
+
+#define arch_atomic_xor(mem, mask) \
+ __arch_atomic_update_cmpxchg(mem, mask, __old ^ __value)
+
+#define arch_atomic_nand(mem, mask) \
+ __arch_atomic_update_cmpxchg(mem, mask, ~(__old & __value))
+
+#define arch_atomic_bit_set(mem, bit) \
+ __extension__ ({ \
+ __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit); \
+ __mask & arch_atomic_or((mem), __mask); \
+ })
+
+#define arch_atomic_bit_clear(mem, bit) \
+ __extension__ ({ \
+ __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit); \
+ __mask & arch_atomic_and((mem), ~__mask); \
+ })
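+
+/* For example, arch_atomic_bit_set (&flags, 3) atomically sets bit 3
+ of a hypothetical variable "flags" and evaluates to nonzero iff that
+ bit was already set. */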
+
+#ifdef __tilegx__
+/* Atomically store a new value to memory.
+ Note that you can freely use types of any size here, unlike the
+ other atomic routines, which require 32- or 64-bit types.
+ This accessor is provided for compatibility with TILEPro, which
+ required an explicit atomic operation for stores that needed
+ to be atomic with respect to other atomic methods in this header. */
+#define arch_atomic_write(mem, value) ((void) (*(mem) = (value)))
+#else
+#define arch_atomic_write(mem, value) \
+ do { \
+ __typeof(mem) __aw_mem = (mem); \
+ __typeof(value) __aw_val = (value); \
+ unsigned int *__aw_mem32, __aw_intval, __aw_val32, __aw_off, __aw_mask; \
+ __aw_intval = (__typeof((value) - (value)))__aw_val; \
+ switch (sizeof(*__aw_mem)) { \
+ case 8: \
+ __arch_atomic_update_cmpxchg(__aw_mem, __aw_val, __value); \
+ break; \
+ case 4: \
+ __arch_atomic_update_4((int *)__aw_mem, 0, __aw_intval); \
+ break; \
+ case 2: \
+ __aw_off = 8 * ((long)__aw_mem & 0x2); \
+ __aw_mask = 0xffffU << __aw_off; \
+ __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x2); \
+ __aw_val32 = (__aw_intval << __aw_off) & __aw_mask; \
+ __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32, \
+ (__old & ~__aw_mask) | __value); \
+ break; \
+ case 1: \
+ __aw_off = 8 * ((long)__aw_mem & 0x3); \
+ __aw_mask = 0xffU << __aw_off; \
+ __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x3); \
+ __aw_val32 = (__aw_intval << __aw_off) & __aw_mask; \
+ __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32, \
+ (__old & ~__aw_mask) | __value); \
+ break; \
+ } \
+ } while (0)
+#endif
+
+/* Compiler barrier.
+
+ This macro prevents loads or stores from being moved by the compiler
+ across the macro. Any loaded value that was loaded before this
+ macro must then be reloaded by the compiler. */
+#define arch_atomic_compiler_barrier() __asm__ __volatile__("" ::: "memory")
+
+/* Full memory barrier.
+
+ This macro has the semantics of arch_atomic_compiler_barrier(), but also
+ ensures that previous stores are visible to other cores, and that
+ all previous loaded values have been placed into their target
+ register on this core. */
+#define arch_atomic_full_barrier() __insn_mf()
+
+/* Read memory barrier.
+
+ Ensure that all reads by this processor that occurred prior to the
+ read memory barrier have completed, and that no reads that occur
+ after the read memory barrier on this processor are initiated
+ before the barrier.
+
+ On current TILE chips a read barrier is implemented as a full barrier,
+ but this may not be true in later versions of the architecture.
+
+ See also arch_atomic_acquire_barrier() for the appropriate idiom to use
+ to ensure no reads are lifted above an atomic lock instruction. */
+#define arch_atomic_read_barrier() arch_atomic_full_barrier()
+
+/* Write memory barrier.
+
+ Ensure that all writes by this processor that occurred prior to the
+ write memory barrier have completed, and that no writes that occur
+ after the write memory barrier on this processor are initiated
+ before the barrier.
+
+ On current TILE chips a write barrier is implemented as a full barrier,
+ but this may not be true in later versions of the architecture.
+
+ See also arch_atomic_release_barrier() for the appropriate idiom to use
+ to ensure all writes are complete prior to an atomic unlock instruction. */
+#define arch_atomic_write_barrier() arch_atomic_full_barrier()
+
+/* Lock acquisition barrier.
+
+ Ensure that no load operations that follow this macro in the
+ program can issue prior to the barrier. Without such a barrier,
+ the compiler can reorder them to issue earlier, or the hardware can
+ issue them speculatively. The latter is not currently done in the
+ Tile microarchitecture, but using this operation improves
+ portability to future implementations.
+
+ This operation is intended to be used as part of the "acquire"
+ path for locking, that is, when entering a critical section.
+ This should be done after the atomic operation that actually
+ acquires the lock, and in conjunction with a "control dependency"
+ that checks the atomic operation result to see if the lock was
+ in fact acquired. See the arch_atomic_read_barrier() macro
+ for a heavier-weight barrier to use in certain unusual constructs,
+ or arch_atomic_acquire_barrier_value() if no control dependency exists. */
+#define arch_atomic_acquire_barrier() arch_atomic_compiler_barrier()
+
+/* Lock release barrier.
+
+ Ensure that no store operations that precede this macro in the
+ program complete subsequent to the barrier. Without such a
+ barrier, the compiler can reorder stores to issue later, or stores
+ can be still outstanding in the memory network.
+
+ This operation is intended to be used as part of the "release" path
+ for locking, that is, when leaving a critical section. This should
+ be done before the operation (such as a store of zero) that
+ actually releases the lock. */
+#define arch_atomic_release_barrier() arch_atomic_write_barrier()
+
+/* Barrier until the read of a particular value is complete.
+
+ This is occasionally useful when constructing certain locking
+ scenarios. For example, you might write a routine that issues an
+ atomic instruction to enter a critical section, then reads one or
+ more values within the critical section without checking to see if
+ the critical section was in fact acquired, and only later checks
+ the atomic instruction result to see if the lock was acquired. If
+ so the routine could properly release the lock and know that the
+ values that were read were valid.
+
+ In this scenario, it is required to wait for the result of the
+ atomic instruction, even if the value itself is not checked. This
+ guarantees that if the atomic instruction succeeded in taking the lock,
+ the lock was held before any reads in the critical section issued. */
+#define arch_atomic_acquire_barrier_value(val) \
+ __asm__ __volatile__("move %0, %0" :: "r"(val))
+
+/* Access the given variable in memory exactly once.
+
+ In some contexts, an algorithm may need to force access to memory,
+ since otherwise the compiler may think it can optimize away a
+ memory load or store; for example, in a loop when polling memory to
+ see if another cpu has updated it yet. Generally this is only
+ required for certain very carefully hand-tuned algorithms; using it
+ unnecessarily may result in performance losses.
+
+ A related use of this macro is to ensure that the compiler does not
+ rematerialize the value of "x" by reloading it from memory
+ unexpectedly; the "volatile" marking will prevent the compiler from
+ being able to rematerialize. This is helpful if an algorithm needs
+ to read a variable without locking, but needs it to have the same
+ value if it ends up being used several times within the algorithm.
+
+ Note that multiple uses of this macro are guaranteed to be ordered,
+ i.e. the compiler will not reorder stores or loads that are wrapped
+ in arch_atomic_access_once(). */
+#define arch_atomic_access_once(x) (*(volatile __typeof(x) *)&(x))
+
+
+
+#endif /* !_ATOMIC_H_ */
diff --git a/gcc-4.9/libgcc/config/tilepro/linux-unwind.h b/gcc-4.9/libgcc/config/tilepro/linux-unwind.h
new file mode 100644
index 000000000..27481cfcd
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/linux-unwind.h
@@ -0,0 +1,99 @@
+/* DWARF2 EH unwinding support for TILEPro.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Contributed by Walter Lee (walt@tilera.com)
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
+#ifndef inhibit_libc
+
+#include <arch/abi.h>
+#include <signal.h>
+#include <sys/ucontext.h>
+#include <linux/unistd.h>
+
+/* Macro to define a copy of the kernel's __rt_sigreturn function
+ (in arch/tile/kernel/entry.S). If that function is changed,
+ this one needs to be changed to match it. */
+#define _sigreturn_asm(REG, NR) asm( \
+ ".pushsection .text.__rt_sigreturn,\"a\"\n" \
+ ".global __rt_sigreturn\n" \
+ ".type __rt_sigreturn,@function\n" \
+ "__rt_sigreturn:\n" \
+ "moveli " #REG ", " #NR "\n" \
+ "swint1\n" \
+ ".size __rt_sigreturn, . - __rt_sigreturn\n" \
+ ".popsection")
+#define sigreturn_asm(REG, NR) _sigreturn_asm(REG, NR)
+sigreturn_asm (TREG_SYSCALL_NR_NAME, __NR_rt_sigreturn);
+#define SIGRETURN_LEN 16
+extern char __rt_sigreturn[];
+
+#define MD_FALLBACK_FRAME_STATE_FOR tile_fallback_frame_state
+
+static _Unwind_Reason_Code
+tile_fallback_frame_state (struct _Unwind_Context *context,
+ _Unwind_FrameState *fs)
+{
+ unsigned char *pc = context->ra;
+ struct sigcontext *sc;
+ long new_cfa;
+ int i;
+
+ struct rt_sigframe {
+ unsigned char save_area[C_ABI_SAVE_AREA_SIZE];
+ siginfo_t info;
+ struct ucontext uc;
+ } *rt_;
+
+ /* Return if this is not a signal handler. */
+ if (memcmp (pc, __rt_sigreturn, SIGRETURN_LEN) != 0)
+ return _URC_END_OF_STACK;
+
+ /* It was a signal handler; update the reported PC to point to our
+ copy, since that will be findable with dladdr() and therefore
+ makes it somewhat easier to understand what actually happened. */
+ context->ra = __rt_sigreturn;
+
+ rt_ = context->cfa;
+ sc = &rt_->uc.uc_mcontext;
+
+ new_cfa = sc->sp;
+ fs->regs.cfa_how = CFA_REG_OFFSET;
+ fs->regs.cfa_reg = STACK_POINTER_REGNUM;
+ fs->regs.cfa_offset = new_cfa - (long) context->cfa;
+
+ for (i = 0; i < 56; ++i)
+ {
+ fs->regs.reg[i].how = REG_SAVED_OFFSET;
+ fs->regs.reg[i].loc.offset
+ = (long)&sc->gregs[i] - new_cfa;
+ }
+
+ fs->regs.reg[56].how = REG_SAVED_OFFSET;
+ fs->regs.reg[56].loc.offset = (long)&sc->pc - new_cfa;
+ fs->retaddr_column = 56;
+ fs->signal_frame = 1;
+
+ return _URC_NO_REASON;
+}
+
+#endif /* ifndef inhibit_libc */
diff --git a/gcc-4.9/libgcc/config/tilepro/sfp-machine.h b/gcc-4.9/libgcc/config/tilepro/sfp-machine.h
new file mode 100644
index 000000000..6953d8d8d
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/sfp-machine.h
@@ -0,0 +1,59 @@
+#define _FP_W_TYPE_SIZE 32
+#define _FP_W_TYPE unsigned long
+#define _FP_WS_TYPE signed long
+#define _FP_I_TYPE long
+
+/* The type of the result of a floating point comparison. This must
+ match `__libgcc_cmp_return__' in GCC for the target. */
+typedef int __gcc_CMPtype __attribute__ ((mode (__libgcc_cmp_return__)));
+#define CMPtype __gcc_CMPtype
+
+#define _FP_MUL_MEAT_S(R,X,Y) \
+ _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_S,R,X,Y,umul_ppmm)
+#define _FP_MUL_MEAT_D(R,X,Y) \
+ _FP_MUL_MEAT_2_wide(_FP_WFRACBITS_D,R,X,Y,umul_ppmm)
+#define _FP_MUL_MEAT_Q(R,X,Y) \
+ _FP_MUL_MEAT_4_wide(_FP_WFRACBITS_Q,R,X,Y,umul_ppmm)
+
+#define _FP_DIV_MEAT_S(R,X,Y) _FP_DIV_MEAT_1_loop(S,R,X,Y)
+#define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_2_udiv(D,R,X,Y)
+#define _FP_DIV_MEAT_Q(R,X,Y) _FP_DIV_MEAT_4_udiv(Q,R,X,Y)
+
+#define _FP_NANFRAC_S _FP_QNANBIT_S
+#define _FP_NANFRAC_D _FP_QNANBIT_D, 0
+#define _FP_NANFRAC_Q _FP_QNANBIT_Q, 0, 0, 0
+#define _FP_NANSIGN_S 1
+#define _FP_NANSIGN_D 1
+#define _FP_NANSIGN_Q 1
+
+#define _FP_KEEPNANFRACP 1
+#define _FP_QNANNEGATEDP 0
+
+#define _FP_CHOOSENAN(fs, wc, R, X, Y, OP) \
+ do { \
+ if ((_FP_FRAC_HIGH_RAW_##fs(X) & _FP_QNANBIT_##fs) \
+ && !(_FP_FRAC_HIGH_RAW_##fs(Y) & _FP_QNANBIT_##fs)) \
+ { \
+ R##_s = Y##_s; \
+ _FP_FRAC_COPY_##wc(R,Y); \
+ } \
+ else \
+ { \
+ R##_s = X##_s; \
+ _FP_FRAC_COPY_##wc(R,X); \
+ } \
+ R##_c = FP_CLS_NAN; \
+ } while (0)
+
+#define _FP_TININESS_AFTER_ROUNDING 0
+
+#define __LITTLE_ENDIAN 1234
+#define __BIG_ENDIAN 4321
+
+#define __BYTE_ORDER __LITTLE_ENDIAN
+
+/* Define ALIASNAME as a strong alias for NAME. */
+# define strong_alias(name, aliasname) _strong_alias(name, aliasname)
+# define _strong_alias(name, aliasname) \
+ extern __typeof (name) aliasname __attribute__ ((alias (#name)));
+
diff --git a/gcc-4.9/libgcc/config/tilepro/softdivide.c b/gcc-4.9/libgcc/config/tilepro/softdivide.c
new file mode 100644
index 000000000..8a539f467
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/softdivide.c
@@ -0,0 +1,353 @@
+/* Division and remainder routines for Tile.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Contributed by Walter Lee (walt@tilera.com)
+
+ This file is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 3, or (at your option) any
+ later version.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+typedef int int32_t;
+typedef unsigned uint32_t;
+typedef long long int64_t;
+typedef unsigned long long uint64_t;
+
+/* Raise signal 8 (SIGFPE) with code 1 (FPE_INTDIV). */
+static inline void
+raise_intdiv (void)
+{
+ asm ("{ raise; moveli zero, 8 + (1 << 6) }");
+}
+
+
+#ifndef __tilegx__
+/* __udivsi3 - 32 bit integer unsigned divide */
+static inline uint32_t __attribute__ ((always_inline))
+__udivsi3_inline (uint32_t dividend, uint32_t divisor)
+{
+ /* Divide out any power of two factor from dividend and divisor.
+ Note that when dividing by zero the divisor will remain zero,
+ which is all we need to detect that case below. */
+ const int power_of_two_factor = __insn_ctz (divisor);
+ divisor >>= power_of_two_factor;
+ dividend >>= power_of_two_factor;
+
+ /* Checks for division by power of two or division by zero. */
+ if (divisor <= 1)
+ {
+ if (divisor == 0)
+ {
+ raise_intdiv ();
+ return 0;
+ }
+ return dividend;
+ }
+
+ /* Compute (a / b) by repeatedly finding the largest N
+ such that (b << N) <= a. For each such N, set bit N in the
+ quotient, subtract (b << N) from a, and keep going. Think of this as
+ the reverse of the "shift-and-add" that a multiply does. The values
+ of N are precisely those shift counts.
+
+ Finding N is easy. First, use clz(b) - clz(a) to find the N
+ that lines up the high bit of (b << N) with the high bit of a.
+ Any larger value of N would definitely make (b << N) > a,
+ which is too big.
+
+ Then, if (b << N) > a (because it has larger low bits), decrement
+ N by one. This adjustment will definitely make (b << N) less
+ than a, because a's high bit is now one higher than b's. */
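+
+ /* Worked example (illustrative): for 100 / 7, clz(7) - clz(100) is
+ 29 - 25 = 4, but 7 << 4 = 112 > 100, so N becomes 3 and 56 is
+ subtracted (quotient bit 3, remainder 44); then N = 2 subtracts 28
+ (remainder 16); then N = 1 subtracts 14 (remainder 2 < 7), giving
+ quotient 0b1110 = 14, remainder 2. */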
+
+ /* Precomputing the max_ values allows us to avoid a subtract
+ in the inner loop and just right shift by clz(remainder). */
+ const int divisor_clz = __insn_clz (divisor);
+ const uint32_t max_divisor = divisor << divisor_clz;
+ const uint32_t max_qbit = 1 << divisor_clz;
+
+ uint32_t quotient = 0;
+ uint32_t remainder = dividend;
+
+ while (remainder >= divisor)
+ {
+ int shift = __insn_clz (remainder);
+ uint32_t scaled_divisor = max_divisor >> shift;
+ uint32_t quotient_bit = max_qbit >> shift;
+
+ int too_big = (scaled_divisor > remainder);
+ scaled_divisor >>= too_big;
+ quotient_bit >>= too_big;
+ remainder -= scaled_divisor;
+ quotient |= quotient_bit;
+ }
+ return quotient;
+}
+#endif /* !__tilegx__ */
+
+
+/* __udivdi3 - 64 bit integer unsigned divide */
+static inline uint64_t __attribute__ ((always_inline))
+__udivdi3_inline (uint64_t dividend, uint64_t divisor)
+{
+ /* Divide out any power of two factor from dividend and divisor.
+ Note that when dividing by zero the divisor will remain zero,
+ which is all we need to detect that case below. */
+ const int power_of_two_factor = __builtin_ctzll (divisor);
+ divisor >>= power_of_two_factor;
+ dividend >>= power_of_two_factor;
+
+ /* Checks for division by power of two or division by zero. */
+ if (divisor <= 1)
+ {
+ if (divisor == 0)
+ {
+ raise_intdiv ();
+ return 0;
+ }
+ return dividend;
+ }
+
+#ifndef __tilegx__
+ if (((uint32_t) (dividend >> 32) | ((uint32_t) (divisor >> 32))) == 0)
+ {
+ /* Operands both fit in 32 bits, so use faster 32 bit algorithm. */
+ return __udivsi3_inline ((uint32_t) dividend, (uint32_t) divisor);
+ }
+#endif /* !__tilegx__ */
+
+ /* See algorithm description in __udivsi3 */
+
+ const int divisor_clz = __builtin_clzll (divisor);
+ const uint64_t max_divisor = divisor << divisor_clz;
+ const uint64_t max_qbit = 1ULL << divisor_clz;
+
+ uint64_t quotient = 0;
+ uint64_t remainder = dividend;
+
+ while (remainder >= divisor)
+ {
+ int shift = __builtin_clzll (remainder);
+ uint64_t scaled_divisor = max_divisor >> shift;
+ uint64_t quotient_bit = max_qbit >> shift;
+
+ int too_big = (scaled_divisor > remainder);
+ scaled_divisor >>= too_big;
+ quotient_bit >>= too_big;
+ remainder -= scaled_divisor;
+ quotient |= quotient_bit;
+ }
+ return quotient;
+}
+
+
+#ifndef __tilegx__
+/* __umodsi3 - 32 bit integer unsigned modulo */
+static inline uint32_t __attribute__ ((always_inline))
+__umodsi3_inline (uint32_t dividend, uint32_t divisor)
+{
+ /* Shortcircuit mod by a power of two (and catch mod by zero). */
+ const uint32_t mask = divisor - 1;
+ if ((divisor & mask) == 0)
+ {
+ if (divisor == 0)
+ {
+ raise_intdiv ();
+ return 0;
+ }
+ return dividend & mask;
+ }
+
+ /* We compute the remainder (a % b) by repeatedly subtracting off
+ multiples of b from a until a < b. The key is that subtracting
+ off a multiple of b does not affect the result mod b.
+
+ To make the algorithm run efficiently, we need to subtract
+ off a large multiple of b at each step. We subtract the largest
+ (b << N) that is <= a.
+
+ Finding N is easy. First, use clz(b) - clz(a) to find the N
+ that lines up the high bit of (b << N) with the high bit of a.
+ Any larger value of N would definitely make (b << N) > a,
+ which is too big.
+
+ Then, if (b << N) > a (because it has larger low bits), decrement
+ N by one. This adjustment will definitely make (b << N) less
+ than a, because a's high bit is now one higher than b's. */
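+
+ /* Worked example (illustrative): 100 % 7 subtracts 56, then 28, then
+ 14, leaving remainder 2, without ever forming quotient bits. */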
+ const uint32_t max_divisor = divisor << __insn_clz (divisor);
+
+ uint32_t remainder = dividend;
+ while (remainder >= divisor)
+ {
+ const int shift = __insn_clz (remainder);
+ uint32_t scaled_divisor = max_divisor >> shift;
+ scaled_divisor >>= (scaled_divisor > remainder);
+ remainder -= scaled_divisor;
+ }
+
+ return remainder;
+}
+#endif /* !__tilegx__ */
+
+
+/* __umoddi3 - 64 bit integer unsigned modulo */
+static inline uint64_t __attribute__ ((always_inline))
+__umoddi3_inline (uint64_t dividend, uint64_t divisor)
+{
+#ifndef __tilegx__
+ if (((uint32_t) (dividend >> 32) | ((uint32_t) (divisor >> 32))) == 0)
+ {
+ /* Operands both fit in 32 bits, so use faster 32 bit algorithm. */
+ return __umodsi3_inline ((uint32_t) dividend, (uint32_t) divisor);
+ }
+#endif /* !__tilegx__ */
+
+ /* Shortcircuit mod by a power of two (and catch mod by zero). */
+ const uint64_t mask = divisor - 1;
+ if ((divisor & mask) == 0)
+ {
+ if (divisor == 0)
+ {
+ raise_intdiv ();
+ return 0;
+ }
+ return dividend & mask;
+ }
+
+ /* See algorithm description in __umodsi3 */
+ const uint64_t max_divisor = divisor << __builtin_clzll (divisor);
+
+ uint64_t remainder = dividend;
+ while (remainder >= divisor)
+ {
+ const int shift = __builtin_clzll (remainder);
+ uint64_t scaled_divisor = max_divisor >> shift;
+ scaled_divisor >>= (scaled_divisor > remainder);
+ remainder -= scaled_divisor;
+ }
+
+ return remainder;
+}
+
+
+uint32_t __udivsi3 (uint32_t dividend, uint32_t divisor);
+#ifdef L_tile_udivsi3
+uint32_t
+__udivsi3 (uint32_t dividend, uint32_t divisor)
+{
+#ifndef __tilegx__
+ return __udivsi3_inline (dividend, divisor);
+#else /* !__tilegx__ */
+ uint64_t n = __udivdi3_inline (((uint64_t) dividend), ((uint64_t) divisor));
+ return (uint32_t) n;
+#endif /* !__tilegx__ */
+}
+#endif
+
+#define ABS(x) ((x) >= 0 ? (x) : -(x))
+
+int32_t __divsi3 (int32_t dividend, int32_t divisor);
+#ifdef L_tile_divsi3
+/* __divsi3 - 32 bit integer signed divide */
+int32_t
+__divsi3 (int32_t dividend, int32_t divisor)
+{
+#ifndef __tilegx__
+ uint32_t n = __udivsi3_inline (ABS (dividend), ABS (divisor));
+#else /* !__tilegx__ */
+ uint64_t n =
+ __udivdi3_inline (ABS ((int64_t) dividend), ABS ((int64_t) divisor));
+#endif /* !__tilegx__ */
+ if ((dividend ^ divisor) < 0)
+ n = -n;
+ return (int32_t) n;
+}
+#endif
+
+
+uint64_t __udivdi3 (uint64_t dividend, uint64_t divisor);
+#ifdef L_tile_udivdi3
+uint64_t
+__udivdi3 (uint64_t dividend, uint64_t divisor)
+{
+ return __udivdi3_inline (dividend, divisor);
+}
+#endif
+
+/* __divdi3 - 64 bit integer signed divide */
+int64_t __divdi3 (int64_t dividend, int64_t divisor);
+#ifdef L_tile_divdi3
+int64_t
+__divdi3 (int64_t dividend, int64_t divisor)
+{
+ uint64_t n = __udivdi3_inline (ABS (dividend), ABS (divisor));
+ if ((dividend ^ divisor) < 0)
+ n = -n;
+ return (int64_t) n;
+}
+#endif
+
+
+uint32_t __umodsi3 (uint32_t dividend, uint32_t divisor);
+#ifdef L_tile_umodsi3
+uint32_t
+__umodsi3 (uint32_t dividend, uint32_t divisor)
+{
+#ifndef __tilegx__
+ return __umodsi3_inline (dividend, divisor);
+#else /* !__tilegx__ */
+ return __umoddi3_inline ((uint64_t) dividend, (uint64_t) divisor);
+#endif /* !__tilegx__ */
+}
+#endif
+
+
+/* __modsi3 - 32 bit integer signed modulo */
+int32_t __modsi3 (int32_t dividend, int32_t divisor);
+#ifdef L_tile_modsi3
+int32_t
+__modsi3 (int32_t dividend, int32_t divisor)
+{
+#ifndef __tilegx__
+ uint32_t remainder = __umodsi3_inline (ABS (dividend), ABS (divisor));
+#else /* !__tilegx__ */
+ uint64_t remainder =
+ __umoddi3_inline (ABS ((int64_t) dividend), ABS ((int64_t) divisor));
+#endif /* !__tilegx__ */
+ return (int32_t) ((dividend >= 0) ? remainder : -remainder);
+}
+#endif
+
+
+uint64_t __umoddi3 (uint64_t dividend, uint64_t divisor);
+#ifdef L_tile_umoddi3
+uint64_t
+__umoddi3 (uint64_t dividend, uint64_t divisor)
+{
+ return __umoddi3_inline (dividend, divisor);
+}
+#endif
+
+
+/* __moddi3 - 64 bit integer signed modulo */
+int64_t __moddi3 (int64_t dividend, int64_t divisor);
+#ifdef L_tile_moddi3
+int64_t
+__moddi3 (int64_t dividend, int64_t divisor)
+{
+ uint64_t remainder = __umoddi3_inline (ABS (dividend), ABS (divisor));
+ return (int64_t) ((dividend >= 0) ? remainder : -remainder);
+}
+#endif
diff --git a/gcc-4.9/libgcc/config/tilepro/softmpy.S b/gcc-4.9/libgcc/config/tilepro/softmpy.S
new file mode 100644
index 000000000..4922dc764
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/softmpy.S
@@ -0,0 +1,94 @@
+/* 64-bit multiplication support for TILEPro.
+ Copyright (C) 2011-2014 Free Software Foundation, Inc.
+ Contributed by Walter Lee (walt@tilera.com)
+
+ This file is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 3, or (at your option) any
+ later version.
+
+ This file is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* 64-bit multiplication support. */
+
+ .file "softmpy.S"
+
+/* Parameters */
+#define lo0 r9 /* low 32 bits of n0 */
+#define hi0 r1 /* high 32 bits of n0 */
+#define lo1 r2 /* low 32 bits of n1 */
+#define hi1 r3 /* high 32 bits of n1 */
+
+/* temps */
+#define result1_a r4
+#define result1_b r5
+
+#define tmp0 r6
+#define tmp0_left_16 r7
+#define tmp1 r8
+
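+/* Roughly, with n0 = hi0:lo0 and n1 = hi1:lo1, the low 64 bits of the
+ product are lo0*lo1 + ((lo0*hi1 + hi0*lo1) << 32). TILEPro multiplies
+ operate on 16-bit halves, so each 32x32 piece is itself assembled from
+ the mulll/mulhl/mulhh partial products accumulated below. */
+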
+ .section .text.__muldi3, "ax"
+ .align 8
+ .globl __muldi3
+ .type __muldi3, @function
+__muldi3:
+ {
+ move lo0, r0 /* so we can write "out r0" while "in r0" alive */
+ mulhl_uu tmp0, lo1, r0
+ }
+ {
+ mulll_uu result1_a, lo1, hi0
+ }
+ {
+ move tmp1, tmp0
+ mulhla_uu tmp0, lo0, lo1
+ }
+ {
+ mulhlsa_uu result1_a, lo1, hi0
+ }
+ {
+ mulll_uu result1_b, lo0, hi1
+ slt_u tmp1, tmp0, tmp1
+ }
+ {
+ mulhlsa_uu result1_a, lo0, hi1
+ shli r0, tmp0, 16
+ }
+ {
+ move tmp0_left_16, r0
+ mulhha_uu result1_b, lo0, lo1
+ }
+ {
+ mullla_uu r0, lo1, lo0
+ shli tmp1, tmp1, 16
+ }
+ {
+ mulhlsa_uu result1_b, hi0, lo1
+ inthh tmp1, tmp1, tmp0
+ }
+ {
+ mulhlsa_uu result1_a, hi1, lo0
+ slt_u tmp0, r0, tmp0_left_16
+ }
+ /* NOTE: this will stall for a cycle here. Oh well. */
+ {
+ add r1, tmp0, tmp1
+ add result1_a, result1_a, result1_b
+ }
+ {
+ add r1, r1, result1_a
+ jrp lr
+ }
+ .size __muldi3,.-__muldi3
diff --git a/gcc-4.9/libgcc/config/tilepro/t-crtstuff b/gcc-4.9/libgcc/config/tilepro/t-crtstuff
new file mode 100644
index 000000000..eddc45ce9
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/t-crtstuff
@@ -0,0 +1,4 @@
+# crtend*.o must be compiled with -fno-asynchronous-unwind-tables;
+# otherwise __FRAME_END__ might not be the last thing in the .eh_frame
+# section.
+CRTSTUFF_T_CFLAGS += -fno-asynchronous-unwind-tables
diff --git a/gcc-4.9/libgcc/config/tilepro/t-tilepro b/gcc-4.9/libgcc/config/tilepro/t-tilepro
new file mode 100644
index 000000000..eb6894ce1
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/t-tilepro
@@ -0,0 +1,33 @@
+LIB2ADD += \
+ $(srcdir)/config/tilepro/softmpy.S \
+ $(srcdir)/config/tilepro/atomic.c
+
+LIB2FUNCS_EXCLUDE += \
+ _divdi3 \
+ _moddi3 \
+ _muldi3 \
+ _udivdi3 \
+ _umoddi3
+
+SOFTDIVIDE_FUNCS := \
+ _tile_udivsi3 \
+ _tile_divsi3 \
+ _tile_udivdi3 \
+ _tile_divdi3 \
+ _tile_umodsi3 \
+ _tile_modsi3 \
+ _tile_umoddi3 \
+ _tile_moddi3
+
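+# Each routine is built from softdivide.c into its own object by passing
+# -DL<name> on the command line, matching the #ifdef L<name> guards in
+# that file, so programs pull in only the routines they actually use.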
+softdivide-o = $(patsubst %,%$(objext),$(SOFTDIVIDE_FUNCS))
+$(softdivide-o): %$(objext): $(srcdir)/config/tilepro/softdivide.c
+ $(gcc_compile) -ffunction-sections -DMAYBE_STATIC= -DL$* -c $< \
+ $(vis_hide)
+libgcc-objects += $(softdivide-o)
+
+ifeq ($(enable_shared),yes)
+softdivide-s-o = $(patsubst %,%_s$(objext),$(SOFTDIVIDE_FUNCS))
+$(softdivide-s-o): %_s$(objext): $(srcdir)/config/tilepro/softdivide.c
+ $(gcc_s_compile) -ffunction-sections -DMAYBE_STATIC= -DL$* -c $<
+libgcc-s-objects += $(softdivide-s-o)
+endif