Diffstat (limited to 'gcc-4.9/libgcc/config/tilepro/atomic.h')
-rw-r--r-- | gcc-4.9/libgcc/config/tilepro/atomic.h | 435
1 file changed, 435 insertions, 0 deletions
diff --git a/gcc-4.9/libgcc/config/tilepro/atomic.h b/gcc-4.9/libgcc/config/tilepro/atomic.h
new file mode 100644
index 000000000..404e15ee2
--- /dev/null
+++ b/gcc-4.9/libgcc/config/tilepro/atomic.h
@@ -0,0 +1,435 @@
+/* Macros for atomic functionality for tile.
+   Copyright (C) 2011-2014 Free Software Foundation, Inc.
+   Contributed by Walter Lee (walt@tilera.com)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+
+/* Provides macros for common atomic functionality.  */
+
+#ifndef _ATOMIC_H_
+#define _ATOMIC_H_
+
+#ifdef __tilegx__
+/* Atomic instruction macros
+
+   The macros provided by atomic.h simplify access to the TILE-Gx
+   architecture's atomic instructions.  The architecture provides a
+   variety of atomic instructions, including "exchange", "compare and
+   exchange", "fetch and ADD", "fetch and AND", "fetch and OR", and
+   "fetch and ADD if greater than or equal to zero".
+
+   No barrier or fence semantics are implied by any of the atomic
+   instructions for manipulating memory; you must specify the barriers
+   that you wish explicitly, using the provided macros.
+
+   Any integral 32- or 64-bit value can be used as the argument
+   to these macros, such as "int", "long long", "unsigned long", etc.
+   The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
+   The "exchange" and "compare and exchange" macros may also take
+   pointer values.  We use the pseudo-type "VAL" in the documentation
+   to indicate the use of an appropriate type.  */
+#else
+/* Atomic instruction macros
+
+   The macros provided by atomic.h simplify access to the Tile
+   architecture's atomic instructions.  Since the architecture
+   supports test-and-set as its only in-silicon atomic operation, many
+   of the operations provided by this header are implemented as
+   fast-path calls to Linux emulation routines.
+
+   Using the kernel for atomic operations allows userspace to take
+   advantage of the kernel's existing atomic-integer support (managed
+   by a distributed array of locks).  The kernel provides proper
+   ordering among simultaneous atomic operations on different cores,
+   and guarantees a process can not be context-switched part way
+   through an atomic operation.  By virtue of sharing the kernel
+   atomic implementation, the userspace atomic operations
+   are compatible with the atomic methods provided by the kernel's
+   futex() syscall API.  Note that these operations never cause Linux
+   kernel scheduling, and are in fact invisible to the kernel; they
+   simply act as regular function calls but with an elevated privilege
+   level.  Note that the kernel's distributed lock array is hashed by
+   using only VA bits from the atomic value's address (to avoid the
+   performance hit of page table locking and multiple page-table
+   lookups to get the PA) and only the VA bits that are below page
+   granularity (to properly lock simultaneous accesses to the same
+   page mapped at different VAs).  As a result, simultaneous atomic
+   operations on values whose addresses are at the same offset on a
+   page will contend in the kernel for the same lock array element.
+
+   No barrier or fence semantics are implied by any of the atomic
+   instructions for manipulating memory; you must specify the barriers
+   that you wish explicitly, using the provided macros.
+
+   Any integral 32- or 64-bit value can be used as the argument
+   to these macros, such as "int", "long long", "unsigned long", etc.
+   The pointers must be aligned to 4 or 8 bytes for 32- or 64-bit data.
+   The "exchange" and "compare and exchange" macros may also take
+   pointer values.  We use the pseudo-type "VAL" in the documentation
+   to indicate the use of an appropriate type.
+
+   The 32-bit routines are implemented using a single kernel fast
+   syscall, as is the 64-bit compare-and-exchange.  The other 64-bit
+   routines are implemented by looping over the 64-bit
+   compare-and-exchange routine, so may be potentially less efficient.  */
+#endif
+
+#ifdef __tilegx__
+#include <arch/spr_def.h>
+#else
+#include <asm/unistd.h>
+#endif
+
+
+/* 32-bit integer compare-and-exchange.  */
+static __inline __attribute__ ((always_inline))
+  int arch_atomic_val_compare_and_exchange_4 (volatile int *mem,
+                                              int oldval, int newval)
+{
+#ifdef __tilegx__
+  __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
+  return __insn_cmpexch4 (mem, newval);
+#else
+  int result;
+  __asm__ __volatile__ ("swint1":"=R00" (result),
+                        "=m" (*mem):"R10" (__NR_FAST_cmpxchg), "R00" (mem),
+                        "R01" (oldval), "R02" (newval), "m" (*mem):"r20",
+                        "r21", "r22", "r23", "r24", "r25", "r26", "r27",
+                        "r28", "r29", "memory");
+  return result;
+#endif
+}
+
+/* 64-bit integer compare-and-exchange.  */
+static __inline __attribute__ ((always_inline))
+  long long arch_atomic_val_compare_and_exchange_8 (volatile long long
+                                                    *mem, long long oldval,
+                                                    long long newval)
+{
+#ifdef __tilegx__
+  __insn_mtspr (SPR_CMPEXCH_VALUE, oldval);
+  return __insn_cmpexch (mem, newval);
+#else
+  unsigned int result_lo, result_hi;
+  unsigned int oldval_lo = oldval & 0xffffffffu, oldval_hi = oldval >> 32;
+  unsigned int newval_lo = newval & 0xffffffffu, newval_hi = newval >> 32;
+  __asm__ __volatile__ ("swint1":"=R00" (result_lo), "=R01" (result_hi),
+                        "=m" (*mem):"R10" (__NR_FAST_cmpxchg64), "R00" (mem),
+                        "R02" (oldval_lo), "R03" (oldval_hi),
+                        "R04" (newval_lo), "R05" (newval_hi),
+                        "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
+                        "r26", "r27", "r28", "r29", "memory");
+  return ((long long) result_hi) << 32 | result_lo;
+#endif
+}
+
+/* This non-existent symbol is called for sizes other than "4" and "8",
+   indicating a bug in the caller.  */
+extern int __arch_atomic_error_bad_argument_size (void)
+  __attribute__ ((warning ("sizeof atomic argument not 4 or 8")));
+
+
+#define arch_atomic_val_compare_and_exchange(mem, o, n)                  \
+  __extension__ ({                                                       \
+    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                          \
+      ((sizeof(*(mem)) == 8) ?                                           \
+       arch_atomic_val_compare_and_exchange_8(                           \
+         (volatile long long*)(mem), (__typeof((o)-(o)))(o),             \
+         (__typeof((n)-(n)))(n)) :                                       \
+       (sizeof(*(mem)) == 4) ?                                           \
+       arch_atomic_val_compare_and_exchange_4(                           \
+         (volatile int*)(mem), (__typeof((o)-(o)))(o),                   \
+         (__typeof((n)-(n)))(n)) :                                       \
+       __arch_atomic_error_bad_argument_size());                         \
+  })
+
+#define arch_atomic_bool_compare_and_exchange(mem, o, n)                 \
+  __extension__ ({                                                       \
+    __typeof(o) __o = (o);                                               \
+    __builtin_expect(                                                    \
+      __o == arch_atomic_val_compare_and_exchange((mem), __o, (n)), 1);  \
+  })
+
+
+/* Loop with compare_and_exchange until we guess the correct value.
+   Normally "expr" will be an expression using __old and __value.  */
+#define __arch_atomic_update_cmpxchg(mem, value, expr)                   \
+  __extension__ ({                                                       \
+    __typeof(value) __value = (value);                                   \
+    __typeof(*(mem)) *__mem = (mem), __old = *__mem, __guess;            \
+    do {                                                                 \
+      __guess = __old;                                                   \
+      __old = arch_atomic_val_compare_and_exchange(__mem, __old, (expr)); \
+    } while (__builtin_expect(__old != __guess, 0));                     \
+    __old;                                                               \
+  })
+
+#ifdef __tilegx__
+
+/* Generic atomic op with 8- or 4-byte variant.
+   The _mask, _addend, and _expr arguments are ignored on tilegx.  */
+#define __arch_atomic_update(mem, value, op, _mask, _addend, _expr)      \
+  __extension__ ({                                                       \
+    ((__typeof(*(mem)))                                                  \
+     ((sizeof(*(mem)) == 8) ? (__typeof(*(mem)-*(mem)))__insn_##op(      \
+        (volatile void *)(mem),                                          \
+        (long long)(__typeof((value)-(value)))(value)) :                 \
+      (sizeof(*(mem)) == 4) ? (int)__insn_##op##4(                       \
+        (volatile void *)(mem),                                          \
+        (int)(__typeof((value)-(value)))(value)) :                       \
+      __arch_atomic_error_bad_argument_size()));                         \
+  })
+
+#else
+
+/* This uses TILEPro's fast syscall support to atomically compute:
+
+     int old = *ptr;
+     *ptr = (old & mask) + addend;
+     return old;
+
+   This primitive can be used for atomic exchange, add, or, and.
+   Only 32-bit support is provided.  */
+static __inline __attribute__ ((always_inline))
+  int
+  __arch_atomic_update_4 (volatile int *mem, int mask, int addend)
+{
+  int result;
+  __asm__ __volatile__ ("swint1":"=R00" (result),
+                        "=m" (*mem):"R10" (__NR_FAST_atomic_update),
+                        "R00" (mem), "R01" (mask), "R02" (addend),
+                        "m" (*mem):"r20", "r21", "r22", "r23", "r24", "r25",
+                        "r26", "r27", "r28", "r29", "memory");
+  return result;
+}
+
+/* Generic atomic op with 8- or 4-byte variant.
+   The _op argument is ignored on tilepro.  */
+#define __arch_atomic_update(mem, value, _op, mask, addend, expr)        \
+  __extension__ ({                                                       \
+    (__typeof(*(mem)))(__typeof(*(mem)-*(mem)))                          \
+      ((sizeof(*(mem)) == 8) ?                                           \
+       __arch_atomic_update_cmpxchg((mem), (value), (expr)) :            \
+       (sizeof(*(mem)) == 4) ?                                           \
+       __arch_atomic_update_4((volatile int*)(mem),                      \
+                              (__typeof((mask)-(mask)))(mask),           \
+                              (__typeof((addend)-(addend)))(addend)) :   \
+       __arch_atomic_error_bad_argument_size());                         \
+  })
+
+#endif /* __tilegx__ */
+
+
+#define arch_atomic_exchange(mem, newvalue) \
+  __arch_atomic_update(mem, newvalue, exch, 0, newvalue, __value)
+
+#define arch_atomic_add(mem, value) \
+  __arch_atomic_update(mem, value, fetchadd, -1, value, __old + __value)
+
+#define arch_atomic_sub(mem, value) arch_atomic_add((mem), -(value))
+
+#define arch_atomic_increment(mem) arch_atomic_add((mem), 1)
+
+#define arch_atomic_decrement(mem) arch_atomic_add((mem), -1)
+
+#define arch_atomic_and(mem, mask) \
+  __arch_atomic_update(mem, mask, fetchand, mask, 0, __old & __value)
+
+#define arch_atomic_or(mem, mask) \
+  __arch_atomic_update(mem, mask, fetchor, ~mask, mask, __old | __value)
+
+#define arch_atomic_xor(mem, mask) \
+  __arch_atomic_update_cmpxchg(mem, mask, __old ^ __value)
+
+#define arch_atomic_nand(mem, mask) \
+  __arch_atomic_update_cmpxchg(mem, mask, ~(__old & __value))
+
+#define arch_atomic_bit_set(mem, bit)                                    \
+  __extension__ ({                                                       \
+    __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit);              \
+    __mask & arch_atomic_or((mem), __mask);                              \
+  })
+
+#define arch_atomic_bit_clear(mem, bit)                                  \
+  __extension__ ({                                                       \
+    __typeof(*(mem)) __mask = (__typeof(*(mem)))1 << (bit);              \
+    __mask & arch_atomic_and((mem), ~__mask);                            \
+  })
+
+#ifdef __tilegx__
+/* Atomically store a new value to memory.
+   Note that you can freely use types of any size here, unlike the
+   other atomic routines, which require 32- or 64-bit types.
+   This accessor is provided for compatibility with TILEPro, which
+   required an explicit atomic operation for stores that needed
+   to be atomic with respect to other atomic methods in this header.  */
+#define arch_atomic_write(mem, value) ((void) (*(mem) = (value)))
+#else
+#define arch_atomic_write(mem, value)                                    \
+  do {                                                                   \
+    __typeof(mem) __aw_mem = (mem);                                      \
+    __typeof(value) __aw_val = (value);                                  \
+    unsigned int *__aw_mem32, __aw_intval, __aw_val32, __aw_off, __aw_mask; \
+    __aw_intval = (__typeof((value) - (value)))__aw_val;                 \
+    switch (sizeof(*__aw_mem)) {                                         \
+    case 8:                                                              \
+      __arch_atomic_update_cmpxchg(__aw_mem, __aw_val, __value);         \
+      break;                                                             \
+    case 4:                                                              \
+      __arch_atomic_update_4((int *)__aw_mem, 0, __aw_intval);           \
+      break;                                                             \
+    case 2:                                                              \
+      __aw_off = 8 * ((long)__aw_mem & 0x2);                             \
+      __aw_mask = 0xffffU << __aw_off;                                   \
+      __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x2);              \
+      __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;                \
+      __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32,               \
+                                   (__old & ~__aw_mask) | __value);      \
+      break;                                                             \
+    case 1:                                                              \
+      __aw_off = 8 * ((long)__aw_mem & 0x3);                             \
+      __aw_mask = 0xffU << __aw_off;                                     \
+      __aw_mem32 = (unsigned int *)((long)__aw_mem & ~0x3);              \
+      __aw_val32 = (__aw_intval << __aw_off) & __aw_mask;                \
+      __arch_atomic_update_cmpxchg(__aw_mem32, __aw_val32,               \
+                                   (__old & ~__aw_mask) | __value);      \
+      break;                                                             \
+    }                                                                    \
+  } while (0)
+#endif
+
+/* Compiler barrier.
+
+   This macro prevents loads or stores from being moved by the compiler
+   across the macro.  Any loaded value that was loaded before this
+   macro must then be reloaded by the compiler.  */
+#define arch_atomic_compiler_barrier() __asm__ __volatile__("" ::: "memory")
+
+/* Full memory barrier.
+
+   This macro has the semantics of arch_atomic_compiler_barrier(), but also
+   ensures that previous stores are visible to other cores, and that
+   all previous loaded values have been placed into their target
+   register on this core.  */
+#define arch_atomic_full_barrier() __insn_mf()
+
+/* Read memory barrier.
+
+   Ensure that all reads by this processor that occurred prior to the
+   read memory barrier have completed, and that no reads that occur
+   after the read memory barrier on this processor are initiated
+   before the barrier.
+
+   On current TILE chips a read barrier is implemented as a full barrier,
+   but this may not be true in later versions of the architecture.
+
+   See also arch_atomic_acquire_barrier() for the appropriate idiom to use
+   to ensure no reads are lifted above an atomic lock instruction.  */
+#define arch_atomic_read_barrier() arch_atomic_full_barrier()
+
+/* Write memory barrier.
+
+   Ensure that all writes by this processor that occurred prior to the
+   write memory barrier have completed, and that no writes that occur
+   after the write memory barrier on this processor are initiated
+   before the barrier.
+
+   On current TILE chips a write barrier is implemented as a full barrier,
+   but this may not be true in later versions of the architecture.
+
+   See also arch_atomic_release_barrier() for the appropriate idiom to use
+   to ensure all writes are complete prior to an atomic unlock instruction.  */
+#define arch_atomic_write_barrier() arch_atomic_full_barrier()
+
+/* Lock acquisition barrier.
+
+   Ensure that no load operations that follow this macro in the
+   program can issue prior to the barrier.  Without such a barrier,
+   the compiler can reorder them to issue earlier, or the hardware can
+   issue them speculatively.  The latter is not currently done in the
+   Tile microarchitecture, but using this operation improves
+   portability to future implementations.
+
+   This operation is intended to be used as part of the "acquire"
+   path for locking, that is, when entering a critical section.
+   This should be done after the atomic operation that actually
+   acquires the lock, and in conjunction with a "control dependency"
+   that checks the atomic operation result to see if the lock was
+   in fact acquired.  See the arch_atomic_read_barrier() macro
+   for a heavier-weight barrier to use in certain unusual constructs,
+   or arch_atomic_acquire_barrier_value() if no control dependency exists.  */
+#define arch_atomic_acquire_barrier() arch_atomic_compiler_barrier()
+
+/* Lock release barrier.
+
+   Ensure that no store operations that precede this macro in the
+   program complete subsequent to the barrier.  Without such a
+   barrier, the compiler can reorder stores to issue later, or stores
+   can be still outstanding in the memory network.
+
+   This operation is intended to be used as part of the "release" path
+   for locking, that is, when leaving a critical section.  This should
+   be done before the operation (such as a store of zero) that
+   actually releases the lock.  */
+#define arch_atomic_release_barrier() arch_atomic_write_barrier()
+
+/* Barrier until the read of a particular value is complete.
+
+   This is occasionally useful when constructing certain locking
+   scenarios.  For example, you might write a routine that issues an
+   atomic instruction to enter a critical section, then reads one or
+   more values within the critical section without checking to see if
+   the critical section was in fact acquired, and only later checks
+   the atomic instruction result to see if the lock was acquired.  If
+   so the routine could properly release the lock and know that the
+   values that were read were valid.
+
+   In this scenario, it is required to wait for the result of the
+   atomic instruction, even if the value itself is not checked.  This
+   guarantees that if the atomic instruction succeeded in taking the lock,
+   the lock was held before any reads in the critical section issued.  */
+#define arch_atomic_acquire_barrier_value(val) \
+  __asm__ __volatile__("move %0, %0" :: "r"(val))
+
+/* Access the given variable in memory exactly once.
+
+   In some contexts, an algorithm may need to force access to memory,
+   since otherwise the compiler may think it can optimize away a
+   memory load or store; for example, in a loop when polling memory to
+   see if another cpu has updated it yet.  Generally this is only
+   required for certain very carefully hand-tuned algorithms; using it
+   unnecessarily may result in performance losses.
+
+   A related use of this macro is to ensure that the compiler does not
+   rematerialize the value of "x" by reloading it from memory
+   unexpectedly; the "volatile" marking will prevent the compiler from
+   being able to rematerialize.  This is helpful if an algorithm needs
+   to read a variable without locking, but needs it to have the same
+   value if it ends up being used several times within the algorithm.
+
+   Note that multiple uses of this macro are guaranteed to be ordered,
+   i.e. the compiler will not reorder stores or loads that are wrapped
+   in arch_atomic_access_once().  */
+#define arch_atomic_access_once(x) (*(volatile __typeof(x) *)&(x))
+
+
+
+#endif /* !_ATOMIC_H_ */
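
The comment above __arch_atomic_update_cmpxchg describes the guess-and-retry pattern used to build operations that have no direct hardware or fast-syscall equivalent. The sketch below is illustrative only and is not part of the committed file: atomic_max_4 is a hypothetical helper, and it assumes a tile target with the new header reachable as "atomic.h". It applies the same pattern to implement an atomic 32-bit maximum on top of arch_atomic_val_compare_and_exchange().

    #include "atomic.h"

    /* Atomically set *mem to max(*mem, val); return the value observed
       before the update, following the same loop-until-the-guess-matches
       idiom as __arch_atomic_update_cmpxchg in the header.  */
    static inline int
    atomic_max_4 (volatile int *mem, int val)
    {
      int old = *mem;
      int guess, desired;
      do
        {
          guess = old;
          desired = (val > guess) ? val : guess;
          old = arch_atomic_val_compare_and_exchange (mem, guess, desired);
        }
      while (old != guess);   /* another core updated *mem; retry */
      return old;
    }

The return value is the pre-update contents, matching the "fetch" flavor of the header's other update macros.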
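The barrier comments spell out an acquire/release locking idiom without showing it end to end. A minimal sketch under the same assumptions (tile target, header included as "atomic.h"); the example_spinlock_t type and the two functions are invented for illustration and are not part of the commit.

    #include "atomic.h"

    /* Test-and-set spinlock built from the macros above.  */
    typedef struct { volatile int locked; } example_spinlock_t;

    static inline void
    example_spin_lock (example_spinlock_t *lock)
    {
      /* arch_atomic_exchange returns the previous value, so 0 means we
         took the lock.  While waiting, poll with arch_atomic_access_once
         so the compiler performs a real load on every iteration.  */
      while (arch_atomic_exchange (&lock->locked, 1) != 0)
        while (arch_atomic_access_once (lock->locked) != 0)
          ;
      /* The control dependency on the exchange result plus the acquire
         barrier keeps later loads inside the critical section.  */
      arch_atomic_acquire_barrier ();
    }

    static inline void
    example_spin_unlock (example_spinlock_t *lock)
    {
      /* Make prior stores visible before the store that releases the lock.  */
      arch_atomic_release_barrier ();
      arch_atomic_write (&lock->locked, 0);
    }

Because the while condition checks the exchange result, the lighter arch_atomic_acquire_barrier() suffices here; per the header's comments, arch_atomic_acquire_barrier_value() would be needed if no such control dependency existed.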