Diffstat (limited to 'gcc-4.8/libgcc/config')
-rw-r--r--  gcc-4.8/libgcc/config/arm/sfp-machine.h          10
-rw-r--r--  gcc-4.8/libgcc/config/i386/cygming-crtbegin.c    10
-rw-r--r--  gcc-4.8/libgcc/config/i386/sfp-exceptions.c      34
-rw-r--r--  gcc-4.8/libgcc/config/libbid/ChangeLog            4
-rw-r--r--  gcc-4.8/libgcc/config/rs6000/ibm-ldouble.c       11
-rw-r--r--  gcc-4.8/libgcc/config/rs6000/linux-unwind.h      51
-rw-r--r--  gcc-4.8/libgcc/config/rs6000/tramp.S             66
-rw-r--r--  gcc-4.8/libgcc/config/tilepro/atomic.c          254
8 files changed, 374 insertions(+), 66 deletions(-)
diff --git a/gcc-4.8/libgcc/config/arm/sfp-machine.h b/gcc-4.8/libgcc/config/arm/sfp-machine.h
index a89d05a00..f29b23eeb 100644
--- a/gcc-4.8/libgcc/config/arm/sfp-machine.h
+++ b/gcc-4.8/libgcc/config/arm/sfp-machine.h
@@ -19,10 +19,12 @@ typedef int __gcc_CMPtype __attribute__ ((mode (__libgcc_cmp_return__)));
#define _FP_DIV_MEAT_D(R,X,Y) _FP_DIV_MEAT_2_udiv(D,R,X,Y)
#define _FP_DIV_MEAT_Q(R,X,Y) _FP_DIV_MEAT_4_udiv(Q,R,X,Y)
-#define _FP_NANFRAC_H ((_FP_QNANBIT_H << 1) - 1)
-#define _FP_NANFRAC_S ((_FP_QNANBIT_S << 1) - 1)
-#define _FP_NANFRAC_D ((_FP_QNANBIT_D << 1) - 1), -1
-#define _FP_NANFRAC_Q ((_FP_QNANBIT_Q << 1) - 1), -1, -1, -1
+/* According to the RTABI, a QNaN has only the most significant bit of the
+   significand set; all other significand bits are zero.  */
+#define _FP_NANFRAC_H _FP_QNANBIT_H
+#define _FP_NANFRAC_S _FP_QNANBIT_S
+#define _FP_NANFRAC_D _FP_QNANBIT_D, 0
+#define _FP_NANFRAC_Q _FP_QNANBIT_Q, 0, 0, 0
#define _FP_NANSIGN_H 0
#define _FP_NANSIGN_S 0
#define _FP_NANSIGN_D 0
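
For illustration only (not part of the patch): with the sign bit clear and only the
top significand bit set, the default NaN patterns these macros now produce are the
usual IEEE 754 quiet NaNs, 0x7FC00000 for single and 0x7FF8000000000000 for double.
A quick host-side check:

    #include <stdint.h>
    #include <stdio.h>

    int main (void)
    {
      /* Only the most significant significand bit set, per the RTABI rule.  */
      uint32_t qnan_s = (0xFFu << 23) | (1u << 22);
      uint64_t qnan_d = (0x7FFull << 52) | (1ull << 51);
      printf ("single QNaN: %#010x\n", qnan_s);
      printf ("double QNaN: %#018llx\n", (unsigned long long) qnan_d);
      return 0;
    }
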
diff --git a/gcc-4.8/libgcc/config/i386/cygming-crtbegin.c b/gcc-4.8/libgcc/config/i386/cygming-crtbegin.c
index 6a513cb3f..210bdb494 100644
--- a/gcc-4.8/libgcc/config/i386/cygming-crtbegin.c
+++ b/gcc-4.8/libgcc/config/i386/cygming-crtbegin.c
@@ -69,6 +69,9 @@ static EH_FRAME_SECTION_CONST char __EH_FRAME_BEGIN__[]
= { };
static struct object obj;
+
+/* Handle of libgcc's DLL reference. */
+HANDLE hmod_libgcc;
#endif
#if TARGET_USE_JCR_SECTION
@@ -93,9 +96,14 @@ __gcc_register_frame (void)
void (*register_frame_fn) (const void *, struct object *);
HANDLE h = GetModuleHandle (LIBGCC_SONAME);
+
if (h)
+ {
+ /* Increase the load count of the LIBGCC_SONAME DLL. */
+ hmod_libgcc = LoadLibrary (LIBGCC_SONAME);
register_frame_fn = (void (*) (const void *, struct object *))
GetProcAddress (h, "__register_frame_info");
+ }
else
register_frame_fn = __register_frame_info;
if (register_frame_fn)
@@ -132,5 +140,7 @@ __gcc_deregister_frame (void)
deregister_frame_fn = __deregister_frame_info;
if (deregister_frame_fn)
deregister_frame_fn (__EH_FRAME_BEGIN__);
+ if (hmod_libgcc)
+ FreeLibrary (hmod_libgcc);
#endif
}
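
The pattern above, sketched in isolation: LoadLibrary bumps the DLL's load count so
the library cannot be unloaded while its frame data is still registered, and
FreeLibrary drops that reference at deregistration time. The DLL name and export
below are placeholders, not libgcc's:

    #include <stdio.h>
    #include <windows.h>

    static HMODULE pinned_dll;

    static void pin_and_resolve (void)
    {
      HMODULE h = GetModuleHandleA ("example.dll");
      if (h)
        {
          /* The extra reference keeps the DLL mapped until we drop it.  */
          pinned_dll = LoadLibraryA ("example.dll");
          FARPROC fn = GetProcAddress (h, "example_export");
          if (fn)
            ((void (*) (void)) fn) ();
        }
    }

    static void unpin (void)
    {
      if (pinned_dll)
        FreeLibrary (pinned_dll);   /* balances the LoadLibraryA above */
    }

    int main (void)
    {
      pin_and_resolve ();
      unpin ();
      return 0;
    }
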
diff --git a/gcc-4.8/libgcc/config/i386/sfp-exceptions.c b/gcc-4.8/libgcc/config/i386/sfp-exceptions.c
index fbaaab22f..1bab10b0b 100644
--- a/gcc-4.8/libgcc/config/i386/sfp-exceptions.c
+++ b/gcc-4.8/libgcc/config/i386/sfp-exceptions.c
@@ -47,21 +47,33 @@ __sfp_handle_exceptions (int _fex)
if (_fex & FP_EX_INVALID)
{
float f = 0.0f;
-#ifdef __x86_64__
+#ifdef __SSE_MATH__
+ volatile float r __attribute__ ((unused));
asm volatile ("%vdivss\t{%0, %d0|%d0, %0}" : "+x" (f));
+ r = f; /* Needed to trigger exception. */
#else
asm volatile ("fdiv\t{%y0, %0|%0, %y0}" : "+t" (f));
- asm volatile ("fwait");
+ /* No need for fwait; the exception is triggered by the emitted fstp. */
#endif
}
+ if (_fex & FP_EX_DENORM)
+ {
+ struct fenv temp;
+ asm volatile ("fnstenv\t%0" : "=m" (temp));
+ temp.__status_word |= FP_EX_DENORM;
+ asm volatile ("fldenv\t%0" : : "m" (temp));
+ asm volatile ("fwait");
+ }
if (_fex & FP_EX_DIVZERO)
{
float f = 1.0f, g = 0.0f;
-#ifdef __x86_64__
+#ifdef __SSE_MATH__
+ volatile float r __attribute__ ((unused));
asm volatile ("%vdivss\t{%1, %d0|%d0, %1}" : "+x" (f) : "xm" (g));
+ r = f; /* Needed to trigger exception. */
#else
asm volatile ("fdivs\t%1" : "+t" (f) : "m" (g));
- asm volatile ("fwait");
+ /* No need for fwait; the exception is triggered by the emitted fstp. */
#endif
}
if (_fex & FP_EX_OVERFLOW)
@@ -82,11 +94,15 @@ __sfp_handle_exceptions (int _fex)
}
if (_fex & FP_EX_INEXACT)
{
- struct fenv temp;
- asm volatile ("fnstenv\t%0" : "=m" (temp));
- temp.__status_word |= FP_EX_INEXACT;
- asm volatile ("fldenv\t%0" : : "m" (temp));
- asm volatile ("fwait");
+ float f = 1.0f, g = 3.0f;
+#ifdef __SSE_MATH__
+ volatile float r __attribute__ ((unused));
+ asm volatile ("%vdivss\t{%1, %d0|%d0, %1}" : "+x" (f) : "xm" (g));
+ r = f; /* Needed to trigger exception. */
+#else
+ asm volatile ("fdivs\t%1" : "+t" (f) : "m" (g));
+ /* No need for fwait; the exception is triggered by the emitted fstp. */
+#endif
}
};
#endif
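
A caller-side sketch of what this handler is for: raising a soft-float exception
should become visible through the standard <fenv.h> interface. The FP_EX_INEXACT
value below mirrors the x87/SSE status-word "precision" bit used by i386's
sfp-machine.h, and linking against the libgcc-internal __sfp_handle_exceptions
symbol is assumed; treat both as illustrative.

    #include <fenv.h>
    #include <stdio.h>

    #define FP_EX_INEXACT 0x20   /* x87/SSE precision bit, as in sfp-machine.h */

    extern void __sfp_handle_exceptions (int);

    int main (void)
    {
      feclearexcept (FE_ALL_EXCEPT);
      __sfp_handle_exceptions (FP_EX_INEXACT);
      printf ("FE_INEXACT is %s\n", fetestexcept (FE_INEXACT) ? "set" : "clear");
      return 0;
    }
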
diff --git a/gcc-4.8/libgcc/config/libbid/ChangeLog b/gcc-4.8/libgcc/config/libbid/ChangeLog
index c1f04e845..678fcee0f 100644
--- a/gcc-4.8/libgcc/config/libbid/ChangeLog
+++ b/gcc-4.8/libgcc/config/libbid/ChangeLog
@@ -1,3 +1,7 @@
+2014-05-22 Release Manager
+
+ * GCC 4.8.3 released.
+
2013-10-16 Release Manager
* GCC 4.8.2 released.
diff --git a/gcc-4.8/libgcc/config/rs6000/ibm-ldouble.c b/gcc-4.8/libgcc/config/rs6000/ibm-ldouble.c
index 28e02e995..574e395f8 100644
--- a/gcc-4.8/libgcc/config/rs6000/ibm-ldouble.c
+++ b/gcc-4.8/libgcc/config/rs6000/ibm-ldouble.c
@@ -188,7 +188,16 @@ __gcc_qdiv (double a, double b, double c, double d)
|| nonfinite (t))
return t;
- /* Finite nonzero result requires corrections to the highest order term. */
+ /* Finite nonzero result requires corrections to the highest order
+ term. These corrections require the low part of c * t to be
+ exactly represented in double. */
+ if (fabs (a) <= 0x1p-969)
+ {
+ a *= 0x1p106;
+ b *= 0x1p106;
+ c *= 0x1p106;
+ d *= 0x1p106;
+ }
s = c * t; /* (s,sigma) = c*t exactly. */
w = -(-b + d * t); /* Written to get fnmsub for speed, but not
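
The scaling trick works because multiplying all four operands by the same power of
two leaves the quotient unchanged while moving the intermediate products away from
the range where the low part of c * t would be lost. A small stand-alone check with
made-up operand values:

    #include <stdio.h>

    int main (void)
    {
      double a = 0x1p-1000, c = 3.0;        /* tiny numerator, ordinary divisor */
      double q1 = a / c;
      double q2 = (a * 0x1p106) / (c * 0x1p106);
      printf ("%a\n%a\nequal: %d\n", q1, q2, q1 == q2);
      return 0;
    }
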
diff --git a/gcc-4.8/libgcc/config/rs6000/linux-unwind.h b/gcc-4.8/libgcc/config/rs6000/linux-unwind.h
index c9273c404..a421b1582 100644
--- a/gcc-4.8/libgcc/config/rs6000/linux-unwind.h
+++ b/gcc-4.8/libgcc/config/rs6000/linux-unwind.h
@@ -24,9 +24,19 @@
#define R_LR 65
#define R_CR2 70
+#define R_CR3 71
+#define R_CR4 72
#define R_VR0 77
#define R_VRSAVE 109
+#ifdef __powerpc64__
+#if _CALL_ELF == 2
+#define TOC_SAVE_SLOT 24
+#else
+#define TOC_SAVE_SLOT 40
+#endif
+#endif
+
struct gcc_vregs
{
__attribute__ ((vector_size (16))) int vr[32];
@@ -107,6 +117,8 @@ get_regs (struct _Unwind_Context *context)
}
else if (pc[1] == 0x380000AC)
{
+#if _CALL_ELF != 2
+ /* These old kernel versions never supported ELFv2. */
/* This works for 2.4 kernels, but not for 2.6 kernels with vdso
because pc isn't pointing into the stack. Can be removed when
no one is running 2.4.19 or 2.4.20, the first two ppc64
@@ -121,6 +133,7 @@ get_regs (struct _Unwind_Context *context)
if ((long) frame24->puc != -21 * 8)
return frame24->puc->regs;
else
+#endif
{
/* This works for 2.4.21 and later kernels. */
struct rt_sigframe {
@@ -185,6 +198,7 @@ ppc_fallback_frame_state (struct _Unwind_Context *context,
{
struct gcc_regs *regs = get_regs (context);
struct gcc_vregs *vregs;
+ long cr_offset;
long new_cfa;
int i;
@@ -206,11 +220,21 @@ ppc_fallback_frame_state (struct _Unwind_Context *context,
fs->regs.reg[i].loc.offset = (long) &regs->gpr[i] - new_cfa;
}
+ /* The CR is saved in the low 32 bits of regs->ccr. */
+ cr_offset = (long) &regs->ccr - new_cfa;
+#ifndef __LITTLE_ENDIAN__
+ cr_offset += sizeof (long) - 4;
+#endif
+ /* In the ELFv1 ABI, CR2 stands in for the whole CR. */
fs->regs.reg[R_CR2].how = REG_SAVED_OFFSET;
- /* CR? regs are always 32-bit and PPC is big-endian, so in 64-bit
- libgcc loc.offset needs to point to the low 32 bits of regs->ccr. */
- fs->regs.reg[R_CR2].loc.offset = (long) &regs->ccr - new_cfa
- + sizeof (long) - 4;
+ fs->regs.reg[R_CR2].loc.offset = cr_offset;
+#if _CALL_ELF == 2
+ /* In the ELFv2 ABI, every CR field has a separate CFI entry. */
+ fs->regs.reg[R_CR3].how = REG_SAVED_OFFSET;
+ fs->regs.reg[R_CR3].loc.offset = cr_offset;
+ fs->regs.reg[R_CR4].how = REG_SAVED_OFFSET;
+ fs->regs.reg[R_CR4].loc.offset = cr_offset;
+#endif
fs->regs.reg[R_LR].how = REG_SAVED_OFFSET;
fs->regs.reg[R_LR].loc.offset = (long) &regs->link - new_cfa;
@@ -294,9 +318,13 @@ frob_update_context (struct _Unwind_Context *context, _Unwind_FrameState *fs ATT
figure out if it was saved. The big problem here is that the
code that does the save/restore is generated by the linker, so
we have no good way to determine at compile time what to do. */
- if (pc[0] == 0xF8410028
+ if (pc[0] == 0xF8410000 + TOC_SAVE_SLOT
+#if _CALL_ELF != 2
+ /* The ELFv2 linker never generates the old PLT stub form. */
|| ((pc[0] & 0xFFFF0000) == 0x3D820000
- && pc[1] == 0xF8410028))
+ && pc[1] == 0xF8410000 + TOC_SAVE_SLOT)
+#endif
+ )
{
/* We are in a plt call stub or r2 adjusting long branch stub,
before r2 has been saved. Keep REG_UNSAVED. */
@@ -305,18 +333,21 @@ frob_update_context (struct _Unwind_Context *context, _Unwind_FrameState *fs ATT
{
unsigned int *insn
= (unsigned int *) _Unwind_GetGR (context, R_LR);
- if (insn && *insn == 0xE8410028)
- _Unwind_SetGRPtr (context, 2, context->cfa + 40);
+ if (insn && *insn == 0xE8410000 + TOC_SAVE_SLOT)
+ _Unwind_SetGRPtr (context, 2, context->cfa + TOC_SAVE_SLOT);
+#if _CALL_ELF != 2
+ /* ELFv2 does not use this function pointer call sequence. */
else if (pc[0] == 0x4E800421
- && pc[1] == 0xE8410028)
+ && pc[1] == 0xE8410000 + TOC_SAVE_SLOT)
{
/* We are at the bctrl instruction in a call via function
pointer. gcc always emits the load of the new R2 just
before the bctrl so this is the first and only place
we need to use the stored R2. */
_Unwind_Word sp = _Unwind_GetGR (context, 1);
- _Unwind_SetGRPtr (context, 2, (void *)(sp + 40));
+ _Unwind_SetGRPtr (context, 2, (void *)(sp + TOC_SAVE_SLOT));
}
+#endif
}
}
#endif
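
For reference, the magic words being matched here are just the r2 save and restore
instructions with the stack offset folded into the displacement field, so the only
thing that differs between the ABIs is the slot. A throwaway check, with the offsets
taken from the hunk above:

    #include <stdio.h>

    int main (void)
    {
      /* ELFv1 keeps r2 at offset 40, ELFv2 at offset 24.  */
      int slots[] = { 40, 24 };
      for (int i = 0; i < 2; i++)
        printf ("slot %2d: std r2,N(r1) = %#010x   ld r2,N(r1) = %#010x\n",
                slots[i], 0xF8410000 + slots[i], 0xE8410000 + slots[i]);
      return 0;
    }
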
diff --git a/gcc-4.8/libgcc/config/rs6000/tramp.S b/gcc-4.8/libgcc/config/rs6000/tramp.S
index 14cb18de2..fe2a4543b 100644
--- a/gcc-4.8/libgcc/config/rs6000/tramp.S
+++ b/gcc-4.8/libgcc/config/rs6000/tramp.S
@@ -116,4 +116,70 @@ FUNC_END(__trampoline_setup)
#endif
+#elif _CALL_ELF == 2
+ .type trampoline_initial,@object
+ .align 3
+trampoline_initial:
+ ld r11,.Lchain(r12)
+ ld r12,.Lfunc(r12)
+ mtctr r12
+ bctr
+.Lfunc = .-trampoline_initial
+ .quad 0 /* will be replaced with function address */
+.Lchain = .-trampoline_initial
+ .quad 0 /* will be replaced with static chain */
+
+trampoline_size = .-trampoline_initial
+ .size trampoline_initial,trampoline_size
+
+
+/* R3 = stack address to store trampoline */
+/* R4 = length of trampoline area */
+/* R5 = function address */
+/* R6 = static chain */
+
+ .pushsection ".toc","aw"
+.LC0:
+ .quad trampoline_initial-8
+ .popsection
+
+FUNC_START(__trampoline_setup)
+ addis 7,2,.LC0@toc@ha
+ ld 7,.LC0@toc@l(7) /* trampoline address -8 */
+
+ li r8,trampoline_size /* verify that the trampoline is big enough */
+ cmpw cr1,r8,r4
+ srwi r4,r4,3 /* # doublewords to move */
+ addi r9,r3,-8 /* adjust pointer for stdu */
+ mtctr r4
+ blt cr1,.Labort
+
+ /* Copy the instructions to the stack */
+.Lmove:
+ ldu r10,8(r7)
+ stdu r10,8(r9)
+ bdnz .Lmove
+
+ /* Store correct function and static chain */
+ std r5,.Lfunc(r3)
+ std r6,.Lchain(r3)
+
+ /* Now flush both caches */
+ mtctr r4
+.Lcache:
+ icbi 0,r3
+ dcbf 0,r3
+ addi r3,r3,8
+ bdnz .Lcache
+
+ /* Finally synchronize things & return */
+ sync
+ isync
+ blr
+
+.Labort:
+ bl JUMP_TARGET(abort)
+ nop
+FUNC_END(__trampoline_setup)
+
#endif
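
Read as data, the ELFv2 trampoline that __trampoline_setup copies and patches looks
roughly like the struct below (a sketch, not a type the patch defines): sixteen
bytes of code addressed via r12, followed by the two doublewords the std
instructions overwrite.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct elfv2_trampoline
    {
      uint32_t insn[4];       /* ld r11,24(r12); ld r12,16(r12); mtctr r12; bctr */
      uint64_t func_addr;     /* .Lfunc  (offset 16): patched with the function address */
      uint64_t static_chain;  /* .Lchain (offset 24): patched with the static chain */
    };

    int main (void)
    {
      printf ("size %zu  .Lfunc %zu  .Lchain %zu\n",
              sizeof (struct elfv2_trampoline),
              offsetof (struct elfv2_trampoline, func_addr),
              offsetof (struct elfv2_trampoline, static_chain));
      return 0;
    }
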
diff --git a/gcc-4.8/libgcc/config/tilepro/atomic.c b/gcc-4.8/libgcc/config/tilepro/atomic.c
index 762bd6dc7..415353254 100644
--- a/gcc-4.8/libgcc/config/tilepro/atomic.c
+++ b/gcc-4.8/libgcc/config/tilepro/atomic.c
@@ -28,7 +28,7 @@
/* This code should be inlined by the compiler, but for now support
it as out-of-line methods in libgcc. */
-static void
+static inline void
pre_atomic_barrier (int model)
{
switch ((enum memmodel) model)
@@ -44,7 +44,7 @@ pre_atomic_barrier (int model)
return;
}
-static void
+static inline void
post_atomic_barrier (int model)
{
switch ((enum memmodel) model)
@@ -62,16 +62,21 @@ post_atomic_barrier (int model)
#define __unused __attribute__((unused))
-#define __atomic_fetch_and_do(type, size, opname) \
-type \
-__atomic_fetch_##opname##_##size(type* p, type i, int model) \
+#define __fetch_and_do(proto, type, size, opname, top, bottom) \
+proto \
{ \
- pre_atomic_barrier(model); \
+ top; \
type rv = arch_atomic_##opname(p, i); \
- post_atomic_barrier(model); \
+ bottom; \
return rv; \
}
+#define __atomic_fetch_and_do(type, size, opname) \
+ __fetch_and_do(type __atomic_fetch_##opname##_##size(type* p, type i, int model), \
+ type, size, opname, \
+ pre_atomic_barrier(model), \
+ post_atomic_barrier(model)) \
+
__atomic_fetch_and_do (int, 4, add)
__atomic_fetch_and_do (int, 4, sub)
__atomic_fetch_and_do (int, 4, or)
@@ -84,27 +89,73 @@ __atomic_fetch_and_do (long long, 8, or)
__atomic_fetch_and_do (long long, 8, and)
__atomic_fetch_and_do (long long, 8, xor)
__atomic_fetch_and_do (long long, 8, nand)
-#define __atomic_do_and_fetch(type, size, opname, op) \
-type \
-__atomic_##opname##_fetch_##size(type* p, type i, int model) \
+
+#define __sync_fetch_and_do(type, size, opname) \
+ __fetch_and_do(type __sync_fetch_and_##opname##_##size(type* p, type i), \
+ type, size, opname, \
+ arch_atomic_write_barrier(), \
+ arch_atomic_read_barrier())
+
+__sync_fetch_and_do (int, 4, add)
+__sync_fetch_and_do (int, 4, sub)
+__sync_fetch_and_do (int, 4, or)
+__sync_fetch_and_do (int, 4, and)
+__sync_fetch_and_do (int, 4, xor)
+__sync_fetch_and_do (int, 4, nand)
+__sync_fetch_and_do (long long, 8, add)
+__sync_fetch_and_do (long long, 8, sub)
+__sync_fetch_and_do (long long, 8, or)
+__sync_fetch_and_do (long long, 8, and)
+__sync_fetch_and_do (long long, 8, xor)
+__sync_fetch_and_do (long long, 8, nand)
+
+#define __do_and_fetch(proto, type, size, opname, op, op2, top, bottom) \
+proto \
{ \
- pre_atomic_barrier(model); \
- type rv = arch_atomic_##opname(p, i) op i; \
- post_atomic_barrier(model); \
+ top; \
+ type rv = op2 (arch_atomic_##opname(p, i) op i); \
+ bottom; \
return rv; \
}
-__atomic_do_and_fetch (int, 4, add, +)
-__atomic_do_and_fetch (int, 4, sub, -)
-__atomic_do_and_fetch (int, 4, or, |)
-__atomic_do_and_fetch (int, 4, and, &)
-__atomic_do_and_fetch (int, 4, xor, |)
-__atomic_do_and_fetch (int, 4, nand, &)
-__atomic_do_and_fetch (long long, 8, add, +)
-__atomic_do_and_fetch (long long, 8, sub, -)
-__atomic_do_and_fetch (long long, 8, or, |)
-__atomic_do_and_fetch (long long, 8, and, &)
-__atomic_do_and_fetch (long long, 8, xor, |)
-__atomic_do_and_fetch (long long, 8, nand, &)
+
+#define __atomic_do_and_fetch(type, size, opname, op, op2) \
+ __do_and_fetch(type __atomic_##opname##_fetch_##size(type* p, type i, int model), \
+ type, size, opname, op, op2, \
+ pre_atomic_barrier(model), \
+ post_atomic_barrier(model)) \
+
+__atomic_do_and_fetch (int, 4, add, +, )
+__atomic_do_and_fetch (int, 4, sub, -, )
+__atomic_do_and_fetch (int, 4, or, |, )
+__atomic_do_and_fetch (int, 4, and, &, )
+__atomic_do_and_fetch (int, 4, xor, |, )
+__atomic_do_and_fetch (int, 4, nand, &, ~)
+__atomic_do_and_fetch (long long, 8, add, +, )
+__atomic_do_and_fetch (long long, 8, sub, -, )
+__atomic_do_and_fetch (long long, 8, or, |, )
+__atomic_do_and_fetch (long long, 8, and, &, )
+__atomic_do_and_fetch (long long, 8, xor, |, )
+__atomic_do_and_fetch (long long, 8, nand, &, ~)
+
+#define __sync_do_and_fetch(type, size, opname, op, op2) \
+ __do_and_fetch(type __sync_##opname##_and_fetch_##size(type* p, type i), \
+ type, size, opname, op, op2, \
+ arch_atomic_write_barrier(), \
+ arch_atomic_read_barrier()) \
+
+__sync_do_and_fetch (int, 4, add, +, )
+__sync_do_and_fetch (int, 4, sub, -, )
+__sync_do_and_fetch (int, 4, or, |, )
+__sync_do_and_fetch (int, 4, and, &, )
+__sync_do_and_fetch (int, 4, xor, |, )
+__sync_do_and_fetch (int, 4, nand, &, ~)
+__sync_do_and_fetch (long long, 8, add, +, )
+__sync_do_and_fetch (long long, 8, sub, -, )
+__sync_do_and_fetch (long long, 8, or, |, )
+__sync_do_and_fetch (long long, 8, and, &, )
+__sync_do_and_fetch (long long, 8, xor, |, )
+__sync_do_and_fetch (long long, 8, nand, &, ~)
+
#define __atomic_exchange_methods(type, size) \
bool \
__atomic_compare_exchange_##size(volatile type* ptr, type* oldvalp, \
@@ -128,49 +179,117 @@ __atomic_exchange_##size(volatile type* ptr, type val, int model) \
post_atomic_barrier(model); \
return retval; \
}
+
__atomic_exchange_methods (int, 4)
__atomic_exchange_methods (long long, 8)
+#define __sync_exchange_methods(type, size) \
+type \
+__sync_val_compare_and_swap_##size(type* ptr, type oldval, type newval) \
+{ \
+ arch_atomic_write_barrier(); \
+ type retval = arch_atomic_val_compare_and_exchange(ptr, oldval, newval); \
+ arch_atomic_read_barrier(); \
+ return retval; \
+} \
+ \
+bool \
+__sync_bool_compare_and_swap_##size(type* ptr, type oldval, type newval) \
+{ \
+ arch_atomic_write_barrier(); \
+ bool retval = arch_atomic_bool_compare_and_exchange(ptr, oldval, newval); \
+ arch_atomic_read_barrier(); \
+ return retval; \
+} \
+ \
+type \
+__sync_lock_test_and_set_##size(type* ptr, type val) \
+{ \
+ type retval = arch_atomic_exchange(ptr, val); \
+ arch_atomic_acquire_barrier_value(retval); \
+ return retval; \
+}
+
+__sync_exchange_methods (int, 4)
+__sync_exchange_methods (long long, 8)
+
+#ifdef __LITTLE_ENDIAN__
+#define BIT_OFFSET(n, type) ((n) * 8)
+#else
+#define BIT_OFFSET(n, type) ((4 - sizeof(type) - (n)) * 8)
+#endif
+
/* Subword methods require the same approach for both TILEPro and
TILE-Gx. We load the background data for the word, insert the
desired subword piece, then compare-and-exchange it into place. */
#define u8 unsigned char
#define u16 unsigned short
-#define __atomic_subword_cmpxchg(type, size) \
- \
-bool \
-__atomic_compare_exchange_##size(volatile type* ptr, type* guess, \
- type val, bool weak __unused, int models, \
- int modelf __unused) \
-{ \
- pre_atomic_barrier(models); \
+
+#define __subword_cmpxchg_body(type, size, ptr, guess, val) \
+ ({ \
unsigned int *p = (unsigned int *)((unsigned long)ptr & ~3UL); \
- const int shift = ((unsigned long)ptr & 3UL) * 8; \
+ const int shift = BIT_OFFSET((unsigned long)ptr & 3UL, type); \
const unsigned int valmask = (1 << (sizeof(type) * 8)) - 1; \
const unsigned int bgmask = ~(valmask << shift); \
unsigned int oldword = *p; \
type oldval = (oldword >> shift) & valmask; \
- if (__builtin_expect((oldval == *guess), 1)) { \
+ if (__builtin_expect((oldval == guess), 1)) { \
unsigned int word = (oldword & bgmask) | ((val & valmask) << shift); \
oldword = arch_atomic_val_compare_and_exchange(p, oldword, word); \
oldval = (oldword >> shift) & valmask; \
} \
+ oldval; \
+ }) \
+
+#define __atomic_subword_cmpxchg(type, size) \
+ \
+bool \
+__atomic_compare_exchange_##size(volatile type* ptr, type* guess_ptr, \
+ type val, bool weak __unused, int models, \
+ int modelf __unused) \
+{ \
+ pre_atomic_barrier(models); \
+ type guess = *guess_ptr; \
+ type oldval = __subword_cmpxchg_body(type, size, ptr, guess, val); \
post_atomic_barrier(models); \
- bool success = (oldval == *guess); \
- *guess = oldval; \
+ bool success = (oldval == guess); \
+ *guess_ptr = oldval; \
return success; \
}
+
__atomic_subword_cmpxchg (u8, 1)
__atomic_subword_cmpxchg (u16, 2)
+
+#define __sync_subword_cmpxchg(type, size) \
+ \
+type \
+__sync_val_compare_and_swap_##size(type* ptr, type guess, type val) \
+{ \
+ arch_atomic_write_barrier(); \
+ type oldval = __subword_cmpxchg_body(type, size, ptr, guess, val); \
+ arch_atomic_read_barrier(); \
+ return oldval; \
+} \
+ \
+bool \
+__sync_bool_compare_and_swap_##size(type* ptr, type guess, type val) \
+{ \
+ type oldval = __sync_val_compare_and_swap_##size(ptr, guess, val); \
+ return oldval == guess; \
+}
+
+__sync_subword_cmpxchg (u8, 1)
+__sync_subword_cmpxchg (u16, 2)
+
/* For the atomic-update subword methods, we use the same approach as
above, but we retry until we succeed if the compare-and-exchange
fails. */
-#define __atomic_subword(type, proto, top, expr, bottom) \
+#define __subword(type, proto, top, expr, bottom) \
proto \
{ \
top \
unsigned int *p = (unsigned int *)((unsigned long)ptr & ~3UL); \
- const int shift = ((unsigned long)ptr & 3UL) * 8; \
+ const int shift = BIT_OFFSET((unsigned long)ptr & 3UL, type); \
const unsigned int valmask = (1 << (sizeof(type) * 8)) - 1; \
const unsigned int bgmask = ~(valmask << shift); \
unsigned int oldword, xword = *p; \
@@ -184,42 +303,93 @@ proto \
} while (__builtin_expect(xword != oldword, 0)); \
bottom \
}
+
#define __atomic_subword_fetch(type, funcname, expr, retval) \
- __atomic_subword(type, \
+ __subword(type, \
type __atomic_ ## funcname(volatile type *ptr, type i, int model), \
pre_atomic_barrier(model);, \
expr, \
post_atomic_barrier(model); return retval;)
+
__atomic_subword_fetch (u8, fetch_add_1, oldval + i, oldval)
__atomic_subword_fetch (u8, fetch_sub_1, oldval - i, oldval)
__atomic_subword_fetch (u8, fetch_or_1, oldval | i, oldval)
__atomic_subword_fetch (u8, fetch_and_1, oldval & i, oldval)
__atomic_subword_fetch (u8, fetch_xor_1, oldval ^ i, oldval)
__atomic_subword_fetch (u8, fetch_nand_1, ~(oldval & i), oldval)
+
__atomic_subword_fetch (u16, fetch_add_2, oldval + i, oldval)
__atomic_subword_fetch (u16, fetch_sub_2, oldval - i, oldval)
__atomic_subword_fetch (u16, fetch_or_2, oldval | i, oldval)
__atomic_subword_fetch (u16, fetch_and_2, oldval & i, oldval)
__atomic_subword_fetch (u16, fetch_xor_2, oldval ^ i, oldval)
__atomic_subword_fetch (u16, fetch_nand_2, ~(oldval & i), oldval)
+
__atomic_subword_fetch (u8, add_fetch_1, oldval + i, val)
__atomic_subword_fetch (u8, sub_fetch_1, oldval - i, val)
__atomic_subword_fetch (u8, or_fetch_1, oldval | i, val)
__atomic_subword_fetch (u8, and_fetch_1, oldval & i, val)
__atomic_subword_fetch (u8, xor_fetch_1, oldval ^ i, val)
__atomic_subword_fetch (u8, nand_fetch_1, ~(oldval & i), val)
+
__atomic_subword_fetch (u16, add_fetch_2, oldval + i, val)
__atomic_subword_fetch (u16, sub_fetch_2, oldval - i, val)
__atomic_subword_fetch (u16, or_fetch_2, oldval | i, val)
__atomic_subword_fetch (u16, and_fetch_2, oldval & i, val)
__atomic_subword_fetch (u16, xor_fetch_2, oldval ^ i, val)
__atomic_subword_fetch (u16, nand_fetch_2, ~(oldval & i), val)
+
+#define __sync_subword_fetch(type, funcname, expr, retval) \
+ __subword(type, \
+ type __sync_ ## funcname(type *ptr, type i), \
+ arch_atomic_read_barrier();, \
+ expr, \
+ arch_atomic_write_barrier(); return retval;)
+
+__sync_subword_fetch (u8, fetch_and_add_1, oldval + i, oldval)
+__sync_subword_fetch (u8, fetch_and_sub_1, oldval - i, oldval)
+__sync_subword_fetch (u8, fetch_and_or_1, oldval | i, oldval)
+__sync_subword_fetch (u8, fetch_and_and_1, oldval & i, oldval)
+__sync_subword_fetch (u8, fetch_and_xor_1, oldval ^ i, oldval)
+__sync_subword_fetch (u8, fetch_and_nand_1, ~(oldval & i), oldval)
+
+__sync_subword_fetch (u16, fetch_and_add_2, oldval + i, oldval)
+__sync_subword_fetch (u16, fetch_and_sub_2, oldval - i, oldval)
+__sync_subword_fetch (u16, fetch_and_or_2, oldval | i, oldval)
+__sync_subword_fetch (u16, fetch_and_and_2, oldval & i, oldval)
+__sync_subword_fetch (u16, fetch_and_xor_2, oldval ^ i, oldval)
+__sync_subword_fetch (u16, fetch_and_nand_2, ~(oldval & i), oldval)
+
+__sync_subword_fetch (u8, add_and_fetch_1, oldval + i, val)
+__sync_subword_fetch (u8, sub_and_fetch_1, oldval - i, val)
+__sync_subword_fetch (u8, or_and_fetch_1, oldval | i, val)
+__sync_subword_fetch (u8, and_and_fetch_1, oldval & i, val)
+__sync_subword_fetch (u8, xor_and_fetch_1, oldval ^ i, val)
+__sync_subword_fetch (u8, nand_and_fetch_1, ~(oldval & i), val)
+
+__sync_subword_fetch (u16, add_and_fetch_2, oldval + i, val)
+__sync_subword_fetch (u16, sub_and_fetch_2, oldval - i, val)
+__sync_subword_fetch (u16, or_and_fetch_2, oldval | i, val)
+__sync_subword_fetch (u16, and_and_fetch_2, oldval & i, val)
+__sync_subword_fetch (u16, xor_and_fetch_2, oldval ^ i, val)
+__sync_subword_fetch (u16, nand_and_fetch_2, ~(oldval & i), val)
+
#define __atomic_subword_lock(type, size) \
- \
-__atomic_subword(type, \
+ __subword(type, \
type __atomic_exchange_##size(volatile type* ptr, type nval, int model), \
pre_atomic_barrier(model);, \
nval, \
post_atomic_barrier(model); return oldval;)
+
__atomic_subword_lock (u8, 1)
__atomic_subword_lock (u16, 2)
+
+#define __sync_subword_lock(type, size) \
+ __subword(type, \
+ type __sync_lock_test_and_set_##size(type* ptr, type nval), \
+ , \
+ nval, \
+ arch_atomic_acquire_barrier_value(oldval); return oldval;)
+
+__sync_subword_lock (u8, 1)
+__sync_subword_lock (u16, 2)
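
A host-side sanity check of the BIT_OFFSET() rule introduced above, using GCC's
__BYTE_ORDER__ macros rather than the target-specific __LITTLE_ENDIAN__: extracting
a byte from a 32-bit word by address and by shift should agree on either endianness.

    #include <stdint.h>
    #include <stdio.h>

    #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    #define BIT_OFFSET(n, type) ((n) * 8)
    #else
    #define BIT_OFFSET(n, type) ((4 - sizeof (type) - (n)) * 8)
    #endif

    int main (void)
    {
      uint32_t word = 0x11223344;
      unsigned char *bytes = (unsigned char *) &word;
      for (int n = 0; n < 4; n++)
        {
          unsigned by_addr = bytes[n];
          unsigned by_shift = (word >> BIT_OFFSET (n, unsigned char)) & 0xff;
          printf ("byte %d: %02x %02x %s\n", n, by_addr, by_shift,
                  by_addr == by_shift ? "ok" : "MISMATCH");
        }
      return 0;
    }
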