/* TILE atomics.
   Copyright (C) 2011-2014 Free Software Foundation, Inc.
   Contributed by Walter Lee (walt@tilera.com)

   This file is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the
   Free Software Foundation; either version 3, or (at your option) any
   later version.

   This file is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include "tconfig.h"
#include "coretypes.h"
#include "atomic.h"

#define bool unsigned char

/* This code should be inlined by the compiler, but for now support
   it as out-of-line methods in libgcc.  */

static inline void
pre_atomic_barrier (int model)
{
  switch ((enum memmodel) model)
    {
    case MEMMODEL_RELEASE:
    case MEMMODEL_ACQ_REL:
    case MEMMODEL_SEQ_CST:
      __atomic_thread_fence (model);
      break;
    default:
      break;
    }
  return;
}

static inline void
post_atomic_barrier (int model)
{
  switch ((enum memmodel) model)
    {
    case MEMMODEL_ACQUIRE:
    case MEMMODEL_ACQ_REL:
    case MEMMODEL_SEQ_CST:
      __atomic_thread_fence (model);
      break;
    default:
      break;
    }
  return;
}

#define __unused __attribute__((unused))

#define __fetch_and_do(proto, type, size, opname, top, bottom)          \
proto                                                                   \
{                                                                       \
  top;                                                                  \
  type rv = arch_atomic_##opname(p, i);                                 \
  bottom;                                                               \
  return rv;                                                            \
}

#define __atomic_fetch_and_do(type, size, opname)                       \
  __fetch_and_do(type __atomic_fetch_##opname##_##size(type* p, type i, int model), \
                 type, size, opname,                                    \
                 pre_atomic_barrier(model),                             \
                 post_atomic_barrier(model))

__atomic_fetch_and_do (int, 4, add)
__atomic_fetch_and_do (int, 4, sub)
__atomic_fetch_and_do (int, 4, or)
__atomic_fetch_and_do (int, 4, and)
__atomic_fetch_and_do (int, 4, xor)
__atomic_fetch_and_do (int, 4, nand)
__atomic_fetch_and_do (long long, 8, add)
__atomic_fetch_and_do (long long, 8, sub)
__atomic_fetch_and_do (long long, 8, or)
__atomic_fetch_and_do (long long, 8, and)
__atomic_fetch_and_do (long long, 8, xor)
__atomic_fetch_and_do (long long, 8, nand)

#define __sync_fetch_and_do(type, size, opname)                         \
  __fetch_and_do(type __sync_fetch_and_##opname##_##size(type* p, type i), \
                 type, size, opname,                                    \
                 arch_atomic_write_barrier(),                           \
                 arch_atomic_read_barrier())

__sync_fetch_and_do (int, 4, add)
__sync_fetch_and_do (int, 4, sub)
__sync_fetch_and_do (int, 4, or)
__sync_fetch_and_do (int, 4, and)
__sync_fetch_and_do (int, 4, xor)
__sync_fetch_and_do (int, 4, nand)
__sync_fetch_and_do (long long, 8, add)
__sync_fetch_and_do (long long, 8, sub)
__sync_fetch_and_do (long long, 8, or)
__sync_fetch_and_do (long long, 8, and)
__sync_fetch_and_do (long long, 8, xor)
__sync_fetch_and_do (long long, 8, nand)

#define __do_and_fetch(proto, type, size, opname, op, op2, top, bottom) \
proto                                                                   \
{                                                                       \
  top;                                                                  \
  type rv = op2 (arch_atomic_##opname(p, i) op i);                      \
  bottom;                                                               \
  return rv;                                                            \
}

#define __atomic_do_and_fetch(type, size, opname, op, op2)              \
  __do_and_fetch(type __atomic_##opname##_fetch_##size(type* p, type i, int model), \
                 type, size, opname, op, op2,                           \
                 pre_atomic_barrier(model),                             \
                 post_atomic_barrier(model))
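/* For illustration, a hedged sketch of what these macros expand to (not
   compiled here).  __atomic_fetch_and_do (int, 4, add) above defines

     int __atomic_fetch_add_4 (int *p, int i, int model)
     {
       pre_atomic_barrier (model);
       int rv = arch_atomic_add (p, i);
       post_atomic_barrier (model);
       return rv;
     }

   and __atomic_do_and_fetch (int, 4, nand, &, ~) just below defines

     int __atomic_nand_fetch_4 (int *p, int i, int model)
     {
       pre_atomic_barrier (model);
       int rv = ~(arch_atomic_nand (p, i) & i);
       post_atomic_barrier (model);
       return rv;
     }

   i.e. `op' and `op2' recompute the post-operation value from the old
   value the arch primitive fetched.  */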
__atomic_do_and_fetch (int, 4, add, +, )
__atomic_do_and_fetch (int, 4, sub, -, )
__atomic_do_and_fetch (int, 4, or, |, )
__atomic_do_and_fetch (int, 4, and, &, )
__atomic_do_and_fetch (int, 4, xor, ^, )
__atomic_do_and_fetch (int, 4, nand, &, ~)
__atomic_do_and_fetch (long long, 8, add, +, )
__atomic_do_and_fetch (long long, 8, sub, -, )
__atomic_do_and_fetch (long long, 8, or, |, )
__atomic_do_and_fetch (long long, 8, and, &, )
__atomic_do_and_fetch (long long, 8, xor, ^, )
__atomic_do_and_fetch (long long, 8, nand, &, ~)

#define __sync_do_and_fetch(type, size, opname, op, op2)                \
  __do_and_fetch(type __sync_##opname##_and_fetch_##size(type* p, type i), \
                 type, size, opname, op, op2,                           \
                 arch_atomic_write_barrier(),                           \
                 arch_atomic_read_barrier())

__sync_do_and_fetch (int, 4, add, +, )
__sync_do_and_fetch (int, 4, sub, -, )
__sync_do_and_fetch (int, 4, or, |, )
__sync_do_and_fetch (int, 4, and, &, )
__sync_do_and_fetch (int, 4, xor, ^, )
__sync_do_and_fetch (int, 4, nand, &, ~)
__sync_do_and_fetch (long long, 8, add, +, )
__sync_do_and_fetch (long long, 8, sub, -, )
__sync_do_and_fetch (long long, 8, or, |, )
__sync_do_and_fetch (long long, 8, and, &, )
__sync_do_and_fetch (long long, 8, xor, ^, )
__sync_do_and_fetch (long long, 8, nand, &, ~)

#define __atomic_exchange_methods(type, size)                           \
bool                                                                    \
__atomic_compare_exchange_##size(volatile type* ptr, type* oldvalp,     \
                                 type newval, bool weak __unused,       \
                                 int models, int modelf __unused)       \
{                                                                       \
  type oldval = *oldvalp;                                               \
  pre_atomic_barrier(models);                                           \
  type retval = arch_atomic_val_compare_and_exchange(ptr, oldval, newval); \
  post_atomic_barrier(models);                                          \
  bool success = (retval == oldval);                                    \
  *oldvalp = retval;                                                    \
  return success;                                                       \
}                                                                       \
                                                                        \
type                                                                    \
__atomic_exchange_##size(volatile type* ptr, type val, int model)       \
{                                                                       \
  pre_atomic_barrier(model);                                            \
  type retval = arch_atomic_exchange(ptr, val);                         \
  post_atomic_barrier(model);                                           \
  return retval;                                                        \
}

__atomic_exchange_methods (int, 4)
__atomic_exchange_methods (long long, 8)

#define __sync_exchange_methods(type, size)                             \
type                                                                    \
__sync_val_compare_and_swap_##size(type* ptr, type oldval, type newval) \
{                                                                       \
  arch_atomic_write_barrier();                                          \
  type retval = arch_atomic_val_compare_and_exchange(ptr, oldval, newval); \
  arch_atomic_read_barrier();                                           \
  return retval;                                                        \
}                                                                       \
                                                                        \
bool                                                                    \
__sync_bool_compare_and_swap_##size(type* ptr, type oldval, type newval) \
{                                                                       \
  arch_atomic_write_barrier();                                          \
  bool retval = arch_atomic_bool_compare_and_exchange(ptr, oldval, newval); \
  arch_atomic_read_barrier();                                           \
  return retval;                                                        \
}                                                                       \
                                                                        \
type                                                                    \
__sync_lock_test_and_set_##size(type* ptr, type val)                    \
{                                                                       \
  type retval = arch_atomic_exchange(ptr, val);                         \
  arch_atomic_acquire_barrier_value(retval);                            \
  return retval;                                                        \
}

__sync_exchange_methods (int, 4)
__sync_exchange_methods (long long, 8)

#ifdef __LITTLE_ENDIAN__
#define BIT_OFFSET(n, type) ((n) * 8)
#else
#define BIT_OFFSET(n, type) ((4 - sizeof(type) - (n)) * 8)
#endif
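/* Worked example (illustrative): with __LITTLE_ENDIAN__, a u16 living at
   byte offset 2 of its aligned 4-byte word gives the subword helpers
   below

     shift   = BIT_OFFSET (2, u16)  = 16
     valmask = (1 << 16) - 1        = 0x0000ffff
     bgmask  = ~(valmask << shift)  = 0x0000ffff

   i.e. the halfword occupies bits 16..31, and bgmask preserves the
   unrelated low halfword across the word-sized compare-and-exchange.  */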
/* Subword methods require the same approach for both TILEPro and
   TILE-Gx.  We load the background data for the word, insert the
   desired subword piece, then compare-and-exchange it into place.  */

#define u8 unsigned char
#define u16 unsigned short

#define __subword_cmpxchg_body(type, size, ptr, guess, val)             \
  ({                                                                    \
    unsigned int *p = (unsigned int *)((unsigned long)ptr & ~3UL);      \
    const int shift = BIT_OFFSET((unsigned long)ptr & 3UL, type);       \
    const unsigned int valmask = (1 << (sizeof(type) * 8)) - 1;         \
    const unsigned int bgmask = ~(valmask << shift);                    \
    unsigned int oldword = *p;                                          \
    type oldval = (oldword >> shift) & valmask;                         \
    if (__builtin_expect((oldval == guess), 1)) {                       \
      unsigned int word = (oldword & bgmask) | ((val & valmask) << shift); \
      oldword = arch_atomic_val_compare_and_exchange(p, oldword, word); \
      oldval = (oldword >> shift) & valmask;                            \
    }                                                                   \
    oldval;                                                             \
  })

#define __atomic_subword_cmpxchg(type, size)                            \
                                                                        \
bool                                                                    \
__atomic_compare_exchange_##size(volatile type* ptr, type* guess_ptr,   \
                                 type val, bool weak __unused, int models, \
                                 int modelf __unused)                   \
{                                                                       \
  pre_atomic_barrier(models);                                           \
  type guess = *guess_ptr;                                              \
  type oldval = __subword_cmpxchg_body(type, size, ptr, guess, val);    \
  post_atomic_barrier(models);                                          \
  bool success = (oldval == guess);                                     \
  *guess_ptr = oldval;                                                  \
  return success;                                                       \
}

__atomic_subword_cmpxchg (u8, 1)
__atomic_subword_cmpxchg (u16, 2)

#define __sync_subword_cmpxchg(type, size)                              \
                                                                        \
type                                                                    \
__sync_val_compare_and_swap_##size(type* ptr, type guess, type val)     \
{                                                                       \
  arch_atomic_write_barrier();                                          \
  type oldval = __subword_cmpxchg_body(type, size, ptr, guess, val);    \
  arch_atomic_read_barrier();                                           \
  return oldval;                                                        \
}                                                                       \
                                                                        \
bool                                                                    \
__sync_bool_compare_and_swap_##size(type* ptr, type guess, type val)    \
{                                                                       \
  type oldval = __sync_val_compare_and_swap_##size(ptr, guess, val);    \
  return oldval == guess;                                               \
}

__sync_subword_cmpxchg (u8, 1)
__sync_subword_cmpxchg (u16, 2)
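/* For illustration, a hedged expansion sketch (not compiled here):
   __sync_subword_cmpxchg (u8, 1) above defines

     u8 __sync_val_compare_and_swap_1 (u8 *ptr, u8 guess, u8 val)
     {
       arch_atomic_write_barrier ();
       u8 oldval = __subword_cmpxchg_body (u8, 1, ptr, guess, val);
       arch_atomic_read_barrier ();
       return oldval;
     }

   where the body rounds ptr down to its containing aligned word, checks
   that the addressed byte still holds `guess', and if so issues a
   word-sized compare-and-exchange with that byte replaced by `val'.  */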
/* For the atomic-update subword methods, we use the same approach as
   above, but we retry until we succeed if the compare-and-exchange
   fails.  */

#define __subword(type, proto, top, expr, bottom)                       \
proto                                                                   \
{                                                                       \
  top                                                                   \
  unsigned int *p = (unsigned int *)((unsigned long)ptr & ~3UL);        \
  const int shift = BIT_OFFSET((unsigned long)ptr & 3UL, type);         \
  const unsigned int valmask = (1 << (sizeof(type) * 8)) - 1;           \
  const unsigned int bgmask = ~(valmask << shift);                      \
  unsigned int oldword, xword = *p;                                     \
  type val, oldval;                                                     \
  do {                                                                  \
    oldword = xword;                                                    \
    oldval = (oldword >> shift) & valmask;                              \
    val = expr;                                                         \
    unsigned int word = (oldword & bgmask) | ((val & valmask) << shift); \
    xword = arch_atomic_val_compare_and_exchange(p, oldword, word);     \
  } while (__builtin_expect(xword != oldword, 0));                      \
  bottom                                                                \
}

#define __atomic_subword_fetch(type, funcname, expr, retval)            \
  __subword(type,                                                       \
            type __atomic_ ## funcname(volatile type *ptr, type i, int model), \
            pre_atomic_barrier(model);,                                 \
            expr,                                                       \
            post_atomic_barrier(model); return retval;)

__atomic_subword_fetch (u8, fetch_add_1, oldval + i, oldval)
__atomic_subword_fetch (u8, fetch_sub_1, oldval - i, oldval)
__atomic_subword_fetch (u8, fetch_or_1, oldval | i, oldval)
__atomic_subword_fetch (u8, fetch_and_1, oldval & i, oldval)
__atomic_subword_fetch (u8, fetch_xor_1, oldval ^ i, oldval)
__atomic_subword_fetch (u8, fetch_nand_1, ~(oldval & i), oldval)
__atomic_subword_fetch (u16, fetch_add_2, oldval + i, oldval)
__atomic_subword_fetch (u16, fetch_sub_2, oldval - i, oldval)
__atomic_subword_fetch (u16, fetch_or_2, oldval | i, oldval)
__atomic_subword_fetch (u16, fetch_and_2, oldval & i, oldval)
__atomic_subword_fetch (u16, fetch_xor_2, oldval ^ i, oldval)
__atomic_subword_fetch (u16, fetch_nand_2, ~(oldval & i), oldval)
__atomic_subword_fetch (u8, add_fetch_1, oldval + i, val)
__atomic_subword_fetch (u8, sub_fetch_1, oldval - i, val)
__atomic_subword_fetch (u8, or_fetch_1, oldval | i, val)
__atomic_subword_fetch (u8, and_fetch_1, oldval & i, val)
__atomic_subword_fetch (u8, xor_fetch_1, oldval ^ i, val)
__atomic_subword_fetch (u8, nand_fetch_1, ~(oldval & i), val)
__atomic_subword_fetch (u16, add_fetch_2, oldval + i, val)
__atomic_subword_fetch (u16, sub_fetch_2, oldval - i, val)
__atomic_subword_fetch (u16, or_fetch_2, oldval | i, val)
__atomic_subword_fetch (u16, and_fetch_2, oldval & i, val)
__atomic_subword_fetch (u16, xor_fetch_2, oldval ^ i, val)
__atomic_subword_fetch (u16, nand_fetch_2, ~(oldval & i), val)
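/* For illustration, a hedged expansion sketch (not compiled here):
   __atomic_subword_fetch (u8, fetch_add_1, oldval + i, oldval) above
   defines

     u8 __atomic_fetch_add_1 (volatile u8 *ptr, u8 i, int model)
     {
       pre_atomic_barrier (model);
       unsigned int *p = (unsigned int *) ((unsigned long) ptr & ~3UL);
       const int shift = BIT_OFFSET ((unsigned long) ptr & 3UL, u8);
       const unsigned int valmask = (1 << (sizeof (u8) * 8)) - 1;
       const unsigned int bgmask = ~(valmask << shift);
       unsigned int oldword, xword = *p;
       u8 val, oldval;
       do
         {
           oldword = xword;
           oldval = (oldword >> shift) & valmask;
           val = oldval + i;
           unsigned int word = (oldword & bgmask)
                               | ((val & valmask) << shift);
           xword = arch_atomic_val_compare_and_exchange (p, oldword, word);
         }
       while (__builtin_expect (xword != oldword, 0));
       post_atomic_barrier (model);
       return oldval;
     }

   The CAS returns the word it actually observed, so a failed attempt
   restarts the loop with fresh background bits at no extra load.  */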
#define __sync_subword_fetch(type, funcname, expr, retval)              \
  __subword(type,                                                       \
            type __sync_ ## funcname(type *ptr, type i),                \
            arch_atomic_read_barrier();,                                \
            expr,                                                       \
            arch_atomic_write_barrier(); return retval;)

__sync_subword_fetch (u8, fetch_and_add_1, oldval + i, oldval)
__sync_subword_fetch (u8, fetch_and_sub_1, oldval - i, oldval)
__sync_subword_fetch (u8, fetch_and_or_1, oldval | i, oldval)
__sync_subword_fetch (u8, fetch_and_and_1, oldval & i, oldval)
__sync_subword_fetch (u8, fetch_and_xor_1, oldval ^ i, oldval)
__sync_subword_fetch (u8, fetch_and_nand_1, ~(oldval & i), oldval)
__sync_subword_fetch (u16, fetch_and_add_2, oldval + i, oldval)
__sync_subword_fetch (u16, fetch_and_sub_2, oldval - i, oldval)
__sync_subword_fetch (u16, fetch_and_or_2, oldval | i, oldval)
__sync_subword_fetch (u16, fetch_and_and_2, oldval & i, oldval)
__sync_subword_fetch (u16, fetch_and_xor_2, oldval ^ i, oldval)
__sync_subword_fetch (u16, fetch_and_nand_2, ~(oldval & i), oldval)
__sync_subword_fetch (u8, add_and_fetch_1, oldval + i, val)
__sync_subword_fetch (u8, sub_and_fetch_1, oldval - i, val)
__sync_subword_fetch (u8, or_and_fetch_1, oldval | i, val)
__sync_subword_fetch (u8, and_and_fetch_1, oldval & i, val)
__sync_subword_fetch (u8, xor_and_fetch_1, oldval ^ i, val)
__sync_subword_fetch (u8, nand_and_fetch_1, ~(oldval & i), val)
__sync_subword_fetch (u16, add_and_fetch_2, oldval + i, val)
__sync_subword_fetch (u16, sub_and_fetch_2, oldval - i, val)
__sync_subword_fetch (u16, or_and_fetch_2, oldval | i, val)
__sync_subword_fetch (u16, and_and_fetch_2, oldval & i, val)
__sync_subword_fetch (u16, xor_and_fetch_2, oldval ^ i, val)
__sync_subword_fetch (u16, nand_and_fetch_2, ~(oldval & i), val)

#define __atomic_subword_lock(type, size)                               \
  __subword(type,                                                       \
            type __atomic_exchange_##size(volatile type* ptr, type nval, int model), \
            pre_atomic_barrier(model);,                                 \
            nval,                                                       \
            post_atomic_barrier(model); return oldval;)

__atomic_subword_lock (u8, 1)
__atomic_subword_lock (u16, 2)

#define __sync_subword_lock(type, size)                                 \
  __subword(type,                                                       \
            type __sync_lock_test_and_set_##size(type* ptr, type nval), \
            ,                                                           \
            nval,                                                       \
            arch_atomic_acquire_barrier_value(oldval); return oldval;)

__sync_subword_lock (u8, 1)
__sync_subword_lock (u16, 2)
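/* Usage sketch (illustrative, not part of this file): the subword lock
   methods above are enough for a byte-sized spinlock, e.g.

     static u8 lock;

     static void
     spin_lock (void)
     {
       // __sync_lock_test_and_set_1 has acquire semantics and returns
       // the previous value; spin until we observe it unlocked (0).
       while (__sync_lock_test_and_set_1 (&lock, 1))
         ;
     }

   Unlocking stores 0 with release semantics, which is what the
   compiler's __sync_lock_release builtin provides.  */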