/* Copyright (C) 2007  Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file.  (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */

/*****************************************************************************
 *
 *    Helper add functions (for fma)
 *
 *    __BID_INLINE__ UINT64 get_add64(
 *        UINT64 sign_x, int exponent_x, UINT64 coefficient_x, 
 *        UINT64 sign_y, int exponent_y, UINT64 coefficient_y, 
 *  					 int rounding_mode)
 *
 *   __BID_INLINE__ UINT64 get_add128(
 *                       UINT64 sign_x, int exponent_x, UINT64 coefficient_x, 
 *                       UINT64 sign_y, int final_exponent_y, UINT128 CY, 
 *                       int extra_digits, int rounding_mode)
 *
 *****************************************************************************
 *
 *  Algorithm description:
 *
 *  get_add64:  same as BID64 add, but arguments are unpacked and there 
 *                                 are no special case checks
 *
 *  get_add128: add 64-bit coefficient to 128-bit product (which contains 
 *                                        16+extra_digits decimal digits), 
 *                         return BID64 result
 *              - the exponents are compared and the two coefficients are 
 *                properly aligned for addition/subtraction
 *              - multiple paths are needed
 *              - final result exponent is calculated and the lower term is
 *                      rounded first if necessary, to avoid manipulating 
 *                      coefficients longer than 128 bits 
 *
 ****************************************************************************/

#ifndef _INLINE_BID_ADD_H_
#define _INLINE_BID_ADD_H_

#include "bid_internal.h"

#define MAX_FORMAT_DIGITS     16
#define DECIMAL_EXPONENT_BIAS 398
#define MASK_BINARY_EXPONENT  0x7ff0000000000000ull
#define BINARY_EXPONENT_BIAS  0x3ff
#define UPPER_EXPON_LIMIT     51

///////////////////////////////////////////////////////////////////////
//
// get_add64() is essentially the same as bid_add(), except that 
//             the arguments are unpacked
//
//////////////////////////////////////////////////////////////////////
__BID_INLINE__ UINT64
get_add64 (UINT64 sign_x, int exponent_x, UINT64 coefficient_x,
	   UINT64 sign_y, int exponent_y, UINT64 coefficient_y,
	   int rounding_mode, unsigned *fpsc) {
  UINT128 CA, CT, CT_new;
  UINT64 sign_a, sign_b, coefficient_a, coefficient_b, sign_s, sign_ab,
    rem_a;
  UINT64 saved_ca, saved_cb, C0_64, C64, remainder_h, T1, carry, tmp,
    C64_new;
  int_double tempx;
  int exponent_a, exponent_b, diff_dec_expon;
  int bin_expon_ca, extra_digits, amount, scale_k, scale_ca;
  unsigned rmode, status;

  // sort arguments by exponent
  if (exponent_x <= exponent_y) {
    sign_a = sign_y;
    exponent_a = exponent_y;
    coefficient_a = coefficient_y;
    sign_b = sign_x;
    exponent_b = exponent_x;
    coefficient_b = coefficient_x;
  } else {
    sign_a = sign_x;
    exponent_a = exponent_x;
    coefficient_a = coefficient_x;
    sign_b = sign_y;
    exponent_b = exponent_y;
    coefficient_b = coefficient_y;
  }

  // exponent difference
  diff_dec_expon = exponent_a - exponent_b;

  /* get binary coefficients of x and y */

  //--- get number of bits in the coefficients of x and y ---

  tempx.d = (double) coefficient_a;
  bin_expon_ca = ((tempx.i & MASK_BINARY_EXPONENT) >> 52) - 0x3ff;

  if (!coefficient_a) {
    return get_BID64 (sign_b, exponent_b, coefficient_b, rounding_mode,
		      fpsc);
  }
  if (diff_dec_expon > MAX_FORMAT_DIGITS) {
    // normalize a to a 16-digit coefficient

    scale_ca = estimate_decimal_digits[bin_expon_ca];
    if (coefficient_a >= power10_table_128[scale_ca].w[0])
      scale_ca++;

    scale_k = 16 - scale_ca;

    coefficient_a *= power10_table_128[scale_k].w[0];

    diff_dec_expon -= scale_k;
    exponent_a -= scale_k;

    /* get binary coefficients of x and y */

    //--- get number of bits in the coefficients of x and y ---
    tempx.d = (double) coefficient_a;
    bin_expon_ca = ((tempx.i & MASK_BINARY_EXPONENT) >> 52) - 0x3ff;

    if (diff_dec_expon > MAX_FORMAT_DIGITS) {
#ifdef SET_STATUS_FLAGS
      if (coefficient_b) {
	__set_status_flags (fpsc, INEXACT_EXCEPTION);
      }
#endif

#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
      if (((rounding_mode) & 3) && coefficient_b)	// not ROUNDING_TO_NEAREST
      {
	switch (rounding_mode) {
	case ROUNDING_DOWN:
	  if (sign_b) {
	    coefficient_a -= ((((SINT64) sign_a) >> 63) | 1);
	    if (coefficient_a < 1000000000000000ull) {
	      exponent_a--;
	      coefficient_a = 9999999999999999ull;
	    } else if (coefficient_a >= 10000000000000000ull) {
	      exponent_a++;
	      coefficient_a = 1000000000000000ull;
	    }
	  }
	  break;
	case ROUNDING_UP:
	  if (!sign_b) {
	    coefficient_a += ((((SINT64) sign_a) >> 63) | 1);
	    if (coefficient_a < 1000000000000000ull) {
	      exponent_a--;
	      coefficient_a = 9999999999999999ull;
	    } else if (coefficient_a >= 10000000000000000ull) {
	      exponent_a++;
	      coefficient_a = 1000000000000000ull;
	    }
	  }
	  break;
	default:	// RZ
	  if (sign_a != sign_b) {
	    coefficient_a--;
	    if (coefficient_a < 1000000000000000ull) {
	      exponent_a--;
	      coefficient_a = 9999999999999999ull;
	    }
	  }
	  break;
	}
      } else
#endif
#endif
	// check special case here
	if ((coefficient_a == 1000000000000000ull)
	    && (diff_dec_expon == MAX_FORMAT_DIGITS + 1)
	    && (sign_a ^ sign_b)
	    && (coefficient_b > 5000000000000000ull)) {
	coefficient_a = 9999999999999999ull;
	exponent_a--;
      }

      return get_BID64 (sign_a, exponent_a, coefficient_a,
			rounding_mode, fpsc);
    }
  }
  // test whether coefficient_a*10^(exponent_a-exponent_b)  may exceed 2^62
  if (bin_expon_ca + estimate_bin_expon[diff_dec_expon] < 60) {
    // coefficient_a*10^(exponent_a-exponent_b)<2^63

    // multiply by 10^(exponent_a-exponent_b)
    coefficient_a *= power10_table_128[diff_dec_expon].w[0];

    // sign mask
    sign_b = ((SINT64) sign_b) >> 63;
    // apply sign to coeff. of b
    coefficient_b = (coefficient_b + sign_b) ^ sign_b;

    // apply sign to coefficient a
    sign_a = ((SINT64) sign_a) >> 63;
    coefficient_a = (coefficient_a + sign_a) ^ sign_a;

    coefficient_a += coefficient_b;
    // get sign
    sign_s = ((SINT64) coefficient_a) >> 63;
    coefficient_a = (coefficient_a + sign_s) ^ sign_s;
    sign_s &= 0x8000000000000000ull;

    // coefficient_a < 10^16 ?
    if (coefficient_a < power10_table_128[MAX_FORMAT_DIGITS].w[0]) {
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
      if (rounding_mode == ROUNDING_DOWN && (!coefficient_a)
	  && sign_a != sign_b)
	sign_s = 0x8000000000000000ull;
#endif
#endif
      return get_BID64 (sign_s, exponent_b, coefficient_a,
			rounding_mode, fpsc);
    }
    // otherwise rounding is necessary

    // already know coefficient_a<10^19
    // coefficient_a < 10^17 ?
    if (coefficient_a < power10_table_128[17].w[0])
      extra_digits = 1;
    else if (coefficient_a < power10_table_128[18].w[0])
      extra_digits = 2;
    else
      extra_digits = 3;

#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
    rmode = rounding_mode;
    if (sign_s && (unsigned) (rmode - 1) < 2)
      rmode = 3 - rmode;
#else
    rmode = 0;
#endif
#else
    rmode = 0;
#endif
    coefficient_a += round_const_table[rmode][extra_digits];

    // get P*(2^M[extra_digits])/10^extra_digits
    __mul_64x64_to_128 (CT, coefficient_a,
			reciprocals10_64[extra_digits]);

    // now get P/10^extra_digits: shift C64 right by M[extra_digits]-128
    amount = short_recip_scale[extra_digits];
    C64 = CT.w[1] >> amount;

  } else {
    // coefficient_a*10^(exponent_a-exponent_b) is large
    sign_s = sign_a;

#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
    rmode = rounding_mode;
    if (sign_s && (unsigned) (rmode - 1) < 2)
      rmode = 3 - rmode;
#else
    rmode = 0;
#endif
#else
    rmode = 0;
#endif

    // check whether we can take faster path
    scale_ca = estimate_decimal_digits[bin_expon_ca];

    sign_ab = sign_a ^ sign_b;
    sign_ab = ((SINT64) sign_ab) >> 63;

    // T1 = 10^(16-diff_dec_expon)
    T1 = power10_table_128[16 - diff_dec_expon].w[0];

    // get number of digits in coefficient_a
    //P_ca = power10_table_128[scale_ca].w[0];
    //P_ca_m1 = power10_table_128[scale_ca-1].w[0];
    if (coefficient_a >= power10_table_128[scale_ca].w[0]) {
      scale_ca++;
      //P_ca_m1 = P_ca;
      //P_ca = power10_table_128[scale_ca].w[0];
    }

    scale_k = 16 - scale_ca;

    // apply sign
    //Ts = (T1 + sign_ab) ^ sign_ab;

    // test range of ca
    //X = coefficient_a + Ts - P_ca_m1;

    // addition
    saved_ca = coefficient_a - T1;
    coefficient_a =
      (SINT64) saved_ca *(SINT64) power10_table_128[scale_k].w[0];
    extra_digits = diff_dec_expon - scale_k;

    // apply sign
    saved_cb = (coefficient_b + sign_ab) ^ sign_ab;
    // add 10^16 and rounding constant
    coefficient_b =
      saved_cb + 10000000000000000ull +
      round_const_table[rmode][extra_digits];

    // get P*(2^M[extra_digits])/10^extra_digits
    __mul_64x64_to_128 (CT, coefficient_b,
			reciprocals10_64[extra_digits]);

    // now get P/10^extra_digits: shift C64 right by M[extra_digits]-128
    amount = short_recip_scale[extra_digits];
    C0_64 = CT.w[1] >> amount;

    // result coefficient 
    C64 = C0_64 + coefficient_a;
    // filter out difficult (corner) cases
    // the following test is equivalent to 
    // ( (initial_coefficient_a + Ts) < P_ca && 
    //     (initial_coefficient_a + Ts) > P_ca_m1 ), 
    // which ensures the number of digits in coefficient_a does not change 
    // after adding (the appropriately scaled and rounded) coefficient_b
    if ((UINT64) (C64 - 1000000000000000ull - 1) >
	9000000000000000ull - 2) {
      if (C64 >= 10000000000000000ull) {
	// result has more than 16 digits
	if (!scale_k) {
	  // must divide coeff_a by 10
	  saved_ca = saved_ca + T1;
	  __mul_64x64_to_128 (CA, saved_ca, 0x3333333333333334ull);
	  //reciprocals10_64[1]);
	  coefficient_a = CA.w[1] >> 1;
	  rem_a =
	    saved_ca - (coefficient_a << 3) - (coefficient_a << 1);
	  coefficient_a = coefficient_a - T1;

	  saved_cb +=
	    /*90000000000000000 */ +rem_a *
	    power10_table_128[diff_dec_expon].w[0];
	} else
	  coefficient_a =
	    (SINT64) (saved_ca - T1 -
		      (T1 << 3)) * (SINT64) power10_table_128[scale_k -
							      1].w[0];

	extra_digits++;
	coefficient_b =
	  saved_cb + 100000000000000000ull +
	  round_const_table[rmode][extra_digits];

	// get P*(2^M[extra_digits])/10^extra_digits
	__mul_64x64_to_128 (CT, coefficient_b,
			    reciprocals10_64[extra_digits]);

	// now get P/10^extra_digits: shift C64 right by M[extra_digits]-128
	amount = short_recip_scale[extra_digits];
	C0_64 = CT.w[1] >> amount;

	// result coefficient 
	C64 = C0_64 + coefficient_a;
      } else if (C64 <= 1000000000000000ull) {
	// less than 16 digits in result
	coefficient_a =
	  (SINT64) saved_ca *(SINT64) power10_table_128[scale_k +
							1].w[0];
	//extra_digits --;
	exponent_b--;
	coefficient_b =
	  (saved_cb << 3) + (saved_cb << 1) + 100000000000000000ull +
	  round_const_table[rmode][extra_digits];

	// get P*(2^M[extra_digits])/10^extra_digits
	__mul_64x64_to_128 (CT_new, coefficient_b,
			    reciprocals10_64[extra_digits]);

	// now get P/10^extra_digits: shift C64 right by M[extra_digits]-128
	amount = short_recip_scale[extra_digits];
	C0_64 = CT_new.w[1] >> amount;

	// result coefficient 
	C64_new = C0_64 + coefficient_a;
	if (C64_new < 10000000000000000ull) {
	  C64 = C64_new;
#ifdef SET_STATUS_FLAGS
	  CT = CT_new;
#endif
	} else
	  exponent_b++;
      }

    }

  }

#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
  if (rmode == 0)	//ROUNDING_TO_NEAREST
#endif
    if (C64 & 1) {
      // check whether fractional part of initial_P/10^extra_digits 
      // is exactly .5
      // this is the same as fractional part of 
      //      (initial_P + 0.5*10^extra_digits)/10^extra_digits is exactly zero

      // get remainder
      remainder_h = CT.w[1] << (64 - amount);

      // test whether fractional part is 0
      if (!remainder_h && (CT.w[0] < reciprocals10_64[extra_digits])) {
	C64--;
      }
    }
#endif

#ifdef SET_STATUS_FLAGS
  status = INEXACT_EXCEPTION;

  // get remainder
  remainder_h = CT.w[1] << (64 - amount);

  switch (rmode) {
  case ROUNDING_TO_NEAREST:
  case ROUNDING_TIES_AWAY:
    // test whether fractional part is 0
    if ((remainder_h == 0x8000000000000000ull)
	&& (CT.w[0] < reciprocals10_64[extra_digits]))
      status = EXACT_STATUS;
    break;
  case ROUNDING_DOWN:
  case ROUNDING_TO_ZERO:
    if (!remainder_h && (CT.w[0] < reciprocals10_64[extra_digits]))
      status = EXACT_STATUS;
    break;
  default:
    // round up
    __add_carry_out (tmp, carry, CT.w[0],
		     reciprocals10_64[extra_digits]);
    if ((remainder_h >> (64 - amount)) + carry >=
	(((UINT64) 1) << amount))
      status = EXACT_STATUS;
    break;
  }
  __set_status_flags (fpsc, status);

#endif

  return get_BID64 (sign_s, exponent_b + extra_digits, C64,
		    rounding_mode, fpsc);
}


///////////////////////////////////////////////////////////////////
// round 128-bit coefficient and return result in BID64 format
// do not worry about midpoint cases
//////////////////////////////////////////////////////////////////
static UINT64
__bid_simple_round64_sticky (UINT64 sign, int exponent, UINT128 P,
			     int extra_digits, int rounding_mode,
			     unsigned *fpsc) {
  UINT128 Q_high, Q_low, C128;
  UINT64 C64;
  int amount, rmode;

  rmode = rounding_mode;
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
  if (sign && (unsigned) (rmode - 1) < 2)
    rmode = 3 - rmode;
#endif
#endif
  __add_128_64 (P, P, round_const_table[rmode][extra_digits]);

  // get P*(2^M[extra_digits])/10^extra_digits
  __mul_128x128_full (Q_high, Q_low, P,
		      reciprocals10_128[extra_digits]);

  // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
  amount = recip_scale[extra_digits];
  __shr_128 (C128, Q_high, amount);

  C64 = __low_64 (C128);

#ifdef SET_STATUS_FLAGS

  __set_status_flags (fpsc, INEXACT_EXCEPTION);

#endif

  return get_BID64 (sign, exponent, C64, rounding_mode, fpsc);
}

///////////////////////////////////////////////////////////////////
// round 128-bit coefficient and return result in BID64 format
///////////////////////////////////////////////////////////////////
static UINT64
__bid_full_round64 (UINT64 sign, int exponent, UINT128 P,
		    int extra_digits, int rounding_mode,
		    unsigned *fpsc) {
  UINT128 Q_high, Q_low, C128, Stemp, PU;
  UINT64 remainder_h, C64, carry, CY;
  int amount, amount2, rmode, status = 0;

  if (exponent < 0) {
    if (exponent >= -16 && (extra_digits + exponent < 0)) {
      extra_digits = -exponent;
#ifdef SET_STATUS_FLAGS
      if (extra_digits > 0) {
	rmode = rounding_mode;
	if (sign && (unsigned) (rmode - 1) < 2)
	  rmode = 3 - rmode;
	__add_128_128 (PU, P,
		       round_const_table_128[rmode][extra_digits]);
	if (__unsigned_compare_gt_128
	    (power10_table_128[extra_digits + 15], PU))
	  status = UNDERFLOW_EXCEPTION;
      }
#endif
    }
  }

  if (extra_digits > 0) {
    exponent += extra_digits;
    rmode = rounding_mode;
    if (sign && (unsigned) (rmode - 1) < 2)
      rmode = 3 - rmode;
    __add_128_128 (P, P, round_const_table_128[rmode][extra_digits]);

    // get P*(2^M[extra_digits])/10^extra_digits
    __mul_128x128_full (Q_high, Q_low, P,
			reciprocals10_128[extra_digits]);

    // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
    amount = recip_scale[extra_digits];
    __shr_128_long (C128, Q_high, amount);

    C64 = __low_64 (C128);

#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
    if (rmode == 0)	//ROUNDING_TO_NEAREST
#endif
      if (C64 & 1) {
	// check whether fractional part of initial_P/10^extra_digits 
	// is exactly .5

	// get remainder
	amount2 = 64 - amount;
	remainder_h = 0;
	remainder_h--;
	remainder_h >>= amount2;
	remainder_h = remainder_h & Q_high.w[0];

	if (!remainder_h
	    && (Q_low.w[1] < reciprocals10_128[extra_digits].w[1]
		|| (Q_low.w[1] == reciprocals10_128[extra_digits].w[1]
		    && Q_low.w[0] <
		    reciprocals10_128[extra_digits].w[0]))) {
	  C64--;
	}
      }
#endif

#ifdef SET_STATUS_FLAGS
    status |= INEXACT_EXCEPTION;

    // get remainder
    remainder_h = Q_high.w[0] << (64 - amount);

    switch (rmode) {
    case ROUNDING_TO_NEAREST:
    case ROUNDING_TIES_AWAY:
      // test whether fractional part is 0
      if (remainder_h == 0x8000000000000000ull
	  && (Q_low.w[1] < reciprocals10_128[extra_digits].w[1]
	      || (Q_low.w[1] == reciprocals10_128[extra_digits].w[1]
		  && Q_low.w[0] <
		  reciprocals10_128[extra_digits].w[0])))
	status = EXACT_STATUS;
      break;
    case ROUNDING_DOWN:
    case ROUNDING_TO_ZERO:
      if (!remainder_h
	  && (Q_low.w[1] < reciprocals10_128[extra_digits].w[1]
	      || (Q_low.w[1] == reciprocals10_128[extra_digits].w[1]
		  && Q_low.w[0] <
		  reciprocals10_128[extra_digits].w[0])))
	status = EXACT_STATUS;
      break;
    default:
      // round up
      __add_carry_out (Stemp.w[0], CY, Q_low.w[0],
		       reciprocals10_128[extra_digits].w[0]);
      __add_carry_in_out (Stemp.w[1], carry, Q_low.w[1],
			  reciprocals10_128[extra_digits].w[1], CY);
      if ((remainder_h >> (64 - amount)) + carry >=
	  (((UINT64) 1) << amount))
	status = EXACT_STATUS;
    }

    __set_status_flags (fpsc, status);

#endif
  } else {
    C64 = P.w[0];
    if (!C64) {
      sign = 0;
      if (rounding_mode == ROUNDING_DOWN)
	sign = 0x8000000000000000ull;
    }
  }
  return get_BID64 (sign, exponent, C64, rounding_mode, fpsc);
}

/////////////////////////////////////////////////////////////////////////////////
// round 192-bit coefficient (P, remainder_P) and return result in BID64 format
// the lowest 64 bits (remainder_P) are used for midpoint checking only
////////////////////////////////////////////////////////////////////////////////
static UINT64
__bid_full_round64_remainder (UINT64 sign, int exponent, UINT128 P,
			      int extra_digits, UINT64 remainder_P,
			      int rounding_mode, unsigned *fpsc,
			      unsigned uf_status) {
  UINT128 Q_high, Q_low, C128, Stemp;
  UINT64 remainder_h, C64, carry, CY;
  int amount, amount2, rmode, status = uf_status;

  rmode = rounding_mode;
  if (sign && (unsigned) (rmode - 1) < 2)
    rmode = 3 - rmode;
  if (rmode == ROUNDING_UP && remainder_P) {
    P.w[0]++;
    if (!P.w[0])
      P.w[1]++;
  }

  if (extra_digits) {
    __add_128_64 (P, P, round_const_table[rmode][extra_digits]);

    // get P*(2^M[extra_digits])/10^extra_digits
    __mul_128x128_full (Q_high, Q_low, P,
			reciprocals10_128[extra_digits]);

    // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
    amount = recip_scale[extra_digits];
    __shr_128 (C128, Q_high, amount);

    C64 = __low_64 (C128);

#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
    if (rmode == 0)	//ROUNDING_TO_NEAREST
#endif
      if (!remainder_P && (C64 & 1)) {
	// check whether fractional part of initial_P/10^extra_digits 
	// is exactly .5

	// get remainder
	amount2 = 64 - amount;
	remainder_h = 0;
	remainder_h--;
	remainder_h >>= amount2;
	remainder_h = remainder_h & Q_high.w[0];

	if (!remainder_h
	    && (Q_low.w[1] < reciprocals10_128[extra_digits].w[1]
		|| (Q_low.w[1] == reciprocals10_128[extra_digits].w[1]
		    && Q_low.w[0] <
		    reciprocals10_128[extra_digits].w[0]))) {
	  C64--;
	}
      }
#endif

#ifdef SET_STATUS_FLAGS
    status |= INEXACT_EXCEPTION;

    if (!remainder_P) {
      // get remainder
      remainder_h = Q_high.w[0] << (64 - amount);

      switch (rmode) {
      case ROUNDING_TO_NEAREST:
      case ROUNDING_TIES_AWAY:
	// test whether fractional part is 0
	if (remainder_h == 0x8000000000000000ull
	    && (Q_low.w[1] < reciprocals10_128[extra_digits].w[1]
		|| (Q_low.w[1] == reciprocals10_128[extra_digits].w[1]
		    && Q_low.w[0] <
		    reciprocals10_128[extra_digits].w[0])))
	  status = EXACT_STATUS;
	break;
      case ROUNDING_DOWN:
      case ROUNDING_TO_ZERO:
	if (!remainder_h
	    && (Q_low.w[1] < reciprocals10_128[extra_digits].w[1]
		|| (Q_low.w[1] == reciprocals10_128[extra_digits].w[1]
		    && Q_low.w[0] <
		    reciprocals10_128[extra_digits].w[0])))
	  status = EXACT_STATUS;
	break;
      default:
	// round up
	__add_carry_out (Stemp.w[0], CY, Q_low.w[0],
			 reciprocals10_128[extra_digits].w[0]);
	__add_carry_in_out (Stemp.w[1], carry, Q_low.w[1],
			    reciprocals10_128[extra_digits].w[1], CY);
	if ((remainder_h >> (64 - amount)) + carry >=
	    (((UINT64) 1) << amount))
	  status = EXACT_STATUS;
      }
    }
    __set_status_flags (fpsc, status);

#endif
  } else {
    C64 = P.w[0];
#ifdef SET_STATUS_FLAGS
    if (remainder_P) {
      __set_status_flags (fpsc, uf_status | INEXACT_EXCEPTION);
    }
#endif
  }

  return get_BID64 (sign, exponent + extra_digits, C64, rounding_mode,
		    fpsc);
}


///////////////////////////////////////////////////////////////////
// get P/10^extra_digits
// result fits in 64 bits
///////////////////////////////////////////////////////////////////
__BID_INLINE__ UINT64
__truncate (UINT128 P, int extra_digits)
// extra_digits <= 16
{
  UINT128 Q_high, Q_low, C128;
  UINT64 C64;
  int amount;

  // get P*(2^M[extra_digits])/10^extra_digits
  __mul_128x128_full (Q_high, Q_low, P,
		      reciprocals10_128[extra_digits]);

  // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
  amount = recip_scale[extra_digits];
  __shr_128 (C128, Q_high, amount);

  C64 = __low_64 (C128);

  return C64;
}


///////////////////////////////////////////////////////////////////
// return number of decimal digits in 128-bit value X
///////////////////////////////////////////////////////////////////
__BID_INLINE__ int
__get_dec_digits64 (UINT128 X) {
  int_double tempx;
  int digits_x, bin_expon_cx;

  if (!X.w[1]) {
    //--- get number of bits in the coefficients of x and y ---
    tempx.d = (double) X.w[0];
    bin_expon_cx = ((tempx.i & MASK_BINARY_EXPONENT) >> 52) - 0x3ff;
    // get number of decimal digits in the coeff_x
    digits_x = estimate_decimal_digits[bin_expon_cx];
    if (X.w[0] >= power10_table_128[digits_x].w[0])
      digits_x++;
    return digits_x;
  }
  tempx.d = (double) X.w[1];
  bin_expon_cx = ((tempx.i & MASK_BINARY_EXPONENT) >> 52) - 0x3ff;
  // get number of decimal digits in the coeff_x
  digits_x = estimate_decimal_digits[bin_expon_cx + 64];
  if (__unsigned_compare_ge_128 (X, power10_table_128[digits_x]))
    digits_x++;

  return digits_x;
}


////////////////////////////////////////////////////////////////////////////////
//
// add 64-bit coefficient to 128-bit coefficient, return result in BID64 format
//
////////////////////////////////////////////////////////////////////////////////
__BID_INLINE__ UINT64
get_add128 (UINT64 sign_x, int exponent_x, UINT64 coefficient_x,
	    UINT64 sign_y, int final_exponent_y, UINT128 CY,
	    int extra_digits, int rounding_mode, unsigned *fpsc) {
  UINT128 CY_L, CX, FS, F, CT, ST, T2;
  UINT64 CYh, CY0L, T, S, coefficient_y, remainder_y;
  SINT64 D = 0;
  int_double tempx;
  int diff_dec_expon, extra_digits2, exponent_y, status;
  int extra_dx, diff_dec2, bin_expon_cx, digits_x, rmode;

  // CY has more than 16 decimal digits

  exponent_y = final_exponent_y - extra_digits;

#ifdef IEEE_ROUND_NEAREST_TIES_AWAY
  rounding_mode = 0;
#endif
#ifdef IEEE_ROUND_NEAREST
  rounding_mode = 0;
#endif

  if (exponent_x > exponent_y) {
    // normalize x
    //--- get number of bits in the coefficients of x and y ---
    tempx.d = (double) coefficient_x;
    bin_expon_cx = ((tempx.i & MASK_BINARY_EXPONENT) >> 52) - 0x3ff;
    // get number of decimal digits in the coeff_x
    digits_x = estimate_decimal_digits[bin_expon_cx];
    if (coefficient_x >= power10_table_128[digits_x].w[0])
      digits_x++;

    extra_dx = 16 - digits_x;
    coefficient_x *= power10_table_128[extra_dx].w[0];
    if ((sign_x ^ sign_y) && (coefficient_x == 1000000000000000ull)) {
      extra_dx++;
      coefficient_x = 10000000000000000ull;
    }
    exponent_x -= extra_dx;

    if (exponent_x > exponent_y) {

      // exponent_x > exponent_y
      diff_dec_expon = exponent_x - exponent_y;

      if (exponent_x <= final_exponent_y + 1) {
	__mul_64x64_to_128 (CX, coefficient_x,
			    power10_table_128[diff_dec_expon].w[0]);

	if (sign_x == sign_y) {
	  __add_128_128 (CT, CY, CX);
	  if ((exponent_x >
	       final_exponent_y) /*&& (final_exponent_y>0) */ )
	    extra_digits++;
	  if (__unsigned_compare_ge_128
	      (CT, power10_table_128[16 + extra_digits]))
	    extra_digits++;
	} else {
	  __sub_128_128 (CT, CY, CX);
	  if (((SINT64) CT.w[1]) < 0) {
	    CT.w[0] = 0 - CT.w[0];
	    CT.w[1] = 0 - CT.w[1];
	    if (CT.w[0])
	      CT.w[1]--;
	    sign_y = sign_x;
	  } else if (!(CT.w[1] | CT.w[0])) {
	    sign_y =
	      (rounding_mode !=
	       ROUNDING_DOWN) ? 0 : 0x8000000000000000ull;
	  }
	  if ((exponent_x + 1 >=
	       final_exponent_y) /*&& (final_exponent_y>=0) */ ) {
	    extra_digits = __get_dec_digits64 (CT) - 16;
	    if (extra_digits <= 0) {
	      if (!CT.w[0] && rounding_mode == ROUNDING_DOWN)
		sign_y = 0x8000000000000000ull;
	      return get_BID64 (sign_y, exponent_y, CT.w[0],
				rounding_mode, fpsc);
	    }
	  } else
	    if (__unsigned_compare_gt_128
		(power10_table_128[15 + extra_digits], CT))
	    extra_digits--;
	}

	return __bid_full_round64 (sign_y, exponent_y, CT, extra_digits,
				   rounding_mode, fpsc);
      }
      // diff_dec2+extra_digits is the number of digits to eliminate from 
      //                           argument CY
      diff_dec2 = exponent_x - final_exponent_y;

      if (diff_dec2 >= 17) {
#ifndef IEEE_ROUND_NEAREST
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
	if ((rounding_mode) & 3) {
	  switch (rounding_mode) {
	  case ROUNDING_UP:
	    if (!sign_y) {
	      D = ((SINT64) (sign_x ^ sign_y)) >> 63;
	      D = D + D + 1;
	      coefficient_x += D;
	    }
	    break;
	  case ROUNDING_DOWN:
	    if (sign_y) {
	      D = ((SINT64) (sign_x ^ sign_y)) >> 63;
	      D = D + D + 1;
	      coefficient_x += D;
	    }
	    break;
	  case ROUNDING_TO_ZERO:
	    if (sign_y != sign_x) {
	      D = 0 - 1;
	      coefficient_x += D;
	    }
	    break;
	  }
	  if (coefficient_x < 1000000000000000ull) {
	    coefficient_x -= D;
	    coefficient_x =
	      D + (coefficient_x << 1) + (coefficient_x << 3);
	    exponent_x--;
	  }
	}
#endif
#endif
#ifdef SET_STATUS_FLAGS
	if (CY.w[1] | CY.w[0])
	  __set_status_flags (fpsc, INEXACT_EXCEPTION);
#endif
	return get_BID64 (sign_x, exponent_x, coefficient_x,
			  rounding_mode, fpsc);
      }
      // here exponent_x <= 16+final_exponent_y

      // truncate CY to 16 dec. digits
      CYh = __truncate (CY, extra_digits);

      // get remainder
      T = power10_table_128[extra_digits].w[0];
      __mul_64x64_to_64 (CY0L, CYh, T);

      remainder_y = CY.w[0] - CY0L;

      // align coeff_x, CYh
      __mul_64x64_to_128 (CX, coefficient_x,
			  power10_table_128[diff_dec2].w[0]);

      if (sign_x == sign_y) {
	__add_128_64 (CT, CX, CYh);
	if (__unsigned_compare_ge_128
	    (CT, power10_table_128[16 + diff_dec2]))
	  diff_dec2++;
      } else {
	if (remainder_y)
	  CYh++;
	__sub_128_64 (CT, CX, CYh);
	if (__unsigned_compare_gt_128
	    (power10_table_128[15 + diff_dec2], CT))
	  diff_dec2--;
      }

      return __bid_full_round64_remainder (sign_x, final_exponent_y, CT,
					   diff_dec2, remainder_y,
					   rounding_mode, fpsc, 0);
    }
  }
  // Here (exponent_x <= exponent_y)
  {
    diff_dec_expon = exponent_y - exponent_x;

    if (diff_dec_expon > MAX_FORMAT_DIGITS) {
      rmode = rounding_mode;

      if ((sign_x ^ sign_y)) {
	if (!CY.w[0])
	  CY.w[1]--;
	CY.w[0]--;
	if (__unsigned_compare_gt_128
	    (power10_table_128[15 + extra_digits], CY)) {
	  if (rmode & 3) {
	    extra_digits--;
	    final_exponent_y--;
	  } else {
	    CY.w[0] = 1000000000000000ull;
	    CY.w[1] = 0;
	    extra_digits = 0;
	  }
	}
      }
      __scale128_10 (CY, CY);
      extra_digits++;
      CY.w[0] |= 1;

      return __bid_simple_round64_sticky (sign_y, final_exponent_y, CY,
					  extra_digits, rmode, fpsc);
    }
    // apply sign to coeff_x
    sign_x ^= sign_y;
    sign_x = ((SINT64) sign_x) >> 63;
    CX.w[0] = (coefficient_x + sign_x) ^ sign_x;
    CX.w[1] = sign_x;

    // check whether CY (rounded to 16 digits) and CX have 
    //                     any digits in the same position
    diff_dec2 = final_exponent_y - exponent_x;

    if (diff_dec2 <= 17) {
      // align CY to 10^ex
      S = power10_table_128[diff_dec_expon].w[0];
      __mul_64x128_short (CY_L, S, CY);

      __add_128_128 (ST, CY_L, CX);
      extra_digits2 = __get_dec_digits64 (ST) - 16;
      return __bid_full_round64 (sign_y, exponent_x, ST, extra_digits2,
				 rounding_mode, fpsc);
    }
    // truncate CY to 16 dec. digits
    CYh = __truncate (CY, extra_digits);

    // get remainder
    T = power10_table_128[extra_digits].w[0];
    __mul_64x64_to_64 (CY0L, CYh, T);

    coefficient_y = CY.w[0] - CY0L;
    // add rounding constant
    rmode = rounding_mode;
    if (sign_y && (unsigned) (rmode - 1) < 2)
      rmode = 3 - rmode;
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
    if (!(rmode & 3))	//ROUNDING_TO_NEAREST
#endif
#endif
    {
      coefficient_y += round_const_table[rmode][extra_digits];
    }
    // align coefficient_y,  coefficient_x
    S = power10_table_128[diff_dec_expon].w[0];
    __mul_64x64_to_128 (F, coefficient_y, S);

    // fraction
    __add_128_128 (FS, F, CX);

#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
    if (rmode == 0)	//ROUNDING_TO_NEAREST
#endif
    {
      // rounding code, here RN_EVEN
      // 10^(extra_digits+diff_dec_expon)
      T2 = power10_table_128[diff_dec_expon + extra_digits];
      if (__unsigned_compare_gt_128 (FS, T2)
	  || ((CYh & 1) && __test_equal_128 (FS, T2))) {
	CYh++;
	__sub_128_128 (FS, FS, T2);
      }
    }
#endif
#ifndef IEEE_ROUND_NEAREST
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
    if (rmode == 4)	//ROUNDING_TO_NEAREST
#endif
    {
      // rounding code, here RN_AWAY
      // 10^(extra_digits+diff_dec_expon)
      T2 = power10_table_128[diff_dec_expon + extra_digits];
      if (__unsigned_compare_ge_128 (FS, T2)) {
	CYh++;
	__sub_128_128 (FS, FS, T2);
      }
    }
#endif
#ifndef IEEE_ROUND_NEAREST
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
    switch (rmode) {
    case ROUNDING_DOWN:
    case ROUNDING_TO_ZERO:
      if ((SINT64) FS.w[1] < 0) {
	CYh--;
	if (CYh < 1000000000000000ull) {
	  CYh = 9999999999999999ull;
	  final_exponent_y--;
	}
      } else {
	T2 = power10_table_128[diff_dec_expon + extra_digits];
	if (__unsigned_compare_ge_128 (FS, T2)) {
	  CYh++;
	  __sub_128_128 (FS, FS, T2);
	}
      }
      break;
    case ROUNDING_UP:
      if ((SINT64) FS.w[1] < 0)
	break;
      T2 = power10_table_128[diff_dec_expon + extra_digits];
      if (__unsigned_compare_gt_128 (FS, T2)) {
	CYh += 2;
	__sub_128_128 (FS, FS, T2);
      } else if ((FS.w[1] == T2.w[1]) && (FS.w[0] == T2.w[0])) {
	CYh++;
	FS.w[1] = FS.w[0] = 0;
      } else if (FS.w[1] | FS.w[0])
	CYh++;
      break;
    }
#endif
#endif

#ifdef SET_STATUS_FLAGS
    status = INEXACT_EXCEPTION;
#ifndef IEEE_ROUND_NEAREST
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
    if (!(rmode & 3))
#endif
#endif
    {
      // RN modes
      if ((FS.w[1] ==
	   round_const_table_128[0][diff_dec_expon + extra_digits].w[1])
	  && (FS.w[0] ==
	      round_const_table_128[0][diff_dec_expon +
				       extra_digits].w[0]))
	status = EXACT_STATUS;
    }
#ifndef IEEE_ROUND_NEAREST
#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
    else if (!FS.w[1] && !FS.w[0])
      status = EXACT_STATUS;
#endif
#endif

    __set_status_flags (fpsc, status);
#endif

    return get_BID64 (sign_y, final_exponent_y, CYh, rounding_mode,
		      fpsc);
  }

}

//////////////////////////////////////////////////////////////////////////
//
//  If coefficient_z is less than 16 digits long, normalize to 16 digits
//
/////////////////////////////////////////////////////////////////////////
static UINT64
BID_normalize (UINT64 sign_z, int exponent_z,
	       UINT64 coefficient_z, UINT64 round_dir, int round_flag,
	       int rounding_mode, unsigned *fpsc) {
  SINT64 D;
  int_double tempx;
  int digits_z, bin_expon, scale, rmode;

#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
  rmode = rounding_mode;
  if (sign_z && (unsigned) (rmode - 1) < 2)
    rmode = 3 - rmode;
#else
  if (coefficient_z >= power10_table_128[15].w[0])
    return z;
#endif
#endif

  //--- get number of bits in the coefficients of x and y ---
  tempx.d = (double) coefficient_z;
  bin_expon = ((tempx.i & MASK_BINARY_EXPONENT) >> 52) - 0x3ff;
  // get number of decimal digits in the coeff_x
  digits_z = estimate_decimal_digits[bin_expon];
  if (coefficient_z >= power10_table_128[digits_z].w[0])
    digits_z++;

  scale = 16 - digits_z;
  exponent_z -= scale;
  if (exponent_z < 0) {
    scale += exponent_z;
    exponent_z = 0;
  }
  coefficient_z *= power10_table_128[scale].w[0];

#ifdef SET_STATUS_FLAGS
  if (round_flag) {
    __set_status_flags (fpsc, INEXACT_EXCEPTION);
    if (coefficient_z < 1000000000000000ull)
      __set_status_flags (fpsc, UNDERFLOW_EXCEPTION);
    else if ((coefficient_z == 1000000000000000ull) && !exponent_z
	     && ((SINT64) (round_dir ^ sign_z) < 0) && round_flag
	     && (rmode == ROUNDING_DOWN || rmode == ROUNDING_TO_ZERO))
      __set_status_flags (fpsc, UNDERFLOW_EXCEPTION);
  }
#endif

#ifndef IEEE_ROUND_NEAREST_TIES_AWAY
#ifndef IEEE_ROUND_NEAREST
  if (round_flag && (rmode & 3)) {
    D = round_dir ^ sign_z;

    if (rmode == ROUNDING_UP) {
      if (D >= 0)
	coefficient_z++;
    } else {
      if (D < 0)
	coefficient_z--;
      if (coefficient_z < 1000000000000000ull && exponent_z) {
	coefficient_z = 9999999999999999ull;
	exponent_z--;
      }
    }
  }
#endif
#endif

  return get_BID64 (sign_z, exponent_z, coefficient_z, rounding_mode,
		    fpsc);
}


//////////////////////////////////////////////////////////////////////////
//
//    0*10^ey + cz*10^ez,   ey<ez  
//
//////////////////////////////////////////////////////////////////////////

__BID_INLINE__ UINT64
add_zero64 (int exponent_y, UINT64 sign_z, int exponent_z,
	    UINT64 coefficient_z, unsigned *prounding_mode,
	    unsigned *fpsc) {
  int_double tempx;
  int bin_expon, scale_k, scale_cz;
  int diff_expon;

  diff_expon = exponent_z - exponent_y;

  tempx.d = (double) coefficient_z;
  bin_expon = ((tempx.i & MASK_BINARY_EXPONENT) >> 52) - 0x3ff;
  scale_cz = estimate_decimal_digits[bin_expon];
  if (coefficient_z >= power10_table_128[scale_cz].w[0])
    scale_cz++;

  scale_k = 16 - scale_cz;
  if (diff_expon < scale_k)
    scale_k = diff_expon;
  coefficient_z *= power10_table_128[scale_k].w[0];

  return get_BID64 (sign_z, exponent_z - scale_k, coefficient_z,
		    *prounding_mode, fpsc);
}
#endif