author     Ben Cheng <bccheng@google.com>    2014-03-25 22:37:19 -0700
committer  Ben Cheng <bccheng@google.com>    2014-03-25 22:37:19 -0700
commit     1bc5aee63eb72b341f506ad058502cd0361f0d10 (patch)
tree       c607e8252f3405424ff15bc2d00aa38dadbb2518 /gcc-4.9/libgcc/config/avr/lib1funcs-fixed.S
parent     283a0bf58fcf333c58a2a92c3ebbc41fb9eb1fdb (diff)
Initial checkin of GCC 4.9.0 from trunk (r208799).
Change-Id: I48a3c08bb98542aa215912a75f03c0890e497dba
Diffstat (limited to 'gcc-4.9/libgcc/config/avr/lib1funcs-fixed.S')
-rw-r--r--  gcc-4.9/libgcc/config/avr/lib1funcs-fixed.S | 1915
1 file changed, 1915 insertions(+), 0 deletions(-)
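The new file supplies libgcc's support routines for ISO/IEC TR 18037 fixed-point arithmetic on AVR: conversions, multiplication, division, saturation, and rounding. As a rough orientation, a minimal C sketch of operations that lower to these routines (assumptions: avr-gcc built with fixed-point support, compiled as GNU C99; the mapping follows the mode comments inside the file and is not a documented ABI):

/* Minimal sketch.  On AVR, _Fract is HQ (s.15) and _Accum is SA
   (s16.15); the routine names in the comments are assumptions
   based on the file below.  Build: avr-gcc -std=gnu99 -c sketch.c */

_Fract mul_fract (_Fract a, _Fract b)
{
    return a * b;              /* s.15 multiply, cf. __mulhq3 */
}

_Accum mul_accum (_Accum a, _Accum b)
{
    return a * b;              /* s16.15 multiply, cf. __mulsa3 */
}

float fract_to_float (_Fract x)
{
    return (float) x;          /* conversion, cf. __fracthqsf */
}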
diff --git a/gcc-4.9/libgcc/config/avr/lib1funcs-fixed.S b/gcc-4.9/libgcc/config/avr/lib1funcs-fixed.S new file mode 100644 index 000000000..8f3ed9201 --- /dev/null +++ b/gcc-4.9/libgcc/config/avr/lib1funcs-fixed.S @@ -0,0 +1,1915 @@ +/* -*- Mode: Asm -*- */ +;; Copyright (C) 2012-2014 Free Software Foundation, Inc. +;; Contributed by Sean D'Epagnier (sean@depagnier.com) +;; Georg-Johann Lay (avr@gjlay.de) + +;; This file is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by the +;; Free Software Foundation; either version 3, or (at your option) any +;; later version. + +;; In addition to the permissions in the GNU General Public License, the +;; Free Software Foundation gives you unlimited permission to link the +;; compiled version of this file into combinations with other programs, +;; and to distribute those combinations without any restriction coming +;; from the use of this file. (The General Public License restrictions +;; do apply in other respects; for example, they cover modification of +;; the file, and distribution when not linked into a combine +;; executable.) + +;; This file is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with this program; see the file COPYING. If not, write to +;; the Free Software Foundation, 51 Franklin Street, Fifth Floor, +;; Boston, MA 02110-1301, USA. + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Fixed point library routines for AVR +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +.section .text.libgcc.fixed, "ax", @progbits + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Conversions to float +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +#if defined (L_fractqqsf) +DEFUN __fractqqsf + ;; Move in place for SA -> SF conversion + clr r22 + mov r23, r24 + ;; Sign-extend + lsl r24 + sbc r24, r24 + mov r25, r24 + XJMP __fractsasf +ENDF __fractqqsf +#endif /* L_fractqqsf */ + +#if defined (L_fractuqqsf) +DEFUN __fractuqqsf + ;; Move in place for USA -> SF conversion + clr r22 + mov r23, r24 + ;; Zero-extend + clr r24 + clr r25 + XJMP __fractusasf +ENDF __fractuqqsf +#endif /* L_fractuqqsf */ + +#if defined (L_fracthqsf) +DEFUN __fracthqsf + ;; Move in place for SA -> SF conversion + wmov 22, 24 + ;; Sign-extend + lsl r25 + sbc r24, r24 + mov r25, r24 + XJMP __fractsasf +ENDF __fracthqsf +#endif /* L_fracthqsf */ + +#if defined (L_fractuhqsf) +DEFUN __fractuhqsf + ;; Move in place for USA -> SF conversion + wmov 22, 24 + ;; Zero-extend + clr r24 + clr r25 + XJMP __fractusasf +ENDF __fractuhqsf +#endif /* L_fractuhqsf */ + +#if defined (L_fracthasf) +DEFUN __fracthasf + ;; Move in place for SA -> SF conversion + clr r22 + mov r23, r24 + mov r24, r25 + ;; Sign-extend + lsl r25 + sbc r25, r25 + XJMP __fractsasf +ENDF __fracthasf +#endif /* L_fracthasf */ + +#if defined (L_fractuhasf) +DEFUN __fractuhasf + ;; Move in place for USA -> SF conversion + clr r22 + mov r23, r24 + mov r24, r25 + ;; Zero-extend + clr r25 + XJMP __fractusasf +ENDF __fractuhasf +#endif /* L_fractuhasf */ + + +#if defined (L_fractsqsf) +DEFUN __fractsqsf + XCALL __floatsisf + ;; Divide non-zero results by 2^31 to move the + ;; decimal point into place + tst r25 + breq 0f + subi r24, exp_lo (31) + sbci r25, exp_hi (31) +0: ret +ENDF 
__fractsqsf +#endif /* L_fractsqsf */ + +#if defined (L_fractusqsf) +DEFUN __fractusqsf + XCALL __floatunsisf + ;; Divide non-zero results by 2^32 to move the + ;; decimal point into place + cpse r25, __zero_reg__ + subi r25, exp_hi (32) + ret +ENDF __fractusqsf +#endif /* L_fractusqsf */ + +#if defined (L_fractsasf) +DEFUN __fractsasf + XCALL __floatsisf + ;; Divide non-zero results by 2^15 to move the + ;; decimal point into place + tst r25 + breq 0f + subi r24, exp_lo (15) + sbci r25, exp_hi (15) +0: ret +ENDF __fractsasf +#endif /* L_fractsasf */ + +#if defined (L_fractusasf) +DEFUN __fractusasf + XCALL __floatunsisf + ;; Divide non-zero results by 2^16 to move the + ;; decimal point into place + cpse r25, __zero_reg__ + subi r25, exp_hi (16) + ret +ENDF __fractusasf +#endif /* L_fractusasf */ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Conversions from float +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +#if defined (L_fractsfqq) +DEFUN __fractsfqq + ;; Multiply with 2^{24+7} to get a QQ result in r25 + subi r24, exp_lo (-31) + sbci r25, exp_hi (-31) + XCALL __fixsfsi + mov r24, r25 + ret +ENDF __fractsfqq +#endif /* L_fractsfqq */ + +#if defined (L_fractsfuqq) +DEFUN __fractsfuqq + ;; Multiply with 2^{24+8} to get a UQQ result in r25 + subi r25, exp_hi (-32) + XCALL __fixunssfsi + mov r24, r25 + ret +ENDF __fractsfuqq +#endif /* L_fractsfuqq */ + +#if defined (L_fractsfha) +DEFUN __fractsfha + ;; Multiply with 2^{16+7} to get a HA result in r25:r24 + subi r24, exp_lo (-23) + sbci r25, exp_hi (-23) + XJMP __fixsfsi +ENDF __fractsfha +#endif /* L_fractsfha */ + +#if defined (L_fractsfuha) +DEFUN __fractsfuha + ;; Multiply with 2^24 to get a UHA result in r25:r24 + subi r25, exp_hi (-24) + XJMP __fixunssfsi +ENDF __fractsfuha +#endif /* L_fractsfuha */ + +#if defined (L_fractsfhq) +FALIAS __fractsfsq + +DEFUN __fractsfhq + ;; Multiply with 2^{16+15} to get a HQ result in r25:r24 + ;; resp. with 2^31 to get a SQ result in r25:r22 + subi r24, exp_lo (-31) + sbci r25, exp_hi (-31) + XJMP __fixsfsi +ENDF __fractsfhq +#endif /* L_fractsfhq */ + +#if defined (L_fractsfuhq) +FALIAS __fractsfusq + +DEFUN __fractsfuhq + ;; Multiply with 2^{16+16} to get a UHQ result in r25:r24 + ;; resp. with 2^32 to get a USQ result in r25:r22 + subi r25, exp_hi (-32) + XJMP __fixunssfsi +ENDF __fractsfuhq +#endif /* L_fractsfuhq */ + +#if defined (L_fractsfsa) +DEFUN __fractsfsa + ;; Multiply with 2^15 to get a SA result in r25:r22 + subi r24, exp_lo (-15) + sbci r25, exp_hi (-15) + XJMP __fixsfsi +ENDF __fractsfsa +#endif /* L_fractsfsa */ + +#if defined (L_fractsfusa) +DEFUN __fractsfusa + ;; Multiply with 2^16 to get a USA result in r25:r22 + subi r25, exp_hi (-16) + XJMP __fixunssfsi +ENDF __fractsfusa +#endif /* L_fractsfusa */ + + +;; For multiplication the functions here are called directly from +;; avr-fixed.md instead of using the standard libcall mechanisms. +;; This can make better code because GCC knows exactly which +;; of the call-used registers (not all of them) are clobbered. */ + +/******************************************************* + Fractional Multiplication 8 x 8 without MUL +*******************************************************/ + +#if defined (L_mulqq3) && !defined (__AVR_HAVE_MUL__) +;;; R23 = R24 * R25 +;;; Clobbers: __tmp_reg__, R22, R24, R25 +;;; Rounding: ??? +DEFUN __mulqq3 + XCALL __fmuls + ;; TR 18037 requires that (-1) * (-1) does not overflow + ;; The only input that can produce -1 is (-1)^2. 
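+ ;; (__fmuls leaves 0x80, i.e. -1.0, in r23 only for that input;
+ ;; DEC sets V exactly when r23 was 0x80, so the saturated 0x7f
+ ;; produced by DEC is kept in that case, while INC restores every
+ ;; other value unchanged.)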
+ dec r23 + brvs 0f + inc r23 +0: ret +ENDF __mulqq3 +#endif /* L_mulqq3 && ! HAVE_MUL */ + +/******************************************************* + Fractional Multiply .16 x .16 with and without MUL +*******************************************************/ + +#if defined (L_mulhq3) +;;; Same code with and without MUL, but the interfaces differ: +;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25) +;;; Clobbers: ABI, called by optabs +;;; MUL: (R25:R24) = (R19:R18) * (R27:R26) +;;; Clobbers: __tmp_reg__, R22, R23 +;;; Rounding: -0.5 LSB <= error <= 0.5 LSB +DEFUN __mulhq3 + XCALL __mulhisi3 + ;; Shift result into place + lsl r23 + rol r24 + rol r25 + brvs 1f + ;; Round + sbrc r23, 7 + adiw r24, 1 + ret +1: ;; Overflow. TR 18037 requires (-1)^2 not to overflow + ldi r24, lo8 (0x7fff) + ldi r25, hi8 (0x7fff) + ret +ENDF __mulhq3 +#endif /* defined (L_mulhq3) */ + +#if defined (L_muluhq3) +;;; Same code with and without MUL, but the interfaces differ: +;;; no MUL: (R25:R24) *= (R23:R22) +;;; Clobbers: ABI, called by optabs +;;; MUL: (R25:R24) = (R19:R18) * (R27:R26) +;;; Clobbers: __tmp_reg__, R22, R23 +;;; Rounding: -0.5 LSB < error <= 0.5 LSB +DEFUN __muluhq3 + XCALL __umulhisi3 + ;; Round + sbrc r23, 7 + adiw r24, 1 + ret +ENDF __muluhq3 +#endif /* L_muluhq3 */ + + +/******************************************************* + Fixed Multiply 8.8 x 8.8 with and without MUL +*******************************************************/ + +#if defined (L_mulha3) +;;; Same code with and without MUL, but the interfaces differ: +;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25) +;;; Clobbers: ABI, called by optabs +;;; MUL: (R25:R24) = (R19:R18) * (R27:R26) +;;; Clobbers: __tmp_reg__, R22, R23 +;;; Rounding: -0.5 LSB <= error <= 0.5 LSB +DEFUN __mulha3 + XCALL __mulhisi3 + lsl r22 + rol r23 + rol r24 + XJMP __muluha3_round +ENDF __mulha3 +#endif /* L_mulha3 */ + +#if defined (L_muluha3) +;;; Same code with and without MUL, but the interfaces differ: +;;; no MUL: (R25:R24) *= (R23:R22) +;;; Clobbers: ABI, called by optabs +;;; MUL: (R25:R24) = (R19:R18) * (R27:R26) +;;; Clobbers: __tmp_reg__, R22, R23 +;;; Rounding: -0.5 LSB < error <= 0.5 LSB +DEFUN __muluha3 + XCALL __umulhisi3 + XJMP __muluha3_round +ENDF __muluha3 +#endif /* L_muluha3 */ + +#if defined (L_muluha3_round) +DEFUN __muluha3_round + ;; Shift result into place + mov r25, r24 + mov r24, r23 + ;; Round + sbrc r22, 7 + adiw r24, 1 + ret +ENDF __muluha3_round +#endif /* L_muluha3_round */ + + +/******************************************************* + Fixed Multiplication 16.16 x 16.16 +*******************************************************/ + +;; Bits outside the result (below LSB), used in the signed version +#define GUARD __tmp_reg__ + +#if defined (__AVR_HAVE_MUL__) + +;; Multiplier +#define A0 16 +#define A1 A0+1 +#define A2 A1+1 +#define A3 A2+1 + +;; Multiplicand +#define B0 20 +#define B1 B0+1 +#define B2 B1+1 +#define B3 B2+1 + +;; Result +#define C0 24 +#define C1 C0+1 +#define C2 C1+1 +#define C3 C2+1 + +#if defined (L_mulusa3) +;;; (C3:C0) = (A3:A0) * (B3:B0) +DEFUN __mulusa3 + set + ;; Fallthru +ENDF __mulusa3 + +;;; Round for last digit iff T = 1 +;;; Return guard bits in GUARD (__tmp_reg__). +;;; Rounding, T = 0: -1.0 LSB < error <= 0 LSB +;;; Rounding, T = 1: -0.5 LSB < error <= 0.5 LSB +DEFUN __mulusa3_round + ;; Some of the MUL instructions have LSBs outside the result. + ;; Don't ignore these LSBs in order to tame rounding error. + ;; Use C2/C3 for these LSBs. 
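+ ;; (Bytes 0 and 1 of the full 64-bit product are developed in
+ ;; C2/C3 first; byte 1 decides the rounding and is pushed as the
+ ;; guard byte, after which C2/C3 are reused for result bytes 2
+ ;; and 3.)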
+ + clr C0 + clr C1 + mul A0, B0 $ movw C2, r0 + + mul A1, B0 $ add C3, r0 $ adc C0, r1 + mul A0, B1 $ add C3, r0 $ adc C0, r1 $ rol C1 + + ;; Round if T = 1. Store guarding bits outside the result for rounding + ;; and left-shift by the signed version (function below). + brtc 0f + sbrc C3, 7 + adiw C0, 1 +0: push C3 + + ;; The following MULs don't have LSBs outside the result. + ;; C2/C3 is the high part. + + mul A0, B2 $ add C0, r0 $ adc C1, r1 $ sbc C2, C2 + mul A1, B1 $ add C0, r0 $ adc C1, r1 $ sbci C2, 0 + mul A2, B0 $ add C0, r0 $ adc C1, r1 $ sbci C2, 0 + neg C2 + + mul A0, B3 $ add C1, r0 $ adc C2, r1 $ sbc C3, C3 + mul A1, B2 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0 + mul A2, B1 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0 + mul A3, B0 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0 + neg C3 + + mul A1, B3 $ add C2, r0 $ adc C3, r1 + mul A2, B2 $ add C2, r0 $ adc C3, r1 + mul A3, B1 $ add C2, r0 $ adc C3, r1 + + mul A2, B3 $ add C3, r0 + mul A3, B2 $ add C3, r0 + + ;; Guard bits used in the signed version below. + pop GUARD + clr __zero_reg__ + ret +ENDF __mulusa3_round +#endif /* L_mulusa3 */ + +#if defined (L_mulsa3) +;;; (C3:C0) = (A3:A0) * (B3:B0) +;;; Clobbers: __tmp_reg__, T +;;; Rounding: -0.5 LSB <= error <= 0.5 LSB +DEFUN __mulsa3 + clt + XCALL __mulusa3_round + ;; A posteriori sign extension of the operands + tst B3 + brpl 1f + sub C2, A0 + sbc C3, A1 +1: sbrs A3, 7 + rjmp 2f + sub C2, B0 + sbc C3, B1 +2: + ;; Shift 1 bit left to adjust for 15 fractional bits + lsl GUARD + rol C0 + rol C1 + rol C2 + rol C3 + ;; Round last digit + lsl GUARD + adc C0, __zero_reg__ + adc C1, __zero_reg__ + adc C2, __zero_reg__ + adc C3, __zero_reg__ + ret +ENDF __mulsa3 +#endif /* L_mulsa3 */ + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 + +#else /* __AVR_HAVE_MUL__ */ + +#define A0 18 +#define A1 A0+1 +#define A2 A0+2 +#define A3 A0+3 + +#define B0 22 +#define B1 B0+1 +#define B2 B0+2 +#define B3 B0+3 + +#define C0 22 +#define C1 C0+1 +#define C2 C0+2 +#define C3 C0+3 + +;; __tmp_reg__ +#define CC0 0 +;; __zero_reg__ +#define CC1 1 +#define CC2 16 +#define CC3 17 + +#define AA0 26 +#define AA1 AA0+1 +#define AA2 30 +#define AA3 AA2+1 + +#if defined (L_mulsa3) +;;; (R25:R22) *= (R21:R18) +;;; Clobbers: ABI, called by optabs +;;; Rounding: -1 LSB <= error <= 1 LSB +DEFUN __mulsa3 + push B0 + push B1 + push B3 + clt + XCALL __mulusa3_round + pop r30 + ;; sign-extend B + bst r30, 7 + brtc 1f + ;; A1, A0 survived in R27:R26 + sub C2, AA0 + sbc C3, AA1 +1: + pop AA1 ;; B1 + pop AA0 ;; B0 + + ;; sign-extend A. A3 survived in R31 + bst AA3, 7 + brtc 2f + sub C2, AA0 + sbc C3, AA1 +2: + ;; Shift 1 bit left to adjust for 15 fractional bits + lsl GUARD + rol C0 + rol C1 + rol C2 + rol C3 + ;; Round last digit + lsl GUARD + adc C0, __zero_reg__ + adc C1, __zero_reg__ + adc C2, __zero_reg__ + adc C3, __zero_reg__ + ret +ENDF __mulsa3 +#endif /* L_mulsa3 */ + +#if defined (L_mulusa3) +;;; (R25:R22) *= (R21:R18) +;;; Clobbers: ABI, called by optabs +;;; Rounding: -1 LSB <= error <= 1 LSB +DEFUN __mulusa3 + set + ;; Fallthru +ENDF __mulusa3 + +;;; A[] survives in 26, 27, 30, 31 +;;; Also used by __mulsa3 with T = 0 +;;; Round if T = 1 +;;; Return Guard bits in GUARD (__tmp_reg__), used by signed version. 
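+;;; Algorithm: binary shift-and-add. The first loop walks the
+;;; integral bits of B (B3:B2) from LSB to MSB, adding A << n for
+;;; every set bit; the second loop walks the fractional bits (B1:B0)
+;;; from MSB to LSB, adding A >> n. Bits shifted out of A collect in
+;;; B2, with B3 accumulating the guard bits used for rounding.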
+DEFUN __mulusa3_round + push CC2 + push CC3 + ; clear result + clr __tmp_reg__ + wmov CC2, CC0 + ; save multiplicand + wmov AA0, A0 + wmov AA2, A2 + rjmp 3f + + ;; Loop the integral part + +1: ;; CC += A * 2^n; n >= 0 + add CC0,A0 $ adc CC1,A1 $ adc CC2,A2 $ adc CC3,A3 + +2: ;; A <<= 1 + lsl A0 $ rol A1 $ rol A2 $ rol A3 + +3: ;; IBIT(B) >>= 1 + ;; Carry = n-th bit of B; n >= 0 + lsr B3 + ror B2 + brcs 1b + sbci B3, 0 + brne 2b + + ;; Loop the fractional part + ;; B2/B3 is 0 now, use as guard bits for rounding + ;; Restore multiplicand + wmov A0, AA0 + wmov A2, AA2 + rjmp 5f + +4: ;; CC += A:Guard * 2^n; n < 0 + add B3,B2 $ adc CC0,A0 $ adc CC1,A1 $ adc CC2,A2 $ adc CC3,A3 +5: + ;; A:Guard >>= 1 + lsr A3 $ ror A2 $ ror A1 $ ror A0 $ ror B2 + + ;; FBIT(B) <<= 1 + ;; Carry = n-th bit of B; n < 0 + lsl B0 + rol B1 + brcs 4b + sbci B0, 0 + brne 5b + + ;; Save guard bits and set carry for rounding + push B3 + lsl B3 + ;; Move result into place + wmov C2, CC2 + wmov C0, CC0 + clr __zero_reg__ + brtc 6f + ;; Round iff T = 1 + adc C0, __zero_reg__ + adc C1, __zero_reg__ + adc C2, __zero_reg__ + adc C3, __zero_reg__ +6: + pop GUARD + ;; Epilogue + pop CC3 + pop CC2 + ret +ENDF __mulusa3_round +#endif /* L_mulusa3 */ + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef AA0 +#undef AA1 +#undef AA2 +#undef AA3 +#undef CC0 +#undef CC1 +#undef CC2 +#undef CC3 + +#endif /* __AVR_HAVE_MUL__ */ + +#undef GUARD + +/*********************************************************** + Fixed unsigned saturated Multiplication 8.8 x 8.8 +***********************************************************/ + +#define C0 22 +#define C1 C0+1 +#define C2 C0+2 +#define C3 C0+3 +#define SS __tmp_reg__ + +#if defined (L_usmuluha3) +DEFUN __usmuluha3 + ;; Widening multiply +#ifdef __AVR_HAVE_MUL__ + ;; Adjust interface + movw R26, R22 + movw R18, R24 +#endif /* HAVE MUL */ + XCALL __umulhisi3 + tst C3 + brne .Lmax + ;; Round, target is in C1..C2 + lsl C0 + adc C1, __zero_reg__ + adc C2, __zero_reg__ + brcs .Lmax + ;; Move result into place + mov C3, C2 + mov C2, C1 + ret +.Lmax: + ;; Saturate + ldi C2, 0xff + ldi C3, 0xff + ret +ENDF __usmuluha3 +#endif /* L_usmuluha3 */ + +/*********************************************************** + Fixed signed saturated Multiplication s8.7 x s8.7 +***********************************************************/ + +#if defined (L_ssmulha3) +DEFUN __ssmulha3 + ;; Widening multiply +#ifdef __AVR_HAVE_MUL__ + ;; Adjust interface + movw R26, R22 + movw R18, R24 +#endif /* HAVE MUL */ + XCALL __mulhisi3 + ;; Adjust decimal point + lsl C0 + rol C1 + rol C2 + brvs .LsatC3.3 + ;; The 9 MSBs must be the same + rol C3 + sbc SS, SS + cp C3, SS + brne .LsatSS + ;; Round + lsl C0 + adc C1, __zero_reg__ + adc C2, __zero_reg__ + brvs .Lmax + ;; Move result into place + mov C3, C2 + mov C2, C1 + ret +.Lmax: + ;; Load 0x7fff + clr C3 +.LsatC3.3: + ;; C3 < 0 --> 0x8000 + ;; C3 >= 0 --> 0x7fff + mov SS, C3 +.LsatSS: + ;; Load min / max value: + ;; SS = -1 --> 0x8000 + ;; SS = 0 --> 0x7fff + ldi C3, 0x7f + ldi C2, 0xff + sbrc SS, 7 + adiw C2, 1 + ret +ENDF __ssmulha3 +#endif /* L_ssmulha3 */ + +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef SS + +/*********************************************************** + Fixed unsigned saturated Multiplication 16.16 x 16.16 +***********************************************************/ + +#define C0 18 +#define C1 C0+1 +#define C2 C0+2 +#define C3 C0+3 +#define C4 C0+4 +#define C5 C0+5 
+#define C6 C0+6 +#define C7 C0+7 +#define SS __tmp_reg__ + +#if defined (L_usmulusa3) +;; R22[4] = R22[4] *{ssat} R18[4] +;; Ordinary ABI function +DEFUN __usmulusa3 + ;; Widening multiply + XCALL __umulsidi3 + or C7, C6 + brne .Lmax + ;; Round, target is in C2..C5 + lsl C1 + adc C2, __zero_reg__ + adc C3, __zero_reg__ + adc C4, __zero_reg__ + adc C5, __zero_reg__ + brcs .Lmax + ;; Move result into place + wmov C6, C4 + wmov C4, C2 + ret +.Lmax: + ;; Saturate + ldi C7, 0xff + ldi C6, 0xff + wmov C4, C6 + ret +ENDF __usmulusa3 +#endif /* L_usmulusa3 */ + +/*********************************************************** + Fixed signed saturated Multiplication s16.15 x s16.15 +***********************************************************/ + +#if defined (L_ssmulsa3) +;; R22[4] = R22[4] *{ssat} R18[4] +;; Ordinary ABI function +DEFUN __ssmulsa3 + ;; Widening multiply + XCALL __mulsidi3 + ;; Adjust decimal point + lsl C1 + rol C2 + rol C3 + rol C4 + rol C5 + brvs .LsatC7.7 + ;; The 17 MSBs must be the same + rol C6 + rol C7 + sbc SS, SS + cp C6, SS + cpc C7, SS + brne .LsatSS + ;; Round + lsl C1 + adc C2, __zero_reg__ + adc C3, __zero_reg__ + adc C4, __zero_reg__ + adc C5, __zero_reg__ + brvs .Lmax + ;; Move result into place + wmov C6, C4 + wmov C4, C2 + ret + +.Lmax: + ;; Load 0x7fffffff + clr C7 +.LsatC7.7: + ;; C7 < 0 --> 0x80000000 + ;; C7 >= 0 --> 0x7fffffff + lsl C7 + sbc SS, SS +.LsatSS: + ;; Load min / max value: + ;; SS = -1 --> 0x80000000 + ;; SS = 0 --> 0x7fffffff + com SS + mov C4, SS + mov C5, C4 + wmov C6, C4 + subi C7, 0x80 + ret +ENDF __ssmulsa3 +#endif /* L_ssmulsa3 */ + +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#undef SS + +/******************************************************* + Fractional Division 8 / 8 +*******************************************************/ + +#define r_divd r25 /* dividend */ +#define r_quo r24 /* quotient */ +#define r_div r22 /* divisor */ +#define r_sign __tmp_reg__ + +#if defined (L_divqq3) +DEFUN __divqq3 + mov r_sign, r_divd + eor r_sign, r_div + sbrc r_div, 7 + neg r_div + sbrc r_divd, 7 + neg r_divd + XCALL __divqq_helper + lsr r_quo + sbrc r_sign, 7 ; negate result if needed + neg r_quo + ret +ENDF __divqq3 +#endif /* L_divqq3 */ + +#if defined (L_udivuqq3) +DEFUN __udivuqq3 + cp r_divd, r_div + brsh 0f + XJMP __divqq_helper + ;; Result is out of [0, 1) ==> Return 1 - eps. 
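+ ;; (0xff is the largest representable UQQ value, 1 - 2^{-8}.)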
+0: ldi r_quo, 0xff
+ ret
+ENDF __udivuqq3
+#endif /* L_udivuqq3 */
+
+
+#if defined (L_divqq_helper)
+DEFUN __divqq_helper
+ clr r_quo ; clear quotient
+ inc __zero_reg__ ; init loop counter, used per shift
+__udivuqq3_loop:
+ lsl r_divd ; shift dividend
+ brcs 0f ; dividend overflow
+ cp r_divd,r_div ; compare dividend & divisor
+ brcc 0f ; dividend >= divisor
+ rol r_quo ; shift quotient (with CARRY)
+ rjmp __udivuqq3_cont
+0:
+ sub r_divd,r_div ; restore dividend
+ lsl r_quo ; shift quotient (without CARRY)
+__udivuqq3_cont:
+ lsl __zero_reg__ ; shift loop-counter bit
+ brne __udivuqq3_loop
+ com r_quo ; complement result
+ ; because C flag was complemented in loop
+ ret
+ENDF __divqq_helper
+#endif /* L_divqq_helper */
+
+#undef r_divd
+#undef r_quo
+#undef r_div
+#undef r_sign
+
+
+/*******************************************************
+ Fractional Division 16 / 16
+*******************************************************/
+#define r_divdL 26 /* dividend Low */
+#define r_divdH 27 /* dividend High */
+#define r_quoL 24 /* quotient Low */
+#define r_quoH 25 /* quotient High */
+#define r_divL 22 /* divisor Low */
+#define r_divH 23 /* divisor High */
+#define r_cnt 21
+
+#if defined (L_divhq3)
+DEFUN __divhq3
+ mov r0, r_divdH
+ eor r0, r_divH
+ sbrs r_divH, 7
+ rjmp 1f
+ NEG2 r_divL
+1:
+ sbrs r_divdH, 7
+ rjmp 2f
+ NEG2 r_divdL
+2:
+ cp r_divdL, r_divL
+ cpc r_divdH, r_divH
+ breq __divhq3_minus1 ; if equal return -1
+ XCALL __udivuhq3
+ lsr r_quoH
+ ror r_quoL
+ brpl 9f
+ ;; negate result if needed
+ NEG2 r_quoL
+9:
+ ret
+__divhq3_minus1:
+ ldi r_quoH, 0x80
+ clr r_quoL
+ ret
+ENDF __divhq3
+#endif /* defined (L_divhq3) */
+
+#if defined (L_udivuhq3)
+DEFUN __udivuhq3
+ sub r_quoH,r_quoH ; clear quotient and carry
+ ;; FALLTHRU
+ENDF __udivuhq3
+
+DEFUN __udivuha3_common
+ clr r_quoL ; clear quotient
+ ldi r_cnt,16 ; init loop counter
+__udivuhq3_loop:
+ rol r_divdL ; shift dividend (with CARRY)
+ rol r_divdH
+ brcs __udivuhq3_ep ; dividend overflow
+ cp r_divdL,r_divL ; compare dividend & divisor
+ cpc r_divdH,r_divH
+ brcc __udivuhq3_ep ; dividend >= divisor
+ rol r_quoL ; shift quotient (with CARRY)
+ rjmp __udivuhq3_cont
+__udivuhq3_ep:
+ sub r_divdL,r_divL ; restore dividend
+ sbc r_divdH,r_divH
+ lsl r_quoL ; shift quotient (without CARRY)
+__udivuhq3_cont:
+ rol r_quoH ; shift quotient
+ dec r_cnt ; decrement loop counter
+ brne __udivuhq3_loop
+ com r_quoL ; complement result
+ com r_quoH ; because C flag was complemented in loop
+ ret
+ENDF __udivuha3_common
+#endif /* defined (L_udivuhq3) */
+
+/*******************************************************
+ Fixed Division 8.8 / 8.8
+*******************************************************/
+#if defined (L_divha3)
+DEFUN __divha3
+ mov r0, r_divdH
+ eor r0, r_divH
+ sbrs r_divH, 7
+ rjmp 1f
+ NEG2 r_divL
+1:
+ sbrs r_divdH, 7
+ rjmp 2f
+ NEG2 r_divdL
+2:
+ XCALL __udivuha3
+ lsr r_quoH ; adjust to 7 fractional bits
+ ror r_quoL
+ sbrs r0, 7 ; negate result if needed
+ ret
+ NEG2 r_quoL
+ ret
+ENDF __divha3
+#endif /* defined (L_divha3) */
+
+#if defined (L_udivuha3)
+DEFUN __udivuha3
+ mov r_quoH, r_divdL
+ mov r_divdL, r_divdH
+ clr r_divdH
+ lsl r_quoH ; shift quotient into carry
+ XJMP __udivuha3_common ; same as fractional after rearrange
+ENDF __udivuha3
+#endif /* defined (L_udivuha3) */
+
+#undef r_divdL
+#undef r_divdH
+#undef r_quoL
+#undef r_quoH
+#undef r_divL
+#undef r_divH
+#undef r_cnt
+
+/*******************************************************
+ Fixed Division 16.16 / 16.16
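+ (32 shift-and-subtract iterations, one quotient bit each; the loop
+ builds complemented quotient bits, undone by the final COMs)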
+*******************************************************/ + +#define r_arg1L 24 /* arg1 gets passed already in place */ +#define r_arg1H 25 +#define r_arg1HL 26 +#define r_arg1HH 27 +#define r_divdL 26 /* dividend Low */ +#define r_divdH 27 +#define r_divdHL 30 +#define r_divdHH 31 /* dividend High */ +#define r_quoL 22 /* quotient Low */ +#define r_quoH 23 +#define r_quoHL 24 +#define r_quoHH 25 /* quotient High */ +#define r_divL 18 /* divisor Low */ +#define r_divH 19 +#define r_divHL 20 +#define r_divHH 21 /* divisor High */ +#define r_cnt __zero_reg__ /* loop count (0 after the loop!) */ + +#if defined (L_divsa3) +DEFUN __divsa3 + mov r0, r_arg1HH + eor r0, r_divHH + sbrs r_divHH, 7 + rjmp 1f + NEG4 r_divL +1: + sbrs r_arg1HH, 7 + rjmp 2f + NEG4 r_arg1L +2: + XCALL __udivusa3 + lsr r_quoHH ; adjust to 15 fractional bits + ror r_quoHL + ror r_quoH + ror r_quoL + sbrs r0, 7 ; negate result if needed + ret + ;; negate r_quoL + XJMP __negsi2 +ENDF __divsa3 +#endif /* defined (L_divsa3) */ + +#if defined (L_udivusa3) +DEFUN __udivusa3 + ldi r_divdHL, 32 ; init loop counter + mov r_cnt, r_divdHL + clr r_divdHL + clr r_divdHH + wmov r_quoL, r_divdHL + lsl r_quoHL ; shift quotient into carry + rol r_quoHH +__udivusa3_loop: + rol r_divdL ; shift dividend (with CARRY) + rol r_divdH + rol r_divdHL + rol r_divdHH + brcs __udivusa3_ep ; dividend overflow + cp r_divdL,r_divL ; compare dividend & divisor + cpc r_divdH,r_divH + cpc r_divdHL,r_divHL + cpc r_divdHH,r_divHH + brcc __udivusa3_ep ; dividend >= divisor + rol r_quoL ; shift quotient (with CARRY) + rjmp __udivusa3_cont +__udivusa3_ep: + sub r_divdL,r_divL ; restore dividend + sbc r_divdH,r_divH + sbc r_divdHL,r_divHL + sbc r_divdHH,r_divHH + lsl r_quoL ; shift quotient (without CARRY) +__udivusa3_cont: + rol r_quoH ; shift quotient + rol r_quoHL + rol r_quoHH + dec r_cnt ; decrement loop counter + brne __udivusa3_loop + com r_quoL ; complement result + com r_quoH ; because C flag was complemented in loop + com r_quoHL + com r_quoHH + ret +ENDF __udivusa3 +#endif /* defined (L_udivusa3) */ + +#undef r_arg1L +#undef r_arg1H +#undef r_arg1HL +#undef r_arg1HH +#undef r_divdL +#undef r_divdH +#undef r_divdHL +#undef r_divdHH +#undef r_quoL +#undef r_quoH +#undef r_quoHL +#undef r_quoHH +#undef r_divL +#undef r_divH +#undef r_divHL +#undef r_divHH +#undef r_cnt + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Saturation, 1 Byte +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; First Argument and Return Register +#define A0 24 + +#if defined (L_ssabs_1) +DEFUN __ssabs_1 + sbrs A0, 7 + ret + neg A0 + sbrc A0,7 + dec A0 + ret +ENDF __ssabs_1 +#endif /* L_ssabs_1 */ + +#undef A0 + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Saturation, 2 Bytes +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; First Argument and Return Register +#define A0 24 +#define A1 A0+1 + +#if defined (L_ssneg_2) +DEFUN __ssneg_2 + NEG2 A0 + brvc 0f + sbiw A0, 1 +0: ret +ENDF __ssneg_2 +#endif /* L_ssneg_2 */ + +#if defined (L_ssabs_2) +DEFUN __ssabs_2 + sbrs A1, 7 + ret + XJMP __ssneg_2 +ENDF __ssabs_2 +#endif /* L_ssabs_2 */ + +#undef A0 +#undef A1 + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Saturation, 4 Bytes +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; First Argument and Return Register +#define A0 22 +#define A1 A0+1 +#define A2 A0+2 +#define A3 A0+3 + +#if defined (L_ssneg_4) +DEFUN __ssneg_4 + XCALL __negsi2 + brvc 0f + ldi A3, 0x7f + ldi A2, 0xff + ldi A1, 0xff + ldi A0, 0xff +0: ret +ENDF __ssneg_4 +#endif /* L_ssneg_4 
*/ + +#if defined (L_ssabs_4) +DEFUN __ssabs_4 + sbrs A3, 7 + ret + XJMP __ssneg_4 +ENDF __ssabs_4 +#endif /* L_ssabs_4 */ + +#undef A0 +#undef A1 +#undef A2 +#undef A3 + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Saturation, 8 Bytes +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; First Argument and Return Register +#define A0 18 +#define A1 A0+1 +#define A2 A0+2 +#define A3 A0+3 +#define A4 A0+4 +#define A5 A0+5 +#define A6 A0+6 +#define A7 A0+7 + +#if defined (L_clr_8) +FALIAS __usneguta2 +FALIAS __usneguda2 +FALIAS __usnegudq2 + +;; Clear Carry and all Bytes +DEFUN __clr_8 + ;; Clear Carry and set Z + sub A7, A7 + ;; FALLTHRU +ENDF __clr_8 +;; Propagate Carry to all Bytes, Carry unaltered +DEFUN __sbc_8 + sbc A7, A7 + sbc A6, A6 + wmov A4, A6 + wmov A2, A6 + wmov A0, A6 + ret +ENDF __sbc_8 +#endif /* L_clr_8 */ + +#if defined (L_ssneg_8) +FALIAS __ssnegta2 +FALIAS __ssnegda2 +FALIAS __ssnegdq2 + +DEFUN __ssneg_8 + XCALL __negdi2 + brvc 0f + ;; A[] = 0x7fffffff + sec + XCALL __sbc_8 + ldi A7, 0x7f +0: ret +ENDF __ssneg_8 +#endif /* L_ssneg_8 */ + +#if defined (L_ssabs_8) +FALIAS __ssabsta2 +FALIAS __ssabsda2 +FALIAS __ssabsdq2 + +DEFUN __ssabs_8 + sbrs A7, 7 + ret + XJMP __ssneg_8 +ENDF __ssabs_8 +#endif /* L_ssabs_8 */ + +;; Second Argument +#define B0 10 +#define B1 B0+1 +#define B2 B0+2 +#define B3 B0+3 +#define B4 B0+4 +#define B5 B0+5 +#define B6 B0+6 +#define B7 B0+7 + +#if defined (L_usadd_8) +FALIAS __usadduta3 +FALIAS __usadduda3 +FALIAS __usaddudq3 + +DEFUN __usadd_8 + XCALL __adddi3 + brcs 0f + ret +0: ;; A[] = 0xffffffff + XJMP __sbc_8 +ENDF __usadd_8 +#endif /* L_usadd_8 */ + +#if defined (L_ussub_8) +FALIAS __ussubuta3 +FALIAS __ussubuda3 +FALIAS __ussubudq3 + +DEFUN __ussub_8 + XCALL __subdi3 + brcs 0f + ret +0: ;; A[] = 0 + XJMP __clr_8 +ENDF __ussub_8 +#endif /* L_ussub_8 */ + +#if defined (L_ssadd_8) +FALIAS __ssaddta3 +FALIAS __ssaddda3 +FALIAS __ssadddq3 + +DEFUN __ssadd_8 + XCALL __adddi3 + brvc 0f + ;; A = (B >= 0) ? INT64_MAX : INT64_MIN + cpi B7, 0x80 + XCALL __sbc_8 + subi A7, 0x80 +0: ret +ENDF __ssadd_8 +#endif /* L_ssadd_8 */ + +#if defined (L_sssub_8) +FALIAS __sssubta3 +FALIAS __sssubda3 +FALIAS __sssubdq3 + +DEFUN __sssub_8 + XCALL __subdi3 + brvc 0f + ;; A = (B < 0) ? INT64_MAX : INT64_MIN + ldi A7, 0x7f + cp A7, B7 + XCALL __sbc_8 + subi A7, 0x80 +0: ret +ENDF __sssub_8 +#endif /* L_sssub_8 */ + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef B4 +#undef B5 +#undef B6 +#undef B7 + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Rounding Helpers +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +#ifdef L_mask1 + +#define AA 24 +#define CC 25 + +;; R25 = 1 << (R24 & 7) +;; CC = 1 << (AA & 7) +;; Clobbers: None +DEFUN __mask1 + ;; CC = 2 ^ AA.1 + ldi CC, 1 << 2 + sbrs AA, 1 + ldi CC, 1 << 0 + ;; CC *= 2 ^ AA.0 + sbrc AA, 0 + lsl CC + ;; CC *= 2 ^ AA.2 + sbrc AA, 2 + swap CC + ret +ENDF __mask1 + +#undef AA +#undef CC +#endif /* L_mask1 */ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; The rounding point. Any bits smaller than +;; 2^{-RP} will be cleared. 
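+;; Worked example for QQ (FBIT = 7): __roundqq3 with RP = 2 adds
+;; 2^{-3} (0x10) and masks with 0xe0, so 0x26 (0.296875) becomes
+;; 0x36 and then 0x20, i.e. 0.25, the nearest multiple of 2^{-2}.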
+#define RP R24 + +#define A0 22 +#define A1 A0 + 1 + +#define C0 24 +#define C1 C0 + 1 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Rounding, 1 Byte +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +#ifdef L_roundqq3 + +;; R24 = round (R22, R24) +;; Clobbers: R22, __tmp_reg__ +DEFUN __roundqq3 + mov __tmp_reg__, C1 + subi RP, __QQ_FBIT__ - 1 + neg RP + ;; R25 = 1 << RP (Total offset is FBIT-1 - RP) + XCALL __mask1 + mov C0, C1 + ;; Add-Saturate 2^{-RP-1} + add A0, C0 + brvc 0f + ldi C0, 0x7f + rjmp 9f +0: ;; Mask out bits beyond RP + lsl C0 + neg C0 + and C0, A0 +9: mov C1, __tmp_reg__ + ret +ENDF __roundqq3 +#endif /* L_roundqq3 */ + +#ifdef L_rounduqq3 + +;; R24 = round (R22, R24) +;; Clobbers: R22, __tmp_reg__ +DEFUN __rounduqq3 + mov __tmp_reg__, C1 + subi RP, __UQQ_FBIT__ - 1 + neg RP + ;; R25 = 1 << RP (Total offset is FBIT-1 - RP) + XCALL __mask1 + mov C0, C1 + ;; Add-Saturate 2^{-RP-1} + add A0, C0 + brcc 0f + ldi C0, 0xff + rjmp 9f +0: ;; Mask out bits beyond RP + lsl C0 + neg C0 + and C0, A0 +9: mov C1, __tmp_reg__ + ret +ENDF __rounduqq3 +#endif /* L_rounduqq3 */ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Rounding, 2 Bytes +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +#ifdef L_addmask_2 + +;; [ R25:R24 = 1 << (R24 & 15) +;; R23:R22 += 1 << (R24 & 15) ] +;; SREG is set according to the addition +DEFUN __addmask_2 + ;; R25 = 1 << (R24 & 7) + XCALL __mask1 + cpi RP, 1 << 3 + sbc C0, C0 + ;; Swap C0 and C1 if RP.3 was set + and C0, C1 + eor C1, C0 + ;; Finally, add the power-of-two: A[] += C[] + add A0, C0 + adc A1, C1 + ret +ENDF __addmask_2 +#endif /* L_addmask_2 */ + +#ifdef L_round_s2 + +;; R25:R24 = round (R23:R22, R24) +;; Clobbers: R23, R22 +DEFUN __roundhq3 + subi RP, __HQ_FBIT__ - __HA_FBIT__ +ENDF __roundhq3 +DEFUN __roundha3 + subi RP, __HA_FBIT__ - 1 + neg RP + ;; [ R25:R24 = 1 << (FBIT-1 - RP) + ;; R23:R22 += 1 << (FBIT-1 - RP) ] + XCALL __addmask_2 + XJMP __round_s2_const +ENDF __roundha3 + +#endif /* L_round_s2 */ + +#ifdef L_round_u2 + +;; R25:R24 = round (R23:R22, R24) +;; Clobbers: R23, R22 +DEFUN __rounduhq3 + subi RP, __UHQ_FBIT__ - __UHA_FBIT__ +ENDF __rounduhq3 +DEFUN __rounduha3 + subi RP, __UHA_FBIT__ - 1 + neg RP + ;; [ R25:R24 = 1 << (FBIT-1 - RP) + ;; R23:R22 += 1 << (FBIT-1 - RP) ] + XCALL __addmask_2 + XJMP __round_u2_const +ENDF __rounduha3 + +#endif /* L_round_u2 */ + + +#ifdef L_round_2_const + +;; Helpers for 2 byte wide rounding + +DEFUN __round_s2_const + brvc 2f + ldi C1, 0x7f + rjmp 1f + ;; FALLTHRU (Barrier) +ENDF __round_s2_const + +DEFUN __round_u2_const + brcc 2f + ldi C1, 0xff +1: + ldi C0, 0xff + rjmp 9f +2: + ;; Saturation is performed now. + ;; Currently, we have C[] = 2^{-RP-1} + ;; C[] = 2^{-RP} + lsl C0 + rol C1 + ;; + NEG2 C0 + ;; Clear the bits beyond the rounding point. 
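+ ;; (NEG2 turns C[] = 2^{-RP} into -2^{-RP}, which has ones in all
+ ;; bit positions at and above the rounding point, so the two ANDs
+ ;; truncate A[] to the rounding point.)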
+ and C0, A0 + and C1, A1 +9: ret +ENDF __round_u2_const + +#endif /* L_round_2_const */ + +#undef A0 +#undef A1 +#undef C0 +#undef C1 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Rounding, 4 Bytes +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +#define A0 18 +#define A1 A0 + 1 +#define A2 A0 + 2 +#define A3 A0 + 3 + +#define C0 22 +#define C1 C0 + 1 +#define C2 C0 + 2 +#define C3 C0 + 3 + +#ifdef L_addmask_4 + +;; [ R25:R22 = 1 << (R24 & 31) +;; R21:R18 += 1 << (R24 & 31) ] +;; SREG is set according to the addition +DEFUN __addmask_4 + ;; R25 = 1 << (R24 & 7) + XCALL __mask1 + cpi RP, 1 << 4 + sbc C0, C0 + sbc C1, C1 + ;; Swap C2 with C3 if RP.3 is not set + cpi RP, 1 << 3 + sbc C2, C2 + and C2, C3 + eor C3, C2 + ;; Swap C3:C2 with C1:C0 if RP.4 is not set + and C0, C2 $ eor C2, C0 + and C1, C3 $ eor C3, C1 + ;; Finally, add the power-of-two: A[] += C[] + add A0, C0 + adc A1, C1 + adc A2, C2 + adc A3, C3 + ret +ENDF __addmask_4 +#endif /* L_addmask_4 */ + +#ifdef L_round_s4 + +;; R25:R22 = round (R21:R18, R24) +;; Clobbers: R18...R21 +DEFUN __roundsq3 + subi RP, __SQ_FBIT__ - __SA_FBIT__ +ENDF __roundsq3 +DEFUN __roundsa3 + subi RP, __SA_FBIT__ - 1 + neg RP + ;; [ R25:R22 = 1 << (FBIT-1 - RP) + ;; R21:R18 += 1 << (FBIT-1 - RP) ] + XCALL __addmask_4 + XJMP __round_s4_const +ENDF __roundsa3 + +#endif /* L_round_s4 */ + +#ifdef L_round_u4 + +;; R25:R22 = round (R21:R18, R24) +;; Clobbers: R18...R21 +DEFUN __roundusq3 + subi RP, __USQ_FBIT__ - __USA_FBIT__ +ENDF __roundusq3 +DEFUN __roundusa3 + subi RP, __USA_FBIT__ - 1 + neg RP + ;; [ R25:R22 = 1 << (FBIT-1 - RP) + ;; R21:R18 += 1 << (FBIT-1 - RP) ] + XCALL __addmask_4 + XJMP __round_u4_const +ENDF __roundusa3 + +#endif /* L_round_u4 */ + + +#ifdef L_round_4_const + +;; Helpers for 4 byte wide rounding + +DEFUN __round_s4_const + brvc 2f + ldi C3, 0x7f + rjmp 1f + ;; FALLTHRU (Barrier) +ENDF __round_s4_const + +DEFUN __round_u4_const + brcc 2f + ldi C3, 0xff +1: + ldi C2, 0xff + ldi C1, 0xff + ldi C0, 0xff + rjmp 9f +2: + ;; Saturation is performed now. + ;; Currently, we have C[] = 2^{-RP-1} + ;; C[] = 2^{-RP} + lsl C0 + rol C1 + rol C2 + rol C3 + XCALL __negsi2 + ;; Clear the bits beyond the rounding point. 
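+ ;; (__negsi2 likewise gives C[] = -2^{-RP}: ones at and above the
+ ;; rounding point, zeros below, so the four ANDs truncate A[].)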
+ and C0, A0 + and C1, A1 + and C2, A2 + and C3, A3 +9: ret +ENDF __round_u4_const + +#endif /* L_round_4_const */ + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 + +#undef RP + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Rounding, 8 Bytes +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +#define RP 16 +#define FBITm1 31 + +#define C0 18 +#define C1 C0 + 1 +#define C2 C0 + 2 +#define C3 C0 + 3 +#define C4 C0 + 4 +#define C5 C0 + 5 +#define C6 C0 + 6 +#define C7 C0 + 7 + +#define A0 16 +#define A1 17 +#define A2 26 +#define A3 27 +#define A4 28 +#define A5 29 +#define A6 30 +#define A7 31 + + +#ifdef L_rounddq3 +;; R25:R18 = round (R25:R18, R16) +;; Clobbers: ABI +DEFUN __rounddq3 + ldi FBITm1, __DQ_FBIT__ - 1 + clt + XJMP __round_x8 +ENDF __rounddq3 +#endif /* L_rounddq3 */ + +#ifdef L_roundudq3 +;; R25:R18 = round (R25:R18, R16) +;; Clobbers: ABI +DEFUN __roundudq3 + ldi FBITm1, __UDQ_FBIT__ - 1 + set + XJMP __round_x8 +ENDF __roundudq3 +#endif /* L_roundudq3 */ + +#ifdef L_roundda3 +;; R25:R18 = round (R25:R18, R16) +;; Clobbers: ABI +DEFUN __roundda3 + ldi FBITm1, __DA_FBIT__ - 1 + clt + XJMP __round_x8 +ENDF __roundda3 +#endif /* L_roundda3 */ + +#ifdef L_rounduda3 +;; R25:R18 = round (R25:R18, R16) +;; Clobbers: ABI +DEFUN __rounduda3 + ldi FBITm1, __UDA_FBIT__ - 1 + set + XJMP __round_x8 +ENDF __rounduda3 +#endif /* L_rounduda3 */ + +#ifdef L_roundta3 +;; R25:R18 = round (R25:R18, R16) +;; Clobbers: ABI +DEFUN __roundta3 + ldi FBITm1, __TA_FBIT__ - 1 + clt + XJMP __round_x8 +ENDF __roundta3 +#endif /* L_roundta3 */ + +#ifdef L_rounduta3 +;; R25:R18 = round (R25:R18, R16) +;; Clobbers: ABI +DEFUN __rounduta3 + ldi FBITm1, __UTA_FBIT__ - 1 + set + XJMP __round_x8 +ENDF __rounduta3 +#endif /* L_rounduta3 */ + + +#ifdef L_round_x8 +DEFUN __round_x8 + push r16 + push r17 + push r28 + push r29 + ;; Compute log2 of addend from rounding point + sub RP, FBITm1 + neg RP + ;; Move input to work register A[] + push C0 + mov A1, C1 + wmov A2, C2 + wmov A4, C4 + wmov A6, C6 + ;; C[] = 1 << (FBIT-1 - RP) + XCALL __clr_8 + inc C0 + XCALL __ashldi3 + pop A0 + ;; A[] += C[] + add A0, C0 + adc A1, C1 + adc A2, C2 + adc A3, C3 + adc A4, C4 + adc A5, C5 + adc A6, C6 + adc A7, C7 + brts 1f + ;; Signed + brvc 3f + ;; Signed overflow: A[] = 0x7f... + brvs 2f +1: ;; Unsigned + brcc 3f + ;; Unsigned overflow: A[] = 0xff... +2: ldi C7, 0xff + ldi C6, 0xff + wmov C0, C6 + wmov C2, C6 + wmov C4, C6 + bld C7, 7 + rjmp 9f +3: + ;; C[] = -C[] - C[] + push A0 + ldi r16, 1 + XCALL __ashldi3 + pop A0 + XCALL __negdi2 + ;; Clear the bits beyond the rounding point. + and C0, A0 + and C1, A1 + and C2, A2 + and C3, A3 + and C4, A4 + and C5, A5 + and C6, A6 + and C7, A7 +9: ;; Epilogue + pop r29 + pop r28 + pop r17 + pop r16 + ret +ENDF __round_x8 + +#endif /* L_round_x8 */ + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 + +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 + +#undef RP +#undef FBITm1 + + +;; Supply implementations / symbols for the bit-banging functions +;; __builtin_avr_bitsfx and __builtin_avr_fxbits +#ifdef L_ret +DEFUN __ret + ret +ENDF __ret +#endif /* L_ret */ |
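To round the picture off, a minimal sketch of the saturating C-level semantics implemented by the __ss*/__us* routines above (assumptions: avr-gcc with fixed-point support, GNU C99; the per-operation routine names are inferred from the section comments, not a documented ABI):

/* Minimal sketch of TR 18037 saturating arithmetic on AVR. */

_Sat _Fract sat_add (_Sat _Fract a, _Sat _Fract b)
{
    return a + b;          /* clamps to [-1, 1 - 2^-15] instead of wrapping */
}

_Sat _Fract sat_neg (_Sat _Fract a)
{
    return -a;             /* negating -1.0 saturates to 0x7fff, cf. __ssneg_2 */
}

_Sat _Accum sat_mul (_Sat _Accum a, _Sat _Accum b)
{
    return a * b;          /* cf. __ssmulsa3, signed saturated s16.15 multiply */
}

It is the _Sat qualifier that selects these saturating entry points over the wrapping __mul*/__add* variants wherever the operation is not inlined.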