From b094d6c4bf572654a031ecc4afe675154c886dc5 Mon Sep 17 00:00:00 2001
From: Jing Yu
Date: Thu, 22 Jul 2010 14:03:48 -0700
Subject: commit gcc-4.4.3 which is used to build gcc-4.4.3 Android toolchain in master.

The source is based on fsf gcc-4.4.3 and contains local patches which are
recorded in gcc-4.4.3/README.google.

Change-Id: Id8c6d6927df274ae9749196a1cc24dbd9abc9887
---
 gcc-4.4.3/gcc/config/sh/lib1funcs-Os-4-200.asm | 322 +++++++++++++++++++++++++
 1 file changed, 322 insertions(+)
 create mode 100644 gcc-4.4.3/gcc/config/sh/lib1funcs-Os-4-200.asm

diff --git a/gcc-4.4.3/gcc/config/sh/lib1funcs-Os-4-200.asm b/gcc-4.4.3/gcc/config/sh/lib1funcs-Os-4-200.asm
new file mode 100644
index 000000000..aae57ccd3
--- /dev/null
+++ b/gcc-4.4.3/gcc/config/sh/lib1funcs-Os-4-200.asm
@@ -0,0 +1,322 @@
+/* Copyright (C) 2006, 2009 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Moderately Space-optimized libgcc routines for the Renesas SH /
+   STMicroelectronics ST40 CPUs.
+   Contributed by J"orn Rennecke joern.rennecke@st.com.  */
+
+#include "lib1funcs.h"
+
+#if !__SHMEDIA__
+#ifdef L_udivsi3_i4i
+
+/* 88 bytes; sh4-200 cycle counts:
+   divisor >= 2G: 11 cycles
+   dividend < 2G: 48 cycles
+   dividend >= 2G: divisor != 1: 54 cycles
+   dividend >= 2G, divisor == 1: 22 cycles  */
+#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
+!! args in r4 and r5, result in r0, clobber r1
+
+	.global GLOBAL(udivsi3_i4i)
+	FUNC(GLOBAL(udivsi3_i4i))
+GLOBAL(udivsi3_i4i):
+	mova L1,r0
+	cmp/pz r5
+	sts fpscr,r1
+	lds.l @r0+,fpscr
+	sts.l fpul,@-r15
+	bf LOCAL(huge_divisor)
+	mov.l r1,@-r15
+	lds r4,fpul
+	cmp/pz r4
+#ifdef FMOVD_WORKS
+	fmov.d dr0,@-r15
+	float fpul,dr0
+	fmov.d dr2,@-r15
+	bt LOCAL(dividend_adjusted)
+	mov #1,r1
+	fmov.d @r0,dr2
+	cmp/eq r1,r5
+	bt LOCAL(div_by_1)
+	fadd dr2,dr0
+LOCAL(dividend_adjusted):
+	lds r5,fpul
+	float fpul,dr2
+	fdiv dr2,dr0
+LOCAL(div_by_1):
+	fmov.d @r15+,dr2
+	ftrc dr0,fpul
+	fmov.d @r15+,dr0
+#else /* !FMOVD_WORKS */
+	fmov.s DR01,@-r15
+	mov #1,r1
+	fmov.s DR00,@-r15
+	float fpul,dr0
+	fmov.s DR21,@-r15
+	bt/s LOCAL(dividend_adjusted)
+	fmov.s DR20,@-r15
+	cmp/eq r1,r5
+	bt LOCAL(div_by_1)
+	fmov.s @r0+,DR20
+	fmov.s @r0,DR21
+	fadd dr2,dr0
+LOCAL(dividend_adjusted):
+	lds r5,fpul
+	float fpul,dr2
+	fdiv dr2,dr0
+LOCAL(div_by_1):
+	fmov.s @r15+,DR20
+	fmov.s @r15+,DR21
+	ftrc dr0,fpul
+	fmov.s @r15+,DR00
+	fmov.s @r15+,DR01
+#endif /* !FMOVD_WORKS */
+	lds.l @r15+,fpscr
+	sts fpul,r0
+	rts
+	lds.l @r15+,fpul
+
+#ifdef FMOVD_WORKS
+	.p2align 3	! make double below 8 byte aligned.
+#endif
+LOCAL(huge_divisor):
+	lds r1,fpscr
+	add #4,r15
+	cmp/hs r5,r4
+	rts
+	movt r0
+
+	.p2align 2
+L1:
+#ifndef FMOVD_WORKS
+	.long 0x80000
+#else
+	.long 0x180000
+#endif
+	.double 4294967296
+
+	ENDFUNC(GLOBAL(udivsi3_i4i))
+#elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */
+
+#if 0
+/* With 36 bytes, the following would probably be the most compact
+   implementation, but with 139 cycles on an sh4-200, it is extremely slow.  */
+GLOBAL(udivsi3_i4i):
+	mov.l r2,@-r15
+	mov #0,r1
+	div0u
+	mov r1,r2
+	mov.l r3,@-r15
+	mov r1,r3
+	sett
+	mov r4,r0
+LOCAL(loop):
+	rotcr r2
+	;
+	bt/s LOCAL(end)
+	cmp/gt r2,r3
+	rotcl r0
+	bra LOCAL(loop)
+	div1 r5,r1
+LOCAL(end):
+	rotcl r0
+	mov.l @r15+,r3
+	rts
+	mov.l @r15+,r2
+#endif /* 0 */
+
+/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
+   sh4-200 run times:
+   udiv small divisor: 55 cycles
+   udiv large divisor: 52 cycles
+   sdiv small divisor, positive result: 59 cycles
+   sdiv large divisor, positive result: 56 cycles
+   sdiv small divisor, negative result: 65 cycles (*)
+   sdiv large divisor, negative result: 62 cycles (*)
+   (*): r2 is restored in the rts delay slot and has a lingering latency
+   of two more cycles.  */
+	.balign 4
+	.global GLOBAL(udivsi3_i4i)
+	FUNC(GLOBAL(udivsi3_i4i))
+	FUNC(GLOBAL(sdivsi3_i4i))
+GLOBAL(udivsi3_i4i):
+	sts pr,r1
+	mov.l r4,@-r15
+	extu.w r5,r0
+	cmp/eq r5,r0
+	swap.w r4,r0
+	shlr16 r4
+	bf/s LOCAL(large_divisor)
+	div0u
+	mov.l r5,@-r15
+	shll16 r5
+LOCAL(sdiv_small_divisor):
+	div1 r5,r4
+	bsr LOCAL(div6)
+	div1 r5,r4
+	div1 r5,r4
+	bsr LOCAL(div6)
+	div1 r5,r4
+	xtrct r4,r0
+	xtrct r0,r4
+	bsr LOCAL(div7)
+	swap.w r4,r4
+	div1 r5,r4
+	bsr LOCAL(div7)
+	div1 r5,r4
+	xtrct r4,r0
+	mov.l @r15+,r5
+	swap.w r0,r0
+	mov.l @r15+,r4
+	jmp @r1
+	rotcl r0
+LOCAL(div7):
+	div1 r5,r4
+LOCAL(div6):
+	div1 r5,r4; div1 r5,r4; div1 r5,r4
+	div1 r5,r4; div1 r5,r4; rts; div1 r5,r4
+
+LOCAL(divx3):
+	rotcl r0
+	div1 r5,r4
+	rotcl r0
+	div1 r5,r4
+	rotcl r0
+	rts
+	div1 r5,r4
+
+LOCAL(large_divisor):
+	mov.l r5,@-r15
+LOCAL(sdiv_large_divisor):
+	xor r4,r0
+	.rept 4
+	rotcl r0
+	bsr LOCAL(divx3)
+	div1 r5,r4
+	.endr
+	mov.l @r15+,r5
+	mov.l @r15+,r4
+	jmp @r1
+	rotcl r0
+	ENDFUNC(GLOBAL(udivsi3_i4i))
+
+	.global GLOBAL(sdivsi3_i4i)
+GLOBAL(sdivsi3_i4i):
+	mov.l r4,@-r15
+	cmp/pz r5
+	mov.l r5,@-r15
+	bt/s LOCAL(pos_divisor)
+	cmp/pz r4
+	neg r5,r5
+	extu.w r5,r0
+	bt/s LOCAL(neg_result)
+	cmp/eq r5,r0
+	neg r4,r4
+LOCAL(pos_result):
+	swap.w r4,r0
+	bra LOCAL(sdiv_check_divisor)
+	sts pr,r1
+LOCAL(pos_divisor):
+	extu.w r5,r0
+	bt/s LOCAL(pos_result)
+	cmp/eq r5,r0
+	neg r4,r4
+LOCAL(neg_result):
+	mova LOCAL(negate_result),r0
+	;
+	mov r0,r1
+	swap.w r4,r0
+	lds r2,macl
+	sts pr,r2
+LOCAL(sdiv_check_divisor):
+	shlr16 r4
+	bf/s LOCAL(sdiv_large_divisor)
+	div0u
+	bra LOCAL(sdiv_small_divisor)
+	shll16 r5
+	.balign 4
+LOCAL(negate_result):
+	neg r0,r0
+	jmp @r2
+	sts macl,r2
+	ENDFUNC(GLOBAL(sdivsi3_i4i))
+#endif /* !__SH_FPU_DOUBLE__ */
+#endif /* L_udivsi3_i4i */
+
+#ifdef L_sdivsi3_i4i
+#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
+/* 48 bytes, 45 cycles on sh4-200 */
+!! args in r4 and r5, result in r0, clobber r1
+
+	.global GLOBAL(sdivsi3_i4i)
+	FUNC(GLOBAL(sdivsi3_i4i))
+GLOBAL(sdivsi3_i4i):
+	sts.l fpscr,@-r15
+	sts fpul,r1
+	mova L1,r0
+	lds.l @r0+,fpscr
+	lds r4,fpul
+#ifdef FMOVD_WORKS
+	fmov.d dr0,@-r15
+	float fpul,dr0
+	lds r5,fpul
+	fmov.d dr2,@-r15
+#else
+	fmov.s DR01,@-r15
+	fmov.s DR00,@-r15
+	float fpul,dr0
+	lds r5,fpul
+	fmov.s DR21,@-r15
+	fmov.s DR20,@-r15
+#endif
+	float fpul,dr2
+	fdiv dr2,dr0
+#ifdef FMOVD_WORKS
+	fmov.d @r15+,dr2
+#else
+	fmov.s @r15+,DR20
+	fmov.s @r15+,DR21
+#endif
+	ftrc dr0,fpul
+#ifdef FMOVD_WORKS
+	fmov.d @r15+,dr0
+#else
+	fmov.s @r15+,DR00
+	fmov.s @r15+,DR01
+#endif
+	lds.l @r15+,fpscr
+	sts fpul,r0
+	rts
+	lds r1,fpul
+
+	.p2align 2
+L1:
+#ifndef FMOVD_WORKS
+	.long 0x80000
+#else
+	.long 0x180000
+#endif
+
+	ENDFUNC(GLOBAL(sdivsi3_i4i))
+#endif /* __SH_FPU_DOUBLE__ */
+#endif /* L_sdivsi3_i4i */
+#endif /* !__SHMEDIA__ */
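
For orientation, the __SH_FPU_DOUBLE__ path of udivsi3_i4i above does its
work in double-precision floating point: the dividend is converted with the
signed "float" instruction, corrected by the ".double 4294967296" constant
when it is at or above 2^31, divided with "fdiv", and truncated back with
"ftrc"; divisors at or above 2^31 take the LOCAL(huge_divisor) shortcut, and
a divisor of exactly 1 takes LOCAL(div_by_1). The C sketch below is a model
of that flow under my reading of the assembly; the function name and the
test values are mine and are not part of the patch.

/* Illustrative C model of the FPU-based udivsi3_i4i path (an assumed
   reading of the assembly above, not part of the patch).  */
#include <stdio.h>

static unsigned int
udivsi3_i4i_model (unsigned int n, unsigned int d)
{
  if ((int) d < 0)                 /* LOCAL(huge_divisor): d >= 2^31, so the  */
    return n >= d;                 /* quotient is 0 or 1 (cmp/hs r5,r4; movt). */

  double dn = (double) (int) n;    /* lds r4,fpul; float fpul,dr0 converts n  */
                                   /* as a signed value.                      */
  if ((int) n < 0)                 /* Dividend >= 2^31: the signed conversion */
    {                              /* is off by exactly 2^32.                 */
      if (d == 1)                  /* LOCAL(div_by_1): the quotient would not */
        return n;                  /* fit ftrc's signed range; return n.      */
      dn += 4294967296.0;          /* fadd of the .double 4294967296 value.   */
    }
  return (unsigned int) (dn / (double) d);  /* float fpul,dr2; fdiv; ftrc.    */
}

int
main (void)
{
  /* Spot-check the model against plain integer division.  */
  static const unsigned int cases[][2] = {
    { 100u, 7u }, { 0x80000001u, 1u }, { 0xffffffffu, 3u },
    { 5u, 0x80000000u }, { 0x90000000u, 0x80000000u },
  };
  for (unsigned int i = 0; i < sizeof cases / sizeof cases[0]; i++)
    {
      unsigned int n = cases[i][0], d = cases[i][1];
      printf ("%u / %u = %u (model: %u)\n", n, d, n / d,
              udivsi3_i4i_model (n, d));
    }
  return 0;
}

The double-precision route is exact here because both 32-bit operands fit in
a double's 53-bit mantissa, so the truncated fdiv result always equals the
integer quotient; that is what lets the routine get away with a single
fdiv/ftrc pair instead of a div1 loop.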