From b094d6c4bf572654a031ecc4afe675154c886dc5 Mon Sep 17 00:00:00 2001
From: Jing Yu
Date: Thu, 22 Jul 2010 14:03:48 -0700
Subject: commit gcc-4.4.3 which is used to build gcc-4.4.3 Android toolchain in master.

The source is based on fsf gcc-4.4.3 and contains local patches which are
recorded in gcc-4.4.3/README.google.

Change-Id: Id8c6d6927df274ae9749196a1cc24dbd9abc9887
---
 gcc-4.4.3/gcc/config/sh/lib1funcs-Os-4-200.asm | 322 +++++++++++++++++++++++++
 1 file changed, 322 insertions(+)
 create mode 100644 gcc-4.4.3/gcc/config/sh/lib1funcs-Os-4-200.asm

diff --git a/gcc-4.4.3/gcc/config/sh/lib1funcs-Os-4-200.asm b/gcc-4.4.3/gcc/config/sh/lib1funcs-Os-4-200.asm
new file mode 100644
index 000000000..aae57ccd3
--- /dev/null
+++ b/gcc-4.4.3/gcc/config/sh/lib1funcs-Os-4-200.asm
@@ -0,0 +1,322 @@
+/* Copyright (C) 2006, 2009 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Moderately Space-optimized libgcc routines for the Renesas SH /
+   STMicroelectronics ST40 CPUs.
+   Contributed by J"orn Rennecke joern.rennecke@st.com.  */
+
+#include "lib1funcs.h"
+
+#if !__SHMEDIA__
+#ifdef L_udivsi3_i4i
+
+/* 88 bytes; sh4-200 cycle counts:
+   divisor >= 2G: 11 cycles
+   dividend < 2G: 48 cycles
+   dividend >= 2G: divisor != 1: 54 cycles
+   dividend >= 2G, divisor == 1: 22 cycles  */
+#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
+!! args in r4 and r5, result in r0, clobber r1
+
+	.global GLOBAL(udivsi3_i4i)
+	FUNC(GLOBAL(udivsi3_i4i))
+GLOBAL(udivsi3_i4i):
+	mova L1,r0
+	cmp/pz r5
+	sts fpscr,r1
+	lds.l @r0+,fpscr
+	sts.l fpul,@-r15
+	bf LOCAL(huge_divisor)
+	mov.l r1,@-r15
+	lds r4,fpul
+	cmp/pz r4
+#ifdef FMOVD_WORKS
+	fmov.d dr0,@-r15
+	float fpul,dr0
+	fmov.d dr2,@-r15
+	bt LOCAL(dividend_adjusted)
+	mov #1,r1
+	fmov.d @r0,dr2
+	cmp/eq r1,r5
+	bt LOCAL(div_by_1)
+	fadd dr2,dr0
+LOCAL(dividend_adjusted):
+	lds r5,fpul
+	float fpul,dr2
+	fdiv dr2,dr0
+LOCAL(div_by_1):
+	fmov.d @r15+,dr2
+	ftrc dr0,fpul
+	fmov.d @r15+,dr0
+#else /* !FMOVD_WORKS */
+	fmov.s DR01,@-r15
+	mov #1,r1
+	fmov.s DR00,@-r15
+	float fpul,dr0
+	fmov.s DR21,@-r15
+	bt/s LOCAL(dividend_adjusted)
+	fmov.s DR20,@-r15
+	cmp/eq r1,r5
+	bt LOCAL(div_by_1)
+	fmov.s @r0+,DR20
+	fmov.s @r0,DR21
+	fadd dr2,dr0
+LOCAL(dividend_adjusted):
+	lds r5,fpul
+	float fpul,dr2
+	fdiv dr2,dr0
+LOCAL(div_by_1):
+	fmov.s @r15+,DR20
+	fmov.s @r15+,DR21
+	ftrc dr0,fpul
+	fmov.s @r15+,DR00
+	fmov.s @r15+,DR01
+#endif /* !FMOVD_WORKS */
+	lds.l @r15+,fpscr
+	sts fpul,r0
+	rts
+	lds.l @r15+,fpul
+
+#ifdef FMOVD_WORKS
+	.p2align 3	! make double below 8 byte aligned.
+#endif
+LOCAL(huge_divisor):
+	lds r1,fpscr
+	add #4,r15
+	cmp/hs r5,r4
+	rts
+	movt r0
+
+	.p2align 2
+L1:
+#ifndef FMOVD_WORKS
+	.long 0x80000
+#else
+	.long 0x180000
+#endif
+	.double 4294967296
+
+	ENDFUNC(GLOBAL(udivsi3_i4i))
+#elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */
+
+#if 0
+/* With 36 bytes, the following would probably be the most compact
+   implementation, but with 139 cycles on an sh4-200, it is extremely slow.  */
+GLOBAL(udivsi3_i4i):
+	mov.l r2,@-r15
+	mov #0,r1
+	div0u
+	mov r1,r2
+	mov.l r3,@-r15
+	mov r1,r3
+	sett
+	mov r4,r0
+LOCAL(loop):
+	rotcr r2
+	;
+	bt/s LOCAL(end)
+	cmp/gt r2,r3
+	rotcl r0
+	bra LOCAL(loop)
+	div1 r5,r1
+LOCAL(end):
+	rotcl r0
+	mov.l @r15+,r3
+	rts
+	mov.l @r15+,r2
+#endif /* 0 */
+
+/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
+   sh4-200 run times:
+   udiv small divisor: 55 cycles
+   udiv large divisor: 52 cycles
+   sdiv small divisor, positive result: 59 cycles
+   sdiv large divisor, positive result: 56 cycles
+   sdiv small divisor, negative result: 65 cycles (*)
+   sdiv large divisor, negative result: 62 cycles (*)
+   (*): r2 is restored in the rts delay slot and has a lingering latency
+   of two more cycles.  */
+	.balign 4
+	.global GLOBAL(udivsi3_i4i)
+	FUNC(GLOBAL(udivsi3_i4i))
+	FUNC(GLOBAL(sdivsi3_i4i))
+GLOBAL(udivsi3_i4i):
+	sts pr,r1
+	mov.l r4,@-r15
+	extu.w r5,r0
+	cmp/eq r5,r0
+	swap.w r4,r0
+	shlr16 r4
+	bf/s LOCAL(large_divisor)
+	div0u
+	mov.l r5,@-r15
+	shll16 r5
+LOCAL(sdiv_small_divisor):
+	div1 r5,r4
+	bsr LOCAL(div6)
+	div1 r5,r4
+	div1 r5,r4
+	bsr LOCAL(div6)
+	div1 r5,r4
+	xtrct r4,r0
+	xtrct r0,r4
+	bsr LOCAL(div7)
+	swap.w r4,r4
+	div1 r5,r4
+	bsr LOCAL(div7)
+	div1 r5,r4
+	xtrct r4,r0
+	mov.l @r15+,r5
+	swap.w r0,r0
+	mov.l @r15+,r4
+	jmp @r1
+	rotcl r0
+LOCAL(div7):
+	div1 r5,r4
+LOCAL(div6):
+	div1 r5,r4; div1 r5,r4; div1 r5,r4
+	div1 r5,r4; div1 r5,r4; rts; div1 r5,r4
+
+LOCAL(divx3):
+	rotcl r0
+	div1 r5,r4
+	rotcl r0
+	div1 r5,r4
+	rotcl r0
+	rts
+	div1 r5,r4
+
+LOCAL(large_divisor):
+	mov.l r5,@-r15
+LOCAL(sdiv_large_divisor):
+	xor r4,r0
+	.rept 4
+	rotcl r0
+	bsr LOCAL(divx3)
+	div1 r5,r4
+	.endr
+	mov.l @r15+,r5
+	mov.l @r15+,r4
+	jmp @r1
+	rotcl r0
+	ENDFUNC(GLOBAL(udivsi3_i4i))
+
+	.global GLOBAL(sdivsi3_i4i)
+GLOBAL(sdivsi3_i4i):
+	mov.l r4,@-r15
+	cmp/pz r5
+	mov.l r5,@-r15
+	bt/s LOCAL(pos_divisor)
+	cmp/pz r4
+	neg r5,r5
+	extu.w r5,r0
+	bt/s LOCAL(neg_result)
+	cmp/eq r5,r0
+	neg r4,r4
+LOCAL(pos_result):
+	swap.w r4,r0
+	bra LOCAL(sdiv_check_divisor)
+	sts pr,r1
+LOCAL(pos_divisor):
+	extu.w r5,r0
+	bt/s LOCAL(pos_result)
+	cmp/eq r5,r0
+	neg r4,r4
+LOCAL(neg_result):
+	mova LOCAL(negate_result),r0
+	;
+	mov r0,r1
+	swap.w r4,r0
+	lds r2,macl
+	sts pr,r2
+LOCAL(sdiv_check_divisor):
+	shlr16 r4
+	bf/s LOCAL(sdiv_large_divisor)
+	div0u
+	bra LOCAL(sdiv_small_divisor)
+	shll16 r5
+	.balign 4
+LOCAL(negate_result):
+	neg r0,r0
+	jmp @r2
+	sts macl,r2
+	ENDFUNC(GLOBAL(sdivsi3_i4i))
+#endif /* !__SH_FPU_DOUBLE__ */
+#endif /* L_udivsi3_i4i */
+
+#ifdef L_sdivsi3_i4i
+#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
+/* 48 bytes, 45 cycles on sh4-200 */
+!! args in r4 and r5, result in r0, clobber r1
+
+	.global GLOBAL(sdivsi3_i4i)
+	FUNC(GLOBAL(sdivsi3_i4i))
+GLOBAL(sdivsi3_i4i):
+	sts.l fpscr,@-r15
+	sts fpul,r1
+	mova L1,r0
+	lds.l @r0+,fpscr
+	lds r4,fpul
+#ifdef FMOVD_WORKS
+	fmov.d dr0,@-r15
+	float fpul,dr0
+	lds r5,fpul
+	fmov.d dr2,@-r15
+#else
+	fmov.s DR01,@-r15
+	fmov.s DR00,@-r15
+	float fpul,dr0
+	lds r5,fpul
+	fmov.s DR21,@-r15
+	fmov.s DR20,@-r15
+#endif
+	float fpul,dr2
+	fdiv dr2,dr0
+#ifdef FMOVD_WORKS
+	fmov.d @r15+,dr2
+#else
+	fmov.s @r15+,DR20
+	fmov.s @r15+,DR21
+#endif
+	ftrc dr0,fpul
+#ifdef FMOVD_WORKS
+	fmov.d @r15+,dr0
+#else
+	fmov.s @r15+,DR00
+	fmov.s @r15+,DR01
+#endif
+	lds.l @r15+,fpscr
+	sts fpul,r0
+	rts
+	lds r1,fpul
+
+	.p2align 2
+L1:
+#ifndef FMOVD_WORKS
+	.long 0x80000
+#else
+	.long 0x180000
+#endif
+
+	ENDFUNC(GLOBAL(sdivsi3_i4i))
+#endif /* __SH_FPU_DOUBLE__ */
+#endif /* L_sdivsi3_i4i */
+#endif /* !__SHMEDIA__ */
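
For orientation, the __SH_FPU_DOUBLE__ path of udivsi3_i4i above does its
work in double-precision floating point: the dividend is converted with the
signed "float" instruction, corrected by the ".double 4294967296" constant
when it is at or above 2^31, divided with "fdiv", and truncated back with
"ftrc"; divisors at or above 2^31 take the LOCAL(huge_divisor) shortcut, and
a divisor of exactly 1 takes LOCAL(div_by_1). The C sketch below is a model
of that flow under my reading of the assembly; the function name and the
test values are mine and are not part of the patch.

/* Illustrative C model of the FPU-based udivsi3_i4i path (an assumed
   reading of the assembly above, not part of the patch).  */
#include <stdio.h>

static unsigned int
udivsi3_i4i_model (unsigned int n, unsigned int d)
{
  if ((int) d < 0)                 /* LOCAL(huge_divisor): d >= 2^31, so the  */
    return n >= d;                 /* quotient is 0 or 1 (cmp/hs r5,r4; movt). */

  double dn = (double) (int) n;    /* lds r4,fpul; float fpul,dr0 converts n  */
                                   /* as a signed value.                      */
  if ((int) n < 0)                 /* Dividend >= 2^31: the signed conversion */
    {                              /* is off by exactly 2^32.                 */
      if (d == 1)                  /* LOCAL(div_by_1): the quotient would not */
        return n;                  /* fit ftrc's signed range; return n.      */
      dn += 4294967296.0;          /* fadd of the .double 4294967296 value.   */
    }
  return (unsigned int) (dn / (double) d);  /* float fpul,dr2; fdiv; ftrc.    */
}

int
main (void)
{
  /* Spot-check the model against plain integer division.  */
  static const unsigned int cases[][2] = {
    { 100u, 7u }, { 0x80000001u, 1u }, { 0xffffffffu, 3u },
    { 5u, 0x80000000u }, { 0x90000000u, 0x80000000u },
  };
  for (unsigned int i = 0; i < sizeof cases / sizeof cases[0]; i++)
    {
      unsigned int n = cases[i][0], d = cases[i][1];
      printf ("%u / %u = %u (model: %u)\n", n, d, n / d,
              udivsi3_i4i_model (n, d));
    }
  return 0;
}

The double-precision route is exact here because both 32-bit operands fit in
a double's 53-bit mantissa, so the truncated fdiv result always equals the
integer quotient; that is what lets the routine get away with a single
fdiv/ftrc pair instead of a div1 loop.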