/* Copyright (C) 2006, 2009 Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* Moderately Space-optimized libgcc routines for the Renesas SH /
   STMicroelectronics ST40 CPUs.
   Contributed by J"orn Rennecke joern.rennecke@st.com.  */

#include "lib1funcs.h"

#if !__SHMEDIA__
#ifdef L_udivsi3_i4i

/* 88 bytes; sh4-200 cycle counts:
   divisor  >= 2G: 11 cycles
   dividend <  2G: 48 cycles
   dividend >= 2G: divisor != 1: 54 cycles
   dividend >= 2G, divisor == 1: 22 cycles */
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
!! args in r4 and r5, result in r0, clobber r1

	.global GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
GLOBAL(udivsi3_i4i):
	mova L1,r0
	cmp/pz r5
	sts fpscr,r1
	lds.l @r0+,fpscr
	sts.l fpul,@-r15
	bf LOCAL(huge_divisor)
	mov.l r1,@-r15
	lds r4,fpul
	cmp/pz r4
#ifdef FMOVD_WORKS
	fmov.d dr0,@-r15
	float fpul,dr0
	fmov.d dr2,@-r15
	bt LOCAL(dividend_adjusted)
	mov #1,r1
	fmov.d @r0,dr2
	cmp/eq r1,r5
	bt LOCAL(div_by_1)
	fadd dr2,dr0
LOCAL(dividend_adjusted):
	lds r5,fpul
	float fpul,dr2
	fdiv dr2,dr0
LOCAL(div_by_1):
	fmov.d @r15+,dr2
	ftrc dr0,fpul
	fmov.d @r15+,dr0
#else /* !FMOVD_WORKS */
	fmov.s DR01,@-r15
	mov #1,r1
	fmov.s DR00,@-r15
	float fpul,dr0
	fmov.s DR21,@-r15
	bt/s LOCAL(dividend_adjusted)
	fmov.s DR20,@-r15
	cmp/eq r1,r5
	bt LOCAL(div_by_1)
	fmov.s @r0+,DR20
	fmov.s @r0,DR21
	fadd dr2,dr0
LOCAL(dividend_adjusted):
	lds r5,fpul
	float fpul,dr2
	fdiv dr2,dr0
LOCAL(div_by_1):
	fmov.s @r15+,DR20
	fmov.s @r15+,DR21
	ftrc dr0,fpul
	fmov.s @r15+,DR00
	fmov.s @r15+,DR01
#endif /* !FMOVD_WORKS */
	lds.l @r15+,fpscr
	sts fpul,r0
	rts
	lds.l @r15+,fpul

#ifdef FMOVD_WORKS
	.p2align 3        ! make double below 8 byte aligned.
#endif
LOCAL(huge_divisor):
	lds r1,fpscr
	add #4,r15
	cmp/hs r5,r4
	rts
	movt r0

	.p2align 2
L1:
#ifndef FMOVD_WORKS
	.long 0x80000
#else
	.long 0x180000
#endif
	.double 4294967296

	ENDFUNC(GLOBAL(udivsi3_i4i))
#elif !defined (__sh1__)  /* !__SH_FPU_DOUBLE__ */

#if 0
/* With 36 bytes, the following would probably be the most compact
   implementation, but with 139 cycles on an sh4-200, it is extremely slow.  */
GLOBAL(udivsi3_i4i):
	mov.l r2,@-r15
	mov #0,r1
	div0u
	mov r1,r2
	mov.l r3,@-r15
	mov r1,r3
	sett
	mov r4,r0
LOCAL(loop):
	rotcr r2
	;
	bt/s LOCAL(end)
	cmp/gt r2,r3
	rotcl r0
	bra LOCAL(loop)
	div1 r5,r1
LOCAL(end):
	rotcl r0
	mov.l @r15+,r3
	rts
	mov.l @r15+,r2
#endif /* 0 */

/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
   sh4-200 run times:
   udiv small divisor: 55 cycles
   udiv large divisor: 52 cycles
   sdiv small divisor, positive result: 59 cycles
   sdiv large divisor, positive result: 56 cycles
   sdiv small divisor, negative result: 65 cycles (*)
   sdiv large divisor, negative result: 62 cycles (*)
   (*): r2 is restored in the rts delay slot and has a lingering latency
        of two more cycles.  */
	.balign 4
	.global	GLOBAL(udivsi3_i4i)
	FUNC(GLOBAL(udivsi3_i4i))
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(udivsi3_i4i):
	sts pr,r1
	mov.l r4,@-r15
	extu.w r5,r0
	cmp/eq r5,r0
	swap.w r4,r0
	shlr16 r4
	bf/s LOCAL(large_divisor)
	div0u
	mov.l r5,@-r15
	shll16 r5
LOCAL(sdiv_small_divisor):
	div1 r5,r4
	bsr LOCAL(div6)
	div1 r5,r4
	div1 r5,r4
	bsr LOCAL(div6)
	div1 r5,r4
	xtrct r4,r0
	xtrct r0,r4
	bsr LOCAL(div7)
	swap.w r4,r4
	div1 r5,r4
	bsr LOCAL(div7)
	div1 r5,r4
	xtrct r4,r0
	mov.l @r15+,r5
	swap.w r0,r0
	mov.l @r15+,r4
	jmp @r1
	rotcl r0
LOCAL(div7):
	div1 r5,r4
LOCAL(div6):
	            div1 r5,r4; div1 r5,r4; div1 r5,r4
	div1 r5,r4; div1 r5,r4; rts;        div1 r5,r4

LOCAL(divx3):
	rotcl r0
	div1 r5,r4
	rotcl r0
	div1 r5,r4
	rotcl r0
	rts
	div1 r5,r4

LOCAL(large_divisor):
	mov.l r5,@-r15
LOCAL(sdiv_large_divisor):
	xor r4,r0
	.rept 4
	rotcl r0
	bsr LOCAL(divx3)
	div1 r5,r4
	.endr
	mov.l @r15+,r5
	mov.l @r15+,r4
	jmp @r1
	rotcl r0
	ENDFUNC(GLOBAL(udivsi3_i4i))

	.global	GLOBAL(sdivsi3_i4i)
GLOBAL(sdivsi3_i4i):
	mov.l r4,@-r15
	cmp/pz r5
	mov.l r5,@-r15
	bt/s LOCAL(pos_divisor)
	cmp/pz r4
	neg r5,r5
	extu.w r5,r0
	bt/s LOCAL(neg_result)
	cmp/eq r5,r0
	neg r4,r4
LOCAL(pos_result):
	swap.w r4,r0
	bra LOCAL(sdiv_check_divisor)
	sts pr,r1
LOCAL(pos_divisor):
	extu.w r5,r0
	bt/s LOCAL(pos_result)
	cmp/eq r5,r0
	neg r4,r4
LOCAL(neg_result):
	mova LOCAL(negate_result),r0
	;
	mov r0,r1
	swap.w r4,r0
	lds r2,macl
	sts pr,r2
LOCAL(sdiv_check_divisor):
	shlr16 r4
	bf/s LOCAL(sdiv_large_divisor)
	div0u
	bra LOCAL(sdiv_small_divisor)
	shll16 r5
	.balign 4
LOCAL(negate_result):
	neg r0,r0
	jmp @r2
	sts macl,r2
	ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* !__SH_FPU_DOUBLE__ */
#endif /* L_udivsi3_i4i */

#ifdef L_sdivsi3_i4i
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
/* 48 bytes, 45 cycles on sh4-200  */
!! args in r4 and r5, result in r0, clobber r1

	.global GLOBAL(sdivsi3_i4i)
	FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(sdivsi3_i4i):
	sts.l fpscr,@-r15
	sts fpul,r1
	mova L1,r0
	lds.l @r0+,fpscr
	lds r4,fpul
#ifdef FMOVD_WORKS
	fmov.d dr0,@-r15
	float fpul,dr0
	lds r5,fpul
	fmov.d dr2,@-r15
#else
	fmov.s DR01,@-r15
	fmov.s DR00,@-r15
	float fpul,dr0
	lds r5,fpul
	fmov.s DR21,@-r15
	fmov.s DR20,@-r15
#endif
	float fpul,dr2
	fdiv dr2,dr0
#ifdef FMOVD_WORKS
	fmov.d @r15+,dr2
#else
	fmov.s @r15+,DR20
	fmov.s @r15+,DR21
#endif
	ftrc dr0,fpul
#ifdef FMOVD_WORKS
	fmov.d @r15+,dr0
#else
	fmov.s @r15+,DR00
	fmov.s @r15+,DR01
#endif
	lds.l @r15+,fpscr
	sts fpul,r0
	rts
	lds r1,fpul

	.p2align 2
L1:
#ifndef FMOVD_WORKS
	.long 0x80000
#else
	.long 0x180000
#endif

	ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* __SH_FPU_DOUBLE__ */
#endif /* L_sdivsi3_i4i */
#endif /* !__SHMEDIA__ */