/* Overlay manager for SPU.

   Copyright (C) 2006-2014 Free Software Foundation, Inc.

   This file is part of the GNU Binutils.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
   MA 02110-1301, USA. */

/* Reading this listing (reviewer notes):

   - Every instruction carries a trailing "# a,b c" annotation.  These
     look like issue pipe, result latency and scheduled cycle.
     NOTE(review): confirm against the SPU pipeline documentation
     before relying on them.
   - Lines of the form "#nop", "#lnop" and "#nop; lnop" are comments,
     not instructions.  They appear to mark issue slots that were left
     empty on purpose; real padding is written as a bare nop or lnop.
   - The code is hand scheduled.  Do not reorder instructions without
     rechecking both the schedule and the register alias lifetimes
     described under "Register usage" below.  */

/* MFC DMA defn's. */
#define MFC_GET_CMD		0x40
#define MFC_MAX_DMA_SIZE	0x4000
#define MFC_TAG_UPDATE_ALL	2
#define MFC_TAG_ID		0

/* Register usage.

   Every name below is an alias for one of nine physical registers
   ($71 to $79).  Names that share a physical register must never be
   live at the same time, so check the alias group before touching any
   instruction.

   reserved1 to reserved5 ($75 to $79) are used without being saved.
   save1 to save4 ($71 to $74) are stored below $sp before they are
   used as scratch and are reloaded from the same slots before control
   leaves this file; save4 is only touched when OVLY_IRQ_SAVE is
   defined.  */
#define reserved1	$75
#define parm		$75
#define tab1		reserved1
#define tab2		reserved1
#define vma		reserved1
#define oldvma		reserved1
#define newmask		reserved1
#define map		reserved1

#define reserved2	$76
#define off1		reserved2
#define off2		reserved2
#define present1	reserved2
#define present2	reserved2
#define sz		reserved2
#define cmp		reserved2
#define add64		reserved2
#define cgbits		reserved2
#define off3		reserved2
#define off4		reserved2
#define addr4		reserved2
#define off5		reserved2
#define tagstat		reserved2

#define reserved3	$77
#define size1		reserved3
#define size2		reserved3
#define rv3		reserved3
#define ealo		reserved3
#define cmd		reserved3
#define off64		reserved3
#define tab3		reserved3
#define tab4		reserved3
#define tab5		reserved3

#define reserved4	$78
#define ovl		reserved4
#define rv2		reserved4
#define rv5		reserved4
#define cgshuf		reserved4
#define newovl		reserved4
#define irqtmp1		reserved4
#define irqtmp2		reserved4

#define reserved5	$79
#define target		reserved5

#define save1		$74
#define rv4		save1
#define rv7		save1
#define tagid		save1
#define maxsize		save1
#define pbyte		save1
#define pbit		save1

#define save2		$73
#define cur		save2
#define rv6		save2
#define osize		save2
#define zovl		save2
#define oldovl		save2
#define newvma		save2

#define save3		$72
#define rv1		save3
#define ea64		save3
#define buf3		save3
#define genwi		save3
#define newmap		save3
#define oldmask		save3

#define save4		$71
#define irq_stat	save4

	.text
	.align	4

/* shufb control used by __ovly_load when building a new $lr.
   Word 0 of the result comes from word 0 of the first source, word 1
   from word 0 of the second source, and words 2 and 3 are zeroed.  */
	.type	__rv_pattern, @object
	.size	__rv_pattern, 16
__rv_pattern:
	.word	0x00010203, 0x10111213, 0x80808080, 0x80808080

/* shufb control used in __ovly_xfer_loop.  It moves word 1 of the
   source into word 0 and zeroes the rest, which places the carry out
   of the low 32-bit add where addx consumes it for the high word.  */
	.type	__cg_pattern, @object
	.size	__cg_pattern, 16
__cg_pattern:
	.word	0x04050607, 0x80808080, 0x80808080, 0x80808080

/* One quadword, written with stqr from "ovl" and read back with lqr.
   Word 0 holds the index of the overlay most recently requested.  */
	.type	__ovly_current, @object
	.size	__ovly_current, 16
__ovly_current:
	.space	16

/*
 * __ovly_return - stub for returning from overlay functions.
 *
 * On entry the four slots of $lr are:
 *   __ovly_return, prev ovl index, caller return addr, undefined.
 *
 * Load the previous overlay and jump to the caller return address.
 * Updates __ovly_current.
 *
 * Reviewer notes:
 *   - save1 to save3 (and save4 under OVLY_IRQ_SAVE) are stored below
 *     $sp here but never written on the fast path, so they need no
 *     reload before "bi target".  The do_load path overwrites them and
 *     reloads them from these slots.
 *   - tab1 is _ovly_table - 16 and off1 is ovl * 16, so overlay index 1
 *     addresses the first 16-byte table entry.
 */
	.align	4
	.global	__ovly_return
	.type	__ovly_return, @function
__ovly_return:
	ila	tab1, _ovly_table - 16				# 0,2	0
/* Shift word 1 of $lr (previous overlay index) into word 0.  */
	shlqbyi	ovl, $lr, 4					# 1,4	0
#nop
/* Shift word 2 of $lr (caller return address) into word 0.  */
	shlqbyi	target, $lr, 8					# 1,4	1
#nop; lnop
#nop; lnop
	shli	off1, ovl, 4					# 0,4	4
#lnop
#nop
/* Hint that the branch at ovly_ret9 goes to "target".  */
	hbr	ovly_ret9, target				# 1,15	5
#nop; lnop
#nop; lnop
#nop
/* Fetch the whole _ovly_table entry for the overlay to return to.  */
	lqx	vma, tab1, off1					# 1,6	8
#ifdef OVLY_IRQ_SAVE
	nop
	stqd	save4, -64($sp)					# 1,6	9
#else
#nop; lnop
#endif
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop
/* Rotate the entry so that its second word (.size) is in word 0.  */
	rotqbyi	size1, vma, 4					# 1,4	14
#nop
	stqd	save3, -48($sp)					# 1,6	15
#nop
	stqd	save2, -32($sp)					# 1,6	16
#nop
	stqd	save1, -16($sp)					# 1,6	17
/* Low bit of .size is the "loaded" flag maintained by do_load.  */
	andi	present1, size1, 1				# 0,2	18
	stqr	ovl, __ovly_current				# 1,6	18
#nop; lnop
#nop
/* Flag clear: go and load the overlay; do_load ends with "bi target".  */
	brz	present1, do_load				# 1,4	20
ovly_ret9:
#nop
	bi	target						# 1,4	21

/*
 * __ovly_load - copy an overlay partition to local store.
 *
 * On entry $75 points to a word consisting of the overlay index in
 * the top 14 bits, and the target address in the bottom 18 bits.
 *
 * Sets up $lr to return via __ovly_return. If $lr is already set
 * to return via __ovly_return, don't change it. In that case we
 * have a tail call from one overlay function to another.
 * Updates __ovly_current.
 *
 * Reviewer notes:
 *   - The $75 convention above is what the OVL_STUB_SIZE == 8 variant
 *     consumes (lqd/rotqby/rotmi).  The other variant uses "ovl" ($78)
 *     and "target" ($79) without loading them, so they are presumably
 *     set by the 16-byte stub before it branches here.  TODO confirm
 *     against the stub generator.
 *   - The new $lr is assembled as
 *       rv4 = { __ovly_return, cur, 0, 0 }   shufb with __rv_pattern
 *       rv6 = { 0, 0, $lr[0], $lr[1] }       $lr shifted right 8 bytes
 *       rv7 = rv4 | rv6
 *     where "cur" is word 0 of __ovly_current as read before it is
 *     overwritten with "ovl".  rv5 is all ones when word 0 of $lr
 *     already equals __ovly_return, and selb then keeps $lr as it is.
 *   - save1 to save3 are reloaded before the final branches.  do_load
 *     uses them as scratch again and reloads them from the same stack
 *     slots itself.
 */
	.align	3
	.global	__ovly_load
	.type	__ovly_load, @function
__ovly_load:
#if OVL_STUB_SIZE == 8
########
#nop
/* Load the quadword holding the parameter word, then rotate by the
   address so that the addressed word lands in word 0.  */
	lqd	target, 0(parm)					# 1,6	-11
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop
	rotqby	target, target, parm				# 1,4	-5
/* parm is dead from here on: tab2 aliases the same register.  */
	ila	tab2, _ovly_table - 16				# 0,2	-4
	stqd	save3, -48($sp)					# 1,6	-4
#nop
	stqd	save2, -32($sp)					# 1,6	-3
#nop
	stqd	save1, -16($sp)					# 1,6	-2
/* Overlay index = parameter word shifted right by 18 bits.  */
	rotmi	ovl, target, -18				# 0,4	-1
	hbr	ovly_load9, target				# 1,15	-1
	ila	rv1, __ovly_return				# 0,2	0
#lnop
#nop; lnop
#nop
	lqr	cur, __ovly_current				# 1,6	2
	shli	off2, ovl, 4					# 0,4	3
	stqr	ovl, __ovly_current				# 1,6	3
	ceq	rv2, $lr, rv1					# 0,2	4
	lqr	rv3, __rv_pattern				# 1,6	4
#nop; lnop
#nop; lnop
#nop
	lqx	vma, tab2, off2					# 1,6	7
########
#else /* OVL_STUB_SIZE == 16 */
########
	ila	tab2, _ovly_table - 16				# 0,2	0
	stqd	save3, -48($sp)					# 1,6	0
	ila	rv1, __ovly_return				# 0,2	1
	stqd	save2, -32($sp)					# 1,6	1
	shli	off2, ovl, 4					# 0,4	2
	lqr	cur, __ovly_current				# 1,6	2
	nop
	stqr	ovl, __ovly_current				# 1,6	3
	ceq	rv2, $lr, rv1					# 0,2	4
	lqr	rv3, __rv_pattern				# 1,6	4
#nop
	hbr	ovly_load9, target				# 1,15	5
#nop
	lqx	vma, tab2, off2					# 1,6	6
#nop
	stqd	save1, -16($sp)					# 1,6	7
########
#endif

/* Common tail for both stub sizes: build the new $lr (see the notes
   in the header above) and test the "loaded" flag of the entry.  */
#nop; lnop
#nop; lnop
#nop
	shufb	rv4, rv1, cur, rv3				# 1,4	10
#nop
	fsmb	rv5, rv2					# 1,4	11
#nop
	rotqmbyi rv6, $lr, -8					# 1,4	12
#nop
	rotqbyi	size2, vma, 4					# 1,4	13
#nop
	lqd	save3, -48($sp)					# 1,6	14
#nop; lnop
	or	rv7, rv4, rv6					# 0,2	16
	lqd	save2, -32($sp)					# 1,6	16
	andi	present2, size2, 1				# 0,2	17
#ifdef OVLY_IRQ_SAVE
	stqd	save4, -64($sp)					# 1,6	17
#else
	lnop							# 1,0	17
#endif
	selb	$lr, rv7, $lr, rv5				# 0,2	18
	lqd	save1, -16($sp)					# 1,6	18
#nop
	brz	present2, do_load				# 1,4	19
ovly_load9:
#nop
	bi	target						# 1,4	20

/* If we get here, we are about to load a new overlay.
 * "vma" contains the relevant entry from _ovly_table[].
 *	extern struct {
 *		u32 vma;
 *		u32 size;
 *		u32 file_offset;
 *		u32 buf;
 *	} _ovly_table[];
 *
 * Reviewer notes:
 *   - Reached from both __ovly_return and __ovly_load with "vma" and
 *     "target" live and save1 to save3 already stored below $sp.
 *   - __ovly_load_event is a global label on the same address.
 *     NOTE(review): presumably a hook for debuggers; confirm.
 *   - Under OVLY_IRQ_SAVE the machine status is read into irq_stat
 *     and "bid" branches to the next instruction with interrupts
 *     disabled.  The matching "binze" near the end re-enables them
 *     only if bit 0 of the saved status was set.
 */
	.align	3
	.global	__ovly_load_event
	.type	__ovly_load_event, @function
__ovly_load_event:
do_load:
#ifdef OVLY_IRQ_SAVE
	ila	irqtmp1, do_load10				# 0,2	-5
	rotqbyi	sz, vma, 8					# 1,4	-5
#nop
	rdch	irq_stat, $SPU_RdMachStat			# 1,6	-4
#nop
	bid	irqtmp1						# 1,4	-3
do_load10:
	nop
#else
#nop
/* sz starts out holding the .file_offset word of the entry.  */
	rotqbyi	sz, vma, 8					# 1,4	0
#endif
/* osize = .size word of the entry: bytes still to transfer.  */
	rotqbyi	osize, vma, 4					# 1,4	1
#nop
/* ea64 = quadword at _EAR_; words 0 and 1 are used as the high and
   low halves of a 64-bit effective address below.  */
	lqa	ea64, _EAR_					# 1,6	2
#nop
	lqr	cgshuf, __cg_pattern				# 1,6	3

/* We could predict the branch at the end of this loop by adding a few
   instructions, and there are plenty of free cycles to do so without
   impacting loop execution time. However, it doesn't make a great
   deal of sense since we need to wait for the dma to complete anyway. */

/* Reviewer notes on the loop:
   - Each pass does a 64-bit "ea64 += sz" (cg, shufb and addx propagate
     the carry from word 1 into word 0) and then queues one MFC get.
     On the first pass sz is the file offset; on later passes it is
     the size of the chunk queued by the previous pass.
   - The chunk size is osize when osize <= MFC_MAX_DMA_SIZE (cgt is a
     signed compare), otherwise MFC_MAX_DMA_SIZE.
   - After queuing, osize -= sz and word 0 of vma (the local store
     address) += sz.  The loop ends when osize reaches zero.  */
__ovly_xfer_loop:
#nop
	rotqmbyi off64, sz, -4					# 1,4	4
#nop; lnop
#nop; lnop
#nop; lnop
	cg	cgbits, ea64, off64				# 0,2	8
#lnop
#nop; lnop
#nop
	shufb	add64, cgbits, cgbits, cgshuf			# 1,4	10
#nop; lnop
#nop; lnop
#nop; lnop
	addx	add64, ea64, off64				# 0,2	14
#lnop
	ila	maxsize, MFC_MAX_DMA_SIZE			# 0,2	15
	lnop
	ori	ea64, add64, 0					# 0,2	16
	rotqbyi	ealo, add64, 4					# 1,4	16
	cgt	cmp, osize, maxsize				# 0,2	17
	wrch	$MFC_LSA, vma					# 1,6	17
#nop; lnop
	selb	sz, osize, maxsize, cmp				# 0,2	19
	wrch	$MFC_EAH, ea64					# 1,6	19
	ila	tagid, MFC_TAG_ID				# 0,2	20
	wrch	$MFC_EAL, ealo					# 1,6	20
	ila	cmd, MFC_GET_CMD				# 0,2	21
	wrch	$MFC_Size, sz					# 1,6	21
	sf	osize, sz, osize				# 0,2	22
	wrch	$MFC_TagId, tagid				# 1,6	22
	a	vma, vma, sz					# 0,2	23
	wrch	$MFC_Cmd, cmd					# 1,6	23
#nop
	brnz	osize, __ovly_xfer_loop				# 1,4	24

/* Now update our data structures while waiting for DMA to complete.
   Low bit of .size needs to be cleared on the _ovly_table entry
   corresponding to the evicted overlay, and set on the entry for
   the newly loaded overlay. Note that no overlay may in fact be
   evicted as _ovly_buf_table[] starts with all zeros. Don't zap
   .size entry for zero index! Also of course update the
   _ovly_buf_table entry. */

/* Reviewer notes:
   - The overlay index is reloaded from __ovly_current because the
     register behind "ovl" was reused as cgshuf in the loop above.
   - fsmbi 0x100 followed by andi 1 leaves pbit = { 0, 1, 0, 0 }, that
     is, bit 0 of the .size word of a table entry.
   - buf3 is the .buf word of the entry.  tab4 is _ovly_buf_table - 4
     and off4 is buf * 4, so .buf indexes 32-bit slots starting at 1.
   - lqx, cwx, shufb and stqd perform a read-modify-write of the one
     32-bit slot inside its quadword; rotqby first extracts the old
     occupant of the slot into word 0 of oldovl.  */
#nop
	lqr	newovl, __ovly_current				# 1,6	25
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop; lnop
	shli	off3, newovl, 4					# 0,4	31
#lnop
	ila	tab3, _ovly_table - 16				# 0,2	32
#lnop
#nop
	fsmbi	pbyte, 0x100					# 1,4	33
#nop; lnop
#nop
	lqx	vma, tab3, off3					# 1,6	35
#nop; lnop
	andi	pbit, pbyte, 1					# 0,2	37
	lnop
#nop; lnop
#nop; lnop
#nop; lnop
	or	newvma, vma, pbit				# 0,2	41
	rotqbyi	buf3, vma, 12					# 1,4	41
#nop; lnop
#nop
	stqx	newvma, tab3, off3				# 1,6	43
#nop; lnop
	shli	off4, buf3, 2					# 1,4	45
#lnop
	ila	tab4, _ovly_buf_table - 4			# 0,2	46
#lnop
#nop; lnop
#nop; lnop
#nop
	lqx	map, tab4, off4					# 1,6	49
#nop
	cwx	genwi, tab4, off4				# 1,4	50
	a	addr4, tab4, off4				# 0,2	51
#lnop
#nop; lnop
#nop; lnop
#nop; lnop
#nop
	rotqby	oldovl, map, addr4				# 1,4	55
#nop
	shufb	newmap, newovl, map, genwi			# 0,4	56
/* ila takes the mask directly for tags below 16; higher tags need the
   bit placed in the upper halfword with ilhu.  */
#if MFC_TAG_ID < 16
	ila	newmask, 1 << MFC_TAG_ID			# 0,2	57
#else
	ilhu	newmask, 1 << (MFC_TAG_ID - 16)			# 0,2	57
#endif
#lnop
#nop; lnop
#nop; lnop
	stqd	newmap, 0(addr4)				# 1,6	60

/* Save app's tagmask, wait for DMA complete, restore mask. */
	ila	tagstat, MFC_TAG_UPDATE_ALL			# 0,2	61
	rdch	oldmask, $MFC_RdTagMask				# 1,6	61
#nop
	wrch	$MFC_WrTagMask, newmask				# 1,6	62
#nop
	wrch	$MFC_WrTagUpdate, tagstat			# 1,6	63
#nop
	rdch	tagstat, $MFC_RdTagStat				# 1,6	64
#nop
	sync							# 1,4	65
/* Any hint prior to the sync is lost. A hint here allows the branch
   to complete 15 cycles after the hint. With no hint the branch will
   take 18 or 19 cycles. */

/* Reviewer notes on the tail:
   - zovl becomes all ones in every word when oldovl is zero (ceqi then
     fsm), and "andc pbit, pbit, zovl" then zeroes pbit, so the
     read-modify-write below stores the quadword back unchanged.
   - Each lqd of save1 to save3 comes after the last use of every alias
     of that register, which is why the reloads are interleaved.  */
	ila	tab5, _ovly_table - 16				# 0,2	66
	hbr	do_load99, target				# 1,15	66
	shli	off5, oldovl, 4					# 0,4	67
	wrch	$MFC_WrTagMask, oldmask				# 1,6	67
	ceqi	zovl, oldovl, 0					# 0,2	68
#lnop
#nop; lnop
#nop
	fsm	zovl, zovl					# 1,4	70
#nop
	lqx	oldvma, tab5, off5				# 1,6	71
#nop
	lqd	save3, -48($sp)					# 1,6	72
#nop; lnop
	andc	pbit, pbit, zovl				# 0,2	74
	lqd	save2, -32($sp)					# 1,6	74
#ifdef OVLY_IRQ_SAVE
	ila	irqtmp2, do_load90				# 0,2	75
#lnop
	andi	irq_stat, irq_stat, 1				# 0,2	76
#lnop
#else
#nop; lnop
#nop; lnop
#endif
/* Clear the "loaded" flag of the evicted overlay's table entry.  */
	andc	oldvma, oldvma, pbit				# 0,2	77
	lqd	save1, -16($sp)					# 1,6	77
	nop							# 0,0	78
#lnop
#nop
	stqx	oldvma, tab5, off5				# 1,6	79
#nop
#ifdef OVLY_IRQ_SAVE
/* Branch to do_load90, enabling interrupts, only if they were enabled
   on entry; otherwise fall through to the same place.  */
	binze	irq_stat, irqtmp2				# 1,4	80
do_load90:
#nop
	lqd	save4, -64($sp)					# 1,6	84
#else
#nop; lnop
#endif

/* Global label on a nop executed just before the final branch.
   NOTE(review): presumably a breakpoint location for debuggers once
   the overlay is resident; confirm.  */
	.global	_ovly_debug_event
	.type	_ovly_debug_event, @function
_ovly_debug_event:
	nop
/* Branch to target address. */
do_load99:
	bi	target						# 1,4	81/85

/* The size recorded for __ovly_load spans everything from its label
   to here, including the do_load path.  */
	.size	__ovly_load, . - __ovly_load