# libFLAC - Free Lossless Audio Codec library
# Copyright (C) 2004,2005,2006,2007  Josh Coalson
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# - Neither the name of the Xiph.org Foundation nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.text
	.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16, @function

.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
.type _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8, @function

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
#	r3: residual[]
#	r4: data_len
#	r5: qlp_coeff[]
#	r6: order
#	r7: lp_quantization
#	r8: data[]

# see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
# this is a PowerPC/AltiVec assembly version which requires bps<=16 (or actual
# bps<=15 for mid-side coding, since that uses an extra bit)

# this should be fast; the inner loop is unrolled (it takes no more than
# 3*(order%4) instructions, all of which are arithmetic), and all of the
# coefficients and all relevant history stay in registers, so the outer loop
# has only one load from memory (the residual)

# I have not yet run this through simg4, so there may be some avoidable stalls,
# and there may be a somewhat more clever way to do the outer loop

# the branch mechanism may prevent dynamic loading; I still need to examine
# this issue, and there may be a more elegant method

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31 # for quadword-aligned stack data

	slwi r6,r6,2 # adjust for word size
	slwi r4,r4,2
	add r4,r4,r8 # r4 = data+data_len

	mfspr r0,256 # cache old vrsave
	addis r31,0,0xffff
	ori r31,r31,0xfc00
	mtspr 256,r31 # declare VRs in vrsave

	cmplw cr0,r8,r4 # i<data_len

# sum >> lp_quantization
	lvewx v21,0,r3 # v21[n]: *residual
	vperm v21,v21,v21,v18 # v21[3]: *residual
	vaddsws v20,v21,v20 # v20[3]: *residual + (sum >> lp_quantization)
	vsldoi v18,v18,v18,4 # increment shift vector
	vperm v21,v20,v20,v17 # v21[n]: shift for storage
	vsldoi v17,v17,v17,12 # increment shift vector
	stvewx v21,0,r8
	vsldoi v20,v20,v20,12
	vsldoi v8,v8,v20,4 # insert value onto history
	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 # i<data_len

# sum >> lp_quantization
	lvewx v9,0,r3 # v9[n]: *residual
	vperm v9,v9,v9,v6 # v9[3]: *residual
	vaddsws v8,v9,v8 # v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4 # increment shift vector
	vperm v9,v8,v8,v5 # v9[n]: shift for storage
	vsldoi v5,v5,v5,12 # increment shift vector
	stvewx v9,0,r8
	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4 # insert value onto history
	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4 # i<data_len
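
# For reference, the scalar computation this routine vectorizes is sketched
# below in C.  This is a simplified paraphrase of FLAC__lpc_restore_signal()
# in src/libFLAC/lpc.c (see the note above), not a copy of it; the helper
# name and the plain int types are illustrative only.  The assembly keeps the
# running sum in 32-bit vector elements (vaddsws/vsraw), hence the bps<=16
# requirement noted above.
#
#   /* data[-order..-1] must already contain the warm-up samples */
#   static void lpc_restore_signal_sketch(
#       const int residual[], unsigned data_len,
#       const int qlp_coeff[], unsigned order,
#       int lp_quantization, int data[])
#   {
#       unsigned i, j;
#       for (i = 0; i < data_len; i++) {
#           int sum = 0;
#           for (j = 0; j < order; j++)
#               sum += qlp_coeff[j] * data[(int)i - (int)(j + 1)];
#           data[i] = residual[i] + (sum >> lp_quantization);
#       }
#   }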