diff options
Diffstat (limited to 'common/arm/ih264_resi_trans_a9.s')
-rwxr-xr-x | common/arm/ih264_resi_trans_a9.s | 604 |
1 files changed, 0 insertions, 604 deletions
@ Provenance: common/arm/ih264_resi_trans_a9.s, recovered from the deletion diff
@   diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s
@   deleted file mode 100755, index 08821f5..0000000, hunk @@ -1,604 +0,0 @@
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@/**
@*******************************************************************************
@* @file
@*  ih264_resi_trans_a9.s
@*
@* @brief
@*  Contains function definitions for residual computation and forward
@*  transform
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*  ih264_resi_trans_4x4_a9
@*  ih264_resi_trans_8x8_a9
@*
@* @remarks
@*  Syntax : GNU as, ARM (A32) state, NEON.
@*  ABI    : both routines take r0-r3 plus two words on the stack, and save
@*           and restore r4-r12, lr and d8-d15 around their bodies.
@*
@*******************************************************************************
@*/

.text
.p2align 2

@ Bytes occupied on the stack by the prologue of both routines. The two
@ stack-passed arguments sit immediately above this save area.
.equ RESI_TRANS_GPR_SAVE,   40          @ push  {r4-r12, lr} : 10 regs * 4 bytes
.equ RESI_TRANS_NEON_SAVE,  64          @ vpush {d8-d15}     :  8 regs * 8 bytes
.equ RESI_TRANS_STACK_ARGS, (RESI_TRANS_GPR_SAVE + RESI_TRANS_NEON_SAVE)

@*****************************************************************************
@*
@* Function Name : ih264_resi_trans_4x4_a9
@* Description   : Computes the 4x4 residual (src - pred), applies the 4x4
@*                 forward transform along both dimensions and multiplies
@*                 every coefficient by the matching entry of
@*                 g_scal_coff_h264_4x4. Products are stored as 32-bit values.
@*
@* Arguments     :
@*  r0      : pointer to src buffer  (8-bit samples)
@*  r1      : pointer to pred buffer (8-bit samples)
@*  r2      : pointer to dst buffer  (32-bit results, 4 per row)
@*  r3      : src_stride, in bytes
@*  [sp]    : pred_stride, in bytes
@*  [sp+4]  : dst_stride, in 32-bit elements (scaled by 4 before use)
@*
@* Values Returned : NONE
@*
@* Register Usage : r4-r6, q0-q7, q10, q12-q15
@*                  (r4-r12, lr and d8-d15 are saved/restored)
@* Stack Usage    : 104 bytes
@*
@* Known Limitations
@* \Assumptions   :
@*  NOTE(review): each row is fetched with an 8-byte vld1 although only the
@*  low 4 bytes are used, so 4 bytes past every src/pred row are read.
@*  Confirm that callers guarantee those bytes are readable.
@*  NOTE(review): the "- 8" bias in the address literal relies on PC reading
@*  as (instruction address + 8), i.e. on this code being assembled for ARM
@*  state, not Thumb.
@*
@* Revision History :
@*  DD MM YYYY   Author(s)   Changes
@*  30 12 2009   100633      First version
@*
@*****************************************************************************

    .global ih264_resi_trans_4x4_a9
    .extern g_scal_coff_h264_4x4

@ PC-relative offset of the scaling table, resolved at label 4x4lbl below.
g_scal_coff_h264_4x4_addr:
    .long g_scal_coff_h264_4x4 - 4x4lbl - 8

ih264_resi_trans_4x4_a9:

    push        {r4-r12, lr}            @ save core registers
    vpush       {d8-d15}                @ q4-q7 are written below and must be
                                        @ preserved for the caller

    add         r6, sp, #RESI_TRANS_STACK_ARGS  @ r6 -> first stack argument
    ldmfd       r6, {r4-r5}             @ r4 = pred_stride, r5 = dst_stride

    @---------------- residual: src - pred, widened to 16 bit ----------------
    @ Each vld1 fetches 8 bytes; only the low 4 lanes (one 4-pixel row) are
    @ consumed by the transform. Loads and subtracts are interleaved.
    vld1.u8     d30, [r0], r3           @ src  row 0
    vld1.u8     d31, [r1], r4           @ pred row 0

    vld1.u8     d28, [r0], r3           @ src  row 1
    vld1.u8     d29, [r1], r4           @ pred row 1

    vld1.u8     d26, [r0], r3           @ src  row 2
    vsubl.u8    q0, d30, d31            @ d0 = residual row 0

    vld1.u8     d27, [r1], r4           @ pred row 2
    vsubl.u8    q1, d28, d29            @ d2 = residual row 1

    vld1.u8     d24, [r0], r3           @ src  row 3

    vld1.u8     d25, [r1], r4           @ pred row 3
    vsubl.u8    q2, d26, d27            @ d4 = residual row 2

    lsl         r5, r5, #2              @ dst stride in bytes (32-bit elements)
    ldr         r6, g_scal_coff_h264_4x4_addr
4x4lbl:
    add         r6, r6, pc              @ r6 = &g_scal_coff_h264_4x4

    vsubl.u8    q3, d24, d25            @ d6 = residual row 3

    @ d0, d2, d4, d6 now hold residual rows 0..3. The odd d registers
    @ (d1, d3, d5, d7) are reused as scratch from here on.

    @---------------- transpose 4x4 ------------------------------------------
    @ a b c d  -- d0          a e i m  -- d0 (x4)
    @ e f g h  -- d2   ==>    b f j n  -- d2 (x5)
    @ i j k l  -- d4          c g k o  -- d4 (x6)
    @ m n o p  -- d6          d h l p  -- d6 (x7)
    vtrn.16     d0, d2
    vld1.s16    {q10}, [r6]!            @ scale factors 0-7 (d20 row 0, d21 row 1)
    vtrn.16     d4, d6
    vtrn.32     d0, d4
    vtrn.32     d2, d6

    @---------------- first (horizontal) pass --------------------------------
    vsub.s16    d5, d2, d4              @ x2 = x5 - x6
    vsub.s16    d7, d0, d6              @ x3 = x4 - x7

    vadd.s16    d3, d2, d4              @ x1 = x5 + x6
    vadd.s16    d1, d0, d6              @ x0 = x4 + x7

    vshl.s16    d31, d7, #1             @ 2 * x3
    vshl.s16    d30, d5, #1             @ 2 * x2

    vadd.s16    d0, d1, d3              @ x0 + x1
    vsub.s16    d4, d1, d3              @ x0 - x1

    vadd.s16    d2, d31, d5             @ 2 * x3 + x2
    vsub.s16    d6, d7, d30             @ x3 - 2 * x2

    @---------------- transpose back for the second pass ---------------------
    vtrn.16     d0, d2
    vtrn.16     d4, d6

    vtrn.32     d0, d4
    vtrn.32     d2, d6

    @---------------- second (vertical) pass, fused with scaling -------------
    vadd.s16    d1, d0, d6              @ x0 = x4 + x7
    vadd.s16    d3, d2, d4              @ x1 = x5 + x6
    vsub.s16    d7, d0, d6              @ x3 = x4 - x7
    vsub.s16    d5, d2, d4              @ x2 = x5 - x6

    @ The butterflies stay in 16 bit; widening to 32 bit happens in the
    @ vmull that applies the scale factor.

    vadd.s16    d24, d3, d1             @ row 0 = x0 + x1
    vsub.s16    d28, d1, d3             @ row 2 = x0 - x1

    vshl.s16    d0, d7, #1              @ 2 * x3
    vmull.s16   q4, d24, d20            @ row 0 * scale row 0

    vshl.s16    d2, d5, #1              @ 2 * x2

    vadd.s16    d26, d0, d5             @ row 1 = 2 * x3 + x2
    vmull.s16   q5, d26, d21            @ row 1 * scale row 1

    vst1.s32    {q4}, [r2], r5          @ store row 0, advance dst by one row

    vld1.s16    {q10}, [r6]             @ scale factors 8-15 (d20 row 2, d21 row 3)

    vsub.s16    d30, d7, d2             @ row 3 = x3 - 2 * x2

    vmull.s16   q6, d28, d20            @ row 2 * scale row 2
    vst1.s32    {q5}, [r2], r5          @ store row 1

    vmull.s16   q7, d30, d21            @ row 3 * scale row 3

    vst1.s32    {q6}, [r2], r5          @ store row 2
    vst1.s32    {q7}, [r2]              @ store row 3

    vpop        {d8-d15}                @ restore caller's q4-q7
    pop         {r4-r12, pc}            @ restore core registers and return




@*****************************************************************************
@*
@* Function Name : ih264_resi_trans_8x8_a9
@* Description   : Computes the 8x8 residual (src - pred), applies the 8x8
@*                 forward transform along both dimensions and multiplies
@*                 every coefficient by an entry of g_scal_coff_h264_8x8.
@*                 Products are stored as 32-bit values.
@*
@* Arguments     :
@*  r0      : pointer to src buffer  (8-bit samples)
@*  r1      : pointer to pred buffer (8-bit samples)
@*  r2      : pointer to dst buffer  (32-bit results, 8 per row)
@*  r3      : src_stride, in bytes
@*  [sp]    : pred_stride, in bytes
@*  [sp+4]  : dst_stride, in 32-bit elements (scaled by 4 before use)
@*
@* Values Returned : NONE
@*
@* Register Usage : r4-r6, q0-q15
@*                  (r4-r12, lr and d8-d15 are saved/restored)
@* Stack Usage    : 104 bytes
@*
@* Known Limitations
@* \Assumptions   :
@*  Only 16 scale factors are read from the table: 4 per row for 4 rows. Each
@*  group of 4 is applied to both halves of a row, and rows 4-7 reuse the
@*  factors of rows 0-3.
@*  NOTE(review): this presumes the table has that repeating layout; verify
@*  against the definition of g_scal_coff_h264_8x8.
@*  NOTE(review): the "- 8" bias in the address literal relies on PC reading
@*  as (instruction address + 8), i.e. on this code being assembled for ARM
@*  state, not Thumb.
@*
@* Revision History :
@*  DD MM YYYY   Author(s)   Changes
@*  30 12 2009   100633      First version
@*
@*****************************************************************************

    .global ih264_resi_trans_8x8_a9
    .extern g_scal_coff_h264_8x8

@ PC-relative offset of the scaling table, resolved at label 8x8lbl below.
g_scal_coff_h264_8x8_addr:
    .long g_scal_coff_h264_8x8 - 8x8lbl - 8


ih264_resi_trans_8x8_a9:

    push        {r4-r12, lr}            @ save core registers
    vpush       {d8-d15}                @ q4-q7 are used throughout and must be
                                        @ preserved for the caller

    add         r6, sp, #RESI_TRANS_STACK_ARGS  @ r6 -> first stack argument
    ldmfd       r6, {r4-r5}             @ r4 = pred_stride, r5 = dst_stride

    @---------------- residual: src - pred, widened to 16 bit ----------------
    vld1.u8     d30, [r0], r3           @ src  row 0 (8 pixels)
    vld1.u8     d31, [r1], r4           @ pred row 0 (8 pixels)

    vld1.u8     d28, [r0], r3           @ src  row 1
    vld1.u8     d29, [r1], r4           @ pred row 1
    vsubl.u8    q0, d30, d31            @ q0 = residual row 0

    vld1.u8     d26, [r0], r3           @ src  row 2
    vld1.u8     d27, [r1], r4           @ pred row 2
    vsubl.u8    q1, d28, d29            @ q1 = residual row 1

    vld1.u8     d24, [r0], r3           @ src  row 3
    vld1.u8     d25, [r1], r4           @ pred row 3
    vsubl.u8    q2, d26, d27            @ q2 = residual row 2

    vld1.u8     d22, [r0], r3           @ src  row 4
    vld1.u8     d23, [r1], r4           @ pred row 4
    vsubl.u8    q3, d24, d25            @ q3 = residual row 3

    vld1.u8     d20, [r0], r3           @ src  row 5
    vld1.u8     d21, [r1], r4           @ pred row 5
    vsubl.u8    q4, d22, d23            @ q4 = residual row 4

    vld1.u8     d18, [r0], r3           @ src  row 6
    vld1.u8     d19, [r1], r4           @ pred row 6
    vsubl.u8    q5, d20, d21            @ q5 = residual row 5

    vld1.u8     d16, [r0], r3           @ src  row 7
    vld1.u8     d17, [r1], r4           @ pred row 7
    vsubl.u8    q6, d18, d19            @ q6 = residual row 6

    lsl         r5, r5, #2              @ dst stride in bytes (32-bit elements)


    vsubl.u8    q7, d16, d17            @ q7 = residual row 7

    @ q0..q7 now hold residual rows 0..7

    @---------------- transpose 8x8 ------------------------------------------
    @ 2x2 blocks of 16-bit elements
    vtrn.16     q0, q1
    vtrn.16     q2, q3
    vtrn.16     q4, q5
    vtrn.16     q6, q7

    @ 2x2 blocks of 32-bit elements (completes each 4x4 quadrant)
    vtrn.32     q0, q2
    vtrn.32     q1, q3

    vtrn.32     q4, q6
    vtrn.32     q5, q7

    @ exchange the off-diagonal 4x4 quadrants
    vswp        d1, d8
    vswp        d7, d14
    vswp        d3, d10
    vswp        d5, d12

    @---------------- first (horizontal) pass --------------------------------
    @ r0..r7 below denote q0..q7 after the transpose
    vadd.s16    q8, q0, q7              @ a0 = r0 + r7
    vadd.s16    q9, q1, q6              @ a1 = r1 + r6
    vadd.s16    q10, q2, q5             @ a2 = r2 + r5
    vadd.s16    q11, q3, q4             @ a3 = r3 + r4

    vsub.s16    q12, q0, q7             @ b0 = r0 - r7
    vsub.s16    q13, q1, q6             @ b1 = r1 - r6
    vsub.s16    q15, q3, q4             @ b3 = r3 - r4
    vsub.s16    q14, q2, q5             @ b2 = r2 - r5

    vadd.s16    q1, q8, q11             @ a4 = a0 + a3
    vadd.s16    q3, q9, q10             @ a5 = a1 + a2
    vsub.s16    q7, q9, q10             @ a7 = a1 - a2
    vsub.s16    q5, q8, q11             @ a6 = a0 - a3

    ldr         r6, g_scal_coff_h264_8x8_addr
8x8lbl:
    add         r6, r6, pc              @ r6 = &g_scal_coff_h264_8x8

    vadd.s16    q0, q1, q3              @ res[0] = a4 + a5
    vshr.s16    q8, q7, #1              @ a7 >> 1

    vsub.s16    q4, q1, q3              @ res[4] = a4 - a5

    vadd.s16    q2, q5, q8              @ res[2] = a6 + (a7 >> 1)


    vshr.s16    q9, q5, #1              @ a6 >> 1
    vsub.s16    q6, q9, q7              @ res[6] = (a6 >> 1) - a7

    @ q0, q2, q4, q6 hold the even results and must not be modified.
    @ q1, q3, q5, q7 will receive the odd results; q8-q15 are scratch.

    vshr.s16    q1, q12, #1             @ b0 >> 1
    vshr.s16    q3, q13, #1             @ b1 >> 1
    vshr.s16    q5, q14, #1             @ b2 >> 1
    vshr.s16    q7, q15, #1             @ b3 >> 1

    vadd.s16    q8, q1, q12             @ (b0 >> 1) + b0
    vadd.s16    q9, q3, q13             @ (b1 >> 1) + b1
    vadd.s16    q10, q5, q14            @ (b2 >> 1) + b2
    vadd.s16    q11, q7, q15            @ (b3 >> 1) + b3

    vadd.s16    q1, q14, q8             @  b2 + ((b0 >> 1) + b0)
    vsub.s16    q5, q15, q9             @  b3 - ((b1 >> 1) + b1)
    vadd.s16    q3, q15, q10            @  b3 + ((b2 >> 1) + b2)
    vsub.s16    q7, q11, q14            @ -b2 + ((b3 >> 1) + b3)

    vadd.s16    q8, q13, q1             @ b4 = b1 + b2 + ((b0 >> 1) + b0)
    vsub.s16    q9, q12, q3             @ b5 = b0 - b3 - ((b2 >> 1) + b2)
    vadd.s16    q10, q12, q5            @ b6 = b0 + b3 - ((b1 >> 1) + b1)
    vadd.s16    q11, q13, q7            @ b7 = b1 - b2 + ((b3 >> 1) + b3)

    vshr.s16    q15, q8, #2             @ b4 >> 2
    vshr.s16    q14, q9, #2             @ b5 >> 2
    vshr.s16    q13, q10, #2            @ b6 >> 2
    vshr.s16    q12, q11, #2            @ b7 >> 2


    vadd.s16    q3, q9, q13             @ res[3] = b5 + (b6 >> 2)
    vsub.s16    q5, q10, q14            @ res[5] = b6 - (b5 >> 2)
    vadd.s16    q1, q8, q12             @ res[1] = b4 + (b7 >> 2)
    vsub.s16    q7, q15, q11            @ res[7] = (b4 >> 2) - b7

    @------------ first pass done: results in q0-q7, q8-q15 free -------------

    @---------------- transpose 8x8 for the second pass ----------------------
    vtrn.16     q0, q1
    vtrn.16     q2, q3
    vtrn.16     q4, q5
    vtrn.16     q6, q7

    vtrn.32     q0, q2
    vtrn.32     q1, q3

    vtrn.32     q4, q6
    vtrn.32     q5, q7

    vswp        d1, d8
    vswp        d3, d10
    vswp        d5, d12
    vswp        d7, d14

    @---------------- second (vertical) pass ---------------------------------
    @ same arithmetic as the first pass
    vadd.s16    q8, q0, q7              @ a0 = r0 + r7
    vadd.s16    q9, q1, q6              @ a1 = r1 + r6
    vadd.s16    q10, q2, q5             @ a2 = r2 + r5
    vadd.s16    q11, q3, q4             @ a3 = r3 + r4

    vsub.s16    q12, q0, q7             @ b0 = r0 - r7
    vsub.s16    q13, q1, q6             @ b1 = r1 - r6
    vsub.s16    q14, q2, q5             @ b2 = r2 - r5
    vsub.s16    q15, q3, q4             @ b3 = r3 - r4

    vadd.s16    q1, q8, q11             @ a4 = a0 + a3
    vadd.s16    q3, q9, q10             @ a5 = a1 + a2
    vsub.s16    q5, q8, q11             @ a6 = a0 - a3
    vsub.s16    q7, q9, q10             @ a7 = a1 - a2


    vadd.s16    q0, q1, q3              @ res[0] = a4 + a5

    vshr.s16    q8, q7, #1              @ a7 >> 1
    vadd.s16    q2, q5, q8              @ res[2] = a6 + (a7 >> 1)

    vsub.s16    q4, q1, q3              @ res[4] = a4 - a5

    vshr.s16    q9, q5, #1              @ a6 >> 1
    vsub.s16    q6, q9, q7              @ res[6] = (a6 >> 1) - a7

    @ q0, q2, q4, q6 hold the even results and must not be modified.
    @ q1, q3, q5, q7 will receive the odd results; q8-q15 are scratch.

    vshr.s16    q1, q12, #1             @ b0 >> 1
    vshr.s16    q3, q13, #1             @ b1 >> 1
    vshr.s16    q5, q14, #1             @ b2 >> 1
    vshr.s16    q7, q15, #1             @ b3 >> 1


    vadd.s16    q8, q1, q12             @ (b0 >> 1) + b0
    vadd.s16    q9, q3, q13             @ (b1 >> 1) + b1
    vadd.s16    q10, q5, q14            @ (b2 >> 1) + b2
    vadd.s16    q11, q7, q15            @ (b3 >> 1) + b3

    vadd.s16    q1, q14, q8             @  b2 + ((b0 >> 1) + b0)
    vadd.s16    q3, q15, q10            @  b3 + ((b2 >> 1) + b2)
    vsub.s16    q5, q15, q9             @  b3 - ((b1 >> 1) + b1)
    vsub.s16    q7, q11, q14            @ -b2 + ((b3 >> 1) + b3)

    vadd.s16    q8, q13, q1             @ b4 = b1 + b2 + ((b0 >> 1) + b0)
    vsub.s16    q9, q12, q3             @ b5 = b0 - b3 - ((b2 >> 1) + b2)
    vadd.s16    q10, q12, q5            @ b6 = b0 + b3 - ((b1 >> 1) + b1)
    vadd.s16    q11, q13, q7            @ b7 = b1 - b2 + ((b3 >> 1) + b3)

    vshr.s16    q15, q8, #2             @ b4 >> 2
    vshr.s16    q14, q9, #2             @ b5 >> 2
    vshr.s16    q13, q10, #2            @ b6 >> 2
    vshr.s16    q12, q11, #2            @ b7 >> 2


    @ results stay in 16 bit; widening happens in the scaling multiply
    vsub.s16    q5, q10, q14            @ res[5] = b6 - (b5 >> 2)
    vsub.s16    q7, q15, q11            @ res[7] = (b4 >> 2) - b7
    vadd.s16    q3, q9, q13             @ res[3] = b5 + (b6 >> 2)
    vadd.s16    q1, q8, q12             @ res[1] = b4 + (b7 >> 2)

    @------------ second pass done: results in q0-q7, q8-q15 free ------------

    @---------------- scaling and store --------------------------------------
    @ d28, d29, d30, d31 hold the factors for rows 0, 1, 2, 3; rows 4..7 reuse
    @ them in the same order. vmull widens each 16x16 product to 32 bit.
    vld1.s16    {q14-q15}, [r6]         @ 16 scale factors

    vmull.s16   q8, d0, d28             @ row 0, elements 0-3
    vmull.s16   q9, d28, d1             @ row 0, elements 4-7

    vmull.s16   q10, d2, d29            @ row 1, elements 0-3
    vmull.s16   q11, d29, d3            @ row 1, elements 4-7
    vmull.s16   q12, d4, d30            @ row 2, elements 0-3

    vst1.s32    {q8, q9}, [r2], r5      @ store row 0

    vmull.s16   q13, d30, d5            @ row 2, elements 4-7
    vmull.s16   q8, d6, d31             @ row 3, elements 0-3


    vst1.s32    {q10, q11}, [r2], r5    @ store row 1

    vmull.s16   q9, d31, d7             @ row 3, elements 4-7

    vst1.s32    {q12, q13}, [r2], r5    @ store row 2

    vmull.s16   q10, d8, d28            @ row 4, elements 0-3
    vmull.s16   q11, d28, d9            @ row 4, elements 4-7

    vmull.s16   q12, d10, d29           @ row 5, elements 0-3


    vst1.s32    {q8, q9}, [r2], r5      @ store row 3

    vmull.s16   q13, d29, d11           @ row 5, elements 4-7

    vmull.s16   q8, d12, d30            @ row 6, elements 0-3

    vst1.s32    {q10, q11}, [r2], r5    @ store row 4

    vmull.s16   q9, d30, d13            @ row 6, elements 4-7
    vmull.s16   q10, d14, d31           @ row 7, elements 0-3


    vst1.s32    {q12, q13}, [r2], r5    @ store row 5

    vmull.s16   q11, d31, d15           @ row 7, elements 4-7

    vst1.s32    {q8, q9}, [r2], r5      @ store row 6
    vst1.s32    {q10, q11}, [r2], r5    @ store row 7

    @---------------- done writing -------------------------------------------

    vpop        {d8-d15}                @ restore caller's q4-q7
    pop         {r4-r12, pc}            @ restore core registers and return