@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@/*******************************************************************************
@* @file
@*  ihevc_deblk_luma_vert.s
@*
@* @brief
@*  contains the function definition for luma deblocking across a vertical
@*  edge. the function is hand written arm assembly using neon instructions
@*  (gnu assembler syntax, '@' starts a comment)
@*
@* @author
@*  anand s
@*
@* @par list of functions:
@*  ihevc_deblk_luma_vert_a9q
@*
@* @remarks
@*  none
@*
@*******************************************************************************/

    .text
    .align      4

    .extern     gai4_ihevc_tc_table
    .extern     gai4_ihevc_beta_table

    .globl      ihevc_deblk_luma_vert_a9q

@ pc-relative offsets of the two lookup tables. in arm state pc reads as the
@ address of the current instruction + 8, hence the "- 8": adding pc at
@ ulbl1 / ulbl2 yields the absolute table address.
gai4_ihevc_tc_table_addr:
    .long       gai4_ihevc_tc_table - ulbl1 - 8

gai4_ihevc_beta_table_addr:
    .long       gai4_ihevc_beta_table - ulbl2 - 8

@/**
@*******************************************************************************
@* ihevc_deblk_luma_vert_a9q
@*
@* filters 4 rows of luma pixels on both sides of a vertical edge. for each
@* row the pixels pu1_src[-4] .. pu1_src[3] are read and, depending on the
@* decisions below, up to three pixels on each side are rewritten.
@*
@* arguments (the names are taken from the formulas quoted in the body):
@*  r0          pu1_src   - address of the first pixel right of the edge, row 0
@*  r1          src_strd  - byte distance between two rows
@*  r2          bs        - only (bs >> 1) is used, for the tc index
@*  r3          quant_param_p / quant_param_q  (one of the two; only their sum
@*  [sp,#0x2c]  quant_param_q / quant_param_p   is used - TODO confirm order
@*                                              against the c prototype)
@*  [sp,#0x30]  beta_offset_div2
@*  [sp,#0x34]  tc_offset_div2
@*  [sp,#0x38]  filter_flag_p - 0 : leave the pixels left of the edge untouched
@*  [sp,#0x3c]  filter_flag_q - 0 : leave the pixels right of the edge untouched
@*  (stack offsets are relative to sp after the entry push of 11 words = 0x2c)
@*
@* returns nothing, the result is written back through pu1_src.
@*
@* registers: r3-r12 and lr are saved and restored. the neon registers used are
@* d0-d7 and d16-d31 only (d8-d15 are never touched).
@*
@* neon register map once the 4 rows are transposed (each register holds the
@* 4 rows of one pixel column, duplicated in both 32 bit halves):
@*  d24[0] : pu1_src[-4]    d6 : pu1_src[-3]    d3 : pu1_src[-2]
@*  d2     : pu1_src[-1]    d4 : pu1_src[0]     d5 : pu1_src[1]
@*  d7     : pu1_src[2]     d22: pu1_src[3]
@*******************************************************************************
@*/
    .type       ihevc_deblk_luma_vert_a9q, %function

ihevc_deblk_luma_vert_a9q:

    push        {r3-r12,lr}                 @ 11 words, stack args start at sp + 0x2c
    ldr         r4,[sp,#0x2c]               @ second quant param
    ldr         r5,[sp,#0x30]               @ beta_offset_div2

    add         r3,r3,r4
    add         r3,r3,#1
    ldr         r6, [sp,#0x34]              @ tc_offset_div2
    asr         r3,r3,#1                    @ r3 = qp_luma
    add         r7,r3,r5,lsl #1             @ qp_luma + (beta_offset_div2 << 1)
    add         r3,r3,r6,lsl #1             @ qp_luma + (tc_offset_div2 << 1)
    cmp         r7,#0x33
    movgt       r7,#0x33                    @ clip to the maximum of 51
    bgt         l1.56
    cmp         r7,#0x0
    movlt       r7,#0x0                     @ r7 has the beta_index value
l1.56:

    @ bic       r2,r2,#1
    asr         r2,r2,#1
    add         r3,r3,r2,lsl #1             @ + 2 * (bs >> 1)
    cmp         r3,#0x35
    movgt       r3,#0x35                    @ clip to the maximum of 53
    bgt         l1.88
    cmp         r3,#0x0
    movlt       r3,#0x0                     @ r3 has the tc_index value

    @ qp_luma   = (quant_param_p + quant_param_q + 1) >> 1
    @ beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)
    @ tc_indx   = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)

l1.88:
    ldr         r2,gai4_ihevc_beta_table_addr
ulbl2:
    add         r2,r2,pc                    @ r2 = &gai4_ihevc_beta_table[0]
    vmov.i8     d18,#0x2
    ldr         r4,gai4_ihevc_tc_table_addr
ulbl1:
    add         r4,r4,pc                    @ r4 = &gai4_ihevc_tc_table[0]

    ldr         r5,[r2,r7,lsl #2]           @ beta (32 bit table entries)
    vmov.i16    q8,#0x2
    ldr         r6,[r4,r3,lsl #2]           @ tc
    lsl         r8,r6,#1
    cmp         r6,#0
    vdup.8      d19,r8                      @ d19 = 2 * tc in every byte
    sub         r7,r0,#4                    @ r7 = &pu1_src[-4]
    vmov.i8     d23,#0x3
    beq         l1.964                      @ tc == 0 : nothing to filter

    @ load 8 pixels (pu1_src[-4] .. pu1_src[3]) of each of the 4 rows and
    @ transpose them; interleaved with the scalar computation of dp0 / dq0
    vld1.8      {d24},[r7],r1
    ldrb        r8,[r0,#-3]                 @ -3 value
    vld1.8      {d1},[r7],r1
    ldrb        r10,[r0,#-2]                @ -2 value
    vld1.8      {d2},[r7],r1
    ldrb        r11,[r0,#-1]                @ -1 value
    vld1.8      {d0},[r7]
    ldrb        r12,[r0,#0]                 @ 0 value
    ldrb        r9,[r0,#1]                  @ 1 value
    vtrn.8      d24,d1
    ldrb        r2,[r0,#2]                  @ 2 value
    vtrn.8      d2,d0
    add         r12,r12,r2
    subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
    rsbmi       r9,r9,#0
    @ dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )
    vtrn.16     d24,d2
    add         r8,r8,r11
    vtrn.16     d1,d0
    subs        r8,r8,r10,lsl #1
    rsbmi       r8,r8,#0                    @ dp0 value is stored in r8
    @ dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )

    add         r14,r1,r1,lsl #1
    add         r14,r0,r14                  @ r14 = pu1_src + 3 * src_strd (row 3)

    vdup.32     d4,d24[1]                   @ d4  = column pu1_src[0]
    ldrb        r2,[r14,#-3]                @ -3 value
    vdup.32     d7,d2[1]                    @ d7  = column pu1_src[2]
    ldrb        r10,[r14,#-2]               @ -2 value
    vdup.32     d3,d2[0]                    @ d3  = column pu1_src[-2]
    ldrb        r11,[r14,#-1]               @ -1 value
    vdup.32     d5,d1[1]                    @ d5  = column pu1_src[1]
    ldrb        r12,[r14,#0]                @ 0 value
    vdup.32     d6,d1[0]                    @ d6  = column pu1_src[-3]
    ldrb        r3,[r14,#1]                 @ 1 value
    vdup.32     d2,d0[0]                    @ d2  = column pu1_src[-1]
    ldrb        r4,[r14,#2]                 @ 2 value

    add         r12,r12,r4
    subs        r12,r12,r3,lsl #1           @ dq3 value is stored in r12
    rsbmi       r12,r12,#0
    @ dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1] + pu1_src[3 * src_strd + 0] )

    add         r2,r2,r11
    subs        r11,r2,r10,lsl #1
    rsbmi       r11,r11,#0                  @ dp3 value is stored in r11
    @ dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )

    add         r3,r8,r9                    @ r3 has the d0 value
    add         r4,r11,r12                  @ r4 has the d3 value

    @ d0 = dp0 + dq0
    @ d3 = dp3 + dq3

    add         r14,r8,r11                  @ r14 has the value dp
    add         r12,r12,r9                  @ r12 has the value dq
    @ dp = dp0 + dp3
    @ dq = dq0 + dq3

    add         r11, r3, r4                 @ r11 has the value d

    @ d = d0 + d3

    cmp         r11,r5
    vdup.32     d22,d0[1]                   @ d22 = column pu1_src[3]
    bge         l1.964                      @ d >= beta : edge is left unfiltered

    @ if(d < beta)

    @ values that are live from here on:
    @   r0 src, r1 stride, r3 d0, r4 d3, r5 beta, r6 tc, r12 dq, r14 dp
    @ registers free for use: r2,r7,r8,r9,r10
    @
    @ the de == 2 decision for row 0 follows; the neon instructions interleaved
    @ with it already compute the de == 2 results for the pixels right of the
    @ edge, clipped to +/- 2 * tc (d19) around the unfiltered value:
    @   d22 <- new pu1_src[2], d20 <- new pu1_src[0], d21 <- new pu1_src[1]
    vqsub.u8    d30,d7,d19
    asr         r10,r5,#2
    vqadd.u8    d31,d7,d19
    cmp         r10,r3,lsl #1
    vaddl.u8    q0,d5,d4
    ble         l1.336                      @ !(2 * d0 < (beta >> 2))

    ldrb        r2,[r0,#-4]
    vaddw.u8    q0,q0,d2
    ldrb        r7,[r0,#-1]
    vmull.u8    q10,d7,d23
    ldrb        r3,[r0,#0]
    vmlal.u8    q10,d22,d18
    ldrb        r8,[r0,#3]
    @ ubfx      r7,r2,#24,#8                @ has the -1 value
    @ and       r2,#0xff                    @ has the -4 value
    @ ubfx      r8,r3,#24,#8                @ has the 3 value
    @ and       r3,#0xff                    @ r4 has the 0 value

    vadd.i16    q10,q10,q0
    subs        r8,r8,r3
    vrshrn.i16  d22,q10,#3
    rsbmi       r8,r8,#0                    @ abs(pu1_src[3] - pu1_src[0])
    subs        r2,r2,r7
    vmin.u8     d21,d22,d31
    rsbmi       r2,r2,#0                    @ abs(pu1_src[-4] - pu1_src[-1])
    vmax.u8     d22,d21,d30
    add         r8,r8,r2
    vaddl.u8    q10,d7,d3
    cmp         r8,r5,asr #3
    vmla.i16    q10,q0,q8
    bge         l1.336                      @ sum >= (beta >> 3)
    vaddw.u8    q0,q0,d7
    subs        r7,r3,r7
    vrshrn.i16  d20,q10,#3
    rsbmi       r7,r7,#0                    @ abs(pu1_src[0] - pu1_src[-1])
    vrshrn.i16  d0,q0,#2
    mov         r10,#5
    vqadd.u8    d30,d5,d19
    mul         r10,r10,r6
    vqsub.u8    d31,d5,d19
    add         r10,#1
    cmp         r7,r10,asr #1
    bge         l1.336                      @ >= ((5 * tc + 1) >> 1)

    @ the same three tests are now repeated for row 3 (r4 = d3 on entry, then
    @ r4 = pu1_src + 3 * src_strd):
    @ if( (2 * d3 < (beta >> 2)) && ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
    @     && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
    @ interleaved neon work: finish clipping the right side results and compute
    @ the new pu1_src[-1] (d26) plus the clip limits for the left side

    asr         r10,r5,#2
    vqsub.u8    d25,d4,d19
    cmp         r10,r4,lsl #1
    vqadd.u8    d21,d4,d19
    ble         l1.336
    vmin.u8     d26,d20,d21
    add         r4,r1,r1,lsl #1
    add         r4,r4,r0
    vmax.u8     d20,d26,d25
    ldrb        r2,[r4,#-4]
    vmin.u8     d19,d0,d30                  @ d19 is used as a temporary here ...
    ldrb        r7,[r4,#-1]
    vmax.u8     d21,d19,d31
    ldrb        r3,[r4,#0]
    lsl         r10,r6,#1
    ldrb        r8,[r4,#3]
    @ ubfx      r7,r2,#24,#8                @ has the -1 value
    @ and       r2,#0xff                    @ has the -4 value
    @ ubfx      r8,r3,#24,#8                @ has the 3 value
    @ and       r3,#0xff                    @ r4 has the 0 value

    vaddl.u8    q0,d2,d3
    vdup.8      d19,r10                     @ ... and reloaded with 2 * tc
    subs        r8,r8,r3
    vaddw.u8    q0,q0,d4
    rsbmi       r8,r8,#0
    vqadd.u8    d30,d2,d19
    subs        r2,r2,r7
    vqsub.u8    d31,d2,d19
    rsbmi       r2,r2,#0
    vaddl.u8    q13,d5,d6
    add         r8,r8,r2
    vmla.i16    q13,q0,q8
    cmp         r8,r5,asr #3
    bge         l1.336
    vrshrn.i16  d26,q13,#3
    subs        r7,r3,r7
    vqadd.u8    d27,d3,d19
    rsbmi       r7,r7,#0
    vqsub.u8    d28,d3,d19
    mov         r10,#5
    vmin.u8     d16,d26,d30
    mul         r10,r10,r6
    add         r10,#1
    cmp         r7,r10,asr #1
    vmax.u8     d26,d16,d31
    bge         l1.336
    vqadd.u8    d30,d6,d19
    mov         r2,#2                       @ all tests passed for rows 0 and 3 : de = 2
    ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
    vqsub.u8    d31,d6,d19
    ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
    b           end_dep_deq_decision
    @ r2 has the value of de
    @ r6 has the value of tc
    @ r5 has the value of beta
    @ r14 has the value of dp
    @ r12 has the value of dq
    @ r0 has the value of source address
    @ r1 has the src stride

l1.336:
    mov         r2,#1                       @ de = 1
l1.424:
    mov         r11,r5                      @ r11 = beta, r5 is reused for the flag
    ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
    ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q

    cmp         r6,#1
    moveq       r9,#0
    moveq       r10,#0
    beq         end_dep_deq_decision        @ tc == 1 : dep = deq = 0

    and         r7,r4,r5
    cmp         r7,#1
    beq         both_flags_set              @ (filter_flag_p & filter_flag_q) == 1
    cmp         r4,#0
    beq         set_flag_dep_zero

    @ only dep is evaluated, deq = 0
    add         r8,r11,r11,asr #1
    mov         r10,#0
    asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    cmp         r8,r14
    movgt       r9,#1                       @ dep = (dp < r8)
    movle       r9,#0
    b           end_dep_deq_decision
set_flag_dep_zero:
    @ filter_flag_p == 0 : dep = 0, only deq is evaluated
    add         r8,r11,r11,asr #1
    mov         r9,#0
    asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    cmp         r8,r12
    movgt       r10,#1                      @ deq = (dq < r8)
    movle       r10,#0
    b           end_dep_deq_decision

both_flags_set:
    add         r8,r11,r11,asr #1
    asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    cmp         r8,r14
    movgt       r9,#1                       @ dep = (dp < r8)
    movle       r9,#0
    cmp         r8,r12
    movgt       r10,#1                      @ deq = (dq < r8)
    movle       r10,#0
end_dep_deq_decision:

    @ r0  = source address
    @ r1  = stride
    @ r2  = de
    @ r4  = flag p
    @ r5  = flag q
    @ r6  = tc
    @ r9  = dep
    @ r10 = deq

    @ b         l1.964

    cmp         r2,#2                       @ r2 has the value of de
    bne         l1.968
    cmp         r5,#0
    beq         l1.780                      @ r5 has the flag of q

    @ de == 2, right side: store the new pu1_src[2] (d22) as single bytes and
    @ the new pu1_src[0], pu1_src[1] (d20, d21 interleaved) as halfwords
    add         r3,r0,#2
    vst1.8      {d22[0]},[r3],r1
    vst1.8      {d22[1]},[r3],r1
    vst1.8      {d22[2]},[r3],r1
    vst1.8      {d22[3]},[r3]
    add         r3,r0,r1
    vtrn.8      d20,d21
    vst1.16     {d20[0]},[r0]
    vst1.16     {d21[0]},[r3],r1
    vst1.16     {d20[1]},[r3],r1
    vst1.16     {d21[1]},[r3]

l1.780:
    cmp         r4,#0
    beq         l1.964                      @ r4 has the flag p

    @ de == 2, left side: the new pu1_src[-1] is already in d26; compute and
    @ clip the new pu1_src[-2] (d5) and pu1_src[-3] (d0) while storing
    vdup.32     d7,d24[0]                   @ d7 = column pu1_src[-4]
    sub         r3,r0,#1
    vaddw.u8    q8,q0,d6
    add         r7,r3,r1
    vrshrn.i16  d2,q8,#2
    vst1.8      {d26[0]},[r3]
    sub         r0,r0,#3
    vmin.u8     d16,d2,d27
    vst1.8      {d26[1]},[r7],r1
    vmull.u8    q1,d6,d23
    vmlal.u8    q1,d7,d18
    vst1.8      {d26[2]},[r7],r1
    vmax.u8     d5,d16,d28
    vst1.8      {d26[3]},[r7]
    vadd.i16    q0,q1,q0
    vrshrn.i16  d0,q0,#3
    vmin.u8     d1,d0,d30
    vmax.u8     d0,d1,d31
    vtrn.8      d0,d5                       @ interleave pu1_src[-3], pu1_src[-2]
    vst1.16     {d0[0]},[r0],r1
    vst1.16     {d5[0]},[r0],r1
    vst1.16     {d0[1]},[r0],r1
    vst1.16     {d5[1]},[r0]
l1.964:
    pop         {r3-r12,pc}                 @ restore and return

    @ de != 2 : delta based filter
l1.968:
    vmov.i16    q0,#0x9
    rsb         r11,r6,#0
    cmp         r4,#0                       @ checks for the flag p; the flags are
                                            @ consumed by "beq l1.1272" below (nothing
                                            @ in between updates them)
    vmov.i16    q8,#0x3
    vmov.i8     d24,#0x1
    vdup.8      d30,r11                     @ d30 = -tc
    and         r11,r6,#0xff
    vdup.8      d31,r11                     @ d31 = tc
    vsubl.u8    q9,d4,d2
    vmul.i16    q9,q9,q0
    vsubl.u8    q0,d5,d3
    vmul.i16    q8,q0,q8
    vsub.i16    q8,q9,q8
    vrshr.s16   q8,q8,#4
    @ delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4

    vabs.s16    q0,q8
    vmovn.i16   d0,q0
    @ storing the absolute values of delta in d0

    vqmovn.s16  d16,q8
    @ storing the saturated 8 bit values of delta in d16

    vmov.i8     d1,#0xa
    vdup.8      d21,r11
    vmul.i8     d1,d1,d21
    @ d1 stores the value (10 * tc)

    @ if(abs(delta) < 10 * tc) - applied further down with vcge / vbsl, which
    @ keep the unfiltered pixel wherever abs(delta) >= 10 * tc

    vmin.s8     d18,d16,d31
    vmax.s8     d20,d18,d30
    @ delta = clip3(delta, -tc, tc)

    vmovl.s8    q8,d20
    vmovl.u8    q9,d2
    vadd.i16    q9,q9,q8
    vqmovun.s16 d22,q9
    vmovl.u8    q9,d4
    vsub.i16    q8,q9,q8
    vqmovun.s16 d23,q8
    @ tmp_p0 = clip_u8(pu1_src[-1] + delta)     -> d22
    @ tmp_q0 = clip_u8(pu1_src[0] - delta)      -> d23

    beq         l1.1272                     @ filter_flag_p == 0 : skip the left side

    cmp         r9,#1
    bne         l1.1212                     @ checks for the flag dep

    @ dep == 1 : d3 <- pu1_src[-2] + clip3( ((((pu1_src[-3] + pu1_src[-1] + 1) >> 1)
    @                 - pu1_src[-2] + delta) >> 1), -(tc >> 1), tc >> 1 )
    asr         r3,r6,#1
    vaddl.u8    q8,d6,d2
    vaddw.u8    q8,q8,d24
    vdup.8      d18,r3                      @ d18 = tc >> 1
    rsb         r3,r3,#0
    vdup.8      d19,r3                      @ d19 = -(tc >> 1)
    vshr.u16    q8,q8,#1
    vmovn.i16   d16,q8
    vsubl.u8    q8,d16,d3
    vaddw.s8    q8,q8,d20
    vshr.s16    q8,q8,#1
    vqmovn.s16  d16,q8
    vmin.s8     d17,d16,d18
    vmax.s8     d16,d19,d17
    vmovl.u8    q9,d3
    vmovl.s8    q8,d16
    vadd.i16    q8,q9,q8
    vqmovun.s16 d16,q8
    vmov        d30,d3
    vcge.u8     d3,d0,d1                    @ mask : abs(delta) >= 10 * tc
    vbsl        d3,d30,d16                  @ keep the old value where the mask is set

l1.1212:
    @ NOTE(review): d16 is rewritten by the vcge below before it is read, so
    @ this vdup has no effect; it looks like a leftover of the commented out vmul
    vdup.8      d16,r11
    sub         r12,r0,#3
    sub         r3,r0,#1
    @ vmul.i8   d16,d16,d1
    vtrn.8      d6,d3                       @ interleave pu1_src[-3], pu1_src[-2]
    vst1.16     {d6[0]},[r12],r1
    vcge.u8     d16,d0,d1                   @ mask : abs(delta) >= 10 * tc
    vst1.16     {d3[0]},[r12],r1
    vbsl        d16,d2,d22                  @ d16 = mask ? pu1_src[-1] : tmp_p0
    vst1.8      {d16[0]},[r3],r1
    vst1.8      {d16[1]},[r3],r1
    vst1.16     {d6[1]},[r12],r1
    vst1.8      {d16[2]},[r3],r1
    vst1.16     {d3[1]},[r12]
    vst1.8      {d16[3]},[r3]
l1.1272:
    @ ldr       r3,[sp,#0x38]
    cmp         r5,#0
    beq         l1.964                      @ checks for the flag q
    cmp         r10,#1
    bne         l1.1412                     @ checks for the flag deq

    @ deq == 1 : d5 <- pu1_src[1] + clip3( ((((pu1_src[2] + pu1_src[0] + 1) >> 1)
    @                 - pu1_src[1] - delta) >> 1), -(tc >> 1), tc >> 1 )
    vmov        d2,d7
    asr         r3,r6,#1
    vdup.8      d6,r3                       @ d6  = tc >> 1
    rsb         r3,r3,#0
    vdup.8      d16,r3                      @ d16 = -(tc >> 1)
    vaddl.u8    q1,d2,d4
    vaddw.u8    q1,q1,d24
    vshr.u16    q1,q1,#1
    vmovn.i16   d2,q1
    vsubl.u8    q1,d2,d5
    vsubw.s8    q1,q1,d20
    vshr.s16    q1,q1,#1
    vqmovn.s16  d3,q1
    vmin.s8     d2,d3,d6
    vmax.s8     d3,d16,d2
    @ vdup.8    d6,r2
    @ vmul.i8   d6,d6,d1

    vmovl.u8    q8,d5
    vmovl.s8    q1,d3
    vadd.i16    q1,q8,q1
    vqmovun.s16 d3,q1
    vmov        d30,d5
    vcge.u8     d5,d0,d1                    @ mask : abs(delta) >= 10 * tc
    vbsl        d5,d30,d3                   @ keep the old value where the mask is set

l1.1412:
    @ vdup.8    d2,r2
    add         r3,r0,#2
    add         r11,r3,r1
    @ vmul.i8   d1,d2,d1
    vst1.8      {d7[0]},[r3]                @ pu1_src[2] is written back unchanged
    vst1.8      {d7[1]},[r11],r1
    vst1.8      {d7[2]},[r11],r1
    vcge.u8     d0,d0,d1                    @ mask : abs(delta) >= 10 * tc
    vst1.8      {d7[3]},[r11]
    vbsl        d0,d4,d23                   @ d0 = mask ? pu1_src[0] : tmp_q0
    vtrn.8      d0,d5                       @ interleave pu1_src[0], pu1_src[1]
    vst1.16     {d0[0]},[r0],r1
    vst1.16     {d5[0]},[r0],r1
    vst1.16     {d0[1]},[r0],r1
    vst1.16     {d5[1]},[r0]
    pop         {r3-r12,pc}                 @ restore and return