summaryrefslogtreecommitdiffstats
path: root/common/arm64/ihevc_deblk_luma_horz.s
diff options
context:
space:
mode:
Diffstat (limited to 'common/arm64/ihevc_deblk_luma_horz.s')
-rw-r--r--common/arm64/ihevc_deblk_luma_horz.s586
1 files changed, 586 insertions, 0 deletions
diff --git a/common/arm64/ihevc_deblk_luma_horz.s b/common/arm64/ihevc_deblk_luma_horz.s
new file mode 100644
index 0000000..a5c314d
--- /dev/null
+++ b/common/arm64/ihevc_deblk_luma_horz.s
@@ -0,0 +1,586 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///*******************************************************************************
+//* @file
+//* ihevc_deblk_luma_vert.s
+//*
+//* @brief
+//* contains function definitions for inter prediction interpolation.
+//* functions are coded using neon intrinsics and can be compiled using
+
+//* rvct
+//*
+//* @author
+//* anand s
+//*
+//* @par list of functions:
+//*
+//*
+//* @remarks
+//* none
+//*
+//*******************************************************************************/
+
+.text
+.align 4
+
+
+.extern gai4_ihevc_tc_table
+.extern gai4_ihevc_beta_table
+.globl ihevc_deblk_luma_horz_av8
+
+.type ihevc_deblk_luma_horz_av8, %function
+
+ihevc_deblk_luma_horz_av8:
+ // stmfd sp!, {x3-x12,x14}
+ sxtw x5,w5
+ sxtw x6,w6
+ stp d8,d9,[sp,#-16]!
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+ stp x19, x20,[sp,#-16]!
+ stp x21, x22,[sp,#-16]!
+
+ mov x21,x7
+ ldr w22,[sp,#96]
+
+ add x3,x3,x4
+ add x3,x3,#1
+ asr x3,x3,#1
+ add x7,x3,x5,lsl #1
+ add x3,x3,x6,lsl #1
+ cmp x7,#0x33
+ mov x20,#0x33
+ csel x7, x20, x7,gt
+ bgt l1.1532
+ cmp x7,#0x0
+ mov x20,#0x0
+ csel x7, x20, x7,lt // x7 has the beta_index value
+l1.1532:
+ // bic x2,x2,#1
+ asr x2,x2,#1
+
+ add x3,x3,x2,lsl #1
+ cmp x3,#0x35
+ mov x20,#0x35
+ csel x3, x20, x3,gt
+ bgt l1.1564
+ cmp x3,#0x0
+ mov x20,#0x0
+ csel x3, x20, x3,lt // x3 has the tc_index value
+
+ // qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
+ // beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
+ // tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
+
+l1.1564:
+ adrp x2, :got:gai4_ihevc_beta_table
+ ldr x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
+
+ adrp x4, :got:gai4_ihevc_tc_table
+ ldr x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
+
+ ldr w5, [x2,x7,lsl #2] // beta
+ ldr w6, [x4,x3,lsl #2] // tc
+
+
+
+ cmp x6,#0
+ beq l1.2404
+ movi v0.4h, #0x2
+ lsl x7,x6,#1
+ add x14,x1,x1,lsl #1
+ neg x19,x14
+ ldr w8, [x0,x19] // -3 value
+ dup v1.8b,w7
+ lsl x19,x1,#1
+ neg x19,x19
+ ldr w10, [x0,x19] //-2 value
+ dup v23.2s,w8 // -3 value
+ neg x19,x1
+ ldr w11, [x0,x19] //-1 value
+ dup v24.2s,w10 // -2 value
+ and x8,x8,#0xff
+ ldr w12, [x0,#0] // 0 value
+ dup v25.2s,w11 // -1 value
+ and x10,x10,#0xff
+ ldr w9, [x0,x1] // 1 value
+ dup v26.2s,w12 // 0 value
+ and x11,x11,#0xff
+ lsl x19,x1,#1
+ ldr w2, [x0,x19] // 2 value
+ dup v27.2s,w9 // 1value
+ and x12,x12,#0xff
+ dup v28.2s,w2 // 2 value
+ and x9,x9,#0xff
+ and x2,x2,#0xff
+
+ add x12,x12,x2
+ subs x9,x12,x9,lsl #1 // dq0 value is stored in x9
+ csneg x9,x9,x9,pl
+ //dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
+
+ add x8,x8,x11
+ subs x8,x8,x10,lsl #1
+ csneg x8,x8,x8,pl // dp0 value is stored in x8
+ // dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
+
+
+
+ add x3,x1,x1,lsl #1
+ add x14,x0,#3
+
+
+ neg x19,x3
+ ldrb w2,[x14,x19] // -2 value
+ lsl x19,x1,#1
+ neg x19,x19
+ ldrb w10,[x14,x19] // -2 value
+ neg x19,x1
+ ldrb w11,[x14,x19] // -1 value
+ ldrb w12,[x14,#0] // 0 value
+ ldrb w3,[x14,x1] // 1 value
+ lsl x19,x1,#1
+ ldrb w4,[x14,x19] // 2 value
+
+
+ add x12,x12,x4
+ subs x12,x12,x3,lsl #1 // dq3value is stored in x12
+ csneg x12,x12,x12,pl
+ // dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
+
+
+ add x2,x2,x11
+ subs x11,x2,x10,lsl #1
+ csneg x11,x11,x11,pl // dp3 value is stored in x8
+ // dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )@
+
+
+
+ add x3,x8,x9 // x3 has the d0 value
+ add x4,x11,x12 // x4 has the d3 value
+
+
+ // d0 = dp0 + dq0@
+ // d3 = dp3 + dq3@
+
+ add x14,x8,x11 // x13 has the value dp
+ add x12,x12,x9 // x12 has the value dq
+ // dp = dp0 + dp3@
+ // dq = dq0 + dq3@
+
+ add x11, x3, x4 // x3 has the value d
+
+ // d = d0 + d3@
+
+
+ cmp x11,x5
+ bge l1.2404
+
+ // if(d < beta)
+
+
+ // registers which cannont be altered : x3,x4 x5,x6,x12,x13,x0,x1,x11
+
+ // registers for use: x2,x7,x8,x9,x10,
+
+ asr x10,x5,#2
+ uqadd v30.8b, v26.8b , v1.8b
+ cmp x10,x3,lsl #1
+ uqsub v31.8b, v26.8b , v1.8b
+ ble l1.1840
+ add x10,x1,x1,lsl #1
+ uaddl v6.8h, v25.8b , v26.8b
+ neg x19,x1
+ ldr w2, [x0,x19,lsl #2] // has the -4 value
+ neg x19, x1
+ ldrb w7,[x0,x19] // has the -1 value
+ dup v22.2s,w2 // -4 value
+ uaddw v8.8h, v6.8h , v27.8b
+ ldrb w3,[x0,#0] // x4 has the 0 value
+ uqadd v16.8b, v27.8b , v1.8b
+ and x2,x2,#0xff
+ mul v12.8h, v8.8h, v0.4h[0]
+ ldr w8, [x0,x10] // has the 3 value
+ uaddl v10.8h, v24.8b , v28.8b
+ subs x2,x2,x7
+ uqsub v17.8b, v27.8b , v1.8b
+ dup v29.2s,w8 // 3 value
+ and x8,x8,#0xff
+ add v12.8h, v12.8h , v10.8h
+ csneg x2,x2,x2,pl
+ rshrn v20.8b, v12.8h,#3
+ subs x8,x8,x3
+ csneg x8,x8,x8,pl
+ umin v18.8b, v20.8b , v30.8b
+ add x8,x8,x2
+
+ cmp x8,x5,asr #3
+ bge l1.1840
+ uaddw v14.8h, v8.8h , v28.8b
+ subs x7,x3,x7
+ umax v4.8b, v18.8b , v31.8b
+ csneg x7,x7,x7,pl
+ uqadd v30.8b, v28.8b , v1.8b
+ mov x10,#5
+ rshrn v21.8b, v14.8h,#2
+ mul x10, x10, x6
+ uqsub v31.8b, v28.8b , v1.8b
+ add x10, x10,#1
+ cmp x7,x10,asr #1
+ umin v18.8b, v21.8b , v16.8b
+ bge l1.1840
+
+
+ // if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
+ // && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
+
+ umax v5.8b, v18.8b , v17.8b
+ asr x10,x5,#2
+ uaddl v16.8h, v29.8b , v28.8b
+ cmp x10,x4,lsl #1
+ ble l1.1840
+
+ add x10,x1,x1,lsl #1
+ mul v16.8h, v16.8h, v0.4h[0]
+ add x4,x0,#3
+
+
+ lsl x19,x1,#2
+ neg x19,x19
+ ldrb w2,[x4,x19]
+ add v16.8h, v16.8h , v14.8h
+ neg x19,x1
+ ldrb w7,[x4,x19]
+ rshrn v19.8b, v16.8h,#3
+ ldrb w3,[x4,#0]
+ ldrb w8,[x4,x10]
+ // ubfx x7,x2,#24,#8 @ has the -1 value
+ // and x2,#0xff @ has the -4 value
+ // ubfx x8,x3,#24,#8 @ has the 3 value
+ // and x3,#0xff @ x4 has the 0 value
+
+
+
+ subs x8,x8,x3
+ umin v18.8b, v19.8b , v30.8b
+ csneg x8,x8,x8,pl
+ uaddl v6.8h, v25.8b , v24.8b
+ subs x2,x2,x7
+ umax v3.8b, v18.8b , v31.8b
+ csneg x2,x2,x2,pl
+ uaddw v8.8h, v6.8h , v26.8b
+ add x8,x8,x2
+ uqadd v30.8b, v25.8b , v1.8b
+ cmp x8,x5,asr #3
+ uqsub v31.8b, v25.8b , v1.8b
+ bge l1.1840
+ mul v12.8h, v8.8h, v0.4h[0]
+ subs x7,x3,x7
+ uqadd v16.8b, v24.8b , v1.8b
+ csneg x7,x7,x7,pl
+ uaddl v10.8h, v23.8b , v27.8b
+ mov x10,#5
+ uqsub v17.8b, v24.8b , v1.8b
+ mul x10, x10, x6
+ add v12.8h, v12.8h , v10.8h
+ add x10, x10,#1
+ rshrn v20.8b, v12.8h,#3
+ cmp x7,x10,asr #1
+ uaddw v14.8h, v8.8h , v23.8b
+ bge l1.1840
+ umin v18.8b, v20.8b , v30.8b
+ mov x2,#2
+ uqadd v30.8b, v23.8b , v1.8b
+ mov w4,w21
+ umax v2.8b, v18.8b , v31.8b
+ mov w5,w22
+ rshrn v21.8b, v14.8h,#2
+ b end_dep_deq_decision_horz
+ // x2 has the value of de
+ // x6 has teh value of tc
+ // x5 has the value of beta
+ // x14 has the value of dp
+ // x12 has the value of dq
+ // x0 has the value of source address
+ // x1 has the src stride
+
+l1.1840:
+ mov x2,#1
+
+ mov x11,x5
+ mov w4,w21
+ mov w5,w22
+
+ cmp x6,#1
+ mov x20,#0
+ csel x9, x20, x9,eq
+ mov x20,#0
+ csel x10, x20, x10,eq
+ beq end_dep_deq_decision_horz
+
+ and x7,x4,x5
+ cmp x7,#1
+ beq both_flags_set_horz
+ cmp x4,#0
+ beq set_flag_dep_zero_horz
+
+
+ add x8,x11,x11,asr #1
+ mov x10,#0
+ asr x8,x8,#3
+ cmp x8,x14
+ mov x20,#1
+ csel x9, x20, x9,gt
+ mov x20,#0
+ csel x9, x20, x9,le
+ b end_dep_deq_decision_horz
+set_flag_dep_zero_horz:
+
+ add x8,x11,x11,asr #1
+ mov x9,#0
+ asr x8,x8,#3
+ cmp x8,x12
+ mov x20,#1
+ csel x10, x20, x10,gt
+ mov x20,#0
+ csel x10, x20, x10,le
+ b end_dep_deq_decision_horz
+
+both_flags_set_horz:
+ add x8,x11,x11,asr #1
+ asr x8,x8,#3
+ cmp x8,x14
+ mov x20,#1
+ csel x9, x20, x9,gt
+ mov x20,#0
+ csel x9, x20, x9,le
+ cmp x8,x12
+ mov x20,#1
+ csel x10, x20, x10,gt
+ mov x20,#0
+ csel x10, x20, x10,le
+end_dep_deq_decision_horz:
+
+ //x0=source address
+ //x1=stride
+ // x2 =de
+ // x4=flag p
+ //x5= flag q
+ //x6 =tc
+ // x9 =dep
+ // x10=deq
+
+
+
+ // add x14,x1,x1,lsl #1
+ // lsl x7,x6,#1
+ // vdup.8 d1,x7
+ // vmov.i16 d0,#0x2
+ umin v18.8b, v21.8b , v16.8b
+ cmp x2,#1
+ uqsub v31.8b, v23.8b , v1.8b
+ beq l1.2408
+ uaddl v8.8h, v23.8b , v22.8b
+ cmp x5,#1
+
+ bne strong_filtering_p
+
+strong_filtering_q:
+ mov x12,x0
+ st1 {v4.s}[0],[x12],x1
+ st1 {v5.s}[0],[x12],x1
+ st1 {v3.s}[0],[x12]
+ cmp x4,#1
+ bne l1.2404
+strong_filtering_p:
+ umax v5.8b, v18.8b , v17.8b
+ mov x12,x0
+ mul v8.8h, v8.8h, v0.4h[0]
+ sub x20,x1,#0
+ neg x11, x20
+ add v16.8h, v8.8h , v14.8h
+ add x12,x12,x11
+ rshrn v19.8b, v16.8h,#3
+ st1 {v2.s}[0],[x12],x11
+ umin v18.8b, v19.8b , v30.8b
+ st1 {v5.s}[0],[x12],x11
+ umax v3.8b, v18.8b , v31.8b
+ st1 {v3.s}[0],[x12]
+
+l1.2404:
+ // ldmfd sp!, {x3-x12,pc}
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ret
+
+ // x4=flag p
+ //x5= flag q
+ //x6 =tc
+ // x9 =dep
+ // x10=deq
+
+
+ // d22 -4 value
+
+ //d23 @ -3 value
+
+ // vdup.32 d24,x11 @ -2 value
+
+ // vdup.32 d25, x11 @-1 value
+
+ // vdup.32 d26,x11 @ 0 value
+
+ // vdup.32 d27,x11 @ 1value
+
+ // vdup.32 d28,x11 @ 2 value
+
+ // vdup.32 d29,x11 @ 3 value
+
+l1.2408:
+
+ movi v0.4h, #0x9
+
+ usubl v10.8h, v26.8b , v25.8b
+
+ mul v10.8h, v10.8h, v0.4h[0]
+
+ movi v0.4h, #0x3
+
+ usubl v12.8h, v27.8b , v24.8b
+ mul v12.8h, v12.8h, v0.4h[0]
+
+
+ dup v30.8b,w6 // duplicating the +tc value
+
+ sub x20,x6,#0
+ neg x12, x20
+ dup v31.8b,w12 // duplicating the -tc value
+
+
+
+ sub v10.8h, v10.8h , v12.8h
+
+
+
+ srshr v10.8h, v10.8h,#4
+ // delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
+
+ abs v8.8h, v10.8h
+ xtn v9.8b, v8.8h
+ // storing the absolute values of delta in d9
+
+ sqxtn v10.8b, v10.8h
+ // storing the clipped values of delta in d16
+
+
+ smin v11.8b, v10.8b , v30.8b
+ smax v8.8b, v31.8b , v11.8b // d8 has the value delta = clip3(delta, -tc, tc)//
+
+
+ uxtl v6.8h, v25.8b
+
+ saddw v4.8h, v6.8h , v8.8b
+
+ sqxtun v12.8b, v4.8h
+ uxtl v6.8h, v26.8b
+ ssubw v4.8h, v6.8h , v8.8b
+ sqxtun v13.8b, v4.8h
+
+
+ mov x11,#0xa
+ mul x12, x11, x6
+ dup v2.8b,w12 // d2 has the 10*tc value
+ mov v18.8b, v24.8b
+ dup v0.8b,w6
+ sshr v0.8b,v0.8b,#1
+ neg v1.8b, v0.8b
+
+ cmp x4,#1
+ bne l1.2724
+ cmp x9,#1
+ bne l1.2700
+
+ // d12 and d13 have the value temp_p0 and temp_q0
+ uaddl v14.8h, v23.8b , v25.8b
+ rshrn v14.8b, v14.8h,#1
+ usubl v14.8h, v14.8b , v24.8b
+ saddw v14.8h, v14.8h , v8.8b
+ sqshrn v14.8b, v14.8h,#1
+ smin v15.8b, v14.8b , v0.8b
+ smax v14.8b, v1.8b , v15.8b
+
+ // d14 has the delta p value
+ uxtl v16.8h, v24.8b
+ saddw v16.8h, v16.8h , v14.8b
+ sqxtun v14.8b, v16.8h
+
+ // d14 =tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)@
+ cmhs v18.8b,v9.8b,v2.8b
+ bsl v18.8b,v24.8b,v14.8b
+
+l1.2700:
+ mov x12,x0
+ sub x20,x1,#0
+ neg x11, x20
+ add x12,x12,x11
+ cmhs v19.8b,v9.8b,v2.8b
+ bsl v19.8b,v25.8b,v12.8b
+ st1 {v19.s}[0],[x12],x11
+ st1 {v18.s}[0],[x12]
+l1.2724:
+ cmp x5,#1
+ bne l1.2404
+ cmp x10,#1
+ mov v18.8b, v27.8b
+ bne l1.2852
+
+ uaddl v14.8h, v26.8b , v28.8b
+ rshrn v14.8b, v14.8h,#1
+ usubl v14.8h, v14.8b , v27.8b
+ ssubw v14.8h, v14.8h , v8.8b
+ sqshrn v14.8b, v14.8h,#1
+ smin v15.8b, v14.8b , v0.8b
+ smax v14.8b, v1.8b , v15.8b
+// d14 has the delta p value
+ uxtl v16.8h, v27.8b
+ saddw v16.8h, v16.8h , v14.8b
+ sqxtun v14.8b, v16.8h
+ cmhs v18.8b,v9.8b,v2.8b
+ bsl v18.8b,v27.8b,v14.8b
+l1.2852:
+ mov x12,x0
+ cmhs v19.8b,v9.8b,v2.8b
+ bsl v19.8b,v26.8b,v13.8b
+ st1 {v19.s}[0],[x12],x1
+ st1 {v18.s}[0],[x12]
+ // ldmfd sp!, {x3-x12,x15}
+ ldp x21, x22,[sp],#16
+ ldp x19, x20,[sp],#16
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ret
+
+