1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
|
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///*******************************************************************************
//* @file
//* ihevc_deblk_luma_horz.s
//*
//* @brief
//* contains the function definition for the luma deblocking filter applied
//* across a horizontal edge. the function is hand-written aarch64 (armv8)
//* neon assembly in gnu assembler syntax
//*
//* @author
//* anand s
//*
//* @par list of functions:
//* - ihevc_deblk_luma_horz_av8()
//*
//* @remarks
//* none
//*
//*******************************************************************************/
.text
.align 4
.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_horz_av8
.type ihevc_deblk_luma_horz_av8, %function

///*******************************************************************************
//*
//* @brief
//*  luma deblocking of one 4 pixel wide segment of a horizontal edge.
//*  the edge lies between row -1 and row 0 relative to x0. rows -4..-1 are
//*  called the p side and rows 0..3 the q side in the comments below
//*  (p0 = row -1, p1 = row -2, ...; q0 = row 0, q1 = row 1, ...).
//*
//* @par inputs (names follow the pseudo-code comments in the body; the c
//*  prototype is not visible from this file - confirm against the header)
//*  x0   : pu1_src, address of column 0 of row 0
//*  w1   : src_strd, byte distance between two rows
//*  w2   : bs
//*  w3   : quant_param_p
//*  w4   : quant_param_q
//*  w5   : beta_offset_div2
//*  w6   : tc_offset_div2
//*  w7   : filter flag for the p side
//*  [sp] : filter flag for the q side (read as a 32 bit value)
//*
//* @par outputs
//*  no register result. up to three rows on each side of the edge are
//*  rewritten in memory, 4 bytes per row.
//*
//* @remarks
//*  - all 32 bit integer arguments that are used as 64 bit operands are sign
//*    extended on entry, because the upper half of an argument register is
//*    not defined by the procedure call standard.
//*  - a side is written only when its filter flag equals 1, for both the
//*    strong and the normal filter.
//*  - d8-d15 and x19-x22 are saved and restored; other registers used here
//*    are caller saved.
//*
//*******************************************************************************/
ihevc_deblk_luma_horz_av8:
    // Sign-extend every 32-bit argument that is later used as a full x
    // register (address index, 64-bit arithmetic, 64-bit compares).
    sxtw        x1,w1
    sxtw        x2,w2
    sxtw        x3,w3
    sxtw        x4,w4
    sxtw        x5,w5
    sxtw        x6,w6
    // Save callee-saved registers: 6 pairs = 96 bytes.
    stp         d8,d9,[sp,#-16]!
    stp         d10,d11,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    mov         x21,x7                      // x21 = filter flag p
    ldr         w22,[sp,#96]                // w22 = filter flag q (stack argument, above the 96 bytes pushed)

    // qp_luma = (quant_param_p + quant_param_q + 1) >> 1
    add         x3,x3,x4
    add         x3,x3,#1
    asr         x3,x3,#1
    add         x7,x3,x5,lsl #1             // x7 = qp_luma + (beta_offset_div2 << 1)
    add         x3,x3,x6,lsl #1             // x3 = qp_luma + (tc_offset_div2 << 1)

    // beta_indx = clip3(x7, 0, 51)
    cmp         x7,#0x33
    mov         x20,#0x33
    csel        x7, x20, x7,gt
    bgt         l1.1532
    cmp         x7,#0x0
    mov         x20,#0x0
    csel        x7, x20, x7,lt              // x7 has the beta_index value
l1.1532:
    // tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)
    asr         x2,x2,#1
    add         x3,x3,x2,lsl #1
    cmp         x3,#0x35
    mov         x20,#0x35
    csel        x3, x20, x3,gt
    bgt         l1.1564
    cmp         x3,#0x0
    mov         x20,#0x0
    csel        x3, x20, x3,lt              // x3 has the tc_index value
l1.1564:
    // Look up beta and tc (32-bit table entries) through the GOT.
    adrp        x2, :got:gai4_ihevc_beta_table
    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
    adrp        x4, :got:gai4_ihevc_tc_table
    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
    ldr         w5, [x2,x7,lsl #2]          // x5 = beta
    ldr         w6, [x4,x3,lsl #2]          // x6 = tc
    cmp         x6,#0
    beq         l1.2404                     // tc == 0 : nothing to do

    // Load rows -3..2 (4 pixels each). Every row is broadcast into a vector
    // (v23..v28) for the NEON filters, and its column-0 byte is kept in a
    // general register for the scalar decisions.
    movi        v0.4h, #0x2                 // constant 2 used as multiplier in the strong filter
    lsl         x7,x6,#1                    // x7 = 2 * tc
    add         x14,x1,x1,lsl #1            // x14 = 3 * stride
    neg         x19,x14
    ldr         w8, [x0,x19]                // -3 value
    dup         v1.8b,w7                    // v1 = 2 * tc in every byte
    lsl         x19,x1,#1
    neg         x19,x19
    ldr         w10, [x0,x19]               // -2 value
    dup         v23.2s,w8                   // v23 = row -3
    neg         x19,x1
    ldr         w11, [x0,x19]               // -1 value
    dup         v24.2s,w10                  // v24 = row -2
    and         x8,x8,#0xff                 // column 0 of row -3
    ldr         w12, [x0,#0]                // 0 value
    dup         v25.2s,w11                  // v25 = row -1
    and         x10,x10,#0xff               // column 0 of row -2
    ldr         w9, [x0,x1]                 // 1 value
    dup         v26.2s,w12                  // v26 = row 0
    and         x11,x11,#0xff               // column 0 of row -1
    lsl         x19,x1,#1
    ldr         w2, [x0,x19]                // 2 value
    dup         v27.2s,w9                   // v27 = row 1
    and         x12,x12,#0xff               // column 0 of row 0
    dup         v28.2s,w2                   // v28 = row 2
    and         x9,x9,#0xff                 // column 0 of row 1
    and         x2,x2,#0xff                 // column 0 of row 2

    // dq0 = abs(row2 - 2 * row1 + row0) at column 0
    add         x12,x12,x2
    subs        x9,x12,x9,lsl #1
    csneg       x9,x9,x9,pl                 // x9 = dq0
    // dp0 = abs(row-3 - 2 * row-2 + row-1) at column 0
    add         x8,x8,x11
    subs        x8,x8,x10,lsl #1
    csneg       x8,x8,x8,pl                 // x8 = dp0

    // Same second differences at column 3.
    add         x3,x1,x1,lsl #1             // x3 = 3 * stride
    add         x14,x0,#3                   // x14 -> column 3 of row 0
    neg         x19,x3
    ldrb        w2,[x14,x19]                // -3 value
    lsl         x19,x1,#1
    neg         x19,x19
    ldrb        w10,[x14,x19]               // -2 value
    neg         x19,x1
    ldrb        w11,[x14,x19]               // -1 value
    ldrb        w12,[x14,#0]                // 0 value
    ldrb        w3,[x14,x1]                 // 1 value
    lsl         x19,x1,#1
    ldrb        w4,[x14,x19]                // 2 value
    // dq3 = abs(row2 - 2 * row1 + row0) at column 3
    add         x12,x12,x4
    subs        x12,x12,x3,lsl #1
    csneg       x12,x12,x12,pl              // x12 = dq3
    // dp3 = abs(row-3 - 2 * row-2 + row-1) at column 3
    add         x2,x2,x11
    subs        x11,x2,x10,lsl #1
    csneg       x11,x11,x11,pl              // x11 = dp3

    add         x3,x8,x9                    // x3  = d0 = dp0 + dq0
    add         x4,x11,x12                  // x4  = d3 = dp3 + dq3
    add         x14,x8,x11                  // x14 = dp = dp0 + dp3
    add         x12,x12,x9                  // x12 = dq = dq0 + dq3
    add         x11, x3, x4                 // x11 = d  = d0 + d3
    cmp         x11,x5
    bge         l1.2404                     // d >= beta : edge is not filtered

    // ---- strong / normal filter decision --------------------------------
    // Live from here on: x0, x1, x3 (d0), x4 (d3), x5 (beta), x6 (tc),
    // x12 (dq), x14 (dp). The strong filter requires all of the following
    // at both column 0 (with d0) and column 3 (with d3):
    //   2 * d < (beta >> 2)
    //   abs(row3 - row0) + abs(row-1 - row-4) < (beta >> 3)
    //   abs(row0 - row-1) < ((5 * tc + 1) >> 1)
    // Any failing test branches to l1.1840 (normal filter). The NEON
    // instructions interleaved with the tests speculatively compute the
    // strong filter outputs; they do not touch the condition flags.
    asr         x10,x5,#2                   // beta >> 2
    uqadd       v30.8b, v26.8b , v1.8b      // v30 = q0 + 2tc (saturating)
    cmp         x10,x3,lsl #1               // (beta >> 2) vs 2 * d0
    uqsub       v31.8b, v26.8b , v1.8b      // v31 = q0 - 2tc (saturating)
    ble         l1.1840
    add         x10,x1,x1,lsl #1            // x10 = 3 * stride
    uaddl       v6.8h, v25.8b , v26.8b      // v6 = p0 + q0
    neg         x19,x1
    ldr         w2, [x0,x19,lsl #2]         // row -4 (offset -4 * stride)
    neg         x19, x1
    ldrb        w7,[x0,x19]                 // x7 = column 0 of row -1
    dup         v22.2s,w2                   // v22 = row -4
    uaddw       v8.8h, v6.8h , v27.8b       // v8 = p0 + q0 + q1
    ldrb        w3,[x0,#0]                  // x3 = column 0 of row 0
    uqadd       v16.8b, v27.8b , v1.8b      // v16 = q1 + 2tc
    and         x2,x2,#0xff                 // x2 = column 0 of row -4
    mul         v12.8h, v8.8h, v0.4h[0]     // v12 = 2 * (p0 + q0 + q1)
    ldr         w8, [x0,x10]                // row 3
    uaddl       v10.8h, v24.8b , v28.8b     // v10 = p1 + q2
    subs        x2,x2,x7
    uqsub       v17.8b, v27.8b , v1.8b      // v17 = q1 - 2tc
    dup         v29.2s,w8                   // v29 = row 3
    and         x8,x8,#0xff                 // x8 = column 0 of row 3
    add         v12.8h, v12.8h , v10.8h     // v12 = p1 + 2p0 + 2q0 + 2q1 + q2
    csneg       x2,x2,x2,pl                 // x2 = abs(row-4 - row-1)
    rshrn       v20.8b, v12.8h,#3           // v20 = (v12 + 4) >> 3
    subs        x8,x8,x3
    csneg       x8,x8,x8,pl                 // x8 = abs(row3 - row0)
    umin        v18.8b, v20.8b , v30.8b
    add         x8,x8,x2
    cmp         x8,x5,asr #3                // vs beta >> 3
    bge         l1.1840
    uaddw       v14.8h, v8.8h , v28.8b      // v14 = p0 + q0 + q1 + q2
    subs        x7,x3,x7
    umax        v4.8b, v18.8b , v31.8b      // v4 = strong-filtered row 0
    csneg       x7,x7,x7,pl                 // x7 = abs(row0 - row-1)
    uqadd       v30.8b, v28.8b , v1.8b      // v30 = q2 + 2tc
    mov         x10,#5
    rshrn       v21.8b, v14.8h,#2           // v21 = (v14 + 2) >> 2
    mul         x10, x10, x6
    uqsub       v31.8b, v28.8b , v1.8b      // v31 = q2 - 2tc
    add         x10, x10,#1
    cmp         x7,x10,asr #1               // vs (5 * tc + 1) >> 1
    umin        v18.8b, v21.8b , v16.8b
    bge         l1.1840

    // Column 0 passed; repeat the three tests at column 3 using d3.
    umax        v5.8b, v18.8b , v17.8b      // v5 = strong-filtered row 1
    asr         x10,x5,#2
    uaddl       v16.8h, v29.8b , v28.8b     // v16 = q3 + q2
    cmp         x10,x4,lsl #1               // (beta >> 2) vs 2 * d3
    ble         l1.1840
    add         x10,x1,x1,lsl #1            // x10 = 3 * stride
    mul         v16.8h, v16.8h, v0.4h[0]    // v16 = 2 * (q3 + q2)
    add         x4,x0,#3                    // x4 -> column 3 of row 0 (d3 no longer needed)
    lsl         x19,x1,#2
    neg         x19,x19
    ldrb        w2,[x4,x19]                 // x2 = column 3 of row -4
    add         v16.8h, v16.8h , v14.8h     // v16 = p0 + q0 + q1 + 3q2 + 2q3
    neg         x19,x1
    ldrb        w7,[x4,x19]                 // x7 = column 3 of row -1
    rshrn       v19.8b, v16.8h,#3           // v19 = (v16 + 4) >> 3
    ldrb        w3,[x4,#0]                  // x3 = column 3 of row 0
    ldrb        w8,[x4,x10]                 // x8 = column 3 of row 3
    subs        x8,x8,x3
    umin        v18.8b, v19.8b , v30.8b
    csneg       x8,x8,x8,pl                 // x8 = abs(row3 - row0)
    uaddl       v6.8h, v25.8b , v24.8b      // v6 = p0 + p1
    subs        x2,x2,x7
    umax        v3.8b, v18.8b , v31.8b      // v3 = strong-filtered row 2
    csneg       x2,x2,x2,pl                 // x2 = abs(row-4 - row-1)
    uaddw       v8.8h, v6.8h , v26.8b       // v8 = p1 + p0 + q0
    add         x8,x8,x2
    uqadd       v30.8b, v25.8b , v1.8b      // v30 = p0 + 2tc
    cmp         x8,x5,asr #3                // vs beta >> 3
    uqsub       v31.8b, v25.8b , v1.8b      // v31 = p0 - 2tc
    bge         l1.1840
    mul         v12.8h, v8.8h, v0.4h[0]     // v12 = 2 * (p1 + p0 + q0)
    subs        x7,x3,x7
    uqadd       v16.8b, v24.8b , v1.8b      // v16 = p1 + 2tc
    csneg       x7,x7,x7,pl                 // x7 = abs(row0 - row-1)
    uaddl       v10.8h, v23.8b , v27.8b     // v10 = p2 + q1
    mov         x10,#5
    uqsub       v17.8b, v24.8b , v1.8b      // v17 = p1 - 2tc
    mul         x10, x10, x6
    add         v12.8h, v12.8h , v10.8h     // v12 = p2 + 2p1 + 2p0 + 2q0 + q1
    add         x10, x10,#1
    rshrn       v20.8b, v12.8h,#3           // v20 = (v12 + 4) >> 3
    cmp         x7,x10,asr #1               // vs (5 * tc + 1) >> 1
    uaddw       v14.8h, v8.8h , v23.8b      // v14 = p2 + p1 + p0 + q0
    bge         l1.1840

    // All tests passed: de = 2 selects the strong filter.
    umin        v18.8b, v20.8b , v30.8b
    mov         x2,#2                       // x2 = de = 2
    uqadd       v30.8b, v23.8b , v1.8b      // v30 = p2 + 2tc
    mov         w4,w21                      // x4 = filter flag p
    umax        v2.8b, v18.8b , v31.8b      // v2 = strong-filtered row -1
    mov         w5,w22                      // x5 = filter flag q (beta no longer needed)
    rshrn       v21.8b, v14.8h,#2           // v21 = (v14 + 2) >> 2, row -2 before clipping
    b           end_dep_deq_decision_horz

    // ---- normal filter: decide dep / deq ---------------------------------
    // On entry: x5 = beta, x6 = tc, x14 = dp, x12 = dq.
    // threshold = (beta + (beta >> 1)) >> 3
    // dep = (threshold > dp), deq = (threshold > dq); both forced to 0 when
    // tc == 1. A side whose flag is clear gets its value forced to 0.
l1.1840:
    mov         x2,#1                       // x2 = de = 1
    mov         x11,x5                      // x11 = beta (x5 is reused for flag q)
    mov         w4,w21                      // x4 = filter flag p
    mov         w5,w22                      // x5 = filter flag q
    cmp         x6,#1
    mov         x20,#0
    csel        x9, x20, x9,eq              // tc == 1 : dep = 0
    mov         x20,#0
    csel        x10, x20, x10,eq            // tc == 1 : deq = 0
    beq         end_dep_deq_decision_horz
    and         x7,x4,x5
    cmp         x7,#1
    beq         both_flags_set_horz
    cmp         x4,#0
    beq         set_flag_dep_zero_horz
    // flag p set, flag q not set: only dep is evaluated.
    add         x8,x11,x11,asr #1
    mov         x10,#0                      // deq = 0
    asr         x8,x8,#3                    // x8 = threshold
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt
    mov         x20,#0
    csel        x9, x20, x9,le              // dep = (threshold > dp)
    b           end_dep_deq_decision_horz
set_flag_dep_zero_horz:
    // flag p clear: only deq is evaluated.
    add         x8,x11,x11,asr #1
    mov         x9,#0                       // dep = 0
    asr         x8,x8,#3                    // x8 = threshold
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt
    mov         x20,#0
    csel        x10, x20, x10,le            // deq = (threshold > dq)
    b           end_dep_deq_decision_horz
both_flags_set_horz:
    add         x8,x11,x11,asr #1
    asr         x8,x8,#3                    // x8 = threshold
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt
    mov         x20,#0
    csel        x9, x20, x9,le              // dep = (threshold > dp)
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt
    mov         x20,#0
    csel        x10, x20, x10,le            // deq = (threshold > dq)

end_dep_deq_decision_horz:
    // x0 = source address, x1 = stride, x2 = de, x4 = flag p, x5 = flag q,
    // x6 = tc, x9 = dep, x10 = deq.
    // The two NEON instructions below belong to the strong filter; when
    // de == 1 their results are simply never used.
    umin        v18.8b, v21.8b , v16.8b     // min(row -2 candidate, p1 + 2tc)
    cmp         x2,#1
    uqsub       v31.8b, v23.8b , v1.8b      // v31 = p2 - 2tc
    beq         l1.2408                     // de == 1 : normal filter

    // ---- strong filter stores ---------------------------------------------
    uaddl       v8.8h, v23.8b , v22.8b      // v8 = p2 + p3
    cmp         x5,#1
    bne         strong_filtering_check_p    // flag q clear: skip the q stores, still test flag p
strong_filtering_q:
    mov         x12,x0
    st1         {v4.s}[0],[x12],x1          // row 0
    st1         {v5.s}[0],[x12],x1          // row 1
    st1         {v3.s}[0],[x12]             // row 2
strong_filtering_check_p:
    // The p side is written only when its flag is 1, whichever way this
    // point was reached (same rule as the normal filter path).
    cmp         x4,#1
    bne         l1.2404
strong_filtering_p:
    umax        v5.8b, v18.8b , v17.8b      // v5 = strong-filtered row -2
    mov         x12,x0
    mul         v8.8h, v8.8h, v0.4h[0]      // v8 = 2 * (p2 + p3)
    neg         x11, x1                     // x11 = -stride
    add         v16.8h, v8.8h , v14.8h      // v16 = 2p3 + 3p2 + p1 + p0 + q0
    add         x12,x12,x11                 // x12 -> row -1
    rshrn       v19.8b, v16.8h,#3           // v19 = (v16 + 4) >> 3
    st1         {v2.s}[0],[x12],x11         // row -1
    umin        v18.8b, v19.8b , v30.8b
    st1         {v5.s}[0],[x12],x11         // row -2
    umax        v3.8b, v18.8b , v31.8b      // v3 = strong-filtered row -3
    st1         {v3.s}[0],[x12]             // row -3

    // ---- common exit --------------------------------------------------------
l1.2404:
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret

    // ---- normal filter --------------------------------------------------------
    // x4 = flag p, x5 = flag q, x6 = tc, x9 = dep, x10 = deq
    // v23 = row -3, v24 = row -2, v25 = row -1, v26 = row 0, v27 = row 1,
    // v28 = row 2
l1.2408:
    // delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4
    movi        v0.4h, #0x9
    usubl       v10.8h, v26.8b , v25.8b
    mul         v10.8h, v10.8h, v0.4h[0]    // 9 * (row0 - row-1)
    movi        v0.4h, #0x3
    usubl       v12.8h, v27.8b , v24.8b
    mul         v12.8h, v12.8h, v0.4h[0]    // 3 * (row1 - row-2)
    dup         v30.8b,w6                   // v30 = +tc
    neg         x12, x6
    dup         v31.8b,w12                  // v31 = -tc
    sub         v10.8h, v10.8h , v12.8h
    srshr       v10.8h, v10.8h,#4           // v10 = delta (16 bit lanes)
    abs         v8.8h, v10.8h
    xtn         v9.8b, v8.8h                // v9 = abs(delta), narrowed to bytes
    sqxtn       v10.8b, v10.8h              // v10 = delta saturated to signed bytes
    smin        v11.8b, v10.8b , v30.8b
    smax        v8.8b, v31.8b , v11.8b      // v8 = clip3(delta, -tc, tc)
    uxtl        v6.8h, v25.8b
    saddw       v4.8h, v6.8h , v8.8b
    sqxtun      v12.8b, v4.8h               // v12 = clip_u8(row-1 + delta)
    uxtl        v6.8h, v26.8b
    ssubw       v4.8h, v6.8h , v8.8b
    sqxtun      v13.8b, v4.8h               // v13 = clip_u8(row0 - delta)
    mov         x11,#0xa
    mul         x12, x11, x6
    dup         v2.8b,w12                   // v2 = 10 * tc
    mov         v18.8b, v24.8b              // row -2 output defaults to the unfiltered row
    dup         v0.8b,w6
    sshr        v0.8b,v0.8b,#1              // v0 = tc >> 1
    neg         v1.8b, v0.8b                // v1 = -(tc >> 1)
    cmp         x4,#1
    bne         l1.2724                     // flag p clear : leave the p side untouched
    cmp         x9,#1
    bne         l1.2700                     // dep == 0 : row -2 stays unchanged
    // delta_p = clip3((((row-3 + row-1 + 1) >> 1) - row-2 + delta) >> 1,
    //                 -(tc >> 1), tc >> 1)
    uaddl       v14.8h, v23.8b , v25.8b
    rshrn       v14.8b, v14.8h,#1
    usubl       v14.8h, v14.8b , v24.8b
    saddw       v14.8h, v14.8h , v8.8b
    sqshrn      v14.8b, v14.8h,#1
    smin        v15.8b, v14.8b , v0.8b
    smax        v14.8b, v1.8b , v15.8b      // v14 = delta_p
    uxtl        v16.8h, v24.8b
    saddw       v16.8h, v16.8h , v14.8b
    sqxtun      v14.8b, v16.8h              // v14 = clip_u8(row-2 + delta_p)
    cmhs        v18.8b,v9.8b,v2.8b          // mask: abs(delta) >= 10 * tc
    bsl         v18.8b,v24.8b,v14.8b        // keep the original row -2 where the mask is set
l1.2700:
    mov         x12,x0
    neg         x11, x1                     // x11 = -stride
    add         x12,x12,x11                 // x12 -> row -1
    cmhs        v19.8b,v9.8b,v2.8b          // mask: abs(delta) >= 10 * tc
    bsl         v19.8b,v25.8b,v12.8b        // keep the original row -1 where the mask is set
    st1         {v19.s}[0],[x12],x11        // row -1
    st1         {v18.s}[0],[x12]            // row -2
l1.2724:
    cmp         x5,#1
    bne         l1.2404                     // flag q clear : done
    cmp         x10,#1
    mov         v18.8b, v27.8b              // row 1 output defaults to the unfiltered row
    bne         l1.2852                     // deq == 0 : row 1 stays unchanged
    // delta_q = clip3((((row0 + row2 + 1) >> 1) - row1 - delta) >> 1,
    //                 -(tc >> 1), tc >> 1)
    uaddl       v14.8h, v26.8b , v28.8b
    rshrn       v14.8b, v14.8h,#1
    usubl       v14.8h, v14.8b , v27.8b
    ssubw       v14.8h, v14.8h , v8.8b
    sqshrn      v14.8b, v14.8h,#1
    smin        v15.8b, v14.8b , v0.8b
    smax        v14.8b, v1.8b , v15.8b      // v14 = delta_q
    uxtl        v16.8h, v27.8b
    saddw       v16.8h, v16.8h , v14.8b
    sqxtun      v14.8b, v16.8h              // v14 = clip_u8(row1 + delta_q)
    cmhs        v18.8b,v9.8b,v2.8b          // mask: abs(delta) >= 10 * tc
    bsl         v18.8b,v27.8b,v14.8b        // keep the original row 1 where the mask is set
l1.2852:
    mov         x12,x0
    cmhs        v19.8b,v9.8b,v2.8b          // mask: abs(delta) >= 10 * tc
    bsl         v19.8b,v26.8b,v13.8b        // keep the original row 0 where the mask is set
    st1         {v19.s}[0],[x12],x1         // row 0
    st1         {v18.s}[0],[x12]            // row 1
    b           l1.2404                     // restore registers and return
|