summaryrefslogtreecommitdiffstats
path: root/common/arm64/ihevc_deblk_luma_vert.s
blob: 4379a693d4159aa962f000cbbce7c9a7d8a79914 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevc_deblk_luma_vert.s
//*
//* //brief
//*  contains the function definition for luma deblocking across a vertical
//* edge. the function is coded in armv8 (aarch64) neon assembly and uses
//* gnu assembler syntax.
//*
//*
//* //author
//*  anand s
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************/

// Code goes into the .text section. On ARM/AArch64 targets GNU as treats the
// .align operand as a power of two, so this presumably requests 16-byte
// alignment - confirm against the assembler actually used.
.text
.align 4



// Lookup tables defined elsewhere; the function below indexes both with
// 4-byte entries (ldr w, [base, index, lsl #2]).
.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table

// Export the entry point and mark the symbol as a function.
.globl ihevc_deblk_luma_vert_av8

.type ihevc_deblk_luma_vert_av8, %function

//------------------------------------------------------------------------------
// ihevc_deblk_luma_vert_av8
//
// Deblocks four rows of luma samples across a vertical edge. x0 addresses
// column 0 of row 0; columns -4..-1 are the "p" side (p3 p2 p1 p0) and
// columns 0..3 the "q" side (q0 q1 q2 q3). Rows are x1 bytes apart.
//
// Inputs, as consumed by the code below (the C prototype is not visible in
// this file; names are taken from the formulas quoted in the comments):
//   x0    pu1_src            source address
//   x1    src_strd           source stride
//   x2    bs                 only (bs >> 1) is used
//   x3    quant_param_p
//   x4    quant_param_q
//   w5    beta_offset_div2   sign-extended on entry
//   w6    tc_offset_div2     sign-extended on entry
//   x7    filter flag p      copied to x21
//   [sp]  filter flag q      loaded as a 32-bit word into x22
//
// Flow:
//   1. beta/tc table lookup; return at once when tc == 0
//   2. load and transpose the 4x8 block, compute dp0/dq0/dp3/dq3 from rows
//      0 and 3; return when d >= beta
//   3. strong-filter decision on rows 0 and 3 (de = 2), otherwise de = 1 with
//      the dep/deq side decisions
//   4. strong filter (falls into l1.964) or weak filter (l1.968, own epilogue)
//
// Register save: d8-d15 and x19-x22 are pushed (96 bytes). Of v8-v15 only v15
// is written by this routine.
//
// NOTE(review): only w5/w6 are sign-extended; x1..x4 and x7 are used as full
// 64-bit values. If these are 32-bit C arguments this relies on the caller
// leaving the upper 32 bits clean - TODO confirm against the C declaration.
//------------------------------------------------------------------------------
ihevc_deblk_luma_vert_av8:

    sxtw        x5,w5                       // beta_offset_div2 -> 64-bit signed
    sxtw        x6,w6                       // tc_offset_div2   -> 64-bit signed
    stp         d8,d9,[sp,#-16]!            // six 16-byte pushes = 96 bytes
    stp         d10,d11,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    mov         x21,x7                      // x21 = filter flag p (x7 is reused below)
    ldr         w22,[sp,#96]                // x22 = filter flag q: the word that was at [sp] on entry
    add         x3,x3,x4
    add         x3,x3,#1
    asr         x3,x3,#1                    // x3 = qp_luma
    add         x7,x3,x5,lsl #1             // x7 = qp_luma + (beta_offset_div2 << 1)
    add         x3,x3,x6,lsl #1             // x3 = qp_luma + (tc_offset_div2 << 1)
    cmp         x7,#0x33
    mov         x20,#0x33
    csel        x7, x20, x7,gt              // upper clip at 51
    bgt         l1.56                       // flags still come from the cmp above; lower clip not needed
    cmp         x7,#0x0
    mov         x20,#0x0
    csel        x7, x20, x7,lt              // x7 has the beta_index value
l1.56:

//     bic      x2,x2,#1
    asr         x2,x2,#1                    // x2 = bs >> 1

    add         x3,x3,x2,lsl #1             // x3 += 2 * (bs >> 1)
    cmp         x3,#0x35
    mov         x20,#0x35
    csel        x3, x20, x3,gt              // upper clip at 53
    bgt         l1.88                       // flags still come from the cmp above
    cmp         x3,#0x0
    mov         x20,#0x0
    csel        x3, x20, x3,lt              // x3 has the tc_index value

//    qp_luma = (quant_param_p + quant_param_q + 1) >> 1//
//    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)//
//    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)//

l1.88:
    adrp        x2, :got:gai4_ihevc_beta_table
    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]  // x2 = address of the beta table, via the GOT

    movi        v18.8b, #0x2                // byte constant 2 (strong-filter weight)
    adrp        x4, :got:gai4_ihevc_tc_table
    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]    // x4 = address of the tc table, via the GOT

    ldr         w5,[x2,x7,lsl #2]           // beta
    movi        v16.8h, #0x2                // halfword constant 2 (strong-filter weight)
    ldr         w6,[x4,x3,lsl #2]           // tc
    lsl         x8,x6,#1
    cmp         x6,#0
    dup         v19.8b,w8                   // v19 = 2 * tc in every byte
    sub         x7,x0,#4                    // x7 -> column -4 of row 0
    movi        v23.8b, #0x3                // byte constant 3 (strong-filter weight)
    beq         l1.964                      // tc == 0: nothing to do


// Load rows 0..3 (columns -4..3) for the vector path while the scalar loads
// fetch the row-0 samples needed for dp0/dq0.
    sub         x19,x0,#3
    ld1         {v15.8b},[x7],x1            // row 0
    ldrb        w8,[x19]                    // -3 value
    ld1         {v1.8b},[x7],x1             // row 1
    ldrb        w10,[x19,#1]                //-2 value
    ld1         {v29.8b},[x7],x1            // row 2
    ldrb        w11,[x19,#2]                //-1 value
    ld1         {v0.8b},[x7]                // row 3
    ldrb        w12,[x0,#0]                 // 0 value
    ldrb        w9,[x0,#1]                  // 1 value
// 4x8 -> 8x4 transpose, step 1: interleave bytes of rows 0/1 and rows 2/3.
    trn1        v24.8b,v15.8b,v1.8b
    trn2        v1.8b,v15.8b,v1.8b
    ldrb        w2,[x0,#2]                  // 2 value
    trn1        v2.8b,v29.8b,v0.8b
    trn2        v0.8b,v29.8b,v0.8b
    add         x12,x12,x2
    subs        x9,x12,x9,lsl #1            // dq0 value is stored in x9
    csneg       x9,x9,x9,pl                 // absolute value
//dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )//
// Transpose, step 2: interleave halfwords. Afterwards each 32-bit lane holds
// one column (rows 0..3):
//   v24.s[0] = col -4 (p3)   v24.s[1] = col 0 (q0)
//   v1.s[0]  = col -3 (p2)   v1.s[1]  = col 1 (q1)
//   v2.s[0]  = col -2 (p1)   v2.s[1]  = col 2 (q2)
//   v0.s[0]  = col -1 (p0)   v0.s[1]  = col 3 (q3)
    mov         v29.8b,v24.8b
    trn1        v24.4h,v29.4h,v2.4h
    trn2        v2.4h,v29.4h,v2.4h
    add         x8,x8,x11
    mov         v15.8b,v1.8b
    trn1        v1.4h,v15.4h,v0.4h
    trn2        v0.4h,v15.4h,v0.4h
    subs        x8,x8,x10,lsl #1            // dp0 value is stored in x8
    csneg       x8,x8,x8,pl                 // absolute value
//  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )//



    add         x14,x1,x1,lsl #1            // 3 * stride
    add         x14,x0,x14                  // x14 -> column 0 of row 3

// Scalar loads of row 3 interleaved with broadcasting each column into its
// own vector register.
    sub         x19,x14,#3
    dup         v4.2s, v24.s[1]             // v4 = q0
    ldrb        w2,[x19]                    // -3 value
    dup         v7.2s, v2.s[1]              // v7 = q2
    ldrb        w10,[x19,#1]                // -2 value
    dup         v3.2s, v2.s[0]              // v3 = p1
    ldrb        w11,[x19,#2]                // -1 value
    dup         v5.2s, v1.s[1]              // v5 = q1
    ldrb        w12,[x14,#0]                // 0 value
    dup         v6.2s, v1.s[0]              // v6 = p2
    ldrb        w3,[x14,#1]                 // 1 value
    dup         v2.2s, v0.s[0]              // v2 = p0
    ldrb        w4,[x14,#2]                 // 2 value


    add         x12,x12,x4
    subs        x12,x12,x3,lsl #1           // dq3value is stored in x12
    csneg       x12,x12,x12,pl              // absolute value
//    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )//


    add         x2,x2,x11
    subs        x11,x2,x10,lsl #1
    csneg       x11,x11,x11,pl              // dp3 value is stored in x11
//    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )//



    add         x3,x8,x9                    // x3 has the d0 value
    add         x4,x11,x12                  // x4 has the d3 value


//    d0 = dp0 + dq0//
//    d3 = dp3 + dq3//

    add         x14,x8,x11                  // x14 has the value dp
    add         x12,x12,x9                  // x12 has the value  dq
//    dp = dp0 + dp3//
//   dq = dq0 + dq3//

    add         x11, x3, x4                 // x11 has the value d

//   d = d0 + d3//


    cmp         x11,x5
    dup         v22.2s, v0.s[1]             // v22 = q3
    bge         l1.964                      // d >= beta: leave the edge untouched

//    if(d < beta)


    // registers that are still read further down and must not be altered here:
    // x0,x1 (source, stride), x3,x4 (d0,d3), x5 (beta), x6 (tc), x12 (dq), x14 (dp)

    // registers for use: x2,x7,x8,x9,x10,

// Strong-filter decision for row 0, interleaved with the start of the strong
// q-side filter. Any failed test branches to l1.336 (de = 1) and the partial
// vector results are simply not used.
    uqsub       v30.8b,v7.8b,v19.8b         // q2 - 2*tc (saturating)
    asr         x10,x5,#2                   // beta >> 2
    uqadd       v31.8b,v7.8b,v19.8b         // q2 + 2*tc (saturating)
    cmp         x10,x3,lsl #1
    uaddl       v0.8h,v5.8b,v4.8b           // q1 + q0
    ble         l1.336                      // (beta >> 2) <= 2 * d0

    sub         x19,x0,4
    ldrb        w2,[x19]                    // row 0, -4 value
    uaddw       v0.8h,  v0.8h ,  v2.8b      // p0 + q0 + q1
    ldrb        w7,[x19,#3]                 // row 0, -1 value
    umull       v20.8h, v7.8b, v23.8b       // 3 * q2
    ldrb        w3,[x0,#0]                  // row 0, 0 value
    umlal       v20.8h, v22.8b, v18.8b      // + 2 * q3
    ldrb        w8,[x0,#3]                  // row 0, 3 value
//   ubfx   x7,x2,#24,#8           // has the -1 value
//  and    x2,#0xff               // has the -4 value
//  ubfx   x8,x3,#24,#8           // has the 3 value
//  and    x3,#0xff               // x4 has the 0 value

    add         v20.8h,  v20.8h ,  v0.8h    // 2*q3 + 3*q2 + q1 + q0 + p0
    subs        x8,x8,x3
    rshrn       v22.8b,v20.8h,#3            // rounded >> 3: new q2 before clipping
    csneg       x8,x8,x8,pl                 // abs(src[3] - src[0])
    subs        x2,x2,x7
    umin        v21.8b,  v22.8b ,  v31.8b
    csneg       x2,x2,x2,pl                 // abs(src[-4] - src[-1])
    umax        v22.8b,  v21.8b ,  v30.8b   // v22 = new q2 clipped to q2 +/- 2*tc
    add         x8,x8,x2
    uaddl       v20.8h,v7.8b,v3.8b          // q2 + p1
    cmp         x8,x5,asr #3
    mla         v20.8h, v0.8h, v16.8h       // + 2 * (p0 + q0 + q1)
    bge         l1.336                      // sum of abs differences >= (beta >> 3)
    uaddw       v0.8h,  v0.8h ,  v7.8b      // p0 + q0 + q1 + q2
    subs        x7,x3,x7
    rshrn       v20.8b,v20.8h,#3            // new q0 before clipping
    csneg       x7,x7,x7,pl                 // abs(src[0] - src[-1])
    rshrn       v0.8b,v0.8h,#2              // new q1 before clipping
    mov         x10,#5
    uqadd       v30.8b,v5.8b,v19.8b         // q1 + 2*tc
    mul         x10, x10, x6
    uqsub       v31.8b,v5.8b,v19.8b         // q1 - 2*tc
    add         x10, x10,#1                 // 5 * tc + 1
    cmp         x7,x10,asr #1
    bge         l1.336                      // abs(src[0] - src[-1]) >= ((5*tc + 1) >> 1)


// Same decision for row 3 (x4 = d3, samples read relative to x0 + 3*stride):
//        if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
//            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )


    asr         x10,x5,#2                   // beta >> 2
    uqsub       v25.8b,v4.8b,v19.8b         // q0 - 2*tc
    cmp         x10,x4,lsl #1
    uqadd       v21.8b,v4.8b,v19.8b         // q0 + 2*tc
    ble         l1.336                      // (beta >> 2) <= 2 * d3
    umin        v26.8b,  v20.8b ,  v21.8b
    add         x4,x1,x1,lsl #1
    add         x4,x4,x0                    // x4 -> column 0 of row 3
    umax        v20.8b,  v26.8b ,  v25.8b   // v20 = new q0 clipped to q0 +/- 2*tc
    sub         x19,x4,#4
    ldrb        w2,[x19]                    // row 3, -4 value
    umin        v19.8b,  v0.8b ,  v30.8b
    ldrb        w7,[x19,#3]                 // row 3, -1 value
    umax        v21.8b,  v19.8b ,  v31.8b   // v21 = new q1 clipped to q1 +/- 2*tc
    ldrb        w3,[x4,#0]                  // row 3, 0 value
    lsl         x10,x6,#1
    ldrb        w8,[x4,#3]                  // row 3, 3 value
//   ubfx   x7,x2,#24,#8           // has the -1 value
//  and    x2,#0xff               // has the -4 value
//  ubfx   x8,x3,#24,#8           // has the 3 value
//  and    x3,#0xff               // x4 has the 0 value
    uaddl       v0.8h,v2.8b,v3.8b           // p0 + p1
    dup         v19.8b,w10                  // v19 was used as scratch above: reload 2*tc
    subs        x8,x8,x3
    uaddw       v0.8h,  v0.8h ,  v4.8b      // p1 + p0 + q0
    csneg       x8,x8,x8,pl
    uqadd       v30.8b,v2.8b,v19.8b         // p0 + 2*tc
    subs        x2,x2,x7
    uqsub       v31.8b,v2.8b,v19.8b         // p0 - 2*tc
    csneg       x2,x2,x2,pl
    uaddl       v26.8h,v5.8b,v6.8b          // q1 + p2
    add         x8,x8,x2
    mla         v26.8h, v0.8h, v16.8h       // + 2 * (p1 + p0 + q0)
    cmp         x8,x5,asr #3
    bge         l1.336
    rshrn       v26.8b,v26.8h,#3            // new p0 before clipping
    subs        x7,x3,x7
    uqadd       v27.8b,v3.8b,v19.8b         // p1 + 2*tc
    csneg       x7,x7,x7,pl
    uqsub       v28.8b,v3.8b,v19.8b         // p1 - 2*tc
    mov         x10,#5
    umin        v16.8b,  v26.8b ,  v30.8b   // v16 (constant 2) is no longer needed and becomes scratch
    mul         x10, x10, x6
    add         x10, x10,#1
    cmp         x7,x10,asr #1
    umax        v26.8b,  v16.8b ,  v31.8b   // v26 = new p0 clipped to p0 +/- 2*tc
    bge         l1.336
    uqadd       v30.8b,v6.8b,v19.8b         // p2 + 2*tc

    mov         x2,#2                       // de = 2: both rows passed, use the strong filter
    mov         x4,x21                      // x4 = filter flag p
    uqsub       v31.8b,v6.8b,v19.8b         // p2 - 2*tc
    mov         x5,x22                      // x5 = filter flag q (beta is not needed any more)
    b           end_dep_deq_decision
// x2 has the value of de
// x6 has the value of tc
// x5 has the value of beta
// x14 has the value of dp
// x12 has the value of dq
// x0 has the value of source address
// x1 has the src stride

l1.336:
    mov         x2,#1                       // de = 1: weak filter
l1.424:                                     // no branch in this file targets this label
    mov         x11,x5                      // keep beta in x11
    mov         x4,x21                      // x4 = filter flag p
    mov         x5,x22                      // x5 = filter flag q

    cmp         x6,#1
    mov         x20,#0
    csel        x9, x20, x9,eq              // tc == 1: dep = 0
    mov         x20,#0
    csel        x10, x20, x10,eq            // tc == 1: deq = 0
    beq         end_dep_deq_decision

    and         x7,x4,x5

    cmp         x7,#1
    beq         both_flags_set
    cmp         x4,#0
    beq         set_flag_dep_zero


// flag p set, flag q not taken into account: only dep is evaluated
    add         x8,x11,x11,asr #1           // beta + (beta >> 1)
    mov         x10,#0                      // deq = 0
    asr         x8,x8,#3                    // threshold = (beta + (beta >> 1)) >> 3
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt              // dep = 1 when dp < threshold
    mov         x20,#0
    csel        x9, x20, x9,le              // dep = 0 otherwise
    b           end_dep_deq_decision
set_flag_dep_zero:

// flag p is zero: dep = 0, only deq is evaluated
    add         x8,x11,x11,asr #1
    mov         x9,#0                       // dep = 0
    asr         x8,x8,#3
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt            // deq = 1 when dq < threshold
    mov         x20,#0
    csel        x10, x20, x10,le            // deq = 0 otherwise
    b           end_dep_deq_decision

both_flags_set:
    add         x8,x11,x11,asr #1
    asr         x8,x8,#3
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt              // dep = (dp < threshold)
    mov         x20,#0
    csel        x9, x20, x9,le
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt            // deq = (dq < threshold)
    mov         x20,#0
    csel        x10, x20, x10,le
end_dep_deq_decision:

//x0=source address
//x1=stride
// x2 =de
// x4=flag p
//x5= flag q
//x6 =tc
// x9 =dep
// x10=deq
//    b    l1.964


    cmp         x2,#2
// x2 has the value of de
    bne         l1.968                      // de != 2: weak filter

// ---- strong filter ----
    cmp         x5,#0
    beq         l1.780                      // flag q clear: leave the q side untouched
// x5 has the flag of q

    add         x3,x0,#2
    st1         {v22.b}[0],[x3],x1          // new q2 -> column 2, rows 0..3

    st1         {v22.b}[1],[x3],x1

    st1         {v22.b}[2],[x3],x1

    st1         {v22.b}[3],[x3]
    add         x3,x0,x1
    mov         v29.8b,v20.8b
    trn1        v20.8b,v29.8b,v21.8b        // interleave new q0/q1 so each halfword is one row's pair
    trn2        v21.8b,v29.8b,v21.8b

    st1         {v20.h}[0],[x0]             // row 0, columns 0..1
    st1         {v21.h}[0],[x3],x1          // row 1
    st1         {v20.h}[1],[x3],x1          // row 2
    st1         {v21.h}[1],[x3]             // row 3


l1.780:
    cmp         x4,#0
    beq         l1.964                      // flag p clear: done
    // x4 has the flag p


    dup         v7.2s, v24.s[0]             // v7 = p3
    sub         x3,x0,#1
    uaddw       v16.8h,  v0.8h ,  v6.8b     // p2 + p1 + p0 + q0
    add         x7,x3,x1
    rshrn       v2.8b,v16.8h,#2             // new p1 before clipping
    st1         {v26.b}[0],[x3]             // new p0 -> column -1, rows 0..3
    sub         x0,x0,#3                    // x0 -> column -3
    umin        v16.8b,  v2.8b ,  v27.8b
    st1         {v26.b}[1],[x7],x1
    umull       v2.8h, v6.8b, v23.8b        // 3 * p2
    umlal       v2.8h, v7.8b, v18.8b        // + 2 * p3
    st1         {v26.b}[2],[x7],x1
    umax        v5.8b,  v16.8b ,  v28.8b    // v5 = new p1 clipped to p1 +/- 2*tc
    st1         {v26.b}[3],[x7]
    add         v0.8h,  v2.8h ,  v0.8h      // 2*p3 + 3*p2 + p1 + p0 + q0
    rshrn       v0.8b,v0.8h,#3              // new p2 before clipping


    umin        v1.8b,  v0.8b ,  v30.8b
    umax        v0.8b,  v1.8b ,  v31.8b     // v0 = new p2 clipped to p2 +/- 2*tc

    mov         v29.8b,v0.8b
    trn1        v0.8b,v29.8b,v5.8b          // interleave new p2/p1 into per-row halfwords
    trn2        v5.8b,v29.8b,v5.8b
    st1         {v0.h}[0],[x0],x1           // columns -3..-2, rows 0..3
    st1         {v5.h}[0],[x0],x1
    st1         {v0.h}[1],[x0],x1
    st1         {v5.h}[1],[x0]
// Common exit: restore in reverse order of the pushes.
l1.964:
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret

// ---- weak filter (de == 1) ----
l1.968:


    movi        v0.8h, #0x9
    neg         x11, x6                     // -tc
    cmp         x4,#0                       // flags are consumed by the "beq l1.1272" further down;
    // checks for the flag p                // nothing in between modifies them
    movi        v16.8h, #0x3
    movi        v24.8b, #0x1


    dup         v30.8b,w11                  // v30 = -tc
    and         x11,x6,#0xff
    dup         v31.8b,w11                  // v31 = tc

    usubl       v18.8h,v4.8b,v2.8b          // q0 - p0
    mul         v18.8h, v18.8h, v0.8h       // 9 * (q0 - p0)
    usubl       v0.8h,v5.8b,v3.8b           // q1 - p1



    mul         v16.8h, v0.8h, v16.8h       // 3 * (q1 - p1)
    sub         v16.8h,  v18.8h ,  v16.8h
    srshr       v16.8h,v16.8h,#4            // rounding shift supplies the "+ 8"
//   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4//

    abs         v0.8h, v16.8h
    xtn         v0.8b,  v0.8h
    // storing the absolute values of delta in v0 (8-bit lanes)

    sqxtn       v16.8b,v16.8h
    // storing the delta values, saturated to signed 8 bits, in v16

    movi        v1.8b, #0xa
    dup         v21.8b,w11
    mul         v1.8b, v1.8b, v21.8b
    // v1 stores the value (10 * tc) in 8-bit lanes

//if(abs(delta) < 10 * tc)

    smin        v18.8b,  v16.8b ,  v31.8b
    smax        v20.8b,  v18.8b ,  v30.8b   // v20 = clipped delta

// delta = clip3(delta, -tc, tc)//
    sxtl        v16.8h, v20.8b
    uxtl        v18.8h, v2.8b
    add         v18.8h,  v18.8h ,  v16.8h   // p0 + delta

    sqxtun      v22.8b, v18.8h              // v22 = tmp_p0
    uxtl        v18.8h, v4.8b
    sub         v16.8h,  v18.8h ,  v16.8h   // q0 - delta
    sqxtun      v23.8b, v16.8h              // v23 = tmp_q0
// tmp_p0 = clip_u8(pu1_src[-1] + delta)//
//  tmp_q0 = clip_u8(pu1_src[0] - delta)//
    beq         l1.1272                     // flag p == 0 (cmp x4,#0 above): skip the p side



    cmp         x9,#1
    bne         l1.1212
// checks for the flag dep

// dep == 1: also modify p1.
// delta_p = clip3((((p2 + p0 + 1) >> 1) - p1 + delta) >> 1, -(tc >> 1), tc >> 1)
    asr         x3,x6,#1                    // tc >> 1


    uaddl       v16.8h,v6.8b,v2.8b          // p2 + p0
    uaddw       v16.8h,  v16.8h ,  v24.8b   // + 1
    dup         v18.8b,w3                   // v18 = tc >> 1
    sub         x20,x3,#0
    neg         x3, x20
    dup         v19.8b,w3                   // v19 = -(tc >> 1)
    ushr        v16.8h,v16.8h,#1
    xtn         v16.8b,  v16.8h

    usubl       v16.8h,v16.8b,v3.8b         // - p1
    saddw       v16.8h,  v16.8h ,  v20.8b   // + delta
    sshr        v16.8h,v16.8h,#1
    sqxtn       v16.8b,v16.8h

    smin        v17.8b,  v16.8b ,  v18.8b
    smax        v16.8b,  v19.8b ,  v17.8b   // v16 = delta_p




    uxtl        v18.8h, v3.8b
    sxtl        v16.8h, v16.8b
    add         v16.8h,  v18.8h ,  v16.8h   // p1 + delta_p

    sqxtun      v16.8b, v16.8h              // clip to 0..255
    mov         v30.8b,v3.8b                // keep the original p1
    cmhs        v3.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc


    bsl         v3.8b,v30.8b,v16.8b         // mask set -> original p1, else filtered p1
l1.1212:
    dup         v16.8b,w11                  // value is overwritten by the cmhs below before any use
    sub         x12,x0,#3                   // x12 -> column -3 (dq in x12 is no longer needed)
    sub         x3,x0,#1                    // x3  -> column -1
//     smul v16.8b, v16.8b, v1.8b
    mov         v29.8b,v6.8b
    trn1        v6.8b,v29.8b,v3.8b          // interleave p2/p1 into per-row halfwords
    trn2        v3.8b,v29.8b,v3.8b
    st1         {v6.h}[0],[x12],x1          // columns -3..-2, row 0
    cmhs        v16.8b,v0.8b,v1.8b          // mask: abs(delta) >= 10 * tc
    st1         {v3.h}[0],[x12],x1          // row 1
    bsl         v16.8b,v2.8b,v22.8b         // mask set -> original p0, else tmp_p0
    st1         {v16.b}[0],[x3],x1          // column -1, rows 0..3
    st1         {v16.b}[1],[x3],x1
    st1         {v6.h}[1],[x12],x1          // row 2
    st1         {v16.b}[2],[x3],x1
    st1         {v3.h}[1],[x12]             // row 3
    st1         {v16.b}[3],[x3]
l1.1272:
    cmp         x5,#0
    beq         l1.964
    // checks for the flag q
    cmp         x10,#1
    bne         l1.1412
    // checks for the flag deq
// deq == 1: also modify q1.
// delta_q = clip3((((q2 + q0 + 1) >> 1) - q1 - delta) >> 1, -(tc >> 1), tc >> 1)
    mov         v2.8b,v7.8b                 // q2
    asr         x3,x6,#1                    // tc >> 1

    dup         v6.8b,w3                    // v6 = tc >> 1
    sub         x20,x3,#0
    neg         x3, x20
    dup         v16.8b,w3                   // v16 = -(tc >> 1)
    uaddl       v2.8h,v2.8b,v4.8b           // q2 + q0
    uaddw       v2.8h,  v2.8h ,  v24.8b     // + 1
    ushr        v2.8h,v2.8h,#1
    xtn         v2.8b,  v2.8h

    usubl       v2.8h,v2.8b,v5.8b           // - q1
    ssubw       v2.8h,  v2.8h ,  v20.8b     // - delta
    sshr        v2.8h,v2.8h,#1
    sqxtn       v3.8b,v2.8h

    smin        v2.8b,  v3.8b ,  v6.8b
    smax        v3.8b,  v16.8b ,  v2.8b     // v3 = delta_q
    //  dup  v6.8b,w2
    //   smul v6.8b, v6.8b, v1.8b



    uxtl        v16.8h, v5.8b
    sxtl        v2.8h, v3.8b
    add         v2.8h,  v16.8h ,  v2.8h     // q1 + delta_q
    sqxtun      v3.8b, v2.8h                // clip to 0..255
    mov         v30.8b,v5.8b                // keep the original q1
    cmhs        v5.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc


    bsl         v5.8b,v30.8b,v3.8b          // mask set -> original q1, else filtered q1
l1.1412:
    //  dup  v2.8b,w2
    add         x3,x0,#2
    add         x11,x3,x1
    //   smul v1.8b, v2.8b, v1.8b
    st1         {v7.b}[0],[x3]              // column 2 (q2) is written back with its loaded value
    st1         {v7.b}[1],[x11],x1
    st1         {v7.b}[2],[x11],x1
    cmhs        v0.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc (abs(delta) itself is consumed)
    st1         {v7.b}[3],[x11]
    bsl         v0.8b,v4.8b,v23.8b          // mask set -> original q0, else tmp_q0
    mov         v29.8b,v0.8b
    trn1        v0.8b,v29.8b,v5.8b          // interleave q0/q1 into per-row halfwords
    trn2        v5.8b,v29.8b,v5.8b
    st1         {v0.h}[0],[x0],x1           // columns 0..1, rows 0..3
    st1         {v5.h}[0],[x0],x1
    st1         {v0.h}[1],[x0],x1
    st1         {v5.h}[1],[x0]

// Weak-filter exit: same restore sequence as at l1.964.
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret