summaryrefslogtreecommitdiffstats
path: root/common/arm64/ihevc_weighted_pred_bi.s
blob: c0508d8df482412b362d9e8707c0425f2bd43d3c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_weighted_pred_bi.s
//*
//* //brief
//*  contains function definitions for weighted prediction used in inter
//* prediction
//*
//* //author
//*  parthiban v
//*
//* //par list of functions:
//*  - ihevc_weighted_pred_bi()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  does bi-weighted prediction on the arrays pointed by  pi2_src1 and
//* pi2_src2 and stores it at location pointed  by pi2_dst   assumptions : the
//* function is optimized considering the fact width and  height are multiple
//* of 2.
//*
//* //par description:
//*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
//* off1 + 1) << (shift - 1) ) >> shift
//*
//* //param[in] pi2_src1
//*  pointer to source 1
//*
//* //param[in] pi2_src2
//*  pointer to source 2
//*
//* //param[out] pu1_dst
//*  pointer to destination
//*
//* //param[in] src_strd1
//*  source stride 1
//*
//* //param[in] src_strd2
//*  source stride 2
//*
//* //param[in] dst_strd
//*  destination stride
//*
//* //param[in] wgt0
//*  weight to be multiplied to source 1
//*
//* //param[in] off0
//*  offset 0
//*
//* //param[in] wgt1
//*  weight to be multiplied to source 2
//*
//* //param[in] off1
//*  offset 1
//*
//* //param[in] shift
//*  (14 bit depth) + log2_weight_denominator
//*
//* //param[in] lvl_shift1
//*  added before shift and offset
//*
//* //param[in] lvl_shift2
//*  added before shift and offset
//*
//* //param[in] ht
//*  height of the source
//*
//* //param[in] wd
//*  width of the source
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_weighted_pred_bi(word16 *pi2_src1,
//                            word16 *pi2_src2,
//                            uword8 *pu1_dst,
//                            word32 src_strd1,
//                            word32 src_strd2,
//                            word32 dst_strd,
//                            word32 wgt0,
//                            word32 off0,
//                            word32 wgt1,
//                            word32 off1,
//                            word32 shift,
//                            word32 lvl_shift1,
//                            word32 lvl_shift2,
//                            word32 ht,
//                            word32 wd)

//**************variables vs registers*****************************************
//    x0 => *pi2_src1
//    x1 => *pi2_src2
//    x2 => *pu1_dst
//    x3 =>  src_strd1
//    x4 =>  src_strd2
//    x5 =>  dst_strd
//    x6 =>  wgt0
//    x7 =>  off0
//    x8 =>  wgt1
//    x9 =>  off1
//    x10 =>  shift
//    x11 =>  lvl_shift1
//    x12 =>    lvl_shift2
//    x13 =>    ht
//    x14 =>    wd

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_weighted_pred_bi_av8

.type ihevc_weighted_pred_bi_av8, %function

ihevc_weighted_pred_bi_av8:

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

    // AAPCS64: the first eight arguments (pi2_src1, pi2_src2, pu1_dst,
    // src_strd1, src_strd2, dst_strd, wgt0, off0) arrive in x0-x7; the
    // remaining seven were spilled by the caller and are read from the
    // incoming stack before any local pushes move sp.
    ldr         w8,[sp,#0]                  //load wgt1
    ldr         w9,[sp,#8]                  //load off1
    ldr         w10,[sp,#16]                //load shift
    ldr         w11,[sp,#24]                //load lvl_shift1
    ldr         w12,[sp,#32]                //load lvl_shift2
    ldr         w13,[sp,#40]                //load ht
    ldr         w14,[sp,#48]                //load wd

    // sign-extend the 32-bit stack arguments for the 64-bit arithmetic below
    sxtw        x8,w8
    sxtw        x9,w9
    sxtw        x10,w10
    sxtw        x11,w11
    sxtw        x12,w12


    // x19-x26 are callee-saved under AAPCS64: preserve them before use
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    stp         x23, x24,[sp,#-16]!
    stp         x25, x26,[sp,#-16]!

    // park every argument in a high register so x0-x14 can be reused as
    // scratch inside the loops (trailing numbers look like legacy ARM32
    // stack offsets kept from the 32-bit version — informational only)
    mov         x15,x4 // src_strd2 40
    mov         x16,x5 // dst_strd 44
    mov         x17,x6 // wgt0 48
    mov         x19,x7 // off0 52
    mov         x20,x8 // wgt1 56
    mov         x21,x9 // off1 60
    mov         x22,x10 // shift 64
    mov         x23,x11 // lvl_shift1 68
    mov         x24,x12 // lvl_shift2 72
    mov         x25,x13 // ht 76
    mov         x26,x14 // wd 80

    // ---- set up the loop-invariant constants ----
    //   v7.h[0] = wgt0, v7.h[1] = wgt1   (scalar operands for smull)
    //   v30     = tmp_lvl_shift = lvl_shift1*wgt0 + lvl_shift2*wgt1
    //                             + ((off0 + off1 + 1) << (shift - 1))
    //   v28     = -shift  (sshl by a negative count = arithmetic right shift)
    mov         x6,x17                      //load wgt0
    mov         x11,x23                     //load lvl_shift1
    mov         x12,x24                     //load lvl_shift2
    mov         v7.h[0],w6                  //moved for scalar multiplication
    mul         x4, x11 , x6                //lvl_shift1 * wgt0
    mov         x8,x20                      //load wgt1
    mov         x7,x19                      //load off0
    mov         v7.h[1],w8                  //moved for scalar multiplication
    madd        x4,x12,x8,x4                //(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
    mov         x9,x21                      //load off1
    add         x5,x7,x9                    //off0 + off1
    mov         x10,x22                     //load shift
    add         x5,x5,#1                    //off0 + off1 + 1
    sub         x14,x10,#1                  //shift - 1
    mov         x7,x26                      //load wd
    lsl         x5,x5,x14                   //((off0 + off1 + 1) << (shift - 1))
    dup         v28.4s,w10                  //vmovq_n_s32(0-shift)
    add         x4,x4,x5                    //tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
    dup         v30.4s,w4                   //vmovq_n_s32(tmp_lvl_shift)
    neg         v28.4s, v28.4s              //negate so sshl performs >> shift
    mov         x4,x15                      //load src_strd2
    lsl         x9,x7,#1                    //x9 = 2*wd = width of one strip in bytes of int16
    mov         x5,x16                      //load dst_strd
    lsl         x3,x3,#1                    //src_strd1: elements -> bytes (16-bit source)
    mov         x14,x25                     //load ht
    lsl         x4,x4,#1                    //src_strd2: elements -> bytes (16-bit source)

    cmp         x14,#0                      //check ht == 0
    beq         end_loops                   //if equal, then end the function

outer_loop:
    // NOTE(review): nothing branches back to outer_loop; end_core_loop
    // reloads wd into x7 and jumps straight to core_loop, so this wd == 0
    // guard only runs once on entry
    cmp         x7,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function

core_loop:
    // one pass processes a 4-wide x 4-high tile:
    //   dst = sat_u8(( src1*wgt0 + src2*wgt1 + tmp_lvl_shift ) >> shift)
    // x0/x1 advance along row 0; x6/x8 step down rows ii-iv of the two
    // sources; x2 stores row 0 and x10 steps down dst rows ii-iv.  The
    // four iterations are software-pipelined (interleaved) to hide latency.
    add         x6,x0,x3                    //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         x8,x1,x4                    //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    ld1         {v0.4h},[x0],#8             //load and increment the pi2_src1
    add         x10,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
    ld1         {v1.4h},[x1],#8             //load and increment the pi2_src2
    smull       v4.4s, v0.4h, v7.4h[0]      //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
    ld1         {v2.4h},[x6],x3             //load and increment the pi2_src_tmp1 ii iteration
    smull       v5.4s, v1.4h, v7.4h[1]      //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
    ld1         {v3.4h},[x8],x4             //load and increment the pi2_src_tmp1 ii iteration
    add         v4.4s,  v4.4s ,  v5.4s      //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)

    ld1         {v0.4h},[x6],x3             //load and increment the pi2_src1 iii iteration
    smull       v6.4s, v2.4h, v7.4h[0]      //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration

    ld1         {v1.4h},[x8],x4             //load and increment the pi2_src2 iii iteration
    add         v4.4s,  v4.4s ,  v30.4s     //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    smull       v19.4s, v0.4h, v7.4h[0]     //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration

    ld1         {v2.4h},[x6],x3             //load and increment the pi2_src_tmp1 iv iteration
    smull       v17.4s, v3.4h, v7.4h[1]     //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
    sshl        v4.4s,v4.4s,v28.4s          //vshlq_s32(i4_tmp1_t1, tmp_shift_t)

    ld1         {v3.4h},[x8],x4             //load and increment the pi2_src_tmp1 iv iteration
    add         v6.4s,  v6.4s ,  v17.4s     //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration

    sqxtun      v4.4h, v4.4s                //vqmovun_s32(sto_res_tmp1)
    smull       v16.4s, v1.4h, v7.4h[1]     //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration

    add         v6.4s,  v6.4s ,  v30.4s     //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
    //mov v5, v4                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    add         v19.4s,  v19.4s ,  v16.4s   //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration

    sshl        v6.4s,v6.4s,v28.4s
    //vshl.s32    q5,q5,q14                    //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
    smull       v18.4s, v2.4h, v7.4h[0]     //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
    uqxtn       v4.8b,v4.8h                 //narrow u16 -> u8 (i iteration result)
    //vqmovn.u16    d4,q2                        //vqmovn_u16(sto_res_tmp3)
    add         v19.4s,  v19.4s ,  v30.4s   //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration

    sqxtun      v6.4h, v6.4s                //vqmovun_s32(sto_res_tmp1) ii iteration
    smull       v20.4s, v3.4h, v7.4h[1]     //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration

    sshl        v19.4s,v19.4s,v28.4s
    //vshl.s32    q7,q7,q14                    //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
    //mov v11, v10                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration

    add         v18.4s,  v18.4s ,  v20.4s   //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    sqxtun      v19.4h, v19.4s              //vqmovun_s32(sto_res_tmp1) iii iteration

    add         v18.4s,  v18.4s ,  v30.4s   //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    st1         {v4.s}[0],[x2],#4           //store pu1_dst i iteration

    uqxtn       v6.8b,v6.8h                 //narrow u16 -> u8 (ii iteration result)
    //vqmovn.u16    d10,q5                        //vqmovn_u16(sto_res_tmp3) ii iteration
    sshl        v18.4s,v18.4s,v28.4s
    //vshl.s32    q9,q9,q14                    //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
    st1         {v6.s}[0],[x10],x5          //store pu1_dst ii iteration


    //mov v15, v14                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
    uqxtn       v19.8b,v19.8h               //narrow u16 -> u8 (iii iteration result)
    //vqmovn.u16    d14,q7                        //vqmovn_u16(sto_res_tmp3) iii iteration
    sqxtun      v18.4h, v18.4s              //vqmovun_s32(sto_res_tmp1) iv iteration
    //mov v19, v18                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    st1         {v19.s}[0],[x10],x5         //store pu1_dst iii iteration
    uqxtn       v18.8b,v18.8h               //narrow u16 -> u8 (iv iteration result)
    //vqmovn.u16    d18,q9                        //vqmovn_u16(sto_res_tmp3) iv iteration
    subs        x7,x7,#4                    //decrement wd by 4 and check for 0
    st1         {v18.s}[0],[x10],x5         //store pu1_dst iv iteration

    bgt         core_loop                   //if greater than 0 repeat the core loop again

end_core_loop:
    // hop every pointer from the end of the finished 4-row strip to the
    // start of the next strip. x3/x4 hold source byte strides (one row
    // = x3 bytes), x9 = 2*wd bytes, x5 = dst byte stride, x7 is exhausted.
    sub         x20,x9,x3,lsl #2            //2*wd - four src1 rows, in bytes
    neg         x11, x20                    //x11 = four src1 rows minus strip width (bytes)
    subs        x14,x14,#4                  //decrement the ht by 4
    sub         x20,x9,x4,lsl #2            //2*wd - four src2 rows, in bytes
    neg         x12, x20                    //x12 = four src2 rows minus strip width (bytes)
    add         x0,x0,x11                   //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         x7,x9,#1                    //reload wd (= x9/2) for the next strip
    add         x1,x1,x12                   //pi2_src2 + 4*src_strd2 - 2*wd
    sub         x20,x7,x5,lsl #2            //wd - 4*dst_strd
    neg         x10, x20                    //x10 = 4*dst_strd - wd
    add         x2,x2,x10                   //pu1_dst + 4*dst_strd - wd
    bgt         core_loop                   //if ht > 0 (flags from the subs above) process the next strip

end_loops:
    // ldmfd sp!,{x4-x12,x15}              //reload the registers from sp
    // restore callee-saved registers in reverse push order and return
    ldp         x25, x26,[sp],#16
    ldp         x23, x24,[sp],#16
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

    ret