diff options
author | Martin Storsjo <martin@martin.st> | 2015-06-10 12:05:14 +0300 |
---|---|---|
committer | Marco Nelissen <marcone@google.com> | 2015-06-25 08:25:46 -0700 |
commit | 9f81a0a2024d1aa640e15085717a8164f770eba4 (patch) | |
tree | 4095089abfaace92958b0013b636f09cfbae3374 /common | |
parent | 436fccb1641f9f25afff6cf20f9d4957c08f43cd (diff) | |
download | android_external_libavc-9f81a0a2024d1aa640e15085717a8164f770eba4.tar.gz android_external_libavc-9f81a0a2024d1aa640e15085717a8164f770eba4.tar.bz2 android_external_libavc-9f81a0a2024d1aa640e15085717a8164f770eba4.zip |
armv8: Remove redundant NEON element size declarations
When specifying one specific lane of the vector, the number of
lanes doesn't need to be specified.
The clang built-in assembler doesn't allow the redundant
declarations, while binutils gas works fine with both forms.
Change-Id: I86077ce0774d4594a1295b6860e4944df87dde2f
Diffstat (limited to 'common')
-rw-r--r-- | common/armv8/ih264_deblk_chroma_av8.s | 2 | ||||
-rw-r--r-- | common/armv8/ih264_deblk_luma_av8.s | 8 | ||||
-rw-r--r-- | common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s | 24 | ||||
-rw-r--r-- | common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s | 28 | ||||
-rw-r--r-- | common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s | 24 | ||||
-rw-r--r-- | common/armv8/ih264_intra_pred_chroma_av8.s | 6 | ||||
-rw-r--r-- | common/armv8/ih264_intra_pred_luma_16x16_av8.s | 2 | ||||
-rw-r--r-- | common/armv8/ih264_intra_pred_luma_8x8_av8.s | 6 | ||||
-rw-r--r-- | common/armv8/ih264_resi_trans_quant_av8.s | 2 | ||||
-rw-r--r-- | common/armv8/ih264_weighted_bi_pred_av8.s | 56 | ||||
-rw-r--r-- | common/armv8/ih264_weighted_pred_av8.s | 28 |
11 files changed, 93 insertions, 93 deletions
diff --git a/common/armv8/ih264_deblk_chroma_av8.s b/common/armv8/ih264_deblk_chroma_av8.s index 3021556..a4dbd23 100644 --- a/common/armv8/ih264_deblk_chroma_av8.s +++ b/common/armv8/ih264_deblk_chroma_av8.s @@ -337,7 +337,7 @@ ih264_deblk_chroma_horz_bslt4_av8: ldr x9, [sp, #80] sub x0, x0, x1, lsl #1 //x0 = uc_edgePixelU pointing to p1 of chroma U rev w7, w7 // - mov v12.2s[0], w7 //D12[0] = ui_Bs + mov v12.s[0], w7 //D12[0] = ui_Bs ld1 {v16.s}[0], [x8] //D16[0] contains cliptab_cb ld1 {v17.s}[0], [x9] //D17[0] contains cliptab_cr ld2 {v6.8b, v7.8b}, [x0], x1 //Q3=p1 diff --git a/common/armv8/ih264_deblk_luma_av8.s b/common/armv8/ih264_deblk_luma_av8.s index bcdb03f..1b3950d 100644 --- a/common/armv8/ih264_deblk_luma_av8.s +++ b/common/armv8/ih264_deblk_luma_av8.s @@ -97,7 +97,7 @@ ih264_deblk_luma_horz_bslt4_av8: sub x0, x0, x1 //x0 pointer to p2 rev w4, w4 // ld1 {v10.8b, v11.8b}, [x0], x1 //p2 values are loaded into q5 - mov v12.2s[0], w4 //d12[0] = ui_Bs + mov v12.s[0], w4 //d12[0] = ui_Bs mov x6, x0 //keeping backup of pointer to p1 ld1 {v8.8b, v9.8b}, [x0], x1 //p1 values are loaded into q4 mov x7, x0 //keeping backup of pointer to p0 @@ -364,8 +364,8 @@ ih264_deblk_luma_horz_bs4_av8: mov v26.d[1] , v27.d[0] mov v2.d[1] , v3.d[0] uaddl v16.8h, v31.8b, v25.8b //p2+p3 H - mla v12.8h, v8.8h , v1.4h[0] //(p0+q0+p1)+3*p2+2*p3 L - mla v4.8h, v16.8h , v1.4h[0] //(p0+q0+p1)+3*p2+2*p3 H + mla v12.8h, v8.8h , v1.h[0] //(p0+q0+p1)+3*p2+2*p3 L + mla v4.8h, v16.8h , v1.h[0] //(p0+q0+p1)+3*p2+2*p3 H bic v16.16b, v20.16b , v18.16b //((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)) mov v17.d[0] , v16.d[1] //&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) bit v2.16b, v28.16b , v20.16b //choosing between po' and p0" @@ -443,7 +443,7 @@ ih264_deblk_luma_vert_bslt4_av8: ld1 {v4.8b}, [x0], x1 //row3 rev w12, w12 //reversing ui_bs ld1 {v6.8b}, [x0], x1 //row4 - mov v18.2s[0], w12 //d12[0] = ui_Bs + mov v18.s[0], w12 //d12[0] = ui_Bs ld1 {v16.s}[0], 
[x14] //D16[0] contains cliptab ld1 {v8.8b}, [x0], x1 //row5 uxtl v18.8h, v18.8b //q6 = uc_Bs in each 16 bt scalar diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s index 202c516..d2897b6 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s @@ -146,7 +146,7 @@ loop_16: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ext v24.16b, v20.16b , v22.16b , #4 ext v26.16b, v20.16b , v22.16b , #6 @@ -174,7 +174,7 @@ loop_16: uqxtn v19.8b, v19.8h uqxtn v25.8b, v25.8h - mov v19.2s[1], v25.2s[0] + mov v19.s[1], v25.s[0] uaddl v22.8h, v4.8b, v10.8b ld1 {v0.2s, v1.2s}, [x0], #16 // Vector load from src[6_0] @@ -228,7 +228,7 @@ loop_16: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ext v24.16b, v20.16b , v22.16b , #4 ext v26.16b, v20.16b , v22.16b , #6 @@ -253,7 +253,7 @@ loop_16: uqxtn v19.8b, v19.8h uqxtn v25.8b, v25.8h - mov v19.2s[1], v25.2s[0] + mov v19.s[1], v25.s[0] uaddl v22.8h, v6.8b, v0.8b @@ -306,7 +306,7 @@ loop_16: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ext v24.16b, v20.16b , v22.16b , #4 @@ -334,7 +334,7 @@ loop_16: uqxtn v19.8b, v19.8h uqxtn v25.8b, v25.8h - mov v19.2s[1], v25.2s[0] + mov v19.s[1], v25.s[0] @@ -387,7 +387,7 @@ loop_16: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ext v24.16b, v20.16b , v22.16b , #4 @@ -427,7 +427,7 @@ loop_16: uqxtn v19.8b, v19.8h uqxtn v25.8b, v25.8h - mov v19.2s[1], v25.2s[0] + mov v19.s[1], v25.s[0] @@ -501,7 +501,7 @@ loop_8: ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[7_0] uqxtn v25.8b, v12.8h uqxtn v13.8b, v13.8h - mov v25.2s[1], v13.2s[0] + mov v25.s[1], v13.s[0] uaddl v16.8h, v8.8b, v10.8b @@ -535,7 +535,7 @@ loop_8: uaddl v28.8h, v9.8b, v11.8b uqxtn 
v13.8b, v16.8h uqxtn v17.8b, v17.8h - mov v13.2s[1], v17.2s[0] + mov v13.s[1], v17.s[0] uaddl v14.8h, v5.8b, v3.8b @@ -576,7 +576,7 @@ loop_8: mls v16.8h, v30.8h , v24.8h uqxtn v27.8b, v12.8h uqxtn v13.8b, v13.8h - mov v27.2s[1], v13.2s[0] + mov v27.s[1], v13.s[0] ext v22.16b, v28.16b , v16.16b , #10 @@ -616,7 +616,7 @@ loop_8: subs x4, x4, #4 uqxtn v13.8b, v16.8h uqxtn v17.8b, v17.8h - mov v13.2s[1], v17.2s[0] + mov v13.s[1], v17.s[0] mov v0.16b, v8.16b diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s index 38f971b..546c807 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s @@ -275,7 +275,7 @@ loop_16_lowhalf: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ld1 {v0.2s, v1.2s}, [x0], x2 // row 5 load for horizontal filter @@ -313,7 +313,7 @@ loop_16_lowhalf: uaddl v2.8h, v1.8b, v4.8b uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] add v30.8h, v14.8h , v16.8h mls v8.8h, v2.8h , v24.8h ld1 {v0.2s, v1.2s}, [x0], x2 // row 6 load for horizontal filter @@ -355,7 +355,7 @@ loop_16_lowhalf: mls v28.8h, v2.8h , v24.8h uqxtn v27.8b, v18.8h uqxtn v19.8b, v19.8h - mov v27.2s[1], v19.2s[0] + mov v27.s[1], v19.s[0] saddl v18.4s, v12.4h, v28.4h saddl2 v6.4s, v12.8h, v28.8h @@ -384,7 +384,7 @@ loop_16_lowhalf: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] mov v12.16b, v8.16b mov v13.16b, v9.16b @@ -523,7 +523,7 @@ loop_16_highhalf: mls v20.8h, v2.8h , v24.8h uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ld1 {v0.2s, v1.2s}, [x8], x2 urhadd v26.8b, v18.8b , v26.8b @@ -558,7 +558,7 @@ loop_16_highhalf: uaddl v2.8h, v1.8b, v4.8b uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] add 
v30.8h, v14.8h , v16.8h mls v8.8h, v2.8h , v24.8h ld1 {v0.2s, v1.2s}, [x8], x2 @@ -598,7 +598,7 @@ loop_16_highhalf: mls v28.8h, v2.8h , v24.8h uqxtn v27.8b, v18.8h uqxtn v19.8b, v19.8h - mov v27.2s[1], v19.2s[0] + mov v27.s[1], v19.s[0] saddl v18.4s, v12.4h, v28.4h @@ -627,7 +627,7 @@ loop_16_highhalf: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] mov v12.16b, v8.16b mov v13.16b, v9.16b @@ -768,7 +768,7 @@ loop_8: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ld1 {v0.2s, v1.2s}, [x0], x2 // row 5 load for horizontal filter @@ -812,7 +812,7 @@ loop_8: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] add v30.8h, v14.8h , v16.8h mls v8.8h, v2.8h , v24.8h @@ -855,7 +855,7 @@ loop_8: uqxtn v27.8b, v18.8h uqxtn v19.8b, v19.8h - mov v27.2s[1], v19.2s[0] + mov v27.s[1], v19.s[0] saddl v18.4s, v12.4h, v28.4h saddl2 v6.4s, v12.8h, v28.8h @@ -885,7 +885,7 @@ loop_8: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] mov v12.16b, v8.16b @@ -1024,7 +1024,7 @@ loop_4: sqrshrun v9.8b, v6.8h, #5 sqrshrun v7.8b, v7.8h, #5 - mov v9.2s[1], v7.2s[0] + mov v9.s[1], v7.s[0] ext v20.8b, v18.8b , v19.8b , #2 @@ -1089,7 +1089,7 @@ loop_4: sqrshrun v10.8b, v8.8h, #5 sqrshrun v9.8b, v9.8h, #5 - mov v10.2s[1], v9.2s[0] + mov v10.s[1], v9.s[0] mov v12.8b, v28.8b diff --git a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s index b1e4866..3f3e297 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_qpel_vert_hpel_av8.s @@ -209,7 +209,7 @@ loop_16: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ext v24.16b, v20.16b , v22.16b , #4 ext v26.16b, v20.16b , v22.16b , #6 @@ -238,7 +238,7 @@ loop_16: uqxtn v19.8b, v19.8h uqxtn v18.8b, 
v18.8h - mov v19.2s[1], v18.2s[0] + mov v19.s[1], v18.s[0] ld1 {v18.2s}, [x1] sqrshrun v20.8b, v20.8h, #5 @@ -297,7 +297,7 @@ loop_16: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ext v24.16b, v20.16b , v22.16b , #4 ext v26.16b, v20.16b , v22.16b , #6 @@ -323,7 +323,7 @@ loop_16: ld1 {v22.4s}, [x6], x7 uqxtn v19.8b, v19.8h uqxtn v18.8b, v18.8h - mov v19.2s[1], v18.2s[0] + mov v19.s[1], v18.s[0] ld1 {v18.4s}, [x1] sqrshrun v20.8b, v20.8h, #5 sqrshrun v21.8b, v22.8h, #5 @@ -380,7 +380,7 @@ loop_16: uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ext v24.16b, v20.16b , v22.16b , #4 @@ -409,7 +409,7 @@ loop_16: uqxtn v19.8b, v19.8h uqxtn v18.8b, v18.8h - mov v19.2s[1], v18.2s[0] + mov v19.s[1], v18.s[0] ld1 {v18.2s}, [x1] sqrshrun v20.8b, v20.8h, #5 @@ -466,7 +466,7 @@ loop_16: ld1 {v22.4s}, [x9], #16 uqxtn v18.8b, v18.8h uqxtn v19.8b, v19.8h - mov v18.2s[1], v19.2s[0] + mov v18.s[1], v19.s[0] ext v24.16b, v20.16b , v22.16b , #4 @@ -506,7 +506,7 @@ loop_16: uqxtn v19.8b, v19.8h uqxtn v18.8b, v18.8h - mov v19.2s[1], v18.2s[0] + mov v19.s[1], v18.s[0] ld1 {v20.4s}, [x6], #16 ld1 {v22.4s}, [x6], x7 @@ -586,7 +586,7 @@ loop_8: ld1 {v2.2s, v3.2s}, [x0], x2 // Vector load from src[7_0] uqxtn v25.8b, v12.8h uqxtn v13.8b, v13.8h - mov v25.2s[1], v13.2s[0] + mov v25.s[1], v13.s[0] uaddl v16.8h, v8.8b, v10.8b @@ -620,7 +620,7 @@ loop_8: uaddl v28.8h, v9.8b, v11.8b uqxtn v13.8b, v16.8h uqxtn v17.8b, v17.8h - mov v13.2s[1], v17.2s[0] + mov v13.s[1], v17.s[0] urhadd v12.16b, v12.16b , v14.16b urhadd v13.16b, v13.16b , v15.16b @@ -662,7 +662,7 @@ loop_8: mls v16.8h, v30.8h , v24.8h uqxtn v27.8b, v12.8h uqxtn v13.8b, v13.8h - mov v27.2s[1], v13.2s[0] + mov v27.s[1], v13.s[0] sqrshrun v14.8b, v14.8h, #5 ext v22.16b, v28.16b , v16.16b , #10 @@ -702,7 +702,7 @@ loop_8: subs x4, x4, #4 uqxtn v13.8b, v16.8h uqxtn v17.8b, v17.8h - mov v13.2s[1], v17.2s[0] + mov v13.s[1], v17.s[0] urhadd v12.16b, 
v12.16b , v14.16b urhadd v13.16b, v13.16b , v15.16b diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s index 2c5efb3..8f0f282 100644 --- a/common/armv8/ih264_intra_pred_chroma_av8.s +++ b/common/armv8/ih264_intra_pred_chroma_av8.s @@ -501,7 +501,7 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8: add v16.8h, v0.8h , v16.8h dup v20.8h, v22.h[0] mul v4.8h, v6.8h , v20.8h - dup v30.8h, v22.4h[1] + dup v30.8h, v22.h[1] mul v18.8h, v6.8h , v20.8h mul v14.8h, v6.8h , v30.8h mul v8.8h, v6.8h , v30.8h @@ -511,7 +511,7 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8: sqrshrun v28.8b, v24.8h, #5 add v26.8h, v16.8h , v8.8h sqrshrun v29.8b, v0.8h, #5 - dup v20.8h, v22.4h[2] + dup v20.8h, v22.h[2] st1 {v28.8b, v29.8b}, [x1], x3 sqrshrun v28.8b, v2.8h, #5 sqrshrun v29.8b, v26.8h, #5 @@ -520,7 +520,7 @@ ih264_intra_pred_chroma_8x8_mode_plane_av8: st1 {v28.8b, v29.8b}, [x1], x3 add v24.8h, v12.8h , v4.8h add v0.8h, v16.8h , v18.8h - dup v30.8h, v22.4h[3] + dup v30.8h, v22.h[3] sqrshrun v28.8b, v24.8h, #5 sqrshrun v29.8b, v0.8h, #5 mul v14.8h, v6.8h , v30.8h diff --git a/common/armv8/ih264_intra_pred_luma_16x16_av8.s b/common/armv8/ih264_intra_pred_luma_16x16_av8.s index a9eb165..c1847b5 100644 --- a/common/armv8/ih264_intra_pred_luma_16x16_av8.s +++ b/common/armv8/ih264_intra_pred_luma_16x16_av8.s @@ -467,7 +467,7 @@ ih264_intra_pred_luma_16x16_mode_plane_av8: ldrb w5, [x7], #-1 sxtw x5, w5 add x8, x8, x8, lsl #1 - dup v4.8h, v0.4h[0] + dup v4.8h, v0.h[0] add x12, x12, x8 ldrb w9, [x0], #1 sxtw x9, w9 diff --git a/common/armv8/ih264_intra_pred_luma_8x8_av8.s b/common/armv8/ih264_intra_pred_luma_8x8_av8.s index 2b972ca..bf9a4c1 100644 --- a/common/armv8/ih264_intra_pred_luma_8x8_av8.s +++ b/common/armv8/ih264_intra_pred_luma_8x8_av8.s @@ -337,7 +337,7 @@ ih264_intra_pred_luma_8x8_mode_dc_av8: uaddlp v3.2s, v1.4h uaddlp v2.1d, v3.2s dup v10.8h, w5 - dup v8.8h, v2.4h[0] + dup v8.8h, v2.h[0] add v12.8h, v8.8h , v10.8h sqrshrun v31.8b, 
v12.8h, #4 st1 {v31.8b}, [x1], x3 @@ -360,7 +360,7 @@ top_available: // ONLT TOP AVAILABLE uaddlp v13.2s, v14.4h uaddlp v12.1d, v13.2s rshrn v4.8b, v12.8h, #3 - dup v31.8b, v4.8b[0] + dup v31.8b, v4.b[0] st1 {v31.8b}, [x1], x3 st1 {v31.8b}, [x1], x3 st1 {v31.8b}, [x1], x3 @@ -1059,7 +1059,7 @@ ih264_intra_pred_luma_8x8_mode_horz_u_av8: mov v30.16b, v4.16b mov v31.16b, v6.16b tbl v12.8b, {v30.16b, v31.16b}, v10.8b - dup v14.16b, v5.8b[7] // + dup v14.16b, v5.b[7] // tbl v13.8b, {v30.16b, v31.16b}, v11.8b mov v12.d[1], v13.d[0] ext v16.16b, v12.16b , v14.16b , #2 diff --git a/common/armv8/ih264_resi_trans_quant_av8.s b/common/armv8/ih264_resi_trans_quant_av8.s index dc1c680..316c220 100644 --- a/common/armv8/ih264_resi_trans_quant_av8.s +++ b/common/armv8/ih264_resi_trans_quant_av8.s @@ -665,7 +665,7 @@ ih264_hadamard_quant_2x2_uv_av8: ld2 {v0.4h-v1.4h}, [x0] //load src ld1 {v30.h}[0], [x2] //load pu2_scale_matrix[0] - dup v30.4h, v30.4h[0] //pu2_scale_matrix + dup v30.4h, v30.h[0] //pu2_scale_matrix uxtl v30.4s, v30.4h //pu2_scale_matrix neg w4, w4 diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s index 96ef50a..b039fba 100644 --- a/common/armv8/ih264_weighted_bi_pred_av8.s +++ b/common/armv8/ih264_weighted_bi_pred_av8.s @@ -173,10 +173,10 @@ loop_4: //each iteration processes four rows ld1 {v10.s}[1], [x1], x4 //load row 4 in source 2 uxtl v8.8h, v8.8b //converting rows 3,4 in source 1 to 16-bit uxtl v10.8h, v10.8b //converting rows 3,4 in source 2 to 16-bit - mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for rows 1,2 - mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for rows 1,2 - mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for rows 3,4 - mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for rows 3,4 + mul v4.8h, v4.8h , v2.h[0] //weight 1 mult. for rows 1,2 + mla v4.8h, v6.8h , v2.h[2] //weight 2 mult. for rows 1,2 + mul v8.8h, v8.8h , v2.h[0] //weight 1 mult. for rows 3,4 + mla v8.8h, v10.8h , v2.h[2] //weight 2 mult. 
for rows 3,4 subs w11, w11, #4 //decrement ht by 4 srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from rows 1,2 srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from rows 3,4 @@ -205,18 +205,18 @@ loop_8: //each iteration processes four rows ld1 {v18.8b}, [x1], x4 //load row 4 in source 2 uxtl v8.8h, v8.8b //converting row 2 in source 1 to 16-bit uxtl v10.8h, v10.8b //converting row 2 in source 2 to 16-bit - mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for row 1 - mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 1 + mul v4.8h, v4.8h , v2.h[0] //weight 1 mult. for row 1 + mla v4.8h, v6.8h , v2.h[2] //weight 2 mult. for row 1 uxtl v12.8h, v12.8b //converting row 3 in source 1 to 16-bit uxtl v14.8h, v14.8b //converting row 3 in source 2 to 16-bit - mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for row 2 - mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for row 2 + mul v8.8h, v8.8h , v2.h[0] //weight 1 mult. for row 2 + mla v8.8h, v10.8h , v2.h[2] //weight 2 mult. for row 2 uxtl v16.8h, v16.8b //converting row 4 in source 1 to 16-bit uxtl v18.8h, v18.8b //converting row 4 in source 2 to 16-bit - mul v12.8h, v12.8h , v2.4h[0] //weight 1 mult. for row 3 - mla v12.8h, v14.8h , v2.4h[2] //weight 2 mult. for row 3 - mul v16.8h, v16.8h , v2.4h[0] //weight 1 mult. for row 4 - mla v16.8h, v18.8h , v2.4h[2] //weight 2 mult. for row 4 + mul v12.8h, v12.8h , v2.h[0] //weight 1 mult. for row 3 + mla v12.8h, v14.8h , v2.h[2] //weight 2 mult. for row 3 + mul v16.8h, v16.8h , v2.h[0] //weight 1 mult. for row 4 + mla v16.8h, v18.8h , v2.h[2] //weight 2 mult. 
for row 4 srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1 srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 2 srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 3 @@ -251,35 +251,35 @@ loop_16: //each iteration processes two rows ld1 {v18.8b, v19.8b}, [x1], x4 //load row 4 in source 2 uxtl v4.8h, v5.8b //converting row 1H in source 1 to 16-bit uxtl v6.8h, v7.8b //converting row 1H in source 2 to 16-bit - mul v20.8h, v20.8h , v2.4h[0] //weight 1 mult. for row 1L - mla v20.8h, v22.8h , v2.4h[2] //weight 2 mult. for row 1L + mul v20.8h, v20.8h , v2.h[0] //weight 1 mult. for row 1L + mla v20.8h, v22.8h , v2.h[2] //weight 2 mult. for row 1L uxtl v24.8h, v8.8b //converting row 2L in source 1 to 16-bit uxtl v26.8h, v10.8b //converting row 2L in source 2 to 16-bit - mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for row 1H - mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 1H + mul v4.8h, v4.8h , v2.h[0] //weight 1 mult. for row 1H + mla v4.8h, v6.8h , v2.h[2] //weight 2 mult. for row 1H uxtl v8.8h, v9.8b //converting row 2H in source 1 to 16-bit uxtl v10.8h, v11.8b //converting row 2H in source 2 to 16-bit - mul v24.8h, v24.8h , v2.4h[0] //weight 1 mult. for row 2L - mla v24.8h, v26.8h , v2.4h[2] //weight 2 mult. for row 2L + mul v24.8h, v24.8h , v2.h[0] //weight 1 mult. for row 2L + mla v24.8h, v26.8h , v2.h[2] //weight 2 mult. for row 2L uxtl v28.8h, v12.8b //converting row 3L in source 1 to 16-bit uxtl v30.8h, v14.8b //converting row 3L in source 2 to 16-bit - mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for row 2H - mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for row 2H + mul v8.8h, v8.8h , v2.h[0] //weight 1 mult. for row 2H + mla v8.8h, v10.8h , v2.h[2] //weight 2 mult. for row 2H uxtl v12.8h, v13.8b //converting row 3H in source 1 to 16-bit uxtl v14.8h, v15.8b //converting row 3H in source 2 to 16-bit - mul v28.8h, v28.8h , v2.4h[0] //weight 1 mult. 
for row 3L - mla v28.8h, v30.8h , v2.4h[2] //weight 2 mult. for row 3L + mul v28.8h, v28.8h , v2.h[0] //weight 1 mult. for row 3L + mla v28.8h, v30.8h , v2.h[2] //weight 2 mult. for row 3L uxtl v22.8h, v16.8b //converting row 4L in source 1 to 16-bit uxtl v6.8h, v18.8b //converting row 4L in source 2 to 16-bit - mul v12.8h, v12.8h , v2.4h[0] //weight 1 mult. for row 3H - mla v12.8h, v14.8h , v2.4h[2] //weight 2 mult. for row 3H + mul v12.8h, v12.8h , v2.h[0] //weight 1 mult. for row 3H + mla v12.8h, v14.8h , v2.h[2] //weight 2 mult. for row 3H uxtl v16.8h, v17.8b //converting row 4H in source 1 to 16-bit uxtl v18.8h, v19.8b //converting row 4H in source 2 to 16-bit - mul v22.8h, v22.8h , v2.4h[0] //weight 1 mult. for row 4L - mla v22.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 4L + mul v22.8h, v22.8h , v2.h[0] //weight 1 mult. for row 4L + mla v22.8h, v6.8h , v2.h[2] //weight 2 mult. for row 4L srshl v20.8h, v20.8h , v0.8h //rounds off the weighted samples from row 1L - mul v16.8h, v16.8h , v2.4h[0] //weight 1 mult. for row 4H - mla v16.8h, v18.8h , v2.4h[2] //weight 2 mult. for row 4H + mul v16.8h, v16.8h , v2.h[0] //weight 1 mult. for row 4H + mla v16.8h, v18.8h , v2.h[2] //weight 2 mult. for row 4H srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1H srshl v24.8h, v24.8h , v0.8h //rounds off the weighted samples from row 2L saddw v20.8h, v20.8h , v3.8b //adding offset for row 1L diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s index ec5bb7a..69ed3b0 100644 --- a/common/armv8/ih264_weighted_pred_av8.s +++ b/common/armv8/ih264_weighted_pred_av8.s @@ -143,8 +143,8 @@ loop_4: //each iteration processes four rows uxtl v4.8h, v4.8b //converting rows 1,2 to 16-bit uxtl v6.8h, v6.8b //converting rows 3,4 to 16-bit - mul v4.8h, v4.8h , v2.4h[0] //weight mult. for rows 1,2 - mul v6.8h, v6.8h , v2.4h[0] //weight mult. for rows 3,4 + mul v4.8h, v4.8h , v2.h[0] //weight mult. 
for rows 1,2 + mul v6.8h, v6.8h , v2.h[0] //weight mult. for rows 3,4 subs w7, w7, #4 //decrement ht by 4 srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from rows 1,2 @@ -175,11 +175,11 @@ loop_8: //each iteration processes four rows uxtl v6.8h, v6.8b //converting row 2 to 16-bit uxtl v8.8h, v8.8b //converting row 3 to 16-bit - mul v4.8h, v4.8h , v2.4h[0] //weight mult. for row 1 + mul v4.8h, v4.8h , v2.h[0] //weight mult. for row 1 uxtl v10.8h, v10.8b //converting row 4 to 16-bit - mul v6.8h, v6.8h , v2.4h[0] //weight mult. for row 2 - mul v8.8h, v8.8h , v2.4h[0] //weight mult. for row 3 - mul v10.8h, v10.8h , v2.4h[0] //weight mult. for row 4 + mul v6.8h, v6.8h , v2.h[0] //weight mult. for row 2 + mul v8.8h, v8.8h , v2.h[0] //weight mult. for row 3 + mul v10.8h, v10.8h , v2.h[0] //weight mult. for row 4 srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1 srshl v6.8h, v6.8h , v0.8h //rounds off the weighted samples from row 2 @@ -214,20 +214,20 @@ loop_16: //each iteration processes two rows uxtl v14.8h, v5.8b //converting row 1H to 16-bit ld1 {v10.8b, v11.8b}, [x0], x2 //load row 4 in source uxtl v16.8h, v6.8b //converting row 2L to 16-bit - mul v12.8h, v12.8h , v2.4h[0] //weight mult. for row 1L + mul v12.8h, v12.8h , v2.h[0] //weight mult. for row 1L uxtl v18.8h, v7.8b //converting row 2H to 16-bit - mul v14.8h, v14.8h , v2.4h[0] //weight mult. for row 1H + mul v14.8h, v14.8h , v2.h[0] //weight mult. for row 1H uxtl v20.8h, v8.8b //converting row 3L to 16-bit - mul v16.8h, v16.8h , v2.4h[0] //weight mult. for row 2L + mul v16.8h, v16.8h , v2.h[0] //weight mult. for row 2L uxtl v22.8h, v9.8b //converting row 3H to 16-bit - mul v18.8h, v18.8h , v2.4h[0] //weight mult. for row 2H + mul v18.8h, v18.8h , v2.h[0] //weight mult. for row 2H uxtl v24.8h, v10.8b //converting row 4L to 16-bit - mul v20.8h, v20.8h , v2.4h[0] //weight mult. for row 3L + mul v20.8h, v20.8h , v2.h[0] //weight mult. 
for row 3L uxtl v26.8h, v11.8b //converting row 4H to 16-bit - mul v22.8h, v22.8h , v2.4h[0] //weight mult. for row 3H - mul v24.8h, v24.8h , v2.4h[0] //weight mult. for row 4L + mul v22.8h, v22.8h , v2.h[0] //weight mult. for row 3H + mul v24.8h, v24.8h , v2.h[0] //weight mult. for row 4L srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 1L - mul v26.8h, v26.8h , v2.4h[0] //weight mult. for row 4H + mul v26.8h, v26.8h , v2.h[0] //weight mult. for row 4H srshl v14.8h, v14.8h , v0.8h //rounds off the weighted samples from row 1H srshl v16.8h, v16.8h , v0.8h //rounds off the weighted samples from row 2L saddw v12.8h, v12.8h , v3.8b //adding offset for row 1L |