summary | refs | log | tree | commit | diff | stats
path: root/common/armv8/ih264_weighted_bi_pred_av8.s
diff options
context:
space:
mode:
author: Martin Storsjo <martin@martin.st>, 2015-06-10 12:05:14 +0300
committer: Marco Nelissen <marcone@google.com>, 2015-06-25 08:25:46 -0700
commit: 9f81a0a2024d1aa640e15085717a8164f770eba4 (patch)
tree: 4095089abfaace92958b0013b636f09cfbae3374 /common/armv8/ih264_weighted_bi_pred_av8.s
parent: 436fccb1641f9f25afff6cf20f9d4957c08f43cd (diff)
download: android_external_libavc-9f81a0a2024d1aa640e15085717a8164f770eba4.tar.gz
android_external_libavc-9f81a0a2024d1aa640e15085717a8164f770eba4.tar.bz2
android_external_libavc-9f81a0a2024d1aa640e15085717a8164f770eba4.zip
armv8: Remove redundant NEON element size declarations
When specifying one specific lane of the vector, the number of lanes doesn't need to be specified. The clang built-in assembler doesn't allow the redundant declarations, while binutils gas works fine with both forms.

Change-Id: I86077ce0774d4594a1295b6860e4944df87dde2f
Diffstat (limited to 'common/armv8/ih264_weighted_bi_pred_av8.s')
-rw-r--r-- common/armv8/ih264_weighted_bi_pred_av8.s 56
1 files changed, 28 insertions, 28 deletions
diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s
index 96ef50a..b039fba 100644
--- a/common/armv8/ih264_weighted_bi_pred_av8.s
+++ b/common/armv8/ih264_weighted_bi_pred_av8.s
@@ -173,10 +173,10 @@ loop_4: //each iteration processes four rows
ld1 {v10.s}[1], [x1], x4 //load row 4 in source 2
uxtl v8.8h, v8.8b //converting rows 3,4 in source 1 to 16-bit
uxtl v10.8h, v10.8b //converting rows 3,4 in source 2 to 16-bit
- mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for rows 1,2
- mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for rows 1,2
- mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for rows 3,4
- mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for rows 3,4
+ mul v4.8h, v4.8h , v2.h[0] //weight 1 mult. for rows 1,2
+ mla v4.8h, v6.8h , v2.h[2] //weight 2 mult. for rows 1,2
+ mul v8.8h, v8.8h , v2.h[0] //weight 1 mult. for rows 3,4
+ mla v8.8h, v10.8h , v2.h[2] //weight 2 mult. for rows 3,4
subs w11, w11, #4 //decrement ht by 4
srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from rows 1,2
srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from rows 3,4
@@ -205,18 +205,18 @@ loop_8: //each iteration processes four rows
ld1 {v18.8b}, [x1], x4 //load row 4 in source 2
uxtl v8.8h, v8.8b //converting row 2 in source 1 to 16-bit
uxtl v10.8h, v10.8b //converting row 2 in source 2 to 16-bit
- mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for row 1
- mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 1
+ mul v4.8h, v4.8h , v2.h[0] //weight 1 mult. for row 1
+ mla v4.8h, v6.8h , v2.h[2] //weight 2 mult. for row 1
uxtl v12.8h, v12.8b //converting row 3 in source 1 to 16-bit
uxtl v14.8h, v14.8b //converting row 3 in source 2 to 16-bit
- mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for row 2
- mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for row 2
+ mul v8.8h, v8.8h , v2.h[0] //weight 1 mult. for row 2
+ mla v8.8h, v10.8h , v2.h[2] //weight 2 mult. for row 2
uxtl v16.8h, v16.8b //converting row 4 in source 1 to 16-bit
uxtl v18.8h, v18.8b //converting row 4 in source 2 to 16-bit
- mul v12.8h, v12.8h , v2.4h[0] //weight 1 mult. for row 3
- mla v12.8h, v14.8h , v2.4h[2] //weight 2 mult. for row 3
- mul v16.8h, v16.8h , v2.4h[0] //weight 1 mult. for row 4
- mla v16.8h, v18.8h , v2.4h[2] //weight 2 mult. for row 4
+ mul v12.8h, v12.8h , v2.h[0] //weight 1 mult. for row 3
+ mla v12.8h, v14.8h , v2.h[2] //weight 2 mult. for row 3
+ mul v16.8h, v16.8h , v2.h[0] //weight 1 mult. for row 4
+ mla v16.8h, v18.8h , v2.h[2] //weight 2 mult. for row 4
srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1
srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 2
srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 3
@@ -251,35 +251,35 @@ loop_16: //each iteration processes two rows
ld1 {v18.8b, v19.8b}, [x1], x4 //load row 4 in source 2
uxtl v4.8h, v5.8b //converting row 1H in source 1 to 16-bit
uxtl v6.8h, v7.8b //converting row 1H in source 2 to 16-bit
- mul v20.8h, v20.8h , v2.4h[0] //weight 1 mult. for row 1L
- mla v20.8h, v22.8h , v2.4h[2] //weight 2 mult. for row 1L
+ mul v20.8h, v20.8h , v2.h[0] //weight 1 mult. for row 1L
+ mla v20.8h, v22.8h , v2.h[2] //weight 2 mult. for row 1L
uxtl v24.8h, v8.8b //converting row 2L in source 1 to 16-bit
uxtl v26.8h, v10.8b //converting row 2L in source 2 to 16-bit
- mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for row 1H
- mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 1H
+ mul v4.8h, v4.8h , v2.h[0] //weight 1 mult. for row 1H
+ mla v4.8h, v6.8h , v2.h[2] //weight 2 mult. for row 1H
uxtl v8.8h, v9.8b //converting row 2H in source 1 to 16-bit
uxtl v10.8h, v11.8b //converting row 2H in source 2 to 16-bit
- mul v24.8h, v24.8h , v2.4h[0] //weight 1 mult. for row 2L
- mla v24.8h, v26.8h , v2.4h[2] //weight 2 mult. for row 2L
+ mul v24.8h, v24.8h , v2.h[0] //weight 1 mult. for row 2L
+ mla v24.8h, v26.8h , v2.h[2] //weight 2 mult. for row 2L
uxtl v28.8h, v12.8b //converting row 3L in source 1 to 16-bit
uxtl v30.8h, v14.8b //converting row 3L in source 2 to 16-bit
- mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for row 2H
- mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for row 2H
+ mul v8.8h, v8.8h , v2.h[0] //weight 1 mult. for row 2H
+ mla v8.8h, v10.8h , v2.h[2] //weight 2 mult. for row 2H
uxtl v12.8h, v13.8b //converting row 3H in source 1 to 16-bit
uxtl v14.8h, v15.8b //converting row 3H in source 2 to 16-bit
- mul v28.8h, v28.8h , v2.4h[0] //weight 1 mult. for row 3L
- mla v28.8h, v30.8h , v2.4h[2] //weight 2 mult. for row 3L
+ mul v28.8h, v28.8h , v2.h[0] //weight 1 mult. for row 3L
+ mla v28.8h, v30.8h , v2.h[2] //weight 2 mult. for row 3L
uxtl v22.8h, v16.8b //converting row 4L in source 1 to 16-bit
uxtl v6.8h, v18.8b //converting row 4L in source 2 to 16-bit
- mul v12.8h, v12.8h , v2.4h[0] //weight 1 mult. for row 3H
- mla v12.8h, v14.8h , v2.4h[2] //weight 2 mult. for row 3H
+ mul v12.8h, v12.8h , v2.h[0] //weight 1 mult. for row 3H
+ mla v12.8h, v14.8h , v2.h[2] //weight 2 mult. for row 3H
uxtl v16.8h, v17.8b //converting row 4H in source 1 to 16-bit
uxtl v18.8h, v19.8b //converting row 4H in source 2 to 16-bit
- mul v22.8h, v22.8h , v2.4h[0] //weight 1 mult. for row 4L
- mla v22.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 4L
+ mul v22.8h, v22.8h , v2.h[0] //weight 1 mult. for row 4L
+ mla v22.8h, v6.8h , v2.h[2] //weight 2 mult. for row 4L
srshl v20.8h, v20.8h , v0.8h //rounds off the weighted samples from row 1L
- mul v16.8h, v16.8h , v2.4h[0] //weight 1 mult. for row 4H
- mla v16.8h, v18.8h , v2.4h[2] //weight 2 mult. for row 4H
+ mul v16.8h, v16.8h , v2.h[0] //weight 1 mult. for row 4H
+ mla v16.8h, v18.8h , v2.h[2] //weight 2 mult. for row 4H
srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1H
srshl v24.8h, v24.8h , v0.8h //rounds off the weighted samples from row 2L
saddw v20.8h, v20.8h , v3.8b //adding offset for row 1L