diff options
author | Martin Storsjo <martin@martin.st> | 2015-06-10 12:05:14 +0300 |
---|---|---|
committer | Marco Nelissen <marcone@google.com> | 2015-06-25 08:25:46 -0700 |
commit | 9f81a0a2024d1aa640e15085717a8164f770eba4 (patch) | |
tree | 4095089abfaace92958b0013b636f09cfbae3374 /common/armv8/ih264_weighted_bi_pred_av8.s | |
parent | 436fccb1641f9f25afff6cf20f9d4957c08f43cd (diff) | |
download | android_external_libavc-9f81a0a2024d1aa640e15085717a8164f770eba4.tar.gz android_external_libavc-9f81a0a2024d1aa640e15085717a8164f770eba4.tar.bz2 android_external_libavc-9f81a0a2024d1aa640e15085717a8164f770eba4.zip |
armv8: Remove redundant NEON element size declarations
When specifying one specific lane of the vector, the number of
lanes doesn't need to be specified.
The clang built-in assembler doesn't allow the redundant
declarations, while binutils gas works fine with both forms.
Change-Id: I86077ce0774d4594a1295b6860e4944df87dde2f
Diffstat (limited to 'common/armv8/ih264_weighted_bi_pred_av8.s')
-rw-r--r-- | common/armv8/ih264_weighted_bi_pred_av8.s | 56 |
1 files changed, 28 insertions, 28 deletions
diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s index 96ef50a..b039fba 100644 --- a/common/armv8/ih264_weighted_bi_pred_av8.s +++ b/common/armv8/ih264_weighted_bi_pred_av8.s @@ -173,10 +173,10 @@ loop_4: //each iteration processes four rows ld1 {v10.s}[1], [x1], x4 //load row 4 in source 2 uxtl v8.8h, v8.8b //converting rows 3,4 in source 1 to 16-bit uxtl v10.8h, v10.8b //converting rows 3,4 in source 2 to 16-bit - mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for rows 1,2 - mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for rows 1,2 - mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for rows 3,4 - mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for rows 3,4 + mul v4.8h, v4.8h , v2.h[0] //weight 1 mult. for rows 1,2 + mla v4.8h, v6.8h , v2.h[2] //weight 2 mult. for rows 1,2 + mul v8.8h, v8.8h , v2.h[0] //weight 1 mult. for rows 3,4 + mla v8.8h, v10.8h , v2.h[2] //weight 2 mult. for rows 3,4 subs w11, w11, #4 //decrement ht by 4 srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from rows 1,2 srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from rows 3,4 @@ -205,18 +205,18 @@ loop_8: //each iteration processes four rows ld1 {v18.8b}, [x1], x4 //load row 4 in source 2 uxtl v8.8h, v8.8b //converting row 2 in source 1 to 16-bit uxtl v10.8h, v10.8b //converting row 2 in source 2 to 16-bit - mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for row 1 - mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 1 + mul v4.8h, v4.8h , v2.h[0] //weight 1 mult. for row 1 + mla v4.8h, v6.8h , v2.h[2] //weight 2 mult. for row 1 uxtl v12.8h, v12.8b //converting row 3 in source 1 to 16-bit uxtl v14.8h, v14.8b //converting row 3 in source 2 to 16-bit - mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for row 2 - mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for row 2 + mul v8.8h, v8.8h , v2.h[0] //weight 1 mult. for row 2 + mla v8.8h, v10.8h , v2.h[2] //weight 2 mult. 
for row 2 uxtl v16.8h, v16.8b //converting row 4 in source 1 to 16-bit uxtl v18.8h, v18.8b //converting row 4 in source 2 to 16-bit - mul v12.8h, v12.8h , v2.4h[0] //weight 1 mult. for row 3 - mla v12.8h, v14.8h , v2.4h[2] //weight 2 mult. for row 3 - mul v16.8h, v16.8h , v2.4h[0] //weight 1 mult. for row 4 - mla v16.8h, v18.8h , v2.4h[2] //weight 2 mult. for row 4 + mul v12.8h, v12.8h , v2.h[0] //weight 1 mult. for row 3 + mla v12.8h, v14.8h , v2.h[2] //weight 2 mult. for row 3 + mul v16.8h, v16.8h , v2.h[0] //weight 1 mult. for row 4 + mla v16.8h, v18.8h , v2.h[2] //weight 2 mult. for row 4 srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1 srshl v8.8h, v8.8h , v0.8h //rounds off the weighted samples from row 2 srshl v12.8h, v12.8h , v0.8h //rounds off the weighted samples from row 3 @@ -251,35 +251,35 @@ loop_16: //each iteration processes two rows ld1 {v18.8b, v19.8b}, [x1], x4 //load row 4 in source 2 uxtl v4.8h, v5.8b //converting row 1H in source 1 to 16-bit uxtl v6.8h, v7.8b //converting row 1H in source 2 to 16-bit - mul v20.8h, v20.8h , v2.4h[0] //weight 1 mult. for row 1L - mla v20.8h, v22.8h , v2.4h[2] //weight 2 mult. for row 1L + mul v20.8h, v20.8h , v2.h[0] //weight 1 mult. for row 1L + mla v20.8h, v22.8h , v2.h[2] //weight 2 mult. for row 1L uxtl v24.8h, v8.8b //converting row 2L in source 1 to 16-bit uxtl v26.8h, v10.8b //converting row 2L in source 2 to 16-bit - mul v4.8h, v4.8h , v2.4h[0] //weight 1 mult. for row 1H - mla v4.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 1H + mul v4.8h, v4.8h , v2.h[0] //weight 1 mult. for row 1H + mla v4.8h, v6.8h , v2.h[2] //weight 2 mult. for row 1H uxtl v8.8h, v9.8b //converting row 2H in source 1 to 16-bit uxtl v10.8h, v11.8b //converting row 2H in source 2 to 16-bit - mul v24.8h, v24.8h , v2.4h[0] //weight 1 mult. for row 2L - mla v24.8h, v26.8h , v2.4h[2] //weight 2 mult. for row 2L + mul v24.8h, v24.8h , v2.h[0] //weight 1 mult. 
for row 2L + mla v24.8h, v26.8h , v2.h[2] //weight 2 mult. for row 2L uxtl v28.8h, v12.8b //converting row 3L in source 1 to 16-bit uxtl v30.8h, v14.8b //converting row 3L in source 2 to 16-bit - mul v8.8h, v8.8h , v2.4h[0] //weight 1 mult. for row 2H - mla v8.8h, v10.8h , v2.4h[2] //weight 2 mult. for row 2H + mul v8.8h, v8.8h , v2.h[0] //weight 1 mult. for row 2H + mla v8.8h, v10.8h , v2.h[2] //weight 2 mult. for row 2H uxtl v12.8h, v13.8b //converting row 3H in source 1 to 16-bit uxtl v14.8h, v15.8b //converting row 3H in source 2 to 16-bit - mul v28.8h, v28.8h , v2.4h[0] //weight 1 mult. for row 3L - mla v28.8h, v30.8h , v2.4h[2] //weight 2 mult. for row 3L + mul v28.8h, v28.8h , v2.h[0] //weight 1 mult. for row 3L + mla v28.8h, v30.8h , v2.h[2] //weight 2 mult. for row 3L uxtl v22.8h, v16.8b //converting row 4L in source 1 to 16-bit uxtl v6.8h, v18.8b //converting row 4L in source 2 to 16-bit - mul v12.8h, v12.8h , v2.4h[0] //weight 1 mult. for row 3H - mla v12.8h, v14.8h , v2.4h[2] //weight 2 mult. for row 3H + mul v12.8h, v12.8h , v2.h[0] //weight 1 mult. for row 3H + mla v12.8h, v14.8h , v2.h[2] //weight 2 mult. for row 3H uxtl v16.8h, v17.8b //converting row 4H in source 1 to 16-bit uxtl v18.8h, v19.8b //converting row 4H in source 2 to 16-bit - mul v22.8h, v22.8h , v2.4h[0] //weight 1 mult. for row 4L - mla v22.8h, v6.8h , v2.4h[2] //weight 2 mult. for row 4L + mul v22.8h, v22.8h , v2.h[0] //weight 1 mult. for row 4L + mla v22.8h, v6.8h , v2.h[2] //weight 2 mult. for row 4L srshl v20.8h, v20.8h , v0.8h //rounds off the weighted samples from row 1L - mul v16.8h, v16.8h , v2.4h[0] //weight 1 mult. for row 4H - mla v16.8h, v18.8h , v2.4h[2] //weight 2 mult. for row 4H + mul v16.8h, v16.8h , v2.h[0] //weight 1 mult. for row 4H + mla v16.8h, v18.8h , v2.h[2] //weight 2 mult. 
for row 4H srshl v4.8h, v4.8h , v0.8h //rounds off the weighted samples from row 1H srshl v24.8h, v24.8h , v0.8h //rounds off the weighted samples from row 2L saddw v20.8h, v20.8h , v3.8b //adding offset for row 1L |