diff options
Diffstat (limited to 'common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s')
-rw-r--r-- | common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s | 143 |
1 files changed, 73 insertions, 70 deletions
diff --git a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s index e9f83ff..5d65e63 100644 --- a/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s +++ b/common/arm64/ihevc_intra_pred_filters_chroma_mode_11_to_17.s @@ -105,7 +105,9 @@ ihevc_intra_pred_chroma_mode_11_to_17_av8: // stmfd sp!, {x4-x12, x14} //stack stores the values of the arguments - push_v_regs + + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! stp x19, x20,[sp,#-16]! adrp x7, :got:gai4_ihevc_ang_table @@ -279,8 +281,8 @@ prologue_8_16_32: // mov x0, #32 movi v28.8b, #32 - sqxtn v8.8b, v22.8h - shl v8.8b, v8.8b,#1 // 2 * idx + sqxtn v19.8b, v22.8h + shl v19.8b, v19.8b,#1 // 2 * idx and v6.8b, v6.8b , v29.8b //fract values in d1/ idx values in d0 @@ -292,15 +294,15 @@ prologue_8_16_32: add v27.8b, v27.8b , v29.8b mov x0,#0 - add v8.8b, v8.8b , v27.8b //ref_main_idx (add row) - sub v8.8b, v8.8b , v26.8b //ref_main_idx (row 0) - add v9.8b, v8.8b , v29.8b //ref_main_idx + 1 (row 0) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0) + add v19.8b, v19.8b , v27.8b //ref_main_idx (add row) + sub v19.8b, v19.8b , v26.8b //ref_main_idx (row 0) + add v21.8b, v19.8b , v29.8b //ref_main_idx + 1 (row 0) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0) sub v7.8b, v28.8b , v6.8b //32-fract - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0) - add v4.8b, v8.8b , v29.8b //ref_main_idx (row 1) - add v5.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 1) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0) + add v4.8b, v19.8b , v29.8b //ref_main_idx (row 1) + add v5.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 1) // mov x0, #4 @ 2 *(row * 2 ) movi v29.8b, #4 @@ -310,38 +312,38 @@ prologue_8_16_32: umlal v24.8h, v13.8b, v6.8b //mul (row 0) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 1) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 2) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 2) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 2) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 2) rshrn v24.8b, v24.8h,#5 //round shft (row 0) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2) umull v22.8h, v16.8b, v7.8b //mul (row 1) umlal v22.8h, v17.8b, v6.8b //mul (row 1) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3) st1 {v24.8b},[x2], x3 //st (row 0) rshrn v22.8b, v22.8h,#5 //round shft (row 1) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) umull v20.8h, v14.8b, v7.8b //mul (row 2) umlal v20.8h, v15.8b, v6.8b //mul (row 2) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 4) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 4) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5) @@ -353,32 +355,32 @@ prologue_8_16_32: umlal v24.8h, v13.8b, v6.8b //mul (row 4) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 6) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 6) st1 {v18.8b},[x2], x3 //st (row 3) cmp x4,#4 beq end_func rshrn v24.8b, v24.8h,#5 //round shft (row 4) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6) umull v22.8h, v16.8b, v7.8b //mul (row 5) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7) st1 {v24.8b},[x2], x3 //st (row 4) rshrn v22.8b, v22.8h,#5 //round shft (row 5) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) umlal v20.8h, v15.8b, v6.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v22.8b},[x2], x3 //st (row 5) rshrn v20.8b, v20.8h,#5 //round shft (row 6) @@ -413,10 +415,10 @@ lbl400: ld1 {v31.8b},[x14],#8 smull v12.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) - xtn v10.8b, v12.8h + xtn v23.8b, v12.8h sshr v12.8h, v12.8h,#5 - sqxtn v11.8b, v12.8h - shl v11.8b, v11.8b,#1 + sqxtn v25.8b, v12.8h + shl v25.8b, v25.8b,#1 orr x5,x0,x0, lsl#8 add x5, x5,#0x002 add x5, x5,#0x300 @@ -427,7 +429,7 @@ lbl400: add x9, x9, x0, lsl #1 // sub x9, x9, #1 dup v26.8b,w9 - add v8.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v19.8b, v27.8b , v25.8b //ref_main_idx (add row) mov x5,x2 // sub x4,x4,#8 @@ -435,16 +437,16 @@ lbl400: kernel_8_16_32: movi v29.8b, #2 //contains #2 for adding to get ref_main_idx + 1 - sub v8.8b, v8.8b , v26.8b //ref_main_idx - mov v26.8b, v10.8b + sub v19.8b, v19.8b , v26.8b //ref_main_idx + mov v26.8b, v23.8b subs x11, x11, #8 add x6, x1, x9 - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) - add v9.8b, v29.8b , v8.8b //ref_main_idx + 1 + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + add v21.8b, v29.8b , v19.8b //ref_main_idx + 1 umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) add x20, x0, #8 @@ -468,15 +470,15 @@ kernel_8_16_32: ldr x14, [x14, #:got_lo12:col_for_intra_chroma] lbl452: - add v4.8b, v29.8b , v8.8b //ref_main_idx (row 1) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 0) - add v5.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 1) + add v4.8b, v29.8b , v19.8b //ref_main_idx (row 1) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 0) + add v5.8b, v29.8b , v21.8b //ref_main_idx + 1 (row 1) movi v29.8b, #31 //contains #2 for adding to get ref_main_idx + 1 - umull v18.8h, v10.8b, v7.8b //mul (row 7) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 0) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 0) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) ld1 {v31.8b},[x14],#8 and v6.8b, v29.8b , v26.8b //fract values in d1/ idx values in d0 @@ -486,9 +488,9 @@ lbl452: st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) - add v8.8b, v29.8b , v8.8b //ref_main_idx (row 2) + add v19.8b, v29.8b , v19.8b //ref_main_idx (row 2) tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 1) - add v9.8b, v29.8b , v9.8b //ref_main_idx + 1 (row 2) + add v21.8b, v29.8b , v21.8b //ref_main_idx + 1 (row 2) lsl x20, x4, #1 csel x11,x20,x11,le @@ -505,22 +507,22 @@ lbl452: rshrn v18.8b, v18.8h,#5 //(from previous loop)round shft (row 7) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 3) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 2) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 2) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 3) umull v22.8h, v16.8b, v7.8b //mul (row 1) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 2) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 2) umlal v22.8h, v17.8b, v6.8b //mul (row 1) rshrn v24.8b, v24.8h,#5 //round shft (row 0) st1 {v18.8b},[x5], x3 //(from previous loop)st (row 7) - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 4) - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 4) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 4) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 3) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 4) umull v20.8h, v14.8b, v7.8b //mul (row 2) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 3) umlal v20.8h, v15.8b, v6.8b //mul (row 2) smull v14.8h, v30.8b, v31.8b //(col+1)*intra_pred_angle [0:7](col) @@ -532,22 +534,22 @@ lbl452: rshrn v22.8b, v22.8h,#5 //round shft (row 1) add v4.8b, v4.8b , v29.8b //ref_main_idx (row 5) - tbl v12.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 4) + tbl v12.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 4) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 3) - tbl v13.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 4) - umlal v18.8h, v11.8b, v6.8b //mul (row 3) + umull v18.8h, v23.8b, v7.8b //mul (row 3) + tbl v13.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 4) + umlal v18.8h, v25.8b, v6.8b //mul (row 3) st1 {v22.8b},[x2], x3 //st (row 1) rshrn v20.8b, v20.8h,#5 //round shft (row 2) - xtn v10.8b, v14.8h + xtn v23.8b, v14.8h sshr v14.8h, v14.8h,#5 - add v8.8b, v8.8b , v29.8b //ref_main_idx (row 6) + add v19.8b, v19.8b , v29.8b //ref_main_idx (row 6) tbl v16.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 5) - add v9.8b, v9.8b , v29.8b //ref_main_idx + 1 (row 6) + add v21.8b, v21.8b , v29.8b //ref_main_idx + 1 (row 6) umull v24.8h, v12.8b, v7.8b //mul (row 4) tbl v17.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 5) @@ -557,19 +559,19 @@ lbl452: rshrn v18.8b, v18.8h,#5 //round shft (row 3) // sub x9, x9, #1 - sqxtn v11.8b, v14.8h + sqxtn v25.8b, v14.8h add v4.8b, v4.8b , v29.8b //ref_main_idx (row 7) - tbl v14.8b, { v0.16b, v1.16b}, v8.8b //load from ref_main_idx (row 6) + tbl v14.8b, { v0.16b, v1.16b}, v19.8b //load from ref_main_idx (row 6) add v5.8b, v5.8b , v29.8b //ref_main_idx + 1 (row 7) - shl v11.8b, v11.8b,#1 + shl v25.8b, v25.8b,#1 umull v22.8h, v16.8b, v7.8b //mul (row 5) - tbl v15.8b, { v0.16b, v1.16b}, v9.8b //load from ref_main_idx + 1 (row 6) + tbl v15.8b, { v0.16b, v1.16b}, v21.8b //load from ref_main_idx + 1 (row 6) umlal v22.8h, v17.8b, v6.8b //mul (row 5) - add v8.8b, v27.8b , v11.8b //ref_main_idx (add row) + add v19.8b, v27.8b , v25.8b //ref_main_idx (add row) dup v26.8b,w9 st1 {v18.8b},[x2], x3 //st (row 3) @@ -589,17 +591,17 @@ lbl452: bne kernel_8_16_32 epil_8_16_32: - tbl v10.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) + tbl v23.8b, { v0.16b, v1.16b}, v4.8b //load from ref_main_idx (row 7) umull v20.8h, v14.8b, v7.8b //mul (row 6) - tbl v11.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) + tbl v25.8b, { v0.16b, v1.16b}, v5.8b //load from ref_main_idx + 1 (row 7) umlal v20.8h, v15.8b, v6.8b //mul (row 6) st1 {v24.8b},[x5], x3 //st (row 4) rshrn v24.8b, v22.8h,#5 //round shft (row 5) - umull v18.8h, v10.8b, v7.8b //mul (row 7) - umlal v18.8h, v11.8b, v6.8b //mul (row 7) + umull v18.8h, v23.8b, v7.8b //mul (row 7) + umlal v18.8h, v25.8b, v6.8b //mul (row 7) st1 {v24.8b},[x5], x3 //(from previous loop)st (row 5) rshrn v20.8b, v20.8h,#5 //(from previous loop)round shft (row 6) @@ -613,7 +615,8 @@ end_func: add sp, sp, #132 // ldmfd sp!,{x4-x12,x15} //reload the registers from sp ldp x19, x20,[sp],#16 - pop_v_regs + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 ret |