diff options
Diffstat (limited to 'encoder')
-rw-r--r-- | encoder/arm/ime_distortion_metrics_a9q.s | 1350 | ||||
-rw-r--r-- | encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s | 2 | ||||
-rw-r--r-- | encoder/armv8/ih264e_half_pel_av8.s | 7 | ||||
-rw-r--r-- | encoder/armv8/ime_distortion_metrics_av8.s | 1 | ||||
-rw-r--r-- | encoder/x86/ih264e_intra_modes_eval_ssse3.c | 2 | ||||
-rw-r--r-- | encoder/x86/ime_distortion_metrics_sse42.c | 3 |
6 files changed, 685 insertions, 680 deletions
diff --git a/encoder/arm/ime_distortion_metrics_a9q.s b/encoder/arm/ime_distortion_metrics_a9q.s index b58911e..27fbe3d 100644 --- a/encoder/arm/ime_distortion_metrics_a9q.s +++ b/encoder/arm/ime_distortion_metrics_a9q.s @@ -17,9 +17,9 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** -@/** +@** @****************************************************************************** @* @* @@ -48,7 +48,7 @@ @ -@/** +@** @****************************************************************************** @* @* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode) @@ -79,59 +79,62 @@ @* @remarks @* @****************************************************************************** -@*/ +@* .text .p2align 2 + .global ime_compute_sad_16x16_fast_a9q + ime_compute_sad_16x16_fast_a9q: - stmfd sp!, {r12, lr} - lsl r2, r2, #1 - lsl r3, r3, #1 + stmfd sp!, {r12, lr} + vpush {d8-d15} + lsl r2, r2, #1 + lsl r3, r3, #1 @for bringing buffer2 into cache..., dummy load instructions - @ LDR r12,[r1] + @LDR r12,[r1] - vld1.8 {d4, d5}, [r0], r2 - vld1.8 {d6, d7}, [r1], r3 - mov r12, #6 - vld1.8 {d8, d9}, [r0], r2 - vabdl.u8 q0, d6, d4 - vabdl.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 + vld1.8 {d4, d5}, [r0], r2 + vld1.8 {d6, d7}, [r1], r3 + mov r12, #6 + vld1.8 {d8, d9}, [r0], r2 + vabdl.u8 q0, d6, d4 + vabdl.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 loop_sad_16x16_fast: - vld1.8 {d4, d5}, [r0], r2 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 - vld1.8 {d6, d7}, [r1], r3 - subs r12, #2 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d6, d4 - vabal.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 - - bne loop_sad_16x16_fast + vld1.8 {d4, d5}, [r0], r2 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 + vld1.8 {d6, d7}, [r1], r3 + subs r12, #2 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d6, d4 + vabal.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 + bne loop_sad_16x16_fast - vadd.i16 q0, q0, q1 - vadd.i16 d0, d1, d0 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 - ldr r12, [sp, #12] - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 - vshl.u32 d0, d0, #1 - vst1.32 {d0[0]}, [r12] + vadd.i16 q0, q0, q1 + vadd.i16 d0, d1, d0 + vpop {d8-d15} + ldr r12, [sp, #12] + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vshl.u32 d0, d0, #1 + vst1.32 {d0[0]}, [r12] - ldmfd sp!, {r12, pc} + ldmfd sp!, {r12, pc} -@/** +@** @****************************************************************************** @* @* @brief computes distortion (SAD) between 2 16x8 blocks @@ -163,56 +166,57 @@ loop_sad_16x16_fast: @* @remarks @* @****************************************************************************** -@*/ +@* @ .global ime_compute_sad_16x8_a9q + ime_compute_sad_16x8_a9q: - stmfd sp!, {r12, lr} + stmfd sp!, {r12, lr} @for bringing buffer2 into cache..., dummy load instructions @LDR r12,[r1] - vld1.8 {d4, d5}, [r0], r2 - vld1.8 {d6, d7}, [r1], r3 - mov r12, #6 - vld1.8 {d8, d9}, [r0], r2 - vabdl.u8 q0, d6, d4 - vabdl.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 + vld1.8 {d4, d5}, [r0], r2 + vld1.8 {d6, d7}, [r1], r3 + mov r12, #6 + vpush {d8-d15} + vld1.8 {d8, d9}, [r0], r2 + vabdl.u8 q0, d6, d4 + vabdl.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 loop_sad_16x8: - vld1.8 {d4, d5}, [r0], r2 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 - vld1.8 {d6, d7}, [r1], r3 - subs r12, #2 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d6, d4 - vabal.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 - - bne loop_sad_16x8 - - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 + vld1.8 {d4, d5}, [r0], r2 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 + vld1.8 {d6, d7}, [r1], r3 + subs r12, #2 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d6, d4 + vabal.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 - vadd.i16 q0, q0, q1 - vadd.i16 d0, d1, d0 + bne loop_sad_16x8 - ldr r12, [sp, #12] - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 - vst1.32 {d0[0]}, [r12] + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 - ldmfd sp!, {r12, pc} + vadd.i16 q0, q0, q1 + vadd.i16 d0, d1, d0 + vpop {d8-d15} + ldr r12, [sp, #12] + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vst1.32 {d0[0]}, [r12] + ldmfd sp!, {r12, pc} -@/** +@** @****************************************************************************** @* @* @brief computes distortion (SAD) between 2 16x16 blocks with early exit @@ -243,100 +247,103 @@ loop_sad_16x8: @* @remarks @* @****************************************************************************** -@*/ +@* + .global ime_compute_sad_16x16_ea8_a9q ime_compute_sad_16x16_ea8_a9q: - stmfd sp!, {r5-r7, lr} - lsl r2, r2, #1 - lsl r3, r3, #1 + stmfd sp!, {r5-r7, lr} + lsl r2, r2, #1 + lsl r3, r3, #1 @for bringing buffer2 into cache..., dummy load instructions @LDR r12,[r1] - vld1.8 {d4, d5}, [r0], r2 - vld1.8 {d6, d7}, [r1], r3 - mov r5, #6 - vld1.8 {d8, d9}, [r0], r2 - vabdl.u8 q0, d6, d4 - vabdl.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 - ldrd r6, r7, [sp, #16] + vld1.8 {d4, d5}, [r0], r2 + vld1.8 {d6, d7}, [r1], r3 + mov r5, #6 + ldrd r6, r7, [sp, #16] + vpush {d8-d15} + vld1.8 {d8, d9}, [r0], r2 + vabdl.u8 q0, d6, d4 + vabdl.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 + @r6 = i4_max_sad, r7 = pi4_mb_distortion loop_sad_16x16_ea8_1: - vld1.8 {d4, d5}, [r0], r2 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 - vld1.8 {d6, d7}, [r1], r3 - subs r5, #2 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d6, d4 - vabal.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 - - bne loop_sad_16x16_ea8_1 - - vabal.u8 q0, d10, d8 - sub r0, r0, r2, lsl #3 - vabal.u8 q1, d11, d9 - sub r1, r1, r3, lsl #3 - - vadd.i16 q6, q0, q1 - add r0, r0, r2, asr #1 - vadd.i16 d12, d12, d13 - add r1, r1, r3, asr #1 - - vpaddl.u16 d12, d12 - vld1.8 {d4, d5}, [r0], r2 - vld1.8 {d6, d7}, [r1], r3 - vpaddl.u32 d12, d12 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d6, d4 - vabal.u8 q1, d7, d5 - - vst1.32 {d12[0]}, [r7] - ldr r5, [r7] - cmp r5, r6 - bgt end_func_16x16_ea8 - - vld1.8 {d10, d11}, [r1], r3 - mov r5, #6 + vld1.8 {d4, d5}, [r0], r2 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 + vld1.8 {d6, d7}, [r1], r3 + subs r5, #2 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d6, d4 + vabal.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 + + bne loop_sad_16x16_ea8_1 + + vabal.u8 q0, d10, d8 + sub r0, r0, r2, lsl #3 + vabal.u8 q1, d11, d9 + sub r1, r1, r3, lsl #3 + + vadd.i16 q6, q0, q1 + add r0, r0, r2, asr #1 + vadd.i16 d12, d12, d13 + add r1, r1, r3, asr #1 + + vpaddl.u16 d12, d12 + vld1.8 {d4, d5}, [r0], r2 + vld1.8 {d6, d7}, [r1], r3 + vpaddl.u32 d12, d12 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d6, d4 + vabal.u8 q1, d7, d5 + + vst1.32 {d12[0]}, [r7] + ldr r5, [r7] + cmp r5, r6 + bgt end_func_16x16_ea8 + + vld1.8 {d10, d11}, [r1], r3 + mov r5, #6 loop_sad_16x16_ea8_2: - vld1.8 {d4, d5}, [r0], r2 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 - vld1.8 {d6, d7}, [r1], r3 - subs r5, #2 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d6, d4 - vabal.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 + vld1.8 {d4, d5}, [r0], r2 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 + vld1.8 {d6, d7}, [r1], r3 + subs r5, #2 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d6, d4 + vabal.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 - bne loop_sad_16x16_ea8_2 + bne loop_sad_16x16_ea8_2 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 - vadd.i16 q0, q0, q1 - vadd.i16 d0, d1, d0 + vadd.i16 q0, q0, q1 + vadd.i16 d0, d1, d0 - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 - vst1.32 {d0[0]}, [r7] + vst1.32 {d0[0]}, [r7] end_func_16x16_ea8: - - ldmfd sp!, {r5-r7, pc} + vpop {d8-d15} + ldmfd sp!, {r5-r7, pc} -@/* +@* @//--------------------------------------------------------------------------- @// Function Name : Calculate_Mad2_prog() @// @@ -346,7 +353,7 @@ end_func_16x16_ea8: @// Platform : CortexA8/NEON . @// @//----------------------------------------------------------------------------- -@*/ +@* .global ime_calculate_sad2_prog_a9q @@ -358,72 +365,72 @@ ime_calculate_sad2_prog_a9q: @ r3 = RefBufferWidth <UWORD32> @ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *> - stmfd sp!, {r4-r5, lr} - - ldr r4, [sp, #8] @ load src stride to r4 - mov r5, #14 + stmfd sp!, {r4-r5, lr} + ldr r4, [sp, #8] @ load src stride to r4 + mov r5, #14 + vpush {d8-d15} @Row 1 - vld1.8 {d0, d1}, [r2], r4 @ load src Row 1 - vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1 - vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1 + vld1.8 {d0, d1}, [r2], r4 @ load src Row 1 + vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1 + vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1 @Row 2 - vld1.8 {d6, d7}, [r2], r4 @ load src Row 2 - vabdl.u8 q6, d2, d0 - vabdl.u8 q7, d3, d1 - vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2 - vabdl.u8 q8, d4, d0 - vabdl.u8 q9, d5, d1 - vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2 + vld1.8 {d6, d7}, [r2], r4 @ load src Row 2 + vabdl.u8 q6, d2, d0 + vabdl.u8 q7, d3, d1 + vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2 + vabdl.u8 q8, d4, d0 + vabdl.u8 q9, d5, d1 + vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2 loop_sad2_prog: - subs r5, #2 + subs r5, #2 @Row 1 - vld1.8 {d0, d1}, [r2], r4 @ load src Row 1 - vabal.u8 q6, d8, d6 - vabal.u8 q7, d9, d7 - vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1 - vabal.u8 q8, d10, d6 - vabal.u8 q9, d11, d7 - vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1 + vld1.8 {d0, d1}, [r2], r4 @ load src Row 1 + vabal.u8 q6, d8, d6 + vabal.u8 q7, d9, d7 + vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1 + vabal.u8 q8, d10, d6 + vabal.u8 q9, d11, d7 + vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1 @Row 2 - vld1.8 {d6, d7}, [r2], r4 @ load src Row 2 - vabal.u8 q6, d2, d0 - vabal.u8 q7, d3, d1 - vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2 - vabal.u8 q8, d4, d0 - vabal.u8 q9, d5, d1 - vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2 + vld1.8 {d6, d7}, [r2], r4 @ load src Row 2 + vabal.u8 q6, d2, d0 + vabal.u8 q7, d3, d1 + vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2 + vabal.u8 q8, d4, d0 + vabal.u8 q9, d5, d1 + vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2 - bne loop_sad2_prog + bne loop_sad2_prog - vabal.u8 q6, d8, d6 - vabal.u8 q7, d9, d7 - vabal.u8 q8, d10, d6 - vabal.u8 q9, d11, d7 + vabal.u8 q6, d8, d6 + vabal.u8 q7, d9, d7 + vabal.u8 q8, d10, d6 + vabal.u8 q9, d11, d7 @ Compute SAD - vadd.u16 q6, q6, q7 @ Q6 : sad_ref1 - vadd.u16 q8, q8, q9 @ Q8 : sad_ref2 + vadd.u16 q6, q6, q7 @ Q6 : sad_ref1 + vadd.u16 q8, q8, q9 @ Q8 : sad_ref2 - vadd.u16 d12, d12, d13 - ldr r5, [sp, #16] @ loading pi4_sad to r5 - vadd.u16 d16, d16, d17 + vadd.u16 d12, d12, d13 + ldr r5, [sp, #16] @ loading pi4_sad to r5 + vadd.u16 d16, d16, d17 - vpadd.u16 d12, d12, d16 - vpaddl.u16 d12, d12 + vpadd.u16 d12, d12, d16 + vpaddl.u16 d12, d12 - vst1.64 {d12}, [r5]! + vst1.64 {d12}, [r5]! + vpop {d8-d15} + ldmfd sp!, {r4-r5, pc} - ldmfd sp!, {r4-r5, pc} - -@/* +@* @//--------------------------------------------------------------------------- @// Function Name : Calculate_Mad3_prog() @// @@ -433,7 +440,7 @@ loop_sad2_prog: @// Platform : CortexA8/NEON . @// @//----------------------------------------------------------------------------- -@*/ +@* .global ime_calculate_sad3_prog_a9q @@ -446,90 +453,90 @@ ime_calculate_sad3_prog_a9q: @ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *> - stmfd sp!, {r4-r6, lr} - - ldrd r4, r5, [sp, #16] @ load ref stride to r4, src stride to r5 - mov r6, #14 - - @ Row 1 - vld1.8 {d0, d1}, [r3], r5 @ load src Row 1 - vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1 - vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1 - vabdl.u8 q8, d2, d0 - vabdl.u8 q9, d3, d1 - vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1 - vabdl.u8 q10, d4, d0 - vabdl.u8 q11, d5, d1 - - @ Row 2 - vld1.8 {d8, d9}, [r3], r5 @ load src Row 1 - vabdl.u8 q12, d6, d0 - vabdl.u8 q13, d7, d1 - vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1 - vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1 - vabal.u8 q8, d10, d8 - vabal.u8 q9, d11, d9 - vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1 - vabal.u8 q10, d12, d8 - vabal.u8 q11, d13, d9 + stmfd sp!, {r4-r6, lr} + + ldrd r4, r5, [sp, #16] @ load ref stride to r4, src stride to r5 + mov r6, #14 + vpush {d8-d15} + @Row 1 + vld1.8 {d0, d1}, [r3], r5 @ load src Row 1 + vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1 + vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1 + vabdl.u8 q8, d2, d0 + vabdl.u8 q9, d3, d1 + vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1 + vabdl.u8 q10, d4, d0 + vabdl.u8 q11, d5, d1 + + @Row 2 + vld1.8 {d8, d9}, [r3], r5 @ load src Row 1 + vabdl.u8 q12, d6, d0 + vabdl.u8 q13, d7, d1 + vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1 + vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1 + vabal.u8 q8, d10, d8 + vabal.u8 q9, d11, d9 + vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1 + vabal.u8 q10, d12, d8 + vabal.u8 q11, d13, d9 loop_sad3_prog: @Row 1 - vld1.8 {d0, d1}, [r3], r5 @ load src Row 1 - vabal.u8 q12, d14, d8 - vabal.u8 q13, d15, d9 - vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1 - vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1 - vabal.u8 q8, d2, d0 - vabal.u8 q9, d3, d1 - vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1 - vabal.u8 q10, d4, d0 - vabal.u8 q11, d5, d1 + vld1.8 {d0, d1}, [r3], r5 @ load src Row 1 + vabal.u8 q12, d14, d8 + vabal.u8 q13, d15, d9 + vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1 + vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1 + vabal.u8 q8, d2, d0 + vabal.u8 q9, d3, d1 + vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1 + vabal.u8 q10, d4, d0 + vabal.u8 q11, d5, d1 @Row 2 - vld1.8 {d8, d9}, [r3], r5 @ load src Row 1 - vabal.u8 q12, d6, d0 - vabal.u8 q13, d7, d1 - vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1 - subs r6, #2 - vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1 - vabal.u8 q8, d10, d8 - vabal.u8 q9, d11, d9 - vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1 - vabal.u8 q10, d12, d8 - vabal.u8 q11, d13, d9 - - bne loop_sad3_prog - - vabal.u8 q12, d14, d8 - vabal.u8 q13, d15, d9 + vld1.8 {d8, d9}, [r3], r5 @ load src Row 1 + vabal.u8 q12, d6, d0 + vabal.u8 q13, d7, d1 + vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1 + subs r6, #2 + vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1 + vabal.u8 q8, d10, d8 + vabal.u8 q9, d11, d9 + vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1 + vabal.u8 q10, d12, d8 + vabal.u8 q11, d13, d9 + + bne loop_sad3_prog + + vabal.u8 q12, d14, d8 + vabal.u8 q13, d15, d9 @ Compute SAD - vadd.u16 q8, q8, q9 @ Q8 : sad_ref1 - vadd.u16 q10, q10, q11 @ Q10 : sad_ref2 - vadd.u16 q12, q12, q13 @ Q12 : sad_ref3 + vadd.u16 q8, q8, q9 @ Q8 : sad_ref1 + vadd.u16 q10, q10, q11 @ Q10 : sad_ref2 + vadd.u16 q12, q12, q13 @ Q12 : sad_ref3 - vadd.u16 d16, d16, d17 - vadd.u16 d20, d20, d21 - vadd.u16 d24, d24, d25 + vadd.u16 d16, d16, d17 + vadd.u16 d20, d20, d21 + vadd.u16 d24, d24, d25 - vpadd.u16 d16, d16, d20 - vpadd.u16 d24, d24, d24 + vpadd.u16 d16, d16, d20 + vpadd.u16 d24, d24, d24 - ldr r6, [sp, #24] @ loading pi4_sad to r6 - vpaddl.u16 d16, d16 - vpaddl.u16 d24, d24 + ldr r6, [sp, #24] @ loading pi4_sad to r6 + vpaddl.u16 d16, d16 + vpaddl.u16 d24, d24 - vst1.64 {d16}, [r6]! - vst1.32 {d24[0]}, [r6] + vst1.64 {d16}, [r6]! + vst1.32 {d24[0]}, [r6] + vpop {d8-d15} + ldmfd sp!, {r4-r6, pc} - ldmfd sp!, {r4-r6, pc} - -@/** +@** @****************************************************************************** @* @* @brief computes distortion (SAD) for sub-pel motion estimation @@ -551,7 +558,7 @@ loop_sad3_prog: @* @remarks @* @****************************************************************************** -@*/ +@* .text .p2align 2 @@ -560,115 +567,116 @@ loop_sad3_prog: ime_sub_pel_compute_sad_16x16_a9q: - stmfd sp!, {r4-r11, lr} @store register values to stack + stmfd sp!, {r4-r11, lr} @store register values to stack - ldr r9, [sp, #36] - ldr r10, [sp, #40] + ldr r9, [sp, #36] + ldr r10, [sp, #40] + vpush {d8-d15} + sub r4, r1, #1 @ x left + sub r5, r2, r10 @ y top - sub r4, r1, #1 @ x left - sub r5, r2, r10 @ y top + sub r6, r3, #1 @ xy left + sub r7, r3, r10 @ xy top - sub r6, r3, #1 @ xy left - sub r7, r3, r10 @ xy top - - sub r8, r7, #1 @ xy top-left - mov r11, #15 + sub r8, r7, #1 @ xy top-left + mov r11, #15 @for bringing buffer2 into cache..., dummy load instructions @ LDR r12,[r1] @ LDR r12,[sp,#12] - vld1.8 {d0, d1}, [r0], r9 @ src - vld1.8 {d2, d3}, [r5], r10 @ y top LOAD - vld1.8 {d4, d5}, [r7], r10 @ xy top LOAD - vld1.8 {d6, d7}, [r8], r10 @ xy top-left LOAD - - vabdl.u8 q6, d2, d0 @ y top ABS1 - vabdl.u8 q7, d4, d0 @ xy top ABS1 - vld1.8 {d8, d9}, [r1], r10 @ x LOAD - vabdl.u8 q8, d6, d0 @ xy top-left ABS1 - vabdl.u8 q9, d8, d0 @ x ABS1 - vld1.8 {d10, d11}, [r4], r10 @ x left LOAD - - vabal.u8 q6, d3, d1 @ y top ABS2 - vabal.u8 q7, d5, d1 @ xy top ABS2 - vld1.8 {d2, d3}, [r2], r10 @ y LOAD - vabal.u8 q8, d7, d1 @ xy top-left ABS2 - vabal.u8 q9, d9, d1 @ x ABS2 - vld1.8 {d4, d5}, [r3], r10 @ xy LOAD - - vabdl.u8 q10, d10, d0 @ x left ABS1 - vabdl.u8 q11, d2, d0 @ y ABS1 - vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD - vabdl.u8 q12, d4, d0 @ xy ABS1 - vabdl.u8 q13, d6, d0 @ xy left ABS1 + vld1.8 {d0, d1}, [r0], r9 @ src + vld1.8 {d2, d3}, [r5], r10 @ y top LOAD + vld1.8 {d4, d5}, [r7], r10 @ xy top LOAD + vld1.8 {d6, d7}, [r8], r10 @ xy top-left LOAD + + vabdl.u8 q6, d2, d0 @ y top ABS1 + vabdl.u8 q7, d4, d0 @ xy top ABS1 + vld1.8 {d8, d9}, [r1], r10 @ x LOAD + vabdl.u8 q8, d6, d0 @ xy top-left ABS1 + vabdl.u8 q9, d8, d0 @ x ABS1 + vld1.8 {d10, d11}, [r4], r10 @ x left LOAD + + vabal.u8 q6, d3, d1 @ y top ABS2 + vabal.u8 q7, d5, d1 @ xy top ABS2 + vld1.8 {d2, d3}, [r2], r10 @ y LOAD + vabal.u8 q8, d7, d1 @ xy top-left ABS2 + vabal.u8 q9, d9, d1 @ x ABS2 + vld1.8 {d4, d5}, [r3], r10 @ xy LOAD + + vabdl.u8 q10, d10, d0 @ x left ABS1 + vabdl.u8 q11, d2, d0 @ y ABS1 + vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD + vabdl.u8 q12, d4, d0 @ xy ABS1 + vabdl.u8 q13, d6, d0 @ xy left ABS1 loop_sub_pel_16x16: - vabal.u8 q10, d11, d1 @ x left ABS2 - vabal.u8 q11, d3, d1 @ y ABS2 - subs r11, #1 - vabal.u8 q12, d5, d1 @ xy ABS2 - vabal.u8 q13, d7, d1 @ xy left ABS2 - - vld1.8 {d0, d1}, [r0], r9 @ src - vabal.u8 q6, d2, d0 @ y top ABS1 - vabal.u8 q7, d4, d0 @ xy top ABS1 - vld1.8 {d8, d9}, [r1], r10 @ x LOAD - vabal.u8 q8, d6, d0 @ xy top-left ABS1 - vabal.u8 q9, d8, d0 @ x ABS1 - vld1.8 {d10, d11}, [r4], r10 @ x left LOAD - - vabal.u8 q6, d3, d1 @ y top ABS2 - vabal.u8 q7, d5, d1 @ xy top ABS2 - vld1.8 {d2, d3}, [r2], r10 @ y LOAD - vabal.u8 q8, d7, d1 @ xy top-left ABS2 - vabal.u8 q9, d9, d1 @ x ABS2 - vld1.8 {d4, d5}, [r3], r10 @ xy LOAD - - vabal.u8 q10, d10, d0 @ x left ABS1 - vabal.u8 q11, d2, d0 @ y ABS1 - vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD - vabal.u8 q12, d4, d0 @ xy ABS1 - vabal.u8 q13, d6, d0 @ xy left ABS1 - - bne loop_sub_pel_16x16 - - vabal.u8 q10, d11, d1 @ x left ABS2 - vabal.u8 q11, d3, d1 @ y ABS2 - vabal.u8 q12, d5, d1 @ xy ABS2 - vabal.u8 q13, d7, d1 @ xy left ABS2 - - vadd.i16 d0, d18, d19 @ x - vadd.i16 d3, d12, d13 @ y top - vadd.i16 d6, d14, d15 @ xy top - vadd.i16 d5, d26, d27 @ xy left - vadd.i16 d1, d20, d21 @ x left - vadd.i16 d2, d22, d23 @ y - vadd.i16 d4, d24, d25 @ xy - vadd.i16 d7, d16, d17 @ xy top left - - vpadd.i16 d0, d0, d1 - vpadd.i16 d2, d2, d3 - vpadd.i16 d4, d4, d5 - vpadd.i16 d6, d6, d7 - - vpaddl.u16 d0, d0 - vpaddl.u16 d2, d2 - ldr r11, [sp, #44] - vpaddl.u16 d4, d4 - vpaddl.u16 d6, d6 - - vst1.32 {d0}, [r11]! - vst1.32 {d2}, [r11]! - vst1.32 {d4}, [r11]! - vst1.32 {d6}, [r11]! - - ldmfd sp!, {r4-r11, pc} @Restoring registers from stack - - - -@/** + vabal.u8 q10, d11, d1 @ x left ABS2 + vabal.u8 q11, d3, d1 @ y ABS2 + subs r11, #1 + vabal.u8 q12, d5, d1 @ xy ABS2 + vabal.u8 q13, d7, d1 @ xy left ABS2 + + vld1.8 {d0, d1}, [r0], r9 @ src + vabal.u8 q6, d2, d0 @ y top ABS1 + vabal.u8 q7, d4, d0 @ xy top ABS1 + vld1.8 {d8, d9}, [r1], r10 @ x LOAD + vabal.u8 q8, d6, d0 @ xy top-left ABS1 + vabal.u8 q9, d8, d0 @ x ABS1 + vld1.8 {d10, d11}, [r4], r10 @ x left LOAD + + vabal.u8 q6, d3, d1 @ y top ABS2 + vabal.u8 q7, d5, d1 @ xy top ABS2 + vld1.8 {d2, d3}, [r2], r10 @ y LOAD + vabal.u8 q8, d7, d1 @ xy top-left ABS2 + vabal.u8 q9, d9, d1 @ x ABS2 + vld1.8 {d4, d5}, [r3], r10 @ xy LOAD + + vabal.u8 q10, d10, d0 @ x left ABS1 + vabal.u8 q11, d2, d0 @ y ABS1 + vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD + vabal.u8 q12, d4, d0 @ xy ABS1 + vabal.u8 q13, d6, d0 @ xy left ABS1 + + bne loop_sub_pel_16x16 + + vabal.u8 q10, d11, d1 @ x left ABS2 + vabal.u8 q11, d3, d1 @ y ABS2 + vabal.u8 q12, d5, d1 @ xy ABS2 + vabal.u8 q13, d7, d1 @ xy left ABS2 + + vadd.i16 d0, d18, d19 @ x + vadd.i16 d3, d12, d13 @ y top + vadd.i16 d6, d14, d15 @ xy top + vadd.i16 d5, d26, d27 @ xy left + vadd.i16 d1, d20, d21 @ x left + vadd.i16 d2, d22, d23 @ y + vadd.i16 d4, d24, d25 @ xy + vadd.i16 d7, d16, d17 @ xy top left + + vpadd.i16 d0, d0, d1 + vpadd.i16 d2, d2, d3 + vpadd.i16 d4, d4, d5 + vpadd.i16 d6, d6, d7 + + vpaddl.u16 d0, d0 + vpaddl.u16 d2, d2 + vpop {d8-d15} + ldr r11, [sp, #44] + vpaddl.u16 d4, d4 + vpaddl.u16 d6, d6 + + vst1.32 {d0}, [r11]! + vst1.32 {d2}, [r11]! + vst1.32 {d4}, [r11]! + vst1.32 {d6}, [r11]! + + ldmfd sp!, {r4-r11, pc} @Restoring registers from stack + + + +@** @****************************************************************************** @* @* @brief computes distortion (SAD) between 2 16x16 blocks @@ -699,7 +707,7 @@ loop_sub_pel_16x16: @* @remarks @* @****************************************************************************** -@*/ +@* .text .p2align 2 @@ -710,51 +718,52 @@ ime_compute_sad_16x16_a9q: @STMFD sp!,{r12,lr} - stmfd sp!, {r12, r14} @store register values to stack + stmfd sp!, {r12, r14} @store register values to stack @for bringing buffer2 into cache..., dummy load instructions @ LDR r12,[r1] @ LDR r12,[sp,#12] - vld1.8 {d4, d5}, [r0], r2 - vld1.8 {d6, d7}, [r1], r3 - - mov r12, #14 - vld1.8 {d8, d9}, [r0], r2 - vabdl.u8 q0, d4, d6 - vld1.8 {d10, d11}, [r1], r3 - vabdl.u8 q1, d5, d7 + vld1.8 {d4, d5}, [r0], r2 + vld1.8 {d6, d7}, [r1], r3 + vpush {d8-d15} + mov r12, #14 + vld1.8 {d8, d9}, [r0], r2 + vabdl.u8 q0, d4, d6 + vld1.8 {d10, d11}, [r1], r3 + vabdl.u8 q1, d5, d7 loop_sad_16x16: - vld1.8 {d4, d5}, [r0], r2 - vabal.u8 q0, d8, d10 - vld1.8 {d6, d7}, [r1], r3 - vabal.u8 q1, d9, d11 + vld1.8 {d4, d5}, [r0], r2 + vabal.u8 q0, d8, d10 + vld1.8 {d6, d7}, [r1], r3 + vabal.u8 q1, d9, d11 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d4, d6 - subs r12, #2 - vld1.8 {d10, d11}, [r1], r3 - vabal.u8 q1, d5, d7 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d4, d6 + subs r12, #2 + vld1.8 {d10, d11}, [r1], r3 + vabal.u8 q1, d5, d7 - bne loop_sad_16x16 + bne loop_sad_16x16 - vabal.u8 q0, d8, d10 - vabal.u8 q1, d9, d11 + vabal.u8 q0, d8, d10 + vabal.u8 q1, d9, d11 - vadd.i16 q0, q0, q1 - vadd.i16 d0, d1, d0 - ldr r12, [sp, #12] + vadd.i16 q0, q0, q1 + vadd.i16 d0, d1, d0 + vpop {d8-d15} + ldr r12, [sp, #12] - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 - vst1.32 {d0[0]}, [r12] + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vst1.32 {d0[0]}, [r12] - ldmfd sp!, {r12, pc} @Restoring registers from stack + ldmfd sp!, {r12, pc} @Restoring registers from stack -@/* +@* @//--------------------------------------------------------------------------- @// Function Name : Calculate_Mad4_prog() @// @@ -764,7 +773,7 @@ loop_sad_16x16: @// Platform : CortexA8/NEON . @// @//----------------------------------------------------------------------------- -@*/ +@* .global ime_calculate_sad4_prog_a9q @@ -775,20 +784,20 @@ ime_calculate_sad4_prog_a9q: @ r3 = CurBufferWidth <UWORD32> @ stack = psad <UWORD32 *> {at 0x34} - stmfd sp!, {r4-r7, lr} + stmfd sp!, {r4-r7, lr} @UWORD8 *left_ptr = temp_frame - 1; @UWORD8 *right_ptr = temp_frame + 1; @UWORD8 *top_ptr = temp_frame - RefBufferWidth; @UWORD8 *bot_ptr = temp_frame + RefBufferWidth; - mov r7, #14 - sub r4, r0, #0x01 @r4 = left_ptr - add r5, r0, #0x1 @r5 = right_ptr - sub r6, r0, r2 @r6 = top_ptr - add r0, r0, r2 @r0 = bot_ptr + mov r7, #14 + sub r4, r0, #0x01 @r4 = left_ptr + add r5, r0, #0x1 @r5 = right_ptr + sub r6, r0, r2 @r6 = top_ptr + add r0, r0, r2 @r0 = bot_ptr @r1 = buffer_ptr - + vpush {d8-d15} @D0:D1 : buffer @D2:D3 : top @D4:D5 : left @@ -796,94 +805,93 @@ ime_calculate_sad4_prog_a9q: @D8:D9 : bottom @Row 1 - vld1.8 {d0, d1}, [r1], r3 @ load src Row 1 - vld1.8 {d2, d3}, [r6], r2 @ load top Row 1 - vld1.8 {d4, d5}, [r4], r2 @ load left Row 1 + vld1.8 {d0, d1}, [r1], r3 @ load src Row 1 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 1 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 1 - vabdl.u8 q5, d2, d0 - vld1.8 {d6, d7}, [r5], r2 @ load right Row 1 - vabdl.u8 q6, d3, d1 + vabdl.u8 q5, d2, d0 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 1 + vabdl.u8 q6, d3, d1 - vabdl.u8 q7, d0, d4 - vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1 - vabdl.u8 q8, d1, d5 + vabdl.u8 q7, d0, d4 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1 + vabdl.u8 q8, d1, d5 @Row 2 - vabdl.u8 q9, d0, d6 - vld1.8 {d26, d27}, [r1], r3 @ load src Row 2 - vabdl.u8 q10, d1, d7 + vabdl.u8 q9, d0, d6 + vld1.8 {d26, d27}, [r1], r3 @ load src Row 2 + vabdl.u8 q10, d1, d7 - vabdl.u8 q11, d0, d8 - vld1.8 {d2, d3}, [r6], r2 @ load top Row 2 - vabdl.u8 q12, d1, d9 + vabdl.u8 q11, d0, d8 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 2 + vabdl.u8 q12, d1, d9 loop_sad4_prog: - vabal.u8 q5, d26, d2 - vld1.8 {d4, d5}, [r4], r2 @ load left Row 2 - vabal.u8 q6, d27, d3 + vabal.u8 q5, d26, d2 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 2 + vabal.u8 q6, d27, d3 - vabal.u8 q7, d26, d4 - vld1.8 {d6, d7}, [r5], r2 @ load right Row 2 - vabal.u8 q8, d27, d5 + vabal.u8 q7, d26, d4 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 2 + vabal.u8 q8, d27, d5 - vabal.u8 q9, d26, d6 - vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2 - vabal.u8 q10, d27, d7 + vabal.u8 q9, d26, d6 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2 + vabal.u8 q10, d27, d7 @Row 1 - vabal.u8 q11, d26, d8 - vld1.8 {d0, d1}, [r1], r3 @ load src Row 1 - vabal.u8 q12, d27, d9 - - vld1.8 {d2, d3}, [r6], r2 @ load top Row 1 - subs r7, #2 - vld1.8 {d4, d5}, [r4], r2 @ load left Row 1 + vabal.u8 q11, d26, d8 + vld1.8 {d0, d1}, [r1], r3 @ load src Row 1 + vabal.u8 q12, d27, d9 - vabal.u8 q5, d0, d2 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 1 + subs r7, #2 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 1 - vld1.8 {d6, d7}, [r5], r2 @ load right Row 1 - vabal.u8 q6, d1, d3 + vabal.u8 q5, d0, d2 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 1 + vabal.u8 q6, d1, d3 - vabal.u8 q7, d0, d4 - vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1 - vabal.u8 q8, d1, d5 + vabal.u8 q7, d0, d4 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1 + vabal.u8 q8, d1, d5 @Row 2 - vabal.u8 q9, d0, d6 - vld1.8 {d26, d27}, [r1], r3 @ load src Row 2 - vabal.u8 q10, d1, d7 + vabal.u8 q9, d0, d6 + vld1.8 {d26, d27}, [r1], r3 @ load src Row 2 + vabal.u8 q10, d1, d7 - vabal.u8 q11, d0, d8 - vld1.8 {d2, d3}, [r6], r2 @ load top Row 2 - vabal.u8 q12, d1, d9 + vabal.u8 q11, d0, d8 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 2 + vabal.u8 q12, d1, d9 - bne loop_sad4_prog + bne loop_sad4_prog - vabal.u8 q5, d26, d2 - vld1.8 {d4, d5}, [r4], r2 @ load left Row 2 - vabal.u8 q6, d27, d3 + vabal.u8 q5, d26, d2 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 2 + vabal.u8 q6, d27, d3 - vabal.u8 q7, d26, d4 - vld1.8 {d6, d7}, [r5], r2 @ load right Row 2 - vabal.u8 q8, d27, d5 + vabal.u8 q7, d26, d4 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 2 + vabal.u8 q8, d27, d5 - vabal.u8 q9, d26, d6 - vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2 - vabal.u8 q10, d27, d7 + vabal.u8 q9, d26, d6 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2 + vabal.u8 q10, d27, d7 - vabal.u8 q11, d26, d8 - vabal.u8 q12, d27, d9 + vabal.u8 q11, d26, d8 + vabal.u8 q12, d27, d9 @;Q5:Q6 : sad_top @;Q7:Q8 : sad_left @;Q9:Q10 : sad_right @;Q11:Q12 : sad_bot - vadd.u16 q5, q5, q6 - vadd.u16 q7, q7, q8 - vadd.u16 q9, q9, q10 - vadd.u16 q11, q11, q12 + vadd.u16 q5, q5, q6 + vadd.u16 q7, q7, q8 + vadd.u16 q9, q9, q10 + vadd.u16 q11, q11, q12 @; Free :- @; Q6,Q8,Q10,Q12 @@ -893,10 +901,10 @@ loop_sad4_prog: @;Q9 -> D18:D19 @;Q11 -> D22:D23 - vadd.u16 d10, d10, d11 - vadd.u16 d14, d14, d15 - vadd.u16 d18, d18, d19 - vadd.u16 d22, d22, d23 + vadd.u16 d10, d10, d11 + vadd.u16 d14, d14, d15 + vadd.u16 d18, d18, d19 + vadd.u16 d22, d22, d23 @;D10 : sad_top @;D14 : sad_left @@ -904,35 +912,35 @@ loop_sad4_prog: @;D22 : sad_bot - vpaddl.u16 d11, d10 - vpaddl.u16 d15, d14 - vpaddl.u16 d19, d18 - vpaddl.u16 d23, d22 + vpaddl.u16 d11, d10 + vpaddl.u16 d15, d14 + vpaddl.u16 d19, d18 + vpaddl.u16 d23, d22 @;D11 : sad_top @;D15 : sad_left @;D19 : sad_right @;D23 : sad_bot - vpaddl.u32 d10, d11 - vpaddl.u32 d22, d23 - vpaddl.u32 d14, d15 - vpaddl.u32 d18, d19 + vpaddl.u32 d10, d11 + vpaddl.u32 d22, d23 + vpaddl.u32 d14, d15 + vpaddl.u32 d18, d19 @;D10 : sad_top @;D14 : sad_left @;D18 : sad_right @;D22 : sad_bot - ldr r4, [sp, #20] @;Can be rearranged - - vsli.64 d10, d22, #32 - vsli.64 d14, d18, #32 + ldr r4, [sp, #84] @;Can be rearranged - vst1.64 {d14}, [r4]! - vst1.64 {d10}, [r4]! + vsli.64 d10, d22, #32 + vsli.64 d14, d18, #32 - ldmfd sp!, {r4-r7, pc} + vst1.64 {d14}, [r4]! + vst1.64 {d10}, [r4]! + vpop {d8-d15} + ldmfd sp!, {r4-r7, pc} @@ -974,37 +982,37 @@ ime_compute_satqd_16x16_lumainter_a9q: @R5 :Distortion,ie SAD @R6 :is nonzero - push {r4-r12, lr} @push all the variables first + push {r4-r12, lr} @push all the variables first @ADD SP,SP,#40 ;decrement stack pointer,to accomodate two variables - ldr r4, [sp, #40] @load the threshold address - - mov r8, #8 @Number of 4x8 blocks to be processed - mov r10, #0 @Sad - mov r7, #0 @Nonzero info + ldr r4, [sp, #40] @load the threshold address + vpush {d8-d15} + mov r8, #8 @Number of 4x8 blocks to be processed + mov r10, #0 @Sad + mov r7, #0 @Nonzero info @---------------------------------------------------- - vld1.u8 d30, [r0], r2 @I load 8 pix src row 1 + vld1.u8 d30, [r0], r2 @I load 8 pix src row 1 - vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1 + vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1 - vld1.u8 d28, [r0], r2 @I load 8 pix src row 2 + vld1.u8 d28, [r0], r2 @I load 8 pix src row 2 - vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2 + vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2 - vld1.u8 d26, [r0], r2 @I load 8 pix src row 3 - vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12 + vld1.u8 d26, [r0], r2 @I load 8 pix src row 3 + vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12 - vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3 + vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3 - vld1.u8 d24, [r0], r2 @I load 8 pix src row 4 + vld1.u8 d24, [r0], r2 @I load 8 pix src row 4 - vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4 - vabdl.u8 q1, d28, d29 @I Abs diff r1 blk 12 + vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4 + vabdl.u8 q1, d28, d29 @I Abs diff r1 blk 12 - vld1.u16 {q11}, [r4] @I load the threhold - vabdl.u8 q2, d26, d27 @I Abs diff r1 blk 12 + vld1.u16 {q11}, [r4] @I load the threhold + vabdl.u8 q2, d26, d27 @I Abs diff r1 blk 12 - vabdl.u8 q3, d24, d25 @I Abs diff r1 blk 12 + vabdl.u8 q3, d24, d25 @I Abs diff r1 blk 12 @@ -1013,128 +1021,128 @@ core_loop: @S5 S6 S7 S8 A5 A6 A7 A8 @S9 S10 S11 S12 A9 A10 A11 A12 @S13 S14 S15 S16 A13 A14 A15 A16 - ands r11, r8, #1 @II See if we are at even or odd block - vadd.u16 q4 , q0, q3 @I Add r1 r4 - lsl r11, r2, #2 @II Move back src 4 rows + ands r11, r8, #1 @II See if we are at even or odd block + vadd.u16 q4 , q0, q3 @I Add r1 r4 + lsl r11, r2, #2 @II Move back src 4 rows - subeq r0, r0, r11 @II Move back src 4 rows if we are at even block - vadd.u16 q5 , q1, q2 @I Add r2 r3 - addeq r0, r0, #8 @II Move src 8 cols forward if we are at even block + subeq r0, r0, r11 @II Move back src 4 rows if we are at even block + vadd.u16 q5 , q1, q2 @I Add r2 r3 + addeq r0, r0, #8 @II Move src 8 cols forward if we are at even block - lsl r11, r3, #2 @II Move back pred 4 rows - vtrn.16 d8 , d10 @I trnspse 1 - subeq r1, r1, r11 @II Move back pred 4 rows if we are at even block + lsl r11, r3, #2 @II Move back pred 4 rows + vtrn.16 d8 , d10 @I trnspse 1 + subeq r1, r1, r11 @II Move back pred 4 rows if we are at even block - addeq r1, r1, #8 @II Move pred 8 cols forward if we are at even block - vtrn.16 d9 , d11 @I trnspse 2 - subne r0, r0, #8 @II Src 8clos back for odd rows + addeq r1, r1, #8 @II Move pred 8 cols forward if we are at even block + vtrn.16 d9 , d11 @I trnspse 2 + subne r0, r0, #8 @II Src 8clos back for odd rows - subne r1, r1, #8 @II Pred 8 cols back for odd rows - vtrn.32 d10, d11 @I trnspse 4 + subne r1, r1, #8 @II Pred 8 cols back for odd rows + vtrn.32 d10, d11 @I trnspse 4 - vtrn.32 d8 , d9 @I trnspse 3 - vswp d10, d11 @I rearrange so that the q4 and q5 add properly + vtrn.32 d8 , d9 @I trnspse 3 + vswp d10, d11 @I rearrange so that the q4 and q5 add properly @D8 S1 S4 A1 A4 @D9 S2 S3 A2 A3 @D11 S1 S4 A1 A4 @D10 S2 S3 A2 A3 - vadd.s16 q6, q4, q5 @I Get s1 s4 - vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1 + vadd.s16 q6, q4, q5 @I Get s1 s4 + vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1 - vtrn.s16 d12, d13 @I Get s2 s3 + vtrn.s16 d12, d13 @I Get s2 s3 @D12 S1 S4 A1 A4 @D13 S2 S3 A2 A3 - vshl.s16 q7, q6 , #1 @I si = si<<1 - vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1 + vshl.s16 q7, q6 , #1 @I si = si<<1 + vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1 - vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3) - vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2 + vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3) + vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2 @ D16 S14 A14 S23 A23 - vrev32.16 d0, d16 @I - vuzp.s16 d16, d0 @I + vrev32.16 d0, d16 @I + vuzp.s16 d16, d0 @I @D16 S14 S23 A14 A23 - vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4) - vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2 + vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4) + vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2 @D17 S12 S34 A12 A34 - vrev32.16 q9, q7 @I Rearrange si's + vrev32.16 q9, q7 @I Rearrange si's @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2 @D12 S1 S4 A1 A4 @D19 Z3 Z2 Y3 Y2 - vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1)) - vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3 + vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1)) + vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3 @D13 S2 S3 A2 A3 @D18 Z4 Z1 Y4 Y1 - vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1)) - vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3 + vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1)) + vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3 @Q10 S8 S5 A8 A5 S7 S4 A7 A4 @D16 S14 S23 A14 A23 - vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4 - vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4 + vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4 + vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4 @D22 SAD1 SAD2 junk junk @Q8 S2 S1 A2 A1 S6 S3 A6 A3 @Q10 S8 S5 A8 A5 S7 S4 A7 A4 - vtrn.32 q8, q4 @I Rearrange to make ls of each block togather + vtrn.32 q8, q4 @I Rearrange to make ls of each block togather @Q8 S2 S1 S8 S5 S6 S3 S7 S4 @Q10 A2 A1 A8 A5 A6 A3 A7 A4 - ldrh r11, [r4, #16] @I Load the threshold for DC val blk 1 - vdup.s16 q6, d10[0] @I Get the sad blk 1 - vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12 + ldrh r11, [r4, #16] @I Load the threshold for DC val blk 1 + vdup.s16 q6, d10[0] @I Get the sad blk 1 + vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12 - vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1 - vmov.s16 r9, d10[0] @I Get the sad for block 1 + vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1 + vmov.s16 r9, d10[0] @I Get the sad for block 1 - vsub.s16 q9, q7, q8 @I Add to the lss - vmov.s16 r5, d10[1] @I Get the sad for block 2 + vsub.s16 q9, q7, q8 @I Add to the lss + vmov.s16 r5, d10[1] @I Get the sad for block 2 - vcle.s16 q7, q11, q9 @I Add to the lss - vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4 + vcle.s16 q7, q11, q9 @I Add to the lss + vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4 - vdup.s16 q15, d10[1] @I Get the sad blk 1 - vabdl.u8 q1, d28, d29 @II Abs diff r1 blk 12 + vdup.s16 q15, d10[1] @I Get the sad blk 1 + vabdl.u8 q1, d28, d29 @II Abs diff r1 blk 12 - vshl.s16 q14, q15, #1 @I sad_2 = sad_1<<1 - vsub.s16 q3, q14, q4 @I Add to the lss - vcle.s16 q15, q11, q3 @I Add to the lss + vshl.s16 q14, q15, #1 @I sad_2 = sad_1<<1 + vsub.s16 q3, q14, q4 @I Add to the lss + vcle.s16 q15, q11, q3 @I Add to the lss - ADD R10, R10, R9 @I Add to the global sad blk 1 - vtrn.u8 q15, q7 @I get all comparison bits to one reg - vabdl.u8 q2, d26, d27 @II Abs diff r1 blk 12 + ADD R10, R10, R9 @I Add to the global sad blk 1 + vtrn.u8 q15, q7 @I get all comparison bits to one reg + vabdl.u8 q2, d26, d27 @II Abs diff r1 blk 12 - ADD R10, R10, R5 @I Add to the global sad blk 2 - vshr.u8 q14, q15, #7 @I Shift the bits so that no overflow occurs - cmp r11, r9 + ADD R10, R10, R5 @I Add to the global sad blk 2 + vshr.u8 q14, q15, #7 @I Shift the bits so that no overflow occurs + cmp r11, r9 - movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 1 ;I Compare with threshold blk 1 - vadd.u8 d28, d28, d29 @I Add the bits - cmp r11, r5 @I Compare with threshold blk 2 + movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 1 ;I Compare with threshold blk 1 + vadd.u8 d28, d28, d29 @I Add the bits + cmp r11, r5 @I Compare with threshold blk 2 - movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 2 - vpadd.u8 d28, d28, d29 @I Add the bits + movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 2 + vpadd.u8 d28, d28, d29 @I Add the bits - vmov.u32 r11, d28[0] @I Since a set bit now represents a unstatisofrd contifon store it in r11 - vabdl.u8 q3, d24, d25 @II Abs diff r1 blk 12 + vmov.u32 r11, d28[0] @I Since a set bit now represents a unstatisofrd contifon store it in r11 + vabdl.u8 q3, d24, d25 @II Abs diff r1 blk 12 - orr r7, r7, r11 @I get the guy to r11 + orr r7, r7, r11 @I get the guy to r11 - sub r8, r8, #1 @I Decremrnt block count + sub r8, r8, #1 @I Decremrnt block count - cmp r7, #0 @I If we have atlest one non zero block - bne compute_sad_only @I if a non zero block is der,From now on compute sad only + cmp r7, #0 @I If we have atlest one non zero block + bne compute_sad_only @I if a non zero block is der,From now on compute sad only - cmp r8, #1 @I See if we are at the last block - bne core_loop @I If the blocks are zero, lets continue the satdq + cmp r8, #1 @I See if we are at the last block + bne core_loop @I If the blocks are zero, lets continue the satdq @EPILOUGE for core loop @@ -1142,94 +1150,94 @@ core_loop: @S5 S6 S7 S8 A5 A6 A7 A8 @S9 S10 S11 S12 A9 A10 A11 A12 @S13 S14 S15 S16 A13 A14 A15 A16 - vadd.u16 q4 , q0, q3 @Add r1 r4 - vadd.u16 q5 , q1, q2 @Add r2 r3 + vadd.u16 q4 , q0, q3 @Add r1 r4 + vadd.u16 q5 , q1, q2 @Add r2 r3 @D8 S1 S2 S2 S1 @D10 S4 S3 S3 S4 @D9 A1 A2 A2 A1 @D11 A4 A3 A3 A4 - vtrn.16 d8 , d10 @I trnspse 1 - vtrn.16 d9 , d11 @I trnspse 2 - vtrn.32 d8 , d9 @I trnspse 3 - vtrn.32 d10, d11 @I trnspse 4 + vtrn.16 d8 , d10 @I trnspse 1 + vtrn.16 d9 , d11 @I trnspse 2 + vtrn.32 d8 , d9 @I trnspse 3 + vtrn.32 d10, d11 @I trnspse 4 - vswp d10, d11 @I rearrange so that the q4 and q5 add properly + vswp d10, d11 @I rearrange so that the q4 and q5 add properly @D8 S1 S4 A1 A4 @D9 S2 S3 A2 A3 @D11 S1 S4 A1 A4 @D10 S2 S3 A2 A3 - vadd.s16 q6, q4, q5 @Get s1 s4 - vtrn.s16 d12, d13 @Get s2 s3 + vadd.s16 q6, q4, q5 @Get s1 s4 + vtrn.s16 d12, d13 @Get s2 s3 @D12 S1 S4 A1 A4 @D13 S2 S3 A2 A3 - vshl.s16 q7, q6 , #1 @si = si<<1 - vmov.s16 r9, d10[0] @Get the sad for block 1 + vshl.s16 q7, q6 , #1 @si = si<<1 + vmov.s16 r9, d10[0] @Get the sad for block 1 - vpadd.s16 d16, d12, d13 @(s1 + s4) (s2 + s3) - vmov.s16 r5, d10[1] @Get the sad for block 2 + vpadd.s16 d16, d12, d13 @(s1 + s4) (s2 + s3) + vmov.s16 r5, d10[1] @Get the sad for block 2 @D16 S14 A14 S23 A23 - vrev32.16 d30, d16 @ - vuzp.s16 d16, d30 @ + vrev32.16 d30, d16 @ + vuzp.s16 d16, d30 @ @D16 S14 S23 A14 A23 - vadd.s16 d17, d12, d13 @(s1 + s2) (s3 + s4) + vadd.s16 d17, d12, d13 @(s1 + s2) (s3 + s4) @D17 S12 S34 A12 A34 - vrev32.16 q9, q7 @Rearrange si's + vrev32.16 q9, q7 @Rearrange si's @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2 @D12 S1 S4 A1 A4 @D19 Z3 Z2 Y3 Y2 - vsub.s16 d8, d12, d19 @(s1 - (s3<<1)) (s4 - (s2<<1)) + vsub.s16 d8, d12, d19 @(s1 - (s3<<1)) (s4 - (s2<<1)) @D13 S2 S3 A2 A3 @D18 Z4 Z1 Y4 Y1 - vsub.s16 d9, d13, d18 @(s2 - (s4<<1)) (s3 - (s1<<1)) + vsub.s16 d9, d13, d18 @(s2 - (s4<<1)) (s3 - (s1<<1)) @Q10 S8 S5 A8 A5 S7 S4 A7 A4 @D16 S14 S23 A14 A23 - vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4 + vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4 @D22 SAD1 SAD2 junk junk - vmov.u16 r9, d10[0] @Get the sad for block 1 - vmov.u16 r5, d10[1] @Get the sad for block 2 + vmov.u16 r9, d10[0] @Get the sad for block 1 + vmov.u16 r5, d10[1] @Get the sad for block 2 @Q8 S2 S1 A2 A1 S6 S3 A6 A3 @Q10 S8 S5 A8 A5 S7 S4 A7 A4 - ldrh r11, [r4, #16] @Load the threshold for DC val blk 1 - vtrn.32 q8, q4 @Rearrange to make ls of each block togather - ADD R10, R10, R9 @Add to the global sad blk 1 + ldrh r11, [r4, #16] @Load the threshold for DC val blk 1 + vtrn.32 q8, q4 @Rearrange to make ls of each block togather + ADD R10, R10, R9 @Add to the global sad blk 1 @Q8 S2 S1 S8 S5 S6 S3 S7 S4 @Q10 A2 A1 A8 A5 A6 A3 A7 A4 - vld1.u16 {q11}, [r4] @load the threhold - ADD R10, R10, R5 @Add to the global sad blk 2 + vld1.u16 {q11}, [r4] @load the threhold + ADD R10, R10, R5 @Add to the global sad blk 2 - vdup.u16 q6, d10[0] @Get the sad blk 1 + vdup.u16 q6, d10[0] @Get the sad blk 1 - cmp r11, r9 @Compare with threshold blk 1 - vshl.u16 q7, q6, #1 @sad_2 = sad_1<<1 + cmp r11, r9 @Compare with threshold blk 1 + vshl.u16 q7, q6, #1 @sad_2 = sad_1<<1 - vsub.s16 q9, q7, q8 @Add to the lss + vsub.s16 q9, q7, q8 @Add to the lss - vcle.s16 q15, q11, q9 @Add to the lss - movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 1 + vcle.s16 q15, q11, q9 @Add to the lss + movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 1 - cmp r11, r5 @Compare with threshold blk 2 - vdup.u16 q14, d10[1] @Get the sad blk 1 + cmp r11, r5 @Compare with threshold blk 2 + vdup.u16 q14, d10[1] @Get the sad blk 1 - vshl.u16 q13, q14, #1 @sad_2 = sad_1<<1 - vsub.s16 q12, q13, q4 @Add to the lss - vcle.s16 q14, q11, q12 @Add to the lss - movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 2 + vshl.u16 q13, q14, #1 @sad_2 = sad_1<<1 + vsub.s16 q12, q13, q4 @Add to the lss + vcle.s16 q14, q11, q12 @Add to the lss + movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 2 - vtrn.u8 q14, q15 @get all comparison bits to one reg - vshr.u8 q14, q14, #7 @Shift the bits so that no overflow occurs - vadd.u8 d28, d28, d29 @Add the bits - vpadd.u8 d28, d28, d29 @Add the bits - vmov.u32 r11, d28[0] @Since a set bit now represents a unstatisofrd contifon store it in r11 - orr r7, r7, r11 @get the guy to r11 + vtrn.u8 q14, q15 @get all comparison bits to one reg + vshr.u8 q14, q14, #7 @Shift the bits so that no overflow occurs + vadd.u8 d28, d28, d29 @Add the bits + vpadd.u8 d28, d28, d29 @Add the bits + vmov.u32 r11, d28[0] @Since a set bit now represents a unstatisofrd contifon store it in r11 + orr r7, r7, r11 @get the guy to r11 - b funcend_sad_16x16 @Since all blocks ar processed nw, got to end + b funcend_sad_16x16 @Since all blocks ar processed nw, got to end compute_sad_only: @This block computes SAD only, so will be lighter @IT will start processign at n odd block @@ -1237,117 +1245,119 @@ compute_sad_only: @This block computes SAD only, so will b @and then for two blocks at a time @The counter is r7, hence r7 blocks will be processed - and r11, r8, #1 @Get the last bit of counter - cmp r11, #0 @See if we are at even or odd block + and r11, r8, #1 @Get the last bit of counter + cmp r11, #0 @See if we are at even or odd block @iif the blk is even we just have to set the pointer to the @start of current row - lsleq r11, r2, #2 @I Move back src 4 rows - subeq r0, r0, r11 @I Move back src 4 rows if we are at even block + lsleq r11, r2, #2 @I Move back src 4 rows + subeq r0, r0, r11 @I Move back src 4 rows if we are at even block - lsleq r11, r3, #2 @I Move back pred 4 rows - subeq r1, r1, r11 @I Move back pred 4 rows if we are at even block + lsleq r11, r3, #2 @I Move back pred 4 rows + subeq r1, r1, r11 @I Move back pred 4 rows if we are at even block @ADDEQ R8,R8,#2 ;Inc counter - beq skip_odd_blk @If the blk is odd we have to compute sad + beq skip_odd_blk @If the blk is odd we have to compute sad - vadd.u16 q4, q0, q1 @Add SAD of row1 and row2 - vadd.u16 q5, q2, q3 @Add SAD of row3 and row4 - vadd.u16 q6, q4, q5 @Add SAD of row 1-4 - vadd.u16 d14, d12, d13 @Add Blk1 and blk2 - vpadd.u16 d16, d14, d15 @Add col 1-2 and 3-4 - vpadd.u16 d18, d16, d17 @Add col 12-34 + vadd.u16 q4, q0, q1 @Add SAD of row1 and row2 + vadd.u16 q5, q2, q3 @Add SAD of row3 and row4 + vadd.u16 q6, q4, q5 @Add SAD of row 1-4 + vadd.u16 d14, d12, d13 @Add Blk1 and blk2 + vpadd.u16 d16, d14, d15 @Add col 1-2 and 3-4 + vpadd.u16 d18, d16, d17 @Add col 12-34 - vmov.u16 r9, d18[0] @Move sad to arm - ADD R10, R10, R9 @Add to the global sad + vmov.u16 r9, d18[0] @Move sad to arm + ADD R10, R10, R9 @Add to the global sad - sub r8, r8, #1 @Dec counter - cmp r8, #0 @See if we processed last block - beq funcend_sad_16x16 @if lprocessed last block goto end of func + sub r8, r8, #1 @Dec counter + cmp r8, #0 @See if we processed last block + beq funcend_sad_16x16 @if lprocessed last block goto end of func - sub r0, r0, #8 @Since we processed od block move back src by 8 cols - sub r1, r1, #8 @Since we processed od block move back pred by 8 cols + sub r0, r0, #8 @Since we processed od block move back src by 8 cols + sub r1, r1, #8 @Since we processed od block move back pred by 8 cols skip_odd_blk: - vmov.s16 q0, #0 @Initialize the accumulator - vmov.s16 q1, #0 @Initialize the accumulator + vmov.s16 q0, #0 @Initialize the accumulator + vmov.s16 q1, #0 @Initialize the accumulator - vld1.u8 {q15}, [r0], r2 @load src r1 - vld1.u8 {q14}, [r1], r3 @load pred r1 + vld1.u8 {q15}, [r0], r2 @load src r1 + vld1.u8 {q14}, [r1], r3 @load pred r1 - vld1.u8 {q13}, [r0], r2 @load src r2 - vld1.u8 {q12}, [r1], r3 @load pred r2 + vld1.u8 {q13}, [r0], r2 @load src r2 + vld1.u8 {q12}, [r1], r3 @load pred r2 - vld1.u8 {q11}, [r0], r2 @load src r3 - vld1.u8 {q10}, [r1], r3 @load pred r2 + vld1.u8 {q11}, [r0], r2 @load src r3 + vld1.u8 {q10}, [r1], r3 @load pred r2 - vld1.u8 {q9}, [r0], r2 @load src r4 - vld1.u8 {q8}, [r1], r3 @load pred r4 + vld1.u8 {q9}, [r0], r2 @load src r4 + vld1.u8 {q8}, [r1], r3 @load pred r4 - cmp r8, #2 - beq sad_epilouge + cmp r8, #2 + beq sad_epilouge sad_loop: - vabal.u8 q0, d30, d28 @I accumulate Abs diff R1 - vabal.u8 q1, d31, d29 @I accumulate Abs diff R1 + vabal.u8 q0, d30, d28 @I accumulate Abs diff R1 + vabal.u8 q1, d31, d29 @I accumulate Abs diff R1 - vld1.u8 {q15}, [r0], r2 @II load r1 src - vabal.u8 q0, d26, d24 @I accumulate Abs diff R2 + vld1.u8 {q15}, [r0], r2 @II load r1 src + vabal.u8 q0, d26, d24 @I accumulate Abs diff R2 - vld1.u8 {q14}, [r1], r3 @II load r1 pred - vabal.u8 q1, d27, d25 @I accumulate Abs diff R2 + vld1.u8 {q14}, [r1], r3 @II load r1 pred + vabal.u8 q1, d27, d25 @I accumulate Abs diff R2 - vld1.u8 {q13}, [r0], r2 @II load r3 src - vabal.u8 q0, d22, d20 @I accumulate Abs diff R3 + vld1.u8 {q13}, [r0], r2 @II load r3 src + vabal.u8 q0, d22, d20 @I accumulate Abs diff R3 - vld1.u8 {q12}, [r1], r3 @II load r2 pred - vabal.u8 q1, d23, d21 @I accumulate Abs diff R3 + vld1.u8 {q12}, [r1], r3 @II load r2 pred + vabal.u8 q1, d23, d21 @I accumulate Abs diff R3 - vld1.u8 {q11}, [r0], r2 @II load r3 src - vabal.u8 q0, d18, d16 @I accumulate Abs diff R4 + vld1.u8 {q11}, [r0], r2 @II load r3 src + vabal.u8 q0, d18, d16 @I accumulate Abs diff R4 - sub r8, r8, #2 @Since we processe 16 pix @a time, dec by 2 - vld1.u8 {q10}, [r1], r3 @II load r3 pred - vabal.u8 q1, d19, d17 @I accumulate Abs diff R4 + sub r8, r8, #2 @Since we processe 16 pix @a time, dec by 2 + vld1.u8 {q10}, [r1], r3 @II load r3 pred + vabal.u8 q1, d19, d17 @I accumulate Abs diff R4 - cmp r8, #2 @Check if last loop - vld1.u8 {q9}, [r0], r2 @II load r4 src - vld1.u8 {q8}, [r1], r3 @II load r4 pred + cmp r8, #2 @Check if last loop + vld1.u8 {q9}, [r0], r2 @II load r4 src + vld1.u8 {q8}, [r1], r3 @II load r4 pred - bne sad_loop @Go back to SAD computation + bne sad_loop @Go back to SAD computation sad_epilouge: - vabal.u8 q0, d30, d28 @Accumulate Abs diff R1 - vabal.u8 q1, d31, d29 @Accumulate Abs diff R1 + vabal.u8 q0, d30, d28 @Accumulate Abs diff R1 + vabal.u8 q1, d31, d29 @Accumulate Abs diff R1 - vabal.u8 q0, d26, d24 @Accumulate Abs diff R2 - vabal.u8 q1, d27, d25 @Accumulate Abs diff R2 + vabal.u8 q0, d26, d24 @Accumulate Abs diff R2 + vabal.u8 q1, d27, d25 @Accumulate Abs diff R2 - vabal.u8 q0, d22, d20 @Accumulate Abs diff R3 - vabal.u8 q1, d23, d21 @Aaccumulate Abs diff R3 + vabal.u8 q0, d22, d20 @Accumulate Abs diff R3 + vabal.u8 q1, d23, d21 @Aaccumulate Abs diff R3 - vabal.u8 q0, d18, d16 @Accumulate Abs diff R4 - vabal.u8 q1, d19, d17 @Accumulate Abs diff R4 + vabal.u8 q0, d18, d16 @Accumulate Abs diff R4 + vabal.u8 q1, d19, d17 @Accumulate Abs diff R4 - vadd.u16 q2, q0, q1 @ADD two accumulators - vadd.u16 d6, d4, d5 @Add two blk sad - vpadd.u16 d8, d6, d7 @Add col 1-2 and 3-4 sad - vpadd.u16 d10, d8, d9 @Add col 12-34 sad + vadd.u16 q2, q0, q1 @ADD two accumulators + vadd.u16 d6, d4, d5 @Add two blk sad + vpadd.u16 d8, d6, d7 @Add col 1-2 and 3-4 sad + vpadd.u16 d10, d8, d9 @Add col 12-34 sad - vmov.u16 r9, d10[0] @move SAD to ARM - ADD R10, R10, R9 @Add to the global sad + vmov.u16 r9, d10[0] @move SAD to ARM + ADD R10, R10, R9 @Add to the global sad funcend_sad_16x16: @End of fucntion process - ldr r5, [sp, #44] - ldr r6, [sp, #48] - str r7, [r6] @Store the is zero reg - str r10, [r5] @Store sad + vpop {d8-d15} + ldr r5, [sp, #44] + ldr r6, [sp, #48] + + str r7, [r6] @Store the is zero reg + str r10, [r5] @Store sad @SUB SP,SP,#40 - pop {r4-r12, pc} + pop {r4-r12, pc} diff --git a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s index c442077..e768c21 100644 --- a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s +++ b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s @@ -76,7 +76,7 @@ .p2align 2 .include "ih264_neon_macros.s" -.globl ih264e_evaluate_intra16x16_modes_av8 +.global ih264e_evaluate_intra16x16_modes_av8 ih264e_evaluate_intra16x16_modes_av8: diff --git a/encoder/armv8/ih264e_half_pel_av8.s b/encoder/armv8/ih264e_half_pel_av8.s index 6dbd8f8..817faa6 100644 --- a/encoder/armv8/ih264e_half_pel_av8.s +++ b/encoder/armv8/ih264e_half_pel_av8.s @@ -1015,10 +1015,3 @@ filter_2dvh_skip_row: ///***************************************** - - - - - - - .section .note.gnu-stack,"",%progbits diff --git a/encoder/armv8/ime_distortion_metrics_av8.s b/encoder/armv8/ime_distortion_metrics_av8.s index 99ebc8a..47c3425 100644 --- a/encoder/armv8/ime_distortion_metrics_av8.s +++ b/encoder/armv8/ime_distortion_metrics_av8.s @@ -975,4 +975,3 @@ satdq_end_func: ldp d8, d9, [sp], #16 pop_v_regs ret - .section .note.gnu-stack,"",%progbits diff --git a/encoder/x86/ih264e_intra_modes_eval_ssse3.c b/encoder/x86/ih264e_intra_modes_eval_ssse3.c index 657921f..0f4a9ad 100644 --- a/encoder/x86/ih264e_intra_modes_eval_ssse3.c +++ b/encoder/x86/ih264e_intra_modes_eval_ssse3.c @@ -487,7 +487,7 @@ void ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 *pu1_src, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; WORD32 min_cost; - WORD32 lambda4 = u4_lambda << 2; + UWORD32 lambda4 = u4_lambda << 2; WORD32 dst_strd2, dst_strd3; __m128i left_top_16x8b, src_16x8b, pred0_16x8b, sad_8x16b; diff --git a/encoder/x86/ime_distortion_metrics_sse42.c b/encoder/x86/ime_distortion_metrics_sse42.c index 0876788..baf18a4 100644 --- a/encoder/x86/ime_distortion_metrics_sse42.c +++ b/encoder/x86/ime_distortion_metrics_sse42.c @@ -110,6 +110,7 @@ void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src, __m128i res_r0, res_r1, res_r2, res_r3; __m128i sad_val; int val1, val2; + UNUSED (i4_max_sad); // Row 0-3 sad calculation src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); @@ -248,6 +249,7 @@ void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src, WORD32 i4_max_sad, WORD32 *pi4_mb_distortion) { + UNUSED (i4_max_sad); __m128i src_r0, src_r1, src_r2, src_r3; __m128i est_r0, est_r1, est_r2, est_r3; __m128i res_r0, res_r1, res_r2, res_r3; @@ -498,6 +500,7 @@ void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src, WORD32 i4_max_sad, WORD32 *pi4_mb_distortion) { + UNUSED (i4_max_sad); __m128i src_r0, src_r1, src_r2, src_r3; __m128i est_r0, est_r1, est_r2, est_r3; __m128i res_r0, res_r1, res_r2, res_r3; |