Diffstat (limited to 'encoder')
-rw-r--r--  encoder/arm/ime_distortion_metrics_a9q.s              1350
-rw-r--r--  encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s     2
-rw-r--r--  encoder/armv8/ih264e_half_pel_av8.s                      7
-rw-r--r--  encoder/armv8/ime_distortion_metrics_av8.s               1
-rw-r--r--  encoder/x86/ih264e_intra_modes_eval_ssse3.c              2
-rw-r--r--  encoder/x86/ime_distortion_metrics_sse42.c               3
6 files changed, 685 insertions, 680 deletions
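
The recurring change in the ARM files below is the addition of vpush {d8-d15}/vpop {d8-d15} around each kernel. Under the ARM AAPCS, d8-d15 (q4-q7) are callee-saved, and these kernels clobber q4 and upward, so returning without restoring them silently corrupts the caller's floating-point state; the adjusted stack offsets (e.g. [sp, #20] becoming [sp, #84]) account for the 64 bytes the vpush occupies. For orientation, here is a scalar C sketch of the plain 16x16 SAD these kernels accelerate — illustrative only, with typedefs that mirror the encoder's usual conventions rather than anything taken from this patch:

    /* Scalar reference for the NEON SAD kernels: sum of absolute
     * differences over a 16x16 block.  Typedefs are assumptions. */
    typedef unsigned char UWORD8;
    typedef int           WORD32;

    static void ref_compute_sad_16x16(const UWORD8 *pu1_src, const UWORD8 *pu1_est,
                                      WORD32 src_strd, WORD32 est_strd,
                                      WORD32 *pi4_mb_distortion)
    {
        WORD32 sad = 0;
        for (WORD32 row = 0; row < 16; row++)
        {
            for (WORD32 col = 0; col < 16; col++)
            {
                WORD32 diff = pu1_src[col] - pu1_est[col];
                sad += (diff < 0) ? -diff : diff;
            }
            pu1_src += src_strd;
            pu1_est += est_strd;
        }
        *pi4_mb_distortion = sad;
    }
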
diff --git a/encoder/arm/ime_distortion_metrics_a9q.s b/encoder/arm/ime_distortion_metrics_a9q.s
index b58911e..27fbe3d 100644
--- a/encoder/arm/ime_distortion_metrics_a9q.s
+++ b/encoder/arm/ime_distortion_metrics_a9q.s
@@ -17,9 +17,9 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
-@/**
+@**
@******************************************************************************
@*
@*
@@ -48,7 +48,7 @@
@
-@/**
+@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
@@ -79,59 +79,62 @@
@* @remarks
@*
@******************************************************************************
-@*/
+@*
.text
.p2align 2
+
.global ime_compute_sad_16x16_fast_a9q
+
ime_compute_sad_16x16_fast_a9q:
- stmfd sp!, {r12, lr}
- lsl r2, r2, #1
- lsl r3, r3, #1
+ stmfd sp!, {r12, lr}
+ vpush {d8-d15}
+ lsl r2, r2, #1
+ lsl r3, r3, #1
@for bringing buffer2 into cache..., dummy load instructions
- @ LDR r12,[r1]
+ @LDR r12,[r1]
- vld1.8 {d4, d5}, [r0], r2
- vld1.8 {d6, d7}, [r1], r3
- mov r12, #6
- vld1.8 {d8, d9}, [r0], r2
- vabdl.u8 q0, d6, d4
- vabdl.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ mov r12, #6
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d6, d4
+ vabdl.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
loop_sad_16x16_fast:
- vld1.8 {d4, d5}, [r0], r2
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
- vld1.8 {d6, d7}, [r1], r3
- subs r12, #2
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d6, d4
- vabal.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
-
- bne loop_sad_16x16_fast
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r12, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
+ bne loop_sad_16x16_fast
- vadd.i16 q0, q0, q1
- vadd.i16 d0, d1, d0
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
- ldr r12, [sp, #12]
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
- vshl.u32 d0, d0, #1
- vst1.32 {d0[0]}, [r12]
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
+ vpop {d8-d15}
+ ldr r12, [sp, #12]
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vshl.u32 d0, d0, #1
+ vst1.32 {d0[0]}, [r12]
- ldmfd sp!, {r12, pc}
+ ldmfd sp!, {r12, pc}
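
Judging from the doubled strides (lsl r2, r2, #1) and the final vshl.u32 d0, d0, #1 above, the fast variant samples only the eight even rows and doubles the partial sum to approximate the full 16x16 SAD. A hedged scalar sketch, reusing the typedefs from the reference above:

    /* Assumed behaviour of the fast kernel: even rows only, doubled. */
    static void ref_compute_sad_16x16_fast(const UWORD8 *pu1_src, const UWORD8 *pu1_est,
                                           WORD32 src_strd, WORD32 est_strd,
                                           WORD32 *pi4_mb_distortion)
    {
        WORD32 sad = 0;
        for (WORD32 row = 0; row < 16; row += 2)   /* rows 0, 2, ..., 14 */
        {
            for (WORD32 col = 0; col < 16; col++)
            {
                WORD32 diff = pu1_src[col] - pu1_est[col];
                sad += (diff < 0) ? -diff : diff;
            }
            pu1_src += 2 * src_strd;               /* the doubled strides */
            pu1_est += 2 * est_strd;
        }
        *pi4_mb_distortion = sad << 1;             /* matches the vshl by 1 */
    }
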
-@/**
+@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x8 blocks
@@ -163,56 +166,57 @@ loop_sad_16x16_fast:
@* @remarks
@*
@******************************************************************************
-@*/
+@*
@
.global ime_compute_sad_16x8_a9q
+
ime_compute_sad_16x8_a9q:
- stmfd sp!, {r12, lr}
+ stmfd sp!, {r12, lr}
@for bringing buffer2 into cache..., dummy load instructions
@LDR r12,[r1]
- vld1.8 {d4, d5}, [r0], r2
- vld1.8 {d6, d7}, [r1], r3
- mov r12, #6
- vld1.8 {d8, d9}, [r0], r2
- vabdl.u8 q0, d6, d4
- vabdl.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ mov r12, #6
+ vpush {d8-d15}
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d6, d4
+ vabdl.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
loop_sad_16x8:
- vld1.8 {d4, d5}, [r0], r2
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
- vld1.8 {d6, d7}, [r1], r3
- subs r12, #2
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d6, d4
- vabal.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
-
- bne loop_sad_16x8
-
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r12, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
- vadd.i16 q0, q0, q1
- vadd.i16 d0, d1, d0
+ bne loop_sad_16x8
- ldr r12, [sp, #12]
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
- vst1.32 {d0[0]}, [r12]
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
- ldmfd sp!, {r12, pc}
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
+ vpop {d8-d15}
+ ldr r12, [sp, #12]
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vst1.32 {d0[0]}, [r12]
+ ldmfd sp!, {r12, pc}
-@/**
+@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
@@ -243,100 +247,103 @@ loop_sad_16x8:
@* @remarks
@*
@******************************************************************************
-@*/
+@*
+
.global ime_compute_sad_16x16_ea8_a9q
ime_compute_sad_16x16_ea8_a9q:
- stmfd sp!, {r5-r7, lr}
- lsl r2, r2, #1
- lsl r3, r3, #1
+ stmfd sp!, {r5-r7, lr}
+ lsl r2, r2, #1
+ lsl r3, r3, #1
@for bringing buffer2 into cache..., dummy load instructions
@LDR r12,[r1]
- vld1.8 {d4, d5}, [r0], r2
- vld1.8 {d6, d7}, [r1], r3
- mov r5, #6
- vld1.8 {d8, d9}, [r0], r2
- vabdl.u8 q0, d6, d4
- vabdl.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
- ldrd r6, r7, [sp, #16]
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ mov r5, #6
+ ldrd r6, r7, [sp, #16]
+ vpush {d8-d15}
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d6, d4
+ vabdl.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+
@r6 = i4_max_sad, r7 = pi4_mb_distortion
loop_sad_16x16_ea8_1:
- vld1.8 {d4, d5}, [r0], r2
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
- vld1.8 {d6, d7}, [r1], r3
- subs r5, #2
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d6, d4
- vabal.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
-
- bne loop_sad_16x16_ea8_1
-
- vabal.u8 q0, d10, d8
- sub r0, r0, r2, lsl #3
- vabal.u8 q1, d11, d9
- sub r1, r1, r3, lsl #3
-
- vadd.i16 q6, q0, q1
- add r0, r0, r2, asr #1
- vadd.i16 d12, d12, d13
- add r1, r1, r3, asr #1
-
- vpaddl.u16 d12, d12
- vld1.8 {d4, d5}, [r0], r2
- vld1.8 {d6, d7}, [r1], r3
- vpaddl.u32 d12, d12
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d6, d4
- vabal.u8 q1, d7, d5
-
- vst1.32 {d12[0]}, [r7]
- ldr r5, [r7]
- cmp r5, r6
- bgt end_func_16x16_ea8
-
- vld1.8 {d10, d11}, [r1], r3
- mov r5, #6
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r5, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+
+ bne loop_sad_16x16_ea8_1
+
+ vabal.u8 q0, d10, d8
+ sub r0, r0, r2, lsl #3
+ vabal.u8 q1, d11, d9
+ sub r1, r1, r3, lsl #3
+
+ vadd.i16 q6, q0, q1
+ add r0, r0, r2, asr #1
+ vadd.i16 d12, d12, d13
+ add r1, r1, r3, asr #1
+
+ vpaddl.u16 d12, d12
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ vpaddl.u32 d12, d12
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+
+ vst1.32 {d12[0]}, [r7]
+ ldr r5, [r7]
+ cmp r5, r6
+ bgt end_func_16x16_ea8
+
+ vld1.8 {d10, d11}, [r1], r3
+ mov r5, #6
loop_sad_16x16_ea8_2:
- vld1.8 {d4, d5}, [r0], r2
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
- vld1.8 {d6, d7}, [r1], r3
- subs r5, #2
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d6, d4
- vabal.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r5, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
- bne loop_sad_16x16_ea8_2
+ bne loop_sad_16x16_ea8_2
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
- vadd.i16 q0, q0, q1
- vadd.i16 d0, d1, d0
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
- vst1.32 {d0[0]}, [r7]
+ vst1.32 {d0[0]}, [r7]
end_func_16x16_ea8:
-
- ldmfd sp!, {r5-r7, pc}
+ vpop {d8-d15}
+ ldmfd sp!, {r5-r7, pc}
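
The ea8 kernel above interleaves the same row split with an early exit: it accumulates the even rows, publishes that partial SAD through pi4_mb_distortion, and returns if it already exceeds i4_max_sad before folding in the odd rows. A scalar sketch of that control flow, inferred from the assembly and reusing the earlier typedefs:

    static void ref_compute_sad_16x16_ea8(const UWORD8 *pu1_src, const UWORD8 *pu1_est,
                                          WORD32 src_strd, WORD32 est_strd,
                                          WORD32 i4_max_sad, WORD32 *pi4_mb_distortion)
    {
        WORD32 sad = 0;
        for (WORD32 row = 0; row < 16; row += 2)   /* even rows first */
            for (WORD32 col = 0; col < 16; col++)
            {
                WORD32 diff = pu1_src[row * src_strd + col]
                            - pu1_est[row * est_strd + col];
                sad += (diff < 0) ? -diff : diff;
            }
        *pi4_mb_distortion = sad;
        if (sad > i4_max_sad)
            return;                                /* early exit after 8 rows */
        for (WORD32 row = 1; row < 16; row += 2)   /* then the odd rows */
            for (WORD32 col = 0; col < 16; col++)
            {
                WORD32 diff = pu1_src[row * src_strd + col]
                            - pu1_est[row * est_strd + col];
                sad += (diff < 0) ? -diff : diff;
            }
        *pi4_mb_distortion = sad;
    }
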
-@/*
+@*
@//---------------------------------------------------------------------------
@// Function Name : Calculate_Mad2_prog()
@//
@@ -346,7 +353,7 @@ end_func_16x16_ea8:
@// Platform : CortexA8/NEON .
@//
@//-----------------------------------------------------------------------------
-@*/
+@*
.global ime_calculate_sad2_prog_a9q
@@ -358,72 +365,72 @@ ime_calculate_sad2_prog_a9q:
@ r3 = RefBufferWidth <UWORD32>
@ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>
- stmfd sp!, {r4-r5, lr}
-
- ldr r4, [sp, #8] @ load src stride to r4
- mov r5, #14
+ stmfd sp!, {r4-r5, lr}
+ ldr r4, [sp, #8] @ load src stride to r4
+ mov r5, #14
+ vpush {d8-d15}
@Row 1
- vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
- vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
- vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
+ vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
+ vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
+ vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
@Row 2
- vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
- vabdl.u8 q6, d2, d0
- vabdl.u8 q7, d3, d1
- vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
- vabdl.u8 q8, d4, d0
- vabdl.u8 q9, d5, d1
- vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
+ vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
+ vabdl.u8 q6, d2, d0
+ vabdl.u8 q7, d3, d1
+ vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
+ vabdl.u8 q8, d4, d0
+ vabdl.u8 q9, d5, d1
+ vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
loop_sad2_prog:
- subs r5, #2
+ subs r5, #2
@Row 1
- vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
- vabal.u8 q6, d8, d6
- vabal.u8 q7, d9, d7
- vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
- vabal.u8 q8, d10, d6
- vabal.u8 q9, d11, d7
- vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
+ vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
+ vabal.u8 q6, d8, d6
+ vabal.u8 q7, d9, d7
+ vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
+ vabal.u8 q8, d10, d6
+ vabal.u8 q9, d11, d7
+ vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
@Row 2
- vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
- vabal.u8 q6, d2, d0
- vabal.u8 q7, d3, d1
- vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
- vabal.u8 q8, d4, d0
- vabal.u8 q9, d5, d1
- vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
+ vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
+ vabal.u8 q6, d2, d0
+ vabal.u8 q7, d3, d1
+ vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
+ vabal.u8 q8, d4, d0
+ vabal.u8 q9, d5, d1
+ vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
- bne loop_sad2_prog
+ bne loop_sad2_prog
- vabal.u8 q6, d8, d6
- vabal.u8 q7, d9, d7
- vabal.u8 q8, d10, d6
- vabal.u8 q9, d11, d7
+ vabal.u8 q6, d8, d6
+ vabal.u8 q7, d9, d7
+ vabal.u8 q8, d10, d6
+ vabal.u8 q9, d11, d7
@ Compute SAD
- vadd.u16 q6, q6, q7 @ Q6 : sad_ref1
- vadd.u16 q8, q8, q9 @ Q8 : sad_ref2
+ vadd.u16 q6, q6, q7 @ Q6 : sad_ref1
+ vadd.u16 q8, q8, q9 @ Q8 : sad_ref2
- vadd.u16 d12, d12, d13
- ldr r5, [sp, #16] @ loading pi4_sad to r5
- vadd.u16 d16, d16, d17
+ vadd.u16 d12, d12, d13
+ ldr r5, [sp, #16] @ loading pi4_sad to r5
+ vadd.u16 d16, d16, d17
- vpadd.u16 d12, d12, d16
- vpaddl.u16 d12, d12
+ vpadd.u16 d12, d12, d16
+ vpaddl.u16 d12, d12
- vst1.64 {d12}, [r5]!
+ vst1.64 {d12}, [r5]!
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r5, pc}
- ldmfd sp!, {r4-r5, pc}
-
-@/*
+@*
@//---------------------------------------------------------------------------
@// Function Name : Calculate_Mad3_prog()
@//
@@ -433,7 +440,7 @@ loop_sad2_prog:
@// Platform : CortexA8/NEON .
@//
@//-----------------------------------------------------------------------------
-@*/
+@*
.global ime_calculate_sad3_prog_a9q
@@ -446,90 +453,90 @@ ime_calculate_sad3_prog_a9q:
@ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>
- stmfd sp!, {r4-r6, lr}
-
- ldrd r4, r5, [sp, #16] @ load ref stride to r4, src stride to r5
- mov r6, #14
-
- @ Row 1
- vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
- vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
- vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
- vabdl.u8 q8, d2, d0
- vabdl.u8 q9, d3, d1
- vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
- vabdl.u8 q10, d4, d0
- vabdl.u8 q11, d5, d1
-
- @ Row 2
- vld1.8 {d8, d9}, [r3], r5 @ load src Row 1
- vabdl.u8 q12, d6, d0
- vabdl.u8 q13, d7, d1
- vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1
- vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1
- vabal.u8 q8, d10, d8
- vabal.u8 q9, d11, d9
- vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1
- vabal.u8 q10, d12, d8
- vabal.u8 q11, d13, d9
+ stmfd sp!, {r4-r6, lr}
+
+ ldrd r4, r5, [sp, #16] @ load ref stride to r4, src stride to r5
+ mov r6, #14
+ vpush {d8-d15}
+ @Row 1
+ vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
+ vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
+ vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
+ vabdl.u8 q8, d2, d0
+ vabdl.u8 q9, d3, d1
+ vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
+ vabdl.u8 q10, d4, d0
+ vabdl.u8 q11, d5, d1
+
+ @Row 2
+ vld1.8 {d8, d9}, [r3], r5 @ load src Row 1
+ vabdl.u8 q12, d6, d0
+ vabdl.u8 q13, d7, d1
+ vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1
+ vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1
+ vabal.u8 q8, d10, d8
+ vabal.u8 q9, d11, d9
+ vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1
+ vabal.u8 q10, d12, d8
+ vabal.u8 q11, d13, d9
loop_sad3_prog:
@Row 1
- vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
- vabal.u8 q12, d14, d8
- vabal.u8 q13, d15, d9
- vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
- vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
- vabal.u8 q8, d2, d0
- vabal.u8 q9, d3, d1
- vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
- vabal.u8 q10, d4, d0
- vabal.u8 q11, d5, d1
+ vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
+ vabal.u8 q12, d14, d8
+ vabal.u8 q13, d15, d9
+ vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
+ vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
+ vabal.u8 q8, d2, d0
+ vabal.u8 q9, d3, d1
+ vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
+ vabal.u8 q10, d4, d0
+ vabal.u8 q11, d5, d1
@Row 2
- vld1.8 {d8, d9}, [r3], r5 @ load src Row 1
- vabal.u8 q12, d6, d0
- vabal.u8 q13, d7, d1
- vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1
- subs r6, #2
- vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1
- vabal.u8 q8, d10, d8
- vabal.u8 q9, d11, d9
- vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1
- vabal.u8 q10, d12, d8
- vabal.u8 q11, d13, d9
-
- bne loop_sad3_prog
-
- vabal.u8 q12, d14, d8
- vabal.u8 q13, d15, d9
+ vld1.8 {d8, d9}, [r3], r5 @ load src Row 1
+ vabal.u8 q12, d6, d0
+ vabal.u8 q13, d7, d1
+ vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1
+ subs r6, #2
+ vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1
+ vabal.u8 q8, d10, d8
+ vabal.u8 q9, d11, d9
+ vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1
+ vabal.u8 q10, d12, d8
+ vabal.u8 q11, d13, d9
+
+ bne loop_sad3_prog
+
+ vabal.u8 q12, d14, d8
+ vabal.u8 q13, d15, d9
@ Compute SAD
- vadd.u16 q8, q8, q9 @ Q8 : sad_ref1
- vadd.u16 q10, q10, q11 @ Q10 : sad_ref2
- vadd.u16 q12, q12, q13 @ Q12 : sad_ref3
+ vadd.u16 q8, q8, q9 @ Q8 : sad_ref1
+ vadd.u16 q10, q10, q11 @ Q10 : sad_ref2
+ vadd.u16 q12, q12, q13 @ Q12 : sad_ref3
- vadd.u16 d16, d16, d17
- vadd.u16 d20, d20, d21
- vadd.u16 d24, d24, d25
+ vadd.u16 d16, d16, d17
+ vadd.u16 d20, d20, d21
+ vadd.u16 d24, d24, d25
- vpadd.u16 d16, d16, d20
- vpadd.u16 d24, d24, d24
+ vpadd.u16 d16, d16, d20
+ vpadd.u16 d24, d24, d24
- ldr r6, [sp, #24] @ loading pi4_sad to r6
- vpaddl.u16 d16, d16
- vpaddl.u16 d24, d24
+ ldr r6, [sp, #24] @ loading pi4_sad to r6
+ vpaddl.u16 d16, d16
+ vpaddl.u16 d24, d24
- vst1.64 {d16}, [r6]!
- vst1.32 {d24[0]}, [r6]
+ vst1.64 {d16}, [r6]!
+ vst1.32 {d24[0]}, [r6]
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r6, pc}
- ldmfd sp!, {r4-r6, pc}
-
-@/**
+@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) for sub-pel motion estimation
@@ -551,7 +558,7 @@ loop_sad3_prog:
@* @remarks
@*
@******************************************************************************
-@*/
+@*
.text
.p2align 2
@@ -560,115 +567,116 @@ loop_sad3_prog:
ime_sub_pel_compute_sad_16x16_a9q:
- stmfd sp!, {r4-r11, lr} @store register values to stack
+ stmfd sp!, {r4-r11, lr} @store register values to stack
- ldr r9, [sp, #36]
- ldr r10, [sp, #40]
+ ldr r9, [sp, #36]
+ ldr r10, [sp, #40]
+ vpush {d8-d15}
+ sub r4, r1, #1 @ x left
+ sub r5, r2, r10 @ y top
- sub r4, r1, #1 @ x left
- sub r5, r2, r10 @ y top
+ sub r6, r3, #1 @ xy left
+ sub r7, r3, r10 @ xy top
- sub r6, r3, #1 @ xy left
- sub r7, r3, r10 @ xy top
-
- sub r8, r7, #1 @ xy top-left
- mov r11, #15
+ sub r8, r7, #1 @ xy top-left
+ mov r11, #15
@for bringing buffer2 into cache..., dummy load instructions
@ LDR r12,[r1]
@ LDR r12,[sp,#12]
- vld1.8 {d0, d1}, [r0], r9 @ src
- vld1.8 {d2, d3}, [r5], r10 @ y top LOAD
- vld1.8 {d4, d5}, [r7], r10 @ xy top LOAD
- vld1.8 {d6, d7}, [r8], r10 @ xy top-left LOAD
-
- vabdl.u8 q6, d2, d0 @ y top ABS1
- vabdl.u8 q7, d4, d0 @ xy top ABS1
- vld1.8 {d8, d9}, [r1], r10 @ x LOAD
- vabdl.u8 q8, d6, d0 @ xy top-left ABS1
- vabdl.u8 q9, d8, d0 @ x ABS1
- vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
-
- vabal.u8 q6, d3, d1 @ y top ABS2
- vabal.u8 q7, d5, d1 @ xy top ABS2
- vld1.8 {d2, d3}, [r2], r10 @ y LOAD
- vabal.u8 q8, d7, d1 @ xy top-left ABS2
- vabal.u8 q9, d9, d1 @ x ABS2
- vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
-
- vabdl.u8 q10, d10, d0 @ x left ABS1
- vabdl.u8 q11, d2, d0 @ y ABS1
- vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
- vabdl.u8 q12, d4, d0 @ xy ABS1
- vabdl.u8 q13, d6, d0 @ xy left ABS1
+ vld1.8 {d0, d1}, [r0], r9 @ src
+ vld1.8 {d2, d3}, [r5], r10 @ y top LOAD
+ vld1.8 {d4, d5}, [r7], r10 @ xy top LOAD
+ vld1.8 {d6, d7}, [r8], r10 @ xy top-left LOAD
+
+ vabdl.u8 q6, d2, d0 @ y top ABS1
+ vabdl.u8 q7, d4, d0 @ xy top ABS1
+ vld1.8 {d8, d9}, [r1], r10 @ x LOAD
+ vabdl.u8 q8, d6, d0 @ xy top-left ABS1
+ vabdl.u8 q9, d8, d0 @ x ABS1
+ vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
+
+ vabal.u8 q6, d3, d1 @ y top ABS2
+ vabal.u8 q7, d5, d1 @ xy top ABS2
+ vld1.8 {d2, d3}, [r2], r10 @ y LOAD
+ vabal.u8 q8, d7, d1 @ xy top-left ABS2
+ vabal.u8 q9, d9, d1 @ x ABS2
+ vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
+
+ vabdl.u8 q10, d10, d0 @ x left ABS1
+ vabdl.u8 q11, d2, d0 @ y ABS1
+ vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
+ vabdl.u8 q12, d4, d0 @ xy ABS1
+ vabdl.u8 q13, d6, d0 @ xy left ABS1
loop_sub_pel_16x16:
- vabal.u8 q10, d11, d1 @ x left ABS2
- vabal.u8 q11, d3, d1 @ y ABS2
- subs r11, #1
- vabal.u8 q12, d5, d1 @ xy ABS2
- vabal.u8 q13, d7, d1 @ xy left ABS2
-
- vld1.8 {d0, d1}, [r0], r9 @ src
- vabal.u8 q6, d2, d0 @ y top ABS1
- vabal.u8 q7, d4, d0 @ xy top ABS1
- vld1.8 {d8, d9}, [r1], r10 @ x LOAD
- vabal.u8 q8, d6, d0 @ xy top-left ABS1
- vabal.u8 q9, d8, d0 @ x ABS1
- vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
-
- vabal.u8 q6, d3, d1 @ y top ABS2
- vabal.u8 q7, d5, d1 @ xy top ABS2
- vld1.8 {d2, d3}, [r2], r10 @ y LOAD
- vabal.u8 q8, d7, d1 @ xy top-left ABS2
- vabal.u8 q9, d9, d1 @ x ABS2
- vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
-
- vabal.u8 q10, d10, d0 @ x left ABS1
- vabal.u8 q11, d2, d0 @ y ABS1
- vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
- vabal.u8 q12, d4, d0 @ xy ABS1
- vabal.u8 q13, d6, d0 @ xy left ABS1
-
- bne loop_sub_pel_16x16
-
- vabal.u8 q10, d11, d1 @ x left ABS2
- vabal.u8 q11, d3, d1 @ y ABS2
- vabal.u8 q12, d5, d1 @ xy ABS2
- vabal.u8 q13, d7, d1 @ xy left ABS2
-
- vadd.i16 d0, d18, d19 @ x
- vadd.i16 d3, d12, d13 @ y top
- vadd.i16 d6, d14, d15 @ xy top
- vadd.i16 d5, d26, d27 @ xy left
- vadd.i16 d1, d20, d21 @ x left
- vadd.i16 d2, d22, d23 @ y
- vadd.i16 d4, d24, d25 @ xy
- vadd.i16 d7, d16, d17 @ xy top left
-
- vpadd.i16 d0, d0, d1
- vpadd.i16 d2, d2, d3
- vpadd.i16 d4, d4, d5
- vpadd.i16 d6, d6, d7
-
- vpaddl.u16 d0, d0
- vpaddl.u16 d2, d2
- ldr r11, [sp, #44]
- vpaddl.u16 d4, d4
- vpaddl.u16 d6, d6
-
- vst1.32 {d0}, [r11]!
- vst1.32 {d2}, [r11]!
- vst1.32 {d4}, [r11]!
- vst1.32 {d6}, [r11]!
-
- ldmfd sp!, {r4-r11, pc} @Restoring registers from stack
-
-
-
-@/**
+ vabal.u8 q10, d11, d1 @ x left ABS2
+ vabal.u8 q11, d3, d1 @ y ABS2
+ subs r11, #1
+ vabal.u8 q12, d5, d1 @ xy ABS2
+ vabal.u8 q13, d7, d1 @ xy left ABS2
+
+ vld1.8 {d0, d1}, [r0], r9 @ src
+ vabal.u8 q6, d2, d0 @ y top ABS1
+ vabal.u8 q7, d4, d0 @ xy top ABS1
+ vld1.8 {d8, d9}, [r1], r10 @ x LOAD
+ vabal.u8 q8, d6, d0 @ xy top-left ABS1
+ vabal.u8 q9, d8, d0 @ x ABS1
+ vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
+
+ vabal.u8 q6, d3, d1 @ y top ABS2
+ vabal.u8 q7, d5, d1 @ xy top ABS2
+ vld1.8 {d2, d3}, [r2], r10 @ y LOAD
+ vabal.u8 q8, d7, d1 @ xy top-left ABS2
+ vabal.u8 q9, d9, d1 @ x ABS2
+ vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
+
+ vabal.u8 q10, d10, d0 @ x left ABS1
+ vabal.u8 q11, d2, d0 @ y ABS1
+ vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
+ vabal.u8 q12, d4, d0 @ xy ABS1
+ vabal.u8 q13, d6, d0 @ xy left ABS1
+
+ bne loop_sub_pel_16x16
+
+ vabal.u8 q10, d11, d1 @ x left ABS2
+ vabal.u8 q11, d3, d1 @ y ABS2
+ vabal.u8 q12, d5, d1 @ xy ABS2
+ vabal.u8 q13, d7, d1 @ xy left ABS2
+
+ vadd.i16 d0, d18, d19 @ x
+ vadd.i16 d3, d12, d13 @ y top
+ vadd.i16 d6, d14, d15 @ xy top
+ vadd.i16 d5, d26, d27 @ xy left
+ vadd.i16 d1, d20, d21 @ x left
+ vadd.i16 d2, d22, d23 @ y
+ vadd.i16 d4, d24, d25 @ xy
+ vadd.i16 d7, d16, d17 @ xy top left
+
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d2, d2, d3
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d6, d6, d7
+
+ vpaddl.u16 d0, d0
+ vpaddl.u16 d2, d2
+ vpop {d8-d15}
+ ldr r11, [sp, #44]
+ vpaddl.u16 d4, d4
+ vpaddl.u16 d6, d6
+
+ vst1.32 {d0}, [r11]!
+ vst1.32 {d2}, [r11]!
+ vst1.32 {d4}, [r11]!
+ vst1.32 {d6}, [r11]!
+
+ ldmfd sp!, {r4-r11, pc} @Restoring registers from stack
+
+
+
+@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks
@@ -699,7 +707,7 @@ loop_sub_pel_16x16:
@* @remarks
@*
@******************************************************************************
-@*/
+@*
.text
.p2align 2
@@ -710,51 +718,52 @@ ime_compute_sad_16x16_a9q:
@STMFD sp!,{r12,lr}
- stmfd sp!, {r12, r14} @store register values to stack
+ stmfd sp!, {r12, r14} @store register values to stack
@for bringing buffer2 into cache..., dummy load instructions
@ LDR r12,[r1]
@ LDR r12,[sp,#12]
- vld1.8 {d4, d5}, [r0], r2
- vld1.8 {d6, d7}, [r1], r3
-
- mov r12, #14
- vld1.8 {d8, d9}, [r0], r2
- vabdl.u8 q0, d4, d6
- vld1.8 {d10, d11}, [r1], r3
- vabdl.u8 q1, d5, d7
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ vpush {d8-d15}
+ mov r12, #14
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d4, d6
+ vld1.8 {d10, d11}, [r1], r3
+ vabdl.u8 q1, d5, d7
loop_sad_16x16:
- vld1.8 {d4, d5}, [r0], r2
- vabal.u8 q0, d8, d10
- vld1.8 {d6, d7}, [r1], r3
- vabal.u8 q1, d9, d11
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d8, d10
+ vld1.8 {d6, d7}, [r1], r3
+ vabal.u8 q1, d9, d11
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d4, d6
- subs r12, #2
- vld1.8 {d10, d11}, [r1], r3
- vabal.u8 q1, d5, d7
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d4, d6
+ subs r12, #2
+ vld1.8 {d10, d11}, [r1], r3
+ vabal.u8 q1, d5, d7
- bne loop_sad_16x16
+ bne loop_sad_16x16
- vabal.u8 q0, d8, d10
- vabal.u8 q1, d9, d11
+ vabal.u8 q0, d8, d10
+ vabal.u8 q1, d9, d11
- vadd.i16 q0, q0, q1
- vadd.i16 d0, d1, d0
- ldr r12, [sp, #12]
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
+ vpop {d8-d15}
+ ldr r12, [sp, #12]
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
- vst1.32 {d0[0]}, [r12]
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vst1.32 {d0[0]}, [r12]
- ldmfd sp!, {r12, pc} @Restoring registers from stack
+ ldmfd sp!, {r12, pc} @Restoring registers from stack
-@/*
+@*
@//---------------------------------------------------------------------------
@// Function Name : Calculate_Mad4_prog()
@//
@@ -764,7 +773,7 @@ loop_sad_16x16:
@// Platform : CortexA8/NEON .
@//
@//-----------------------------------------------------------------------------
-@*/
+@*
.global ime_calculate_sad4_prog_a9q
@@ -775,20 +784,20 @@ ime_calculate_sad4_prog_a9q:
@ r3 = CurBufferWidth <UWORD32>
@ stack = psad <UWORD32 *> {at 0x34}
- stmfd sp!, {r4-r7, lr}
+ stmfd sp!, {r4-r7, lr}
@UWORD8 *left_ptr = temp_frame - 1;
@UWORD8 *right_ptr = temp_frame + 1;
@UWORD8 *top_ptr = temp_frame - RefBufferWidth;
@UWORD8 *bot_ptr = temp_frame + RefBufferWidth;
- mov r7, #14
- sub r4, r0, #0x01 @r4 = left_ptr
- add r5, r0, #0x1 @r5 = right_ptr
- sub r6, r0, r2 @r6 = top_ptr
- add r0, r0, r2 @r0 = bot_ptr
+ mov r7, #14
+ sub r4, r0, #0x01 @r4 = left_ptr
+ add r5, r0, #0x1 @r5 = right_ptr
+ sub r6, r0, r2 @r6 = top_ptr
+ add r0, r0, r2 @r0 = bot_ptr
@r1 = buffer_ptr
-
+ vpush {d8-d15}
@D0:D1 : buffer
@D2:D3 : top
@D4:D5 : left
@@ -796,94 +805,93 @@ ime_calculate_sad4_prog_a9q:
@D8:D9 : bottom
@Row 1
- vld1.8 {d0, d1}, [r1], r3 @ load src Row 1
- vld1.8 {d2, d3}, [r6], r2 @ load top Row 1
- vld1.8 {d4, d5}, [r4], r2 @ load left Row 1
+ vld1.8 {d0, d1}, [r1], r3 @ load src Row 1
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 1
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 1
- vabdl.u8 q5, d2, d0
- vld1.8 {d6, d7}, [r5], r2 @ load right Row 1
- vabdl.u8 q6, d3, d1
+ vabdl.u8 q5, d2, d0
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 1
+ vabdl.u8 q6, d3, d1
- vabdl.u8 q7, d0, d4
- vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1
- vabdl.u8 q8, d1, d5
+ vabdl.u8 q7, d0, d4
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1
+ vabdl.u8 q8, d1, d5
@Row 2
- vabdl.u8 q9, d0, d6
- vld1.8 {d26, d27}, [r1], r3 @ load src Row 2
- vabdl.u8 q10, d1, d7
+ vabdl.u8 q9, d0, d6
+ vld1.8 {d26, d27}, [r1], r3 @ load src Row 2
+ vabdl.u8 q10, d1, d7
- vabdl.u8 q11, d0, d8
- vld1.8 {d2, d3}, [r6], r2 @ load top Row 2
- vabdl.u8 q12, d1, d9
+ vabdl.u8 q11, d0, d8
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 2
+ vabdl.u8 q12, d1, d9
loop_sad4_prog:
- vabal.u8 q5, d26, d2
- vld1.8 {d4, d5}, [r4], r2 @ load left Row 2
- vabal.u8 q6, d27, d3
+ vabal.u8 q5, d26, d2
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 2
+ vabal.u8 q6, d27, d3
- vabal.u8 q7, d26, d4
- vld1.8 {d6, d7}, [r5], r2 @ load right Row 2
- vabal.u8 q8, d27, d5
+ vabal.u8 q7, d26, d4
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 2
+ vabal.u8 q8, d27, d5
- vabal.u8 q9, d26, d6
- vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2
- vabal.u8 q10, d27, d7
+ vabal.u8 q9, d26, d6
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2
+ vabal.u8 q10, d27, d7
@Row 1
- vabal.u8 q11, d26, d8
- vld1.8 {d0, d1}, [r1], r3 @ load src Row 1
- vabal.u8 q12, d27, d9
-
- vld1.8 {d2, d3}, [r6], r2 @ load top Row 1
- subs r7, #2
- vld1.8 {d4, d5}, [r4], r2 @ load left Row 1
+ vabal.u8 q11, d26, d8
+ vld1.8 {d0, d1}, [r1], r3 @ load src Row 1
+ vabal.u8 q12, d27, d9
- vabal.u8 q5, d0, d2
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 1
+ subs r7, #2
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 1
- vld1.8 {d6, d7}, [r5], r2 @ load right Row 1
- vabal.u8 q6, d1, d3
+ vabal.u8 q5, d0, d2
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 1
+ vabal.u8 q6, d1, d3
- vabal.u8 q7, d0, d4
- vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1
- vabal.u8 q8, d1, d5
+ vabal.u8 q7, d0, d4
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1
+ vabal.u8 q8, d1, d5
@Row 2
- vabal.u8 q9, d0, d6
- vld1.8 {d26, d27}, [r1], r3 @ load src Row 2
- vabal.u8 q10, d1, d7
+ vabal.u8 q9, d0, d6
+ vld1.8 {d26, d27}, [r1], r3 @ load src Row 2
+ vabal.u8 q10, d1, d7
- vabal.u8 q11, d0, d8
- vld1.8 {d2, d3}, [r6], r2 @ load top Row 2
- vabal.u8 q12, d1, d9
+ vabal.u8 q11, d0, d8
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 2
+ vabal.u8 q12, d1, d9
- bne loop_sad4_prog
+ bne loop_sad4_prog
- vabal.u8 q5, d26, d2
- vld1.8 {d4, d5}, [r4], r2 @ load left Row 2
- vabal.u8 q6, d27, d3
+ vabal.u8 q5, d26, d2
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 2
+ vabal.u8 q6, d27, d3
- vabal.u8 q7, d26, d4
- vld1.8 {d6, d7}, [r5], r2 @ load right Row 2
- vabal.u8 q8, d27, d5
+ vabal.u8 q7, d26, d4
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 2
+ vabal.u8 q8, d27, d5
- vabal.u8 q9, d26, d6
- vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2
- vabal.u8 q10, d27, d7
+ vabal.u8 q9, d26, d6
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2
+ vabal.u8 q10, d27, d7
- vabal.u8 q11, d26, d8
- vabal.u8 q12, d27, d9
+ vabal.u8 q11, d26, d8
+ vabal.u8 q12, d27, d9
@;Q5:Q6 : sad_top
@;Q7:Q8 : sad_left
@;Q9:Q10 : sad_right
@;Q11:Q12 : sad_bot
- vadd.u16 q5, q5, q6
- vadd.u16 q7, q7, q8
- vadd.u16 q9, q9, q10
- vadd.u16 q11, q11, q12
+ vadd.u16 q5, q5, q6
+ vadd.u16 q7, q7, q8
+ vadd.u16 q9, q9, q10
+ vadd.u16 q11, q11, q12
@; Free :-
@; Q6,Q8,Q10,Q12
@@ -893,10 +901,10 @@ loop_sad4_prog:
@;Q9 -> D18:D19
@;Q11 -> D22:D23
- vadd.u16 d10, d10, d11
- vadd.u16 d14, d14, d15
- vadd.u16 d18, d18, d19
- vadd.u16 d22, d22, d23
+ vadd.u16 d10, d10, d11
+ vadd.u16 d14, d14, d15
+ vadd.u16 d18, d18, d19
+ vadd.u16 d22, d22, d23
@;D10 : sad_top
@;D14 : sad_left
@@ -904,35 +912,35 @@ loop_sad4_prog:
@;D22 : sad_bot
- vpaddl.u16 d11, d10
- vpaddl.u16 d15, d14
- vpaddl.u16 d19, d18
- vpaddl.u16 d23, d22
+ vpaddl.u16 d11, d10
+ vpaddl.u16 d15, d14
+ vpaddl.u16 d19, d18
+ vpaddl.u16 d23, d22
@;D11 : sad_top
@;D15 : sad_left
@;D19 : sad_right
@;D23 : sad_bot
- vpaddl.u32 d10, d11
- vpaddl.u32 d22, d23
- vpaddl.u32 d14, d15
- vpaddl.u32 d18, d19
+ vpaddl.u32 d10, d11
+ vpaddl.u32 d22, d23
+ vpaddl.u32 d14, d15
+ vpaddl.u32 d18, d19
@;D10 : sad_top
@;D14 : sad_left
@;D18 : sad_right
@;D22 : sad_bot
- ldr r4, [sp, #20] @;Can be rearranged
-
- vsli.64 d10, d22, #32
- vsli.64 d14, d18, #32
+ ldr r4, [sp, #84] @;Can be rearranged
- vst1.64 {d14}, [r4]!
- vst1.64 {d10}, [r4]!
+ vsli.64 d10, d22, #32
+ vsli.64 d14, d18, #32
- ldmfd sp!, {r4-r7, pc}
+ vst1.64 {d14}, [r4]!
+ vst1.64 {d10}, [r4]!
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r7, pc}
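
ime_calculate_sad4_prog evaluates one step of a small-diamond refinement: the source block is compared against the references one pixel to the left and right and one row above and below the current position. A hedged scalar equivalent, reusing the earlier typedefs; the output order (left, right, top, bottom) is inferred from the vsli/vst1 sequence above:

    static void ref_calculate_sad4_prog(const UWORD8 *pu1_ref, const UWORD8 *pu1_src,
                                        WORD32 ref_strd, WORD32 src_strd,
                                        WORD32 pi4_sad[4])
    {
        const UWORD8 *nbr[4] = {
            pu1_ref - 1,            /* left   */
            pu1_ref + 1,            /* right  */
            pu1_ref - ref_strd,     /* top    */
            pu1_ref + ref_strd      /* bottom */
        };
        for (WORD32 n = 0; n < 4; n++)
        {
            WORD32 sad = 0;
            for (WORD32 row = 0; row < 16; row++)
                for (WORD32 col = 0; col < 16; col++)
                {
                    WORD32 diff = pu1_src[row * src_strd + col]
                                - nbr[n][row * ref_strd + col];
                    sad += (diff < 0) ? -diff : diff;
                }
            pi4_sad[n] = sad;
        }
    }
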
@@ -974,37 +982,37 @@ ime_compute_satqd_16x16_lumainter_a9q:
@R5 :Distortion,ie SAD
@R6 :is nonzero
- push {r4-r12, lr} @push all the variables first
+ push {r4-r12, lr} @push all the variables first
@ADD SP,SP,#40 ;decrement stack pointer,to accomodate two variables
- ldr r4, [sp, #40] @load the threshold address
-
- mov r8, #8 @Number of 4x8 blocks to be processed
- mov r10, #0 @Sad
- mov r7, #0 @Nonzero info
+ ldr r4, [sp, #40] @load the threshold address
+ vpush {d8-d15}
+ mov r8, #8 @Number of 4x8 blocks to be processed
+ mov r10, #0 @Sad
+ mov r7, #0 @Nonzero info
@----------------------------------------------------
- vld1.u8 d30, [r0], r2 @I load 8 pix src row 1
+ vld1.u8 d30, [r0], r2 @I load 8 pix src row 1
- vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1
+ vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1
- vld1.u8 d28, [r0], r2 @I load 8 pix src row 2
+ vld1.u8 d28, [r0], r2 @I load 8 pix src row 2
- vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2
+ vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2
- vld1.u8 d26, [r0], r2 @I load 8 pix src row 3
- vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12
+ vld1.u8 d26, [r0], r2 @I load 8 pix src row 3
+ vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12
- vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3
+ vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3
- vld1.u8 d24, [r0], r2 @I load 8 pix src row 4
+ vld1.u8 d24, [r0], r2 @I load 8 pix src row 4
- vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4
- vabdl.u8 q1, d28, d29 @I Abs diff r1 blk 12
+ vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4
+ vabdl.u8 q1, d28, d29 @I Abs diff r1 blk 12
- vld1.u16 {q11}, [r4] @I load the threhold
- vabdl.u8 q2, d26, d27 @I Abs diff r1 blk 12
+    vld1.u16 {q11}, [r4] @I load the threshold
+ vabdl.u8 q2, d26, d27 @I Abs diff r1 blk 12
- vabdl.u8 q3, d24, d25 @I Abs diff r1 blk 12
+ vabdl.u8 q3, d24, d25 @I Abs diff r1 blk 12
@@ -1013,128 +1021,128 @@ core_loop:
@S5 S6 S7 S8 A5 A6 A7 A8
@S9 S10 S11 S12 A9 A10 A11 A12
@S13 S14 S15 S16 A13 A14 A15 A16
- ands r11, r8, #1 @II See if we are at even or odd block
- vadd.u16 q4 , q0, q3 @I Add r1 r4
- lsl r11, r2, #2 @II Move back src 4 rows
+ ands r11, r8, #1 @II See if we are at even or odd block
+ vadd.u16 q4 , q0, q3 @I Add r1 r4
+ lsl r11, r2, #2 @II Move back src 4 rows
- subeq r0, r0, r11 @II Move back src 4 rows if we are at even block
- vadd.u16 q5 , q1, q2 @I Add r2 r3
- addeq r0, r0, #8 @II Move src 8 cols forward if we are at even block
+ subeq r0, r0, r11 @II Move back src 4 rows if we are at even block
+ vadd.u16 q5 , q1, q2 @I Add r2 r3
+ addeq r0, r0, #8 @II Move src 8 cols forward if we are at even block
- lsl r11, r3, #2 @II Move back pred 4 rows
- vtrn.16 d8 , d10 @I trnspse 1
- subeq r1, r1, r11 @II Move back pred 4 rows if we are at even block
+ lsl r11, r3, #2 @II Move back pred 4 rows
+ vtrn.16 d8 , d10 @I trnspse 1
+ subeq r1, r1, r11 @II Move back pred 4 rows if we are at even block
- addeq r1, r1, #8 @II Move pred 8 cols forward if we are at even block
- vtrn.16 d9 , d11 @I trnspse 2
- subne r0, r0, #8 @II Src 8clos back for odd rows
+ addeq r1, r1, #8 @II Move pred 8 cols forward if we are at even block
+ vtrn.16 d9 , d11 @I trnspse 2
+    subne r0, r0, #8 @II Src 8 cols back for odd rows
- subne r1, r1, #8 @II Pred 8 cols back for odd rows
- vtrn.32 d10, d11 @I trnspse 4
+ subne r1, r1, #8 @II Pred 8 cols back for odd rows
+ vtrn.32 d10, d11 @I trnspse 4
- vtrn.32 d8 , d9 @I trnspse 3
- vswp d10, d11 @I rearrange so that the q4 and q5 add properly
+ vtrn.32 d8 , d9 @I trnspse 3
+ vswp d10, d11 @I rearrange so that the q4 and q5 add properly
@D8 S1 S4 A1 A4
@D9 S2 S3 A2 A3
@D11 S1 S4 A1 A4
@D10 S2 S3 A2 A3
- vadd.s16 q6, q4, q5 @I Get s1 s4
- vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1
+ vadd.s16 q6, q4, q5 @I Get s1 s4
+ vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1
- vtrn.s16 d12, d13 @I Get s2 s3
+ vtrn.s16 d12, d13 @I Get s2 s3
@D12 S1 S4 A1 A4
@D13 S2 S3 A2 A3
- vshl.s16 q7, q6 , #1 @I si = si<<1
- vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1
+ vshl.s16 q7, q6 , #1 @I si = si<<1
+ vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1
- vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3)
- vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2
+ vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3)
+ vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2
@ D16 S14 A14 S23 A23
- vrev32.16 d0, d16 @I
- vuzp.s16 d16, d0 @I
+ vrev32.16 d0, d16 @I
+ vuzp.s16 d16, d0 @I
@D16 S14 S23 A14 A23
- vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4)
- vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2
+ vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4)
+ vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2
@D17 S12 S34 A12 A34
- vrev32.16 q9, q7 @I Rearrange si's
+ vrev32.16 q9, q7 @I Rearrange si's
@Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
@D12 S1 S4 A1 A4
@D19 Z3 Z2 Y3 Y2
- vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1))
- vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3
+ vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1))
+ vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3
@D13 S2 S3 A2 A3
@D18 Z4 Z1 Y4 Y1
- vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1))
- vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3
+ vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1))
+ vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3
@Q10 S8 S5 A8 A5 S7 S4 A7 A4
@D16 S14 S23 A14 A23
- vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
- vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4
+ vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
+ vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4
@D22 SAD1 SAD2 junk junk
@Q8 S2 S1 A2 A1 S6 S3 A6 A3
@Q10 S8 S5 A8 A5 S7 S4 A7 A4
- vtrn.32 q8, q4 @I Rearrange to make ls of each block togather
+    vtrn.32 q8, q4 @I Rearrange to make ls of each block together
@Q8 S2 S1 S8 S5 S6 S3 S7 S4
@Q10 A2 A1 A8 A5 A6 A3 A7 A4
- ldrh r11, [r4, #16] @I Load the threshold for DC val blk 1
- vdup.s16 q6, d10[0] @I Get the sad blk 1
- vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12
+ ldrh r11, [r4, #16] @I Load the threshold for DC val blk 1
+ vdup.s16 q6, d10[0] @I Get the sad blk 1
+ vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12
- vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1
- vmov.s16 r9, d10[0] @I Get the sad for block 1
+ vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1
+ vmov.s16 r9, d10[0] @I Get the sad for block 1
- vsub.s16 q9, q7, q8 @I Add to the lss
- vmov.s16 r5, d10[1] @I Get the sad for block 2
+ vsub.s16 q9, q7, q8 @I Add to the lss
+ vmov.s16 r5, d10[1] @I Get the sad for block 2
- vcle.s16 q7, q11, q9 @I Add to the lss
- vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4
+ vcle.s16 q7, q11, q9 @I Add to the lss
+ vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4
- vdup.s16 q15, d10[1] @I Get the sad blk 1
- vabdl.u8 q1, d28, d29 @II Abs diff r1 blk 12
+    vdup.s16 q15, d10[1] @I Get the sad blk 2
+ vabdl.u8 q1, d28, d29 @II Abs diff r1 blk 12
- vshl.s16 q14, q15, #1 @I sad_2 = sad_1<<1
- vsub.s16 q3, q14, q4 @I Add to the lss
- vcle.s16 q15, q11, q3 @I Add to the lss
+ vshl.s16 q14, q15, #1 @I sad_2 = sad_1<<1
+ vsub.s16 q3, q14, q4 @I Add to the lss
+ vcle.s16 q15, q11, q3 @I Add to the lss
- ADD R10, R10, R9 @I Add to the global sad blk 1
- vtrn.u8 q15, q7 @I get all comparison bits to one reg
- vabdl.u8 q2, d26, d27 @II Abs diff r1 blk 12
+ ADD R10, R10, R9 @I Add to the global sad blk 1
+ vtrn.u8 q15, q7 @I get all comparison bits to one reg
+ vabdl.u8 q2, d26, d27 @II Abs diff r1 blk 12
- ADD R10, R10, R5 @I Add to the global sad blk 2
- vshr.u8 q14, q15, #7 @I Shift the bits so that no overflow occurs
- cmp r11, r9
+ ADD R10, R10, R5 @I Add to the global sad blk 2
+ vshr.u8 q14, q15, #7 @I Shift the bits so that no overflow occurs
+ cmp r11, r9
- movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 1 ;I Compare with threshold blk 1
- vadd.u8 d28, d28, d29 @I Add the bits
- cmp r11, r5 @I Compare with threshold blk 2
+    movle r7, #0xf @I If not met, mark it by moving a non-zero val to R7 blk 1 ;I Compare with threshold blk 1
+ vadd.u8 d28, d28, d29 @I Add the bits
+ cmp r11, r5 @I Compare with threshold blk 2
- movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 2
- vpadd.u8 d28, d28, d29 @I Add the bits
+    movle r7, #0xf @I If not met, mark it by moving a non-zero val to R7 blk 2
+ vpadd.u8 d28, d28, d29 @I Add the bits
- vmov.u32 r11, d28[0] @I Since a set bit now represents a unstatisofrd contifon store it in r11
- vabdl.u8 q3, d24, d25 @II Abs diff r1 blk 12
+    vmov.u32 r11, d28[0] @I Since a set bit now represents an unsatisfied condition, store it in r11
+ vabdl.u8 q3, d24, d25 @II Abs diff r1 blk 12
- orr r7, r7, r11 @I get the guy to r11
+    orr r7, r7, r11 @I merge the comparison bits into r7
- sub r8, r8, #1 @I Decremrnt block count
+    sub r8, r8, #1 @I Decrement block count
- cmp r7, #0 @I If we have atlest one non zero block
- bne compute_sad_only @I if a non zero block is der,From now on compute sad only
+    cmp r7, #0 @I If we have at least one non-zero block
+    bne compute_sad_only @I if a non-zero block is there, from now on compute SAD only
- cmp r8, #1 @I See if we are at the last block
- bne core_loop @I If the blocks are zero, lets continue the satdq
+ cmp r8, #1 @I See if we are at the last block
+    bne core_loop @I If the blocks are zero, let's continue the satdq
@EPILOGUE for core loop
@@ -1142,94 +1150,94 @@ core_loop:
@S5 S6 S7 S8 A5 A6 A7 A8
@S9 S10 S11 S12 A9 A10 A11 A12
@S13 S14 S15 S16 A13 A14 A15 A16
- vadd.u16 q4 , q0, q3 @Add r1 r4
- vadd.u16 q5 , q1, q2 @Add r2 r3
+ vadd.u16 q4 , q0, q3 @Add r1 r4
+ vadd.u16 q5 , q1, q2 @Add r2 r3
@D8 S1 S2 S2 S1
@D10 S4 S3 S3 S4
@D9 A1 A2 A2 A1
@D11 A4 A3 A3 A4
- vtrn.16 d8 , d10 @I trnspse 1
- vtrn.16 d9 , d11 @I trnspse 2
- vtrn.32 d8 , d9 @I trnspse 3
- vtrn.32 d10, d11 @I trnspse 4
+ vtrn.16 d8 , d10 @I trnspse 1
+ vtrn.16 d9 , d11 @I trnspse 2
+ vtrn.32 d8 , d9 @I trnspse 3
+ vtrn.32 d10, d11 @I trnspse 4
- vswp d10, d11 @I rearrange so that the q4 and q5 add properly
+ vswp d10, d11 @I rearrange so that the q4 and q5 add properly
@D8 S1 S4 A1 A4
@D9 S2 S3 A2 A3
@D11 S1 S4 A1 A4
@D10 S2 S3 A2 A3
- vadd.s16 q6, q4, q5 @Get s1 s4
- vtrn.s16 d12, d13 @Get s2 s3
+ vadd.s16 q6, q4, q5 @Get s1 s4
+ vtrn.s16 d12, d13 @Get s2 s3
@D12 S1 S4 A1 A4
@D13 S2 S3 A2 A3
- vshl.s16 q7, q6 , #1 @si = si<<1
- vmov.s16 r9, d10[0] @Get the sad for block 1
+ vshl.s16 q7, q6 , #1 @si = si<<1
+ vmov.s16 r9, d10[0] @Get the sad for block 1
- vpadd.s16 d16, d12, d13 @(s1 + s4) (s2 + s3)
- vmov.s16 r5, d10[1] @Get the sad for block 2
+ vpadd.s16 d16, d12, d13 @(s1 + s4) (s2 + s3)
+ vmov.s16 r5, d10[1] @Get the sad for block 2
@D16 S14 A14 S23 A23
- vrev32.16 d30, d16 @
- vuzp.s16 d16, d30 @
+ vrev32.16 d30, d16 @
+ vuzp.s16 d16, d30 @
@D16 S14 S23 A14 A23
- vadd.s16 d17, d12, d13 @(s1 + s2) (s3 + s4)
+ vadd.s16 d17, d12, d13 @(s1 + s2) (s3 + s4)
@D17 S12 S34 A12 A34
- vrev32.16 q9, q7 @Rearrange si's
+ vrev32.16 q9, q7 @Rearrange si's
@Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
@D12 S1 S4 A1 A4
@D19 Z3 Z2 Y3 Y2
- vsub.s16 d8, d12, d19 @(s1 - (s3<<1)) (s4 - (s2<<1))
+ vsub.s16 d8, d12, d19 @(s1 - (s3<<1)) (s4 - (s2<<1))
@D13 S2 S3 A2 A3
@D18 Z4 Z1 Y4 Y1
- vsub.s16 d9, d13, d18 @(s2 - (s4<<1)) (s3 - (s1<<1))
+ vsub.s16 d9, d13, d18 @(s2 - (s4<<1)) (s3 - (s1<<1))
@Q10 S8 S5 A8 A5 S7 S4 A7 A4
@D16 S14 S23 A14 A23
- vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
+ vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
@D22 SAD1 SAD2 junk junk
- vmov.u16 r9, d10[0] @Get the sad for block 1
- vmov.u16 r5, d10[1] @Get the sad for block 2
+ vmov.u16 r9, d10[0] @Get the sad for block 1
+ vmov.u16 r5, d10[1] @Get the sad for block 2
@Q8 S2 S1 A2 A1 S6 S3 A6 A3
@Q10 S8 S5 A8 A5 S7 S4 A7 A4
- ldrh r11, [r4, #16] @Load the threshold for DC val blk 1
- vtrn.32 q8, q4 @Rearrange to make ls of each block togather
- ADD R10, R10, R9 @Add to the global sad blk 1
+ ldrh r11, [r4, #16] @Load the threshold for DC val blk 1
+    vtrn.32 q8, q4 @Rearrange to make ls of each block together
+ ADD R10, R10, R9 @Add to the global sad blk 1
@Q8 S2 S1 S8 S5 S6 S3 S7 S4
@Q10 A2 A1 A8 A5 A6 A3 A7 A4
- vld1.u16 {q11}, [r4] @load the threhold
- ADD R10, R10, R5 @Add to the global sad blk 2
+    vld1.u16 {q11}, [r4] @load the threshold
+ ADD R10, R10, R5 @Add to the global sad blk 2
- vdup.u16 q6, d10[0] @Get the sad blk 1
+ vdup.u16 q6, d10[0] @Get the sad blk 1
- cmp r11, r9 @Compare with threshold blk 1
- vshl.u16 q7, q6, #1 @sad_2 = sad_1<<1
+ cmp r11, r9 @Compare with threshold blk 1
+ vshl.u16 q7, q6, #1 @sad_2 = sad_1<<1
- vsub.s16 q9, q7, q8 @Add to the lss
+ vsub.s16 q9, q7, q8 @Add to the lss
- vcle.s16 q15, q11, q9 @Add to the lss
- movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 1
+ vcle.s16 q15, q11, q9 @Add to the lss
+    movle r7, #0xf @If not met, mark it by moving a non-zero val to R7 blk 1
- cmp r11, r5 @Compare with threshold blk 2
- vdup.u16 q14, d10[1] @Get the sad blk 1
+ cmp r11, r5 @Compare with threshold blk 2
+    vdup.u16 q14, d10[1] @Get the sad blk 2
- vshl.u16 q13, q14, #1 @sad_2 = sad_1<<1
- vsub.s16 q12, q13, q4 @Add to the lss
- vcle.s16 q14, q11, q12 @Add to the lss
- movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 2
+ vshl.u16 q13, q14, #1 @sad_2 = sad_1<<1
+ vsub.s16 q12, q13, q4 @Add to the lss
+ vcle.s16 q14, q11, q12 @Add to the lss
+    movle r7, #0xf @If not met, mark it by moving a non-zero val to R7 blk 2
- vtrn.u8 q14, q15 @get all comparison bits to one reg
- vshr.u8 q14, q14, #7 @Shift the bits so that no overflow occurs
- vadd.u8 d28, d28, d29 @Add the bits
- vpadd.u8 d28, d28, d29 @Add the bits
- vmov.u32 r11, d28[0] @Since a set bit now represents a unstatisofrd contifon store it in r11
- orr r7, r7, r11 @get the guy to r11
+ vtrn.u8 q14, q15 @get all comparison bits to one reg
+ vshr.u8 q14, q14, #7 @Shift the bits so that no overflow occurs
+ vadd.u8 d28, d28, d29 @Add the bits
+ vpadd.u8 d28, d28, d29 @Add the bits
+    vmov.u32 r11, d28[0] @Since a set bit now represents an unsatisfied condition, store it in r11
+    orr r7, r7, r11 @merge the comparison bits into r7
- b funcend_sad_16x16 @Since all blocks ar processed nw, got to end
+    b funcend_sad_16x16 @Since all blocks are processed now, go to end
compute_sad_only: @This block computes SAD only, so will be lighter
@It will start processing at an odd block
@@ -1237,117 +1245,119 @@ compute_sad_only: @This block computes SAD only, so will b
@and then for two blocks at a time
@The counter is r7, hence r7 blocks will be processed
- and r11, r8, #1 @Get the last bit of counter
- cmp r11, #0 @See if we are at even or odd block
+ and r11, r8, #1 @Get the last bit of counter
+ cmp r11, #0 @See if we are at even or odd block
@if the blk is even we just have to set the pointer to the
@start of current row
- lsleq r11, r2, #2 @I Move back src 4 rows
- subeq r0, r0, r11 @I Move back src 4 rows if we are at even block
+ lsleq r11, r2, #2 @I Move back src 4 rows
+ subeq r0, r0, r11 @I Move back src 4 rows if we are at even block
- lsleq r11, r3, #2 @I Move back pred 4 rows
- subeq r1, r1, r11 @I Move back pred 4 rows if we are at even block
+ lsleq r11, r3, #2 @I Move back pred 4 rows
+ subeq r1, r1, r11 @I Move back pred 4 rows if we are at even block
@ADDEQ R8,R8,#2 ;Inc counter
- beq skip_odd_blk @If the blk is odd we have to compute sad
+ beq skip_odd_blk @If the blk is odd we have to compute sad
- vadd.u16 q4, q0, q1 @Add SAD of row1 and row2
- vadd.u16 q5, q2, q3 @Add SAD of row3 and row4
- vadd.u16 q6, q4, q5 @Add SAD of row 1-4
- vadd.u16 d14, d12, d13 @Add Blk1 and blk2
- vpadd.u16 d16, d14, d15 @Add col 1-2 and 3-4
- vpadd.u16 d18, d16, d17 @Add col 12-34
+ vadd.u16 q4, q0, q1 @Add SAD of row1 and row2
+ vadd.u16 q5, q2, q3 @Add SAD of row3 and row4
+ vadd.u16 q6, q4, q5 @Add SAD of row 1-4
+ vadd.u16 d14, d12, d13 @Add Blk1 and blk2
+ vpadd.u16 d16, d14, d15 @Add col 1-2 and 3-4
+ vpadd.u16 d18, d16, d17 @Add col 12-34
- vmov.u16 r9, d18[0] @Move sad to arm
- ADD R10, R10, R9 @Add to the global sad
+ vmov.u16 r9, d18[0] @Move sad to arm
+ ADD R10, R10, R9 @Add to the global sad
- sub r8, r8, #1 @Dec counter
- cmp r8, #0 @See if we processed last block
- beq funcend_sad_16x16 @if lprocessed last block goto end of func
+ sub r8, r8, #1 @Dec counter
+ cmp r8, #0 @See if we processed last block
+    beq funcend_sad_16x16 @if last block processed, goto end of func
- sub r0, r0, #8 @Since we processed od block move back src by 8 cols
- sub r1, r1, #8 @Since we processed od block move back pred by 8 cols
+    sub r0, r0, #8 @Since we processed odd block move back src by 8 cols
+    sub r1, r1, #8 @Since we processed odd block move back pred by 8 cols
skip_odd_blk:
- vmov.s16 q0, #0 @Initialize the accumulator
- vmov.s16 q1, #0 @Initialize the accumulator
+ vmov.s16 q0, #0 @Initialize the accumulator
+ vmov.s16 q1, #0 @Initialize the accumulator
- vld1.u8 {q15}, [r0], r2 @load src r1
- vld1.u8 {q14}, [r1], r3 @load pred r1
+ vld1.u8 {q15}, [r0], r2 @load src r1
+ vld1.u8 {q14}, [r1], r3 @load pred r1
- vld1.u8 {q13}, [r0], r2 @load src r2
- vld1.u8 {q12}, [r1], r3 @load pred r2
+ vld1.u8 {q13}, [r0], r2 @load src r2
+ vld1.u8 {q12}, [r1], r3 @load pred r2
- vld1.u8 {q11}, [r0], r2 @load src r3
- vld1.u8 {q10}, [r1], r3 @load pred r2
+ vld1.u8 {q11}, [r0], r2 @load src r3
+ vld1.u8 {q10}, [r1], r3 @load pred r2
- vld1.u8 {q9}, [r0], r2 @load src r4
- vld1.u8 {q8}, [r1], r3 @load pred r4
+ vld1.u8 {q9}, [r0], r2 @load src r4
+ vld1.u8 {q8}, [r1], r3 @load pred r4
- cmp r8, #2
- beq sad_epilouge
+ cmp r8, #2
+ beq sad_epilouge
sad_loop:
- vabal.u8 q0, d30, d28 @I accumulate Abs diff R1
- vabal.u8 q1, d31, d29 @I accumulate Abs diff R1
+ vabal.u8 q0, d30, d28 @I accumulate Abs diff R1
+ vabal.u8 q1, d31, d29 @I accumulate Abs diff R1
- vld1.u8 {q15}, [r0], r2 @II load r1 src
- vabal.u8 q0, d26, d24 @I accumulate Abs diff R2
+ vld1.u8 {q15}, [r0], r2 @II load r1 src
+ vabal.u8 q0, d26, d24 @I accumulate Abs diff R2
- vld1.u8 {q14}, [r1], r3 @II load r1 pred
- vabal.u8 q1, d27, d25 @I accumulate Abs diff R2
+ vld1.u8 {q14}, [r1], r3 @II load r1 pred
+ vabal.u8 q1, d27, d25 @I accumulate Abs diff R2
- vld1.u8 {q13}, [r0], r2 @II load r3 src
- vabal.u8 q0, d22, d20 @I accumulate Abs diff R3
+ vld1.u8 {q13}, [r0], r2 @II load r3 src
+ vabal.u8 q0, d22, d20 @I accumulate Abs diff R3
- vld1.u8 {q12}, [r1], r3 @II load r2 pred
- vabal.u8 q1, d23, d21 @I accumulate Abs diff R3
+ vld1.u8 {q12}, [r1], r3 @II load r2 pred
+ vabal.u8 q1, d23, d21 @I accumulate Abs diff R3
- vld1.u8 {q11}, [r0], r2 @II load r3 src
- vabal.u8 q0, d18, d16 @I accumulate Abs diff R4
+ vld1.u8 {q11}, [r0], r2 @II load r3 src
+ vabal.u8 q0, d18, d16 @I accumulate Abs diff R4
- sub r8, r8, #2 @Since we processe 16 pix @a time, dec by 2
- vld1.u8 {q10}, [r1], r3 @II load r3 pred
- vabal.u8 q1, d19, d17 @I accumulate Abs diff R4
+    sub r8, r8, #2 @Since we process 16 pix at a time, dec by 2
+ vld1.u8 {q10}, [r1], r3 @II load r3 pred
+ vabal.u8 q1, d19, d17 @I accumulate Abs diff R4
- cmp r8, #2 @Check if last loop
- vld1.u8 {q9}, [r0], r2 @II load r4 src
- vld1.u8 {q8}, [r1], r3 @II load r4 pred
+ cmp r8, #2 @Check if last loop
+ vld1.u8 {q9}, [r0], r2 @II load r4 src
+ vld1.u8 {q8}, [r1], r3 @II load r4 pred
- bne sad_loop @Go back to SAD computation
+ bne sad_loop @Go back to SAD computation
sad_epilouge:
- vabal.u8 q0, d30, d28 @Accumulate Abs diff R1
- vabal.u8 q1, d31, d29 @Accumulate Abs diff R1
+ vabal.u8 q0, d30, d28 @Accumulate Abs diff R1
+ vabal.u8 q1, d31, d29 @Accumulate Abs diff R1
- vabal.u8 q0, d26, d24 @Accumulate Abs diff R2
- vabal.u8 q1, d27, d25 @Accumulate Abs diff R2
+ vabal.u8 q0, d26, d24 @Accumulate Abs diff R2
+ vabal.u8 q1, d27, d25 @Accumulate Abs diff R2
- vabal.u8 q0, d22, d20 @Accumulate Abs diff R3
- vabal.u8 q1, d23, d21 @Aaccumulate Abs diff R3
+ vabal.u8 q0, d22, d20 @Accumulate Abs diff R3
+    vabal.u8 q1, d23, d21 @Accumulate Abs diff R3
- vabal.u8 q0, d18, d16 @Accumulate Abs diff R4
- vabal.u8 q1, d19, d17 @Accumulate Abs diff R4
+ vabal.u8 q0, d18, d16 @Accumulate Abs diff R4
+ vabal.u8 q1, d19, d17 @Accumulate Abs diff R4
- vadd.u16 q2, q0, q1 @ADD two accumulators
- vadd.u16 d6, d4, d5 @Add two blk sad
- vpadd.u16 d8, d6, d7 @Add col 1-2 and 3-4 sad
- vpadd.u16 d10, d8, d9 @Add col 12-34 sad
+ vadd.u16 q2, q0, q1 @ADD two accumulators
+ vadd.u16 d6, d4, d5 @Add two blk sad
+ vpadd.u16 d8, d6, d7 @Add col 1-2 and 3-4 sad
+ vpadd.u16 d10, d8, d9 @Add col 12-34 sad
- vmov.u16 r9, d10[0] @move SAD to ARM
- ADD R10, R10, R9 @Add to the global sad
+ vmov.u16 r9, d10[0] @move SAD to ARM
+ ADD R10, R10, R9 @Add to the global sad
funcend_sad_16x16: @End of function process
- ldr r5, [sp, #44]
- ldr r6, [sp, #48]
- str r7, [r6] @Store the is zero reg
- str r10, [r5] @Store sad
+ vpop {d8-d15}
+ ldr r5, [sp, #44]
+ ldr r6, [sp, #48]
+
+ str r7, [r6] @Store the is zero reg
+ str r10, [r5] @Store sad
@SUB SP,SP,#40
- pop {r4-r12, pc}
+ pop {r4-r12, pc}
diff --git a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
index c442077..e768c21 100644
--- a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
+++ b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
@@ -76,7 +76,7 @@
.p2align 2
.include "ih264_neon_macros.s"
-.globl ih264e_evaluate_intra16x16_modes_av8
+.global ih264e_evaluate_intra16x16_modes_av8
ih264e_evaluate_intra16x16_modes_av8:
diff --git a/encoder/armv8/ih264e_half_pel_av8.s b/encoder/armv8/ih264e_half_pel_av8.s
index 6dbd8f8..817faa6 100644
--- a/encoder/armv8/ih264e_half_pel_av8.s
+++ b/encoder/armv8/ih264e_half_pel_av8.s
@@ -1015,10 +1015,3 @@ filter_2dvh_skip_row:
///*****************************************
-
-
-
-
-
-
- .section .note.gnu-stack,"",%progbits
diff --git a/encoder/armv8/ime_distortion_metrics_av8.s b/encoder/armv8/ime_distortion_metrics_av8.s
index 99ebc8a..47c3425 100644
--- a/encoder/armv8/ime_distortion_metrics_av8.s
+++ b/encoder/armv8/ime_distortion_metrics_av8.s
@@ -975,4 +975,3 @@ satdq_end_func:
ldp d8, d9, [sp], #16
pop_v_regs
ret
- .section .note.gnu-stack,"",%progbits
diff --git a/encoder/x86/ih264e_intra_modes_eval_ssse3.c b/encoder/x86/ih264e_intra_modes_eval_ssse3.c
index 657921f..0f4a9ad 100644
--- a/encoder/x86/ih264e_intra_modes_eval_ssse3.c
+++ b/encoder/x86/ih264e_intra_modes_eval_ssse3.c
@@ -487,7 +487,7 @@ void ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 *pu1_src,
INT_MAX, INT_MAX, INT_MAX, INT_MAX };
WORD32 min_cost;
- WORD32 lambda4 = u4_lambda << 2;
+ UWORD32 lambda4 = u4_lambda << 2;
WORD32 dst_strd2, dst_strd3;
__m128i left_top_16x8b, src_16x8b, pred0_16x8b, sad_8x16b;
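
The lambda4 change above is a signedness fix: u4_lambda is a UWORD32, so u4_lambda << 2 is an unsigned expression, and storing it in a signed WORD32 is implementation-defined (pre-C23) once the value exceeds INT_MAX; a wrapped negative cost could then misorder the 4x4 mode decision. A standalone illustration, not the encoder's code:

    typedef int          WORD32;
    typedef unsigned int UWORD32;

    WORD32 lambda4_signed(UWORD32 u4_lambda)
    {
        WORD32 lambda4 = u4_lambda << 2;   /* implementation-defined above INT_MAX */
        return lambda4;
    }

    UWORD32 lambda4_unsigned(UWORD32 u4_lambda)
    {
        UWORD32 lambda4 = u4_lambda << 2;  /* well defined, wraps mod 2^32 */
        return lambda4;
    }
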
diff --git a/encoder/x86/ime_distortion_metrics_sse42.c b/encoder/x86/ime_distortion_metrics_sse42.c
index 0876788..baf18a4 100644
--- a/encoder/x86/ime_distortion_metrics_sse42.c
+++ b/encoder/x86/ime_distortion_metrics_sse42.c
@@ -110,6 +110,7 @@ void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src,
__m128i res_r0, res_r1, res_r2, res_r3;
__m128i sad_val;
int val1, val2;
+ UNUSED (i4_max_sad);
// Row 0-3 sad calculation
src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
@@ -248,6 +249,7 @@ void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src,
WORD32 i4_max_sad,
WORD32 *pi4_mb_distortion)
{
+ UNUSED (i4_max_sad);
__m128i src_r0, src_r1, src_r2, src_r3;
__m128i est_r0, est_r1, est_r2, est_r3;
__m128i res_r0, res_r1, res_r2, res_r3;
@@ -498,6 +500,7 @@ void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
WORD32 i4_max_sad,
WORD32 *pi4_mb_distortion)
{
+ UNUSED (i4_max_sad);
__m128i src_r0, src_r1, src_r2, src_r3;
__m128i est_r0, est_r1, est_r2, est_r3;
__m128i res_r0, res_r1, res_r2, res_r3;
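
The UNUSED annotations keep the SSE4.2 kernels on the common ime_compute_sad_* signature — i4_max_sad is only consulted by the early-exit variants — without tripping -Wunused-parameter. The macro itself is defined elsewhere in the tree; a conventional definition (an assumption, not confirmed by this diff) is simply a cast to void:

    #ifndef UNUSED
    #define UNUSED(x) ((void)(x))
    #endif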