summaryrefslogtreecommitdiffstats
path: root/libvpx/vp9/common/arm/neon
diff options
context:
space:
mode:
Diffstat (limited to 'libvpx/vp9/common/arm/neon')
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm (renamed from libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm)0
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm (renamed from libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm)0
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm (renamed from libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm)0
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm (renamed from libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm)10
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm (renamed from libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm)0
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm (renamed from libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm)0
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm (renamed from libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm)0
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm (renamed from libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm)0
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm (renamed from libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm)0
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm (renamed from libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm)2
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm199
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c52
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm70
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm33
-rw-r--r--libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm634
15 files changed, 945 insertions, 55 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm
index b1fd21b..b1fd21b 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm
index a13c0d0..a13c0d0 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm
index d290d07..d290d07 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
index 388a7d7..72e933e 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
@@ -72,7 +72,7 @@ cospi_31_64 EQU 804
; reg1 = output[first_offset]
; reg2 = output[second_offset]
; for proper address calculation, the last offset used when manipulating
- ; output, wethere reading or storing) must be passed in. use 0 for first
+ ; output, whether reading or storing) must be passed in. use 0 for first
; use.
MACRO
LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
@@ -88,7 +88,7 @@ cospi_31_64 EQU 804
; output[first_offset] = reg1
; output[second_offset] = reg2
; for proper address calculation, the last offset used when manipulating
- ; output, wethere reading or storing) must be passed in. use 0 for first
+ ; output, whether reading or storing) must be passed in. use 0 for first
; use.
MACRO
STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
@@ -242,7 +242,7 @@ cospi_31_64 EQU 804
; TODO(cd): have special case to re-use constants when they are similar for
; consecutive butterflies
; TODO(cd): have special case when both constants are the same, do the
- ; additions/substractions before the multiplies.
+ ; additions/subtractions before the multiplies.
; generate the constants
; generate scalar constants
mov r8, #$first_constant & 0xFF00
@@ -260,7 +260,7 @@ cospi_31_64 EQU 804
vmull.s16 q11, $regB, d31
vmull.s16 q12, $regC, d31
; (used) five for intermediate (q8-q12), one for constants (q15)
- ; do some addition/substractions (to get back two register)
+ ; do some addition/subtractions (to get back two register)
vsub.s32 q8, q8, q10
vsub.s32 q9, q9, q11
; do more multiplications (ordered for maximum latency hiding)
@@ -268,7 +268,7 @@ cospi_31_64 EQU 804
vmull.s16 q11, $regA, d30
vmull.s16 q15, $regB, d30
; (used) six for intermediate (q8-q12, q15)
- ; do more addition/substractions
+ ; do more addition/subtractions
vadd.s32 q11, q12, q11
vadd.s32 q10, q10, q15
; (used) four for intermediate (q8-q11)
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm
index 0d4a721..0d4a721 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm
index 00283fc..00283fc 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm
index 421d202..421d202 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
index 5476400..5476400 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
index 2f326e2..2f326e2 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
index 93d3af3..b41f566 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
@@ -576,6 +576,7 @@
vld1.s16 {q14,q15}, [r0]!
push {r0-r10}
+ vpush {d8-d15}
; transpose the input data
TRANSPOSE8X8
@@ -636,6 +637,7 @@ iadst_iadst
IADST8X8_1D
end_vp9_iht8x8_64_add_neon
+ vpop {d8-d15}
pop {r0-r10}
; ROUND_POWER_OF_TWO(temp_out[j], 5)
diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
new file mode 100644
index 0000000..5b8ec20
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
@@ -0,0 +1,199 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_lpf_horizontal_4_dual_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p,
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp+4 const uint8_t *blimit1,
+; sp+8 const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|vp9_lpf_horizontal_4_dual_neon| PROC
+ push {lr}
+
+ ldr r12, [sp, #4] ; load thresh0
+ vld1.8 {d0}, [r2] ; load blimit0 to first half q
+ vld1.8 {d2}, [r3] ; load limit0 to first half q
+
+ add r1, r1, r1 ; double pitch
+ ldr r2, [sp, #8] ; load blimit1
+
+ vld1.8 {d4}, [r12] ; load thresh0 to first half q
+
+ ldr r3, [sp, #12] ; load limit1
+ ldr r12, [sp, #16] ; load thresh1
+ vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q
+
+ sub r2, r0, r1, lsl #1 ; s[-4 * p]
+
+ vld1.8 {d3}, [r3] ; load limit1 to 2nd half q
+ vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q
+
+ vpush {d8-d15} ; save neon registers
+
+ add r3, r2, r1, lsr #1 ; s[-3 * p]
+
+ vld1.u8 {q3}, [r2@64], r1 ; p3
+ vld1.u8 {q4}, [r3@64], r1 ; p2
+ vld1.u8 {q5}, [r2@64], r1 ; p1
+ vld1.u8 {q6}, [r3@64], r1 ; p0
+ vld1.u8 {q7}, [r2@64], r1 ; q0
+ vld1.u8 {q8}, [r3@64], r1 ; q1
+ vld1.u8 {q9}, [r2@64] ; q2
+ vld1.u8 {q10}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl vp9_loop_filter_neon_16
+
+ vst1.u8 {q5}, [r2@64], r1 ; store op1
+ vst1.u8 {q6}, [r3@64], r1 ; store op0
+ vst1.u8 {q7}, [r2@64], r1 ; store oq0
+ vst1.u8 {q8}, [r3@64], r1 ; store oq1
+
+ vpop {d8-d15} ; restore neon registers
+
+ pop {pc}
+ ENDP ; |vp9_lpf_horizontal_4_dual_neon|
+
+; void vp9_loop_filter_neon_16();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store. This function uses
+; registers d8-d15, so the calling function must save those registers.
+;
+; r0-r3, r12 PRESERVE
+; q0 blimit
+; q1 limit
+; q2 thresh
+; q3 p3
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3
+;
+; Outputs:
+; q5 op1
+; q6 op0
+; q7 oq0
+; q8 oq1
+|vp9_loop_filter_neon_16| PROC
+
+ ; filter_mask
+ vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 q11, q11, q12 ; m7 = max(m1, m2)
+ vmax.u8 q12, q13, q14 ; m8 = max(m3, m4)
+
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ vmax.u8 q3, q3, q4 ; m9 = max(m5, m6)
+
+ vmov.u8 q10, #0x80
+
+ vmax.u8 q15, q11, q12 ; m10 = max(m7, m8)
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 q15, q15, q3 ; m11 = max(m10, m9)
+
+ vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
+
+ veor q7, q7, q10 ; qs0
+
+ vcge.u8 q15, q1, q15 ; abs(m11) > limit
+
+ vshr.u8 q2, q2, #1 ; a = a / 2
+ veor q6, q6, q10 ; ps0
+
+ veor q5, q5, q10 ; ps1
+ vqadd.u8 q9, q9, q2 ; a = b + a
+
+ veor q8, q8, q10 ; qs1
+
+ vmov.u16 q4, #3
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vcge.u8 q9, q0, q9 ; a > blimit
+
+ vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1)
+ vorr q14, q13, q14 ; hev
+
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; filter &= hev
+ vand q15, q15, q9 ; mask
+
+ vmov.u8 q4, #3
+
+ vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0)
+ vaddw.s8 q11, q11, d3
+
+ vmov.u8 q9, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q11
+ vand q1, q1, q15 ; filter &= mask
+
+ vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3)
+ vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4)
+ vshr.s8 q2, q2, #3 ; filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2)
+ vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1
+
+ veor q7, q0, q10 ; *oq0 = u^0x80
+
+ vbic q1, q1, q14 ; filter &= ~hev
+
+ vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter)
+ vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter)
+
+ veor q6, q11, q10 ; *op0 = u^0x80
+ veor q5, q13, q10 ; *op1 = u^0x80
+ veor q8, q12, q10 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |vp9_loop_filter_neon_16|
+
+ END
diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
new file mode 100644
index 0000000..0820db2
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+
+void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_horizontal_8(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_horizontal_8(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+ vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+}
diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm
index 8b4fe5d..4430322 100644
--- a/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm
@@ -8,10 +8,10 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_loop_filter_horizontal_edge_neon|
- EXPORT |vp9_loop_filter_vertical_edge_neon|
- EXPORT |vp9_mbloop_filter_horizontal_edge_neon|
- EXPORT |vp9_mbloop_filter_vertical_edge_neon|
+ EXPORT |vp9_lpf_horizontal_4_neon|
+ EXPORT |vp9_lpf_vertical_4_neon|
+ EXPORT |vp9_lpf_horizontal_8_neon|
+ EXPORT |vp9_lpf_vertical_8_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
@@ -21,12 +21,12 @@
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
-; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s,
-; int p /* pitch */,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
+; void vp9_lpf_horizontal_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
;
; r0 uint8_t *s,
; r1 int p, /* pitch */
@@ -34,7 +34,7 @@
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
-|vp9_loop_filter_horizontal_edge_neon| PROC
+|vp9_lpf_horizontal_4_neon| PROC
push {lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
@@ -77,19 +77,19 @@ count_lf_h_loop
end_vp9_lf_h_edge
pop {pc}
- ENDP ; |vp9_loop_filter_horizontal_edge_neon|
+ ENDP ; |vp9_lpf_horizontal_4_neon|
; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
; works on 16 iterations at a time.
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
-; void vp9_loop_filter_vertical_edge_neon(uint8_t *s,
-; int p /* pitch */,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
+; void vp9_lpf_vertical_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
;
; r0 uint8_t *s,
; r1 int p, /* pitch */
@@ -97,7 +97,7 @@ end_vp9_lf_h_edge
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
-|vp9_loop_filter_vertical_edge_neon| PROC
+|vp9_lpf_vertical_4_neon| PROC
push {lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
@@ -158,7 +158,7 @@ count_lf_v_loop
end_vp9_lf_v_edge
pop {pc}
- ENDP ; |vp9_loop_filter_vertical_edge_neon|
+ ENDP ; |vp9_lpf_vertical_4_neon|
; void vp9_loop_filter_neon();
; This is a helper function for the loopfilters. The invidual functions do the
@@ -276,18 +276,18 @@ end_vp9_lf_v_edge
bx lr
ENDP ; |vp9_loop_filter_neon|
-; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
+; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
-|vp9_mbloop_filter_horizontal_edge_neon| PROC
+|vp9_lpf_horizontal_8_neon| PROC
push {r4-r5, lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
@@ -333,14 +333,14 @@ count_mblf_h_loop
end_vp9_mblf_h_edge
pop {r4-r5, pc}
- ENDP ; |vp9_mbloop_filter_horizontal_edge_neon|
+ ENDP ; |vp9_lpf_horizontal_8_neon|
-; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s,
-; int pitch,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
+; void vp9_lpf_vertical_8_neon(uint8_t *s,
+; int pitch,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
;
; r0 uint8_t *s,
; r1 int pitch,
@@ -348,7 +348,7 @@ end_vp9_mblf_h_edge
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
; sp+4 int count
-|vp9_mbloop_filter_vertical_edge_neon| PROC
+|vp9_lpf_vertical_8_neon| PROC
push {r4-r5, lr}
vld1.8 {d0[]}, [r2] ; duplicate *blimit
@@ -420,7 +420,7 @@ count_mblf_v_loop
end_vp9_mblf_v_edge
pop {r4-r5, pc}
- ENDP ; |vp9_mbloop_filter_vertical_edge_neon|
+ ENDP ; |vp9_lpf_vertical_8_neon|
; void vp9_mbloop_filter_neon();
; This is a helper function for the loopfilters. The invidual functions do the
diff --git a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
index 2e8001b..5fe2bba 100644
--- a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
@@ -8,23 +8,23 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_mb_lpf_horizontal_edge_w_neon|
- EXPORT |vp9_mb_lpf_vertical_edge_w_neon|
+ EXPORT |vp9_lpf_horizontal_16_neon|
+ EXPORT |vp9_lpf_vertical_16_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
-; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh
-; int count)
+; void vp9_lpf_horizontal_16_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh
+; int count)
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
-|vp9_mb_lpf_horizontal_edge_w_neon| PROC
+|vp9_lpf_horizontal_16_neon| PROC
push {r4-r8, lr}
vpush {d8-d15}
ldr r4, [sp, #88] ; load thresh
@@ -115,18 +115,18 @@ h_next
vpop {d8-d15}
pop {r4-r8, pc}
- ENDP ; |vp9_mb_lpf_horizontal_edge_w_neon|
+ ENDP ; |vp9_lpf_horizontal_16_neon|
-; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh)
+; void vp9_lpf_vertical_16_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
; r0 uint8_t *s,
; r1 int p, /* pitch */
; r2 const uint8_t *blimit,
; r3 const uint8_t *limit,
; sp const uint8_t *thresh,
-|vp9_mb_lpf_vertical_edge_w_neon| PROC
+|vp9_lpf_vertical_16_neon| PROC
push {r4-r8, lr}
vpush {d8-d15}
ldr r4, [sp, #88] ; load thresh
@@ -279,7 +279,7 @@ v_end
vpop {d8-d15}
pop {r4-r8, pc}
- ENDP ; |vp9_mb_lpf_vertical_edge_w_neon|
+ ENDP ; |vp9_lpf_vertical_16_neon|
; void vp9_wide_mbfilter_neon();
; This is a helper function for the loopfilters. The invidual functions do the
@@ -439,6 +439,9 @@ v_end
tst r7, #1
bxne lr
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #2 ; Only do mbfilter branch
+
; mbfilter flat && mask branch
; TODO(fgalligan): Can I decrease the cycles shifting to consective d's
; and using vibt on the q's?
diff --git a/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm b/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm
new file mode 100644
index 0000000..dc9856f
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -0,0 +1,634 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_v_predictor_4x4_neon|
+ EXPORT |vp9_v_predictor_8x8_neon|
+ EXPORT |vp9_v_predictor_16x16_neon|
+ EXPORT |vp9_v_predictor_32x32_neon|
+ EXPORT |vp9_h_predictor_4x4_neon|
+ EXPORT |vp9_h_predictor_8x8_neon|
+ EXPORT |vp9_h_predictor_16x16_neon|
+ EXPORT |vp9_h_predictor_32x32_neon|
+ EXPORT |vp9_tm_predictor_4x4_neon|
+ EXPORT |vp9_tm_predictor_8x8_neon|
+ EXPORT |vp9_tm_predictor_16x16_neon|
+ EXPORT |vp9_tm_predictor_32x32_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_v_predictor_4x4_neon| PROC
+ vld1.32 {d0[0]}, [r2]
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ bx lr
+ ENDP ; |vp9_v_predictor_4x4_neon|
+
+;void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_v_predictor_8x8_neon| PROC
+ vld1.8 {d0}, [r2]
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d0}, [r0], r1
+ bx lr
+ ENDP ; |vp9_v_predictor_8x8_neon|
+
+;void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_v_predictor_16x16_neon| PROC
+ vld1.8 {q0}, [r2]
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ vst1.8 {q0}, [r0], r1
+ bx lr
+ ENDP ; |vp9_v_predictor_16x16_neon|
+
+;void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_v_predictor_32x32_neon| PROC
+ vld1.8 {q0, q1}, [r2]
+ mov r2, #2
+loop_v
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ vst1.8 {q0, q1}, [r0], r1
+ subs r2, r2, #1
+ bgt loop_v
+ bx lr
+ ENDP ; |vp9_v_predictor_32x32_neon|
+
+;void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_h_predictor_4x4_neon| PROC
+ vld1.32 {d1[0]}, [r3]
+ vdup.8 d0, d1[0]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[1]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[2]
+ vst1.32 {d0[0]}, [r0], r1
+ vdup.8 d0, d1[3]
+ vst1.32 {d0[0]}, [r0], r1
+ bx lr
+ ENDP ; |vp9_h_predictor_4x4_neon|
+
+;void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_h_predictor_8x8_neon| PROC
+ vld1.64 {d1}, [r3]
+ vdup.8 d0, d1[0]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[1]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[2]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[3]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[4]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[5]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[6]
+ vst1.64 {d0}, [r0], r1
+ vdup.8 d0, d1[7]
+ vst1.64 {d0}, [r0], r1
+ bx lr
+ ENDP ; |vp9_h_predictor_8x8_neon|
+
+;void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_h_predictor_16x16_neon| PROC
+ vld1.8 {q1}, [r3]
+ vdup.8 q0, d2[0]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[1]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[2]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[3]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[4]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[5]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[6]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[7]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[0]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[1]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[2]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[3]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[4]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[5]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[6]
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[7]
+ vst1.8 {q0}, [r0], r1
+ bx lr
+ ENDP ; |vp9_h_predictor_16x16_neon|
+
+;void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_h_predictor_32x32_neon| PROC
+ sub r1, r1, #16
+ mov r2, #2
+loop_h
+ vld1.8 {q1}, [r3]!
+ vdup.8 q0, d2[0]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[1]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[2]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[3]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[4]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[5]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[6]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d2[7]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[0]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[1]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[2]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[3]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[4]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[5]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[6]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ vdup.8 q0, d3[7]
+ vst1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r0], r1
+ subs r2, r2, #1
+ bgt loop_h
+ bx lr
+ ENDP ; |vp9_h_predictor_32x32_neon|
+
+;void vp9_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_tm_predictor_4x4_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ ldrb r12, [r12]
+ vdup.u8 d0, r12
+
+ ; Load above 4 pixels
+ vld1.32 {d2[0]}, [r2]
+
+ ; Compute above - ytop_left
+ vsubl.u8 q3, d2, d0
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; 1st row and 2nd row
+ ldrb r12, [r3], #1
+ ldrb r2, [r3], #1
+ vdup.u16 q1, r12
+ vdup.u16 q2, r2
+ vadd.s16 q1, q1, q3
+ vadd.s16 q2, q2, q3
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+
+ ; 3rd row and 4th row
+ ldrb r12, [r3], #1
+ ldrb r2, [r3], #1
+ vdup.u16 q1, r12
+ vdup.u16 q2, r2
+ vadd.s16 q1, q1, q3
+ vadd.s16 q2, q2, q3
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+ bx lr
+ ENDP ; |vp9_tm_predictor_4x4_neon|
+
+;void vp9_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_tm_predictor_8x8_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ ldrb r12, [r12]
+ vdup.u8 d0, r12
+
+ ; preload 8 left
+ vld1.8 {d30}, [r3]
+
+ ; Load above 8 pixels
+ vld1.64 {d2}, [r2]
+
+ vmovl.u8 q10, d30
+
+ ; Compute above - ytop_left
+ vsubl.u8 q3, d2, d0
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; 1st row and 2nd row
+ vdup.16 q0, d20[0]
+ vdup.16 q1, d20[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
+
+ ; 3rd row and 4th row
+ vdup.16 q8, d20[2]
+ vdup.16 q9, d20[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
+
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d2}, [r0], r1
+ vst1.64 {d3}, [r0], r1
+
+ ; 5th row and 6th row
+ vdup.16 q0, d21[0]
+ vdup.16 q1, d21[1]
+ vadd.s16 q0, q3, q0
+ vadd.s16 q1, q3, q1
+
+ ; 7th row and 8th row
+ vdup.16 q8, d21[2]
+ vdup.16 q9, d21[3]
+ vadd.s16 q8, q3, q8
+ vadd.s16 q9, q3, q9
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q8
+ vqmovun.s16 d3, q9
+
+ vst1.64 {d0}, [r0], r1
+ vst1.64 {d1}, [r0], r1
+ vst1.64 {d2}, [r0], r1
+ vst1.64 {d3}, [r0], r1
+
+ bx lr
+ ENDP ; |vp9_tm_predictor_8x8_neon|
+
+;void vp9_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_tm_predictor_16x16_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ ldrb r12, [r12]
+ vdup.u8 q0, r12
+
+ ; Load above 8 pixels
+ vld1.8 {q1}, [r2]
+
+ ; preload 8 left into r12
+ vld1.8 {d18}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q2, d2, d0
+ vsubl.u8 q3, d3, d1
+
+ vmovl.u8 q10, d18
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; Process 8 rows in each single loop and loop 2 times to process 16 rows.
+ mov r2, #2
+
+loop_16x16_neon
+ ; Process two rows.
+ vdup.16 q0, d20[0]
+ vdup.16 q8, d20[1]
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d20[2] ; proload next 2 rows data
+ vdup.16 q8, d20[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d21[0] ; proload next 2 rows data
+ vdup.16 q8, d21[1]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d21[2] ; proload next 2 rows data
+ vdup.16 q8, d21[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vld1.8 {d18}, [r3]! ; preload 8 left into r12
+ vmovl.u8 q10, d18
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_16x16_neon
+
+ bx lr
+ ENDP ; |vp9_tm_predictor_16x16_neon|
+
+;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_tm_predictor_32x32_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ ldrb r12, [r12]
+ vdup.u8 q0, r12
+
+ ; Load above 32 pixels
+ vld1.8 {q1}, [r2]!
+ vld1.8 {q2}, [r2]
+
+ ; preload 8 left pixels
+ vld1.8 {d26}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q8, d2, d0
+ vsubl.u8 q9, d3, d1
+ vsubl.u8 q10, d4, d0
+ vsubl.u8 q11, d5, d1
+
+ vmovl.u8 q3, d26
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; Process 8 rows in each single loop and loop 4 times to process 32 rows.
+ mov r2, #4
+
+loop_32x32_neon
+ ; Process two rows.
+ vdup.16 q0, d6[0]
+ vdup.16 q2, d6[1]
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q1, d6[2]
+ vdup.16 q2, d6[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q1, q8
+ vadd.s16 q13, q1, q9
+ vadd.s16 q14, q1, q10
+ vadd.s16 q15, q1, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[0]
+ vdup.16 q2, d7[1]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[2]
+ vdup.16 q2, d7[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vld1.8 {d0}, [r3]! ; preload 8 left pixels
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vmovl.u8 q3, d0
+ vst1.64 {d24-d27}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_32x32_neon
+
+ bx lr
+ ENDP ; |vp9_tm_predictor_32x32_neon|
+
+ END