diff options
Diffstat (limited to 'libvpx/vp9/common')
105 files changed, 9350 insertions, 6442 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm index b1fd21b..b1fd21b 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm index a13c0d0..a13c0d0 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm index d290d07..d290d07 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm index 388a7d7..72e933e 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm @@ -72,7 +72,7 @@ cospi_31_64 EQU 804 ; reg1 = output[first_offset] ; reg2 = output[second_offset] ; for proper address calculation, the last offset used when manipulating - ; output, wethere reading or storing) must be passed in. use 0 for first + ; output, whether reading or storing) must be passed in. use 0 for first ; use. MACRO LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 @@ -88,7 +88,7 @@ cospi_31_64 EQU 804 ; output[first_offset] = reg1 ; output[second_offset] = reg2 ; for proper address calculation, the last offset used when manipulating - ; output, wethere reading or storing) must be passed in. use 0 for first + ; output, whether reading or storing) must be passed in. use 0 for first ; use. 
MACRO STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 @@ -242,7 +242,7 @@ cospi_31_64 EQU 804 ; TODO(cd): have special case to re-use constants when they are similar for ; consecutive butterflies ; TODO(cd): have special case when both constants are the same, do the - ; additions/substractions before the multiplies. + ; additions/subtractions before the multiplies. ; generate the constants ; generate scalar constants mov r8, #$first_constant & 0xFF00 @@ -260,7 +260,7 @@ cospi_31_64 EQU 804 vmull.s16 q11, $regB, d31 vmull.s16 q12, $regC, d31 ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/substractions (to get back two register) + ; do some addition/subtractions (to get back two register) vsub.s32 q8, q8, q10 vsub.s32 q9, q9, q11 ; do more multiplications (ordered for maximum latency hiding) @@ -268,7 +268,7 @@ cospi_31_64 EQU 804 vmull.s16 q11, $regA, d30 vmull.s16 q15, $regB, d30 ; (used) six for intermediate (q8-q12, q15) - ; do more addition/substractions + ; do more addition/subtractions vadd.s32 q11, q12, q11 vadd.s32 q10, q10, q15 ; (used) four for intermediate (q8-q11) diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm index 0d4a721..0d4a721 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm index 00283fc..00283fc 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm index 421d202..421d202 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm +++ 
b/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm index 5476400..5476400 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm index 2f326e2..2f326e2 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm index 93d3af3..b41f566 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm @@ -576,6 +576,7 @@ vld1.s16 {q14,q15}, [r0]! push {r0-r10} + vpush {d8-d15} ; transpose the input data TRANSPOSE8X8 @@ -636,6 +637,7 @@ iadst_iadst IADST8X8_1D end_vp9_iht8x8_64_add_neon + vpop {d8-d15} pop {r0-r10} ; ROUND_POWER_OF_TWO(temp_out[j], 5) diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm new file mode 100644 index 0000000..5b8ec20 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm @@ -0,0 +1,199 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + EXPORT |vp9_lpf_horizontal_4_dual_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|vp9_lpf_horizontal_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + add r1, r1, r1 ; double pitch + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, r1, lsl #1 ; s[-4 * p] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + add r3, r2, r1, lsr #1 ; s[-3 * p] + + vld1.u8 {q3}, [r2@64], r1 ; p3 + vld1.u8 {q4}, [r3@64], r1 ; p2 + vld1.u8 {q5}, [r2@64], r1 ; p1 + vld1.u8 {q6}, [r3@64], r1 ; p0 + vld1.u8 {q7}, [r2@64], r1 ; q0 + vld1.u8 {q8}, [r3@64], r1 ; q1 + vld1.u8 {q9}, [r2@64] ; q2 + vld1.u8 {q10}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl vp9_loop_filter_neon_16 + + vst1.u8 {q5}, [r2@64], r1 ; store op1 + vst1.u8 {q6}, [r3@64], r1 ; store op0 + vst1.u8 {q7}, [r2@64], r1 ; store oq0 + vst1.u8 {q8}, [r3@64], r1 ; store oq1 + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |vp9_lpf_horizontal_4_dual_neon| + +; void vp9_loop_filter_neon_16(); +; This is a helper function for the loopfilters. The individual functions do the +; necessary load, transpose (if necessary) and store. 
This function uses +; registers d8-d15, so the calling function must save those registers. +; +; r0-r3, r12 PRESERVE +; q0 blimit +; q1 limit +; q2 thresh +; q3 p3 +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 +; +; Outputs: +; q5 op1 +; q6 op0 +; q7 oq0 +; q8 oq1 +|vp9_loop_filter_neon_16| PROC + + ; filter_mask + vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) + vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) + vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) + vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) + vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) + vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) + vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + + vmov.u8 q10, #0x80 + + vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + + vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + + veor q7, q7, q10 ; qs0 + + vcge.u8 q15, q1, q15 ; abs(m11) > limit + + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 + + vmov.u16 q4, #3 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vcge.u8 q9, q0, q9 ; a > blimit + + vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) + vorr q14, q13, q14 ; hev + + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; filter &= hev + vand q15, q15, q9 ; mask + + vmov.u8 q4, #3 + + vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) + vaddw.s8 q11, q11, d3 + + vmov.u8 q9, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q11 + vand q1, q1, q15 ; filter &= mask + + vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) + vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) + 
vshr.s8 q2, q2, #3 ; filter2 >>= 3 + vshr.s8 q1, q1, #3 ; filter1 >>= 3 + + + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) + vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + + veor q7, q0, q10 ; *oq0 = u^0x80 + + vbic q1, q1, q14 ; filter &= ~hev + + vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) + vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) + + veor q6, q11, q10 ; *op0 = u^0x80 + veor q5, q13, q10 ; *op1 = u^0x80 + veor q8, q12, q10 ; *oq1 = u^0x80 + + bx lr + ENDP ; |vp9_loop_filter_neon_16| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c new file mode 100644 index 0000000..0820db2 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" + +void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_8(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh); + vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); +} diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm index 8b4fe5d..4430322 100644 --- a/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm @@ -8,10 +8,10 @@ ; be found in the AUTHORS file in the root of the source tree. 
; - EXPORT |vp9_loop_filter_horizontal_edge_neon| - EXPORT |vp9_loop_filter_vertical_edge_neon| - EXPORT |vp9_mbloop_filter_horizontal_edge_neon| - EXPORT |vp9_mbloop_filter_vertical_edge_neon| + EXPORT |vp9_lpf_horizontal_4_neon| + EXPORT |vp9_lpf_vertical_4_neon| + EXPORT |vp9_lpf_horizontal_8_neon| + EXPORT |vp9_lpf_vertical_8_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 @@ -21,12 +21,12 @@ ; TODO(fgalligan): See about removing the count code as this function is only ; called with a count of 1. ; -; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_horizontal_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; ; r0 uint8_t *s, ; r1 int p, /* pitch */ @@ -34,7 +34,7 @@ ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_loop_filter_horizontal_edge_neon| PROC +|vp9_lpf_horizontal_4_neon| PROC push {lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -77,19 +77,19 @@ count_lf_h_loop end_vp9_lf_h_edge pop {pc} - ENDP ; |vp9_loop_filter_horizontal_edge_neon| + ENDP ; |vp9_lpf_horizontal_4_neon| ; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter ; works on 16 iterations at a time. ; TODO(fgalligan): See about removing the count code as this function is only ; called with a count of 1. 
; -; void vp9_loop_filter_vertical_edge_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_vertical_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; ; r0 uint8_t *s, ; r1 int p, /* pitch */ @@ -97,7 +97,7 @@ end_vp9_lf_h_edge ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_loop_filter_vertical_edge_neon| PROC +|vp9_lpf_vertical_4_neon| PROC push {lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -158,7 +158,7 @@ count_lf_v_loop end_vp9_lf_v_edge pop {pc} - ENDP ; |vp9_loop_filter_vertical_edge_neon| + ENDP ; |vp9_lpf_vertical_4_neon| ; void vp9_loop_filter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the @@ -276,18 +276,18 @@ end_vp9_lf_v_edge bx lr ENDP ; |vp9_loop_filter_neon| -; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_mbloop_filter_horizontal_edge_neon| PROC +|vp9_lpf_horizontal_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -333,14 +333,14 @@ count_mblf_h_loop end_vp9_mblf_h_edge pop {r4-r5, pc} - ENDP ; |vp9_mbloop_filter_horizontal_edge_neon| + ENDP ; |vp9_lpf_horizontal_8_neon| -; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, -; int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_vertical_8_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; ; r0 
uint8_t *s, ; r1 int pitch, @@ -348,7 +348,7 @@ end_vp9_mblf_h_edge ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_mbloop_filter_vertical_edge_neon| PROC +|vp9_lpf_vertical_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -420,7 +420,7 @@ count_mblf_v_loop end_vp9_mblf_v_edge pop {r4-r5, pc} - ENDP ; |vp9_mbloop_filter_vertical_edge_neon| + ENDP ; |vp9_lpf_vertical_8_neon| ; void vp9_mbloop_filter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the diff --git a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm index 2e8001b..5fe2bba 100644 --- a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm @@ -8,23 +8,23 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_mb_lpf_horizontal_edge_w_neon| - EXPORT |vp9_mb_lpf_vertical_edge_w_neon| + EXPORT |vp9_lpf_horizontal_16_neon| + EXPORT |vp9_lpf_vertical_16_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 -; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh -; int count) +; void vp9_lpf_horizontal_16_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh +; int count) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -|vp9_mb_lpf_horizontal_edge_w_neon| PROC +|vp9_lpf_horizontal_16_neon| PROC push {r4-r8, lr} vpush {d8-d15} ldr r4, [sp, #88] ; load thresh @@ -115,18 +115,18 @@ h_next vpop {d8-d15} pop {r4-r8, pc} - ENDP ; |vp9_mb_lpf_horizontal_edge_w_neon| + ENDP ; |vp9_lpf_horizontal_16_neon| -; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) +; void vp9_lpf_vertical_16_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; 
const uint8_t *limit, +; const uint8_t *thresh) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, -|vp9_mb_lpf_vertical_edge_w_neon| PROC +|vp9_lpf_vertical_16_neon| PROC push {r4-r8, lr} vpush {d8-d15} ldr r4, [sp, #88] ; load thresh @@ -279,7 +279,7 @@ v_end vpop {d8-d15} pop {r4-r8, pc} - ENDP ; |vp9_mb_lpf_vertical_edge_w_neon| + ENDP ; |vp9_lpf_vertical_16_neon| ; void vp9_wide_mbfilter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the @@ -439,6 +439,9 @@ v_end tst r7, #1 bxne lr + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + ; mbfilter flat && mask branch ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's ; and using vibt on the q's? diff --git a/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm b/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm new file mode 100644 index 0000000..dc9856f --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm @@ -0,0 +1,634 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + EXPORT |vp9_v_predictor_4x4_neon| + EXPORT |vp9_v_predictor_8x8_neon| + EXPORT |vp9_v_predictor_16x16_neon| + EXPORT |vp9_v_predictor_32x32_neon| + EXPORT |vp9_h_predictor_4x4_neon| + EXPORT |vp9_h_predictor_8x8_neon| + EXPORT |vp9_h_predictor_16x16_neon| + EXPORT |vp9_h_predictor_32x32_neon| + EXPORT |vp9_tm_predictor_4x4_neon| + EXPORT |vp9_tm_predictor_8x8_neon| + EXPORT |vp9_tm_predictor_16x16_neon| + EXPORT |vp9_tm_predictor_32x32_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_4x4_neon| PROC + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_4x4_neon| + +;void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_8x8_neon| PROC + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_8x8_neon| + +;void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_16x16_neon| PROC + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + 
vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_16x16_neon| + +;void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_32x32_neon| PROC + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + ENDP ; |vp9_v_predictor_32x32_neon| + +;void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_4x4_neon| PROC + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_4x4_neon| + +;void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_8x8_neon| PROC + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, 
d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_8x8_neon| + +;void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_16x16_neon| PROC + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_16x16_neon| + +;void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_32x32_neon| PROC + sub r1, r1, #16 + mov r2, #2 +loop_h + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! 
+ vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + ENDP ; |vp9_h_predictor_32x32_neon| + +;void vp9_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_4x4_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 d0, r12 + + ; Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + ldrb r12, [r3], #1 + ldrb r2, [r3], #1 + vdup.u16 q1, r12 + vdup.u16 q2, r2 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + ; 3rd row and 4th row + ldrb r12, [r3], #1 + ldrb r2, [r3], #1 + vdup.u16 q1, r12 + vdup.u16 q2, r2 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + ENDP ; |vp9_tm_predictor_4x4_neon| + +;void vp9_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_8x8_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 d0, r12 + + ; 
preload 8 left + vld1.8 {d30}, [r3] + + ; Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + ; 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + ENDP ; |vp9_tm_predictor_8x8_neon| + +;void vp9_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_16x16_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 q0, r12 + + ; Load above 16 pixels + vld1.8 {q1}, [r2] + + ; preload 8 left pixels into d18 + vld1.8 {d18}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d1 + + vmovl.u8 q10, d18 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon + ; Process two rows. 
+ vdup.16 q0, d20[0] + vdup.16 q8, d20[1] + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d20[2] ; preload next 2 rows data + vdup.16 q8, d20[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + ; Process two rows. + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[0] ; preload next 2 rows data + vdup.16 q8, d21[1] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[2] ; preload next 2 rows data + vdup.16 q8, d21[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! ; preload 8 left pixels into d18 + vmovl.u8 q10, d18 + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + subs r2, r2, #1 + bgt loop_16x16_neon + + bx lr + ENDP ; |vp9_tm_predictor_16x16_neon| + +;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_32x32_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 q0, r12 + + ; Load above 32 pixels + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] + + ; preload 8 left pixels + vld1.8 {d26}, [r3]! 
+ + ; Compute above - ytop_left + vsubl.u8 q8, d2, d0 + vsubl.u8 q9, d3, d1 + vsubl.u8 q10, d4, d0 + vsubl.u8 q11, d5, d1 + + vmovl.u8 q3, d26 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 4 times to process 32 rows. + mov r2, #4 + +loop_32x32_neon + ; Process two rows. + vdup.16 q0, d6[0] + vdup.16 q2, d6[1] + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q1, d6[2] + vdup.16 q2, d6[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q1, q8 + vadd.s16 q13, q1, q9 + vadd.s16 q14, q1, q10 + vadd.s16 q15, q1, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[0] + vdup.16 q2, d7[1] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. 
+ vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! ; preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + ENDP ; |vp9_tm_predictor_32x32_neon| + + END diff --git a/libvpx/vp9/common/generic/vp9_systemdependent.c b/libvpx/vp9/common/generic/vp9_systemdependent.c deleted file mode 100644 index 536febb..0000000 --- a/libvpx/vp9/common/generic/vp9_systemdependent.c +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "./vpx_config.h" -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_onyxc_int.h" - -void vp9_machine_specific_config(VP9_COMMON *cm) { - (void)cm; - vp9_rtcd(); -} diff --git a/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h index 644264f..6ebea9f 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h +++ b/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_COMMON_DSPR2_H_ -#define VP9_COMMON_VP9_COMMON_DSPR2_H_ +#ifndef VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_ +#define VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_ #include <assert.h> @@ -17,6 +17,10 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" +#ifdef __cplusplus +extern "C" { +#endif + #if HAVE_DSPR2 #define CROP_WIDTH 512 extern uint8_t *vp9_ff_cropTbl; @@ -81,8 +85,8 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { ); } -void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -114,4 +118,8 @@ void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, int w, int h); #endif // #if HAVE_DSPR2 -#endif // VP9_COMMON_VP9_COMMON_DSPR2_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_MIPS_DSPR2_VP9_COMMON_DSPR2_H_ diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c index 1b2f550..19c582f 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct16_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_10, step1_11, step1_12, step1_13; @@ -404,8 +404,8 @@ static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, } } -static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int i; 
int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -905,13 +905,13 @@ void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct16_1d_rows_dspr2(input, out, 16); + idct16_rows_dspr2(input, out, 16); // Then transform columns and add to dest - idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, dest_stride); } -static void iadst16_1d(const int16_t *input, int16_t *output) { +static void iadst16(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -1099,16 +1099,16 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - idct16_1d_rows_dspr2(input, outptr, 16); - idct16_1d_cols_add_blk_dspr2(out, dest, pitch); + idct16_rows_dspr2(input, outptr, 16); + idct16_cols_add_blk_dspr2(out, dest, pitch); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - idct16_1d_rows_dspr2(input, outptr, 16); + idct16_rows_dspr2(input, outptr, 16); outptr = out; for (i = 0; i < 16; ++i) { - iadst16_1d(outptr, temp_out); + iadst16(outptr, temp_out); for (j = 0; j < 16; ++j) dest[j * pitch + i] = @@ -1125,7 +1125,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, /* prefetch row */ vp9_prefetch_load((const uint8_t *)(input + 16)); - iadst16_1d(input, outptr); + iadst16(input, outptr); input += 16; outptr += 16; } @@ -1134,7 +1134,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, for (j = 0; j < 16; ++j) temp_in[j * 16 + i] = out[i * 16 + j]; - idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch); + idct16_cols_add_blk_dspr2(temp_in, dest, pitch); } break; case ADST_ADST: // ADST in both directions @@ -1145,7 +1145,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, /* prefetch row */ 
vp9_prefetch_load((const uint8_t *)(input + 16)); - iadst16_1d(input, outptr); + iadst16(input, outptr); input += 16; outptr += 16; } @@ -1153,7 +1153,7 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - iadst16_1d(temp_in, temp_out); + iadst16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * pitch + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) @@ -1183,7 +1183,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. - idct16_1d_rows_dspr2(input, outptr, 4); + idct16_rows_dspr2(input, outptr, 4); outptr += 4; for (i = 0; i < 6; ++i) { @@ -1213,7 +1213,7 @@ void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, } // Then transform columns - idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, dest_stride); } void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c index 5e92db3..132d88c 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -18,8 +18,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c index 
bc67594..74a90b0 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct32_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; @@ -882,10 +882,10 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - idct32_1d_rows_dspr2(input, outptr, 32); + idct32_rows_dspr2(input, outptr, 32); // Columns - vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); + vp9_idct32_cols_add_blk_dspr2(out, dest, dest_stride); } void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, @@ -903,7 +903,7 @@ void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - idct32_1d_rows_dspr2(input, outptr, 8); + idct32_rows_dspr2(input, outptr, 8); outptr += 8; __asm__ __volatile__ ( @@ -947,7 +947,7 @@ void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, } // Columns - vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride); + vp9_idct32_cols_add_blk_dspr2(out, dest, stride); } void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index 5b7aa5e..1990348 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -19,7 +19,7 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { +static void vp9_idct4_rows_dspr2(const int16_t *input, int16_t 
*output) { int16_t step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; @@ -104,7 +104,7 @@ static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { } } -static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, +static void vp9_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { int16_t step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; @@ -240,10 +240,10 @@ void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_rows_dspr2(input, outptr); // Columns - vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); } void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, @@ -319,7 +319,7 @@ void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, } } -static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) { +static void iadst4_dspr2(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0, x1, x2, x3; @@ -379,16 +379,16 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - vp9_idct4_1d_rows_dspr2(input, outptr); - vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vp9_idct4_rows_dspr2(input, outptr); + vp9_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_rows_dspr2(input, outptr); outptr = out; for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(outptr, temp_out); + iadst4_dspr2(outptr, temp_out); for (j = 0; j < 4; ++j) dest[j * dest_stride + i] = @@ -400,7 +400,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, break; case DCT_ADST: // DCT in vertical, ADST in horizontal for (i = 0; i < 4; ++i) { - 
iadst4_1d_dspr2(input, outptr); + iadst4_dspr2(input, outptr); input += 4; outptr += 4; } @@ -410,11 +410,11 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 4 + j] = out[j * 4 + i]; } } - vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + vp9_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 4; ++i) { - iadst4_1d_dspr2(input, outptr); + iadst4_dspr2(input, outptr); input += 4; outptr += 4; } @@ -422,7 +422,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - iadst4_1d_dspr2(temp_in, temp_out); + iadst4_dspr2(temp_in, temp_out); for (j = 0; j < 4; ++j) dest[j * dest_stride + i] = diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index 93a0840..acccaea 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -19,8 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { +static void idct8_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; const int const_2_power_13 = 8192; int Temp0, Temp1, Temp2, Temp3, Temp4; @@ -200,8 +200,8 @@ static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, } } -static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int Temp0, Temp1, Temp2, Temp3; int i; @@ -462,13 +462,13 @@ void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform 
rows - idct8_1d_rows_dspr2(input, outptr, 8); + idct8_rows_dspr2(input, outptr, 8); // Then transform columns and add to dest - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); } -static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) { +static void iadst8_dspr2(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0, x1, x2, x3, x4, x5, x6, x7; @@ -563,14 +563,14 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical - idct8_1d_rows_dspr2(input, outptr, 8); - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_rows_dspr2(input, outptr, 8); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal - idct8_1d_rows_dspr2(input, outptr, 8); + idct8_rows_dspr2(input, outptr, 8); for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(&out[i * 8], temp_out); + iadst8_dspr2(&out[i * 8], temp_out); for (j = 0; j < 8; ++j) dest[j * dest_stride + i] = @@ -580,7 +580,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, break; case DCT_ADST: // DCT in vertical, ADST in horizontal for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(input, outptr); + iadst8_dspr2(input, outptr); input += 8; outptr += 8; } @@ -590,11 +590,11 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 8 + j] = out[j * 8 + i]; } } - idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 8; ++i) { - iadst8_1d_dspr2(input, outptr); + iadst8_dspr2(input, outptr); input += 8; outptr += 8; } @@ -603,7 +603,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - iadst8_1d_dspr2(temp_in, temp_out); + 
iadst8_dspr2(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * dest_stride + i] = @@ -631,7 +631,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, ); // First transform rows - idct8_1d_rows_dspr2(input, outptr, 4); + idct8_rows_dspr2(input, outptr, 4); outptr += 4; @@ -659,7 +659,7 @@ void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, // Then transform columns and add to dest - idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); } void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c index 36cfc83..3df7f4c 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c @@ -20,12 +20,12 @@ #include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" #if HAVE_DSPR2 -void vp9_loop_filter_horizontal_edge_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_4_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { uint8_t i; uint32_t mask; uint32_t hev; @@ -114,12 +114,12 @@ void vp9_loop_filter_horizontal_edge_dspr2(unsigned char *s, } } -void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_vertical_4_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { uint8_t i; uint32_t mask, hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; @@ -306,4 +306,57 @@ void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s, } } } + +void vp9_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */, + const 
uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, + 1); +} + +void vp9_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + vp9_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); + vp9_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); +} #endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h index 98bfcfa..008cf8c 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h +++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h @@ -17,6 +17,10 @@ #include "vp9/common/vp9_common.h" #include 
"vp9/common/vp9_onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + #if HAVE_DSPR2 /* inputs & outputs are quad-byte vectors */ static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev, @@ -752,4 +756,8 @@ static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, *oq6 = res_oq6; } #endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h index 4cb2ebb..ca01a6a 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h +++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h @@ -17,6 +17,10 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + #if HAVE_DSPR2 #define STORE_F0() { \ __asm__ __volatile__ ( \ @@ -467,4 +471,8 @@ } #endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h index b9e0aca..5b0d9cc 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h +++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h @@ -17,6 +17,10 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + #if HAVE_DSPR2 /* processing 4 pixels at the same time * compute hev and mask in the same function */ @@ -362,4 +366,8 @@ static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3, *flat2 = flat1; } #endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c index 
adfd755..7cd0b63 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c @@ -20,12 +20,12 @@ #include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" #if HAVE_DSPR2 -void vp9_mbloop_filter_horizontal_edge_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_8_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { uint32_t mask; uint32_t hev, flat; uint8_t i; @@ -319,12 +319,12 @@ void vp9_mbloop_filter_horizontal_edge_dspr2(unsigned char *s, } } -void vp9_mbloop_filter_vertical_edge_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_vertical_8_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { uint8_t i; uint32_t mask, hev, flat; uint8_t *s1, *s2, *s3, *s4; diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c index 0759755..6c94674 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c @@ -20,12 +20,12 @@ #include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" #if HAVE_DSPR2 -void vp9_mb_lpf_horizontal_edge_w_dspr2(unsigned char *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_16_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { uint32_t mask; uint32_t hev, flat, flat2; uint8_t i; diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c 
b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c index 9e9171c..851fc6c 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c @@ -20,11 +20,11 @@ #include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" #if HAVE_DSPR2 -void vp9_mb_lpf_vertical_edge_w_dspr2(uint8_t *s, - int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { +void vp9_lpf_vertical_16_dspr2(uint8_t *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { uint8_t i; uint32_t mask, hev, flat, flat2; uint8_t *s1, *s2, *s3, *s4; diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c index d298160..08ab27a 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -8,14 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" -#include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_systemdependent.h" @@ -31,27 +29,6 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) { vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO)); } -void vp9_free_frame_buffers(VP9_COMMON *cm) { - int i; - - for (i = 0; i < NUM_YV12_BUFFERS; i++) - vp9_free_frame_buffer(&cm->yv12_fb[i]); - - vp9_free_frame_buffer(&cm->post_proc_buffer); - - vpx_free(cm->mip); - vpx_free(cm->prev_mip); - vpx_free(cm->last_frame_seg_map); - vpx_free(cm->mi_grid_base); - vpx_free(cm->prev_mi_grid_base); - - cm->mip = NULL; - cm->prev_mip = NULL; - cm->last_frame_seg_map = NULL; - cm->mi_grid_base = NULL; - cm->prev_mi_grid_base = NULL; -} - static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { cm->mi_cols = aligned_width >> MI_SIZE_LOG2; 
cm->mi_rows = aligned_height >> MI_SIZE_LOG2; @@ -69,57 +46,91 @@ static void setup_mi(VP9_COMMON *cm) { cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; vpx_memset(cm->mip, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); vpx_memset(cm->mi_grid_base, 0, cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); - vp9_update_mode_info_border(cm, cm->mip); vp9_update_mode_info_border(cm, cm->prev_mip); } +static int alloc_mi(VP9_COMMON *cm, int mi_size) { + cm->mip = (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip)); + if (cm->mip == NULL) + return 1; + + cm->prev_mip = (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->prev_mip)); + if (cm->prev_mip == NULL) + return 1; + + cm->mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); + if (cm->mi_grid_base == NULL) + return 1; + + cm->prev_mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); + if (cm->prev_mi_grid_base == NULL) + return 1; + + return 0; +} + +static void free_mi(VP9_COMMON *cm) { + vpx_free(cm->mip); + vpx_free(cm->prev_mip); + vpx_free(cm->mi_grid_base); + vpx_free(cm->prev_mi_grid_base); + + cm->mip = NULL; + cm->prev_mip = NULL; + cm->mi_grid_base = NULL; + cm->prev_mi_grid_base = NULL; +} + +void vp9_free_frame_buffers(VP9_COMMON *cm) { + int i; + + for (i = 0; i < FRAME_BUFFERS; ++i) { + vp9_free_frame_buffer(&cm->frame_bufs[i].buf); + + if (cm->frame_bufs[i].ref_count > 0 && + cm->frame_bufs[i].raw_frame_buffer.data != NULL) { + cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer); + cm->frame_bufs[i].ref_count = 0; + } + } + + vp9_free_frame_buffer(&cm->post_proc_buffer); + + free_mi(cm); + + vpx_free(cm->last_frame_seg_map); + cm->last_frame_seg_map = NULL; +} + int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); const int 
aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); const int ss_x = cm->subsampling_x; const int ss_y = cm->subsampling_y; - int mi_size; if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) + VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) goto fail; set_mb_mi(cm, aligned_width, aligned_height); - // Allocation - mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE); - - vpx_free(cm->mip); - cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!cm->mip) - goto fail; - - vpx_free(cm->prev_mip); - cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!cm->prev_mip) - goto fail; - - vpx_free(cm->mi_grid_base); - cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); - if (!cm->mi_grid_base) - goto fail; - - vpx_free(cm->prev_mi_grid_base); - cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); - if (!cm->prev_mi_grid_base) + free_mi(cm); + if (alloc_mi(cm, cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE))) goto fail; setup_mi(cm); // Create the segmentation map structure and set to 0. 
vpx_free(cm->last_frame_seg_map); - cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); + cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1); if (!cm->last_frame_seg_map) goto fail; @@ -137,57 +148,37 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); const int ss_x = cm->subsampling_x; const int ss_y = cm->subsampling_y; - int mi_size; vp9_free_frame_buffers(cm); - for (i = 0; i < NUM_YV12_BUFFERS; i++) { - cm->fb_idx_ref_cnt[i] = 0; - if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) + for (i = 0; i < FRAME_BUFFERS; i++) { + cm->frame_bufs[i].ref_count = 0; + if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height, + ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; } - cm->new_fb_idx = NUM_YV12_BUFFERS - 1; - cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1; - - for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++) - cm->active_ref_idx[i] = i; + cm->new_fb_idx = FRAME_BUFFERS - 1; + cm->frame_bufs[cm->new_fb_idx].ref_count = 1; - for (i = 0; i < NUM_REF_FRAMES; i++) { + for (i = 0; i < REF_FRAMES; i++) { cm->ref_frame_map[i] = i; - cm->fb_idx_ref_cnt[i] = 1; + cm->frame_bufs[i].ref_count = 1; } if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) + VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; set_mb_mi(cm, aligned_width, aligned_height); - // Allocation - mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE); - - cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!cm->mip) - goto fail; - - cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!cm->prev_mip) - goto fail; - - cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); - if (!cm->mi_grid_base) - goto fail; - - cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); - if (!cm->prev_mi_grid_base) + if (alloc_mi(cm, cm->mode_info_stride * 
(cm->mi_rows + MI_BLOCK_SIZE))) goto fail; setup_mi(cm); // Create the segmentation map structure and set to 0. - cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); + cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1); if (!cm->last_frame_seg_map) goto fail; @@ -198,22 +189,13 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { return 1; } -void vp9_create_common(VP9_COMMON *cm) { - vp9_machine_specific_config(cm); - - cm->tx_mode = ONLY_4X4; - cm->comp_pred_mode = HYBRID_PREDICTION; -} - void vp9_remove_common(VP9_COMMON *cm) { vp9_free_frame_buffers(cm); + vp9_free_internal_frame_buffers(&cm->int_frame_buffers); } void vp9_initialize_common() { vp9_init_neighbors(); - vp9_coef_tree_initialize(); - vp9_entropy_mode_init(); - vp9_entropy_mv_init(); } void vp9_update_frame_size(VP9_COMMON *cm) { @@ -227,3 +209,19 @@ void vp9_update_frame_size(VP9_COMMON *cm) { if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } + +void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { + // Current mip will be the prev_mip for the next frame. + MODE_INFO *temp = cm->prev_mip; + MODE_INFO **temp2 = cm->prev_mi_grid_base; + cm->prev_mip = cm->mip; + cm->mip = temp; + cm->prev_mi_grid_base = cm->mi_grid_base; + cm->mi_grid_base = temp2; + + // Update the upper left visible macroblock ptrs. 
+ cm->mi = cm->mip + cm->mode_info_stride + 1; + cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; +} diff --git a/libvpx/vp9/common/vp9_alloccommon.h b/libvpx/vp9/common/vp9_alloccommon.h index cf8dca5..fca6935 100644 --- a/libvpx/vp9/common/vp9_alloccommon.h +++ b/libvpx/vp9/common/vp9_alloccommon.h @@ -14,11 +14,14 @@ #include "vp9/common/vp9_onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp9_initialize_common(); void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi); -void vp9_create_common(VP9_COMMON *cm); void vp9_remove_common(VP9_COMMON *cm); int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height); @@ -28,4 +31,10 @@ void vp9_free_frame_buffers(VP9_COMMON *cm); void vp9_update_frame_size(VP9_COMMON *cm); +void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ALLOCCOMMON_H_ diff --git a/libvpx/vp9/common/vp9_blockd.c b/libvpx/vp9/common/vp9_blockd.c new file mode 100644 index 0000000..e1d1318 --- /dev/null +++ b/libvpx/vp9/common/vp9_blockd.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/common/vp9_blockd.h" + +MB_PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b) { + if (b == 0 || b == 2) { + if (!left_mi || is_inter_block(&left_mi->mbmi)) + return DC_PRED; + + return get_y_mode(left_mi, b + 1); + } else { + assert(b == 1 || b == 3); + return cur_mi->bmi[b - 1].as_mode; + } +} + +MB_PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b) { + if (b == 0 || b == 1) { + if (!above_mi || is_inter_block(&above_mi->mbmi)) + return DC_PRED; + + return get_y_mode(above_mi, b + 2); + } else { + assert(b == 2 || b == 3); + return cur_mi->bmi[b - 2].as_mode; + } +} + +void vp9_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO* mbmi = &xd->mi_8x8[0]->mbmi; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) + : mbmi->tx_size; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int step = 1 << (tx_size << 1); + int i; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + int r, c; + + int max_blocks_wide = num_4x4_w; + int max_blocks_high = num_4x4_h; + + // xd->mb_to_right_edge is in units of pixels * 8. This converts + // it to 4x4 block sizes. 
+ if (xd->mb_to_right_edge < 0) + max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + + if (xd->mb_to_bottom_edge < 0) + max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + i = 0; + // Unlike the normal case - in here we have to keep track of the + // row and column of the blocks we use so that we know if we are in + // the unrestricted motion border. + for (r = 0; r < num_4x4_h; r += (1 << tx_size)) { + for (c = 0; c < num_4x4_w; c += (1 << tx_size)) { + if (r < max_blocks_high && c < max_blocks_wide) + visit(plane, i, plane_bsize, tx_size, arg); + i += step; + } + } + } else { + for (i = 0; i < num_4x4_w * num_4x4_h; i += step) + visit(plane, i, plane_bsize, tx_size, arg); + } +} + +void vp9_foreach_transformed_block(const MACROBLOCKD* const xd, + BLOCK_SIZE bsize, + foreach_transformed_block_visitor visit, + void *arg) { + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; plane++) + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); +} + +void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff) { + ENTROPY_CONTEXT *const a = pd->above_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_context + loff; + const int tx_size_in_blocks = 1 << tx_size; + + // above + if (has_eob && xd->mb_to_right_edge < 0) { + int i; + const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] + + (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + int above_contexts = tx_size_in_blocks; + if (above_contexts + aoff > blocks_wide) + above_contexts = blocks_wide - aoff; + + for (i = 0; i < above_contexts; ++i) + a[i] = has_eob; + for (i = above_contexts; i < tx_size_in_blocks; ++i) + a[i] = 0; + } else { + vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } + + // left + if (has_eob && xd->mb_to_bottom_edge < 0) { + int i; + const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] + + 
(xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + int left_contexts = tx_size_in_blocks; + if (left_contexts + loff > blocks_high) + left_contexts = blocks_high - loff; + + for (i = 0; i < left_contexts; ++i) + l[i] = has_eob; + for (i = left_contexts; i < tx_size_in_blocks; ++i) + l[i] = 0; + } else { + vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } +} + +void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y; + xd->plane[i].subsampling_x = i ? ss_x : 0; + xd->plane[i].subsampling_y = i ? ss_y : 0; + } +#if CONFIG_ALPHA + // TODO(jkoleszar): Using the Y w/h for now + xd->plane[3].plane_type = PLANE_TYPE_Y; + xd->plane[3].subsampling_x = 0; + xd->plane[3].subsampling_y = 0; +#endif +} diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index c5da375..ca5a0c2 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -24,10 +24,14 @@ #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_treecoder.h" -#define BLOCK_SIZE_GROUPS 4 -#define MBSKIP_CONTEXTS 3 +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_SIZE_GROUPS 4 +#define SKIP_CONTEXTS 3 +#define INTER_MODE_CONTEXTS 7 /* Segment Feature Masks */ #define MAX_MV_REF_CANDIDATES 2 @@ -37,8 +41,9 @@ #define REF_CONTEXTS 5 typedef enum { - PLANE_TYPE_Y_WITH_DC, - PLANE_TYPE_UV, + PLANE_TYPE_Y = 0, + PLANE_TYPE_UV = 1, + PLANE_TYPES } PLANE_TYPE; typedef char ENTROPY_CONTEXT; @@ -84,7 +89,6 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { #define INTER_OFFSET(mode) ((mode) - NEARESTMV) - /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. 
*/ @@ -114,10 +118,6 @@ static INLINE int mi_width_log2(BLOCK_SIZE sb_type) { return mi_width_log2_lookup[sb_type]; } -static INLINE int mi_height_log2(BLOCK_SIZE sb_type) { - return mi_height_log2_lookup[sb_type]; -} - // This structure now relates to 8x8 block regions. typedef struct { MB_PREDICTION_MODE mode, uv_mode; @@ -125,17 +125,16 @@ typedef struct { TX_SIZE tx_size; int_mv mv[2]; // for each reference frame used int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; - int_mv best_mv[2]; uint8_t mode_context[MAX_REF_FRAMES]; - unsigned char skip_coeff; // 0=need to decode coeffs, 1=no coefficients + unsigned char skip; // 0=need to decode coeffs, 1=no coefficients unsigned char segment_id; // Segment id for this block. // Flags used for prediction status of various bit-stream signals unsigned char seg_id_predicted; - INTERPOLATION_TYPE interp_filter; + INTERP_FILTER interp_filter; BLOCK_SIZE sb_type; } MB_MODE_INFO; @@ -145,6 +144,11 @@ typedef struct { b_mode_info bmi[4]; } MODE_INFO; +static INLINE MB_PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) { + return mi->mbmi.sb_type < BLOCK_8X8 ? 
mi->bmi[block].as_mode + : mi->mbmi.mode; +} + static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[0] > INTRA_FRAME; } @@ -153,6 +157,12 @@ static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[1] > INTRA_FRAME; } +MB_PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b); + +MB_PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b); + enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 @@ -170,33 +180,35 @@ struct buf_2d { }; struct macroblockd_plane { - int16_t *qcoeff; int16_t *dqcoeff; - uint16_t *eobs; PLANE_TYPE plane_type; int subsampling_x; int subsampling_y; struct buf_2d dst; struct buf_2d pre[2]; - int16_t *dequant; + const int16_t *dequant; ENTROPY_CONTEXT *above_context; ENTROPY_CONTEXT *left_context; }; #define BLOCK_OFFSET(x, i) ((x) + (i) * 16) +typedef struct RefBuffer { + // TODO(dkovalev): idx is not really required and should be removed, now it + // is used in vp9_onyxd_if.c + int idx; + YV12_BUFFER_CONFIG *buf; + struct scale_factors sf; +} RefBuffer; + typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; - struct scale_factors scale_factor[2]; - - MODE_INFO *last_mi; int mode_info_stride; // A NULL indicates that the 8x8 is not part of the image MODE_INFO **mi_8x8; MODE_INFO **prev_mi_8x8; - MODE_INFO *mi_stream; int up_available; int left_available; @@ -207,11 +219,20 @@ typedef struct macroblockd { int mb_to_top_edge; int mb_to_bottom_edge; + /* pointers to reference frames */ + RefBuffer *block_refs[2]; + + /* pointer to current frame */ + const YV12_BUFFER_CONFIG *cur_buf; + + /* mc buffer */ + DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); + int lossless; /* Inverse transform function pointers. 
*/ void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); - struct subpix_fn_table subpix; + const InterpKernel *interp_kernel; int corrupted; @@ -225,182 +246,74 @@ typedef struct macroblockd { -static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) { +static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, + PARTITION_TYPE partition) { const BLOCK_SIZE subsize = subsize_lookup[partition][bsize]; assert(subsize < BLOCK_SIZES); return subsize; } -extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT]; +extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES]; + +static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, + const MACROBLOCKD *xd) { + const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + + if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi)) + return DCT_DCT; + return intra_mode_to_tx_type_lookup[mbmi->mode]; +} static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, const MACROBLOCKD *xd, int ib) { const MODE_INFO *const mi = xd->mi_8x8[0]; - const MB_MODE_INFO *const mbmi = &mi->mbmi; - if (plane_type != PLANE_TYPE_Y_WITH_DC || - xd->lossless || - is_inter_block(mbmi)) + if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi)) return DCT_DCT; - return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ? - mi->bmi[ib].as_mode : mbmi->mode]; -} - -static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type, - const MACROBLOCKD *xd) { - return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT; + return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)]; } -static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type, - const MACROBLOCKD *xd) { - return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT; -} - -static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) { - int i; +void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y); - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].plane_type = i ? 
PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC; - xd->plane[i].subsampling_x = i ? ss_x : 0; - xd->plane[i].subsampling_y = i ? ss_y : 0; +static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize) { + if (bsize < BLOCK_8X8) { + return TX_4X4; + } else { + // TODO(dkovalev): Assuming YUV420 (ss_x == 1, ss_y == 1) + const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][1][1]; + return MIN(y_tx_size, max_txsize_lookup[plane_bsize]); } -#if CONFIG_ALPHA - // TODO(jkoleszar): Using the Y w/h for now - xd->plane[3].subsampling_x = 0; - xd->plane[3].subsampling_y = 0; -#endif } - static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) { - return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]); + return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type); } -static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, - const struct macroblockd_plane *pd) { +static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd) { BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; assert(bs < BLOCK_SIZES); return bs; } -static INLINE int plane_block_width(BLOCK_SIZE bsize, - const struct macroblockd_plane* plane) { - return 4 << (b_width_log2(bsize) - plane->subsampling_x); -} - -static INLINE int plane_block_height(BLOCK_SIZE bsize, - const struct macroblockd_plane* plane) { - return 4 << (b_height_log2(bsize) - plane->subsampling_y); -} - typedef void (*foreach_transformed_block_visitor)(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); -static INLINE void foreach_transformed_block_in_plane( +void vp9_foreach_transformed_block_in_plane( const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, - foreach_transformed_block_visitor visit, void *arg) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const MB_MODE_INFO* mbmi = &xd->mi_8x8[0]->mbmi; - // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") - // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 
64x64=8 - // transform size varies per plane, look it up in a common way. - const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) - : mbmi->tx_size; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const int step = 1 << (tx_size << 1); - int i; - - // If mb_to_right_edge is < 0 we are in a situation in which - // the current block size extends into the UMV and we won't - // visit the sub blocks that are wholly within the UMV. - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - int r, c; - - int max_blocks_wide = num_4x4_w; - int max_blocks_high = num_4x4_h; - - // xd->mb_to_right_edge is in units of pixels * 8. This converts - // it to 4x4 block sizes. - if (xd->mb_to_right_edge < 0) - max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); - - if (xd->mb_to_bottom_edge < 0) - max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - - i = 0; - // Unlike the normal case - in here we have to keep track of the - // row and column of the blocks we use so that we know if we are in - // the unrestricted motion border. 
- for (r = 0; r < num_4x4_h; r += (1 << tx_size)) { - for (c = 0; c < num_4x4_w; c += (1 << tx_size)) { - if (r < max_blocks_high && c < max_blocks_wide) - visit(plane, i, plane_bsize, tx_size, arg); - i += step; - } - } - } else { - for (i = 0; i < num_4x4_w * num_4x4_h; i += step) - visit(plane, i, plane_bsize, tx_size, arg); - } -} - -static INLINE void foreach_transformed_block( - const MACROBLOCKD* const xd, BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, void *arg) { - int plane; + foreach_transformed_block_visitor visit, void *arg); - for (plane = 0; plane < MAX_MB_PLANE; plane++) - foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); -} -static INLINE void foreach_transformed_block_uv( +void vp9_foreach_transformed_block( const MACROBLOCKD* const xd, BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, void *arg) { - int plane; - - for (plane = 1; plane < MAX_MB_PLANE; plane++) - foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); -} - -static int raster_block_offset(BLOCK_SIZE plane_bsize, - int raster_block, int stride) { - const int bw = b_width_log2(plane_bsize); - const int y = 4 * (raster_block >> bw); - const int x = 4 * (raster_block & ((1 << bw) - 1)); - return y * stride + x; -} -static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize, - int raster_block, int16_t *base) { - const int stride = 4 << b_width_log2(plane_bsize); - return base + raster_block_offset(plane_bsize, raster_block, stride); -} -static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize, - int raster_block, uint8_t *base, - int stride) { - return base + raster_block_offset(plane_bsize, raster_block, stride); -} - -static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, int block) { - const int bwl = b_width_log2(plane_bsize); - const int tx_cols_log2 = bwl - tx_size; - const int tx_cols = 1 << tx_cols_log2; - const int raster_mb = block >> (tx_size << 1); - const int x = (raster_mb & 
(tx_cols - 1)) << tx_size; - const int y = (raster_mb >> tx_cols_log2) << tx_size; - return x + (y << bwl); -} + foreach_transformed_block_visitor visit, void *arg); -static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, int block, - int *x, int *y) { +static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int block, + int *x, int *y) { const int bwl = b_width_log2(plane_bsize); const int tx_cols_log2 = bwl - tx_size; const int tx_cols = 1 << tx_cols_log2; @@ -409,93 +322,12 @@ static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, *y = (raster_mb >> tx_cols_log2) << tx_size; } -static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, - int plane, int block, TX_SIZE tx_size) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - uint8_t *const buf = pd->dst.buf; - const int stride = pd->dst.stride; - - int x, y; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); - x = x * 4 - 1; - y = y * 4 - 1; - // Copy a pixel into the umv if we are in a situation where the block size - // extends into the UMV. - // TODO(JBB): Should be able to do the full extend in place so we don't have - // to do this multiple times. 
- if (xd->mb_to_right_edge < 0) { - const int bw = 4 << b_width_log2(plane_bsize); - const int umv_border_start = bw + (xd->mb_to_right_edge >> - (3 + pd->subsampling_x)); - - if (x + bw > umv_border_start) - vpx_memset(&buf[y * stride + umv_border_start], - buf[y * stride + umv_border_start - 1], bw); - } - - if (xd->mb_to_bottom_edge < 0) { - if (xd->left_available || x >= 0) { - const int bh = 4 << b_height_log2(plane_bsize); - const int umv_border_start = - bh + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)); - - if (y + bh > umv_border_start) { - const uint8_t c = buf[(umv_border_start - 1) * stride + x]; - uint8_t *d = &buf[umv_border_start * stride + x]; - int i; - for (i = 0; i < bh; ++i, d += stride) - *d = c; - } - } - } -} +void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff); -static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int has_eob, int aoff, int loff) { - ENTROPY_CONTEXT *const a = pd->above_context + aoff; - ENTROPY_CONTEXT *const l = pd->left_context + loff; - const int tx_size_in_blocks = 1 << tx_size; - - // above - if (has_eob && xd->mb_to_right_edge < 0) { - int i; - const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] + - (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); - int above_contexts = tx_size_in_blocks; - if (above_contexts + aoff > blocks_wide) - above_contexts = blocks_wide - aoff; - - for (i = 0; i < above_contexts; ++i) - a[i] = has_eob; - for (i = above_contexts; i < tx_size_in_blocks; ++i) - a[i] = 0; - } else { - vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - } - - // left - if (has_eob && xd->mb_to_bottom_edge < 0) { - int i; - const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] + - (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - int left_contexts = tx_size_in_blocks; - if (left_contexts + loff > 
blocks_high) - left_contexts = blocks_high - loff; - - for (i = 0; i < left_contexts; ++i) - l[i] = has_eob; - for (i = left_contexts; i < tx_size_in_blocks; ++i) - l[i] = 0; - } else { - vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - } -} - -static int get_tx_eob(const struct segmentation *seg, int segment_id, - TX_SIZE tx_size) { - const int eob_max = 16 << (tx_size << 1); - return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; -} +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_BLOCKD_H_ diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h index 36d1cdf..2dccb70 100644 --- a/libvpx/vp9/common/vp9_common.h +++ b/libvpx/vp9/common/vp9_common.h @@ -18,6 +18,11 @@ #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_systemdependent.h" + +#ifdef __cplusplus +extern "C" { +#endif #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) @@ -55,16 +60,8 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -static int get_unsigned_bits(unsigned int num_values) { - int cat = 0; - if (num_values <= 1) - return 0; - num_values--; - while (num_values > 0) { - cat++; - num_values >>= 1; - } - return cat; +static INLINE int get_unsigned_bits(unsigned int num_values) { + return num_values > 0 ? 
get_msb(num_values) + 1 : 0; } #if CONFIG_DEBUG @@ -91,4 +88,8 @@ static int get_unsigned_bits(unsigned int num_values) { #define VP9_FRAME_MARKER 0x2 +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c index f858900..a927823 100644 --- a/libvpx/vp9/common/vp9_common_data.c +++ b/libvpx/vp9/common/vp9_common_data.c @@ -26,8 +26,6 @@ const int mi_width_log2_lookup[BLOCK_SIZES] = {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8}; -const int mi_height_log2_lookup[BLOCK_SIZES] = - {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; const int num_8x8_blocks_high_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; @@ -108,12 +106,6 @@ const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = { TX_16X16, TX_16X16, TX_16X16, TX_32X32, TX_32X32, TX_32X32, TX_32X32 }; -const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = { - TX_4X4, TX_4X4, TX_4X4, - TX_4X4, TX_4X4, TX_4X4, - TX_8X8, TX_8X8, TX_8X8, - TX_16X16, TX_16X16, TX_16X16, TX_32X32 -}; const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_4X4, // ONLY_4X4 @@ -123,8 +115,6 @@ const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_32X32, // TX_MODE_SELECT }; - - const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 @@ -143,4 +133,24 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { {{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}}, }; - +// Generates 4 bit field in which each bit set to 1 represents +// a blocksize partition 1111 means we split 64x64, 32x32, 16x16 +// and 8x8. 
1000 means we just split the 64x64 to 32x32 +const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES]= { + {15, 15}, // 4X4 - {0b1111, 0b1111} + {15, 14}, // 4X8 - {0b1111, 0b1110} + {14, 15}, // 8X4 - {0b1110, 0b1111} + {14, 14}, // 8X8 - {0b1110, 0b1110} + {14, 12}, // 8X16 - {0b1110, 0b1100} + {12, 14}, // 16X8 - {0b1100, 0b1110} + {12, 12}, // 16X16 - {0b1100, 0b1100} + {12, 8 }, // 16X32 - {0b1100, 0b1000} + {8, 12}, // 32X16 - {0b1000, 0b1100} + {8, 8 }, // 32X32 - {0b1000, 0b1000} + {8, 0 }, // 32X64 - {0b1000, 0b0000} + {0, 8 }, // 64X32 - {0b0000, 0b1000} + {0, 0 }, // 64X64 - {0b0000, 0b0000} +}; diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h index c1f6405..f419627 100644 --- a/libvpx/vp9/common/vp9_common_data.h +++ b/libvpx/vp9/common/vp9_common_data.h @@ -13,10 +13,13 @@ #include "vp9/common/vp9_enums.h" +#ifdef __cplusplus +extern "C" { +#endif + extern const int b_width_log2_lookup[BLOCK_SIZES]; extern const int b_height_log2_lookup[BLOCK_SIZES]; extern const int mi_width_log2_lookup[BLOCK_SIZES]; -extern const int mi_height_log2_lookup[BLOCK_SIZES]; extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES]; extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES]; extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES]; @@ -26,8 +29,11 @@ extern const int num_pels_log2_lookup[BLOCK_SIZES]; extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES]; extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES]; extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; -extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES]; extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES]; extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2]; -#endif // VP9_COMMON_VP9_COMMON_DATA_H +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_COMMON_DATA_H_ diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c index 
a2d864c..d30e0b4 100644 --- a/libvpx/vp9/common/vp9_convolve.c +++ b/libvpx/vp9/common/vp9_convolve.c @@ -18,40 +18,21 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x0, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. */ - const int16_t *const filter_x_base = - (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source line */ - src -= taps / 2 - 1; - +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { - /* Initial phase offset */ - int x_q4 = (int)(filter_x0 - filter_x_base) / taps; - + int x_q4 = x0_q4; for (x = 0; x < w; ++x) { - /* Per-pixel src offset */ - const int src_x = x_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_x = filter_x_base + - (x_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[src_x + k] * filter_x[k]; - + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - - /* Move to the next source pixel */ x_q4 += x_step_q4; } src += src_stride; @@ -59,41 +40,22 @@ static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x0, int x_step_q4, 
- const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. */ - const int16_t *const filter_x_base = - (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source line */ - src -= taps / 2 - 1; - +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { - /* Initial phase offset */ - int x_q4 = (int)(filter_x0 - filter_x_base) / taps; - + int x_q4 = x0_q4; for (x = 0; x < w; ++x) { - /* Per-pixel src offset */ - const int src_x = x_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_x = filter_x_base + - (x_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[src_x + k] * filter_x[k]; - + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; dst[x] = ROUND_POWER_OF_TWO(dst[x] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); - - /* Move to the next source pixel */ + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); x_q4 += x_step_q4; } src += src_stride; @@ -101,41 +63,22 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y0, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. 
*/ - const int16_t *const filter_y_base = - (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source column */ - src -= src_stride * (taps / 2 - 1); +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { - /* Initial phase offset */ - int y_q4 = (int)(filter_y0 - filter_y_base) / taps; - + int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - /* Per-pixel src offset */ - const int src_y = y_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_y = filter_y_base + - (y_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[(src_y + k) * src_stride] * filter_y[k]; - - dst[y * dst_stride] = - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - - /* Move to the next source pixel */ + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); y_q4 += y_step_q4; } ++src; @@ -143,41 +86,23 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y0, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. 
*/ - const int16_t *const filter_y_base = - (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source column */ - src -= src_stride * (taps / 2 - 1); +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { - /* Initial phase offset */ - int y_q4 = (int)(filter_y0 - filter_y_base) / taps; - + int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - /* Per-pixel src offset */ - const int src_y = y_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_y = filter_y_base + - (y_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[(src_y + k) * src_stride] * filter_y[k]; - + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); - - /* Move to the next source pixel */ + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); y_q4 += y_step_q4; } ++src; @@ -185,33 +110,42 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - /* Fixed size intermediate buffer places limits on parameters. - * Maximum intermediate_height is 324, for y_step_q4 == 80, - * h == 64, taps == 8. 
- * y_step_q4 of 80 allows for 1/10 scale for 5 layer svc - */ +static void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, + int w, int h) { + // Fixed size intermediate buffer places limits on parameters. + // Maximum intermediate_height is 324, for y_step_q4 == 80, + // h == 64, taps == 8. + // y_step_q4 of 80 allows for 1/10 scale for 5 layer svc uint8_t temp[64 * 324]; - int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps; + int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(taps <= 8); assert(y_step_q4 <= 80); assert(x_step_q4 <= 80); if (intermediate_height < h) intermediate_height = h; - convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, - intermediate_height, taps); - convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, taps); + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + x_filters, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + y_filters, y0_q4, y_step_q4, w, h); +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. 
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); } void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -219,8 +153,11 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h); } void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -228,8 +165,11 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h); } void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -237,8 +177,10 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + convolve_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h); } void 
vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -246,8 +188,10 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h); } void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, @@ -255,8 +199,15 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve(src, src_stride, dst, dst_stride, + filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h); } void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, @@ -269,9 +220,9 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, assert(w <= 64); assert(h <= 64); - vp9_convolve8(src, src_stride, temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - vp9_convolve_avg(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vp9_convolve8_c(src, src_stride, temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); } void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vp9/common/vp9_convolve.h index 29d4990..6bf71fc 
100644 --- a/libvpx/vp9/common/vp9_convolve.h +++ b/libvpx/vp9/common/vp9_convolve.h @@ -13,10 +13,18 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_CONVOLVE_H_ diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c index 355ac1a..8f150a4 100644 --- a/libvpx/vp9/common/vp9_debugmodes.c +++ b/libvpx/vp9/common/vp9_debugmodes.c @@ -22,7 +22,7 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { * and uses the passed in member offset to print out the value of an integer * for each mbmi member value in the mi structure. */ -static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor, +static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, size_t member_offset) { int mi_row; int mi_col; @@ -47,7 +47,7 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor, } fprintf(file, "\n"); } -void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { +void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) { int mi_row; int mi_col; int mi_index = 0; @@ -58,7 +58,7 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); - print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff)); + print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip)); print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); diff 
--git a/libvpx/vp9/common/vp9_default_coef_probs.h b/libvpx/vp9/common/vp9_default_coef_probs.h deleted file mode 100644 index 3b512be..0000000 --- a/libvpx/vp9/common/vp9_default_coef_probs.h +++ /dev/null @@ -1,699 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. -*/ -#ifndef VP9_COMMON_DEFAULT_COEF_PROBS_H_ -#define VP9_COMMON_DEFAULT_COEF_PROBS_H_ - -/*Generated file, included by vp9_entropy.c*/ -static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 195, 29, 183 }, - { 84, 49, 136 }, - { 8, 42, 71 } - }, { /* Coeff Band 1 */ - { 31, 107, 169 }, - { 35, 99, 159 }, - { 17, 82, 140 }, - { 8, 66, 114 }, - { 2, 44, 76 }, - { 1, 19, 32 } - }, { /* Coeff Band 2 */ - { 40, 132, 201 }, - { 29, 114, 187 }, - { 13, 91, 157 }, - { 7, 75, 127 }, - { 3, 58, 95 }, - { 1, 28, 47 } - }, { /* Coeff Band 3 */ - { 69, 142, 221 }, - { 42, 122, 201 }, - { 15, 91, 159 }, - { 6, 67, 121 }, - { 1, 42, 77 }, - { 1, 17, 31 } - }, { /* Coeff Band 4 */ - { 102, 148, 228 }, - { 67, 117, 204 }, - { 17, 82, 154 }, - { 6, 59, 114 }, - { 2, 39, 75 }, - { 1, 15, 29 } - }, { /* Coeff Band 5 */ - { 156, 57, 233 }, - { 119, 57, 212 }, - { 58, 48, 163 }, - { 29, 40, 124 }, - { 12, 30, 81 }, - { 3, 12, 31 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 191, 107, 226 }, - { 124, 117, 204 }, - { 25, 99, 155 } - }, { /* Coeff Band 1 */ - { 29, 148, 210 }, - { 37, 126, 194 }, - { 8, 93, 157 }, - { 2, 68, 118 }, - { 1, 39, 69 }, - { 1, 17, 33 } - }, { /* Coeff Band 2 */ - { 41, 151, 213 }, - { 27, 123, 193 }, - { 3, 82, 144 }, - { 1, 58, 105 }, - { 1, 32, 
60 }, - { 1, 13, 26 } - }, { /* Coeff Band 3 */ - { 59, 159, 220 }, - { 23, 126, 198 }, - { 4, 88, 151 }, - { 1, 66, 114 }, - { 1, 38, 71 }, - { 1, 18, 34 } - }, { /* Coeff Band 4 */ - { 114, 136, 232 }, - { 51, 114, 207 }, - { 11, 83, 155 }, - { 3, 56, 105 }, - { 1, 33, 65 }, - { 1, 17, 34 } - }, { /* Coeff Band 5 */ - { 149, 65, 234 }, - { 121, 57, 215 }, - { 61, 49, 166 }, - { 28, 36, 114 }, - { 12, 25, 76 }, - { 3, 16, 42 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 214, 49, 220 }, - { 132, 63, 188 }, - { 42, 65, 137 } - }, { /* Coeff Band 1 */ - { 85, 137, 221 }, - { 104, 131, 216 }, - { 49, 111, 192 }, - { 21, 87, 155 }, - { 2, 49, 87 }, - { 1, 16, 28 } - }, { /* Coeff Band 2 */ - { 89, 163, 230 }, - { 90, 137, 220 }, - { 29, 100, 183 }, - { 10, 70, 135 }, - { 2, 42, 81 }, - { 1, 17, 33 } - }, { /* Coeff Band 3 */ - { 108, 167, 237 }, - { 55, 133, 222 }, - { 15, 97, 179 }, - { 4, 72, 135 }, - { 1, 45, 85 }, - { 1, 19, 38 } - }, { /* Coeff Band 4 */ - { 124, 146, 240 }, - { 66, 124, 224 }, - { 17, 88, 175 }, - { 4, 58, 122 }, - { 1, 36, 75 }, - { 1, 18, 37 } - }, { /* Coeff Band 5 */ - { 141, 79, 241 }, - { 126, 70, 227 }, - { 66, 58, 182 }, - { 30, 44, 136 }, - { 12, 34, 96 }, - { 2, 20, 47 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 229, 99, 249 }, - { 143, 111, 235 }, - { 46, 109, 192 } - }, { /* Coeff Band 1 */ - { 82, 158, 236 }, - { 94, 146, 224 }, - { 25, 117, 191 }, - { 9, 87, 149 }, - { 3, 56, 99 }, - { 1, 33, 57 } - }, { /* Coeff Band 2 */ - { 83, 167, 237 }, - { 68, 145, 222 }, - { 10, 103, 177 }, - { 2, 72, 131 }, - { 1, 41, 79 }, - { 1, 20, 39 } - }, { /* Coeff Band 3 */ - { 99, 167, 239 }, - { 47, 141, 224 }, - { 10, 104, 178 }, - { 2, 73, 133 }, - { 1, 44, 85 }, - { 1, 22, 47 } - }, { /* Coeff Band 4 */ - { 127, 145, 243 }, - { 71, 129, 228 }, - { 17, 93, 177 }, - { 3, 61, 124 }, - { 1, 41, 84 }, - { 1, 21, 52 } - }, { /* Coeff Band 5 */ - { 157, 78, 244 }, - { 140, 72, 231 }, - { 69, 58, 184 }, - { 31, 
44, 137 }, - { 14, 38, 105 }, - { 8, 23, 61 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 125, 34, 187 }, - { 52, 41, 133 }, - { 6, 31, 56 } - }, { /* Coeff Band 1 */ - { 37, 109, 153 }, - { 51, 102, 147 }, - { 23, 87, 128 }, - { 8, 67, 101 }, - { 1, 41, 63 }, - { 1, 19, 29 } - }, { /* Coeff Band 2 */ - { 31, 154, 185 }, - { 17, 127, 175 }, - { 6, 96, 145 }, - { 2, 73, 114 }, - { 1, 51, 82 }, - { 1, 28, 45 } - }, { /* Coeff Band 3 */ - { 23, 163, 200 }, - { 10, 131, 185 }, - { 2, 93, 148 }, - { 1, 67, 111 }, - { 1, 41, 69 }, - { 1, 14, 24 } - }, { /* Coeff Band 4 */ - { 29, 176, 217 }, - { 12, 145, 201 }, - { 3, 101, 156 }, - { 1, 69, 111 }, - { 1, 39, 63 }, - { 1, 14, 23 } - }, { /* Coeff Band 5 */ - { 57, 192, 233 }, - { 25, 154, 215 }, - { 6, 109, 167 }, - { 3, 78, 118 }, - { 1, 48, 69 }, - { 1, 21, 29 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 202, 105, 245 }, - { 108, 106, 216 }, - { 18, 90, 144 } - }, { /* Coeff Band 1 */ - { 33, 172, 219 }, - { 64, 149, 206 }, - { 14, 117, 177 }, - { 5, 90, 141 }, - { 2, 61, 95 }, - { 1, 37, 57 } - }, { /* Coeff Band 2 */ - { 33, 179, 220 }, - { 11, 140, 198 }, - { 1, 89, 148 }, - { 1, 60, 104 }, - { 1, 33, 57 }, - { 1, 12, 21 } - }, { /* Coeff Band 3 */ - { 30, 181, 221 }, - { 8, 141, 198 }, - { 1, 87, 145 }, - { 1, 58, 100 }, - { 1, 31, 55 }, - { 1, 12, 20 } - }, { /* Coeff Band 4 */ - { 32, 186, 224 }, - { 7, 142, 198 }, - { 1, 86, 143 }, - { 1, 58, 100 }, - { 1, 31, 55 }, - { 1, 12, 22 } - }, { /* Coeff Band 5 */ - { 57, 192, 227 }, - { 20, 143, 204 }, - { 3, 96, 154 }, - { 1, 68, 112 }, - { 1, 42, 69 }, - { 1, 19, 32 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 212, 35, 215 }, - { 113, 47, 169 }, - { 29, 48, 105 } - }, { /* Coeff Band 1 */ - { 74, 129, 203 }, - { 106, 120, 203 }, - { 49, 107, 178 }, - { 19, 84, 144 }, - { 4, 50, 84 }, - { 1, 15, 25 } - }, { /* Coeff 
Band 2 */ - { 71, 172, 217 }, - { 44, 141, 209 }, - { 15, 102, 173 }, - { 6, 76, 133 }, - { 2, 51, 89 }, - { 1, 24, 42 } - }, { /* Coeff Band 3 */ - { 64, 185, 231 }, - { 31, 148, 216 }, - { 8, 103, 175 }, - { 3, 74, 131 }, - { 1, 46, 81 }, - { 1, 18, 30 } - }, { /* Coeff Band 4 */ - { 65, 196, 235 }, - { 25, 157, 221 }, - { 5, 105, 174 }, - { 1, 67, 120 }, - { 1, 38, 69 }, - { 1, 15, 30 } - }, { /* Coeff Band 5 */ - { 65, 204, 238 }, - { 30, 156, 224 }, - { 7, 107, 177 }, - { 2, 70, 124 }, - { 1, 42, 73 }, - { 1, 18, 34 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 225, 86, 251 }, - { 144, 104, 235 }, - { 42, 99, 181 } - }, { /* Coeff Band 1 */ - { 85, 175, 239 }, - { 112, 165, 229 }, - { 29, 136, 200 }, - { 12, 103, 162 }, - { 6, 77, 123 }, - { 2, 53, 84 } - }, { /* Coeff Band 2 */ - { 75, 183, 239 }, - { 30, 155, 221 }, - { 3, 106, 171 }, - { 1, 74, 128 }, - { 1, 44, 76 }, - { 1, 17, 28 } - }, { /* Coeff Band 3 */ - { 73, 185, 240 }, - { 27, 159, 222 }, - { 2, 107, 172 }, - { 1, 75, 127 }, - { 1, 42, 73 }, - { 1, 17, 29 } - }, { /* Coeff Band 4 */ - { 62, 190, 238 }, - { 21, 159, 222 }, - { 2, 107, 172 }, - { 1, 72, 122 }, - { 1, 40, 71 }, - { 1, 18, 32 } - }, { /* Coeff Band 5 */ - { 61, 199, 240 }, - { 27, 161, 226 }, - { 4, 113, 180 }, - { 1, 76, 129 }, - { 1, 46, 80 }, - { 1, 23, 41 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 7, 27, 153 }, - { 5, 30, 95 }, - { 1, 16, 30 } - }, { /* Coeff Band 1 */ - { 50, 75, 127 }, - { 57, 75, 124 }, - { 27, 67, 108 }, - { 10, 54, 86 }, - { 1, 33, 52 }, - { 1, 12, 18 } - }, { /* Coeff Band 2 */ - { 43, 125, 151 }, - { 26, 108, 148 }, - { 7, 83, 122 }, - { 2, 59, 89 }, - { 1, 38, 60 }, - { 1, 17, 27 } - }, { /* Coeff Band 3 */ - { 23, 144, 163 }, - { 13, 112, 154 }, - { 2, 75, 117 }, - { 1, 50, 81 }, - { 1, 31, 51 }, - { 1, 14, 23 } - }, { /* Coeff Band 4 */ - { 18, 162, 185 }, - { 6, 123, 171 }, - { 1, 
78, 125 }, - { 1, 51, 86 }, - { 1, 31, 54 }, - { 1, 14, 23 } - }, { /* Coeff Band 5 */ - { 15, 199, 227 }, - { 3, 150, 204 }, - { 1, 91, 146 }, - { 1, 55, 95 }, - { 1, 30, 53 }, - { 1, 11, 20 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 19, 55, 240 }, - { 19, 59, 196 }, - { 3, 52, 105 } - }, { /* Coeff Band 1 */ - { 41, 166, 207 }, - { 104, 153, 199 }, - { 31, 123, 181 }, - { 14, 101, 152 }, - { 5, 72, 106 }, - { 1, 36, 52 } - }, { /* Coeff Band 2 */ - { 35, 176, 211 }, - { 12, 131, 190 }, - { 2, 88, 144 }, - { 1, 60, 101 }, - { 1, 36, 60 }, - { 1, 16, 28 } - }, { /* Coeff Band 3 */ - { 28, 183, 213 }, - { 8, 134, 191 }, - { 1, 86, 142 }, - { 1, 56, 96 }, - { 1, 30, 53 }, - { 1, 12, 20 } - }, { /* Coeff Band 4 */ - { 20, 190, 215 }, - { 4, 135, 192 }, - { 1, 84, 139 }, - { 1, 53, 91 }, - { 1, 28, 49 }, - { 1, 11, 20 } - }, { /* Coeff Band 5 */ - { 13, 196, 216 }, - { 2, 137, 192 }, - { 1, 86, 143 }, - { 1, 57, 99 }, - { 1, 32, 56 }, - { 1, 13, 24 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 211, 29, 217 }, - { 96, 47, 156 }, - { 22, 43, 87 } - }, { /* Coeff Band 1 */ - { 78, 120, 193 }, - { 111, 116, 186 }, - { 46, 102, 164 }, - { 15, 80, 128 }, - { 2, 49, 76 }, - { 1, 18, 28 } - }, { /* Coeff Band 2 */ - { 71, 161, 203 }, - { 42, 132, 192 }, - { 10, 98, 150 }, - { 3, 69, 109 }, - { 1, 44, 70 }, - { 1, 18, 29 } - }, { /* Coeff Band 3 */ - { 57, 186, 211 }, - { 30, 140, 196 }, - { 4, 93, 146 }, - { 1, 62, 102 }, - { 1, 38, 65 }, - { 1, 16, 27 } - }, { /* Coeff Band 4 */ - { 47, 199, 217 }, - { 14, 145, 196 }, - { 1, 88, 142 }, - { 1, 57, 98 }, - { 1, 36, 62 }, - { 1, 15, 26 } - }, { /* Coeff Band 5 */ - { 26, 219, 229 }, - { 5, 155, 207 }, - { 1, 94, 151 }, - { 1, 60, 104 }, - { 1, 36, 62 }, - { 1, 16, 28 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 233, 29, 248 }, - { 146, 47, 220 }, - { 43, 52, 140 } - }, { /* Coeff Band 1 */ - { 100, 163, 232 }, - { 179, 161, 222 }, - { 63, 142, 204 }, - { 37, 113, 174 }, - { 26, 
89, 137 }, - { 18, 68, 97 } - }, { /* Coeff Band 2 */ - { 85, 181, 230 }, - { 32, 146, 209 }, - { 7, 100, 164 }, - { 3, 71, 121 }, - { 1, 45, 77 }, - { 1, 18, 30 } - }, { /* Coeff Band 3 */ - { 65, 187, 230 }, - { 20, 148, 207 }, - { 2, 97, 159 }, - { 1, 68, 116 }, - { 1, 40, 70 }, - { 1, 14, 29 } - }, { /* Coeff Band 4 */ - { 40, 194, 227 }, - { 8, 147, 204 }, - { 1, 94, 155 }, - { 1, 65, 112 }, - { 1, 39, 66 }, - { 1, 14, 26 } - }, { /* Coeff Band 5 */ - { 16, 208, 228 }, - { 3, 151, 207 }, - { 1, 98, 160 }, - { 1, 67, 117 }, - { 1, 41, 74 }, - { 1, 17, 31 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 17, 38, 140 }, - { 7, 34, 80 }, - { 1, 17, 29 } - }, { /* Coeff Band 1 */ - { 37, 75, 128 }, - { 41, 76, 128 }, - { 26, 66, 116 }, - { 12, 52, 94 }, - { 2, 32, 55 }, - { 1, 10, 16 } - }, { /* Coeff Band 2 */ - { 50, 127, 154 }, - { 37, 109, 152 }, - { 16, 82, 121 }, - { 5, 59, 85 }, - { 1, 35, 54 }, - { 1, 13, 20 } - }, { /* Coeff Band 3 */ - { 40, 142, 167 }, - { 17, 110, 157 }, - { 2, 71, 112 }, - { 1, 44, 72 }, - { 1, 27, 45 }, - { 1, 11, 17 } - }, { /* Coeff Band 4 */ - { 30, 175, 188 }, - { 9, 124, 169 }, - { 1, 74, 116 }, - { 1, 48, 78 }, - { 1, 30, 49 }, - { 1, 11, 18 } - }, { /* Coeff Band 5 */ - { 10, 222, 223 }, - { 2, 150, 194 }, - { 1, 83, 128 }, - { 1, 48, 79 }, - { 1, 27, 45 }, - { 1, 11, 17 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 36, 41, 235 }, - { 29, 36, 193 }, - { 10, 27, 111 } - }, { /* Coeff Band 1 */ - { 85, 165, 222 }, - { 177, 162, 215 }, - { 110, 135, 195 }, - { 57, 113, 168 }, - { 23, 83, 120 }, - { 10, 49, 61 } - }, { /* Coeff Band 2 */ - { 85, 190, 223 }, - { 36, 139, 200 }, - { 5, 90, 146 }, - { 1, 60, 103 }, - { 1, 38, 65 }, - { 1, 18, 30 } - }, { /* Coeff Band 3 */ - { 72, 202, 223 }, - { 23, 141, 199 }, - { 2, 86, 140 }, - { 1, 56, 97 }, - { 1, 36, 61 }, - { 1, 16, 27 } - }, { /* Coeff Band 4 */ - { 55, 218, 
225 }, - { 13, 145, 200 }, - { 1, 86, 141 }, - { 1, 57, 99 }, - { 1, 35, 61 }, - { 1, 13, 22 } - }, { /* Coeff Band 5 */ - { 15, 235, 212 }, - { 1, 132, 184 }, - { 1, 84, 139 }, - { 1, 57, 97 }, - { 1, 34, 56 }, - { 1, 14, 23 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 181, 21, 201 }, - { 61, 37, 123 }, - { 10, 38, 71 } - }, { /* Coeff Band 1 */ - { 47, 106, 172 }, - { 95, 104, 173 }, - { 42, 93, 159 }, - { 18, 77, 131 }, - { 4, 50, 81 }, - { 1, 17, 23 } - }, { /* Coeff Band 2 */ - { 62, 147, 199 }, - { 44, 130, 189 }, - { 28, 102, 154 }, - { 18, 75, 115 }, - { 2, 44, 65 }, - { 1, 12, 19 } - }, { /* Coeff Band 3 */ - { 55, 153, 210 }, - { 24, 130, 194 }, - { 3, 93, 146 }, - { 1, 61, 97 }, - { 1, 31, 50 }, - { 1, 10, 16 } - }, { /* Coeff Band 4 */ - { 49, 186, 223 }, - { 17, 148, 204 }, - { 1, 96, 142 }, - { 1, 53, 83 }, - { 1, 26, 44 }, - { 1, 11, 17 } - }, { /* Coeff Band 5 */ - { 13, 217, 212 }, - { 2, 136, 180 }, - { 1, 78, 124 }, - { 1, 50, 83 }, - { 1, 29, 49 }, - { 1, 14, 23 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 197, 13, 247 }, - { 82, 17, 222 }, - { 25, 17, 162 } - }, { /* Coeff Band 1 */ - { 126, 186, 247 }, - { 234, 191, 243 }, - { 176, 177, 234 }, - { 104, 158, 220 }, - { 66, 128, 186 }, - { 55, 90, 137 } - }, { /* Coeff Band 2 */ - { 111, 197, 242 }, - { 46, 158, 219 }, - { 9, 104, 171 }, - { 2, 65, 125 }, - { 1, 44, 80 }, - { 1, 17, 91 } - }, { /* Coeff Band 3 */ - { 104, 208, 245 }, - { 39, 168, 224 }, - { 3, 109, 162 }, - { 1, 79, 124 }, - { 1, 50, 102 }, - { 1, 43, 102 } - }, { /* Coeff Band 4 */ - { 84, 220, 246 }, - { 31, 177, 231 }, - { 2, 115, 180 }, - { 1, 79, 134 }, - { 1, 55, 77 }, - { 1, 60, 79 } - }, { /* Coeff Band 5 */ - { 43, 243, 240 }, - { 8, 180, 217 }, - { 1, 115, 166 }, - { 1, 84, 121 }, - { 1, 51, 67 }, - { 1, 16, 6 } - } - } - } -}; - -#endif // VP9_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index feceb66..bc12f9a 
100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -15,29 +15,8 @@ #include "vpx_mem/vpx_mem.h" #include "vpx/vpx_integer.h" -#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) -DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -DECLARE_ALIGNED(16, const uint8_t, - vp9_coefband_trans_8x8plus[1024]) = { +const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -106,50 +85,17 @@ DECLARE_ALIGNED(16, const uint8_t, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; -DECLARE_ALIGNED(16, const uint8_t, - vp9_coefband_trans_4x4[16]) = { +const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, }; -DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = { +const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; - - -/* Array indices are identical to previously-existing CONTEXT_NODE indices */ - -const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = { - -DCT_EOB_TOKEN, 2, /* 0 = EOB */ - -ZERO_TOKEN, 4, /* 1 = ZERO */ - 
-ONE_TOKEN, 6, /* 2 = ONE */ - 8, 12, /* 3 = LOW_VAL */ - -TWO_TOKEN, 10, /* 4 = TWO */ - -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ - 14, 16, /* 6 = HIGH_LOW */ - -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ - 18, 20, /* 8 = CAT_THREEFOUR */ - -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ - -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ -}; - -struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; - -/* Trees for extra bits. Probabilities are constant and - do not depend on previously encoded bits */ - -static const vp9_prob Pcat1[] = { 159}; -static const vp9_prob Pcat2[] = { 165, 145}; -static const vp9_prob Pcat3[] = { 173, 148, 140}; -static const vp9_prob Pcat4[] = { 176, 155, 140, 135}; -static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130}; -static const vp9_prob Pcat6[] = { - 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 -}; - -const vp9_tree_index vp9_coefmodel_tree[6] = { - -DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */ - -ZERO_TOKEN, 4, /* 1 = ZERO */ +const vp9_tree_index vp9_coefmodel_tree[TREE_SIZE(UNCONSTRAINED_NODES + 1)] = { + -EOB_MODEL_TOKEN, 2, + -ZERO_TOKEN, 4, -ONE_TOKEN, -TWO_TOKEN, }; @@ -162,198 +108,617 @@ const vp9_tree_index vp9_coefmodel_tree[6] = { // the probabilities for the rest of the nodes. 
// beta = 8 -static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { + +// Every odd line in this table can be generated from the even lines +// by averaging : +// vp9_pareto8_full[l][node] = (vp9_pareto8_full[l-1][node] + +// vp9_pareto8_full[l+1][node] ) >> 1; +const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = { { 3, 86, 128, 6, 86, 23, 88, 29}, + { 6, 86, 128, 11, 87, 42, 91, 52}, { 9, 86, 129, 17, 88, 61, 94, 76}, + { 12, 86, 129, 22, 88, 77, 97, 93}, { 15, 87, 129, 28, 89, 93, 100, 110}, + { 17, 87, 129, 33, 90, 105, 103, 123}, { 20, 88, 130, 38, 91, 118, 106, 136}, + { 23, 88, 130, 43, 91, 128, 108, 146}, { 26, 89, 131, 48, 92, 139, 111, 156}, + { 28, 89, 131, 53, 93, 147, 114, 163}, { 31, 90, 131, 58, 94, 156, 117, 171}, + { 34, 90, 131, 62, 94, 163, 119, 177}, { 37, 90, 132, 66, 95, 171, 122, 184}, + { 39, 90, 132, 70, 96, 177, 124, 189}, { 42, 91, 132, 75, 97, 183, 127, 194}, + { 44, 91, 132, 79, 97, 188, 129, 198}, { 47, 92, 133, 83, 98, 193, 132, 202}, + { 49, 92, 133, 86, 99, 197, 134, 205}, { 52, 93, 133, 90, 100, 201, 137, 208}, + { 54, 93, 133, 94, 100, 204, 139, 211}, { 57, 94, 134, 98, 101, 208, 142, 214}, + { 59, 94, 134, 101, 102, 211, 144, 216}, { 62, 94, 135, 105, 103, 214, 146, 218}, + { 64, 94, 135, 108, 103, 216, 148, 220}, { 66, 95, 135, 111, 104, 219, 151, 222}, + { 68, 95, 135, 114, 105, 221, 153, 223}, { 71, 96, 136, 117, 106, 224, 155, 225}, + { 73, 96, 136, 120, 106, 225, 157, 226}, { 76, 97, 136, 123, 107, 227, 159, 228}, + { 78, 97, 136, 126, 108, 229, 160, 229}, { 80, 98, 137, 129, 109, 231, 162, 231}, + { 82, 98, 137, 131, 109, 232, 164, 232}, { 84, 98, 138, 134, 110, 234, 166, 233}, + { 86, 98, 138, 137, 111, 235, 168, 234}, { 89, 99, 138, 140, 112, 236, 170, 235}, + { 91, 99, 138, 142, 112, 237, 171, 235}, { 93, 100, 139, 145, 113, 238, 173, 236}, + { 95, 100, 139, 147, 114, 239, 174, 237}, { 97, 101, 140, 149, 115, 240, 176, 238}, + { 99, 101, 140, 151, 115, 241, 177, 238}, {101, 102, 140, 
154, 116, 242, 179, 239}, + {103, 102, 140, 156, 117, 242, 180, 239}, {105, 103, 141, 158, 118, 243, 182, 240}, + {107, 103, 141, 160, 118, 243, 183, 240}, {109, 104, 141, 162, 119, 244, 185, 241}, + {111, 104, 141, 164, 119, 244, 186, 241}, {113, 104, 142, 166, 120, 245, 187, 242}, + {114, 104, 142, 168, 121, 245, 188, 242}, {116, 105, 143, 170, 122, 246, 190, 243}, + {118, 105, 143, 171, 122, 246, 191, 243}, {120, 106, 143, 173, 123, 247, 192, 244}, + {121, 106, 143, 175, 124, 247, 193, 244}, {123, 107, 144, 177, 125, 248, 195, 244}, + {125, 107, 144, 178, 125, 248, 196, 244}, {127, 108, 145, 180, 126, 249, 197, 245}, + {128, 108, 145, 181, 127, 249, 198, 245}, {130, 109, 145, 183, 128, 249, 199, 245}, + {132, 109, 145, 184, 128, 249, 200, 245}, {134, 110, 146, 186, 129, 250, 201, 246}, + {135, 110, 146, 187, 130, 250, 202, 246}, {137, 111, 147, 189, 131, 251, 203, 246}, + {138, 111, 147, 190, 131, 251, 204, 246}, {140, 112, 147, 192, 132, 251, 205, 247}, + {141, 112, 147, 193, 132, 251, 206, 247}, {143, 113, 148, 194, 133, 251, 207, 247}, + {144, 113, 148, 195, 134, 251, 207, 247}, {146, 114, 149, 197, 135, 252, 208, 248}, + {147, 114, 149, 198, 135, 252, 209, 248}, {149, 115, 149, 199, 136, 252, 210, 248}, + {150, 115, 149, 200, 137, 252, 210, 248}, {152, 115, 150, 201, 138, 252, 211, 248}, + {153, 115, 150, 202, 138, 252, 212, 248}, {155, 116, 151, 204, 139, 253, 213, 249}, + {156, 116, 151, 205, 139, 253, 213, 249}, {158, 117, 151, 206, 140, 253, 214, 249}, + {159, 117, 151, 207, 141, 253, 215, 249}, {161, 118, 152, 208, 142, 253, 216, 249}, + {162, 118, 152, 209, 142, 253, 216, 249}, {163, 119, 153, 210, 143, 253, 217, 249}, + {164, 119, 153, 211, 143, 253, 217, 249}, {166, 120, 153, 212, 144, 254, 218, 250}, + {167, 120, 153, 212, 145, 254, 219, 250}, {168, 121, 154, 213, 146, 254, 220, 250}, + {169, 121, 154, 214, 146, 254, 220, 250}, {171, 122, 155, 215, 147, 254, 221, 250}, + {172, 122, 155, 216, 147, 254, 221, 250}, {173, 123, 155, 217, 148, 254, 222, 
250}, + {174, 123, 155, 217, 149, 254, 222, 250}, {176, 124, 156, 218, 150, 254, 223, 250}, + {177, 124, 156, 219, 150, 254, 223, 250}, {178, 125, 157, 220, 151, 254, 224, 251}, + {179, 125, 157, 220, 151, 254, 224, 251}, {180, 126, 157, 221, 152, 254, 225, 251}, + {181, 126, 157, 221, 152, 254, 225, 251}, {183, 127, 158, 222, 153, 254, 226, 251}, + {184, 127, 158, 223, 154, 254, 226, 251}, {185, 128, 159, 224, 155, 255, 227, 251}, + {186, 128, 159, 224, 155, 255, 227, 251}, {187, 129, 160, 225, 156, 255, 228, 251}, + {188, 130, 160, 225, 156, 255, 228, 251}, {189, 131, 160, 226, 157, 255, 228, 251}, + {190, 131, 160, 226, 158, 255, 228, 251}, {191, 132, 161, 227, 159, 255, 229, 251}, + {192, 132, 161, 227, 159, 255, 229, 251}, {193, 133, 162, 228, 160, 255, 230, 252}, + {194, 133, 162, 229, 160, 255, 230, 252}, {195, 134, 163, 230, 161, 255, 231, 252}, + {196, 134, 163, 230, 161, 255, 231, 252}, {197, 135, 163, 231, 162, 255, 231, 252}, + {198, 135, 163, 231, 162, 255, 231, 252}, {199, 136, 164, 232, 163, 255, 232, 252}, + {200, 136, 164, 232, 164, 255, 232, 252}, + {201, 137, 165, 233, 165, 255, 233, 252}, {201, 137, 165, 233, 165, 255, 233, 252}, {202, 138, 166, 233, 166, 255, 233, 252}, + {203, 138, 166, 233, 166, 255, 233, 252}, {204, 139, 166, 234, 167, 255, 234, 252}, + {205, 139, 166, 234, 167, 255, 234, 252}, + {206, 140, 167, 235, 168, 255, 235, 252}, {206, 140, 167, 235, 168, 255, 235, 252}, {207, 141, 168, 236, 169, 255, 235, 252}, + {208, 141, 168, 236, 170, 255, 235, 252}, {209, 142, 169, 237, 171, 255, 236, 252}, + {209, 143, 169, 237, 171, 255, 236, 252}, {210, 144, 169, 237, 172, 255, 236, 252}, + {211, 144, 169, 237, 172, 255, 236, 252}, {212, 145, 170, 238, 173, 255, 237, 252}, + {213, 145, 170, 238, 173, 255, 237, 252}, {214, 146, 171, 239, 174, 255, 237, 253}, + {214, 146, 171, 239, 174, 255, 237, 253}, + {215, 147, 172, 240, 175, 255, 238, 253}, {215, 147, 172, 240, 175, 255, 238, 253}, {216, 148, 173, 240, 176, 255, 238, 253}, + {217, 148, 
173, 240, 176, 255, 238, 253}, {218, 149, 173, 241, 177, 255, 239, 253}, + {218, 149, 173, 241, 178, 255, 239, 253}, {219, 150, 174, 241, 179, 255, 239, 253}, + {219, 151, 174, 241, 179, 255, 239, 253}, {220, 152, 175, 242, 180, 255, 240, 253}, + {221, 152, 175, 242, 180, 255, 240, 253}, {222, 153, 176, 242, 181, 255, 240, 253}, + {222, 153, 176, 242, 181, 255, 240, 253}, + {223, 154, 177, 243, 182, 255, 240, 253}, {223, 154, 177, 243, 182, 255, 240, 253}, {224, 155, 178, 244, 183, 255, 241, 253}, + {224, 155, 178, 244, 183, 255, 241, 253}, {225, 156, 178, 244, 184, 255, 241, 253}, + {225, 157, 178, 244, 184, 255, 241, 253}, {226, 158, 179, 244, 185, 255, 242, 253}, + {227, 158, 179, 244, 185, 255, 242, 253}, + {228, 159, 180, 245, 186, 255, 242, 253}, {228, 159, 180, 245, 186, 255, 242, 253}, {229, 160, 181, 245, 187, 255, 242, 253}, + {229, 160, 181, 245, 187, 255, 242, 253}, {230, 161, 182, 246, 188, 255, 243, 253}, + {230, 162, 182, 246, 188, 255, 243, 253}, + {231, 163, 183, 246, 189, 255, 243, 253}, {231, 163, 183, 246, 189, 255, 243, 253}, {232, 164, 184, 247, 190, 255, 243, 253}, + {232, 164, 184, 247, 190, 255, 243, 253}, + {233, 165, 185, 247, 191, 255, 244, 253}, {233, 165, 185, 247, 191, 255, 244, 253}, {234, 166, 185, 247, 192, 255, 244, 253}, + {234, 167, 185, 247, 192, 255, 244, 253}, {235, 168, 186, 248, 193, 255, 244, 253}, + {235, 168, 186, 248, 193, 255, 244, 253}, + {236, 169, 187, 248, 194, 255, 244, 253}, {236, 169, 187, 248, 194, 255, 244, 253}, {236, 170, 188, 248, 195, 255, 245, 253}, + {236, 170, 188, 248, 195, 255, 245, 253}, {237, 171, 189, 249, 196, 255, 245, 254}, + {237, 172, 189, 249, 196, 255, 245, 254}, + {238, 173, 190, 249, 197, 255, 245, 254}, {238, 173, 190, 249, 197, 255, 245, 254}, {239, 174, 191, 249, 198, 255, 245, 254}, + {239, 174, 191, 249, 198, 255, 245, 254}, {240, 175, 192, 249, 199, 255, 246, 254}, + {240, 176, 192, 249, 199, 255, 246, 254}, + {240, 177, 193, 250, 200, 255, 246, 254}, {240, 177, 193, 250, 200, 255, 
246, 254}, {241, 178, 194, 250, 201, 255, 246, 254}, + {241, 178, 194, 250, 201, 255, 246, 254}, {242, 179, 195, 250, 202, 255, 246, 254}, + {242, 180, 195, 250, 202, 255, 246, 254}, + {242, 181, 196, 250, 203, 255, 247, 254}, {242, 181, 196, 250, 203, 255, 247, 254}, {243, 182, 197, 251, 204, 255, 247, 254}, + {243, 183, 197, 251, 204, 255, 247, 254}, {244, 184, 198, 251, 205, 255, 247, 254}, + {244, 184, 198, 251, 205, 255, 247, 254}, + {244, 185, 199, 251, 206, 255, 247, 254}, {244, 185, 199, 251, 206, 255, 247, 254}, {245, 186, 200, 251, 207, 255, 247, 254}, + {245, 187, 200, 251, 207, 255, 247, 254}, + {246, 188, 201, 252, 207, 255, 248, 254}, {246, 188, 201, 252, 207, 255, 248, 254}, {246, 189, 202, 252, 208, 255, 248, 254}, + {246, 190, 202, 252, 208, 255, 248, 254}, + {247, 191, 203, 252, 209, 255, 248, 254}, {247, 191, 203, 252, 209, 255, 248, 254}, {247, 192, 204, 252, 210, 255, 248, 254}, + {247, 193, 204, 252, 210, 255, 248, 254}, + {248, 194, 205, 252, 211, 255, 248, 254}, {248, 194, 205, 252, 211, 255, 248, 254}, {248, 195, 206, 252, 212, 255, 249, 254}, + {248, 196, 206, 252, 212, 255, 249, 254}, + {249, 197, 207, 253, 213, 255, 249, 254}, {249, 197, 207, 253, 213, 255, 249, 254}, {249, 198, 208, 253, 214, 255, 249, 254}, + {249, 199, 209, 253, 214, 255, 249, 254}, + {250, 200, 210, 253, 215, 255, 249, 254}, {250, 200, 210, 253, 215, 255, 249, 254}, {250, 201, 211, 253, 215, 255, 249, 254}, + {250, 202, 211, 253, 215, 255, 249, 254}, + {250, 203, 212, 253, 216, 255, 249, 254}, {250, 203, 212, 253, 216, 255, 249, 254}, {251, 204, 213, 253, 217, 255, 250, 254}, + {251, 205, 213, 253, 217, 255, 250, 254}, {251, 206, 214, 254, 218, 255, 250, 254}, + {251, 206, 215, 254, 218, 255, 250, 254}, {252, 207, 216, 254, 219, 255, 250, 254}, + {252, 208, 216, 254, 219, 255, 250, 254}, {252, 209, 217, 254, 220, 255, 250, 254}, + {252, 210, 217, 254, 220, 255, 250, 254}, {252, 211, 218, 254, 221, 255, 250, 254}, + {252, 212, 218, 254, 221, 255, 250, 254}, {253, 213, 
219, 254, 222, 255, 250, 254}, + {253, 213, 220, 254, 222, 255, 250, 254}, {253, 214, 221, 254, 223, 255, 250, 254}, + {253, 215, 221, 254, 223, 255, 250, 254}, {253, 216, 222, 254, 224, 255, 251, 254}, + {253, 217, 223, 254, 224, 255, 251, 254}, {253, 218, 224, 254, 225, 255, 251, 254}, + {253, 219, 224, 254, 225, 255, 251, 254}, {254, 220, 225, 254, 225, 255, 251, 254}, + {254, 221, 226, 254, 225, 255, 251, 254}, {254, 222, 227, 255, 226, 255, 251, 254}, + {254, 223, 227, 255, 226, 255, 251, 254}, {254, 224, 228, 255, 227, 255, 251, 254}, + {254, 225, 229, 255, 227, 255, 251, 254}, {254, 226, 230, 255, 228, 255, 251, 254}, + {254, 227, 230, 255, 229, 255, 251, 254}, {255, 228, 231, 255, 230, 255, 251, 254}, + {255, 229, 232, 255, 230, 255, 251, 254}, {255, 230, 233, 255, 231, 255, 252, 254}, + {255, 231, 234, 255, 231, 255, 252, 254}, {255, 232, 235, 255, 232, 255, 252, 254}, + {255, 233, 236, 255, 232, 255, 252, 254}, {255, 235, 237, 255, 233, 255, 252, 254}, + {255, 236, 238, 255, 234, 255, 252, 254}, {255, 238, 240, 255, 235, 255, 252, 255}, + {255, 239, 241, 255, 235, 255, 252, 254}, {255, 241, 243, 255, 236, 255, 252, 254}, - {255, 246, 247, 255, 239, 255, 253, 255} + {255, 243, 245, 255, 237, 255, 252, 254}, + {255, 246, 247, 255, 239, 255, 253, 255}, + {255, 246, 247, 255, 239, 255, 253, 255}, }; -static void extend_model_to_full_distribution(vp9_prob p, - vp9_prob *tree_probs) { - const int l = (p - 1) / 2; - const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8; - if (p & 1) { - vpx_memcpy(tree_probs + UNCONSTRAINED_NODES, - model[l], MODEL_NODES * sizeof(vp9_prob)); - } else { - // interpolate - int i; - for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) - tree_probs[i] = (model[l][i - UNCONSTRAINED_NODES] + - model[l + 1][i - UNCONSTRAINED_NODES]) >> 1; +static const vp9_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 195, 29, 183 }, { 84, 49, 136 }, { 8, 42, 71 } + }, { // Band 1 + { 
31, 107, 169 }, { 35, 99, 159 }, { 17, 82, 140 }, + { 8, 66, 114 }, { 2, 44, 76 }, { 1, 19, 32 } + }, { // Band 2 + { 40, 132, 201 }, { 29, 114, 187 }, { 13, 91, 157 }, + { 7, 75, 127 }, { 3, 58, 95 }, { 1, 28, 47 } + }, { // Band 3 + { 69, 142, 221 }, { 42, 122, 201 }, { 15, 91, 159 }, + { 6, 67, 121 }, { 1, 42, 77 }, { 1, 17, 31 } + }, { // Band 4 + { 102, 148, 228 }, { 67, 117, 204 }, { 17, 82, 154 }, + { 6, 59, 114 }, { 2, 39, 75 }, { 1, 15, 29 } + }, { // Band 5 + { 156, 57, 233 }, { 119, 57, 212 }, { 58, 48, 163 }, + { 29, 40, 124 }, { 12, 30, 81 }, { 3, 12, 31 } + } + }, { // Inter + { // Band 0 + { 191, 107, 226 }, { 124, 117, 204 }, { 25, 99, 155 } + }, { // Band 1 + { 29, 148, 210 }, { 37, 126, 194 }, { 8, 93, 157 }, + { 2, 68, 118 }, { 1, 39, 69 }, { 1, 17, 33 } + }, { // Band 2 + { 41, 151, 213 }, { 27, 123, 193 }, { 3, 82, 144 }, + { 1, 58, 105 }, { 1, 32, 60 }, { 1, 13, 26 } + }, { // Band 3 + { 59, 159, 220 }, { 23, 126, 198 }, { 4, 88, 151 }, + { 1, 66, 114 }, { 1, 38, 71 }, { 1, 18, 34 } + }, { // Band 4 + { 114, 136, 232 }, { 51, 114, 207 }, { 11, 83, 155 }, + { 3, 56, 105 }, { 1, 33, 65 }, { 1, 17, 34 } + }, { // Band 5 + { 149, 65, 234 }, { 121, 57, 215 }, { 61, 49, 166 }, + { 28, 36, 114 }, { 12, 25, 76 }, { 3, 16, 42 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 214, 49, 220 }, { 132, 63, 188 }, { 42, 65, 137 } + }, { // Band 1 + { 85, 137, 221 }, { 104, 131, 216 }, { 49, 111, 192 }, + { 21, 87, 155 }, { 2, 49, 87 }, { 1, 16, 28 } + }, { // Band 2 + { 89, 163, 230 }, { 90, 137, 220 }, { 29, 100, 183 }, + { 10, 70, 135 }, { 2, 42, 81 }, { 1, 17, 33 } + }, { // Band 3 + { 108, 167, 237 }, { 55, 133, 222 }, { 15, 97, 179 }, + { 4, 72, 135 }, { 1, 45, 85 }, { 1, 19, 38 } + }, { // Band 4 + { 124, 146, 240 }, { 66, 124, 224 }, { 17, 88, 175 }, + { 4, 58, 122 }, { 1, 36, 75 }, { 1, 18, 37 } + }, { // Band 5 + { 141, 79, 241 }, { 126, 70, 227 }, { 66, 58, 182 }, + { 30, 44, 136 }, { 12, 34, 96 }, { 2, 20, 47 } + } + }, { // Inter + { // 
Band 0 + { 229, 99, 249 }, { 143, 111, 235 }, { 46, 109, 192 } + }, { // Band 1 + { 82, 158, 236 }, { 94, 146, 224 }, { 25, 117, 191 }, + { 9, 87, 149 }, { 3, 56, 99 }, { 1, 33, 57 } + }, { // Band 2 + { 83, 167, 237 }, { 68, 145, 222 }, { 10, 103, 177 }, + { 2, 72, 131 }, { 1, 41, 79 }, { 1, 20, 39 } + }, { // Band 3 + { 99, 167, 239 }, { 47, 141, 224 }, { 10, 104, 178 }, + { 2, 73, 133 }, { 1, 44, 85 }, { 1, 22, 47 } + }, { // Band 4 + { 127, 145, 243 }, { 71, 129, 228 }, { 17, 93, 177 }, + { 3, 61, 124 }, { 1, 41, 84 }, { 1, 21, 52 } + }, { // Band 5 + { 157, 78, 244 }, { 140, 72, 231 }, { 69, 58, 184 }, + { 31, 44, 137 }, { 14, 38, 105 }, { 8, 23, 61 } + } + } } -} - -void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) { - if (full != model) - vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES); - extend_model_to_full_distribution(model[PIVOT_NODE], full); -} +}; -static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; +static const vp9_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 125, 34, 187 }, { 52, 41, 133 }, { 6, 31, 56 } + }, { // Band 1 + { 37, 109, 153 }, { 51, 102, 147 }, { 23, 87, 128 }, + { 8, 67, 101 }, { 1, 41, 63 }, { 1, 19, 29 } + }, { // Band 2 + { 31, 154, 185 }, { 17, 127, 175 }, { 6, 96, 145 }, + { 2, 73, 114 }, { 1, 51, 82 }, { 1, 28, 45 } + }, { // Band 3 + { 23, 163, 200 }, { 10, 131, 185 }, { 2, 93, 148 }, + { 1, 67, 111 }, { 1, 41, 69 }, { 1, 14, 24 } + }, { // Band 4 + { 29, 176, 217 }, { 12, 145, 201 }, { 3, 101, 156 }, + { 1, 69, 111 }, { 1, 39, 63 }, { 1, 14, 23 } + }, { // Band 5 + { 57, 192, 233 }, { 25, 154, 215 }, { 6, 109, 167 }, + { 3, 78, 118 }, { 1, 48, 69 }, { 1, 21, 29 } + } + }, { // Inter + { // Band 0 + { 202, 105, 245 }, { 108, 106, 216 }, { 18, 90, 144 } + }, { // Band 1 + { 33, 172, 219 }, { 64, 149, 206 }, { 14, 117, 177 }, + { 5, 90, 141 }, { 2, 61, 95 }, { 1, 37, 57 } + }, { // Band 2 + { 33, 179, 220 }, { 
11, 140, 198 }, { 1, 89, 148 }, + { 1, 60, 104 }, { 1, 33, 57 }, { 1, 12, 21 } + }, { // Band 3 + { 30, 181, 221 }, { 8, 141, 198 }, { 1, 87, 145 }, + { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 20 } + }, { // Band 4 + { 32, 186, 224 }, { 7, 142, 198 }, { 1, 86, 143 }, + { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 22 } + }, { // Band 5 + { 57, 192, 227 }, { 20, 143, 204 }, { 3, 96, 154 }, + { 1, 68, 112 }, { 1, 42, 69 }, { 1, 19, 32 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 212, 35, 215 }, { 113, 47, 169 }, { 29, 48, 105 } + }, { // Band 1 + { 74, 129, 203 }, { 106, 120, 203 }, { 49, 107, 178 }, + { 19, 84, 144 }, { 4, 50, 84 }, { 1, 15, 25 } + }, { // Band 2 + { 71, 172, 217 }, { 44, 141, 209 }, { 15, 102, 173 }, + { 6, 76, 133 }, { 2, 51, 89 }, { 1, 24, 42 } + }, { // Band 3 + { 64, 185, 231 }, { 31, 148, 216 }, { 8, 103, 175 }, + { 3, 74, 131 }, { 1, 46, 81 }, { 1, 18, 30 } + }, { // Band 4 + { 65, 196, 235 }, { 25, 157, 221 }, { 5, 105, 174 }, + { 1, 67, 120 }, { 1, 38, 69 }, { 1, 15, 30 } + }, { // Band 5 + { 65, 204, 238 }, { 30, 156, 224 }, { 7, 107, 177 }, + { 2, 70, 124 }, { 1, 42, 73 }, { 1, 18, 34 } + } + }, { // Inter + { // Band 0 + { 225, 86, 251 }, { 144, 104, 235 }, { 42, 99, 181 } + }, { // Band 1 + { 85, 175, 239 }, { 112, 165, 229 }, { 29, 136, 200 }, + { 12, 103, 162 }, { 6, 77, 123 }, { 2, 53, 84 } + }, { // Band 2 + { 75, 183, 239 }, { 30, 155, 221 }, { 3, 106, 171 }, + { 1, 74, 128 }, { 1, 44, 76 }, { 1, 17, 28 } + }, { // Band 3 + { 73, 185, 240 }, { 27, 159, 222 }, { 2, 107, 172 }, + { 1, 75, 127 }, { 1, 42, 73 }, { 1, 17, 29 } + }, { // Band 4 + { 62, 190, 238 }, { 21, 159, 222 }, { 2, 107, 172 }, + { 1, 72, 122 }, { 1, 40, 71 }, { 1, 18, 32 } + }, { // Band 5 + { 61, 199, 240 }, { 27, 161, 226 }, { 4, 113, 180 }, + { 1, 76, 129 }, { 1, 46, 80 }, { 1, 23, 41 } + } + } + } +}; -static void init_bit_tree(vp9_tree_index *p, int n) { - int i = 0; +static const vp9_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = { + { // Y 
plane + { // Intra + { // Band 0 + { 7, 27, 153 }, { 5, 30, 95 }, { 1, 16, 30 } + }, { // Band 1 + { 50, 75, 127 }, { 57, 75, 124 }, { 27, 67, 108 }, + { 10, 54, 86 }, { 1, 33, 52 }, { 1, 12, 18 } + }, { // Band 2 + { 43, 125, 151 }, { 26, 108, 148 }, { 7, 83, 122 }, + { 2, 59, 89 }, { 1, 38, 60 }, { 1, 17, 27 } + }, { // Band 3 + { 23, 144, 163 }, { 13, 112, 154 }, { 2, 75, 117 }, + { 1, 50, 81 }, { 1, 31, 51 }, { 1, 14, 23 } + }, { // Band 4 + { 18, 162, 185 }, { 6, 123, 171 }, { 1, 78, 125 }, + { 1, 51, 86 }, { 1, 31, 54 }, { 1, 14, 23 } + }, { // Band 5 + { 15, 199, 227 }, { 3, 150, 204 }, { 1, 91, 146 }, + { 1, 55, 95 }, { 1, 30, 53 }, { 1, 11, 20 } + } + }, { // Inter + { // Band 0 + { 19, 55, 240 }, { 19, 59, 196 }, { 3, 52, 105 } + }, { // Band 1 + { 41, 166, 207 }, { 104, 153, 199 }, { 31, 123, 181 }, + { 14, 101, 152 }, { 5, 72, 106 }, { 1, 36, 52 } + }, { // Band 2 + { 35, 176, 211 }, { 12, 131, 190 }, { 2, 88, 144 }, + { 1, 60, 101 }, { 1, 36, 60 }, { 1, 16, 28 } + }, { // Band 3 + { 28, 183, 213 }, { 8, 134, 191 }, { 1, 86, 142 }, + { 1, 56, 96 }, { 1, 30, 53 }, { 1, 12, 20 } + }, { // Band 4 + { 20, 190, 215 }, { 4, 135, 192 }, { 1, 84, 139 }, + { 1, 53, 91 }, { 1, 28, 49 }, { 1, 11, 20 } + }, { // Band 5 + { 13, 196, 216 }, { 2, 137, 192 }, { 1, 86, 143 }, + { 1, 57, 99 }, { 1, 32, 56 }, { 1, 13, 24 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 211, 29, 217 }, { 96, 47, 156 }, { 22, 43, 87 } + }, { // Band 1 + { 78, 120, 193 }, { 111, 116, 186 }, { 46, 102, 164 }, + { 15, 80, 128 }, { 2, 49, 76 }, { 1, 18, 28 } + }, { // Band 2 + { 71, 161, 203 }, { 42, 132, 192 }, { 10, 98, 150 }, + { 3, 69, 109 }, { 1, 44, 70 }, { 1, 18, 29 } + }, { // Band 3 + { 57, 186, 211 }, { 30, 140, 196 }, { 4, 93, 146 }, + { 1, 62, 102 }, { 1, 38, 65 }, { 1, 16, 27 } + }, { // Band 4 + { 47, 199, 217 }, { 14, 145, 196 }, { 1, 88, 142 }, + { 1, 57, 98 }, { 1, 36, 62 }, { 1, 15, 26 } + }, { // Band 5 + { 26, 219, 229 }, { 5, 155, 207 }, { 1, 94, 151 }, + { 1, 
60, 104 }, { 1, 36, 62 }, { 1, 16, 28 } + } + }, { // Inter + { // Band 0 + { 233, 29, 248 }, { 146, 47, 220 }, { 43, 52, 140 } + }, { // Band 1 + { 100, 163, 232 }, { 179, 161, 222 }, { 63, 142, 204 }, + { 37, 113, 174 }, { 26, 89, 137 }, { 18, 68, 97 } + }, { // Band 2 + { 85, 181, 230 }, { 32, 146, 209 }, { 7, 100, 164 }, + { 3, 71, 121 }, { 1, 45, 77 }, { 1, 18, 30 } + }, { // Band 3 + { 65, 187, 230 }, { 20, 148, 207 }, { 2, 97, 159 }, + { 1, 68, 116 }, { 1, 40, 70 }, { 1, 14, 29 } + }, { // Band 4 + { 40, 194, 227 }, { 8, 147, 204 }, { 1, 94, 155 }, + { 1, 65, 112 }, { 1, 39, 66 }, { 1, 14, 26 } + }, { // Band 5 + { 16, 208, 228 }, { 3, 151, 207 }, { 1, 98, 160 }, + { 1, 67, 117 }, { 1, 41, 74 }, { 1, 17, 31 } + } + } + } +}; - while (++i < n) { - p[0] = p[1] = i << 1; - p += 2; +static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 17, 38, 140 }, { 7, 34, 80 }, { 1, 17, 29 } + }, { // Band 1 + { 37, 75, 128 }, { 41, 76, 128 }, { 26, 66, 116 }, + { 12, 52, 94 }, { 2, 32, 55 }, { 1, 10, 16 } + }, { // Band 2 + { 50, 127, 154 }, { 37, 109, 152 }, { 16, 82, 121 }, + { 5, 59, 85 }, { 1, 35, 54 }, { 1, 13, 20 } + }, { // Band 3 + { 40, 142, 167 }, { 17, 110, 157 }, { 2, 71, 112 }, + { 1, 44, 72 }, { 1, 27, 45 }, { 1, 11, 17 } + }, { // Band 4 + { 30, 175, 188 }, { 9, 124, 169 }, { 1, 74, 116 }, + { 1, 48, 78 }, { 1, 30, 49 }, { 1, 11, 18 } + }, { // Band 5 + { 10, 222, 223 }, { 2, 150, 194 }, { 1, 83, 128 }, + { 1, 48, 79 }, { 1, 27, 45 }, { 1, 11, 17 } + } + }, { // Inter + { // Band 0 + { 36, 41, 235 }, { 29, 36, 193 }, { 10, 27, 111 } + }, { // Band 1 + { 85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 }, + { 57, 113, 168 }, { 23, 83, 120 }, { 10, 49, 61 } + }, { // Band 2 + { 85, 190, 223 }, { 36, 139, 200 }, { 5, 90, 146 }, + { 1, 60, 103 }, { 1, 38, 65 }, { 1, 18, 30 } + }, { // Band 3 + { 72, 202, 223 }, { 23, 141, 199 }, { 2, 86, 140 }, + { 1, 56, 97 }, { 1, 36, 61 }, { 1, 16, 27 } + 
}, { // Band 4 + { 55, 218, 225 }, { 13, 145, 200 }, { 1, 86, 141 }, + { 1, 57, 99 }, { 1, 35, 61 }, { 1, 13, 22 } + }, { // Band 5 + { 15, 235, 212 }, { 1, 132, 184 }, { 1, 84, 139 }, + { 1, 57, 97 }, { 1, 34, 56 }, { 1, 14, 23 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 181, 21, 201 }, { 61, 37, 123 }, { 10, 38, 71 } + }, { // Band 1 + { 47, 106, 172 }, { 95, 104, 173 }, { 42, 93, 159 }, + { 18, 77, 131 }, { 4, 50, 81 }, { 1, 17, 23 } + }, { // Band 2 + { 62, 147, 199 }, { 44, 130, 189 }, { 28, 102, 154 }, + { 18, 75, 115 }, { 2, 44, 65 }, { 1, 12, 19 } + }, { // Band 3 + { 55, 153, 210 }, { 24, 130, 194 }, { 3, 93, 146 }, + { 1, 61, 97 }, { 1, 31, 50 }, { 1, 10, 16 } + }, { // Band 4 + { 49, 186, 223 }, { 17, 148, 204 }, { 1, 96, 142 }, + { 1, 53, 83 }, { 1, 26, 44 }, { 1, 11, 17 } + }, { // Band 5 + { 13, 217, 212 }, { 2, 136, 180 }, { 1, 78, 124 }, + { 1, 50, 83 }, { 1, 29, 49 }, { 1, 14, 23 } + } + }, { // Inter + { // Band 0 + { 197, 13, 247 }, { 82, 17, 222 }, { 25, 17, 162 } + }, { // Band 1 + { 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 }, + { 104, 158, 220 }, { 66, 128, 186 }, { 55, 90, 137 } + }, { // Band 2 + { 111, 197, 242 }, { 46, 158, 219 }, { 9, 104, 171 }, + { 2, 65, 125 }, { 1, 44, 80 }, { 1, 17, 91 } + }, { // Band 3 + { 104, 208, 245 }, { 39, 168, 224 }, { 3, 109, 162 }, + { 1, 79, 124 }, { 1, 50, 102 }, { 1, 43, 102 } + }, { // Band 4 + { 84, 220, 246 }, { 31, 177, 231 }, { 2, 115, 180 }, + { 1, 79, 134 }, { 1, 55, 77 }, { 1, 60, 79 } + }, { // Band 5 + { 43, 243, 240 }, { 8, 180, 217 }, { 1, 115, 166 }, + { 1, 84, 121 }, { 1, 51, 67 }, { 1, 16, 6 } + } + } } +}; - p[0] = p[1] = 0; +static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) { /* Copies MODEL_NODES probabilities from row p - 1 of vp9_pareto8_full into probs; p == 0 is clamped to row 0. */ + vpx_memcpy(probs, vp9_pareto8_full[p == 0 ? 
0 : p - 1], + MODEL_NODES * sizeof(vp9_prob)); } -static void init_bit_trees() { - init_bit_tree(cat1, 1); - init_bit_tree(cat2, 2); - init_bit_tree(cat3, 3); - init_bit_tree(cat4, 4); - init_bit_tree(cat5, 5); - init_bit_tree(cat6, 14); +void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) { + if (full != model) + vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES); + extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]); } -const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = { - { 0, 0, 0, 0}, - { 0, 0, 0, 1}, - { 0, 0, 0, 2}, - { 0, 0, 0, 3}, - { 0, 0, 0, 4}, - { cat1, Pcat1, 1, 5}, - { cat2, Pcat2, 2, 7}, - { cat3, Pcat3, 3, 11}, - { cat4, Pcat4, 4, 19}, - { cat5, Pcat5, 5, 35}, - { cat6, Pcat6, 14, 67}, - { 0, 0, 0, 0} -}; - -#include "vp9/common/vp9_default_coef_probs.h" - void vp9_default_coef_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4); vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8); @@ -361,13 +726,6 @@ void vp9_default_coef_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32); } -void vp9_coef_tree_initialize() { - init_bit_trees(); - vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); -} - -// #define COEF_COUNT_TESTING - #define COEF_COUNT_SAT 24 #define COEF_MAX_UPDATE_FACTOR 112 #define COEF_COUNT_SAT_KEY 24 @@ -379,29 +737,30 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, unsigned int count_sat, unsigned int update_factor) { const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - - vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size]; - const vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size]; - vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size]; - unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = + vp9_coeff_probs_model *const probs = cm->fc.coef_probs[tx_size]; + const vp9_coeff_probs_model *const 
pre_probs = pre_fc->coef_probs[tx_size]; + vp9_coeff_count_model *counts = cm->counts.coef[tx_size]; + unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = cm->counts.eob_branch[tx_size]; int i, j, k, l, m; - unsigned int branch_ct[UNCONSTRAINED_NODES][2]; - for (i = 0; i < BLOCK_TYPES; ++i) + for (i = 0; i < PLANE_TYPES; ++i) for (j = 0; j < REF_TYPES; ++j) for (k = 0; k < COEF_BANDS; ++k) - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - if (l >= 3 && k == 0) - continue; - vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct, - coef_counts[i][j][k][l]); - branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + const int n0 = counts[i][j][k][l][ZERO_TOKEN]; + const int n1 = counts[i][j][k][l][ONE_TOKEN]; + const int n2 = counts[i][j][k][l][TWO_TOKEN]; + const int neob = counts[i][j][k][l][EOB_MODEL_TOKEN]; + const unsigned int branch_ct[UNCONSTRAINED_NODES][2] = { + { neob, eob_counts[i][j][k][l] - neob }, + { n0, n1 + n2 }, + { n1, n2 } + }; for (m = 0; m < UNCONSTRAINED_NODES; ++m) - dst_coef_probs[i][j][k][l][m] = merge_probs( - pre_coef_probs[i][j][k][l][m], - branch_ct[m], - count_sat, update_factor); + probs[i][j][k][l][m] = merge_probs(pre_probs[i][j][k][l][m], + branch_ct[m], + count_sat, update_factor); } } diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index e133d65..15bf8eb 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -16,40 +16,36 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_scan.h" -#include "vp9/common/vp9_treecoder.h" -#define DIFF_UPDATE_PROB 252 +#ifdef __cplusplus +extern "C" { +#endif -/* Coefficient token alphabet */ +#define DIFF_UPDATE_PROB 252 -#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ -#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */ -#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */ -#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */ -#define 
FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ -#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ -#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ -#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ -#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ -#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ -#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 14+1 */ -#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ -#define MAX_ENTROPY_TOKENS 12 -#define ENTROPY_NODES 11 -#define EOSB_TOKEN 127 /* Not signalled, encoder only */ +// Coefficient token alphabet +#define ZERO_TOKEN 0 // 0 Extra Bits 0+0 +#define ONE_TOKEN 1 // 1 Extra Bits 0+1 +#define TWO_TOKEN 2 // 2 Extra Bits 0+1 +#define THREE_TOKEN 3 // 3 Extra Bits 0+1 +#define FOUR_TOKEN 4 // 4 Extra Bits 0+1 +#define CATEGORY1_TOKEN 5 // 5-6 Extra Bits 1+1 +#define CATEGORY2_TOKEN 6 // 7-10 Extra Bits 2+1 +#define CATEGORY3_TOKEN 7 // 11-18 Extra Bits 3+1 +#define CATEGORY4_TOKEN 8 // 19-34 Extra Bits 4+1 +#define CATEGORY5_TOKEN 9 // 35-66 Extra Bits 5+1 +#define CATEGORY6_TOKEN 10 // 67+ Extra Bits 14+1 +#define EOB_TOKEN 11 // EOB Extra Bits 0+0 -#define INTER_MODE_CONTEXTS 7 +#define ENTROPY_TOKENS 12 -extern DECLARE_ALIGNED(16, const uint8_t, - vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); +#define ENTROPY_NODES 11 -extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)]; +DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]); -#define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */ +#define EOB_MODEL_TOKEN 3 extern const vp9_tree_index vp9_coefmodel_tree[]; -extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; - typedef struct { const vp9_tree_index *tree; const vp9_prob *prob; @@ -58,15 +54,12 @@ typedef struct { } vp9_extra_bit; // indexed by token value -extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS]; +extern const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS]; -#define MAX_PROB 255 #define DCT_MAX_VALUE 16384 /* Coefficients are predicted 
via a 3-dimensional probability table. */ -/* Outside dimension. 0 = Y with DC, 1 = UV */ -#define BLOCK_TYPES 2 #define REF_TYPES 2 // intra=0, inter=1 /* Middle dimension reflects the coefficient position within the transform. */ @@ -88,13 +81,14 @@ extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS]; coefficient band (and since zigzag positions 0, 1, and 2 are in distinct bands). */ -#define PREV_COEF_CONTEXTS 6 +#define COEFF_CONTEXTS 6 +#define BAND_COEFF_CONTEXTS(band) ((band) == 0 ? 3 : COEFF_CONTEXTS) // #define ENTROPY_STATS -typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; -typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] +typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] + [ENTROPY_TOKENS]; +typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] [ENTROPY_NODES][2]; #define SUBEXP_PARAM 4 /* Subexponential code parameter */ @@ -102,8 +96,6 @@ typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] struct VP9Common; void vp9_default_coef_probs(struct VP9Common *cm); - -void vp9_coef_tree_initialize(); void vp9_adapt_coef_probs(struct VP9Common *cm); static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { @@ -123,10 +115,10 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { // This macro is currently unused but may be used by certain implementations #define MAXBAND_INDEX 21 -extern const uint8_t vp9_coefband_trans_8x8plus[1024]; -extern const uint8_t vp9_coefband_trans_4x4[16]; +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]); -static const uint8_t *get_band_translate(TX_SIZE tx_size) { +static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { return tx_size == TX_4X4 ? 
vp9_coefband_trans_4x4 : vp9_coefband_trans_8x8plus; } @@ -135,24 +127,26 @@ static const uint8_t *get_band_translate(TX_SIZE tx_size) { // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly -#define COEFPROB_MODELS 128 +#define COEFF_PROB_MODELS 256 #define UNCONSTRAINED_NODES 3 #define PIVOT_NODE 2 // which node is pivot +#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) +extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES]; + typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] - [UNCONSTRAINED_NODES]; + [COEFF_CONTEXTS][UNCONSTRAINED_NODES]; typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] + [COEFF_CONTEXTS] [UNCONSTRAINED_NODES + 1]; void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l) { +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; switch (tx_size) { @@ -173,32 +167,26 @@ static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, left_ec = !!*(const uint64_t *)l; break; default: - assert(!"Invalid transform size."); + assert(0 && "Invalid transform size."); } return combine_entropy_contexts(above_ec, left_ec); } -static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx, - const int16_t **scan, const int16_t **scan_nb) { - switch (tx_size) { - case TX_4X4: - get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb); - break; - case TX_8X8: - get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb); - break; - case TX_16X16: - get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb); - break; - case TX_32X32: - *scan = vp9_default_scan_32x32; - *scan_nb = vp9_default_scan_32x32_neighbors; - break; - default: - assert(!"Invalid transform size."); 
+static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { /* Inter blocks, non-Y planes and lossless mode use the default scan order for tx_size; otherwise the scan order is selected by the tx type looked up from the block's Y prediction mode. */ + const MODE_INFO *const mi = xd->mi_8x8[0]; + + if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) { + return &vp9_default_scan_orders[tx_size]; + } else { + const MB_PREDICTION_MODE mode = get_y_mode(mi, block_idx); + return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]]; } } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENTROPY_H_ diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index 3b2510d..f2c81bc 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -10,7 +10,6 @@ #include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_seg_common.h" @@ -232,21 +231,18 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = { -D63_PRED, 16, /* 7 = D63_NODE */ -D153_PRED, -D207_PRED /* 8 = D153_NODE */ }; -struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = { -INTER_OFFSET(ZEROMV), 2, -INTER_OFFSET(NEARESTMV), 4, -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV) }; -struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT }; -struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { 9, 102, 187, 225 @@ -306,7 +302,7 @@ void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; } -static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = { +static const vp9_prob default_skip_probs[SKIP_CONTEXTS] = { 192, 128, 64 }; @@ -318,17 +314,18 @@ static const vp9_prob 
default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] { 149, 144, }, }; -void vp9_init_mbmode_probs(VP9_COMMON *cm) { - vp9_copy(cm->fc.uv_mode_prob, default_if_uv_probs); - vp9_copy(cm->fc.y_mode_prob, default_if_y_probs); - vp9_copy(cm->fc.switchable_interp_prob, default_switchable_interp_prob); - vp9_copy(cm->fc.partition_prob, default_partition_probs); - vp9_copy(cm->fc.intra_inter_prob, default_intra_inter_p); - vp9_copy(cm->fc.comp_inter_prob, default_comp_inter_p); - vp9_copy(cm->fc.comp_ref_prob, default_comp_ref_p); - vp9_copy(cm->fc.single_ref_prob, default_single_ref_p); - cm->fc.tx_probs = default_tx_probs; - vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs); +void vp9_init_mode_probs(FRAME_CONTEXT *fc) { + vp9_copy(fc->uv_mode_prob, default_if_uv_probs); + vp9_copy(fc->y_mode_prob, default_if_y_probs); + vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob); + vp9_copy(fc->partition_prob, default_partition_probs); + vp9_copy(fc->intra_inter_prob, default_intra_inter_p); + vp9_copy(fc->comp_inter_prob, default_comp_inter_p); + vp9_copy(fc->comp_ref_prob, default_comp_ref_p); + vp9_copy(fc->single_ref_prob, default_single_ref_p); + fc->tx_probs = default_tx_probs; + vp9_copy(fc->skip_probs, default_skip_probs); + vp9_copy(fc->inter_mode_probs, default_inter_mode_probs); } const vp9_tree_index vp9_switchable_interp_tree @@ -336,15 +333,6 @@ const vp9_tree_index vp9_switchable_interp_tree -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; -struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; - -void vp9_entropy_mode_init() { - vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree); - vp9_tokens_from_tree(vp9_switchable_interp_encodings, - vp9_switchable_interp_tree); - vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree); - vp9_tokens_from_tree(vp9_inter_mode_encodings, vp9_inter_mode_tree); -} #define COUNT_SAT 20 #define MAX_UPDATE_FACTOR 128 @@ -356,7 +344,7 @@ static int 
adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, + vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, probs); } @@ -396,7 +384,7 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i], counts->partition[i], fc->partition_prob[i]); - if (cm->mcomp_filter_type == SWITCHABLE) { + if (cm->interp_filter == SWITCHABLE) { for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i], counts->switchable_interp[i], fc->switchable_interp_prob[i]); @@ -426,9 +414,8 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { } } - for (i = 0; i < MBSKIP_CONTEXTS; ++i) - fc->mbskip_probs[i] = adapt_prob(pre_fc->mbskip_probs[i], - counts->mbskip[i]); + for (i = 0; i < SKIP_CONTEXTS; ++i) + fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]); } static void set_default_lf_deltas(struct loopfilter *lf) { @@ -464,28 +451,26 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { lf->last_sharpness_level = -1; vp9_default_coef_probs(cm); - vp9_init_mbmode_probs(cm); + vp9_init_mode_probs(&cm->fc); vp9_init_mv_probs(cm); - vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs); if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->reset_frame_context == 3) { // Reset all frame contexts. - for (i = 0; i < NUM_FRAME_CONTEXTS; ++i) + for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = cm->fc; } else if (cm->reset_frame_context == 2) { // Reset only the frame context specified in the frame header. 
cm->frame_contexts[cm->frame_context_idx] = cm->fc; } - vpx_memset(cm->prev_mip, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + if (frame_is_intra_only(cm)) + vpx_memset(cm->prev_mip, 0, + cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + vpx_memset(cm->mip, 0, cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); - vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_border(cm, cm->prev_mip); - vp9_zero(cm->ref_frame_sign_bias); cm->frame_context_idx = 0; diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h index 38b4199..c7b1911 100644 --- a/libvpx/vp9/common/vp9_entropymode.h +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -12,14 +12,17 @@ #define VP9_COMMON_VP9_ENTROPYMODE_H_ #include "vp9/common/vp9_blockd.h" -#include "vp9/common/vp9_treecoder.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymv.h" + +#ifdef __cplusplus +extern "C" { +#endif #define TX_SIZE_CONTEXTS 2 #define SWITCHABLE_FILTERS 3 // number of switchable filters #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) -// #define MODE_STATS - struct VP9Common; struct tx_probs { @@ -34,31 +37,56 @@ struct tx_counts { unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2]; }; +typedef struct frame_contexts { + vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; + vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; + vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; + vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES]; + vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS - 1]; + vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; + vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; + vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; + vp9_prob single_ref_prob[REF_CONTEXTS][2]; + vp9_prob comp_ref_prob[REF_CONTEXTS]; + struct tx_probs tx_probs; + vp9_prob skip_probs[SKIP_CONTEXTS]; + nmv_context nmvc; +} FRAME_CONTEXT; + 
+typedef struct { + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; + unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES]; + vp9_coeff_count_model coef[TX_SIZES][PLANE_TYPES]; + unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES] + [COEF_BANDS][COEFF_CONTEXTS]; + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int single_ref[REF_CONTEXTS][2][2]; + unsigned int comp_ref[REF_CONTEXTS][2]; + struct tx_counts tx; + unsigned int skip[SKIP_CONTEXTS][2]; + nmv_context_counts mv; +} FRAME_COUNTS; + extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] [INTRA_MODES - 1]; - extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] [PARTITION_TYPES - 1]; - extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)]; -extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; - extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)]; -extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; - extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)]; -extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; - extern const vp9_tree_index vp9_switchable_interp_tree [TREE_SIZE(SWITCHABLE_FILTERS)]; -extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; - -void vp9_entropy_mode_init(); void vp9_setup_past_independence(struct VP9Common *cm); -void vp9_init_mbmode_probs(struct VP9Common *cm); +void vp9_init_mode_probs(FRAME_CONTEXT *fc); void vp9_adapt_mode_probs(struct VP9Common *cm); @@ -69,4 +97,17 @@ void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, void tx_counts_to_branch_counts_8x8(const unsigned int 
*tx_count_8x8p, unsigned int (*ct_8x8p)[2]); +static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi, + const MODE_INFO *above_mi, + const MODE_INFO *left_mi, + int block) { + const MB_PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); + const MB_PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); + return vp9_kf_y_mode_prob[above][left]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c index 290dcdd..197b7c0 100644 --- a/libvpx/vp9/common/vp9_entropymv.c +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -23,7 +23,6 @@ const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = { -MV_JOINT_HNZVZ, 4, -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ }; -struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { -MV_CLASS_0, 2, @@ -37,19 +36,16 @@ const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, }; -struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1, }; -struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; -const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = { +const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, 4, -2, -3 }; -struct vp9_token vp9_mv_fp_encodings[4]; static const nmv_context default_nmv_context = { {32, 64, 96}, @@ -126,12 +122,8 @@ static const uint8_t log_in_base_2[] = { }; MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { - MV_CLASS_TYPE c = MV_CLASS_0; - if (z >= CLASS0_SIZE * 4096) - c = MV_CLASS_10; - else - c = log_in_base_2[z >> 3]; - + const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) ? 
MV_CLASS_10 : + (MV_CLASS_TYPE)log_in_base_2[z >> 3]; if (offset) *offset = z - mv_class_base(c); return c; @@ -196,8 +188,8 @@ static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, - probs); + vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, + MV_MAX_UPDATE_FACTOR, probs); } void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { @@ -235,13 +227,6 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { } } -void vp9_entropy_mv_init() { - vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree); - vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree); - vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree); - vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree); -} - void vp9_init_mv_probs(VP9_COMMON *cm) { cm->fc.nmvc = default_nmv_context; } diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h index d843f5b..e7033e4 100644 --- a/libvpx/vp9/common/vp9_entropymv.h +++ b/libvpx/vp9/common/vp9_entropymv.h @@ -12,19 +12,21 @@ #ifndef VP9_COMMON_VP9_ENTROPYMV_H_ #define VP9_COMMON_VP9_ENTROPYMV_H_ -#include "vp9/common/vp9_treecoder.h" #include "./vpx_config.h" #include "vp9/common/vp9_blockd.h" +#ifdef __cplusplus +extern "C" { +#endif + struct VP9Common; -void vp9_entropy_mv_init(); void vp9_init_mv_probs(struct VP9Common *cm); void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); int vp9_use_mv_hp(const MV *ref); -#define NMV_UPDATE_PROB 252 +#define MV_UPDATE_PROB 252 /* Symbols for coding which components are zero jointly */ #define MV_JOINTS 4 @@ -62,6 +64,7 @@ typedef enum { #define CLASS0_BITS 1 /* bits at integer precision for class 0 */ #define CLASS0_SIZE (1 << CLASS0_BITS) #define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) +#define MV_FP_SIZE 4 #define 
MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) #define MV_MAX ((1 << MV_MAX_BITS) - 1) @@ -71,25 +74,18 @@ typedef enum { #define MV_UPP ((1 << MV_IN_USE_BITS) - 1) #define MV_LOW (-(1 << MV_IN_USE_BITS)) -extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)]; -extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; - -extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)]; -extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; - -extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)]; -extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; - -extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)]; -extern struct vp9_token vp9_mv_fp_encodings[4]; +extern const vp9_tree_index vp9_mv_joint_tree[]; +extern const vp9_tree_index vp9_mv_class_tree[]; +extern const vp9_tree_index vp9_mv_class0_tree[]; +extern const vp9_tree_index vp9_mv_fp_tree[]; typedef struct { vp9_prob sign; vp9_prob classes[MV_CLASSES - 1]; vp9_prob class0[CLASS0_SIZE - 1]; vp9_prob bits[MV_OFFSET_BITS]; - vp9_prob class0_fp[CLASS0_SIZE][4 - 1]; - vp9_prob fp[4 - 1]; + vp9_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1]; + vp9_prob fp[MV_FP_SIZE - 1]; vp9_prob class0_hp; vp9_prob hp; } nmv_component; @@ -116,8 +112,8 @@ typedef struct { unsigned int classes[MV_CLASSES]; unsigned int class0[CLASS0_SIZE]; unsigned int bits[MV_OFFSET_BITS][2]; - unsigned int class0_fp[CLASS0_SIZE][4]; - unsigned int fp[4]; + unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE]; + unsigned int fp[MV_FP_SIZE]; unsigned int class0_hp[2]; unsigned int hp[2]; } nmv_component_counts; @@ -129,4 +125,8 @@ typedef struct { void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h index 1651b90..e96e769 100644 --- a/libvpx/vp9/common/vp9_enums.h +++ b/libvpx/vp9/common/vp9_enums.h @@ -13,6 +13,10 @@ #include "./vpx_config.h" 
+#ifdef __cplusplus +extern "C" { +#endif + #define MI_SIZE_LOG2 3 #define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6 @@ -52,20 +56,22 @@ typedef enum PARTITION_TYPE { #define PARTITION_PLOFFSET 4 // number of probability models per block size #define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) +// block transform size typedef enum { - TX_4X4 = 0, // 4x4 dct transform - TX_8X8 = 1, // 8x8 dct transform - TX_16X16 = 2, // 16x16 dct transform - TX_32X32 = 3, // 32x32 dct transform + TX_4X4 = 0, // 4x4 transform + TX_8X8 = 1, // 8x8 transform + TX_16X16 = 2, // 16x16 transform + TX_32X32 = 3, // 32x32 transform TX_SIZES } TX_SIZE; +// frame transform mode typedef enum { - ONLY_4X4 = 0, - ALLOW_8X8 = 1, - ALLOW_16X16 = 2, - ALLOW_32X32 = 3, - TX_MODE_SELECT = 4, + ONLY_4X4 = 0, // only 4x4 transform used + ALLOW_8X8 = 1, // allow block transform size up to 8x8 + ALLOW_16X16 = 2, // allow block transform size up to 16x16 + ALLOW_32X32 = 3, // allow block transform size up to 32x32 + TX_MODE_SELECT = 4, // transform specified for each block TX_MODES = 5, } TX_MODE; @@ -73,7 +79,8 @@ typedef enum { DCT_DCT = 0, // DCT in both horizontal and vertical ADST_DCT = 1, // ADST in vertical, DCT in horizontal DCT_ADST = 2, // DCT in vertical, ADST in horizontal - ADST_ADST = 3 // ADST in both directions + ADST_ADST = 3, // ADST in both directions + TX_TYPES = 4 } TX_TYPE; typedef enum { @@ -87,4 +94,8 @@ typedef enum { SRGB = 7 // RGB } COLOR_SPACE; +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/libvpx/vp9/common/vp9_extend.c b/libvpx/vp9/common/vp9_extend.c deleted file mode 100644 index 836bf0e..0000000 --- a/libvpx/vp9/common/vp9_extend.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_mem/vpx_mem.h" - -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_extend.h" - -static void copy_and_extend_plane(const uint8_t *src, int src_pitch, - uint8_t *dst, int dst_pitch, - int w, int h, - int extend_top, int extend_left, - int extend_bottom, int extend_right) { - int i, linesize; - - // copy the left and right most columns out - const uint8_t *src_ptr1 = src; - const uint8_t *src_ptr2 = src + w - 1; - uint8_t *dst_ptr1 = dst - extend_left; - uint8_t *dst_ptr2 = dst + w; - - for (i = 0; i < h; i++) { - vpx_memset(dst_ptr1, src_ptr1[0], extend_left); - vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w); - vpx_memset(dst_ptr2, src_ptr2[0], extend_right); - src_ptr1 += src_pitch; - src_ptr2 += src_pitch; - dst_ptr1 += dst_pitch; - dst_ptr2 += dst_pitch; - } - - // Now copy the top and bottom lines into each line of the respective - // borders - src_ptr1 = dst - extend_left; - src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; - dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; - dst_ptr2 = dst + dst_pitch * (h) - extend_left; - linesize = extend_left + extend_right + w; - - for (i = 0; i < extend_top; i++) { - vpx_memcpy(dst_ptr1, src_ptr1, linesize); - dst_ptr1 += dst_pitch; - } - - for (i = 0; i < extend_bottom; i++) { - vpx_memcpy(dst_ptr2, src_ptr2, linesize); - dst_ptr2 += dst_pitch; - } -} - -void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { - // Extend src frame in buffer - // Altref filtering assumes 16 pixel extension - const int et_y = 16; - const int el_y = 16; - // Motion estimation may use src block variance with the block size up - // to 64x64, so the right and bottom need to be extended to 64 multiple - // or up to 16, whichever is greater. 
- const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width, - 16); - const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height, - 16); - const int uv_width_subsampling = (src->uv_width != src->y_width); - const int uv_height_subsampling = (src->uv_height != src->y_height); - const int et_uv = et_y >> uv_height_subsampling; - const int el_uv = el_y >> uv_width_subsampling; - const int eb_uv = eb_y >> uv_height_subsampling; - const int er_uv = er_y >> uv_width_subsampling; - -#if CONFIG_ALPHA - const int et_a = dst->border >> (dst->alpha_height != dst->y_height); - const int el_a = dst->border >> (dst->alpha_width != dst->y_width); - const int eb_a = et_a + dst->alpha_height - src->alpha_height; - const int er_a = el_a + dst->alpha_width - src->alpha_width; - - copy_and_extend_plane(src->alpha_buffer, src->alpha_stride, - dst->alpha_buffer, dst->alpha_stride, - src->alpha_width, src->alpha_height, - et_a, el_a, eb_a, er_a); -#endif - - copy_and_extend_plane(src->y_buffer, src->y_stride, - dst->y_buffer, dst->y_stride, - src->y_width, src->y_height, - et_y, el_y, eb_y, er_y); - - copy_and_extend_plane(src->u_buffer, src->uv_stride, - dst->u_buffer, dst->uv_stride, - src->uv_width, src->uv_height, - et_uv, el_uv, eb_uv, er_uv); - - copy_and_extend_plane(src->v_buffer, src->uv_stride, - dst->v_buffer, dst->uv_stride, - src->uv_width, src->uv_height, - et_uv, el_uv, eb_uv, er_uv); -} - -void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int srcy, int srcx, - int srch, int srcw) { - // If the side is not touching the bounder then don't extend. - const int et_y = srcy ? 0 : dst->border; - const int el_y = srcx ? 0 : dst->border; - const int eb_y = srcy + srch != src->y_height ? 0 : - dst->border + dst->y_height - src->y_height; - const int er_y = srcx + srcw != src->y_width ? 
0 : - dst->border + dst->y_width - src->y_width; - const int src_y_offset = srcy * src->y_stride + srcx; - const int dst_y_offset = srcy * dst->y_stride + srcx; - - const int et_uv = ROUND_POWER_OF_TWO(et_y, 1); - const int el_uv = ROUND_POWER_OF_TWO(el_y, 1); - const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1); - const int er_uv = ROUND_POWER_OF_TWO(er_y, 1); - const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); - const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); - const int srch_uv = ROUND_POWER_OF_TWO(srch, 1); - const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1); - - copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, - dst->y_buffer + dst_y_offset, dst->y_stride, - srcw, srch, - et_y, el_y, eb_y, er_y); - - copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, - dst->u_buffer + dst_uv_offset, dst->uv_stride, - srcw_uv, srch_uv, - et_uv, el_uv, eb_uv, er_uv); - - copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, - dst->v_buffer + dst_uv_offset, dst->uv_stride, - srcw_uv, srch_uv, - et_uv, el_uv, eb_uv, er_uv); -} diff --git a/libvpx/vp9/common/vp9_extend.h b/libvpx/vp9/common/vp9_extend.h deleted file mode 100644 index 7ff79b7..0000000 --- a/libvpx/vp9/common/vp9_extend.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_EXTEND_H_ -#define VP9_COMMON_VP9_EXTEND_H_ - -#include "vpx_scale/yv12config.h" -#include "vpx/vpx_integer.h" - - -void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst); - -void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int srcy, int srcx, - int srch, int srcw); -#endif // VP9_COMMON_VP9_EXTEND_H_ diff --git a/libvpx/vp9/common/vp9_filter.c b/libvpx/vp9/common/vp9_filter.c index 79ace14..7474a88 100644 --- a/libvpx/vp9/common/vp9_filter.c +++ b/libvpx/vp9/common/vp9_filter.c @@ -10,12 +10,9 @@ #include <assert.h> -#include "vpx_ports/mem.h" - #include "vp9/common/vp9_filter.h" -DECLARE_ALIGNED(256, const subpel_kernel, - vp9_bilinear_filters[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -35,8 +32,7 @@ DECLARE_ALIGNED(256, const subpel_kernel, }; // Lagrangian interpolation filter -DECLARE_ALIGNED(256, const subpel_kernel, - vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -56,8 +52,7 @@ DECLARE_ALIGNED(256, const subpel_kernel, }; // DCT based filter -DECLARE_ALIGNED(256, const subpel_kernel, - vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS] = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -77,8 +72,7 @@ DECLARE_ALIGNED(256, const subpel_kernel, }; // freqmultiplier = 0.5 -DECLARE_ALIGNED(256, const subpel_kernel, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, @@ -98,14 +92,15 @@ DECLARE_ALIGNED(256, const 
subpel_kernel, }; -static const subpel_kernel* vp9_filter_kernels[4] = { +static const InterpKernel* vp9_filter_kernels[4] = { vp9_sub_pel_filters_8, vp9_sub_pel_filters_8lp, vp9_sub_pel_filters_8s, vp9_bilinear_filters }; -const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) { - return vp9_filter_kernels[type]; +const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter) { + assert(filter != SWITCHABLE); + return vp9_filter_kernels[filter]; } diff --git a/libvpx/vp9/common/vp9_filter.h b/libvpx/vp9/common/vp9_filter.h index b1e7e64..29d3867 100644 --- a/libvpx/vp9/common/vp9_filter.h +++ b/libvpx/vp9/common/vp9_filter.h @@ -13,6 +13,12 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + + +#ifdef __cplusplus +extern "C" { +#endif #define FILTER_BITS 7 @@ -27,25 +33,28 @@ typedef enum { EIGHTTAP_SHARP = 2, BILINEAR = 3, SWITCHABLE = 4 /* should be the last one */ -} INTERPOLATION_TYPE; +} INTERP_FILTER; -typedef int16_t subpel_kernel[SUBPEL_TAPS]; +typedef int16_t InterpKernel[SUBPEL_TAPS]; -struct subpix_fn_table { - const subpel_kernel *filter_x; - const subpel_kernel *filter_y; -}; +const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter); -const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type); - -extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]; +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_bilinear_filters[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]); +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]); // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // 
filter kernel as a 2 tap filter. #define BILINEAR_FILTERS_2TAP(x) \ (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1) +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_FILTER_H_ diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c deleted file mode 100644 index b91c501..0000000 --- a/libvpx/vp9/common/vp9_findnearmv.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/common/vp9_findnearmv.h" -#include "vp9/common/vp9_mvref_common.h" - -static void lower_mv_precision(MV *mv, int allow_hp) { - const int use_hp = allow_hp && vp9_use_mv_hp(mv); - if (!use_hp) { - if (mv->row & 1) - mv->row += (mv->row > 0 ? -1 : 1); - if (mv->col & 1) - mv->col += (mv->col > 0 ? 
-1 : 1); - } -} - - -void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, - int_mv *mvlist, int_mv *nearest, int_mv *near) { - int i; - // Make sure all the candidates are properly clamped etc - for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { - lower_mv_precision(&mvlist[i].as_mv, allow_hp); - clamp_mv2(&mvlist[i].as_mv, xd); - } - *nearest = mvlist[0]; - *near = mvlist[1]; -} - -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *const tile, - int_mv *dst_nearest, - int_mv *dst_near, - int block_idx, int ref_idx, - int mi_row, int mi_col) { - int_mv dst_list[MAX_MV_REF_CANDIDATES]; - int_mv mv_list[MAX_MV_REF_CANDIDATES]; - MODE_INFO *const mi = xd->mi_8x8[0]; - - assert(ref_idx == 0 || ref_idx == 1); - assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier - - vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, - mi->mbmi.ref_frame[ref_idx], - mv_list, block_idx, mi_row, mi_col); - - dst_list[1].as_int = 0; - if (block_idx == 0) { - vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); - } else if (block_idx == 1 || block_idx == 2) { - int dst = 0, n; - b_mode_info *bmi = mi->bmi; - - dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int; - for (n = 0; dst < MAX_MV_REF_CANDIDATES && - n < MAX_MV_REF_CANDIDATES; n++) - if (mv_list[n].as_int != dst_list[0].as_int) - dst_list[dst++].as_int = mv_list[n].as_int; - } else { - int dst = 0, n; - b_mode_info *bmi = mi->bmi; - - assert(block_idx == 3); - dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int; - if (dst_list[0].as_int != bmi[1].as_mv[ref_idx].as_int) - dst_list[dst++].as_int = bmi[1].as_mv[ref_idx].as_int; - if (dst < MAX_MV_REF_CANDIDATES && - dst_list[0].as_int != bmi[0].as_mv[ref_idx].as_int) - dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int; - for (n = 0; dst < MAX_MV_REF_CANDIDATES && - n < MAX_MV_REF_CANDIDATES; n++) - if (mv_list[n].as_int != dst_list[0].as_int) - dst_list[dst++].as_int = mv_list[n].as_int; - } - - 
dst_nearest->as_int = dst_list[0].as_int; - dst_near->as_int = dst_list[1].as_int; -} diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h deleted file mode 100644 index 2362caa..0000000 --- a/libvpx/vp9/common/vp9_findnearmv.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_VP9_FINDNEARMV_H_ -#define VP9_COMMON_VP9_FINDNEARMV_H_ - -#include "vp9/common/vp9_mv.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/common/vp9_treecoder.h" -#include "vp9/common/vp9_onyxc_int.h" - -#define LEFT_TOP_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3) -#define RIGHT_BOTTOM_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3) - -// check a list of motion vectors by sad score using a number rows of pixels -// above and a number cols of pixels in the left to select the one with best -// score to use as ref motion vector -void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, - int_mv *mvlist, int_mv *nearest, int_mv *near); - -// TODO(jingning): this mv clamping function should be block size dependent. 
-static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { - clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); -} - -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *const tile, - int_mv *dst_nearest, - int_mv *dst_near, - int block_idx, int ref_idx, - int mi_row, int mi_col); - -static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *left_mi, int b) { - if (b == 0 || b == 2) { - if (!left_mi || is_inter_block(&left_mi->mbmi)) - return DC_PRED; - - return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode - : left_mi->mbmi.mode; - } else { - assert(b == 1 || b == 3); - return cur_mi->bmi[b - 1].as_mode; - } -} - -static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *above_mi, int b) { - if (b == 0 || b == 1) { - if (!above_mi || is_inter_block(&above_mi->mbmi)) - return DC_PRED; - - return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode - : above_mi->mbmi.mode; - } else { - assert(b == 2 || b == 3); - return cur_mi->bmi[b - 2].as_mode; - } -} - -#endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/libvpx/vp9/common/vp9_frame_buffers.c b/libvpx/vp9/common/vp9_frame_buffers.c new file mode 100644 index 0000000..a0b1e03 --- /dev/null +++ b/libvpx/vp9/common/vp9_frame_buffers.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "vp9/common/vp9_frame_buffers.h" +#include "vpx_mem/vpx_mem.h" + +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + assert(list != NULL); + vp9_free_internal_frame_buffers(list); + + list->num_internal_frame_buffers = + VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + list->int_fb = + (InternalFrameBuffer *)vpx_calloc(list->num_internal_frame_buffers, + sizeof(*list->int_fb)); + return (list->int_fb == NULL); +} + +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + vpx_free(list->int_fb[i].data); + list->int_fb[i].data = NULL; + } + vpx_free(list->int_fb); + list->int_fb = NULL; +} + +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + int i; + InternalFrameBufferList *const int_fb_list = + (InternalFrameBufferList *)cb_priv; + if (int_fb_list == NULL) + return -1; + + // Find a free frame buffer. + for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) { + if (!int_fb_list->int_fb[i].in_use) + break; + } + + if (i == int_fb_list->num_internal_frame_buffers) + return -1; + + if (int_fb_list->int_fb[i].size < min_size) { + int_fb_list->int_fb[i].data = + (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size); + if (!int_fb_list->int_fb[i].data) + return -1; + + int_fb_list->int_fb[i].size = min_size; + } + + fb->data = int_fb_list->int_fb[i].data; + fb->size = int_fb_list->int_fb[i].size; + int_fb_list->int_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the internal frame buffer. 
+ fb->priv = &int_fb_list->int_fb[i]; + return 0; +} + +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) { + InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; + (void)cb_priv; + int_fb->in_use = 0; + return 0; +} diff --git a/libvpx/vp9/common/vp9_frame_buffers.h b/libvpx/vp9/common/vp9_frame_buffers.h new file mode 100644 index 0000000..e2cfe61 --- /dev/null +++ b/libvpx/vp9/common/vp9_frame_buffers.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ + +#include "vpx/vpx_frame_buffer.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct InternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +} InternalFrameBuffer; + +typedef struct InternalFrameBufferList { + int num_internal_frame_buffers; + InternalFrameBuffer *int_fb; +} InternalFrameBufferList; + +// Initializes |list|. Returns 0 on success. +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list); + +// Free any data allocated to the frame buffers. +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list); + +// Callback used by libvpx to request an external frame buffer. |cb_priv| +// Callback private data, which points to an InternalFrameBufferList. +// |min_size| is the minimum size in bytes needed to decode the next frame. +// |fb| pointer to the frame buffer. +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); + +// Callback used by libvpx when there are no references to the frame buffer. 
+// |cb_priv| is not used. |fb| pointer to the frame buffer. +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c index ea8683e..20b78bf 100644 --- a/libvpx/vp9/common/vp9_idct.c +++ b/libvpx/vp9/common/vp9_idct.c @@ -96,7 +96,7 @@ void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { } } -static void idct4_1d(const int16_t *input, int16_t *output) { +static void idct4(const int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; // stage 1 @@ -124,7 +124,7 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { // Rows for (i = 0; i < 4; ++i) { - idct4_1d(input, outptr); + idct4(input, outptr); input += 4; outptr += 4; } @@ -133,7 +133,7 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - idct4_1d(temp_in, temp_out); + idct4(temp_in, temp_out); for (j = 0; j < 4; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]); @@ -156,7 +156,7 @@ void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { } } -static void idct8_1d(const int16_t *input, int16_t *output) { +static void idct8(const int16_t *input, int16_t *output) { int16_t step1[8], step2[8]; int temp1, temp2; // stage 1 @@ -174,7 +174,7 @@ static void idct8_1d(const int16_t *input, int16_t *output) { step1[6] = dct_const_round_shift(temp2); // stage 2 & stage 3 - even half - idct4_1d(step1, step1); + idct4(step1, step1); // stage 2 - odd half step2[4] = step1[4] + step1[5]; @@ -209,7 +209,7 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows for (i = 0; i < 8; ++i) { - idct8_1d(input, outptr); + idct8(input, outptr); input += 8; outptr += 8; 
} @@ -218,7 +218,7 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - idct8_1d(temp_in, temp_out); + idct8(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]); @@ -238,7 +238,7 @@ void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -static void iadst4_1d(const int16_t *input, int16_t *output) { +static void iadst4(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[0]; @@ -283,10 +283,10 @@ static void iadst4_1d(const int16_t *input, int16_t *output) { void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, int tx_type) { const transform_2d IHT_4[] = { - { idct4_1d, idct4_1d }, // DCT_DCT = 0 - { iadst4_1d, idct4_1d }, // ADST_DCT = 1 - { idct4_1d, iadst4_1d }, // DCT_ADST = 2 - { iadst4_1d, iadst4_1d } // ADST_ADST = 3 + { idct4, idct4 }, // DCT_DCT = 0 + { iadst4, idct4 }, // ADST_DCT = 1 + { idct4, iadst4 }, // DCT_ADST = 2 + { iadst4, iadst4 } // ADST_ADST = 3 }; int i, j; @@ -311,7 +311,7 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, + dest[j * stride + i]); } } -static void iadst8_1d(const int16_t *input, int16_t *output) { +static void iadst8(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[7]; @@ -389,10 +389,10 @@ static void iadst8_1d(const int16_t *input, int16_t *output) { } static const transform_2d IHT_8[] = { - { idct8_1d, idct8_1d }, // DCT_DCT = 0 - { iadst8_1d, idct8_1d }, // ADST_DCT = 1 - { idct8_1d, iadst8_1d }, // DCT_ADST = 2 - { iadst8_1d, iadst8_1d } // ADST_ADST = 3 + { idct8, idct8 }, // DCT_DCT = 0 + { iadst8, idct8 }, // ADST_DCT = 1 + { idct8, iadst8 }, // DCT_ADST = 2 + { iadst8, iadst8 } // ADST_ADST = 3 }; void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, @@ 
-430,7 +430,7 @@ void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows // only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) { - idct8_1d(input, outptr); + idct8(input, outptr); input += 8; outptr += 8; } @@ -439,14 +439,14 @@ void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - idct8_1d(temp_in, temp_out); + idct8(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]); } } -static void idct16_1d(const int16_t *input, int16_t *output) { +static void idct16(const int16_t *input, int16_t *output) { int16_t step1[16], step2[16]; int temp1, temp2; @@ -619,7 +619,7 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows for (i = 0; i < 16; ++i) { - idct16_1d(input, outptr); + idct16(input, outptr); input += 16; outptr += 16; } @@ -628,14 +628,14 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - idct16_1d(temp_in, temp_out); + idct16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]); } } -static void iadst16_1d(const int16_t *input, int16_t *output) { +static void iadst16(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -807,10 +807,10 @@ static void iadst16_1d(const int16_t *input, int16_t *output) { } static const transform_2d IHT_16[] = { - { idct16_1d, idct16_1d }, // DCT_DCT = 0 - { iadst16_1d, idct16_1d }, // ADST_DCT = 1 - { idct16_1d, iadst16_1d }, // DCT_ADST = 2 - { iadst16_1d, iadst16_1d } // ADST_ADST = 3 + { idct16, idct16 }, // DCT_DCT = 0 + { iadst16, idct16 }, // ADST_DCT = 1 + 
{ idct16, iadst16 }, // DCT_ADST = 2 + { iadst16, iadst16 } // ADST_ADST = 3 }; void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, @@ -835,7 +835,8 @@ void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * stride + i]); } + + dest[j * stride + i]); + } } void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { @@ -847,7 +848,7 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. for (i = 0; i < 4; ++i) { - idct16_1d(input, outptr); + idct16(input, outptr); input += 16; outptr += 16; } @@ -856,7 +857,7 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j*16 + i]; - idct16_1d(temp_in, temp_out); + idct16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]); @@ -876,7 +877,7 @@ void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -static void idct32_1d(const int16_t *input, int16_t *output) { +static void idct32(const int16_t *input, int16_t *output) { int16_t step1[32], step2[32]; int temp1, temp2; @@ -1262,7 +1263,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; if (zero_coeff[0] | zero_coeff[1]) - idct32_1d(input, outptr); + idct32(input, outptr); else vpx_memset(outptr, 0, sizeof(int16_t) * 32); input += 32; @@ -1273,10 +1274,10 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - 
idct32_1d(temp_in, temp_out); + idct32(temp_in, temp_out); for (j = 0; j < 32; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * stride + i]); + + dest[j * stride + i]); } } @@ -1289,7 +1290,7 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { // Rows // only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { - idct32_1d(input, outptr); + idct32(input, outptr); input += 32; outptr += 32; } @@ -1298,7 +1299,7 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - idct32_1d(temp_in, temp_out); + idct32(temp_in, temp_out); for (j = 0; j < 32; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]); @@ -1344,43 +1345,37 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { // coefficients. Use eobs to decide what to do. // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. // Combine that with code here. - if (eob) { - if (eob == 1) - // DC only DCT coefficient - vp9_idct8x8_1_add(input, dest, stride); - else if (eob <= 10) - vp9_idct8x8_10_add(input, dest, stride); - else - vp9_idct8x8_64_add(input, dest, stride); - } + if (eob == 1) + // DC only DCT coefficient + vp9_idct8x8_1_add(input, dest, stride); + else if (eob <= 10) + vp9_idct8x8_10_add(input, dest, stride); + else + vp9_idct8x8_64_add(input, dest, stride); } void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int eob) { /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ - if (eob) { - if (eob == 1) - /* DC only DCT coefficient. 
*/ - vp9_idct16x16_1_add(input, dest, stride); - else if (eob <= 10) - vp9_idct16x16_10_add(input, dest, stride); - else - vp9_idct16x16_256_add(input, dest, stride); - } + if (eob == 1) + /* DC only DCT coefficient. */ + vp9_idct16x16_1_add(input, dest, stride); + else if (eob <= 10) + vp9_idct16x16_10_add(input, dest, stride); + else + vp9_idct16x16_256_add(input, dest, stride); } void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, int eob) { - if (eob) { - if (eob == 1) - vp9_idct32x32_1_add(input, dest, stride); - else if (eob <= 34) - // non-zero coeff only in upper-left 8x8 - vp9_idct32x32_34_add(input, dest, stride); - else - vp9_idct32x32_1024_add(input, dest, stride); - } + if (eob == 1) + vp9_idct32x32_1_add(input, dest, stride); + else if (eob <= 34) + // non-zero coeff only in upper-left 8x8 + vp9_idct32x32_34_add(input, dest, stride); + else + vp9_idct32x32_1024_add(input, dest, stride); } // iht @@ -1397,9 +1392,7 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, if (tx_type == DCT_DCT) { vp9_idct8x8_add(input, dest, stride, eob); } else { - if (eob > 0) { - vp9_iht8x8_64_add(input, dest, stride, tx_type); - } + vp9_iht8x8_64_add(input, dest, stride, tx_type); } } @@ -1408,8 +1401,6 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, if (tx_type == DCT_DCT) { vp9_idct16x16_add(input, dest, stride, eob); } else { - if (eob > 0) { - vp9_iht16x16_256_add(input, dest, stride, tx_type); - } + vp9_iht16x16_256_add(input, dest, stride, tx_type); } } diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h index 2b3f35f..ceca795 100644 --- a/libvpx/vp9/common/vp9_idct.h +++ b/libvpx/vp9/common/vp9_idct.h @@ -18,6 +18,10 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#ifdef __cplusplus +extern "C" { +#endif + // Constants and Macros used by all idct/dct functions #define DCT_CONST_BITS 14 @@ -77,8 +81,7 @@ static const int sinpi_4_9 = 15212; 
static INLINE int dct_const_round_shift(int input) { int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - assert(INT16_MIN <= rv && rv <= INT16_MAX); - return rv; + return (int16_t)rv; } typedef void (*transform_1d)(const int16_t*, int16_t*); @@ -104,4 +107,8 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, int stride, int eob); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_IDCT_H_ diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index ff504a1..af8afed 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -16,26 +16,6 @@ #include "vp9/common/vp9_seg_common.h" -// This structure holds bit masks for all 8x8 blocks in a 64x64 region. -// Each 1 bit represents a position in which we want to apply the loop filter. -// Left_ entries refer to whether we apply a filter on the border to the -// left of the block. Above_ entries refer to whether or not to apply a -// filter on the above border. Int_ entries refer to whether or not to -// apply borders on the 4x4 edges within the 8x8 block that each bit -// represents. -// Since each transform is accompanied by a potentially different type of -// loop filter there is a different entry in the array for each transform size. -typedef struct { - uint64_t left_y[TX_SIZES]; - uint64_t above_y[TX_SIZES]; - uint64_t int_4x4_y; - uint16_t left_uv[TX_SIZES]; - uint16_t above_uv[TX_SIZES]; - uint16_t int_4x4_uv; - uint8_t lfl_y[64]; - uint8_t lfl_uv[16]; -} LOOP_FILTER_MASK; - // 64 bit masks for left transform size. Each 1 represents a position where // we should apply a loop filter across the left border of an 8x8 block // boundary. 
@@ -221,23 +201,10 @@ static const uint16_t size_mask_uv[BLOCK_SIZES] = { static const uint16_t left_border_uv = 0x1111; static const uint16_t above_border_uv = 0x000f; - -static void lf_init_lut(loop_filter_info_n *lfi) { - lfi->mode_lf_lut[DC_PRED] = 0; - lfi->mode_lf_lut[D45_PRED] = 0; - lfi->mode_lf_lut[D135_PRED] = 0; - lfi->mode_lf_lut[D117_PRED] = 0; - lfi->mode_lf_lut[D153_PRED] = 0; - lfi->mode_lf_lut[D207_PRED] = 0; - lfi->mode_lf_lut[D63_PRED] = 0; - lfi->mode_lf_lut[V_PRED] = 0; - lfi->mode_lf_lut[H_PRED] = 0; - lfi->mode_lf_lut[TM_PRED] = 0; - lfi->mode_lf_lut[ZEROMV] = 0; - lfi->mode_lf_lut[NEARESTMV] = 1; - lfi->mode_lf_lut[NEARMV] = 1; - lfi->mode_lf_lut[NEWMV] = 1; -} +static const int mode_lf_lut[MB_MODE_COUNT] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1 // INTER_MODES (ZEROMV == 0) +}; static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { int lvl; @@ -270,9 +237,6 @@ void vp9_loop_filter_init(VP9_COMMON *cm) { update_sharpness(lfi, lf->sharpness_level); lf->last_sharpness_level = lf->sharpness_level; - // init LUT for lvl and hev thr picking - lf_init_lut(lfi); - // init hev threshold const vectors for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); @@ -283,10 +247,10 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { // n_shift is the a multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 - const int n_shift = default_filt_lvl >> 5; + const int scale = 1 << (default_filt_lvl >> 5); loop_filter_info_n *const lfi = &cm->lf_info; struct loopfilter *const lf = &cm->lf; - struct segmentation *const seg = &cm->seg; + const struct segmentation *const seg = &cm->seg; // update limits if sharpness has changed if (lf->last_sharpness_level != lf->sharpness_level) { @@ -295,86 +259,130 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { } 
for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { - int lvl_seg = default_filt_lvl, ref, mode, intra_lvl; - - // Set the baseline filter values for each segment + int lvl_seg = default_filt_lvl; if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF); - lvl_seg = seg->abs_delta == SEGMENT_ABSDATA - ? data - : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER); + lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ? + data : default_filt_lvl + data, + 0, MAX_LOOP_FILTER); } if (!lf->mode_ref_delta_enabled) { // we could get rid of this if we assume that deltas are set to // zero when not in use; encoder always uses deltas vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id])); - continue; - } - - intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * (1 << n_shift); - lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); - - for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) - for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { - const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * (1 << n_shift) - + lf->mode_deltas[mode] * (1 << n_shift); - lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } else { + int ref, mode; + const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; + lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); + + for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) { + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + + lf->mode_deltas[mode] * scale; + lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } } + } } } -static uint8_t build_lfi(const loop_filter_info_n *lfi_n, - const MB_MODE_INFO *mbmi) { - const int seg = mbmi->segment_id; - const int ref = mbmi->ref_frame[0]; - const int mode = lfi_n->mode_lf_lut[mbmi->mode]; - const int filter_level = lfi_n->lvl[seg][ref][mode]; - - return filter_level; -} - -static void 
filter_selectively_vert(uint8_t *s, int pitch, - unsigned int mask_16x16, - unsigned int mask_8x8, - unsigned int mask_4x4, - unsigned int mask_4x4_int, - const loop_filter_info_n *lfi_n, - const uint8_t *lfl) { +static void filter_selectively_vert_row2(PLANE_TYPE plane_type, + uint8_t *s, int pitch, + unsigned int mask_16x16_l, + unsigned int mask_8x8_l, + unsigned int mask_4x4_l, + unsigned int mask_4x4_int_l, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + const int mask_shift = plane_type ? 4 : 8; + const int mask_cutoff = plane_type ? 0xf : 0xff; + const int lfl_forward = plane_type ? 4 : 8; + + unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff; + unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff; + unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff; + unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff; + unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff; + unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff; + unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff; + unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff; unsigned int mask; - for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; - mask; mask >>= 1) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 | + mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1; + mask; mask >>= 1) { + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); + // TODO(yunqingwang): count in loopfilter functions should be removed. 
if (mask & 1) { - if (mask_16x16 & 1) { - vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); - assert(!(mask_8x8 & 1)); - assert(!(mask_4x4 & 1)); - assert(!(mask_4x4_int & 1)); - } else if (mask_8x8 & 1) { - vp9_mbloop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - assert(!(mask_16x16 & 1)); - assert(!(mask_4x4 & 1)); - } else if (mask_4x4 & 1) { - vp9_loop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - assert(!(mask_16x16 & 1)); - assert(!(mask_8x8 & 1)); + if ((mask_16x16_0 | mask_16x16_1) & 1) { + if ((mask_16x16_0 & mask_16x16_1) & 1) { + vp9_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr); + } else if (mask_16x16_0 & 1) { + vp9_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr); + } else { + vp9_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim, + lfi1->lim, lfi1->hev_thr); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + if ((mask_8x8_0 & mask_8x8_1) & 1) { + vp9_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_8x8_0 & 1) { + vp9_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + 1); + } else { + vp9_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, 1); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + vp9_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_0 & 1) { + vp9_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + 1); + } else { + vp9_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, 1); + } + } + + if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) { + if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) { + vp9_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_int_0 & 1) { 
+ vp9_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, 1); + } else { + vp9_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, 1); + } } } - if (mask_4x4_int & 1) - vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + s += 8; lfl += 1; - mask_16x16 >>= 1; - mask_8x8 >>= 1; - mask_4x4 >>= 1; - mask_4x4_int >>= 1; + mask_16x16_0 >>= 1; + mask_8x8_0 >>= 1; + mask_4x4_0 >>= 1; + mask_4x4_int_0 >>= 1; + mask_16x16_1 >>= 1; + mask_8x8_1 >>= 1; + mask_4x4_1 >>= 1; + mask_4x4_int_1 >>= 1; } } @@ -396,95 +404,73 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, if (mask & 1) { if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); count = 2; } else { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } - assert(!(mask_8x8 & 1)); - assert(!(mask_4x4 & 1)); - assert(!(mask_4x4_int & 1)); } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { // Next block's thresholds const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); - // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. - vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - vp9_mbloop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1); + vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); if ((mask_4x4_int & 3) == 3) { - // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. 
- vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, - lfin->mblim, lfin->lim, - lfin->hev_thr, 1); + vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr); } else { if (mask_4x4_int & 1) - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); else if (mask_4x4_int & 2) - vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, - lfin->mblim, lfin->lim, - lfin->hev_thr, 1); + vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, 1); } count = 2; } else { - vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + vp9_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (mask_4x4_int & 1) - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } - assert(!(mask_16x16 & 1)); - assert(!(mask_4x4 & 1)); } else if (mask_4x4 & 1) { if ((mask_4x4 & 3) == 3) { // Next block's thresholds const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); - // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. - vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - vp9_loop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr, 1); - + vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); if ((mask_4x4_int & 3) == 3) { - // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. 
- vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, - lfin->mblim, lfin->lim, - lfin->hev_thr, 1); + vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr); } else { if (mask_4x4_int & 1) - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); else if (mask_4x4_int & 2) - vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, - lfin->mblim, lfin->lim, - lfin->hev_thr, 1); + vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, 1); } count = 2; } else { - vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + vp9_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - if (mask_4x4_int & 1) - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + if (mask_4x4_int & 1) + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } - assert(!(mask_16x16 & 1)); - assert(!(mask_8x8 & 1)); } else if (mask_4x4_int & 1) { - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } s += 8 * count; @@ -510,11 +496,10 @@ static void build_masks(const loop_filter_info_n *const lfi_n, const BLOCK_SIZE block_size = mi->mbmi.sb_type; const TX_SIZE tx_size_y = mi->mbmi.tx_size; const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi); - const int skip = mi->mbmi.skip_coeff; + const int skip = mi->mbmi.skip; const int seg = mi->mbmi.segment_id; const int ref = mi->mbmi.ref_frame[0]; - const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode]; - const int filter_level = lfi_n->lvl[seg][ref][mode]; + const int 
filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]]; uint64_t *left_y = &lfm->left_y[tx_size_y]; uint64_t *above_y = &lfm->above_y[tx_size_y]; uint64_t *int_4x4_y = &lfm->int_4x4_y; @@ -592,11 +577,10 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, LOOP_FILTER_MASK *lfm) { const BLOCK_SIZE block_size = mi->mbmi.sb_type; const TX_SIZE tx_size_y = mi->mbmi.tx_size; - const int skip = mi->mbmi.skip_coeff; + const int skip = mi->mbmi.skip; const int seg = mi->mbmi.segment_id; const int ref = mi->mbmi.ref_frame[0]; - const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode]; - const int filter_level = lfi_n->lvl[seg][ref][mode]; + const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]]; uint64_t *left_y = &lfm->left_y[tx_size_y]; uint64_t *above_y = &lfm->above_y[tx_size_y]; uint64_t *int_4x4_y = &lfm->int_4x4_y; @@ -634,9 +618,9 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. // TODO(JBB): This function only works for yv12. -static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, - MODE_INFO **mi_8x8, const int mode_info_stride, - LOOP_FILTER_MASK *lfm) { +void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, + MODE_INFO **mi_8x8, const int mode_info_stride, + LOOP_FILTER_MASK *lfm) { int idx_32, idx_16, idx_8; const loop_filter_info_n *const lfi_n = &cm->lf_info; MODE_INFO **mip = mi_8x8; @@ -864,9 +848,66 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, lfm->left_uv[i] &= 0xeeee; } } + + // Assert if we try to apply 2 different loop filters at the same position. 
+ assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8])); + assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4])); + assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4])); + assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16])); + assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8])); + assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4])); + assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4])); + assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16])); + assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8])); + assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4])); + assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4])); + assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16])); + assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8])); + assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4])); + assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4])); + assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16])); +} + +static uint8_t build_lfi(const loop_filter_info_n *lfi_n, + const MB_MODE_INFO *mbmi) { + const int seg = mbmi->segment_id; + const int ref = mbmi->ref_frame[0]; + return lfi_n->lvl[seg][ref][mode_lf_lut[mbmi->mode]]; +} + +static void filter_selectively_vert(uint8_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + unsigned int mask; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= 1) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + + if (mask & 1) { + if (mask_16x16 & 1) { + vp9_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } else if (mask_8x8 & 1) { + vp9_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + } else if (mask_4x4 & 1) { + vp9_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + } + } + if (mask_4x4_int & 1) + vp9_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + s += 8; + lfl += 1; + mask_16x16 
>>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } } -#if CONFIG_NON420 static void filter_block_plane_non420(VP9_COMMON *cm, struct macroblockd_plane *plane, MODE_INFO **mi_8x8, @@ -894,15 +935,15 @@ static void filter_block_plane_non420(VP9_COMMON *cm, // Determine the vertical edges that need filtering for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const MODE_INFO *mi = mi_8x8[c]; - const int skip_this = mi[0].mbmi.skip_coeff - && is_inter_block(&mi[0].mbmi); + const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type; + const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi); // left edge of current unit is block/partition edge -> no skip - const int block_edge_left = b_width_log2(mi[0].mbmi.sb_type) ? - !(c & ((1 << (b_width_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1; + const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ? + !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1; const int skip_this_c = skip_this && !block_edge_left; // top edge of current unit is block/partition edge -> no skip - const int block_edge_above = b_height_log2(mi[0].mbmi.sb_type) ? - !(r & ((1 << (b_height_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1; + const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ? + !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1; const int skip_this_r = skip_this && !block_edge_above; const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) ? 
get_uv_tx_size(&mi[0].mbmi) @@ -1004,15 +1045,13 @@ static void filter_block_plane_non420(VP9_COMMON *cm, dst->buf += 8 * dst->stride; } } -#endif -static void filter_block_plane(VP9_COMMON *const cm, - struct macroblockd_plane *const plane, - int mi_row, - LOOP_FILTER_MASK *lfm) { +void vp9_filter_block_plane(VP9_COMMON *const cm, + struct macroblockd_plane *const plane, + int mi_row, + LOOP_FILTER_MASK *lfm) { struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; - unsigned int mask_4x4_int_row[MI_BLOCK_SIZE] = {0}; int r, c; if (!plane->plane_type) { @@ -1021,23 +1060,27 @@ static void filter_block_plane(VP9_COMMON *const cm, uint64_t mask_4x4 = lfm->left_y[TX_4X4]; uint64_t mask_4x4_int = lfm->int_4x4_y; - // Vertical pass - for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) { - mask_4x4_int_row[r] = mask_4x4_int & 0xff; + // Vertical pass: do 2 rows at one time + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + unsigned int mask_16x16_l = mask_16x16 & 0xffff; + unsigned int mask_8x8_l = mask_8x8 & 0xffff; + unsigned int mask_4x4_l = mask_4x4 & 0xffff; + unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff; // Disable filtering on the leftmost column - filter_selectively_vert(dst->buf, dst->stride, - mask_16x16 & 0xff, - mask_8x8 & 0xff, - mask_4x4 & 0xff, - mask_4x4_int_row[r], - &cm->lf_info, &lfm->lfl_y[r << 3]); - - dst->buf += 8 * dst->stride; - mask_16x16 >>= 8; - mask_8x8 >>= 8; - mask_4x4 >>= 8; - mask_4x4_int >>= 8; + filter_selectively_vert_row2(plane->plane_type, + dst->buf, dst->stride, + mask_16x16_l, + mask_8x8_l, + mask_4x4_l, + mask_4x4_int_l, + &cm->lf_info, &lfm->lfl_y[r << 3]); + + dst->buf += 16 * dst->stride; + mask_16x16 >>= 16; + mask_8x8 >>= 16; + mask_4x4 >>= 16; + mask_4x4_int >>= 16; } // Horizontal pass @@ -1045,6 +1088,7 @@ static void filter_block_plane(VP9_COMMON *const cm, mask_16x16 = lfm->above_y[TX_16X16]; mask_8x8 = lfm->above_y[TX_8X8]; mask_4x4 = lfm->above_y[TX_4X4]; + 
mask_4x4_int = lfm->int_4x4_y; for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) { unsigned int mask_16x16_r; @@ -1065,13 +1109,14 @@ static void filter_block_plane(VP9_COMMON *const cm, mask_16x16_r, mask_8x8_r, mask_4x4_r, - mask_4x4_int_row[r], + mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3]); dst->buf += 8 * dst->stride; mask_16x16 >>= 8; mask_8x8 >>= 8; mask_4x4 >>= 8; + mask_4x4_int >>= 8; } } else { uint16_t mask_16x16 = lfm->left_uv[TX_16X16]; @@ -1079,27 +1124,37 @@ static void filter_block_plane(VP9_COMMON *const cm, uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; uint16_t mask_4x4_int = lfm->int_4x4_uv; - // Vertical pass - for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + // Vertical pass: do 2 rows at one time + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) { if (plane->plane_type == 1) { - for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) + for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) { lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)]; + lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + + (c << 1)]; + } } - mask_4x4_int_row[r] = mask_4x4_int & 0xf; - // Disable filtering on the leftmost column - filter_selectively_vert(dst->buf, dst->stride, - mask_16x16 & 0xf, - mask_8x8 & 0xf, - mask_4x4 & 0xf, - mask_4x4_int_row[r], - &cm->lf_info, &lfm->lfl_uv[r << 1]); - - dst->buf += 8 * dst->stride; - mask_16x16 >>= 4; - mask_8x8 >>= 4; - mask_4x4 >>= 4; - mask_4x4_int >>= 4; + { + unsigned int mask_16x16_l = mask_16x16 & 0xff; + unsigned int mask_8x8_l = mask_8x8 & 0xff; + unsigned int mask_4x4_l = mask_4x4 & 0xff; + unsigned int mask_4x4_int_l = mask_4x4_int & 0xff; + + // Disable filtering on the leftmost column + filter_selectively_vert_row2(plane->plane_type, + dst->buf, dst->stride, + mask_16x16_l, + mask_8x8_l, + mask_4x4_l, + mask_4x4_int_l, + &cm->lf_info, &lfm->lfl_uv[r << 1]); + + dst->buf += 16 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + 
mask_4x4_int >>= 8; + } } // Horizontal pass @@ -1107,11 +1162,12 @@ static void filter_block_plane(VP9_COMMON *const cm, mask_16x16 = lfm->above_uv[TX_16X16]; mask_8x8 = lfm->above_uv[TX_8X8]; mask_4x4 = lfm->above_uv[TX_4X4]; + mask_4x4_int = lfm->int_4x4_uv; for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1; const unsigned int mask_4x4_int_r = skip_border_4x4_r ? - 0 : (mask_4x4_int_row[r]); + 0 : (mask_4x4_int & 0xf); unsigned int mask_16x16_r; unsigned int mask_8x8_r; unsigned int mask_4x4_r; @@ -1137,6 +1193,7 @@ static void filter_block_plane(VP9_COMMON *const cm, mask_16x16 >>= 4; mask_8x8 >>= 4; mask_4x4 >>= 4; + mask_4x4_int >>= 4; } } } @@ -1147,10 +1204,8 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, const int num_planes = y_only ? 1 : MAX_MB_PLANE; int mi_row, mi_col; LOOP_FILTER_MASK lfm; -#if CONFIG_NON420 int use_420 = y_only || (xd->plane[1].subsampling_y == 1 && xd->plane[1].subsampling_x == 1); -#endif for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride; @@ -1158,25 +1213,19 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { int plane; - setup_dst_planes(xd, frame_buffer, mi_row, mi_col); + vp9_setup_dst_planes(xd, frame_buffer, mi_row, mi_col); // TODO(JBB): Make setup_mask work for non 420. 
-#if CONFIG_NON420 if (use_420) -#endif - setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride, - &lfm); + vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, + cm->mode_info_stride, &lfm); for (plane = 0; plane < num_planes; ++plane) { -#if CONFIG_NON420 if (use_420) -#endif - filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); -#if CONFIG_NON420 + vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); else filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row, mi_col); -#endif } } } @@ -1184,12 +1233,12 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, int frame_filter_level, - int y_only, int partial) { + int y_only, int partial_frame) { int start_mi_row, end_mi_row, mi_rows_to_filter; if (!frame_filter_level) return; start_mi_row = 0; mi_rows_to_filter = cm->mi_rows; - if (partial && cm->mi_rows > 8) { + if (partial_frame && cm->mi_rows > 8) { start_mi_row = cm->mi_rows >> 1; start_mi_row &= 0xfffffff8; mi_rows_to_filter = MAX(cm->mi_rows / 8, 8); diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index 62389ea..97ae9d2 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -17,6 +17,10 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_seg_common.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MAX_LOOP_FILTER 63 #define MAX_SHARPNESS 7 @@ -54,12 +58,44 @@ typedef struct { typedef struct { loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; - uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; +// This structure holds bit masks for all 8x8 blocks in a 64x64 region. +// Each 1 bit represents a position in which we want to apply the loop filter. +// Left_ entries refer to whether we apply a filter on the border to the +// left of the block. 
Above_ entries refer to whether or not to apply a +// filter on the above border. Int_ entries refer to whether or not to +// apply borders on the 4x4 edges within the 8x8 block that each bit +// represents. +// Since each transform is accompanied by a potentially different type of +// loop filter there is a different entry in the array for each transform size. +typedef struct { + uint64_t left_y[TX_SIZES]; + uint64_t above_y[TX_SIZES]; + uint64_t int_4x4_y; + uint16_t left_uv[TX_SIZES]; + uint16_t above_uv[TX_SIZES]; + uint16_t int_4x4_uv; + uint8_t lfl_y[64]; + uint8_t lfl_uv[16]; +} LOOP_FILTER_MASK; + /* assorted loopfilter functions which get used elsewhere */ struct VP9Common; struct macroblockd; +struct VP9LfSyncData; + +// This function sets up the bit masks for the entire 64x64 region represented +// by mi_row, mi_col. +void vp9_setup_mask(struct VP9Common *const cm, + const int mi_row, const int mi_col, + MODE_INFO **mi_8x8, const int mode_info_stride, + LOOP_FILTER_MASK *lfm); + +void vp9_filter_block_plane(struct VP9Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, + LOOP_FILTER_MASK *lfm); void vp9_loop_filter_init(struct VP9Common *cm); @@ -71,7 +107,7 @@ void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd, int filter_level, - int y_only, int partial); + int y_only, int partial_frame); // Apply the loop filter to [start, stop) macro block rows in frame_buffer. void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, @@ -87,8 +123,15 @@ typedef struct LoopFilterWorkerData { int start; int stop; int y_only; + + struct VP9LfSyncData *lf_sync; + int num_lf_workers; } LFWorkerData; // Operates on the rows described by LFWorkerData passed as 'arg1'. 
int vp9_loop_filter_worker(void *arg1, void *arg2); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/libvpx/vp9/common/vp9_loopfilter_filters.c b/libvpx/vp9/common/vp9_loopfilter_filters.c index 2c4bf6c..25d3311 100644 --- a/libvpx/vp9/common/vp9_loopfilter_filters.c +++ b/libvpx/vp9/common/vp9_loopfilter_filters.c @@ -70,7 +70,7 @@ static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, return hev; } -static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, +static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { int8_t filter1, filter2; @@ -78,6 +78,7 @@ static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, const int8_t ps0 = (int8_t) *op0 ^ 0x80; const int8_t qs0 = (int8_t) *oq0 ^ 0x80; const int8_t qs1 = (int8_t) *oq1 ^ 0x80; + const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); // add outer taps if we have high edge variance int8_t filter = signed_char_clamp(ps1 - qs1) & hev; @@ -101,11 +102,9 @@ static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; } -void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -115,17 +114,22 @@ void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */, const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); - filter4(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); 
++s; } } -void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -135,13 +139,21 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); - filter4(mask, hev, s - 2, s - 1, s, s + 1); + filter4(mask, *thresh, s - 2, s - 1, s, s + 1); s += pitch; } } -static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat, +void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, + thresh1, 1); +} + +static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, @@ -158,15 +170,13 @@ static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat, *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { - filter4(mask, hev, op1, op0, oq0, oq1); + filter4(mask, thresh, op1, op0, oq0, oq1); } } 
-void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -177,19 +187,24 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p, const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, hev, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p); ++s; } } -void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; for (i = 0; i < 8 * count; ++i) { @@ -197,15 +212,23 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3); + filter8(mask, 
*thresh, flat, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3); s += pitch; } } -static INLINE void filter16(int8_t mask, uint8_t hev, +void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, + thresh1, 1); +} + +static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, uint8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, uint8_t *op4, @@ -252,15 +275,13 @@ static INLINE void filter16(int8_t mask, uint8_t hev, *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); } else { - filter8(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); } } -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -270,13 +291,12 @@ void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat2 = flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); - filter16(mask, hev, flat, flat2, + filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, @@ -285,25 
+305,35 @@ void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, } } -void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { int i; - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7]); - filter16(mask, hev, flat, flat2, + filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); s += p; } } + +void vp9_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); +} + +void vp9_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); +} diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h index 31a79b9..3eb7f9d 100644 --- a/libvpx/vp9/common/vp9_mv.h +++ b/libvpx/vp9/common/vp9_mv.h @@ -15,7 +15,11 @@ #include "vp9/common/vp9_common.h" -typedef struct { +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct mv { int16_t row; int16_t col; } MV; @@ -25,15 +29,19 @@ typedef union int_mv { MV as_mv; } int_mv; /* facilitates faster equality tests and copies */ -typedef struct { +typedef struct mv32 { int32_t row; int32_t col; } MV32; -static void clamp_mv(MV *mv, int min_col, int max_col, - int min_row, int 
max_row) { +static INLINE void clamp_mv(MV *mv, int min_col, int max_col, + int min_row, int max_row) { mv->col = clamp(mv->col, min_col, max_col); mv->row = clamp(mv->row, min_row, max_row); } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_MV_H_ diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c index 8df8aec..9f2c2df 100644 --- a/libvpx/vp9/common/vp9_mvref_common.c +++ b/libvpx/vp9/common/vp9_mvref_common.c @@ -13,6 +13,11 @@ #define MVREF_NEIGHBOURS 8 +typedef struct position { + int row; + int col; +} POSITION; + typedef enum { BOTH_ZERO = 0, ZERO_PLUS_PREDICTED = 1, @@ -71,7 +76,7 @@ static const int counter_to_context[19] = { BOTH_INTRA // 18 }; -static const MV mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { +static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { // 4X4 {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, // 4X8 @@ -172,26 +177,27 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, // are inside the borders of the tile. static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, int mi_rows, - const MV *mv) { - return !(mi_row + mv->row < 0 || - mi_col + mv->col < tile->mi_col_start || - mi_row + mv->row >= mi_rows || - mi_col + mv->col >= tile->mi_col_end); + const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < 0 || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= mi_rows || + mi_col + mi_pos->col >= tile->mi_col_end); } // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. 
-void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int block_idx, - int mi_row, int mi_col) { +static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int block, int mi_row, int mi_col) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; - const MV *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; - const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL; + const MODE_INFO *prev_mi = cm->coding_use_prev_mi && cm->prev_mi ? + xd->prev_mi_8x8[0] : NULL; + const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; + const MB_MODE_INFO *const prev_mbmi = cm->coding_use_prev_mi && prev_mi ? + &prev_mi->mbmi : NULL; int different_ref_found = 0; int context_counter = 0; @@ -202,26 +208,19 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // if the size < 8x8 we get the mv from the bmi substructure, // and we also need to keep a mode count. for (i = 0; i < 2; ++i) { - const MV *const mv_ref = &mv_ref_search[i]; + const POSITION *const mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]; const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; // Keep counts for entropy encoding. context_counter += mode_2_counter[candidate->mode]; + different_ref_found = 1; - // Check if the candidate comes from the same reference frame. 
- if (candidate->ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, - mv_ref->col, block_idx)); - different_ref_found = candidate->ref_frame[1] != ref_frame; - } else { - if (candidate->ref_frame[1] == ref_frame) - // Add second motion vector if it has the same ref_frame. - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, - mv_ref->col, block_idx)); - different_ref_found = 1; - } + if (candidate->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block)); + else if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block)); } } @@ -229,20 +228,17 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // as before except we don't need to keep track of sub blocks or // mode counts. for (; i < MVREF_NEIGHBOURS; ++i) { - const MV *const mv_ref = &mv_ref_search[i]; + const POSITION *const mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]->mbmi; + different_ref_found = 1; - if (candidate->ref_frame[0] == ref_frame) { + if (candidate->ref_frame[0] == ref_frame) ADD_MV_REF_LIST(candidate->mv[0]); - different_ref_found = candidate->ref_frame[1] != ref_frame; - } else { - if (candidate->ref_frame[1] == ref_frame) - ADD_MV_REF_LIST(candidate->mv[1]); - different_ref_found = 1; - } + else if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(candidate->mv[1]); } } @@ -259,7 +255,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // different reference frames. 
if (different_ref_found) { for (i = 0; i < MVREF_NEIGHBOURS; ++i) { - const MV *mv_ref = &mv_ref_search[i]; + const POSITION *mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + mv_ref->row @@ -284,3 +280,84 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) clamp_mv_ref(&mv_ref_list[i].as_mv, xd); } + +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col) { + find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1, + mi_row, mi_col); +} + +static void lower_mv_precision(MV *mv, int allow_hp) { + const int use_hp = allow_hp && vp9_use_mv_hp(mv); + if (!use_hp) { + if (mv->row & 1) + mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) + mv->col += (mv->col > 0 ? -1 : 1); + } +} + + +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, + int_mv *mvlist, int_mv *nearest, int_mv *near) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp); + clamp_mv2(&mvlist[i].as_mv, xd); + } + *nearest = mvlist[0]; + *near = mvlist[1]; +} + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, + int block, int ref, int mi_row, int mi_col, + int_mv *nearest, int_mv *near) { + int_mv mv_list[MAX_MV_REF_CANDIDATES]; + MODE_INFO *const mi = xd->mi_8x8[0]; + b_mode_info *bmi = mi->bmi; + int n; + + assert(MAX_MV_REF_CANDIDATES == 2); + + find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block, + mi_row, mi_col); + + near->as_int = 0; + switch (block) { + case 0: + nearest->as_int = mv_list[0].as_int; + near->as_int = mv_list[1].as_int; + break; + case 1: + case 2: + nearest->as_int = 
bmi[0].as_mv[ref].as_int; + for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n) + if (nearest->as_int != mv_list[n].as_int) { + near->as_int = mv_list[n].as_int; + break; + } + break; + case 3: { + int_mv candidates[2 + MAX_MV_REF_CANDIDATES]; + candidates[0] = bmi[1].as_mv[ref]; + candidates[1] = bmi[0].as_mv[ref]; + candidates[2] = mv_list[0]; + candidates[3] = mv_list[1]; + + nearest->as_int = bmi[2].as_mv[ref].as_int; + for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n) + if (nearest->as_int != candidates[n].as_int) { + near->as_int = candidates[n].as_int; + break; + } + break; + } + default: + assert("Invalid block index."); + } +} diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h index ce4c559..903ac02 100644 --- a/libvpx/vp9/common/vp9_mvref_common.h +++ b/libvpx/vp9/common/vp9_mvref_common.h @@ -7,29 +7,46 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ +#define VP9_COMMON_VP9_MVREF_COMMON_H_ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_blockd.h" -#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ -#define VP9_COMMON_VP9_MVREF_COMMON_H_ +#ifdef __cplusplus +extern "C" { +#endif + +#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\ + VP9_INTERP_EXTEND) << 3) -void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int block_idx, - int mi_row, int mi_col); - -static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int mi_row, int mi_col) { - vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, - mv_ref_list, -1, mi_row, 
mi_col); +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, int mi_row, int mi_col); + +// check a list of motion vectors by sad score using a number rows of pixels +// above and a number cols of pixels in the left to select the one with best +// score to use as ref motion vector +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, + int_mv *mvlist, int_mv *nearest, int_mv *near); + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, + int block, int ref, int mi_row, int mi_col, + int_mv *nearest, int_mv *near); + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_onyx.h b/libvpx/vp9/common/vp9_onyx.h deleted file mode 100644 index 452dd6b..0000000 --- a/libvpx/vp9/common/vp9_onyx.h +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_ONYX_H_ -#define VP9_COMMON_VP9_ONYX_H_ - -#ifdef __cplusplus -extern "C" -{ // NOLINT -#endif - -#include "./vpx_config.h" -#include "vpx/internal/vpx_codec_internal.h" -#include "vpx/vp8cx.h" -#include "vpx_scale/yv12config.h" -#include "vp9/common/vp9_ppflags.h" - -#define MAX_SEGMENTS 8 - - typedef int *VP9_PTR; - - /* Create/destroy static data structures. */ - - typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 - } VPX_SCALING; - - typedef enum { - VP9_LAST_FLAG = 1, - VP9_GOLD_FLAG = 2, - VP9_ALT_FLAG = 4 - } VP9_REFFRAME; - - - typedef enum { - USAGE_STREAM_FROM_SERVER = 0x0, - USAGE_LOCAL_FILE_PLAYBACK = 0x1, - USAGE_CONSTRAINED_QUALITY = 0x2, - USAGE_CONSTANT_QUALITY = 0x3, - } END_USAGE; - - - typedef enum { - MODE_GOODQUALITY = 0x1, - MODE_BESTQUALITY = 0x2, - MODE_FIRSTPASS = 0x3, - MODE_SECONDPASS = 0x4, - MODE_SECONDPASS_BEST = 0x5, - } MODE; - - typedef enum { - FRAMEFLAGS_KEY = 1, - FRAMEFLAGS_GOLDEN = 2, - FRAMEFLAGS_ALTREF = 4, - } FRAMETYPE_FLAGS; - - typedef struct { - int version; // 4 versions of bitstream defined: - // 0 - best quality/slowest decode, - // 3 - lowest quality/fastest decode - int width; // width of data passed to the compressor - int height; // height of data passed to the compressor - double framerate; // set to passed in framerate - int64_t target_bandwidth; // bandwidth to be used in kilobits per second - - int noise_sensitivity; // pre processing blur: recommendation 0 - int Sharpness; // sharpening output: recommendation 0: - int cpu_used; - unsigned int rc_max_intra_bitrate_pct; - - // mode -> - // (0)=Realtime/Live Encoding. This mode is optimized for realtime - // encoding (for example, capturing a television signal or feed from - // a live camera). ( speed setting controls how fast ) - // (1)=Good Quality Fast Encoding. The encoder balances quality with the - // amount of time it takes to encode the output. 
( speed setting - // controls how fast ) - // (2)=One Pass - Best Quality. The encoder places priority on the - // quality of the output over encoding speed. The output is compressed - // at the highest possible quality. This option takes the longest - // amount of time to encode. ( speed setting ignored ) - // (3)=Two Pass - First Pass. The encoder generates a file of statistics - // for use in the second encoding pass. ( speed setting controls how - // fast ) - // (4)=Two Pass - Second Pass. The encoder uses the statistics that were - // generated in the first encoding pass to create the compressed - // output. ( speed setting controls how fast ) - // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that - // were generated in the first encoding pass to create the compressed - // output using the highest possible quality, and taking a - // longer amount of time to encode.. ( speed setting ignored ) - int Mode; - - // Key Framing Operations - int auto_key; // autodetect cut scenes and set the keyframes - int key_freq; // maximum distance to key frame. 
- - int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) - int lag_in_frames; // how many frames lag before we start encoding - - // ---------------------------------------------------------------- - // DATARATE CONTROL OPTIONS - - int end_usage; // vbr or cbr - - // buffer targeting aggressiveness - int under_shoot_pct; - int over_shoot_pct; - - // buffering parameters - int64_t starting_buffer_level; // in seconds - int64_t optimal_buffer_level; - int64_t maximum_buffer_size; - - // controlling quality - int fixed_q; - int worst_allowed_q; - int best_allowed_q; - int cq_level; - int lossless; - - // two pass datarate control - int two_pass_vbrbias; // two pass datarate control tweaks - int two_pass_vbrmin_section; - int two_pass_vbrmax_section; - // END DATARATE CONTROL OPTIONS - // ---------------------------------------------------------------- - - // Spatial scalability - int ss_number_layers; - - // these parameters aren't to be used in final build don't use!!! - int play_alternate; - int alt_freq; - - int encode_breakout; // early breakout : for video conf recommend 800 - - /* Bitfield defining the error resiliency features to enable. - * Can provide decodable frames after losses in previous - * frames and decodable partitions after losses in the same frame. - */ - unsigned int error_resilient_mode; - - /* Bitfield defining the parallel decoding mode where the - * decoding in successive frames may be conducted in parallel - * just by decoding the frame headers. 
- */ - unsigned int frame_parallel_decoding_mode; - - int arnr_max_frames; - int arnr_strength; - int arnr_type; - - int tile_columns; - int tile_rows; - - struct vpx_fixed_buf two_pass_stats_in; - struct vpx_codec_pkt_list *output_pkt_list; - - vp8e_tuning tuning; - } VP9_CONFIG; - - - void vp9_initialize_enc(); - - VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf); - void vp9_remove_compressor(VP9_PTR *comp); - - void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf); - - // receive a frames worth of data. caller can assume that a copy of this - // frame is made and not just a copy of the pointer.. - int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); - - int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags, - unsigned long *size, unsigned char *dest, - int64_t *time_stamp, int64_t *time_end, - int flush); - - int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); - - int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags); - - int vp9_update_reference(VP9_PTR comp, int ref_frame_flags); - - int vp9_copy_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); - - int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb); - - int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); - - int vp9_update_entropy(VP9_PTR comp, int update); - - int vp9_set_roimap(VP9_PTR comp, unsigned char *map, - unsigned int rows, unsigned int cols, - int delta_q[MAX_SEGMENTS], - int delta_lf[MAX_SEGMENTS], - unsigned int threshold[MAX_SEGMENTS]); - - int vp9_set_active_map(VP9_PTR comp, unsigned char *map, - unsigned int rows, unsigned int cols); - - int vp9_set_internal_size(VP9_PTR comp, - VPX_SCALING horiz_mode, VPX_SCALING vert_mode); - - int vp9_set_size_literal(VP9_PTR comp, unsigned int width, - unsigned int height); - - void vp9_set_svc(VP9_PTR comp, 
int use_svc); - - int vp9_get_quantizer(VP9_PTR c); - -#ifdef __cplusplus -} -#endif - -#endif // VP9_COMMON_VP9_ONYX_H_ diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index a2af57a..52889f7 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -18,6 +18,7 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_frame_buffers.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_tile_common.h" @@ -25,62 +26,42 @@ #include "vp9/common/vp9_postproc.h" #endif -#define ALLOWED_REFS_PER_FRAME 3 +#ifdef __cplusplus +extern "C" { +#endif + +#define REFS_PER_FRAME 3 -#define NUM_REF_FRAMES_LOG2 3 -#define NUM_REF_FRAMES (1 << NUM_REF_FRAMES_LOG2) +#define REF_FRAMES_LOG2 3 +#define REF_FRAMES (1 << REF_FRAMES_LOG2) // 1 scratch frame for the new frame, 3 for scaled references on the encoder // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. 
-#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4) - -#define NUM_FRAME_CONTEXTS_LOG2 2 -#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2) - -typedef struct frame_contexts { - vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; - vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; - vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; - vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] - [SWITCHABLE_FILTERS - 1]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; - vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; - vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; - vp9_prob single_ref_prob[REF_CONTEXTS][2]; - vp9_prob comp_ref_prob[REF_CONTEXTS]; - struct tx_probs tx_probs; - vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; - nmv_context nmvc; -} FRAME_CONTEXT; +#define FRAME_BUFFERS (REF_FRAMES + 4) -typedef struct { - unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; - unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; - unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES]; - vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES]; - unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES] - [COEF_BANDS][PREV_COEF_CONTEXTS]; - unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] - [SWITCHABLE_FILTERS]; - unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; - unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; - unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; - unsigned int single_ref[REF_CONTEXTS][2][2]; - unsigned int comp_ref[REF_CONTEXTS][2]; - struct tx_counts tx; - unsigned int mbskip[MBSKIP_CONTEXTS][2]; - nmv_context_counts mv; -} FRAME_COUNTS; +#define FRAME_CONTEXTS_LOG2 2 +#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) + +extern const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES]; typedef enum { - SINGLE_PREDICTION_ONLY = 0, - COMP_PREDICTION_ONLY = 1, - HYBRID_PREDICTION = 2, - 
NB_PREDICTION_TYPES = 3, -} COMPPREDMODE_TYPE; + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} REFERENCE_MODE; + + +typedef struct { + int ref_count; + vpx_codec_frame_buffer_t raw_frame_buffer; + YV12_BUFFER_CONFIG buf; +} RefCntBuffer; typedef struct VP9Common { struct vpx_internal_error_info error; @@ -108,17 +89,16 @@ typedef struct VP9Common { YV12_BUFFER_CONFIG *frame_to_show; - YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; - int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; /* reference counts */ - int ref_frame_map[NUM_REF_FRAMES]; /* maps fb_idx to reference slot */ + RefCntBuffer frame_bufs[FRAME_BUFFERS]; + + int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */ // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and // roll new_fb_idx into it. - // Each frame can reference ALLOWED_REFS_PER_FRAME buffers - int active_ref_idx[ALLOWED_REFS_PER_FRAME]; - struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME]; - struct scale_factors_common active_ref_scale_comm[ALLOWED_REFS_PER_FRAME]; + // Each frame can reference REFS_PER_FRAME buffers + RefBuffer frame_refs[REFS_PER_FRAME]; + int new_fb_idx; YV12_BUFFER_CONFIG post_proc_buffer; @@ -128,6 +108,7 @@ typedef struct VP9Common { int show_frame; int last_show_frame; + int show_existing_frame; // Flag signaling that the frame is encoded using only INTRA modes. int intra_only; @@ -175,7 +156,7 @@ typedef struct VP9Common { // Persistent mb segment id map used in prediction. 
unsigned char *last_frame_seg_map; - INTERPOLATION_TYPE mcomp_filter_type; + INTERP_FILTER interp_filter; loop_filter_info_n lf_info; @@ -190,10 +171,10 @@ typedef struct VP9Common { int allow_comp_inter_inter; MV_REFERENCE_FRAME comp_fixed_ref; MV_REFERENCE_FRAME comp_var_ref[2]; - COMPPREDMODE_TYPE comp_pred_mode; + REFERENCE_MODE reference_mode; FRAME_CONTEXT fc; /* this frame entropy */ - FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS]; + FRAME_CONTEXT frame_contexts[FRAME_CONTEXTS]; unsigned int frame_context_idx; /* Context to use/update */ FRAME_COUNTS counts; @@ -207,45 +188,54 @@ typedef struct VP9Common { int error_resilient_mode; int frame_parallel_decoding_mode; + // Flag indicates if prev_mi can be used in coding: + // 0: encoder assumes decoder does not have prev_mi + // 1: encoder assumes decoder has and uses prev_mi + unsigned int coding_use_prev_mi; + int log2_tile_cols, log2_tile_rows; -} VP9_COMMON; -// ref == 0 => LAST_FRAME -// ref == 1 => GOLDEN_FRAME -// ref == 2 => ALTREF_FRAME -static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) { - return &cm->yv12_fb[cm->active_ref_idx[ref]]; -} + // Private data associated with the frame buffer callbacks. + void *cb_priv; + vpx_get_frame_buffer_cb_fn_t get_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_fb_cb; -static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { - return &cm->yv12_fb[cm->new_fb_idx]; + // Handles memory for the codec. 
+ InternalFrameBufferList int_frame_buffers; +} VP9_COMMON; + +static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { + return &cm->frame_bufs[cm->new_fb_idx].buf; } -static int get_free_fb(VP9_COMMON *cm) { +static INLINE int get_free_fb(VP9_COMMON *cm) { int i; - for (i = 0; i < NUM_YV12_BUFFERS; i++) - if (cm->fb_idx_ref_cnt[i] == 0) + for (i = 0; i < FRAME_BUFFERS; i++) + if (cm->frame_bufs[i].ref_count == 0) break; - assert(i < NUM_YV12_BUFFERS); - cm->fb_idx_ref_cnt[i] = 1; + assert(i < FRAME_BUFFERS); + cm->frame_bufs[i].ref_count = 1; return i; } -static void ref_cnt_fb(int *buf, int *idx, int new_idx) { - if (buf[*idx] > 0) - buf[*idx]--; +static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { + const int ref_index = *idx; + + if (ref_index >= 0 && bufs[ref_index].ref_count > 0) + bufs[ref_index].ref_count--; *idx = new_idx; - buf[new_idx]++; + bufs[new_idx].ref_count++; } -static int mi_cols_aligned_to_sb(int n_mis) { +static INLINE int mi_cols_aligned_to_sb(int n_mis) { return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); } -static INLINE const vp9_prob* get_partition_probs(VP9_COMMON *cm, int ctx) { +static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm, + int ctx) { return cm->frame_type == KEY_FRAME ? 
vp9_kf_partition_probs[ctx] : cm->fc.partition_prob[ctx]; } @@ -265,10 +255,10 @@ static INLINE void set_skip_context( } } -static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, - int mi_row, int bh, - int mi_col, int bw, - int mi_rows, int mi_cols) { +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, + int mi_col, int bw, + int mi_rows, int mi_cols) { xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); @@ -279,10 +269,9 @@ static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, xd->left_available = (mi_col > tile->mi_col_start); } -static void set_prev_mi(VP9_COMMON *cm) { +static INLINE void set_prev_mi(VP9_COMMON *cm) { const int use_prev_in_find_mv_refs = cm->width == cm->last_width && cm->height == cm->last_height && - !cm->error_resilient_mode && !cm->intra_only && cm->last_show_frame; // Special case: set prev_mi to NULL when the previous mode info @@ -298,54 +287,46 @@ static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { static INLINE void update_partition_context( PARTITION_CONTEXT *above_seg_context, PARTITION_CONTEXT left_seg_context[8], - int mi_row, int mi_col, - BLOCK_SIZE sb_type, - BLOCK_SIZE sb_size) { - PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; - PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); - - const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; - const int bwl = b_width_log2(sb_type); - const int bhl = b_height_log2(sb_type); - const int boffset = b_width_log2(BLOCK_64X64) - bsl; - const char pcval0 = ~(0xe << boffset); - const char pcval1 = ~(0xf << boffset); - const char pcvalue[2] = {pcval0, pcval1}; - - assert(MAX(bwl, bhl) <= bsl); + int mi_row, int mi_col, BLOCK_SIZE subsize, BLOCK_SIZE bsize) { + PARTITION_CONTEXT *const above_ctx = above_seg_context + mi_col; + PARTITION_CONTEXT 
*const left_ctx = left_seg_context + (mi_row & MI_MASK); + + // num_4x4_blocks_wide_lookup[bsize] / 2 + const int bs = num_8x8_blocks_wide_lookup[bsize]; // update the partition context at the end notes. set partition bits // of block sizes larger than the current one to be one, and partition // bits of smaller block sizes to be zero. - vpx_memset(above_ctx, pcvalue[bwl == bsl], bs); - vpx_memset(left_ctx, pcvalue[bhl == bsl], bs); + vpx_memset(above_ctx, partition_context_lookup[subsize].above, bs); + vpx_memset(left_ctx, partition_context_lookup[subsize].left, bs); } static INLINE int partition_plane_context( const PARTITION_CONTEXT *above_seg_context, const PARTITION_CONTEXT left_seg_context[8], - int mi_row, int mi_col, - BLOCK_SIZE sb_type) { + int mi_row, int mi_col, BLOCK_SIZE bsize) { const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); - int bsl = mi_width_log2(sb_type), bs = 1 << bsl; + const int bsl = mi_width_log2(bsize); + const int bs = 1 << bsl; int above = 0, left = 0, i; - int boffset = mi_width_log2(BLOCK_64X64) - bsl; - assert(mi_width_log2(sb_type) == mi_height_log2(sb_type)); + assert(b_width_log2(bsize) == b_height_log2(bsize)); assert(bsl >= 0); - assert(boffset >= 0); - - for (i = 0; i < bs; i++) - above |= (above_ctx[i] & (1 << boffset)); - for (i = 0; i < bs; i++) - left |= (left_ctx[i] & (1 << boffset)); - above = (above > 0); - left = (left > 0); + for (i = 0; i < bs; i++) { + above |= above_ctx[i]; + left |= left_ctx[i]; + } + above = (above & bs) > 0; + left = (left & bs) > 0; return (left * 2 + above) + bsl * PARTITION_PLOFFSET; } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index 212a28a..7baa9ee 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -13,13 +13,16 @@ #include <stdio.h> #include 
"./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "./vp9_rtcd.h" + +#include "vpx_scale/vpx_scale.h" #include "vpx_scale/yv12config.h" + +#include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_postproc.h" -#include "vp9/common/vp9_textblit.h" -#include "vpx_scale/vpx_scale.h" #include "vp9/common/vp9_systemdependent.h" -#include "./vp9_rtcd.h" -#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_textblit.h" #define RGB_TO_YUV(t) \ ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \ @@ -127,9 +130,6 @@ const short vp9_rv[] = { 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, }; - -/**************************************************************************** - */ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, @@ -371,7 +371,7 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, } } -double vp9_gaussian(double sigma, double mu, double x) { +static double gaussian(double sigma, double mu, double x) { return 1 / (sigma * sqrt(2.0 * 3.14159265)) * (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); } @@ -396,7 +396,7 @@ static void fillrd(struct postproc_state *state, int q, int a) { next = 0; for (i = -32; i < 32; i++) { - int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i)); + int a = (int)(0.5 + 256 * gaussian(sigma, 0, i)); if (a) { for (j = 0; j < a; j++) { @@ -425,27 +425,6 @@ static void fillrd(struct postproc_state *state, int q, int a) { state->last_noise = a; } -/**************************************************************************** - * - * ROUTINE : plane_add_noise_c - * - * INPUTS : unsigned char *Start starting address of buffer to - * add gaussian noise to - * unsigned int width width of plane - * unsigned int height height of plane - * int pitch distance between subsequent lines of frame - * int q quantizer used to determine amount of noise - * to add - * - * OUTPUTS : None. - * - * RETURNS : void. 
- * - * FUNCTION : adds gaussian noise to a plane of pixels - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ void vp9_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16], char whiteclamp[16], @@ -628,49 +607,40 @@ static void constrain_line(int x0, int *x1, int y0, int *y1, int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) { - int q = cm->lf.filter_level * 10 / 6; - int flags = ppflags->post_proc_flag; - int deblock_level = ppflags->deblocking_level; - int noise_level = ppflags->noise_level; + const int q = MIN(63, cm->lf.filter_level * 10 / 6); + const int flags = ppflags->post_proc_flag; + YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer; + struct postproc_state *const ppstate = &cm->postproc_state; if (!cm->frame_to_show) return -1; - if (q > 63) - q = 63; - if (!flags) { *dest = *cm->frame_to_show; return 0; } -#if ARCH_X86||ARCH_X86_64 - vpx_reset_mmx_state(); -#endif + vp9_clear_system_state(); if (flags & VP9D_DEMACROBLOCK) { - deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer, - q + (deblock_level - 5) * 10, 1, 0); + deblock_and_de_macro_block(cm->frame_to_show, ppbuf, + q + (ppflags->deblocking_level - 5) * 10, 1, 0); } else if (flags & VP9D_DEBLOCK) { - vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q); + vp9_deblock(cm->frame_to_show, ppbuf, q); } else { - vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer); + vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); } if (flags & VP9D_ADDNOISE) { - if (cm->postproc_state.last_q != q - || cm->postproc_state.last_noise != noise_level) { - fillrd(&cm->postproc_state, 63 - q, noise_level); + const int noise_level = ppflags->noise_level; + if (ppstate->last_q != q || + ppstate->last_noise != noise_level) { + fillrd(ppstate, 63 - q, noise_level); } - vp9_plane_add_noise(cm->post_proc_buffer.y_buffer, - cm->postproc_state.noise, - 
cm->postproc_state.blackclamp, - cm->postproc_state.whiteclamp, - cm->postproc_state.bothclamp, - cm->post_proc_buffer.y_width, - cm->post_proc_buffer.y_height, - cm->post_proc_buffer.y_stride); + vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp, + ppstate->whiteclamp, ppstate->bothclamp, + ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride); } #if 0 && CONFIG_POSTPROC_VISUALIZER @@ -684,16 +654,14 @@ int vp9_post_proc_frame(struct VP9Common *cm, cm->filter_level, flags, cm->mb_cols, cm->mb_rows); - vp9_blit_text(message, cm->post_proc_buffer.y_buffer, - cm->post_proc_buffer.y_stride); + vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride); } if (flags & VP9D_DEBUG_TXT_MBLK_MODES) { int i, j; uint8_t *y_ptr; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; + int mb_rows = ppbuf->y_height >> 4; + int mb_cols = ppbuf->y_width >> 4; int mb_index = 0; MODE_INFO *mi = cm->mi; @@ -719,9 +687,8 @@ int vp9_post_proc_frame(struct VP9Common *cm, if (flags & VP9D_DEBUG_TXT_DC_DIFF) { int i, j; uint8_t *y_ptr; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; + int mb_rows = ppbuf->y_height >> 4; + int mb_cols = ppbuf->y_width >> 4; int mb_index = 0; MODE_INFO *mi = cm->mi; @@ -733,7 +700,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, char zz[4]; int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED && mi[mb_index].mbmi.mode != SPLITMV && - mi[mb_index].mbmi.skip_coeff); + mi[mb_index].mbmi.skip); if (cm->frame_type == KEY_FRAME) snprintf(zz, sizeof(zz) - 1, "a"); @@ -755,17 +722,15 @@ int vp9_post_proc_frame(struct VP9Common *cm, snprintf(message, sizeof(message), "Bitrate: %10.2f framerate: %10.2f ", cm->bitrate, cm->framerate); - vp9_blit_text(message, cm->post_proc_buffer.y_buffer, - cm->post_proc_buffer.y_stride); + vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride); } /* Draw motion 
vectors */ if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) { - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - uint8_t *y_buffer = cm->post_proc_buffer.y_buffer; - int y_stride = cm->post_proc_buffer.y_stride; + int width = ppbuf->y_width; + int height = ppbuf->y_height; + uint8_t *y_buffer = ppbuf->y_buffer; + int y_stride = ppbuf->y_stride; MODE_INFO *mi = cm->mi; int x0, y0; @@ -904,13 +869,12 @@ int vp9_post_proc_frame(struct VP9Common *cm, if ((flags & VP9D_DEBUG_CLR_BLK_MODES) && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) { int y, x; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - uint8_t *y_ptr = cm->post_proc_buffer.y_buffer; - uint8_t *u_ptr = cm->post_proc_buffer.u_buffer; - uint8_t *v_ptr = cm->post_proc_buffer.v_buffer; - int y_stride = cm->post_proc_buffer.y_stride; + int width = ppbuf->y_width; + int height = ppbuf->y_height; + uint8_t *y_ptr = ppbuf->y_buffer; + uint8_t *u_ptr = ppbuf->u_buffer; + uint8_t *v_ptr = ppbuf->v_buffer; + int y_stride = ppbuf->y_stride; MODE_INFO *mi = cm->mi; for (y = 0; y < height; y += 16) { @@ -969,13 +933,12 @@ int vp9_post_proc_frame(struct VP9Common *cm, if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag) { int y, x; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - uint8_t *y_ptr = cm->post_proc_buffer.y_buffer; - uint8_t *u_ptr = cm->post_proc_buffer.u_buffer; - uint8_t *v_ptr = cm->post_proc_buffer.v_buffer; - int y_stride = cm->post_proc_buffer.y_stride; + int width = ppbuf->y_width; + int height = ppbuf->y_height; + uint8_t *y_ptr = ppbuf->y_buffer; + uint8_t *u_ptr = ppbuf->u_buffer; + uint8_t *v_ptr = ppbuf->v_buffer; + int y_stride = ppbuf->y_stride; MODE_INFO *mi = cm->mi; for (y = 0; y < height; y += 16) { @@ -1002,7 +965,7 @@ int 
vp9_post_proc_frame(struct VP9Common *cm, } #endif - *dest = cm->post_proc_buffer; + *dest = *ppbuf; /* handle problem with extending borders */ dest->y_width = cm->width; diff --git a/libvpx/vp9/common/vp9_postproc.h b/libvpx/vp9/common/vp9_postproc.h index c63beae..b07d5d0 100644 --- a/libvpx/vp9/common/vp9_postproc.h +++ b/libvpx/vp9/common/vp9_postproc.h @@ -13,6 +13,11 @@ #define VP9_COMMON_VP9_POSTPROC_H_ #include "vpx_ports/mem.h" +#include "vp9/common/vp9_ppflags.h" + +#ifdef __cplusplus +extern "C" { +#endif struct postproc_state { int last_q; @@ -23,8 +28,7 @@ struct postproc_state { DECLARE_ALIGNED(16, char, bothclamp[16]); }; -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/common/vp9_ppflags.h" +struct VP9Common; int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags); @@ -33,4 +37,8 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_POSTPROC_H_ diff --git a/libvpx/vp9/common/vp9_ppflags.h b/libvpx/vp9/common/vp9_ppflags.h index 561c930..8168935 100644 --- a/libvpx/vp9/common/vp9_ppflags.h +++ b/libvpx/vp9/common/vp9_ppflags.h @@ -11,6 +11,10 @@ #ifndef VP9_COMMON_VP9_PPFLAGS_H_ #define VP9_COMMON_VP9_PPFLAGS_H_ +#ifdef __cplusplus +extern "C" { +#endif + enum { VP9D_NOFILTERING = 0, VP9D_DEBLOCK = 1 << 0, @@ -35,4 +39,8 @@ typedef struct { int display_mv_flag; } vp9_ppflags_t; +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_PPFLAGS_H_ diff --git a/libvpx/vp9/common/vp9_pragmas.h b/libvpx/vp9/common/vp9_pragmas.h index f079161..0efc713 100644 --- a/libvpx/vp9/common/vp9_pragmas.h +++ b/libvpx/vp9/common/vp9_pragmas.h @@ -11,6 +11,10 @@ #ifndef VP9_COMMON_VP9_PRAGMAS_H_ #define VP9_COMMON_VP9_PRAGMAS_H_ +#ifdef __cplusplus +extern "C" { +#endif + #ifdef __INTEL_COMPILER #pragma warning(disable:997 1011 
170) #endif @@ -19,4 +23,8 @@ #pragma warning(disable:4799) #endif +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_PRAGMAS_H_ diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c index 6018e17..197bcb6 100644 --- a/libvpx/vp9/common/vp9_pred_common.c +++ b/libvpx/vp9/common/vp9_pred_common.c @@ -14,134 +14,110 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_treecoder.h" -static INLINE const MB_MODE_INFO *get_above_mbmi(const MODE_INFO *const above) { - return (above != NULL) ? &above->mbmi : NULL; -} - -static INLINE const MB_MODE_INFO *get_left_mbmi(const MODE_INFO *const left) { - return (left != NULL) ? &left->mbmi : NULL; +static INLINE const MB_MODE_INFO *get_mbmi(const MODE_INFO *const mi) { + return (mi != NULL) ? &mi->mbmi : NULL; } // Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; +int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - // left - const int left_mv_pred = left_in_image ? is_inter_block(&left_mi->mbmi) - : 0; - const int left_interp = left_in_image && left_mv_pred - ? left_mi->mbmi.interp_filter - : SWITCHABLE_FILTERS; - - // above - const int above_mv_pred = above_in_image ? is_inter_block(&above_mi->mbmi) - : 0; - const int above_interp = above_in_image && above_mv_pred - ? 
above_mi->mbmi.interp_filter - : SWITCHABLE_FILTERS; - - if (left_interp == above_interp) - return left_interp; - else if (left_interp == SWITCHABLE_FILTERS && - above_interp != SWITCHABLE_FILTERS) - return above_interp; - else if (left_interp != SWITCHABLE_FILTERS && - above_interp == SWITCHABLE_FILTERS) - return left_interp; + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int left_type = left_mbmi != NULL && is_inter_block(left_mbmi) ? + left_mbmi->interp_filter : SWITCHABLE_FILTERS; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const int above_type = above_mbmi != NULL && is_inter_block(above_mbmi) ? + above_mbmi->interp_filter : SWITCHABLE_FILTERS; + + if (left_type == above_type) + return left_type; + else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS) + return above_type; + else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS) + return left_type; else return SWITCHABLE_FILTERS; } -// Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; - // The mode info data structure has a one element border above and to the - // left of the entries corresponding to real macroblocks. - // The prediction flags in these dummy entries are initialized to 0. 
- // 0 - inter/inter, inter/--, --/inter, --/-- - // 1 - intra/inter, inter/intra - // 2 - intra/--, --/intra - // 3 - intra/intra - if (above_in_image && left_in_image) // both edges available +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real macroblocks. +// The prediction flags in these dummy entries are initialized to 0. +// 0 - inter/inter, inter/--, --/inter, --/-- +// 1 - intra/inter, inter/intra +// 2 - intra/--, --/intra +// 3 - intra/intra +int vp9_get_intra_inter_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; + + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); return left_intra && above_intra ? 3 : left_intra || above_intra; - else if (above_in_image || left_in_image) // one edge available - return 2 * (above_in_image ? above_intra : left_intra); - else + } else if (has_above || has_left) { // one edge available + return 2 * !is_inter_block(has_above ? 
above_mbmi : left_mbmi); + } else { return 0; + } } -// Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; + +int vp9_get_reference_mode_context(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int ctx; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. 
- if (above_in_image && left_in_image) { // both edges available + if (has_above && has_left) { // both edges available if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) // neither edge uses comp pred (0/1) - pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^ - (left_mbmi->ref_frame[0] == cm->comp_fixed_ref); + ctx = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^ + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref); else if (!has_second_ref(above_mbmi)) // one of two edges uses comp pred (2/3) - pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref || - !is_inter_block(above_mbmi)); + ctx = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref || + !is_inter_block(above_mbmi)); else if (!has_second_ref(left_mbmi)) // one of two edges uses comp pred (2/3) - pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref || - !is_inter_block(left_mbmi)); + ctx = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref || + !is_inter_block(left_mbmi)); else // both edges use comp pred (4) - pred_context = 4; - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + ctx = 4; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? 
above_mbmi : left_mbmi; if (!has_second_ref(edge_mbmi)) // edge does not use comp pred (0/1) - pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref; + ctx = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref; else // edge uses comp pred (3) - pred_context = 3; + ctx = 3; } else { // no edges available (1) - pred_context = 1; + ctx = 1; } - assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS); - return pred_context; + assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS); + return ctx; } // Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { +int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int above_in_image = above_mbmi != NULL; + const int left_in_image = left_mbmi != NULL; + // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. 
@@ -150,6 +126,9 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, const int var_ref_idx = !fix_ref_idx; if (above_in_image && left_in_image) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + if (above_intra && left_intra) { // intra/intra (2) pred_context = 2; } else if (above_intra || left_intra) { // intra/inter @@ -163,10 +142,10 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, } else { // inter/inter const int l_sg = !has_second_ref(left_mbmi); const int a_sg = !has_second_ref(above_mbmi); - MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0] - : above_mbmi->ref_frame[var_ref_idx]; - MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0] - : left_mbmi->ref_frame[var_ref_idx]; + const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0] + : above_mbmi->ref_frame[var_ref_idx]; + const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0] + : left_mbmi->ref_frame[var_ref_idx]; if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { pred_context = 0; @@ -179,8 +158,8 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, else pred_context = 1; } else if (l_sg || a_sg) { // single/comp - MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; - MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; + const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; + const MV_REFERENCE_FRAME rfs = a_sg ? 
vrfa : vrfl; if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) pred_context = 1; else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1]) @@ -212,21 +191,21 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, return pred_context; } -unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + +int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. 
- if (above_in_image && left_in_image) { // both edges available + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + if (above_intra && left_intra) { // intra/intra pred_context = 2; } else if (above_intra || left_intra) { // intra/inter or inter/intra @@ -237,30 +216,31 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME || edge_mbmi->ref_frame[1] == LAST_FRAME); } else { // inter/inter - if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) { - pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) + - 2 * (left_mbmi->ref_frame[0] == LAST_FRAME); - } else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) { - pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME || - above_mbmi->ref_frame[1] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[1] == LAST_FRAME); - } else { - const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ? 
- above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + const int above_has_second = has_second_ref(above_mbmi); + const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; + + if (above_has_second && left_has_second) { + pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME || + left0 == LAST_FRAME || left1 == LAST_FRAME); + } else if (above_has_second || left_has_second) { + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; if (rfs == LAST_FRAME) pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); else - pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME; + pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME); + } else { + pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME); } } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? 
above_mbmi : left_mbmi; if (!is_inter_block(edge_mbmi)) { // intra pred_context = 2; } else { // inter @@ -278,22 +258,21 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { return pred_context; } -unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { +int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. 
- if (above_in_image && left_in_image) { // both edges available + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + if (above_intra && left_intra) { // intra/intra pred_context = 2; } else if (above_intra || left_intra) { // intra/inter or inter/intra @@ -308,36 +287,25 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { edge_mbmi->ref_frame[1] == GOLDEN_FRAME); } } else { // inter/inter - if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) { - if (above_mbmi->ref_frame[0] == LAST_FRAME && - left_mbmi->ref_frame[0] == LAST_FRAME) { - pred_context = 3; - } else if (above_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME) { - const MB_MODE_INFO *edge_mbmi = - above_mbmi->ref_frame[0] == LAST_FRAME ? left_mbmi : above_mbmi; - - pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); - } else { - pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) + - 2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME); - } - } else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) { - if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] && - above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1]) - pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME || - above_mbmi->ref_frame[1] == GOLDEN_FRAME || - left_mbmi->ref_frame[0] == GOLDEN_FRAME || - left_mbmi->ref_frame[1] == GOLDEN_FRAME); + const int above_has_second = has_second_ref(above_mbmi); + const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; + + if (above_has_second && left_has_second) { + if (above0 == left0 && above1 == left1) + pred_context = 3 * (above0 == GOLDEN_FRAME || + above1 
== GOLDEN_FRAME || + left0 == GOLDEN_FRAME || + left1 == GOLDEN_FRAME); else pred_context = 2; - } else { - const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + } else if (above_has_second || left_has_second) { + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; if (rfs == GOLDEN_FRAME) pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); @@ -345,10 +313,21 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; else pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + } else { + if (above0 == LAST_FRAME && left0 == LAST_FRAME) { + pred_context = 3; + } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) { + const MV_REFERENCE_FRAME edge0 = (above0 == LAST_FRAME) ? left0 + : above0; + pred_context = 4 * (edge0 == GOLDEN_FRAME); + } else { + pred_context = 2 * (above0 == GOLDEN_FRAME) + + 2 * (left0 == GOLDEN_FRAME); + } } } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? 
above_mbmi : left_mbmi; if (!is_inter_block(edge_mbmi) || (edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi))) @@ -368,36 +347,23 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { // The mode info data structure has a one element border above and to the // left of the entries corresponding to real blocks. // The prediction flags in these dummy entries are initialized to 0. -unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; +int vp9_get_tx_size_context(const MACROBLOCKD *xd) { const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type]; - int above_context = max_tx_size; - int left_context = max_tx_size; - - if (above_in_image) - above_context = above_mbmi->skip_coeff ? max_tx_size - : above_mbmi->tx_size; - - if (left_in_image) - left_context = left_mbmi->skip_coeff ? max_tx_size - : left_mbmi->tx_size; - - if (!left_in_image) - left_context = above_context; - - if (!above_in_image) - above_context = left_context; - - return above_context + left_context > max_tx_size; -} - -void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) { - xd->mi_8x8[0]->mbmi.seg_id_predicted = pred_flag; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; + int above_ctx = (has_above && !above_mbmi->skip) ? above_mbmi->tx_size + : max_tx_size; + int left_ctx = (has_left && !left_mbmi->skip) ? 
left_mbmi->tx_size + : max_tx_size; + if (!has_left) + left_ctx = above_ctx; + + if (!has_above) + above_ctx = left_ctx; + + return (above_ctx + left_ctx) > max_tx_size; } int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h index 9190930..6c7a0d3 100644 --- a/libvpx/vp9/common/vp9_pred_common.h +++ b/libvpx/vp9/common/vp9_pred_common.h @@ -14,6 +14,10 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) { return xd->up_available ? xd->mi_8x8[-xd->mode_info_stride] : NULL; } @@ -35,55 +39,42 @@ static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { return above_sip + left_sip; } -static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg, +static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg, const MACROBLOCKD *xd) { return seg->pred_probs[vp9_get_pred_context_seg_id(xd)]; } -void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag); - -static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) { +static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) { const MODE_INFO *const above_mi = get_above_mi(xd); const MODE_INFO *const left_mi = get_left_mi(xd); - const int above_skip_coeff = (above_mi != NULL) ? - above_mi->mbmi.skip_coeff : 0; - const int left_skip_coeff = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0; - - return above_skip_coeff + left_skip_coeff; + const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0; + const int left_skip = (left_mi != NULL) ? 
left_mi->mbmi.skip : 0; + return above_skip + left_skip; } -static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - return cm->fc.mbskip_probs[vp9_get_pred_context_mbskip(xd)]; +static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc.skip_probs[vp9_get_skip_context(xd)]; } -static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) { - return xd->mi_8x8[0]->mbmi.skip_coeff; -} +int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); -unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); +int vp9_get_intra_inter_context(const MACROBLOCKD *xd); -unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd); - -static INLINE vp9_prob vp9_get_pred_prob_intra_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_intra_inter(xd); - return cm->fc.intra_inter_prob[pred_context]; +static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc.intra_inter_prob[vp9_get_intra_inter_context(xd)]; } -unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd); +int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd); - -static INLINE -vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd); - return cm->fc.comp_inter_prob[pred_context]; +static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc.comp_inter_prob[vp9_get_reference_mode_context(cm, xd)]; } -unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, - const MACROBLOCKD *xd); +int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, 
const MACROBLOCKD *xd) { @@ -91,26 +82,24 @@ static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, return cm->fc.comp_ref_prob[pred_context]; } -unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); +int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_single_ref_p1(xd); - return cm->fc.single_ref_prob[pred_context][0]; + return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0]; } -unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); +int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_single_ref_p2(xd); - return cm->fc.single_ref_prob[pred_context][1]; + return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; } -unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd); +int vp9_get_tx_size_context(const MACROBLOCKD *xd); -static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, - const struct tx_probs *tx_probs) { +static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, + const struct tx_probs *tx_probs) { switch (max_tx_size) { case TX_8X8: return tx_probs->p8x8[ctx]; @@ -119,19 +108,19 @@ static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, case TX_32X32: return tx_probs->p32x32[ctx]; default: - assert(!"Invalid max_tx_size."); + assert(0 && "Invalid max_tx_size."); return NULL; } } -static const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd, - const struct tx_probs *tx_probs) { - const int ctx = vp9_get_pred_context_tx_size(xd); - return get_tx_probs(max_tx_size, ctx, tx_probs); +static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, + const MACROBLOCKD *xd, + const struct 
tx_probs *tx_probs) { + return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs); } -static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, - struct tx_counts *tx_counts) { +static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, + struct tx_counts *tx_counts) { switch (max_tx_size) { case TX_8X8: return tx_counts->p8x8[ctx]; @@ -140,9 +129,13 @@ static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, case TX_32X32: return tx_counts->p32x32[ctx]; default: - assert(!"Invalid max_tx_size."); + assert(0 && "Invalid max_tx_size."); return NULL; } } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_prob.c b/libvpx/vp9/common/vp9_prob.c new file mode 100644 index 0000000..a1befc6 --- /dev/null +++ b/libvpx/vp9/common/vp9_prob.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/common/vp9_prob.h" + +const uint8_t vp9_norm[256] = { + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + + +static unsigned int tree_merge_probs_impl(unsigned int i, + const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, + unsigned int count_sat, + unsigned int max_update, + vp9_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = (l <= 0) + ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, + count_sat, max_update, probs); + const int r = tree[i + 1]; + const unsigned int right_count = (r <= 0) + ? 
counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, + count_sat, max_update, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, + count_sat, max_update); + return left_count + right_count; +} + +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat, + max_update_factor, probs); +} diff --git a/libvpx/vp9/common/vp9_prob.h b/libvpx/vp9/common/vp9_prob.h new file mode 100644 index 0000000..f361480 --- /dev/null +++ b/libvpx/vp9/common/vp9_prob.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_PROB_H_ +#define VP9_COMMON_VP9_PROB_H_ + +#include "./vpx_config.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint8_t vp9_prob; + +#define MAX_PROB 255 + +#define vp9_prob_half ((vp9_prob) 128) + +typedef int8_t vp9_tree_index; + +#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2) + +#define vp9_complement(x) (255 - x) + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of vp9_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. 
*/ + +typedef const vp9_tree_index vp9_tree[]; + +static INLINE vp9_prob clip_prob(int p) { + return (p > 255) ? 255u : (p < 1) ? 1u : p; +} + +// int64 is not needed for normal frame level calculations. +// However when outputting entropy stats accumulated over many frames +// or even clips we can overflow int math. +#ifdef ENTROPY_STATS +static INLINE vp9_prob get_prob(int num, int den) { + return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); +} +#else +static INLINE vp9_prob get_prob(int num, int den) { + return (den == 0) ? 128u : clip_prob((num * 256 + (den >> 1)) / den); +} +#endif + +static INLINE vp9_prob get_binary_prob(int n0, int n1) { + return get_prob(n0, n0 + n1); +} + +/* This function assumes prob1 and prob2 are already within [1,255] range. */ +static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { + return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); +} + +static INLINE vp9_prob merge_probs(vp9_prob pre_prob, + const unsigned int ct[2], + unsigned int count_sat, + unsigned int max_update_factor) { + const vp9_prob prob = get_binary_prob(ct[0], ct[1]); + const unsigned int count = MIN(ct[0] + ct[1], count_sat); + const unsigned int factor = max_update_factor * count / count_sat; + return weighted_prob(pre_prob, prob, factor); +} + +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs); + + +DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_PROB_H_ diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c index 6dbdb42..def1255 100644 --- a/libvpx/vp9/common/vp9_quant_common.c +++ b/libvpx/vp9/common/vp9_quant_common.c @@ -130,12 +130,13 @@ int16_t vp9_ac_quant(int qindex, int delta) { } -int vp9_get_qindex(struct segmentation *seg, int 
segment_id, int base_qindex) { +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex) { if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q); - return seg->abs_delta == SEGMENT_ABSDATA ? - data : // Abs value - clamp(base_qindex + data, 0, MAXQ); // Delta value + const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ? + data : base_qindex + data; + return clamp(seg_qindex, 0, MAXQ); } else { return base_qindex; } diff --git a/libvpx/vp9/common/vp9_quant_common.h b/libvpx/vp9/common/vp9_quant_common.h index 83f2fb6..5811040 100644 --- a/libvpx/vp9/common/vp9_quant_common.h +++ b/libvpx/vp9/common/vp9_quant_common.h @@ -13,6 +13,10 @@ #include "vp9/common/vp9_blockd.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MINQ 0 #define MAXQ 255 #define QINDEX_RANGE (MAXQ - MINQ + 1) @@ -23,6 +27,11 @@ void vp9_init_quant_tables(); int16_t vp9_dc_quant(int qindex, int delta); int16_t vp9_ac_quant(int qindex, int delta); -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex); +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_QUANT_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c index 7cc66c8..005f370 100644 --- a/libvpx/vp9/common/vp9_reconinter.c +++ b/libvpx/vp9/common/vp9_reconinter.c @@ -20,59 +20,81 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" -void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATION_TYPE mcomp_filter_type, - VP9_COMMON *cm) { - if (xd->mi_8x8 && xd->mi_8x8[0]) { - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - - set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME, - mbmi->ref_frame[1] - LAST_FRAME, - cm->active_ref_scale); - } else { - set_scale_factors(xd, -1, -1, cm->active_ref_scale); - } +static void 
build_mc_border(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int x, int y, int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint8_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? -x : 0; + + if (left > b_w) + left = b_w; + + if (x + b_w > w) + right = x + b_w - w; + + if (right > b_w) + right = b_w; + + copy = b_w - left - right; + + if (left) + memset(dst, ref_row[0], left); + + if (copy) + memcpy(dst + left, ref_row + x + left, copy); - xd->subpix.filter_x = xd->subpix.filter_y = - vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ? - EIGHTTAP : mcomp_filter_type); + if (right) + memset(dst + left + copy, ref_row[w - 1], right); - assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); + dst += dst_stride; + ++y; + + if (y > 0 && y < h) + ref_row += src_stride; + } while (--b_h); } static void inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const MV32 *mv, - const struct scale_factors *scale, + const int subpel_x, + const int subpel_y, + const struct scale_factors *sf, int w, int h, int ref, - const struct subpix_fn_table *subpix, + const InterpKernel *kernel, int xs, int ys) { - const int subpel_x = mv->col & SUBPEL_MASK; - const int subpel_y = mv->row & SUBPEL_MASK; - - src += (mv->row >> SUBPEL_BITS) * src_stride + (mv->col >> SUBPEL_BITS); - scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref]( + sf->predict[subpel_x != 0][subpel_y != 0][ref]( src, src_stride, dst, dst_stride, - subpix->filter_x[subpel_x], xs, - subpix->filter_y[subpel_y], ys, - w, h); + kernel[subpel_x], xs, kernel[subpel_y], ys, w, h); } void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *src_mv, - const struct scale_factors *scale, + const struct scale_factors *sf, int w, int h, 
int ref, - const struct subpix_fn_table *subpix, - enum mv_precision precision) { + const InterpKernel *kernel, + enum mv_precision precision, + int x, int y) { const int is_q4 = precision == MV_PRECISION_Q4; const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, is_q4 ? src_mv->col : src_mv->col * 2 }; - const struct scale_factors_common *sfc = scale->sfc; - const MV32 mv = sfc->scale_mv(&mv_q4, scale); + MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf); + const int subpel_x = mv.col & SUBPEL_MASK; + const int subpel_y = mv.row & SUBPEL_MASK; + + src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); - inter_predictor(src, src_stride, dst, dst_stride, &mv, scale, - w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4); + inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, + sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4); } static INLINE int round_mv_comp_q4(int value) { @@ -117,30 +139,17 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, return clamped_mv; } - -// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could -// calculate the subsampled BLOCK_SIZE, but that type isn't defined for -// sizes smaller than 16x16 yet. 
static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, - BLOCK_SIZE bsize, int pred_w, int pred_h, + int bw, int bh, + int x, int y, int w, int h, int mi_x, int mi_y) { struct macroblockd_plane *const pd = &xd->plane[plane]; - const int bwl = b_width_log2(bsize) - pd->subsampling_x; - const int bw = 4 << bwl; - const int bh = plane_block_height(bsize, pd); - const int x = 4 * (block & ((1 << bwl) - 1)); - const int y = 4 * (block >> bwl); const MODE_INFO *mi = xd->mi_8x8[0]; const int is_compound = has_second_ref(&mi->mbmi); int ref; - assert(x < bw); - assert(y < bh); - assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw); - assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh); - for (ref = 0; ref < 1 + is_compound; ++ref) { - struct scale_factors *const scale = &xd->scale_factor[ref]; + const struct scale_factors *const sf = &xd->block_refs[ref]->sf; struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; @@ -165,25 +174,27 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, uint8_t *pre; MV32 scaled_mv; - int xs, ys; - - if (vp9_is_scaled(scale->sfc)) { - pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale); - scale->sfc->set_scaled_offsets(scale, mi_y + y, mi_x + x); - scaled_mv = scale->sfc->scale_mv(&mv_q4, scale); - xs = scale->sfc->x_step_q4; - ys = scale->sfc->y_step_q4; + int xs, ys, subpel_x, subpel_y; + + if (vp9_is_scaled(sf)) { + pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf); + scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; } else { pre = pre_buf->buf + (y * pre_buf->stride + x); scaled_mv.row = mv_q4.row; scaled_mv.col = mv_q4.col; xs = ys = 16; } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride + + (scaled_mv.col >> 
SUBPEL_BITS); inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, - &scaled_mv, scale, - 4 << pred_w, 4 << pred_h, ref, - &xd->subpix, xs, ys); + subpel_x, subpel_y, sf, w, h, ref, xd->interp_kernel, + xs, ys); } } @@ -191,20 +202,26 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize, int mi_row, int mi_col, int plane_from, int plane_to) { int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; for (plane = plane_from; plane <= plane_to; ++plane) { - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, + &xd->plane[plane]); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) { int i = 0, x, y; assert(bsize == BLOCK_8X8); - for (y = 0; y < 1 << bhl; ++y) - for (x = 0; x < 1 << bwl; ++x) - build_inter_predictors(xd, plane, i++, bsize, 0, 0, mi_x, mi_y); + for (y = 0; y < num_4x4_h; ++y) + for (x = 0; x < num_4x4_w; ++x) + build_inter_predictors(xd, plane, i++, bw, bh, + 4 * x, 4 * y, 4, 4, mi_x, mi_y); } else { - build_inter_predictors(xd, plane, 0, bsize, bwl, bhl, mi_x, mi_y); + build_inter_predictors(xd, plane, 0, bw, bh, + 0, 0, bw, bh, mi_x, mi_y); } } } @@ -224,22 +241,206 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, MAX_MB_PLANE - 1); } -// TODO(dkovalev: find better place for this function) -void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { - const int ref = cm->active_ref_idx[i]; - struct scale_factors *const sf = &cm->active_ref_scale[i]; - struct scale_factors_common *const sfc = &cm->active_ref_scale_comm[i]; - if (ref >= NUM_YV12_BUFFERS) { - 
vp9_zero(*sf); - vp9_zero(*sfc); - } else { - YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref]; - vp9_setup_scale_factors_for_frame(sf, sfc, - fb->y_crop_width, fb->y_crop_height, - cm->width, cm->height); - - if (vp9_is_scaled(sfc)) - vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y); +// TODO(jingning): This function serves as a placeholder for decoder prediction +// using on demand border extension. It should be moved to /decoder/ directory. +static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, + int bw, int bh, + int x, int y, int w, int h, + int mi_x, int mi_y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const MODE_INFO *mi = xd->mi_8x8[0]; + const int is_compound = has_second_ref(&mi->mbmi); + int ref; + + for (ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = &pd->pre[ref]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + + // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the + // same MV (the average of the 4 luma MVs) but we could do something + // smarter for non-4:2:0. Just punt for now, pending the changes to get + // rid of SPLITMV mode entirely. + const MV mv = mi->mbmi.sb_type < BLOCK_8X8 + ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv + : mi_mv_pred_q4(mi, ref)) + : mi->mbmi.mv[ref].as_mv; + + // TODO(jkoleszar): This clamping is done in the incorrect place for the + // scaling case. It needs to be done on the scaled MV, not the pre-scaling + // MV. Note however that it performs the subsampling aware scaling so + // that the result is always q4. + // mv_precision precision is MV_PRECISION_Q4. 
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + + MV32 scaled_mv; + int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride, + subpel_x, subpel_y; + uint8_t *ref_frame, *buf_ptr; + const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + + // Get reference frame pointer, width and height. + if (plane == 0) { + frame_width = ref_buf->y_crop_width; + frame_height = ref_buf->y_crop_height; + ref_frame = ref_buf->y_buffer; + } else { + frame_width = ref_buf->uv_crop_width; + frame_height = ref_buf->uv_crop_height; + ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer; + } + + if (vp9_is_scaled(sf)) { + // Co-ordinate of containing block to pixel precision. + int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = (x_start + x) << SUBPEL_BITS; + y0_16 = (y_start + y) << SUBPEL_BITS; + + // Co-ordinate of current block in reference frame + // to 1/16th pixel precision. + x0_16 = sf->scale_value_x(x0_16, sf); + y0_16 = sf->scale_value_y(y0_16, sf); + + // Map the top left corner of the block into the reference frame. + // NOTE: This must be done in this way instead of + // sf->scale_value_x(x_start + x, sf). + x0 = sf->scale_value_x(x_start, sf) + sf->scale_value_x(x, sf); + y0 = sf->scale_value_y(y_start, sf) + sf->scale_value_y(y, sf); + + // Scale the MV and incorporate the sub-pixel offset of the block + // in the reference frame. + scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; + } else { + // Co-ordinate of containing block to pixel precision. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // Co-ordinate of the block to 1/16th pixel precision. 
+ x0_16 = x0 << SUBPEL_BITS; + y0_16 = y0 << SUBPEL_BITS; + + scaled_mv.row = mv_q4.row; + scaled_mv.col = mv_q4.col; + xs = ys = 16; + } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + + // Calculate the top left corner of the best matching block in the reference frame. + x0 += scaled_mv.col >> SUBPEL_BITS; + y0 += scaled_mv.row >> SUBPEL_BITS; + x0_16 += scaled_mv.col; + y0_16 += scaled_mv.row; + + // Get reference block pointer. + buf_ptr = ref_frame + y0 * pre_buf->stride + x0; + buf_stride = pre_buf->stride; + + // Do border extension if there is motion or the + // width/height is not a multiple of 8 pixels. + if (scaled_mv.col || scaled_mv.row || + (frame_width & 0x7) || (frame_height & 0x7)) { + // Get reference block bottom right coordinate. + int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; + int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; + int x_pad = 0, y_pad = 0; + + if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) { + x0 -= VP9_INTERP_EXTEND - 1; + x1 += VP9_INTERP_EXTEND; + x_pad = 1; + } + + if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) { + y0 -= VP9_INTERP_EXTEND - 1; + y1 += VP9_INTERP_EXTEND; + y_pad = 1; + } + + // Skip border extension if block is inside the frame. + if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width || + y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { + uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0; + // Extend the border. 
+ build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0 + 1, + x0, y0, x1 - x0 + 1, y1 - y0 + 1, frame_width, + frame_height); + buf_stride = x1 - x0 + 1; + buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3; + } + } + + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, xd->interp_kernel, xs, ys); + } +} + +void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, + &xd->plane[plane]); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + + if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) { + int i = 0, x, y; + assert(bsize == BLOCK_8X8); + for (y = 0; y < num_4x4_h; ++y) + for (x = 0; x < num_4x4_w; ++x) + dec_build_inter_predictors(xd, plane, i++, bw, bh, + 4 * x, 4 * y, 4, 4, mi_x, mi_y); + } else { + dec_build_inter_predictors(xd, plane, 0, bw, bh, + 0, 0, bw, bh, mi_x, mi_y); + } } } +void vp9_setup_dst_planes(MACROBLOCKD *xd, + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col) { + uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL, + pd->subsampling_x, pd->subsampling_y); + } +} + +void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col, + const struct scale_factors *sf) { + if (src != NULL) { + int i; + uint8_t *const buffers[4] = {src->y_buffer, 
src->u_buffer, src->v_buffer, + src->alpha_buffer}; + const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col, + sf, pd->subsampling_x, pd->subsampling_y); + } + } +} diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index 2c8a6e4..86f3158 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -14,7 +14,10 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_onyxc_int.h" -struct subpix_fn_table; +#ifdef __cplusplus +extern "C" { +#endif + void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); @@ -24,80 +27,45 @@ void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col, void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); -void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATION_TYPE filter, - VP9_COMMON *cm); +void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *mv_q3, - const struct scale_factors *scale, + const struct scale_factors *sf, int w, int h, int do_avg, - const struct subpix_fn_table *subpix, - enum mv_precision precision); - -static int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *scale) { - const int x = scale ? scale->sfc->scale_value_x(x_offset, scale->sfc) : - x_offset; - const int y = scale ? scale->sfc->scale_value_y(y_offset, scale->sfc) : - y_offset; + const InterpKernel *kernel, + enum mv_precision precision, + int x, int y); + +static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, + const struct scale_factors *sf) { + const int x = sf ? 
sf->scale_value_x(x_offset, sf) : x_offset; + const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset; return y * stride + x; } -static void setup_pred_plane(struct buf_2d *dst, - uint8_t *src, int stride, - int mi_row, int mi_col, - const struct scale_factors *scale, - int subsampling_x, int subsampling_y) { +static INLINE void setup_pred_plane(struct buf_2d *dst, + uint8_t *src, int stride, + int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { const int x = (MI_SIZE * mi_col) >> subsampling_x; const int y = (MI_SIZE * mi_row) >> subsampling_y; dst->buf = src + scaled_buffer_offset(x, y, stride, scale); dst->stride = stride; } -// TODO(jkoleszar): audit all uses of this that don't set mb_row, mb_col -static void setup_dst_planes(MACROBLOCKD *xd, - const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col) { - uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - int i; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - struct macroblockd_plane *pd = &xd->plane[i]; - setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL, - pd->subsampling_x, pd->subsampling_y); - } -} - -static void setup_pre_planes(MACROBLOCKD *xd, int i, - const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col, - const struct scale_factors *sf) { - if (src) { - int j; - uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *pd = &xd->plane[j]; - setup_pred_plane(&pd->pre[i], buffers[j], strides[j], - mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y); - } - } -} +void vp9_setup_dst_planes(MACROBLOCKD *xd, const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col); -static void set_scale_factors(MACROBLOCKD *xd, 
int ref0, int ref1, - struct scale_factors sf[MAX_REF_FRAMES]) { - xd->scale_factor[0] = sf[ref0 >= 0 ? ref0 : 0]; - xd->scale_factor[1] = sf[ref1 >= 0 ? ref1 : 0]; -} +void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf); -void vp9_setup_scale_factors(VP9_COMMON *cm, int i); +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_RECONINTER_H_ diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c index eb643b0..915c1c1 100644 --- a/libvpx/vp9/common/vp9_reconintra.c +++ b/libvpx/vp9/common/vp9_reconintra.c @@ -18,21 +18,17 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_onyxc_int.h" -const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = { - DCT_DCT, // DC - ADST_DCT, // V - DCT_ADST, // H - DCT_DCT, // D45 - ADST_ADST, // D135 - ADST_DCT, // D117 - DCT_ADST, // D153 - DCT_ADST, // D207 - ADST_DCT, // D63 - ADST_ADST, // TM - DCT_DCT, // NEARESTMV - DCT_DCT, // NEARMV - DCT_DCT, // ZEROMV - DCT_DCT // NEWMV +const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = { + DCT_DCT, // DC + ADST_DCT, // V + DCT_ADST, // H + DCT_DCT, // D45 + ADST_ADST, // D135 + ADST_DCT, // D117 + DCT_ADST, // D153 + DCT_ADST, // D207 + ADST_DCT, // D63 + ADST_ADST, // TM }; #define intra_pred_sized(type, size) \ @@ -313,17 +309,21 @@ static void init_intra_pred_fn_ptrs(void) { #undef intra_pred_allsizes } -static void build_intra_predictors(const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride, +static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, MB_PREDICTION_MODE mode, TX_SIZE tx_size, int up_available, int left_available, - int right_available) { + int right_available, int x, int y, + int plane) { int i; DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64); DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16); uint8_t *above_row = above_data + 16; const 
uint8_t *const_above_row = above_row; const int bs = 4 << tx_size; + int frame_width, frame_height; + int x0, y0; + const struct macroblockd_plane *const pd = &xd->plane[plane]; // 127 127 127 .. 127 127 127 127 127 127 // 129 A B .. Y Z @@ -334,26 +334,90 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride, once(init_intra_pred_fn_ptrs); + // Get current frame pointer, width and height. + if (plane == 0) { + frame_width = xd->cur_buf->y_width; + frame_height = xd->cur_buf->y_height; + } else { + frame_width = xd->cur_buf->uv_width; + frame_height = xd->cur_buf->uv_height; + } + + // Get block position in current frame. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + // left if (left_available) { - for (i = 0; i < bs; i++) - left_col[i] = ref[i * ref_stride - 1]; + if (xd->mb_to_bottom_edge < 0) { + /* slower path if the block needs border extension */ + if (y0 + bs <= frame_height) { + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } else { + const int extend_bottom = frame_height - y0; + for (i = 0; i < extend_bottom; ++i) + left_col[i] = ref[i * ref_stride - 1]; + for (; i < bs; ++i) + left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1]; + } + } else { + /* faster path if the block does not need extension */ + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } } else { vpx_memset(left_col, 129, bs); } + // TODO(hkuang) do not extend 2*bs pixels for all modes. 
// above if (up_available) { const uint8_t *above_ref = ref - ref_stride; - if (bs == 4 && right_available && left_available) { - const_above_row = above_ref; - } else { - vpx_memcpy(above_row, above_ref, bs); - if (bs == 4 && right_available) - vpx_memcpy(above_row + bs, above_ref + bs, bs); - else - vpx_memset(above_row + bs, above_row[bs - 1], bs); + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + 2 * bs <= frame_width) { + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, 2 * bs); + } else { + vpx_memcpy(above_row, above_ref, bs); + vpx_memset(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 + bs <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, r); + vpx_memset(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } else { + vpx_memcpy(above_row, above_ref, bs); + vpx_memset(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, r); + vpx_memset(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } else { + vpx_memcpy(above_row, above_ref, r); + vpx_memset(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } + } above_row[-1] = left_available ? above_ref[-1] : 129; + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + vpx_memcpy(above_row, above_ref, bs); + if (bs == 4 && right_available) + vpx_memcpy(above_row + bs, above_ref + bs, bs); + else + vpx_memset(above_row + bs, above_row[bs - 1], bs); + above_row[-1] = left_available ? 
above_ref[-1] : 129; + } } } else { vpx_memset(above_row, 127, bs * 2); @@ -370,16 +434,19 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride, } void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, - TX_SIZE tx_size, int mode, - const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride) { + TX_SIZE tx_size, MB_PREDICTION_MODE mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, + int aoff, int loff, int plane) { const int bwl = bwl_in - tx_size; const int wmask = (1 << bwl) - 1; const int have_top = (block_idx >> bwl) || xd->up_available; const int have_left = (block_idx & wmask) || xd->left_available; const int have_right = ((block_idx & wmask) != wmask); + const int x = aoff * 4; + const int y = loff * 4; assert(bwl >= 0); - build_intra_predictors(ref, ref_stride, dst, dst_stride, mode, tx_size, - have_top, have_left, have_right); + build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size, + have_top, have_left, have_right, x, y, plane); } diff --git a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h index 6e3f55c..abc1767 100644 --- a/libvpx/vp9/common/vp9_reconintra.h +++ b/libvpx/vp9/common/vp9_reconintra.h @@ -14,8 +14,17 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, - TX_SIZE tx_size, int mode, + TX_SIZE tx_size, MB_PREDICTION_MODE mode, const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride); + uint8_t *dst, int dst_stride, + int aoff, int loff, int plane); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl new file mode 100644 index 0000000..e4cd9d4 --- /dev/null +++ b/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -0,0 +1,778 @@ +sub vp9_common_forward_decls() { +print <<EOF 
+/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_enums.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct vp9_variance_vtable; + +#define DEC_MVCOSTS int *mvjcost, int *mvcost[2] +struct mv; +union int_mv; +struct yv12_buffer_config; +EOF +} +forward_decls qw/vp9_common_forward_decls/; + +# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. +if (vpx_config("CONFIG_USE_X86INC") eq "yes") { + $mmx_x86inc = 'mmx'; + $sse_x86inc = 'sse'; + $sse2_x86inc = 'sse2'; + $ssse3_x86inc = 'ssse3'; + $avx_x86inc = 'avx'; + $avx2_x86inc = 'avx2'; +} else { + $mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc = + $avx_x86inc = $avx2_x86inc = ''; +} + +# this variable is for functions that are 64 bit only. +if ($opts{arch} eq "x86_64") { + $mmx_x86_64 = 'mmx'; + $sse2_x86_64 = 'sse2'; + $ssse3_x86_64 = 'ssse3'; + $avx_x86_64 = 'avx'; + $avx2_x86_64 = 'avx2'; +} else { + $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = + $avx_x86_64 = $avx2_x86_64 = ''; +} + +# +# RECON +# +add_proto qw/void vp9_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc"; + +add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d117_predictor_4x4/; + +add_proto qw/void vp9_d135_predictor_4x4/, 
"uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d135_predictor_4x4/; + +add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc"; + +add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc"; + +add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc"; + +add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc"; + +add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_top_predictor_4x4/; + +add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_left_predictor_4x4/; + +add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_128_predictor_4x4/; + +add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d45_predictor_8x8/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc"; + +add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, 
const uint8_t *left"; +specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d117_predictor_8x8/; + +add_proto qw/void vp9_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d135_predictor_8x8/; + +add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc"; + +add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc"; + +add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc"; + +add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc"; + +add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_top_predictor_8x8/; + +add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_left_predictor_8x8/; + +add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_128_predictor_8x8/; + +add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize 
qw/vp9_d45_predictor_16x16/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc"; + +add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d117_predictor_16x16/; + +add_proto qw/void vp9_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d135_predictor_16x16/; + +add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc"; + +add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc"; + +add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc"; + +add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc"; + +add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_top_predictor_16x16/; + +add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_left_predictor_16x16/; + +add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize 
qw/vp9_dc_128_predictor_16x16/; + +add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d45_predictor_32x32/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc"; + +add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc"; + +add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d117_predictor_32x32/; + +add_proto qw/void vp9_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d135_predictor_32x32/; + +add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_d153_predictor_32x32/; + +add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc"; + +add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64"; + +add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc"; + +add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize 
qw/vp9_dc_top_predictor_32x32/; + +add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_left_predictor_32x32/; + +add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vp9_dc_128_predictor_32x32/; + +# +# Loopfilter +# +add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/vp9_lpf_vertical_16 sse2 neon dspr2/; + +add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/vp9_lpf_vertical_16_dual sse2 neon dspr2/; + +add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +specialize qw/vp9_lpf_vertical_8 sse2 neon dspr2/; + +add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/vp9_lpf_vertical_8_dual sse2 neon dspr2/; + +add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/; + +add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/; + +add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon dspr2/; + +add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int 
pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +specialize qw/vp9_lpf_horizontal_8 sse2 neon dspr2/; + +add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/vp9_lpf_horizontal_8_dual sse2 neon dspr2/; + +add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; +specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/; + +add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/; + +# +# post proc +# +if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { +add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit"; +specialize qw/vp9_mbpost_proc_down mmx sse2/; +$vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm; + +add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit"; +specialize qw/vp9_mbpost_proc_across_ip sse2/; +$vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm; + +add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"; +specialize qw/vp9_post_proc_down_and_across mmx sse2/; +$vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm; + +add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; +specialize qw/vp9_plane_add_noise mmx sse2/; +$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt; +} + +add_proto 
qw/void vp9_blend_mb_inner/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"; +specialize qw/vp9_blend_mb_inner/; + +add_proto qw/void vp9_blend_mb_outer/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"; +specialize qw/vp9_blend_mb_outer/; + +add_proto qw/void vp9_blend_b/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"; +specialize qw/vp9_blend_b/; + +# +# Sub Pixel Filters +# +add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve_copy neon dspr2/, "$sse2_x86inc"; + +add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve_avg neon dspr2/, "$sse2_x86inc"; + +add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8 sse2 ssse3 avx2 neon dspr2/; + +add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2/; + +add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2/; + +add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const 
int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/; + +add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2/; + +add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2/; + +# +# dct +# +add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct4x4_1_add sse2 neon dspr2/; + +add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/; + +add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/; + +add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/; + +add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/; + +add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/; + +add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct16x16_256_add sse2 neon dspr2/; + +add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct16x16_10_add sse2 neon dspr2/; + +add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t 
*input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2/; + +add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct32x32_34_add sse2 neon dspr2/; +$vp9_idct32x32_34_add_neon=vp9_idct32x32_1024_add_neon; + +add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/; + +add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"; +specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/; + +add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"; +specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/; + +add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type"; +specialize qw/vp9_iht16x16_256_add sse2 dspr2/; + +# dct and add + +add_proto qw/void vp9_iwht4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_iwht4x4_1_add/; + +add_proto qw/void vp9_iwht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride"; +specialize qw/vp9_iwht4x4_16_add/; + +# +# Encoder functions below this point. 
+# +if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { + + +# variance +add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance32x16/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance16x32/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance64x32/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance32x64/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance32x32/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance64x64/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc"; + 
+add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc"; + +add_proto qw/void vp9_get_sse_sum_8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; +specialize qw/vp9_get_sse_sum_8x8 sse2/; +$vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2; + +add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance8x4/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance4x8/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t 
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; + +# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form +add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance4x8/, "$sse_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int 
vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; +#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt + +add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; +specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; + +add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad64x64/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad32x64/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad64x32/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad32x16/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad16x32/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad32x32/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad16x16 mmx/, "$sse2_x86inc"; + +add_proto 
qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad8x8 mmx/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad8x4/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad4x8/, "$sse_x86inc"; + +add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc"; + +add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad32x64_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad64x32_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const 
uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad32x16_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad32x32_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad4x8_avg/, "$sse_x86inc"; + +add_proto qw/unsigned int vp9_sad4x4_avg/, "const 
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"; +specialize qw/vp9_sad4x4_avg/, "$sse_x86inc"; + +add_proto qw/unsigned int vp9_variance_halfpixvar16x16_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance_halfpixvar16x16_h/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance_halfpixvar16x16_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance_halfpixvar16x16_v/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance_halfpixvar16x16_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance_halfpixvar16x16_hv/, "$sse2_x86inc"; + +add_proto qw/unsigned int vp9_variance_halfpixvar64x64_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance_halfpixvar64x64_h/; + +add_proto qw/unsigned int vp9_variance_halfpixvar64x64_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance_halfpixvar64x64_v/; + +add_proto qw/unsigned int vp9_variance_halfpixvar64x64_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance_halfpixvar64x64_hv/; + +add_proto qw/unsigned int vp9_variance_halfpixvar32x32_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance_halfpixvar32x32_h/; + +add_proto qw/unsigned int vp9_variance_halfpixvar32x32_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance_halfpixvar32x32_v/; + +add_proto qw/unsigned int 
vp9_variance_halfpixvar32x32_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_variance_halfpixvar32x32_hv/; + +add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad64x64x3/; + +add_proto qw/void vp9_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad32x32x3/; + +add_proto qw/void vp9_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad16x16x3 sse3 ssse3/; + +add_proto qw/void vp9_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad16x8x3 sse3 ssse3/; + +add_proto qw/void vp9_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad8x16x3 sse3/; + +add_proto qw/void vp9_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad8x8x3 sse3/; + +add_proto qw/void vp9_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad4x4x3 sse3/; + +add_proto qw/void vp9_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad64x64x8/; + +add_proto qw/void vp9_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad32x32x8/; + +add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad16x16x8 sse4/; + 
+add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad16x8x8 sse4/; + +add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad8x16x8 sse4/; + +add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad8x8x8 sse4/; + +add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad8x4x8/; + +add_proto qw/void vp9_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad4x8x8/; + +add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +specialize qw/vp9_sad4x4x8 sse4/; + +add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad64x64x4d sse2/; + +add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad32x64x4d sse2/; + +add_proto qw/void vp9_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad64x32x4d sse2/; + +add_proto qw/void vp9_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad32x16x4d sse2/; + +add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad16x32x4d sse2/; + 
+add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad32x32x4d sse2/; + +add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad16x16x4d sse2/; + +add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad16x8x4d sse2/; + +add_proto qw/void vp9_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad8x16x4d sse2/; + +add_proto qw/void vp9_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad8x8x4d sse2/; + +# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form +add_proto qw/void vp9_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad8x4x4d sse2/; + +add_proto qw/void vp9_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad4x8x4d sse/; + +add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp9_sad4x4x4d sse/; + +#add_proto qw/unsigned int vp9_sub_pixel_mse16x16/, "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"; +#specialize qw/vp9_sub_pixel_mse16x16 sse2 mmx/; + +add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +specialize 
qw/vp9_mse16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc"; + +add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +specialize qw/vp9_mse8x16/; + +add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +specialize qw/vp9_mse16x8/; + +add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +specialize qw/vp9_mse8x8/; + +add_proto qw/unsigned int vp9_sub_pixel_mse64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_mse64x64/; + +add_proto qw/unsigned int vp9_sub_pixel_mse32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp9_sub_pixel_mse32x32/; + +add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; +specialize qw/vp9_get_mb_ss mmx sse2/; +# ENCODEMB INVOKE + +add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"; +specialize qw/vp9_block_error/, "$sse2_x86inc"; + +add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; +specialize qw/vp9_subtract_block/, "$sse2_x86inc"; + +add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_b/, "$ssse3_x86_64"; + 
+add_proto qw/void vp9_quantize_b_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64"; + +# +# Structured Similarity (SSIM) +# +if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { + add_proto qw/void vp9_ssim_parms_8x8/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"; + specialize qw/vp9_ssim_parms_8x8/, "$sse2_x86_64"; + + add_proto qw/void vp9_ssim_parms_16x16/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"; + specialize qw/vp9_ssim_parms_16x16/, "$sse2_x86_64"; +} + +# fdct functions +add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type"; +specialize qw/vp9_fht4x4 sse2 avx2/; + +add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type"; +specialize qw/vp9_fht8x8 sse2 avx2/; + +add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type"; +specialize qw/vp9_fht16x16 sse2 avx2/; + +add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fwht4x4/; + +add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct4x4 sse2 avx2/; + +add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct8x8 sse2 avx2/; + +add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct16x16 sse2 avx2/; + +add_proto qw/void 
vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct32x32 sse2 avx2/; + +add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, int16_t *output, int stride"; +specialize qw/vp9_fdct32x32_rd sse2 avx2/; + +# +# Motion search +# +add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, struct mv *best_mv"; +specialize qw/vp9_full_search_sad sse3 sse4_1/; +$vp9_full_search_sad_sse3=vp9_full_search_sadx3; +$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8; + +add_proto qw/int vp9_refining_search_sad/, "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"; +specialize qw/vp9_refining_search_sad sse3/; +$vp9_refining_search_sad_sse3=vp9_refining_search_sadx4; + +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"; +specialize qw/vp9_diamond_search_sad sse3/; +$vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4; + +add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"; +specialize qw/vp9_full_range_search/; + +add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; +specialize qw/vp9_temporal_filter_apply sse2/; + +} +# end encoder functions +1; diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh deleted file mode 100644 index 
2c0864e..0000000 --- a/libvpx/vp9/common/vp9_rtcd_defs.sh +++ /dev/null @@ -1,744 +0,0 @@ -vp9_common_forward_decls() { -cat <<EOF -/* - * VP9 - */ - -#include "vpx/vpx_integer.h" -#include "vp9/common/vp9_enums.h" - -struct macroblockd; - -/* Encoder forward decls */ -struct macroblock; -struct vp9_variance_vtable; - -#define DEC_MVCOSTS int *mvjcost, int *mvcost[2] -union int_mv; -struct yv12_buffer_config; -EOF -} -forward_decls vp9_common_forward_decls - -# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. -[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse && - sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2 - -# this variable is for functions that are 64 bit only. -[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && - ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2 - -# -# RECON -# -prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_4x4 $ssse3_x86inc - -prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d45_predictor_4x4 $ssse3_x86inc - -prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_4x4 $ssse3_x86inc - -prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_4x4 $ssse3_x86inc dspr2 - -prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d117_predictor_4x4 - -prototype void vp9_d135_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d135_predictor_4x4 - -prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize 
vp9_d153_predictor_4x4 $ssse3_x86inc - -prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_4x4 $sse_x86inc - -prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_4x4 $sse_x86inc dspr2 - -prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_4x4 $sse_x86inc dspr2 - -prototype void vp9_dc_top_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_top_predictor_4x4 - -prototype void vp9_dc_left_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_left_predictor_4x4 - -prototype void vp9_dc_128_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_128_predictor_4x4 - -prototype void vp9_d207_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_8x8 $ssse3_x86inc - -prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d45_predictor_8x8 $ssse3_x86inc - -prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_8x8 $ssse3_x86inc - -prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_8x8 $ssse3_x86inc dspr2 - -prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d117_predictor_8x8 - -prototype void vp9_d135_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d135_predictor_8x8 - -prototype 
void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d153_predictor_8x8 $ssse3_x86inc - -prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_8x8 $sse_x86inc - -prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_8x8 $sse2_x86inc dspr2 - -prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_8x8 $sse_x86inc dspr2 - -prototype void vp9_dc_top_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_top_predictor_8x8 - -prototype void vp9_dc_left_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_left_predictor_8x8 - -prototype void vp9_dc_128_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_128_predictor_8x8 - -prototype void vp9_d207_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_16x16 $ssse3_x86inc - -prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d45_predictor_16x16 $ssse3_x86inc - -prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_16x16 $ssse3_x86inc - -prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_16x16 $ssse3_x86inc dspr2 - -prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d117_predictor_16x16 - -prototype void 
vp9_d135_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d135_predictor_16x16 - -prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d153_predictor_16x16 $ssse3_x86inc - -prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_16x16 $sse2_x86inc - -prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_16x16 $sse2_x86inc - -prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_16x16 $sse2_x86inc dspr2 - -prototype void vp9_dc_top_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_top_predictor_16x16 - -prototype void vp9_dc_left_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_left_predictor_16x16 - -prototype void vp9_dc_128_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_128_predictor_16x16 - -prototype void vp9_d207_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_32x32 $ssse3_x86inc - -prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d45_predictor_32x32 $ssse3_x86inc - -prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_32x32 $ssse3_x86inc - -prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_32x32 $ssse3_x86inc - -prototype 
void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d117_predictor_32x32 - -prototype void vp9_d135_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d135_predictor_32x32 - -prototype void vp9_d153_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d153_predictor_32x32 - -prototype void vp9_v_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_v_predictor_32x32 $sse2_x86inc - -prototype void vp9_tm_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_32x32 $sse2_x86_64 - -prototype void vp9_dc_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_32x32 $sse2_x86inc - -prototype void vp9_dc_top_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_top_predictor_32x32 - -prototype void vp9_dc_left_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_left_predictor_32x32 - -prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_128_predictor_32x32 - -# -# Loopfilter -# -prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" -specialize vp9_mb_lpf_vertical_edge_w sse2 neon dspr2 - -prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_vertical_edge sse2 neon dspr2 - -prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const 
uint8_t *thresh, int count" -specialize vp9_loop_filter_vertical_edge mmx neon dspr2 - -prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2 - -prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2 - -prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_loop_filter_horizontal_edge mmx neon dspr2 - -# -# post proc -# -if [ "$CONFIG_VP9_POSTPROC" = "yes" ]; then -prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit" -specialize vp9_mbpost_proc_down mmx sse2 -vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm - -prototype void vp9_mbpost_proc_across_ip "uint8_t *src, int pitch, int rows, int cols, int flimit" -specialize vp9_mbpost_proc_across_ip sse2 -vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm - -prototype void vp9_post_proc_down_and_across "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit" -specialize vp9_post_proc_down_and_across mmx sse2 -vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm - -prototype void vp9_plane_add_noise "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch" -specialize vp9_plane_add_noise mmx sse2 -vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt -fi - -prototype void vp9_blend_mb_inner "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride" -specialize vp9_blend_mb_inner - -prototype void vp9_blend_mb_outer "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int 
u1, int v1, int alpha, int stride" -specialize vp9_blend_mb_outer - -prototype void vp9_blend_b "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride" -specialize vp9_blend_b - -# -# Sub Pixel Filters -# -prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_copy $sse2_x86inc neon dspr2 - -prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_avg $sse2_x86inc neon dspr2 - -prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8 sse2 ssse3 neon dspr2 - -prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2 - -prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_vert sse2 ssse3 neon dspr2 - -prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg sse2 ssse3 neon dspr2 - -prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, 
int w, int h" -specialize vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2 - -prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 - -# -# dct -# -prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_1_add sse2 neon dspr2 - -prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct4x4_16_add sse2 neon dspr2 - -prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_1_add sse2 neon dspr2 - -prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_64_add sse2 neon dspr2 - -prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct8x8_10_add sse2 neon dspr2 - -prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_1_add sse2 neon dspr2 - -prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_256_add sse2 neon dspr2 - -prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct16x16_10_add sse2 neon dspr2 - -prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1024_add sse2 neon dspr2 - -prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_34_add sse2 dspr2 - -prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1_add sse2 neon dspr2 - -prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int 
tx_type" -specialize vp9_iht4x4_16_add sse2 neon dspr2 - -prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_iht8x8_64_add sse2 neon dspr2 - -prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type" -specialize vp9_iht16x16_256_add sse2 dspr2 - -# dct and add - -prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_iwht4x4_1_add - -prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_iwht4x4_16_add - -# -# Encoder functions below this point. -# -if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then - - -# variance -prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x16 $sse2_x86inc - -prototype unsigned int vp9_variance16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x32 $sse2_x86inc - -prototype unsigned int vp9_variance64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance64x32 $sse2_x86inc - -prototype unsigned int vp9_variance32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x64 $sse2_x86inc - -prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x32 $sse2_x86inc - -prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance64x64 $sse2_x86inc - -prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse" -specialize vp9_variance16x16 mmx $sse2_x86inc - -prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x8 mmx $sse2_x86inc - -prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x16 mmx $sse2_x86inc - -prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x8 mmx $sse2_x86inc - -prototype void vp9_get_sse_sum_8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum" -specialize vp9_get_sse_sum_8x8 sse2 -vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2 - -prototype unsigned int vp9_variance8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x4 $sse2_x86inc - -prototype unsigned int vp9_variance4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance4x8 $sse2_x86inc - -prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance4x4 mmx $sse2_x86inc - -prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int 
vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x64 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x64 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x16 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x8 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int 
vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x8 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x8 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x8 $sse2_x86inc $ssse3_x86inc - -# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form -prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x4 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x4 $sse2_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x8 $sse_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x8 $sse_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x4 $sse_x86inc $ssse3_x86inc -#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt - -prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x4 $sse_x86inc $ssse3_x86inc - -prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad64x64 $sse2_x86inc - -prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x64 $sse2_x86inc - -prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad64x32 $sse2_x86inc - -prototype unsigned int vp9_sad32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x16 $sse2_x86inc - -prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x32 $sse2_x86inc - -prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x32 $sse2_x86inc - -prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x16 mmx $sse2_x86inc - -prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x8 mmx $sse2_x86inc - -prototype unsigned 
int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x16 mmx $sse2_x86inc - -prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x8 mmx $sse2_x86inc - -prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x4 $sse2_x86inc - -prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad4x8 $sse_x86inc - -prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad4x4 mmx $sse_x86inc - -prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad64x64_avg $sse2_x86inc - -prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x64_avg $sse2_x86inc - -prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad64x32_avg $sse2_x86inc - -prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x16_avg $sse2_x86inc - -prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x32_avg $sse2_x86inc - -prototype 
unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x32_avg $sse2_x86inc - -prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x16_avg $sse2_x86inc - -prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x8_avg $sse2_x86inc - -prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x16_avg $sse2_x86inc - -prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x8_avg $sse2_x86inc - -prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x4_avg $sse2_x86inc - -prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad4x8_avg $sse_x86inc - -prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad4x4_avg $sse_x86inc - -prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_h $sse2_x86inc - -prototype unsigned int vp9_variance_halfpixvar16x16_v "const 
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_v $sse2_x86inc - -prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_hv $sse2_x86inc - -prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar64x64_h - -prototype unsigned int vp9_variance_halfpixvar64x64_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar64x64_v - -prototype unsigned int vp9_variance_halfpixvar64x64_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar64x64_hv - -prototype unsigned int vp9_variance_halfpixvar32x32_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar32x32_h - -prototype unsigned int vp9_variance_halfpixvar32x32_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar32x32_v - -prototype unsigned int vp9_variance_halfpixvar32x32_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar32x32_hv - -prototype void vp9_sad64x64x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad64x64x3 - -prototype void vp9_sad32x32x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad32x32x3 - -prototype void vp9_sad16x16x3 "const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x16x3 sse3 ssse3 - -prototype void vp9_sad16x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x8x3 sse3 ssse3 - -prototype void vp9_sad8x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x16x3 sse3 - -prototype void vp9_sad8x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x8x3 sse3 - -prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp9_sad4x4x3 sse3 - -prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad64x64x8 - -prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad32x32x8 - -prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad16x16x8 sse4 - -prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad16x8x8 sse4 - -prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad8x16x8 sse4 - -prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad8x8x8 sse4 - -prototype void vp9_sad8x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad8x4x8 - -prototype void vp9_sad4x8x8 "const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad4x8x8 - -prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" -specialize vp9_sad4x4x8 sse4 - -prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad64x64x4d sse2 - -prototype void vp9_sad32x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad32x64x4d sse2 - -prototype void vp9_sad64x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad64x32x4d sse2 - -prototype void vp9_sad32x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad32x16x4d sse2 - -prototype void vp9_sad16x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x32x4d sse2 - -prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad32x32x4d sse2 - -prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x16x4d sse2 - -prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad16x8x4d sse2 - -prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x16x4d sse2 - -prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const 
ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x8x4d sse2 - -# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form -prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad8x4x4d sse2 - -prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad4x8x4d sse - -prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp9_sad4x4x4d sse - -#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" -#specialize vp9_sub_pixel_mse16x16 sse2 mmx - -prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" -specialize vp9_mse16x16 mmx $sse2_x86inc - -prototype unsigned int vp9_mse8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" -specialize vp9_mse8x16 - -prototype unsigned int vp9_mse16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" -specialize vp9_mse16x8 - -prototype unsigned int vp9_mse8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" -specialize vp9_mse8x8 - -prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_mse64x64 - -prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse" -specialize vp9_sub_pixel_mse32x32 - -prototype unsigned int vp9_get_mb_ss "const int16_t *" -specialize vp9_get_mb_ss mmx sse2 -# ENCODEMB INVOKE - -prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz" -specialize vp9_block_error $sse2_x86inc - -prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" -specialize vp9_subtract_block $sse2_x86inc - -prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" -specialize vp9_quantize_b $ssse3_x86_64 - -prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" -specialize vp9_quantize_b_32x32 $ssse3_x86_64 - -# -# Structured Similarity (SSIM) -# -if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then - prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" - specialize vp9_ssim_parms_8x8 $sse2_x86_64 - - prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" - specialize vp9_ssim_parms_16x16 $sse2_x86_64 -fi - -# fdct functions -prototype void vp9_short_fht4x4 "const int16_t *input, 
int16_t *output, int stride, int tx_type" -specialize vp9_short_fht4x4 sse2 - -prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht8x8 sse2 - -prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" -specialize vp9_short_fht16x16 sse2 - -prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fwht4x4 - -prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct4x4 sse2 - -prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct8x8 sse2 - -prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct16x16 sse2 - -prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct32x32 sse2 - -prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride" -specialize vp9_fdct32x32_rd sse2 - -# -# Motion search -# -prototype int vp9_full_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n" -specialize vp9_full_search_sad sse3 sse4_1 -vp9_full_search_sad_sse3=vp9_full_search_sadx3 -vp9_full_search_sad_sse4_1=vp9_full_search_sadx8 - -prototype int vp9_refining_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv" -specialize vp9_refining_search_sad sse3 -vp9_refining_search_sad_sse3=vp9_refining_search_sadx4 - -prototype int vp9_diamond_search_sad "struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv" -specialize vp9_diamond_search_sad sse3 -vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4 - 
-prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count" -specialize vp9_temporal_filter_apply sse2 - -prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction" -specialize vp9_yv12_copy_partial_frame - - -fi -# end encoder functions diff --git a/libvpx/vp9/common/vp9_sadmxn.h b/libvpx/vp9/common/vp9_sadmxn.h deleted file mode 100644 index b2dfd63..0000000 --- a/libvpx/vp9/common/vp9_sadmxn.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_SADMXN_H_ -#define VP9_COMMON_VP9_SADMXN_H_ - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -static INLINE unsigned int sad_mx_n_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - int m, - int n) { - int r, c; - unsigned int sad = 0; - - for (r = 0; r < n; r++) { - for (c = 0; c < m; c++) { - sad += abs(src_ptr[c] - ref_ptr[c]); - } - - src_ptr += src_stride; - ref_ptr += ref_stride; - } - - return sad; -} - -#endif // VP9_COMMON_VP9_SADMXN_H_ diff --git a/libvpx/vp9/common/vp9_scale.c b/libvpx/vp9/common/vp9_scale.c index 3f0994f..e0f1e34 100644 --- a/libvpx/vp9/common/vp9_scale.c +++ b/libvpx/vp9/common/vp9_scale.c @@ -12,47 +12,19 @@ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_scale.h" -static INLINE int scaled_x(int val, const struct scale_factors_common *sfc) { - return val * sfc->x_scale_fp >> REF_SCALE_SHIFT; +static INLINE int scaled_x(int val, const struct scale_factors *sf) { + return val * sf->x_scale_fp >> REF_SCALE_SHIFT; } -static INLINE int scaled_y(int val, const struct scale_factors_common *sfc) { - return val * sfc->y_scale_fp >> REF_SCALE_SHIFT; +static INLINE int scaled_y(int val, const struct scale_factors *sf) { + return val * sf->y_scale_fp >> REF_SCALE_SHIFT; } -static int unscaled_value(int val, const struct scale_factors_common *sfc) { - (void) sfc; +static int unscaled_value(int val, const struct scale_factors *sf) { + (void) sf; return val; } -static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) { - const MV32 res = { - scaled_y(mv->row, scale->sfc) + scale->y_offset_q4, - scaled_x(mv->col, scale->sfc) + scale->x_offset_q4 - }; - return res; -} - -static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) { - const MV32 res = { - mv->row, - mv->col - }; - return res; -} - -static void set_offsets_with_scaling(struct scale_factors *scale, - int row, int col) { - scale->x_offset_q4 = scaled_x(col << 
SUBPEL_BITS, scale->sfc) & SUBPEL_MASK; - scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK; -} - -static void set_offsets_without_scaling(struct scale_factors *scale, - int row, int col) { - scale->x_offset_q4 = 0; - scale->y_offset_q4 = 0; -} - static int get_fixed_point_scale_factor(int other_size, int this_size) { // Calculate scaling factor once for each reference frame // and use fixed point scaling factors in decoding and encoding routines. @@ -69,31 +41,36 @@ static int check_scale_factors(int other_w, int other_h, this_h <= 16 * other_h; } -void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, - struct scale_factors_common *scale_comm, +MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) { + const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK; + const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK; + const MV32 res = { + scaled_y(mv->row, sf) + y_off_q4, + scaled_x(mv->col, sf) + x_off_q4 + }; + return res; +} + +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h) { if (!check_scale_factors(other_w, other_h, this_w, this_h)) { - scale_comm->x_scale_fp = REF_INVALID_SCALE; - scale_comm->y_scale_fp = REF_INVALID_SCALE; + sf->x_scale_fp = REF_INVALID_SCALE; + sf->y_scale_fp = REF_INVALID_SCALE; return; } - scale_comm->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); - scale_comm->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); - scale_comm->x_step_q4 = scaled_x(16, scale_comm); - scale_comm->y_step_q4 = scaled_y(16, scale_comm); + sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + sf->x_step_q4 = scaled_x(16, sf); + sf->y_step_q4 = scaled_y(16, sf); - if (vp9_is_scaled(scale_comm)) { - scale_comm->scale_value_x = scaled_x; - scale_comm->scale_value_y = scaled_y; - scale_comm->set_scaled_offsets = 
set_offsets_with_scaling; - scale_comm->scale_mv = scaled_mv; + if (vp9_is_scaled(sf)) { + sf->scale_value_x = scaled_x; + sf->scale_value_y = scaled_y; } else { - scale_comm->scale_value_x = unscaled_value; - scale_comm->scale_value_y = unscaled_value; - scale_comm->set_scaled_offsets = set_offsets_without_scaling; - scale_comm->scale_mv = unscaled_mv; + sf->scale_value_x = unscaled_value; + sf->scale_value_y = unscaled_value; } // TODO(agrange): Investigate the best choice of functions to use here @@ -102,48 +79,44 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, // applied in one direction only, and not at all for 0,0, seems to give the // best quality, but it may be worth trying an additional mode that does // do the filtering on full-pel. - if (scale_comm->x_step_q4 == 16) { - if (scale_comm->y_step_q4 == 16) { + if (sf->x_step_q4 == 16) { + if (sf->y_step_q4 == 16) { // No scaling in either direction. - scale_comm->predict[0][0][0] = vp9_convolve_copy; - scale_comm->predict[0][0][1] = vp9_convolve_avg; - scale_comm->predict[0][1][0] = vp9_convolve8_vert; - scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert; - scale_comm->predict[1][0][0] = vp9_convolve8_horiz; - scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz; + sf->predict[0][0][0] = vp9_convolve_copy; + sf->predict[0][0][1] = vp9_convolve_avg; + sf->predict[0][1][0] = vp9_convolve8_vert; + sf->predict[0][1][1] = vp9_convolve8_avg_vert; + sf->predict[1][0][0] = vp9_convolve8_horiz; + sf->predict[1][0][1] = vp9_convolve8_avg_horiz; } else { // No scaling in x direction. Must always scale in the y direction. 
- scale_comm->predict[0][0][0] = vp9_convolve8_vert; - scale_comm->predict[0][0][1] = vp9_convolve8_avg_vert; - scale_comm->predict[0][1][0] = vp9_convolve8_vert; - scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert; - scale_comm->predict[1][0][0] = vp9_convolve8; - scale_comm->predict[1][0][1] = vp9_convolve8_avg; + sf->predict[0][0][0] = vp9_convolve8_vert; + sf->predict[0][0][1] = vp9_convolve8_avg_vert; + sf->predict[0][1][0] = vp9_convolve8_vert; + sf->predict[0][1][1] = vp9_convolve8_avg_vert; + sf->predict[1][0][0] = vp9_convolve8; + sf->predict[1][0][1] = vp9_convolve8_avg; } } else { - if (scale_comm->y_step_q4 == 16) { + if (sf->y_step_q4 == 16) { // No scaling in the y direction. Must always scale in the x direction. - scale_comm->predict[0][0][0] = vp9_convolve8_horiz; - scale_comm->predict[0][0][1] = vp9_convolve8_avg_horiz; - scale_comm->predict[0][1][0] = vp9_convolve8; - scale_comm->predict[0][1][1] = vp9_convolve8_avg; - scale_comm->predict[1][0][0] = vp9_convolve8_horiz; - scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz; + sf->predict[0][0][0] = vp9_convolve8_horiz; + sf->predict[0][0][1] = vp9_convolve8_avg_horiz; + sf->predict[0][1][0] = vp9_convolve8; + sf->predict[0][1][1] = vp9_convolve8_avg; + sf->predict[1][0][0] = vp9_convolve8_horiz; + sf->predict[1][0][1] = vp9_convolve8_avg_horiz; } else { // Must always scale in both directions. 
- scale_comm->predict[0][0][0] = vp9_convolve8; - scale_comm->predict[0][0][1] = vp9_convolve8_avg; - scale_comm->predict[0][1][0] = vp9_convolve8; - scale_comm->predict[0][1][1] = vp9_convolve8_avg; - scale_comm->predict[1][0][0] = vp9_convolve8; - scale_comm->predict[1][0][1] = vp9_convolve8_avg; + sf->predict[0][0][0] = vp9_convolve8; + sf->predict[0][0][1] = vp9_convolve8_avg; + sf->predict[0][1][0] = vp9_convolve8; + sf->predict[0][1][1] = vp9_convolve8_avg; + sf->predict[1][0][0] = vp9_convolve8; + sf->predict[1][0][1] = vp9_convolve8_avg; } } // 2D subpel motion always gets filtered in both directions - scale_comm->predict[1][1][0] = vp9_convolve8; - scale_comm->predict[1][1][1] = vp9_convolve8_avg; - - scale->sfc = scale_comm; - scale->x_offset_q4 = 0; // calculated per block - scale->y_offset_q4 = 0; // calculated per block + sf->predict[1][1][0] = vp9_convolve8; + sf->predict[1][1][1] = vp9_convolve8_avg; } diff --git a/libvpx/vp9/common/vp9_scale.h b/libvpx/vp9/common/vp9_scale.h index 1437fcd..a9dda18 100644 --- a/libvpx/vp9/common/vp9_scale.h +++ b/libvpx/vp9/common/vp9_scale.h @@ -14,44 +14,44 @@ #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_convolve.h" +#ifdef __cplusplus +extern "C" { +#endif + #define REF_SCALE_SHIFT 14 #define REF_NO_SCALE (1 << REF_SCALE_SHIFT) #define REF_INVALID_SCALE -1 -struct scale_factors; -struct scale_factors_common { +struct scale_factors { int x_scale_fp; // horizontal fixed point scale factor int y_scale_fp; // vertical fixed point scale factor int x_step_q4; int y_step_q4; - int (*scale_value_x)(int val, const struct scale_factors_common *sfc); - int (*scale_value_y)(int val, const struct scale_factors_common *sfc); - void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); - MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale); + int (*scale_value_x)(int val, const struct scale_factors *sf); + int (*scale_value_y)(int val, const struct scale_factors *sf); convolve_fn_t 
predict[2][2][2]; // horiz, vert, avg }; -struct scale_factors { - int x_offset_q4; - int y_offset_q4; - const struct scale_factors_common *sfc; -}; +MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); -void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, - struct scale_factors_common *scale_comm, +void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); -static int vp9_is_valid_scale(const struct scale_factors_common *sfc) { - return sfc->x_scale_fp != REF_INVALID_SCALE && - sfc->y_scale_fp != REF_INVALID_SCALE; +static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) { + return sf->x_scale_fp != REF_INVALID_SCALE && + sf->y_scale_fp != REF_INVALID_SCALE; } -static int vp9_is_scaled(const struct scale_factors_common *sfc) { - return sfc->x_scale_fp != REF_NO_SCALE || - sfc->y_scale_fp != REF_NO_SCALE; +static INLINE int vp9_is_scaled(const struct scale_factors *sf) { + return sf->x_scale_fp != REF_NO_SCALE || + sf->y_scale_fp != REF_NO_SCALE; } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_SCALE_H_ diff --git a/libvpx/vp9/common/vp9_scan.c b/libvpx/vp9/common/vp9_scan.c index f17da91..1ec5a0c 100644 --- a/libvpx/vp9/common/vp9_scan.c +++ b/libvpx/vp9/common/vp9_scan.c @@ -12,28 +12,28 @@ #include "vp9/common/vp9_scan.h" -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = { +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = { 0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15, }; -DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = { +DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = { 0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15, }; -DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = { +DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = { 0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15, }; -DECLARE_ALIGNED(16, const int16_t, 
vp9_default_scan_8x8[64]) = { +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26, 33, 19, 40, 12, 34, 27, 5, 41, @@ -44,7 +44,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = { 46, 39, 61, 54, 47, 62, 55, 63, }; -DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = { +DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = { 0, 8, 16, 1, 24, 9, 32, 17, 2, 40, 25, 10, 33, 18, 48, 3, 26, 41, 11, 56, 19, 34, 4, 49, @@ -55,7 +55,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = { 31, 61, 39, 54, 47, 62, 55, 63, }; -DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = { +DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = { 0, 1, 2, 8, 9, 3, 16, 10, 4, 17, 11, 24, 5, 18, 25, 12, 19, 26, 32, 6, 13, 20, 33, 27, @@ -66,7 +66,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = { 60, 39, 61, 47, 54, 55, 62, 63, }; -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = { +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = { 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, @@ -87,7 +87,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = { 255, }; -DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = { +DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = { 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, @@ -108,7 +108,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = { 255, }; -DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = { +DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = { 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 
20, 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, @@ -130,7 +130,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = { 255, }; -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = { +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100, @@ -233,38 +233,68 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = { // in {top, left, topleft, topright, bottomleft} order // for each position in raster scan order. // -1 indicates the neighbor does not exist. -DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); 
+DECLARE_ALIGNED(16, static int16_t, + default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, + default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_4x4[16]); +DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_4x4[16]); +DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_4x4[16]); +DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_8x8[64]); +DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_8x8[64]); +DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_8x8[64]); +DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_16x16[256]); +DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_16x16[256]); +DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_16x16[256]); +DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_32x32[1024]); -DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); -DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); -DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); -DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); -DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); -DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]); -DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]); -DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]); -DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); -DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); +const scan_order vp9_default_scan_orders[TX_SIZES] = { + {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors}, + {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors}, + {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, 
default_scan_32x32_neighbors}, +}; + +const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = { + { // TX_4X4 + {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors}, + {row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors}, + {col_scan_4x4, vp9_col_iscan_4x4, col_scan_4x4_neighbors}, + {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors} + }, { // TX_8X8 + {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors}, + {row_scan_8x8, vp9_row_iscan_8x8, row_scan_8x8_neighbors}, + {col_scan_8x8, vp9_col_iscan_8x8, col_scan_8x8_neighbors}, + {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors} + }, { // TX_16X16 + {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors}, + {row_scan_16x16, vp9_row_iscan_16x16, row_scan_16x16_neighbors}, + {col_scan_16x16, vp9_col_iscan_16x16, col_scan_16x16_neighbors}, + {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors} + }, { // TX_32X32 + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors}, + } +}; static int find_in_scan(const int16_t *scan, int l, int idx) { int n, l2 = l * l; @@ -276,9 +306,9 @@ static int find_in_scan(const int16_t *scan, int l, int idx) { assert(0); return -1; } -static void init_scan_neighbors(const int16_t *scan, - int16_t *iscan, - int l, int16_t *neighbors) { + +static void init_scan_neighbors(const int16_t *scan, int16_t *iscan, int l, + int16_t *neighbors) { int l2 = l * l; int n, i, j; @@ -302,15 +332,15 @@ static void init_scan_neighbors(const int16_t *scan, // use the combination of the two as a context. 
int a = (i - 1) * l + j; int b = i * l + j - 1; - if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 || - scan == vp9_col_scan_16x16) { + if (scan == col_scan_4x4 || scan == col_scan_8x8 || + scan == col_scan_16x16) { // in the col/row scan cases (as well as left/top edge cases), we set // both contexts to the same value, so we can branchlessly do a+b+1>>1 // which automatically becomes a if a == b neighbors[MAX_NEIGHBORS * n + 0] = neighbors[MAX_NEIGHBORS * n + 1] = a; - } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 || - scan == vp9_row_scan_16x16) { + } else if (scan == row_scan_4x4 || scan == row_scan_8x8 || + scan == row_scan_16x16) { neighbors[MAX_NEIGHBORS * n + 0] = neighbors[MAX_NEIGHBORS * n + 1] = b; } else { @@ -334,24 +364,24 @@ static void init_scan_neighbors(const int16_t *scan, } void vp9_init_neighbors() { - init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4, - vp9_default_scan_4x4_neighbors); - init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4, - vp9_row_scan_4x4_neighbors); - init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4, - vp9_col_scan_4x4_neighbors); - init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8, - vp9_default_scan_8x8_neighbors); - init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8, - vp9_row_scan_8x8_neighbors); - init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8, - vp9_col_scan_8x8_neighbors); - init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16, - vp9_default_scan_16x16_neighbors); - init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16, - vp9_row_scan_16x16_neighbors); - init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16, - vp9_col_scan_16x16_neighbors); - init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32, - vp9_default_scan_32x32_neighbors); + init_scan_neighbors(default_scan_4x4, vp9_default_iscan_4x4, 4, + default_scan_4x4_neighbors); + init_scan_neighbors(row_scan_4x4, 
vp9_row_iscan_4x4, 4, + row_scan_4x4_neighbors); + init_scan_neighbors(col_scan_4x4, vp9_col_iscan_4x4, 4, + col_scan_4x4_neighbors); + init_scan_neighbors(default_scan_8x8, vp9_default_iscan_8x8, 8, + default_scan_8x8_neighbors); + init_scan_neighbors(row_scan_8x8, vp9_row_iscan_8x8, 8, + row_scan_8x8_neighbors); + init_scan_neighbors(col_scan_8x8, vp9_col_iscan_8x8, 8, + col_scan_8x8_neighbors); + init_scan_neighbors(default_scan_16x16, vp9_default_iscan_16x16, 16, + default_scan_16x16_neighbors); + init_scan_neighbors(row_scan_16x16, vp9_row_iscan_16x16, 16, + row_scan_16x16_neighbors); + init_scan_neighbors(col_scan_16x16, vp9_col_iscan_16x16, 16, + col_scan_16x16_neighbors); + init_scan_neighbors(default_scan_32x32, vp9_default_iscan_32x32, 32, + default_scan_32x32_neighbors); } diff --git a/libvpx/vp9/common/vp9_scan.h b/libvpx/vp9/common/vp9_scan.h index 14a1a7e..9613b67 100644 --- a/libvpx/vp9/common/vp9_scan.h +++ b/libvpx/vp9/common/vp9_scan.h @@ -15,180 +15,24 @@ #include "vpx_ports/mem.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_blockd.h" -#define MAX_NEIGHBORS 2 - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); -extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); -extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); - 
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]); -extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); -extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); -extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]); -extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); - -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); +#ifdef __cplusplus +extern "C" { +#endif +#define MAX_NEIGHBORS 2 void vp9_init_neighbors(); -static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_scan_4x4; - case DCT_ADST: - return vp9_col_scan_4x4; - default: - return vp9_default_scan_4x4; - } -} - -static INLINE void get_scan_nb_4x4(TX_TYPE tx_type, - const int16_t **scan, const int16_t **nb) { - switch (tx_type) { - case ADST_DCT: - *scan = vp9_row_scan_4x4; - *nb = vp9_row_scan_4x4_neighbors; - break; - case DCT_ADST: - *scan = vp9_col_scan_4x4; - *nb = vp9_col_scan_4x4_neighbors; - break; - default: - 
*scan = vp9_default_scan_4x4; - *nb = vp9_default_scan_4x4_neighbors; - break; - } -} - -static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_iscan_4x4; - case DCT_ADST: - return vp9_col_iscan_4x4; - default: - return vp9_default_iscan_4x4; - } -} - -static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_scan_8x8; - case DCT_ADST: - return vp9_col_scan_8x8; - default: - return vp9_default_scan_8x8; - } -} - -static INLINE void get_scan_nb_8x8(TX_TYPE tx_type, - const int16_t **scan, const int16_t **nb) { - switch (tx_type) { - case ADST_DCT: - *scan = vp9_row_scan_8x8; - *nb = vp9_row_scan_8x8_neighbors; - break; - case DCT_ADST: - *scan = vp9_col_scan_8x8; - *nb = vp9_col_scan_8x8_neighbors; - break; - default: - *scan = vp9_default_scan_8x8; - *nb = vp9_default_scan_8x8_neighbors; - break; - } -} - -static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_iscan_8x8; - case DCT_ADST: - return vp9_col_iscan_8x8; - default: - return vp9_default_iscan_8x8; - } -} - -static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_scan_16x16; - case DCT_ADST: - return vp9_col_scan_16x16; - default: - return vp9_default_scan_16x16; - } -} +typedef struct { + const int16_t *scan; + const int16_t *iscan; + const int16_t *neighbors; +} scan_order; -static INLINE void get_scan_nb_16x16(TX_TYPE tx_type, - const int16_t **scan, const int16_t **nb) { - switch (tx_type) { - case ADST_DCT: - *scan = vp9_row_scan_16x16; - *nb = vp9_row_scan_16x16_neighbors; - break; - case DCT_ADST: - *scan = vp9_col_scan_16x16; - *nb = vp9_col_scan_16x16_neighbors; - break; - default: - *scan = vp9_default_scan_16x16; - *nb = vp9_default_scan_16x16_neighbors; - break; - } -} - -static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) { - switch (tx_type) { 
- case ADST_DCT: - return vp9_row_iscan_16x16; - case DCT_ADST: - return vp9_col_iscan_16x16; - default: - return vp9_default_iscan_16x16; - } -} +extern const scan_order vp9_default_scan_orders[TX_SIZES]; +extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES]; static INLINE int get_coef_context(const int16_t *neighbors, const uint8_t *token_cache, int c) { @@ -196,4 +40,8 @@ static INLINE int get_coef_context(const int16_t *neighbors, token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_SCAN_H_ diff --git a/libvpx/vp9/common/vp9_seg_common.c b/libvpx/vp9/common/vp9_seg_common.c index ef30404..910200e 100644 --- a/libvpx/vp9/common/vp9_seg_common.c +++ b/libvpx/vp9/common/vp9_seg_common.c @@ -41,11 +41,6 @@ void vp9_enable_segfeature(struct segmentation *seg, int segment_id, seg->feature_mask[segment_id] |= 1 << feature_id; } -void vp9_disable_segfeature(struct segmentation *seg, int segment_id, - SEG_LVL_FEATURES feature_id) { - seg->feature_mask[segment_id] &= ~(1 << feature_id); -} - int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { return seg_feature_data_max[feature_id]; } @@ -54,11 +49,6 @@ int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { return seg_feature_data_signed[feature_id]; } -void vp9_clear_segdata(struct segmentation *seg, int segment_id, - SEG_LVL_FEATURES feature_id) { - seg->feature_data[segment_id][feature_id] = 0; -} - void vp9_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data) { assert(seg_data <= seg_feature_data_max[feature_id]); diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h index eb38c06..ff2d66a 100644 --- a/libvpx/vp9/common/vp9_seg_common.h +++ b/libvpx/vp9/common/vp9_seg_common.h @@ -11,7 +11,11 @@ #ifndef VP9_COMMON_VP9_SEG_COMMON_H_ #define VP9_COMMON_VP9_SEG_COMMON_H_ -#include "vp9/common/vp9_treecoder.h" +#include "vp9/common/vp9_prob.h" + +#ifdef 
__cplusplus +extern "C" { +#endif #define SEGMENT_DELTADATA 0 #define SEGMENT_ABSDATA 1 @@ -55,18 +59,10 @@ void vp9_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -void vp9_disable_segfeature(struct segmentation *seg, - int segment_id, - SEG_LVL_FEATURES feature_id); - int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id); int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id); -void vp9_clear_segdata(struct segmentation *seg, - int segment_id, - SEG_LVL_FEATURES feature_id); - void vp9_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, @@ -78,5 +74,9 @@ int vp9_get_segdata(const struct segmentation *seg, extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)]; +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_SEG_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_systemdependent.h b/libvpx/vp9/common/vp9_systemdependent.h index 254a431..e971158 100644 --- a/libvpx/vp9/common/vp9_systemdependent.h +++ b/libvpx/vp9/common/vp9_systemdependent.h @@ -12,8 +12,16 @@ #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ #ifdef _MSC_VER -#include <math.h> -#define snprintf _snprintf +# include <math.h> // the ceil() definition must precede intrin.h +# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86)) +# include <intrin.h> +# define USE_MSC_INTRIN +# endif +# define snprintf _snprintf +#endif + +#ifdef __cplusplus +extern "C" { #endif #include "./vpx_config.h" @@ -26,7 +34,7 @@ void vpx_reset_mmx_state(void); #if defined(_MSC_VER) && _MSC_VER < 1800 // round is not defined in MSVC before VS2013. -static int round(double x) { +static INLINE int round(double x) { if (x < 0) return (int)ceil(x - 0.5); else @@ -34,7 +42,42 @@ static int round(double x) { } #endif -struct VP9Common; -void vp9_machine_specific_config(struct VP9Common *cm); +// use GNU builtins where available. 
+#if defined(__GNUC__) && \ + ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static INLINE int get_msb(unsigned int n) { + return 31 ^ __builtin_clz(n); +} +#elif defined(USE_MSC_INTRIN) +#pragma intrinsic(_BitScanReverse) + +static INLINE int get_msb(unsigned int n) { + unsigned long first_set_bit; + _BitScanReverse(&first_set_bit, n); + return first_set_bit; +} +#undef USE_MSC_INTRIN +#else +// Returns (int)floor(log2(n)). n must be > 0. +static INLINE int get_msb(unsigned int n) { + int log = 0; + unsigned int value = n; + int i; + + for (i = 4; i >= 0; --i) { + const int shift = (1 << i); + const unsigned int x = value >> shift; + if (x != 0) { + value = x; + log += shift; + } + } + return log; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ diff --git a/libvpx/vp9/common/vp9_textblit.h b/libvpx/vp9/common/vp9_textblit.h index c968628..158ec1b 100644 --- a/libvpx/vp9/common/vp9_textblit.h +++ b/libvpx/vp9/common/vp9_textblit.h @@ -11,9 +11,17 @@ #ifndef VP9_COMMON_VP9_TEXTBLIT_H_ #define VP9_COMMON_VP9_TEXTBLIT_H_ +#ifdef __cplusplus +extern "C" { +#endif + void vp9_blit_text(const char *msg, unsigned char *address, int pitch); void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, int pitch); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_TEXTBLIT_H_ diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c index e3035d0..78909dd 100644 --- a/libvpx/vp9/common/vp9_tile_common.c +++ b/libvpx/vp9/common/vp9_tile_common.c @@ -15,46 +15,37 @@ #define MIN_TILE_WIDTH_B64 4 #define MAX_TILE_WIDTH_B64 64 -static int to_sbs(n_mis) { - return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2; +static int get_tile_offset(int idx, int mis, int log2) { + const int sb_cols = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2; + const int offset = ((idx * sb_cols) >> log2) << MI_BLOCK_SIZE_LOG2; + return MIN(offset, mis); } -static void 
get_tile_offsets(int *min_tile_off, int *max_tile_off, - int tile_idx, int log2_n_tiles, int n_mis) { - const int n_sbs = to_sbs(n_mis); - const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles; - const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles; - - *min_tile_off = MIN(sb_off1 << 3, n_mis); - *max_tile_off = MIN(sb_off2 << 3, n_mis); -} - -void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, - int row_idx, int col_idx) { - get_tile_offsets(&tile->mi_row_start, &tile->mi_row_end, - row_idx, cm->log2_tile_rows, cm->mi_rows); - get_tile_offsets(&tile->mi_col_start, &tile->mi_col_end, - col_idx, cm->log2_tile_cols, cm->mi_cols); +void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) { + tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows); + tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows); + tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols); + tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols); } void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols) { - const int sb_cols = to_sbs(mi_cols); - int min_log2_n_tiles, max_log2_n_tiles; + const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2; + int min_log2 = 0, max_log2 = 0; - for (max_log2_n_tiles = 0; - (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_B64; - max_log2_n_tiles++) {} - max_log2_n_tiles--; - if (max_log2_n_tiles < 0) - max_log2_n_tiles = 0; + // max + while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64) + ++max_log2; + --max_log2; + if (max_log2 < 0) + max_log2 = 0; - for (min_log2_n_tiles = 0; - (MAX_TILE_WIDTH_B64 << min_log2_n_tiles) < sb_cols; - min_log2_n_tiles++) {} + // min + while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols) + ++min_log2; - assert(min_log2_n_tiles <= max_log2_n_tiles); + assert(min_log2 <= max_log2); - *min_log2_tile_cols = min_log2_n_tiles; - *max_log2_tile_cols = max_log2_n_tiles; + 
*min_log2_tile_cols = min_log2; + *max_log2_tile_cols = max_log2; } diff --git a/libvpx/vp9/common/vp9_tile_common.h b/libvpx/vp9/common/vp9_tile_common.h index a110abb..a97719e 100644 --- a/libvpx/vp9/common/vp9_tile_common.h +++ b/libvpx/vp9/common/vp9_tile_common.h @@ -11,6 +11,10 @@ #ifndef VP9_COMMON_VP9_TILE_COMMON_H_ #define VP9_COMMON_VP9_TILE_COMMON_H_ +#ifdef __cplusplus +extern "C" { +#endif + struct VP9Common; typedef struct TileInfo { @@ -18,12 +22,16 @@ typedef struct TileInfo { int mi_col_start, mi_col_end; } TileInfo; -// initializes 'tile->mi_(row|col)_(start|end)' for (row_idx, col_idx) based on +// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on // 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)' void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm, - int row_idx, int col_idx); + int row, int col); void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_treecoder.c b/libvpx/vp9/common/vp9_treecoder.c deleted file mode 100644 index e2a5b9f..0000000 --- a/libvpx/vp9/common/vp9_treecoder.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include <assert.h> - -#include "./vpx_config.h" -#include "vp9/common/vp9_treecoder.h" - -static void tree2tok(struct vp9_token *const p, vp9_tree t, - int i, int v, int l) { - v += v; - ++l; - - do { - const vp9_tree_index j = t[i++]; - - if (j <= 0) { - p[-j].value = v; - p[-j].len = l; - } else { - tree2tok(p, t, j, v, l); - } - } while (++v & 1); -} - -void vp9_tokens_from_tree(struct vp9_token *p, vp9_tree t) { - tree2tok(p, t, 0, 0, 0); -} - -static unsigned int convert_distribution(unsigned int i, vp9_tree tree, - unsigned int branch_ct[][2], - const unsigned int num_events[]) { - unsigned int left, right; - - if (tree[i] <= 0) - left = num_events[-tree[i]]; - else - left = convert_distribution(tree[i], tree, branch_ct, num_events); - - if (tree[i + 1] <= 0) - right = num_events[-tree[i + 1]]; - else - right = convert_distribution(tree[i + 1], tree, branch_ct, num_events); - - branch_ct[i >> 1][0] = left; - branch_ct[i >> 1][1] = right; - return left + right; -} - -void vp9_tree_probs_from_distribution(vp9_tree tree, - unsigned int branch_ct[/* n-1 */][2], - const unsigned int num_events[/* n */]) { - convert_distribution(0, tree, branch_ct, num_events); -} - - diff --git a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h deleted file mode 100644 index a79b156..0000000 --- a/libvpx/vp9/common/vp9_treecoder.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_TREECODER_H_ -#define VP9_COMMON_VP9_TREECODER_H_ - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vp9/common/vp9_common.h" - -typedef uint8_t vp9_prob; - -#define vp9_prob_half ((vp9_prob) 128) - -typedef int8_t vp9_tree_index; - -#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2) - -#define vp9_complement(x) (255 - x) - -/* We build coding trees compactly in arrays. - Each node of the tree is a pair of vp9_tree_indices. - Array index often references a corresponding probability table. - Index <= 0 means done encoding/decoding and value = -Index, - Index > 0 means need another bit, specification at index. - Nonnegative indices are always even; processing begins at node 0. */ - -typedef const vp9_tree_index vp9_tree[]; - -struct vp9_token { - int value; - int len; -}; - -/* Construct encoding array from tree. */ - -void vp9_tokens_from_tree(struct vp9_token*, vp9_tree); - -/* Convert array of token occurrence counts into a table of probabilities - for the associated binary encoding tree. Also writes count of branches - taken for each node on the tree; this facilitiates decisions as to - probability updates. */ - -void vp9_tree_probs_from_distribution(vp9_tree tree, - unsigned int branch_ct[ /* n - 1 */ ][2], - const unsigned int num_events[ /* n */ ]); - - -static INLINE vp9_prob clip_prob(int p) { - return (p > 255) ? 255u : (p < 1) ? 1u : p; -} - -// int64 is not needed for normal frame level calculations. -// However when outputing entropy stats accumulated over many frames -// or even clips we can overflow int math. -#ifdef ENTROPY_STATS -static INLINE vp9_prob get_prob(int num, int den) { - return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); -} -#else -static INLINE vp9_prob get_prob(int num, int den) { - return (den == 0) ? 
128u : clip_prob((num * 256 + (den >> 1)) / den); -} -#endif - -static INLINE vp9_prob get_binary_prob(int n0, int n1) { - return get_prob(n0, n0 + n1); -} - -/* this function assumes prob1 and prob2 are already within [1,255] range */ -static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { - return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); -} - -static INLINE vp9_prob merge_probs(vp9_prob pre_prob, - const unsigned int ct[2], - unsigned int count_sat, - unsigned int max_update_factor) { - const vp9_prob prob = get_binary_prob(ct[0], ct[1]); - const unsigned int count = MIN(ct[0] + ct[1], count_sat); - const unsigned int factor = max_update_factor * count / count_sat; - return weighted_prob(pre_prob, prob, factor); -} - -static unsigned int tree_merge_probs_impl(unsigned int i, - const vp9_tree_index *tree, - const vp9_prob *pre_probs, - const unsigned int *counts, - unsigned int count_sat, - unsigned int max_update_factor, - vp9_prob *probs) { - const int l = tree[i]; - const unsigned int left_count = (l <= 0) - ? counts[-l] - : tree_merge_probs_impl(l, tree, pre_probs, counts, - count_sat, max_update_factor, probs); - const int r = tree[i + 1]; - const unsigned int right_count = (r <= 0) - ? 
counts[-r] - : tree_merge_probs_impl(r, tree, pre_probs, counts, - count_sat, max_update_factor, probs); - const unsigned int ct[2] = { left_count, right_count }; - probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, - count_sat, max_update_factor); - return left_count + right_count; -} - -static void tree_merge_probs(const vp9_tree_index *tree, - const vp9_prob *pre_probs, - const unsigned int *counts, - unsigned int count_sat, - unsigned int max_update_factor, vp9_prob *probs) { - tree_merge_probs_impl(0, tree, pre_probs, counts, - count_sat, max_update_factor, probs); -} - - -#endif // VP9_COMMON_VP9_TREECODER_H_ diff --git a/libvpx/vp9/common/x86/vp9_asm_stubs.c b/libvpx/vp9/common/x86/vp9_asm_stubs.c index 106e6d4..1b4904c 100644 --- a/libvpx/vp9/common/x86/vp9_asm_stubs.c +++ b/libvpx/vp9/common/x86/vp9_asm_stubs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -13,45 +13,205 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vpx_ports/mem.h" -/////////////////////////////////////////////////////////////////////////// -// the mmx function that does the bilinear filtering and var calculation // -// int one pass // -/////////////////////////////////////////////////////////////////////////// -DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { - { 128, 128, 128, 128, 0, 0, 0, 0 }, - { 120, 120, 120, 120, 8, 8, 8, 8 }, - { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 104, 104, 104, 104, 24, 24, 24, 24 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, - { 88, 88, 88, 88, 40, 40, 40, 40 }, - { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 72, 72, 72, 72, 56, 56, 56, 56 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, - { 56, 56, 56, 56, 72, 72, 72, 72 }, - { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 40, 40, 40, 40, 88, 88, 88, 
88 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, - { 24, 24, 24, 24, 104, 104, 104, 104 }, - { 16, 16, 16, 16, 112, 112, 112, 112 }, - { 8, 8, 8, 8, 120, 120, 120, 120 } -}; typedef void filter8_1dfunction ( const unsigned char *src_ptr, - const unsigned int src_pitch, + const ptrdiff_t src_pitch, unsigned char *output_ptr, - unsigned int out_pitch, + ptrdiff_t out_pitch, unsigned int output_height, const short *filter ); +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_convolve8_##name##_c(src, src_stride, dst, 
dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h); \ + } \ +} + +#define FUN_CONV_2D(avg, opt) \ +void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \ + vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 7); \ + vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \ + vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 1); \ + vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } else { \ + vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ + } \ +} +#if HAVE_AVX2 +filter8_1dfunction vp9_filter_block1d16_v8_avx2; +filter8_1dfunction vp9_filter_block1d16_h8_avx2; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +#else +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; 
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#endif +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 +#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 +#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 +#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 +#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 +#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 +#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 +// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif #if HAVE_SSSE3 +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; 
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +#else filter8_1dfunction vp9_filter_block1d16_v8_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#endif filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; @@ -59,201 +219,57 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; -void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Ensure the filter can be compressed to int16_t. 
*/ - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); 
- src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); - - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } else { - vp9_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; 
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; -void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); +// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, 
src - src_stride * 3, avg_, + ssse3); - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } else { - vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} +// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3); +FUN_CONV_2D(avg_ , ssse3); #endif #if HAVE_SSE2 @@ -270,199 +286,54 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; -void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Ensure the filter can be compressed to int16_t. 
*/ - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} +filter8_1dfunction vp9_filter_block1d16_v2_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; -void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_vert_c(src, 
src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} +// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); -void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_avg_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_avg_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_avg_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - 
vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); - - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } else { - vp9_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} - -void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); 
- - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } else { - vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} +// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2); +FUN_CONV_2D(avg_ , sse2); #endif diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index 2a33844..13a5b5a 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -174,15 +174,13 @@ void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { static INLINE void transpose_4x4(__m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); } -static void idct4_1d_sse2(__m128i *in) { +static void idct4_sse2(__m128i *in) { const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i k__cospi_p16_m16 = 
pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); @@ -192,8 +190,8 @@ static void idct4_1d_sse2(__m128i *in) { transpose_4x4(in); // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[2]); - u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); @@ -209,19 +207,16 @@ static void idct4_1d_sse2(__m128i *in) { v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[0] = _mm_packs_epi32(v[0], v[2]); - u[1] = _mm_packs_epi32(v[1], v[3]); - u[2] = _mm_unpackhi_epi64(u[0], u[0]); - u[3] = _mm_unpackhi_epi64(u[1], u[1]); + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[3], v[2]); // stage 2 - in[0] = _mm_add_epi16(u[0], u[3]); - in[1] = _mm_add_epi16(u[1], u[2]); - in[2] = _mm_sub_epi16(u[1], u[2]); - in[3] = _mm_sub_epi16(u[0], u[3]); + in[0] = _mm_add_epi16(u[0], u[1]); + in[1] = _mm_sub_epi16(u[0], u[1]); + in[1] = _mm_shuffle_epi32(in[1], 0x4E); } -static void iadst4_1d_sse2(__m128i *in) { +static void iadst4_sse2(__m128i *in) { const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); @@ -232,13 +227,14 @@ static void iadst4_1d_sse2(__m128i *in) { __m128i u[8], v[8], in7; transpose_4x4(in); - in7 = _mm_add_epi16(in[0], in[3]); - in7 = _mm_sub_epi16(in7, in[2]); + in7 = _mm_srli_si128(in[1], 8); + in7 = _mm_add_epi16(in7, in[0]); + in7 = _mm_sub_epi16(in7, in[1]); - u[0] = _mm_unpacklo_epi16(in[0], in[2]); - u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = 
_mm_unpacklo_epi16(in[1], kZero); + u[3] = _mm_unpackhi_epi16(in[0], kZero); v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 @@ -265,39 +261,35 @@ static void iadst4_1d_sse2(__m128i *in) { u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - in[2] = _mm_unpackhi_epi64(in[0], in[0]); - in[3] = _mm_unpackhi_epi64(in[1], in[1]); + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); } void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { - __m128i in[4]; + __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = _mm_loadl_epi64((const __m128i *)input); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 4)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 8)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 12)); + in[0]= _mm_loadu_si128((const __m128i *)(input)); + in[1]= _mm_loadu_si128((const __m128i *)(input + 8)); switch (tx_type) { case 0: // DCT_DCT - idct4_1d_sse2(in); - idct4_1d_sse2(in); + idct4_sse2(in); + idct4_sse2(in); break; case 1: // ADST_DCT - idct4_1d_sse2(in); - iadst4_1d_sse2(in); + idct4_sse2(in); + iadst4_sse2(in); break; case 2: // DCT_ADST - iadst4_1d_sse2(in); - idct4_1d_sse2(in); + iadst4_sse2(in); + idct4_sse2(in); break; case 3: // ADST_ADST - iadst4_1d_sse2(in); - iadst4_1d_sse2(in); + iadst4_sse2(in); + iadst4_sse2(in); break; default: assert(0); @@ -307,18 +299,35 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, // Final round and shift in[0] = _mm_add_epi16(in[0], eight); in[1] = _mm_add_epi16(in[1], eight); - in[2] = _mm_add_epi16(in[2], eight); - in[3] = _mm_add_epi16(in[3], eight); in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); - in[2] = _mm_srai_epi16(in[2], 4); - in[3] = 
_mm_srai_epi16(in[3], 4); - RECON_AND_STORE4X4(dest, in[0]); - RECON_AND_STORE4X4(dest, in[1]); - RECON_AND_STORE4X4(dest, in[2]); - RECON_AND_STORE4X4(dest, in[3]); + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *) (dest + stride))); + d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128( + *(const int *) (dest + stride * 3))); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, in[0]); + d2 = _mm_add_epi16(d2, in[1]); + d0 = _mm_packus_epi16(d0, d2); + // store result[0] + *(int *)dest = _mm_cvtsi128_si32(d0); + // store result[1] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store result[2] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + // store result[3] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + } } #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ @@ -352,37 +361,40 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ } -#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - \ +#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ + out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ + const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ + \ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ const 
__m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ + \ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = out5 = out6 = out7 = zero; \ } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \ +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ { \ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ \ in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \ - in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \ + } + +#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ } // Define Macro for multiplying elements by constants and adding them together. 
@@ -422,7 +434,30 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, res3 = _mm_packs_epi32(tmp6, tmp7); \ } -#define IDCT8_1D \ +#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + } + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ /* Stage1 */ \ { \ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ @@ -482,14 +517,15 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, } \ \ /* Stage4 */ \ - in0 = _mm_adds_epi16(stp1_0, stp2_7); \ - in1 = _mm_adds_epi16(stp1_1, stp1_6); \ - in2 = _mm_adds_epi16(stp1_2, stp1_5); \ - in3 = _mm_adds_epi16(stp1_3, stp2_4); \ - in4 = _mm_subs_epi16(stp1_3, stp2_4); \ - in5 = _mm_subs_epi16(stp1_2, stp1_5); \ - in6 = _mm_subs_epi16(stp1_1, stp1_6); \ - in7 = _mm_subs_epi16(stp1_0, stp2_7); + out0 = _mm_adds_epi16(stp1_0, stp2_7); \ + out1 = _mm_adds_epi16(stp1_1, stp1_6); \ + out2 = _mm_adds_epi16(stp1_2, stp1_5); \ + out3 = _mm_adds_epi16(stp1_3, stp2_4); \ + out4 = _mm_subs_epi16(stp1_3, stp2_4); \ + out5 = _mm_subs_epi16(stp1_2, stp1_5); \ + out6 = _mm_subs_epi16(stp1_1, stp1_6); \ + out7 = _mm_subs_epi16(stp1_0, stp2_7); \ + } #define RECON_AND_STORE(dest, in_x) \ { \ @@ -533,11 +569,12 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { // 2-D for 
(i = 0; i < 2; i++) { // 8x8 Transpose is copied from vp9_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8_1D + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); } // Final rounding and shift @@ -620,7 +657,24 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); } -static void idct8_1d_sse2(__m128i *in) { +static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +} + +static void idct8_sse2(__m128i *in) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); @@ -636,32 +690,16 @@ static void idct8_1d_sse2(__m128i *in) { __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - in0 = in[0]; - in1 = in[1]; - in2 = in[2]; - in3 = in[3]; - in4 = in[4]; - in5 = in[5]; - in6 = in[6]; - in7 = in[7]; - // 8x8 Transpose is copied from vp9_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, 
in5, in6, in7); + TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], + in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8_1D - in[0] = in0; - in[1] = in1; - in[2] = in2; - in[3] = in3; - in[4] = in4; - in[5] = in5; - in[6] = in6; - in[7] = in7; + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); } -static void iadst8_1d_sse2(__m128i *in) { +static void iadst8_sse2(__m128i *in) { const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); @@ -908,20 +946,20 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, switch (tx_type) { case 0: // DCT_DCT - idct8_1d_sse2(in); - idct8_1d_sse2(in); + idct8_sse2(in); + idct8_sse2(in); break; case 1: // ADST_DCT - idct8_1d_sse2(in); - iadst8_1d_sse2(in); + idct8_sse2(in); + iadst8_sse2(in); break; case 2: // DCT_ADST - iadst8_1d_sse2(in); - idct8_1d_sse2(in); + iadst8_sse2(in); + idct8_sse2(in); break; case 3: // ADST_ADST - iadst8_1d_sse2(in); - iadst8_1d_sse2(in); + iadst8_sse2(in); + iadst8_sse2(in); break; default: assert(0); @@ -983,12 +1021,11 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); // 8x4 Transpose - TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) - + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); // Stage1 { //NOLINT - const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); + const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); tmp0 = _mm_madd_epi16(lo_17, stg1_0); tmp2 = _mm_madd_epi16(lo_17, stg1_1); @@ -1004,16 +1041,14 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { tmp4 = _mm_srai_epi32(tmp4, 
DCT_CONST_BITS); tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - stp1_4 = _mm_packs_epi32(tmp0, zero); - stp1_7 = _mm_packs_epi32(tmp2, zero); - stp1_5 = _mm_packs_epi32(tmp4, zero); - stp1_6 = _mm_packs_epi32(tmp6, zero); + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + stp1_5 = _mm_packs_epi32(tmp4, tmp6); } // Stage2 { //NOLINT - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); + const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); tmp0 = _mm_madd_epi16(lo_04, stg2_0); tmp2 = _mm_madd_epi16(lo_04, stg2_1); @@ -1029,24 +1064,26 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - stp2_0 = _mm_packs_epi32(tmp0, zero); - stp2_1 = _mm_packs_epi32(tmp2, zero); - stp2_2 = _mm_packs_epi32(tmp4, zero); - stp2_3 = _mm_packs_epi32(tmp6, zero); + stp2_0 = _mm_packs_epi32(tmp0, tmp2); + stp2_2 = _mm_packs_epi32(tmp6, tmp4); - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); + tmp0 = _mm_adds_epi16(stp1_4, stp1_5); + tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); } // Stage3 { //NOLINT const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); + + tmp4 = _mm_adds_epi16(stp2_0, stp2_2); + tmp6 = _mm_subs_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); + stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); tmp0 = _mm_madd_epi16(lo_56, stg3_0); tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 @@ -1056,27 +1093,19 @@ void vp9_idct8x8_10_add_sse2(const int16_t 
*input, uint8_t *dest, int stride) { tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp0, zero); - stp1_6 = _mm_packs_epi32(tmp2, zero); + stp1_5 = _mm_packs_epi32(tmp0, tmp2); } // Stage4 - in0 = _mm_adds_epi16(stp1_0, stp2_7); - in1 = _mm_adds_epi16(stp1_1, stp1_6); - in2 = _mm_adds_epi16(stp1_2, stp1_5); - in3 = _mm_adds_epi16(stp1_3, stp2_4); - in4 = _mm_subs_epi16(stp1_3, stp2_4); - in5 = _mm_subs_epi16(stp1_2, stp1_5); - in6 = _mm_subs_epi16(stp1_1, stp1_6); - in7 = _mm_subs_epi16(stp1_0, stp2_7); - - // Columns. 4x8 Transpose - TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7) - - // 1D idct8x8 - IDCT8_1D + tmp0 = _mm_adds_epi16(stp1_3, stp2_4); + tmp1 = _mm_adds_epi16(stp1_2, stp1_5); + tmp2 = _mm_subs_epi16(stp1_3, stp2_4); + tmp3 = _mm_subs_epi16(stp1_2, stp1_5); + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + + IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, + in0, in1, in2, in3, in4, in5, in6, in7); // Final rounding and shift in0 = _mm_adds_epi16(in0, final_rounding); in1 = _mm_adds_epi16(in1, final_rounding); @@ -1106,17 +1135,17 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } -#define IDCT16_1D \ +#define IDCT16 \ /* Stage2 */ \ { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ + const __m128i lo_9_7 = 
_mm_unpacklo_epi16(in[9], in[7]); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ \ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ stg2_0, stg2_1, stg2_2, stg2_3, \ @@ -1129,10 +1158,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { \ /* Stage3 */ \ { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ \ MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ stg3_0, stg3_1, stg3_2, stg3_3, \ @@ -1151,10 +1180,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { \ /* Stage4 */ \ { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ \ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ @@ -1235,6 +1264,114 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { stp2_10, 
stp2_13, stp2_11, stp2_12) \ } +#define IDCT16_10 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ + stg2_0, stg2_1, stg2_6, stg2_7, \ + stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ + stg3_0, stg3_1, \ + stp2_4, stp2_7) \ + \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ + stg4_0, stg4_1, \ + stp1_0, stp1_1) \ + stp2_5 = stp2_4; \ + stp2_6 = stp2_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = 
_mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -1266,16 +1403,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, - in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, - in10 
= zero, in11 = zero, in12 = zero, in13 = zero, - in14 = zero, in15 = zero; - __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, - l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, - l12 = zero, l13 = zero, l14 = zero, l15 = zero; - __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero, - r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero, - r12 = zero, r13 = zero, r14 = zero, r15 = zero; + __m128i in[16], l[16], r[16], *curr1; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, stp1_12_0; @@ -1284,162 +1412,132 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; - // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct. - for (i = 0; i < 4; i++) { - // 1-D idct - if (i < 2) { - if (i == 1) input += 128; + curr1 = l; + for (i = 0; i < 2; i++) { + // 1-D idct // Load input data. 
- in0 = _mm_load_si128((const __m128i *)input); - in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); - in4 = _mm_load_si128((const __m128i *)(input + 8 * 8)); - in12 = _mm_load_si128((const __m128i *)(input + 8 * 9)); - in5 = _mm_load_si128((const __m128i *)(input + 8 * 10)); - in13 = _mm_load_si128((const __m128i *)(input + 8 * 11)); - in6 = _mm_load_si128((const __m128i *)(input + 8 * 12)); - in14 = _mm_load_si128((const __m128i *)(input + 8 * 13)); - in7 = _mm_load_si128((const __m128i *)(input + 8 * 14)); - in15 = _mm_load_si128((const __m128i *)(input + 8 * 15)); - - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - } - - if (i == 2) { - TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, - in13, in14, in15); - } - - if (i == 3) { - TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, - in12, in13, in14, in15); - } + in[0] = _mm_load_si128((const __m128i *)input); + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in[11] = 
_mm_load_si128((const __m128i *)(input + 8 * 7)); + in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); + in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); + in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); + in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); + in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); + in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); + in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); + in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); + + array_transpose_8x8(in, in); + array_transpose_8x8(in+8, in+8); + + IDCT16 + + // Stage7 + curr1[0] = _mm_add_epi16(stp2_0, stp1_15); + curr1[1] = _mm_add_epi16(stp2_1, stp1_14); + curr1[2] = _mm_add_epi16(stp2_2, stp2_13); + curr1[3] = _mm_add_epi16(stp2_3, stp2_12); + curr1[4] = _mm_add_epi16(stp2_4, stp2_11); + curr1[5] = _mm_add_epi16(stp2_5, stp2_10); + curr1[6] = _mm_add_epi16(stp2_6, stp1_9); + curr1[7] = _mm_add_epi16(stp2_7, stp1_8); + curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); + curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); + curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); + curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); + curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); + curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); + curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); + curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); + + curr1 = r; + input += 128; + } + for (i = 0; i < 2; i++) { + // 1-D idct + array_transpose_8x8(l+i*8, in); + array_transpose_8x8(r+i*8, in+8); - IDCT16_1D + IDCT16 - // Stage7 - if (i == 0) { - // Left 8x16 - l0 = _mm_add_epi16(stp2_0, stp1_15); - l1 = _mm_add_epi16(stp2_1, stp1_14); - l2 = _mm_add_epi16(stp2_2, stp2_13); - l3 = _mm_add_epi16(stp2_3, stp2_12); - l4 = _mm_add_epi16(stp2_4, stp2_11); - l5 = _mm_add_epi16(stp2_5, stp2_10); - l6 = _mm_add_epi16(stp2_6, stp1_9); - l7 = _mm_add_epi16(stp2_7, stp1_8); - l8 = _mm_sub_epi16(stp2_7, stp1_8); - l9 = _mm_sub_epi16(stp2_6, stp1_9); - l10 = _mm_sub_epi16(stp2_5, stp2_10); - l11 = 
_mm_sub_epi16(stp2_4, stp2_11); - l12 = _mm_sub_epi16(stp2_3, stp2_12); - l13 = _mm_sub_epi16(stp2_2, stp2_13); - l14 = _mm_sub_epi16(stp2_1, stp1_14); - l15 = _mm_sub_epi16(stp2_0, stp1_15); - } else if (i == 1) { - // Right 8x16 - r0 = _mm_add_epi16(stp2_0, stp1_15); - r1 = _mm_add_epi16(stp2_1, stp1_14); - r2 = _mm_add_epi16(stp2_2, stp2_13); - r3 = _mm_add_epi16(stp2_3, stp2_12); - r4 = _mm_add_epi16(stp2_4, stp2_11); - r5 = _mm_add_epi16(stp2_5, stp2_10); - r6 = _mm_add_epi16(stp2_6, stp1_9); - r7 = _mm_add_epi16(stp2_7, stp1_8); - r8 = _mm_sub_epi16(stp2_7, stp1_8); - r9 = _mm_sub_epi16(stp2_6, stp1_9); - r10 = _mm_sub_epi16(stp2_5, stp2_10); - r11 = _mm_sub_epi16(stp2_4, stp2_11); - r12 = _mm_sub_epi16(stp2_3, stp2_12); - r13 = _mm_sub_epi16(stp2_2, stp2_13); - r14 = _mm_sub_epi16(stp2_1, stp1_14); - r15 = _mm_sub_epi16(stp2_0, stp1_15); - } else { // 2-D - in0 = _mm_add_epi16(stp2_0, stp1_15); - in1 = _mm_add_epi16(stp2_1, stp1_14); - in2 = _mm_add_epi16(stp2_2, stp2_13); - in3 = _mm_add_epi16(stp2_3, stp2_12); - in4 = _mm_add_epi16(stp2_4, stp2_11); - in5 = _mm_add_epi16(stp2_5, stp2_10); - in6 = _mm_add_epi16(stp2_6, stp1_9); - in7 = _mm_add_epi16(stp2_7, stp1_8); - in8 = _mm_sub_epi16(stp2_7, stp1_8); - in9 = _mm_sub_epi16(stp2_6, stp1_9); - in10 = _mm_sub_epi16(stp2_5, stp2_10); - in11 = _mm_sub_epi16(stp2_4, stp2_11); - in12 = _mm_sub_epi16(stp2_3, stp2_12); - in13 = _mm_sub_epi16(stp2_2, stp2_13); - in14 = _mm_sub_epi16(stp2_1, stp1_14); - in15 = _mm_sub_epi16(stp2_0, stp1_15); + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = 
_mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); - - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); - - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); + in[0] = 
_mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); dest += 8 - (stride * 16); - } } } @@ -1492,7 +1590,7 @@ static INLINE void 
array_transpose_16x16(__m128i *res0, __m128i *res1) { res0[15] = tbuf[7]; } -static void iadst16_1d_8col(__m128i *in) { +static void iadst16_8col(__m128i *in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -1962,7 +2060,7 @@ static void iadst16_1d_8col(__m128i *in) { in[15] = _mm_sub_epi16(kZero, s[1]); } -static void idct16_1d_8col(__m128i *in) { +static void idct16_8col(__m128i *in) { const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); @@ -2306,16 +2404,16 @@ static void idct16_1d_8col(__m128i *in) { in[15] = _mm_sub_epi16(s[0], s[15]); } -static void idct16_1d_sse2(__m128i *in0, __m128i *in1) { +static void idct16_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); - idct16_1d_8col(in0); - idct16_1d_8col(in1); + idct16_8col(in0); + idct16_8col(in1); } -static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { +static void iadst16_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); - iadst16_1d_8col(in0); - iadst16_1d_8col(in1); + iadst16_8col(in0); + iadst16_8col(in1); } static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { @@ -2404,20 +2502,20 @@ void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, switch (tx_type) { case 0: // DCT_DCT - idct16_1d_sse2(in0, in1); - idct16_1d_sse2(in0, in1); + idct16_sse2(in0, in1); + idct16_sse2(in0, in1); break; case 1: // ADST_DCT - idct16_1d_sse2(in0, in1); - iadst16_1d_sse2(in0, in1); + idct16_sse2(in0, in1); + iadst16_sse2(in0, in1); break; case 2: // DCT_ADST - iadst16_1d_sse2(in0, in1); - idct16_1d_sse2(in0, in1); + iadst16_sse2(in0, in1); + idct16_sse2(in0, in1); break; case 3: // ADST_ADST - iadst16_1d_sse2(in0, in1); - iadst16_1d_sse2(in0, in1); + iadst16_sse2(in0, 
in1); + iadst16_sse2(in0, in1); break; default: assert(0); @@ -2437,149 +2535,87 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, - in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, - in10 = zero, in11 = zero, in12 = zero, in13 = zero, - in14 = zero, in15 = zero; - __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, - l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, - l12 = zero, l13 = zero, l14 = zero, l15 = zero; - - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, 
stp1_5, stp1_6, stp1_7, + __m128i in[16], l[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, stp1_12_0; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; - // 1-D idct. Load input data. - in0 = _mm_load_si128((const __m128i *)input); - in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); + // First 1-D inverse DCT + // Load input data. 
+ in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); - TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); // Stage2 { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); - const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); - const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); - const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); + const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); - tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); - tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); - tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); tmp0 = _mm_add_epi32(tmp0, rounding); tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); tmp5 = _mm_add_epi32(tmp5, rounding); tmp7 = _mm_add_epi32(tmp7, rounding); tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - stp2_8 = _mm_packs_epi32(tmp0, zero); - stp2_15 = _mm_packs_epi32(tmp2, zero); - stp2_9 = _mm_packs_epi32(tmp4, zero); - stp2_14 = _mm_packs_epi32(tmp6, zero); - - stp2_10 = _mm_packs_epi32(tmp1, zero); - stp2_13 = 
_mm_packs_epi32(tmp3, zero); - stp2_11 = _mm_packs_epi32(tmp5, zero); - stp2_12 = _mm_packs_epi32(tmp7, zero); + stp2_8 = _mm_packs_epi32(tmp0, tmp2); + stp2_11 = _mm_packs_epi32(tmp5, tmp7); } // Stage3 { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); - const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); - tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); tmp0 = _mm_add_epi32(tmp0, rounding); tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, zero); - stp1_7 = _mm_packs_epi32(tmp2, zero); - stp1_5 = _mm_packs_epi32(tmp4, zero); - stp1_6 = _mm_packs_epi32(tmp6, zero); - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + stp1_4 = _mm_packs_epi32(tmp0, tmp2); } // Stage4 { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); - const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); tmp0 = 
_mm_madd_epi16(lo_0_8, stg4_0); tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); - tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); @@ -2587,8 +2623,6 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, tmp0 = _mm_add_epi32(tmp0, rounding); tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); tmp1 = _mm_add_epi32(tmp1, rounding); tmp3 = _mm_add_epi32(tmp3, rounding); tmp5 = _mm_add_epi32(tmp5, rounding); @@ -2596,49 +2630,40 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - stp2_0 = _mm_packs_epi32(tmp0, zero); - stp2_1 = _mm_packs_epi32(tmp2, zero); - stp2_2 = _mm_packs_epi32(tmp4, zero); - stp2_3 = _mm_packs_epi32(tmp6, zero); - stp2_9 = _mm_packs_epi32(tmp1, zero); - stp2_14 = _mm_packs_epi32(tmp3, zero); - stp2_10 = _mm_packs_epi32(tmp5, zero); - stp2_13 = _mm_packs_epi32(tmp7, zero); - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + stp1_0 = _mm_packs_epi32(tmp0, tmp0); + stp1_1 = _mm_packs_epi32(tmp2, tmp2); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp5, tmp7); + + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); } // Stage5 and Stage6 { - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - stp1_8 = 
_mm_add_epi16(stp1_8_0, stp1_11); - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); - - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); + tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); + + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); + + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); } // Stage6 { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); @@ -2663,124 +2688,121 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp1, zero); - stp1_6 = _mm_packs_epi32(tmp3, zero); + stp1_6 = _mm_packs_epi32(tmp3, tmp1); + stp2_10 = _mm_packs_epi32(tmp0, zero); stp2_13 = _mm_packs_epi32(tmp2, zero); stp2_11 = _mm_packs_epi32(tmp4, zero); stp2_12 = _mm_packs_epi32(tmp6, zero); - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); + tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = 
_mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); } // Stage7. Left 8x16 only. - l0 = _mm_add_epi16(stp2_0, stp1_15); - l1 = _mm_add_epi16(stp2_1, stp1_14); - l2 = _mm_add_epi16(stp2_2, stp2_13); - l3 = _mm_add_epi16(stp2_3, stp2_12); - l4 = _mm_add_epi16(stp2_4, stp2_11); - l5 = _mm_add_epi16(stp2_5, stp2_10); - l6 = _mm_add_epi16(stp2_6, stp1_9); - l7 = _mm_add_epi16(stp2_7, stp1_8); - l8 = _mm_sub_epi16(stp2_7, stp1_8); - l9 = _mm_sub_epi16(stp2_6, stp1_9); - l10 = _mm_sub_epi16(stp2_5, stp2_10); - l11 = _mm_sub_epi16(stp2_4, stp2_11); - l12 = _mm_sub_epi16(stp2_3, stp2_12); - l13 = _mm_sub_epi16(stp2_2, stp2_13); - l14 = _mm_sub_epi16(stp2_1, stp1_14); - l15 = _mm_sub_epi16(stp2_0, stp1_15); - - // 2-D idct. We do 2 8x16 blocks. 
+ l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); + + // Second 1-D inverse transform, performed per 8x16 block for (i = 0; i < 2; i++) { - if (i == 0) - TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, - in5, in6, in7); - - if (i == 1) - TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, - in4, in5, in6, in7); - - in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; + array_transpose_4X8(l + 8*i, in); - IDCT16_1D + IDCT16_10 // Stage7 - in0 = _mm_add_epi16(stp2_0, stp1_15); - in1 = _mm_add_epi16(stp2_1, stp1_14); - in2 = _mm_add_epi16(stp2_2, stp2_13); - in3 = _mm_add_epi16(stp2_3, stp2_12); - in4 = _mm_add_epi16(stp2_4, stp2_11); - in5 = _mm_add_epi16(stp2_5, stp2_10); - in6 = _mm_add_epi16(stp2_6, stp1_9); - in7 = _mm_add_epi16(stp2_7, stp1_8); - in8 = _mm_sub_epi16(stp2_7, stp1_8); - in9 = _mm_sub_epi16(stp2_6, stp1_9); - in10 = _mm_sub_epi16(stp2_5, stp2_10); - in11 = _mm_sub_epi16(stp2_4, stp2_11); - in12 = _mm_sub_epi16(stp2_3, stp2_12); - in13 = _mm_sub_epi16(stp2_2, stp2_13); - in14 = _mm_sub_epi16(stp2_1, stp1_14); - in15 = _mm_sub_epi16(stp2_0, stp1_15); + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] 
= _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); - - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); - - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, 
in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); 
+ RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); dest += 8 - (stride * 16); } @@ -2792,28 +2814,329 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, input += 8; \ } \ -#define IDCT32_1D \ +#define IDCT32_34 \ +/* Stage1 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ + \ + const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ + \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ + stg1_1, stp1_16, stp1_31); \ + MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ + stg1_7, stp1_19, stp1_28); \ + MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ + stg1_9, stp1_20, stp1_27); \ + MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ + stg1_15, stp1_23, stp1_24); \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ + \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ + stg2_1, stp2_8, stp2_15); \ + MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ + stg2_7, stp2_11, stp2_12); \ + \ + stp2_16 = stp1_16; \ + stp2_19 = stp1_19; \ + \ + stp2_20 = stp1_20; \ + stp2_23 = stp1_23; \ + \ + stp2_24 = stp1_24; \ + stp2_27 = stp1_27; \ + \ + stp2_28 = stp1_28; \ + stp2_31 = stp1_31; \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_4_28 = 
_mm_unpacklo_epi16(in[4], zero); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ + \ + MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ + stg3_1, stp1_4, stp1_7); \ + \ + stp1_8 = stp2_8; \ + stp1_11 = stp2_11; \ + stp1_12 = stp2_12; \ + stp1_15 = stp2_15; \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ + stg4_1, stp2_0, stp2_1); \ + \ + stp2_4 = stp1_4; \ + stp2_5 = stp1_4; \ + stp2_6 = stp1_7; \ + stp2_7 = stp1_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, 
hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = stp2_0; \ + stp1_1 = stp2_1; \ + stp1_2 = stp2_1; \ + stp1_3 = stp2_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 
= _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 
= stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = 
_mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + + +#define IDCT32 \ /* Stage1 */ \ { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ - const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ + \ + const __m128i lo_9_23 = 
_mm_unpacklo_epi16(in[9], in[23]); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ \ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ @@ -2831,15 +3154,15 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, \ /* Stage2 */ \ { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ \ 
MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ @@ -2871,10 +3194,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, \ /* Stage3 */ \ { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ \ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ @@ -2918,10 +3241,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, \ /* Stage4 */ \ { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ \ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ @@ -3178,10 +3501,7 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, - in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, - in24, in25, in26, in27, in28, in29, in30, in31; - __m128i col[128]; + __m128i in[32], col[32]; __m128i 
stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, @@ -3193,296 +3513,225 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; - - // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. - for (i = 0; i < 8; i++) { - i32 = (i << 5); - if (i == 0) { - // First 1-D idct: first 8 rows - // Load input data. - LOAD_DQCOEFF(in0, input); - LOAD_DQCOEFF(in8, input); - LOAD_DQCOEFF(in16, input); - LOAD_DQCOEFF(in24, input); - LOAD_DQCOEFF(in1, input); - LOAD_DQCOEFF(in9, input); - LOAD_DQCOEFF(in17, input); - LOAD_DQCOEFF(in25, input); - LOAD_DQCOEFF(in2, input); - LOAD_DQCOEFF(in10, input); - LOAD_DQCOEFF(in18, input); - LOAD_DQCOEFF(in26, input); - LOAD_DQCOEFF(in3, input); - LOAD_DQCOEFF(in11, input); - LOAD_DQCOEFF(in19, input); - LOAD_DQCOEFF(in27, input); - - LOAD_DQCOEFF(in4, input); - LOAD_DQCOEFF(in12, input); - LOAD_DQCOEFF(in20, input); - LOAD_DQCOEFF(in28, input); - LOAD_DQCOEFF(in5, input); - LOAD_DQCOEFF(in13, input); - LOAD_DQCOEFF(in21, input); - LOAD_DQCOEFF(in29, input); - LOAD_DQCOEFF(in6, input); - LOAD_DQCOEFF(in14, input); - LOAD_DQCOEFF(in22, input); - LOAD_DQCOEFF(in30, input); - LOAD_DQCOEFF(in7, input); - LOAD_DQCOEFF(in15, input); - LOAD_DQCOEFF(in23, input); - LOAD_DQCOEFF(in31, input); - - // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, - in18, in19, in20, in21, in22, in23); - TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, 
in25, - in26, in27, in28, in29, in30, in31); - } else if (i < 4) { - // First 1-D idct: next 24 zero-coeff rows - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } else { - // Second 1-D idct - j = i - 4; - - // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, - in5, in6, in7); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, - in11, in12, in13, in14, in15); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 
+ 7], in16, in17, in18, - in19, in20, in21, in22, in23); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, - in28, in29, in30, in31); - } - - IDCT32_1D + int i; + // Load input data. + LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); + + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); - // final stage - if (i < 4) { - // 1_D: Store 32 intermediate results for each 8x32 block. 
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } else { + array_transpose_8x8(in, in); + array_transpose_8x8(in+8, in+8); + array_transpose_8x8(in+16, in+16); + array_transpose_8x8(in+24, in+24); + + IDCT32 + + // 1_D: Store 32 intermediate results for each 8x32 block. 
+ col[0] = _mm_add_epi16(stp1_0, stp1_31); + col[1] = _mm_add_epi16(stp1_1, stp1_30); + col[2] = _mm_add_epi16(stp1_2, stp1_29); + col[3] = _mm_add_epi16(stp1_3, stp1_28); + col[4] = _mm_add_epi16(stp1_4, stp1_27); + col[5] = _mm_add_epi16(stp1_5, stp1_26); + col[6] = _mm_add_epi16(stp1_6, stp1_25); + col[7] = _mm_add_epi16(stp1_7, stp1_24); + col[8] = _mm_add_epi16(stp1_8, stp1_23); + col[9] = _mm_add_epi16(stp1_9, stp1_22); + col[10] = _mm_add_epi16(stp1_10, stp1_21); + col[11] = _mm_add_epi16(stp1_11, stp1_20); + col[12] = _mm_add_epi16(stp1_12, stp1_19); + col[13] = _mm_add_epi16(stp1_13, stp1_18); + col[14] = _mm_add_epi16(stp1_14, stp1_17); + col[15] = _mm_add_epi16(stp1_15, stp1_16); + col[16] = _mm_sub_epi16(stp1_15, stp1_16); + col[17] = _mm_sub_epi16(stp1_14, stp1_17); + col[18] = _mm_sub_epi16(stp1_13, stp1_18); + col[19] = _mm_sub_epi16(stp1_12, stp1_19); + col[20] = _mm_sub_epi16(stp1_11, stp1_20); + col[21] = _mm_sub_epi16(stp1_10, stp1_21); + col[22] = _mm_sub_epi16(stp1_9, stp1_22); + col[23] = _mm_sub_epi16(stp1_8, stp1_23); + col[24] = _mm_sub_epi16(stp1_7, stp1_24); + col[25] = _mm_sub_epi16(stp1_6, stp1_25); + col[26] = _mm_sub_epi16(stp1_5, stp1_26); + col[27] = _mm_sub_epi16(stp1_4, stp1_27); + col[28] = _mm_sub_epi16(stp1_3, stp1_28); + col[29] = _mm_sub_epi16(stp1_2, stp1_29); + col[30] = _mm_sub_epi16(stp1_1, stp1_30); + col[31] = _mm_sub_epi16(stp1_0, stp1_31); + for (i = 0; i < 4; i++) { const __m128i zero = _mm_setzero_si128(); + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col+i*8, in); + IDCT32_34 // 2_D: Calculate the results and store them to destination. 
- in0 = _mm_add_epi16(stp1_0, stp1_31); - in1 = _mm_add_epi16(stp1_1, stp1_30); - in2 = _mm_add_epi16(stp1_2, stp1_29); - in3 = _mm_add_epi16(stp1_3, stp1_28); - in4 = _mm_add_epi16(stp1_4, stp1_27); - in5 = _mm_add_epi16(stp1_5, stp1_26); - in6 = _mm_add_epi16(stp1_6, stp1_25); - in7 = _mm_add_epi16(stp1_7, stp1_24); - in8 = _mm_add_epi16(stp1_8, stp1_23); - in9 = _mm_add_epi16(stp1_9, stp1_22); - in10 = _mm_add_epi16(stp1_10, stp1_21); - in11 = _mm_add_epi16(stp1_11, stp1_20); - in12 = _mm_add_epi16(stp1_12, stp1_19); - in13 = _mm_add_epi16(stp1_13, stp1_18); - in14 = _mm_add_epi16(stp1_14, stp1_17); - in15 = _mm_add_epi16(stp1_15, stp1_16); - in16 = _mm_sub_epi16(stp1_15, stp1_16); - in17 = _mm_sub_epi16(stp1_14, stp1_17); - in18 = _mm_sub_epi16(stp1_13, stp1_18); - in19 = _mm_sub_epi16(stp1_12, stp1_19); - in20 = _mm_sub_epi16(stp1_11, stp1_20); - in21 = _mm_sub_epi16(stp1_10, stp1_21); - in22 = _mm_sub_epi16(stp1_9, stp1_22); - in23 = _mm_sub_epi16(stp1_8, stp1_23); - in24 = _mm_sub_epi16(stp1_7, stp1_24); - in25 = _mm_sub_epi16(stp1_6, stp1_25); - in26 = _mm_sub_epi16(stp1_5, stp1_26); - in27 = _mm_sub_epi16(stp1_4, stp1_27); - in28 = _mm_sub_epi16(stp1_3, stp1_28); - in29 = _mm_sub_epi16(stp1_2, stp1_29); - in30 = _mm_sub_epi16(stp1_1, stp1_30); - in31 = _mm_sub_epi16(stp1_0, stp1_31); + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + 
in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); - in16 = _mm_adds_epi16(in16, final_rounding); - in17 = _mm_adds_epi16(in17, final_rounding); - in18 = _mm_adds_epi16(in18, final_rounding); - in19 = _mm_adds_epi16(in19, final_rounding); - in20 = _mm_adds_epi16(in20, final_rounding); - in21 = _mm_adds_epi16(in21, final_rounding); - in22 = _mm_adds_epi16(in22, final_rounding); - in23 = _mm_adds_epi16(in23, final_rounding); - in24 = _mm_adds_epi16(in24, final_rounding); - in25 = _mm_adds_epi16(in25, final_rounding); - in26 = _mm_adds_epi16(in26, final_rounding); - in27 = _mm_adds_epi16(in27, 
final_rounding); - in28 = _mm_adds_epi16(in28, final_rounding); - in29 = _mm_adds_epi16(in29, final_rounding); - in30 = _mm_adds_epi16(in30, final_rounding); - in31 = _mm_adds_epi16(in31, final_rounding); - - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); - in16 = _mm_srai_epi16(in16, 6); - in17 = _mm_srai_epi16(in17, 6); - in18 = _mm_srai_epi16(in18, 6); - in19 = _mm_srai_epi16(in19, 6); - in20 = _mm_srai_epi16(in20, 6); - in21 = _mm_srai_epi16(in21, 6); - in22 = _mm_srai_epi16(in22, 6); - in23 = _mm_srai_epi16(in23, 6); - in24 = _mm_srai_epi16(in24, 6); - in25 = _mm_srai_epi16(in25, 6); - in26 = _mm_srai_epi16(in26, 6); - in27 = _mm_srai_epi16(in27, 6); - in28 = _mm_srai_epi16(in28, 6); - in29 = _mm_srai_epi16(in29, 6); - in30 = _mm_srai_epi16(in30, 6); - in31 = _mm_srai_epi16(in31, 6); - - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); - RECON_AND_STORE(dest, in16); - RECON_AND_STORE(dest, in17); - RECON_AND_STORE(dest, in18); - RECON_AND_STORE(dest, in19); - RECON_AND_STORE(dest, in20); - RECON_AND_STORE(dest, in21); - RECON_AND_STORE(dest, in22); - 
RECON_AND_STORE(dest, in23); - RECON_AND_STORE(dest, in24); - RECON_AND_STORE(dest, in25); - RECON_AND_STORE(dest, in26); - RECON_AND_STORE(dest, in27); - RECON_AND_STORE(dest, in28); - RECON_AND_STORE(dest, in29); - RECON_AND_STORE(dest, in30); - RECON_AND_STORE(dest, in31); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + in[16] = _mm_adds_epi16(in[16], final_rounding); + in[17] = _mm_adds_epi16(in[17], final_rounding); + in[18] = _mm_adds_epi16(in[18], final_rounding); + in[19] = _mm_adds_epi16(in[19], final_rounding); + in[20] = _mm_adds_epi16(in[20], final_rounding); + in[21] = _mm_adds_epi16(in[21], final_rounding); + in[22] = _mm_adds_epi16(in[22], final_rounding); + in[23] = _mm_adds_epi16(in[23], final_rounding); + in[24] = _mm_adds_epi16(in[24], final_rounding); + in[25] = _mm_adds_epi16(in[25], final_rounding); + in[26] = _mm_adds_epi16(in[26], final_rounding); + in[27] = _mm_adds_epi16(in[27], final_rounding); + in[28] = _mm_adds_epi16(in[28], final_rounding); + in[29] = _mm_adds_epi16(in[29], final_rounding); + in[30] = _mm_adds_epi16(in[30], final_rounding); + in[31] = _mm_adds_epi16(in[31], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); 
+ in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + in[16] = _mm_srai_epi16(in[16], 6); + in[17] = _mm_srai_epi16(in[17], 6); + in[18] = _mm_srai_epi16(in[18], 6); + in[19] = _mm_srai_epi16(in[19], 6); + in[20] = _mm_srai_epi16(in[20], 6); + in[21] = _mm_srai_epi16(in[21], 6); + in[22] = _mm_srai_epi16(in[22], 6); + in[23] = _mm_srai_epi16(in[23], 6); + in[24] = _mm_srai_epi16(in[24], 6); + in[25] = _mm_srai_epi16(in[25], 6); + in[26] = _mm_srai_epi16(in[26], 6); + in[27] = _mm_srai_epi16(in[27], 6); + in[28] = _mm_srai_epi16(in[28], 6); + in[29] = _mm_srai_epi16(in[29], 6); + in[30] = _mm_srai_epi16(in[30], 6); + in[31] = _mm_srai_epi16(in[31], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); + RECON_AND_STORE(dest, in[16]); + RECON_AND_STORE(dest, in[17]); + RECON_AND_STORE(dest, in[18]); + RECON_AND_STORE(dest, in[19]); + RECON_AND_STORE(dest, in[20]); + RECON_AND_STORE(dest, in[21]); + RECON_AND_STORE(dest, in[22]); + RECON_AND_STORE(dest, in[23]); + RECON_AND_STORE(dest, in[24]); + RECON_AND_STORE(dest, in[25]); + RECON_AND_STORE(dest, in[26]); + RECON_AND_STORE(dest, 
in[27]); + RECON_AND_STORE(dest, in[28]); + RECON_AND_STORE(dest, in[29]); + RECON_AND_STORE(dest, in[30]); + RECON_AND_STORE(dest, in[31]); dest += 8 - (stride * 32); } } -} void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int stride) { @@ -3537,10 +3786,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, - in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, - in24, in25, in26, in27, in28, in29, in30, in31; - __m128i col[128]; + __m128i in[32], col[128], zero_idx[16]; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, @@ -3553,66 +3799,63 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i, j, i32; - __m128i zero_idx[16]; int zero_flag[2]; - // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. - for (i = 0; i < 8; i++) { + for (i = 0; i < 4; i++) { i32 = (i << 5); - if (i < 4) { // First 1-D idct // Load input data. 
- LOAD_DQCOEFF(in0, input); - LOAD_DQCOEFF(in8, input); - LOAD_DQCOEFF(in16, input); - LOAD_DQCOEFF(in24, input); - LOAD_DQCOEFF(in1, input); - LOAD_DQCOEFF(in9, input); - LOAD_DQCOEFF(in17, input); - LOAD_DQCOEFF(in25, input); - LOAD_DQCOEFF(in2, input); - LOAD_DQCOEFF(in10, input); - LOAD_DQCOEFF(in18, input); - LOAD_DQCOEFF(in26, input); - LOAD_DQCOEFF(in3, input); - LOAD_DQCOEFF(in11, input); - LOAD_DQCOEFF(in19, input); - LOAD_DQCOEFF(in27, input); - - LOAD_DQCOEFF(in4, input); - LOAD_DQCOEFF(in12, input); - LOAD_DQCOEFF(in20, input); - LOAD_DQCOEFF(in28, input); - LOAD_DQCOEFF(in5, input); - LOAD_DQCOEFF(in13, input); - LOAD_DQCOEFF(in21, input); - LOAD_DQCOEFF(in29, input); - LOAD_DQCOEFF(in6, input); - LOAD_DQCOEFF(in14, input); - LOAD_DQCOEFF(in22, input); - LOAD_DQCOEFF(in30, input); - LOAD_DQCOEFF(in7, input); - LOAD_DQCOEFF(in15, input); - LOAD_DQCOEFF(in23, input); - LOAD_DQCOEFF(in31, input); + LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); + + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in0, in1); - zero_idx[1] = 
_mm_or_si128(in2, in3); - zero_idx[2] = _mm_or_si128(in4, in5); - zero_idx[3] = _mm_or_si128(in6, in7); - zero_idx[4] = _mm_or_si128(in8, in9); - zero_idx[5] = _mm_or_si128(in10, in11); - zero_idx[6] = _mm_or_si128(in12, in13); - zero_idx[7] = _mm_or_si128(in14, in15); - zero_idx[8] = _mm_or_si128(in16, in17); - zero_idx[9] = _mm_or_si128(in18, in19); - zero_idx[10] = _mm_or_si128(in20, in21); - zero_idx[11] = _mm_or_si128(in22, in23); - zero_idx[12] = _mm_or_si128(in24, in25); - zero_idx[13] = _mm_or_si128(in26, in27); - zero_idx[14] = _mm_or_si128(in28, in29); - zero_idx[15] = _mm_or_si128(in30, in31); + zero_idx[0] = _mm_or_si128(in[0], in[1]); + zero_idx[1] = _mm_or_si128(in[2], in[3]); + zero_idx[2] = _mm_or_si128(in[4], in[5]); + zero_idx[3] = _mm_or_si128(in[6], in[7]); + zero_idx[4] = _mm_or_si128(in[8], in[9]); + zero_idx[5] = _mm_or_si128(in[10], in[11]); + zero_idx[6] = _mm_or_si128(in[12], in[13]); + zero_idx[7] = _mm_or_si128(in[14], in[15]); + zero_idx[8] = _mm_or_si128(in[16], in[17]); + zero_idx[9] = _mm_or_si128(in[18], in[19]); + zero_idx[10] = _mm_or_si128(in[20], in[21]); + zero_idx[11] = _mm_or_si128(in[22], in[23]); + zero_idx[12] = _mm_or_si128(in[24], in[25]); + zero_idx[13] = _mm_or_si128(in[26], in[27]); + zero_idx[14] = _mm_or_si128(in[28], in[29]); + zero_idx[15] = _mm_or_si128(in[30], in[31]); zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); @@ -3674,44 +3917,13 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, } // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, - in18, in19, in20, in21, in22, in23); - TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, - in26, 
in27, in28, in29, in30, in31); - } else { - // Second 1-D idct - j = i - 4; - - // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, - in5, in6, in7); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, - in11, in12, in13, in14, in15); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, - in19, in20, in21, in22, in23); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, - in28, in29, in30, in31); - } + array_transpose_8x8(in, in); + array_transpose_8x8(in+8, in+8); + array_transpose_8x8(in+16, in+16); + array_transpose_8x8(in+24, in+24); - IDCT32_1D + IDCT32 - // final stage - if (i < 4) { // 1_D: Store 32 intermediate results for each 8x32 block. col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); @@ -3745,146 +3957,156 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } else { + } + for (i = 0; i < 4; i++) { const __m128i zero = _mm_setzero_si128(); + // Second 1-D idct + j = i << 3; + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col+j, in); + array_transpose_8x8(col+j+32, in+8); + array_transpose_8x8(col+j+64, in+16); + array_transpose_8x8(col+j+96, in+24); + + IDCT32 // 2_D: Calculate the results and store them to destination. 
- in0 = _mm_add_epi16(stp1_0, stp1_31); - in1 = _mm_add_epi16(stp1_1, stp1_30); - in2 = _mm_add_epi16(stp1_2, stp1_29); - in3 = _mm_add_epi16(stp1_3, stp1_28); - in4 = _mm_add_epi16(stp1_4, stp1_27); - in5 = _mm_add_epi16(stp1_5, stp1_26); - in6 = _mm_add_epi16(stp1_6, stp1_25); - in7 = _mm_add_epi16(stp1_7, stp1_24); - in8 = _mm_add_epi16(stp1_8, stp1_23); - in9 = _mm_add_epi16(stp1_9, stp1_22); - in10 = _mm_add_epi16(stp1_10, stp1_21); - in11 = _mm_add_epi16(stp1_11, stp1_20); - in12 = _mm_add_epi16(stp1_12, stp1_19); - in13 = _mm_add_epi16(stp1_13, stp1_18); - in14 = _mm_add_epi16(stp1_14, stp1_17); - in15 = _mm_add_epi16(stp1_15, stp1_16); - in16 = _mm_sub_epi16(stp1_15, stp1_16); - in17 = _mm_sub_epi16(stp1_14, stp1_17); - in18 = _mm_sub_epi16(stp1_13, stp1_18); - in19 = _mm_sub_epi16(stp1_12, stp1_19); - in20 = _mm_sub_epi16(stp1_11, stp1_20); - in21 = _mm_sub_epi16(stp1_10, stp1_21); - in22 = _mm_sub_epi16(stp1_9, stp1_22); - in23 = _mm_sub_epi16(stp1_8, stp1_23); - in24 = _mm_sub_epi16(stp1_7, stp1_24); - in25 = _mm_sub_epi16(stp1_6, stp1_25); - in26 = _mm_sub_epi16(stp1_5, stp1_26); - in27 = _mm_sub_epi16(stp1_4, stp1_27); - in28 = _mm_sub_epi16(stp1_3, stp1_28); - in29 = _mm_sub_epi16(stp1_2, stp1_29); - in30 = _mm_sub_epi16(stp1_1, stp1_30); - in31 = _mm_sub_epi16(stp1_0, stp1_31); + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + 
in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); - in16 = _mm_adds_epi16(in16, final_rounding); - in17 = _mm_adds_epi16(in17, final_rounding); - in18 = _mm_adds_epi16(in18, final_rounding); - in19 = _mm_adds_epi16(in19, final_rounding); - in20 = _mm_adds_epi16(in20, final_rounding); - in21 = _mm_adds_epi16(in21, final_rounding); - in22 = _mm_adds_epi16(in22, final_rounding); - in23 = _mm_adds_epi16(in23, final_rounding); - in24 = _mm_adds_epi16(in24, final_rounding); - in25 = _mm_adds_epi16(in25, final_rounding); - in26 = _mm_adds_epi16(in26, final_rounding); - in27 = _mm_adds_epi16(in27, 
final_rounding); - in28 = _mm_adds_epi16(in28, final_rounding); - in29 = _mm_adds_epi16(in29, final_rounding); - in30 = _mm_adds_epi16(in30, final_rounding); - in31 = _mm_adds_epi16(in31, final_rounding); - - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); - in16 = _mm_srai_epi16(in16, 6); - in17 = _mm_srai_epi16(in17, 6); - in18 = _mm_srai_epi16(in18, 6); - in19 = _mm_srai_epi16(in19, 6); - in20 = _mm_srai_epi16(in20, 6); - in21 = _mm_srai_epi16(in21, 6); - in22 = _mm_srai_epi16(in22, 6); - in23 = _mm_srai_epi16(in23, 6); - in24 = _mm_srai_epi16(in24, 6); - in25 = _mm_srai_epi16(in25, 6); - in26 = _mm_srai_epi16(in26, 6); - in27 = _mm_srai_epi16(in27, 6); - in28 = _mm_srai_epi16(in28, 6); - in29 = _mm_srai_epi16(in29, 6); - in30 = _mm_srai_epi16(in30, 6); - in31 = _mm_srai_epi16(in31, 6); - - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); - RECON_AND_STORE(dest, in16); - RECON_AND_STORE(dest, in17); - RECON_AND_STORE(dest, in18); - RECON_AND_STORE(dest, in19); - RECON_AND_STORE(dest, in20); - RECON_AND_STORE(dest, in21); - RECON_AND_STORE(dest, in22); - 
RECON_AND_STORE(dest, in23); - RECON_AND_STORE(dest, in24); - RECON_AND_STORE(dest, in25); - RECON_AND_STORE(dest, in26); - RECON_AND_STORE(dest, in27); - RECON_AND_STORE(dest, in28); - RECON_AND_STORE(dest, in29); - RECON_AND_STORE(dest, in30); - RECON_AND_STORE(dest, in31); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + in[16] = _mm_adds_epi16(in[16], final_rounding); + in[17] = _mm_adds_epi16(in[17], final_rounding); + in[18] = _mm_adds_epi16(in[18], final_rounding); + in[19] = _mm_adds_epi16(in[19], final_rounding); + in[20] = _mm_adds_epi16(in[20], final_rounding); + in[21] = _mm_adds_epi16(in[21], final_rounding); + in[22] = _mm_adds_epi16(in[22], final_rounding); + in[23] = _mm_adds_epi16(in[23], final_rounding); + in[24] = _mm_adds_epi16(in[24], final_rounding); + in[25] = _mm_adds_epi16(in[25], final_rounding); + in[26] = _mm_adds_epi16(in[26], final_rounding); + in[27] = _mm_adds_epi16(in[27], final_rounding); + in[28] = _mm_adds_epi16(in[28], final_rounding); + in[29] = _mm_adds_epi16(in[29], final_rounding); + in[30] = _mm_adds_epi16(in[30], final_rounding); + in[31] = _mm_adds_epi16(in[31], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); 
+ in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + in[16] = _mm_srai_epi16(in[16], 6); + in[17] = _mm_srai_epi16(in[17], 6); + in[18] = _mm_srai_epi16(in[18], 6); + in[19] = _mm_srai_epi16(in[19], 6); + in[20] = _mm_srai_epi16(in[20], 6); + in[21] = _mm_srai_epi16(in[21], 6); + in[22] = _mm_srai_epi16(in[22], 6); + in[23] = _mm_srai_epi16(in[23], 6); + in[24] = _mm_srai_epi16(in[24], 6); + in[25] = _mm_srai_epi16(in[25], 6); + in[26] = _mm_srai_epi16(in[26], 6); + in[27] = _mm_srai_epi16(in[27], 6); + in[28] = _mm_srai_epi16(in[28], 6); + in[29] = _mm_srai_epi16(in[29], 6); + in[30] = _mm_srai_epi16(in[30], 6); + in[31] = _mm_srai_epi16(in[31], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); + RECON_AND_STORE(dest, in[16]); + RECON_AND_STORE(dest, in[17]); + RECON_AND_STORE(dest, in[18]); + RECON_AND_STORE(dest, in[19]); + RECON_AND_STORE(dest, in[20]); + RECON_AND_STORE(dest, in[21]); + RECON_AND_STORE(dest, in[22]); + RECON_AND_STORE(dest, in[23]); + RECON_AND_STORE(dest, in[24]); + RECON_AND_STORE(dest, in[25]); + RECON_AND_STORE(dest, in[26]); + RECON_AND_STORE(dest, 
in[27]); + RECON_AND_STORE(dest, in[28]); + RECON_AND_STORE(dest, in[29]); + RECON_AND_STORE(dest, in[30]); + RECON_AND_STORE(dest, in[31]); dest += 8 - (stride * 32); } - } } //NOLINT void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c index 3c5cb8f..439c028 100644 --- a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c +++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c @@ -933,7 +933,7 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, } } -void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p, +void vp9_lpf_horizontal_16_avx2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh, int count) { if (count == 1) diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index fa4dd9b..448ad5a 100644 --- a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include <emmintrin.h> /* SSE2 */ +#include <emmintrin.h> // SSE2 #include "vp9/common/vp9_loopfilter.h" #include "vpx_ports/emmintrin_compat.h" @@ -17,20 +17,14 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - const unsigned int extended_thresh = _thresh[0] * 0x01010101u; - const unsigned int extended_limit = _limit[0] * 0x01010101u; - const unsigned int extended_blimit = _blimit[0] * 0x01010101u; - const __m128i thresh = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); - const __m128i limit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); - const __m128i blimit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), @@ -105,7 +99,7 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); @@ -116,11 +110,11 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, filter2 = _mm_unpacklo_epi8(zero, filter2); filter2 = _mm_srai_epi16(filter2, 0xB); - /* Filter1 >> 3 */ + // Filter1 >> 3 filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); 
- /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), @@ -375,32 +369,25 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]); - DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]); - - DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16); - DECLARE_ALIGNED(16, unsigned char, ap[8][16]); - DECLARE_ALIGNED(16, unsigned char, aq[8][16]); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16); - __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; __m128i q5, q6, q7; int i = 0; - const unsigned int extended_thresh = _thresh[0] * 0x01010101u; - const unsigned int extended_limit = _limit[0] * 0x01010101u; - const unsigned int extended_blimit = _blimit[0] * 0x01010101u; - const __m128i thresh = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); - const __m128i limit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); - const __m128i blimit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); p3 = 
_mm_loadu_si128((__m128i *)(s - 4 * p)); @@ -413,16 +400,16 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - _mm_store_si128((__m128i *)ap[4], p4); - _mm_store_si128((__m128i *)ap[3], p3); - _mm_store_si128((__m128i *)ap[2], p2); - _mm_store_si128((__m128i *)ap[1], p1); - _mm_store_si128((__m128i *)ap[0], p0); - _mm_store_si128((__m128i *)aq[4], q4); - _mm_store_si128((__m128i *)aq[3], q3); - _mm_store_si128((__m128i *)aq[2], q2); - _mm_store_si128((__m128i *)aq[1], q1); - _mm_store_si128((__m128i *)aq[0], q0); + _mm_store_si128((__m128i *)&ap[4 * 16], p4); + _mm_store_si128((__m128i *)&ap[3 * 16], p3); + _mm_store_si128((__m128i *)&ap[2 * 16], p2); + _mm_store_si128((__m128i *)&ap[1 * 16], p1); + _mm_store_si128((__m128i *)&ap[0 * 16], p0); + _mm_store_si128((__m128i *)&aq[4 * 16], q4); + _mm_store_si128((__m128i *)&aq[3 * 16], q3); + _mm_store_si128((__m128i *)&aq[2 * 16], q2); + _mm_store_si128((__m128i *)&aq[1 * 16], q1); + _mm_store_si128((__m128i *)&aq[0 * 16], q0); { @@ -486,13 +473,13 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); - /* Filter1 >> 3 */ + // Filter1 >> 3 work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); @@ -500,7 +487,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, filter1 = _mm_or_si128(filter1, work_a); qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - /* Filter2 >> 3 */ + // Filter2 >> 3 work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); @@ -508,7 +495,7 @@ static 
void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, filter2 = _mm_or_si128(filter2, work_a); ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); @@ -546,8 +533,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, _mm_subs_epu8(p0, p5)), _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - _mm_store_si128((__m128i *)ap[5], p5); - _mm_store_si128((__m128i *)aq[5], q5); + _mm_store_si128((__m128i *)&ap[5 * 16], p5); + _mm_store_si128((__m128i *)&aq[5 * 16], q5); flat2 = _mm_max_epu8(work, flat2); p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); @@ -555,8 +542,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, _mm_subs_epu8(p0, p6)), _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - _mm_store_si128((__m128i *)ap[6], p6); - _mm_store_si128((__m128i *)aq[6], q6); + _mm_store_si128((__m128i *)&ap[6 * 16], p6); + _mm_store_si128((__m128i *)&aq[6 * 16], q6); flat2 = _mm_max_epu8(work, flat2); p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); @@ -565,8 +552,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, _mm_subs_epu8(p0, p7)), _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - _mm_store_si128((__m128i *)ap[7], p7); - _mm_store_si128((__m128i *)aq[7], q7); + _mm_store_si128((__m128i *)&ap[7 * 16], p7); + _mm_store_si128((__m128i *)&aq[7 * 16], q7); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); @@ -586,22 +573,38 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, __m128i a, b, c; unsigned int off = i * 8; - p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero); - p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero); - p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero); - p4 = 
_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero); - q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero); - q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero); - q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero); + p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)), + zero); + p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)), + zero); + p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)), + zero); + p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)), + zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)), + zero); + q4 = 
_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)), + zero); + q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)), + zero); + q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)), + zero); + q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)), + zero); c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); @@ -610,117 +613,117 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); - _mm_storel_epi64((__m128i *)&flat_op[2][i*8], + _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q1, a); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); - _mm_storel_epi64((__m128i *)&flat_op[1][i*8], + _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q2, a); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); - _mm_storel_epi64((__m128i *)&flat_op[0][i*8], + _mm_storel_epi64((__m128i *)&flat_op[i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], + 
_mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q3, a); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); - _mm_storel_epi64((__m128i *)&flat_oq[0][i*8], + _mm_storel_epi64((__m128i *)&flat_oq[i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); b = _mm_add_epi16(q3, b); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); - _mm_storel_epi64((__m128i *)&flat_oq[1][i*8], + _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(q4, c); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); b = _mm_add_epi16(q3, b); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); - _mm_storel_epi64((__m128i *)&flat_oq[2][i*8], + _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); a = _mm_add_epi16(q5, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q6, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = 
_mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8], 
_mm_packus_epi16(workp_shft, workp_shft)); temp_flat2 = _mm_srli_si128(temp_flat2, 8); @@ -730,51 +733,51 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - work_a = _mm_load_si128((__m128i *)ap[2]); - p2 = _mm_load_si128((__m128i *)flat_op[2]); + work_a = _mm_load_si128((__m128i *)&ap[2 * 16]); + p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)flat_op[2], p2); + _mm_store_si128((__m128i *)&flat_op[2 * 16], p2); - p1 = _mm_load_si128((__m128i *)flat_op[1]); + p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]); work_a = _mm_andnot_si128(flat, ps1); p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - _mm_store_si128((__m128i *)flat_op[1], p1); + _mm_store_si128((__m128i *)&flat_op[1 * 16], p1); - p0 = _mm_load_si128((__m128i *)flat_op[0]); + p0 = _mm_load_si128((__m128i *)&flat_op[0]); work_a = _mm_andnot_si128(flat, ps0); p0 = _mm_and_si128(flat, p0); p0 = _mm_or_si128(work_a, p0); - _mm_store_si128((__m128i *)flat_op[0], p0); + _mm_store_si128((__m128i *)&flat_op[0], p0); - q0 = _mm_load_si128((__m128i *)flat_oq[0]); + q0 = _mm_load_si128((__m128i *)&flat_oq[0]); work_a = _mm_andnot_si128(flat, qs0); q0 = _mm_and_si128(flat, q0); q0 = _mm_or_si128(work_a, q0); - _mm_store_si128((__m128i *)flat_oq[0], q0); + _mm_store_si128((__m128i *)&flat_oq[0], q0); - q1 = _mm_load_si128((__m128i *)flat_oq[1]); + q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]); work_a = _mm_andnot_si128(flat, qs1); q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - _mm_store_si128((__m128i *)flat_oq[1], q1); + _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1); - work_a = _mm_load_si128((__m128i *)aq[2]); - q2 = _mm_load_si128((__m128i *)flat_oq[2]); + work_a = _mm_load_si128((__m128i *)&aq[2 * 16]); + q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]); work_a = 
_mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); q2 = _mm_or_si128(work_a, q2); - _mm_store_si128((__m128i *)flat_oq[2], q2); + _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2); // write out op6 - op3 { unsigned char *dst = (s - 7 * p); for (i = 6; i > 2; i--) { __m128i flat2_output; - work_a = _mm_load_si128((__m128i *)ap[i]); - flat2_output = _mm_load_si128((__m128i *)flat2_op[i]); + work_a = _mm_load_si128((__m128i *)&ap[i * 16]); + flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]); work_a = _mm_andnot_si128(flat2, work_a); flat2_output = _mm_and_si128(flat2, flat2_output); work_a = _mm_or_si128(work_a, flat2_output); @@ -783,43 +786,43 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, } } - work_a = _mm_load_si128((__m128i *)flat_op[2]); - p2 = _mm_load_si128((__m128i *)flat2_op[2]); + work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]); + p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]); work_a = _mm_andnot_si128(flat2, work_a); p2 = _mm_and_si128(flat2, p2); p2 = _mm_or_si128(work_a, p2); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - work_a = _mm_load_si128((__m128i *)flat_op[1]); - p1 = _mm_load_si128((__m128i *)flat2_op[1]); + work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]); + p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]); work_a = _mm_andnot_si128(flat2, work_a); p1 = _mm_and_si128(flat2, p1); p1 = _mm_or_si128(work_a, p1); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - work_a = _mm_load_si128((__m128i *)flat_op[0]); - p0 = _mm_load_si128((__m128i *)flat2_op[0]); + work_a = _mm_load_si128((__m128i *)&flat_op[0]); + p0 = _mm_load_si128((__m128i *)&flat2_op[0]); work_a = _mm_andnot_si128(flat2, work_a); p0 = _mm_and_si128(flat2, p0); p0 = _mm_or_si128(work_a, p0); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - work_a = _mm_load_si128((__m128i *)flat_oq[0]); - q0 = _mm_load_si128((__m128i *)flat2_oq[0]); + work_a = _mm_load_si128((__m128i *)&flat_oq[0]); + q0 = _mm_load_si128((__m128i *)&flat2_oq[0]); 
work_a = _mm_andnot_si128(flat2, work_a); q0 = _mm_and_si128(flat2, q0); q0 = _mm_or_si128(work_a, q0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); - work_a = _mm_load_si128((__m128i *)flat_oq[1]); - q1 = _mm_load_si128((__m128i *)flat2_oq[1]); + work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]); + q1 = _mm_load_si128((__m128i *)&flat2_oq[16]); work_a = _mm_andnot_si128(flat2, work_a); q1 = _mm_and_si128(flat2, q1); q1 = _mm_or_si128(work_a, q1); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - work_a = _mm_load_si128((__m128i *)flat_oq[2]); - q2 = _mm_load_si128((__m128i *)flat2_oq[2]); + work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]); + q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]); work_a = _mm_andnot_si128(flat2, work_a); q2 = _mm_and_si128(flat2, q2); q2 = _mm_or_si128(work_a, q2); @@ -830,8 +833,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, unsigned char *dst = (s + 3 * p); for (i = 3; i < 7; i++) { __m128i flat2_output; - work_a = _mm_load_si128((__m128i *)aq[i]); - flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]); + work_a = _mm_load_si128((__m128i *)&aq[i * 16]); + flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]); work_a = _mm_andnot_si128(flat2, work_a); flat2_output = _mm_and_si128(flat2, flat2_output); work_a = _mm_or_si128(work_a, flat2_output); @@ -842,52 +845,275 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, } } -void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh, - int count) { +// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. 
+void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, int count) { if (count == 1) mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); else mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh); } -void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh, - int count) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - __m128i mask, hev, flat; +void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, int count) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16); const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; - const unsigned int extended_thresh = _thresh[0] * 0x01010101u; - const unsigned int extended_limit = _limit[0] * 0x01010101u; - const unsigned int extended_blimit = _blimit[0] * 0x01010101u; - const __m128i thresh = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); - const __m128i limit = - 
_mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); - const __m128i blimit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); + __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; (void)count; - p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); - q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + // filter_mask and hev_mask + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), 
ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + + flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = 
_mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + } + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = 
_mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 11); + filter1 = _mm_packs_epi16(filter1, filter1); + + // Filter2 >> 3 + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 11); + filter2 = _mm_packs_epi16(filter2, zero); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + filt = _mm_unpacklo_epi8(zero, filt); + filt = _mm_srai_epi16(filt, 9); + filt = _mm_packs_epi16(filt, zero); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_loadl_epi64((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_loadl_epi64((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_loadl_epi64((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_loadl_epi64((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_loadl_epi64((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = 
_mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + } +} + +void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); @@ -901,6 +1127,8 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, 
__m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; + + // filter_mask and hev_mask flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); @@ -926,6 +1154,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); + // flat_mask4 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), _mm_or_si128(_mm_subs_epu8(q2, q0), @@ -943,7 +1172,9 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, { const __m128i four = _mm_set1_epi16(4); unsigned char *src = s; - { + int i = 0; + + do { __m128i workp_a, workp_b, workp_shft; p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); @@ -958,38 +1189,40 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[0], + _mm_storel_epi64((__m128i *)&flat_op2[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[0], + _mm_storel_epi64((__m128i *)&flat_op1[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[0], + _mm_storel_epi64((__m128i *)&flat_op0[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); workp_shft = 
_mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[0], + _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[0], + _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[0], + _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); - } + + src += 8; + } while (++i < 2); } // lp filter { @@ -1001,13 +1234,13 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); - const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); __m128i filt; __m128i work_a; @@ -1018,27 +1251,27 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = 
_mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); - /* Filter1 >> 3 */ + // Filter1 >> 3 work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); filter1 = _mm_and_si128(filter1, t1f); filter1 = _mm_or_si128(filter1, work_a); - /* Filter2 >> 3 */ + // Filter2 >> 3 work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); filter2 = _mm_and_si128(filter2, t1f); filter2 = _mm_or_si128(filter2, work_a); - /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); @@ -1049,47 +1282,185 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, filt = _mm_andnot_si128(hev, filt); work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_loadl_epi64((__m128i *)flat_oq0); + q0 = _mm_load_si128((__m128i *)flat_oq0); work_a = _mm_andnot_si128(flat, work_a); q0 = _mm_and_si128(flat, q0); q0 = _mm_or_si128(work_a, q0); work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_loadl_epi64((__m128i *)flat_oq1); + q1 = _mm_load_si128((__m128i *)flat_oq1); work_a = _mm_andnot_si128(flat, work_a); q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_loadl_epi64((__m128i *)flat_oq2); + q2 = _mm_load_si128((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); q2 = _mm_or_si128(work_a, q2); work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_loadl_epi64((__m128i *)flat_op0); + p0 = _mm_load_si128((__m128i *)flat_op0); work_a = _mm_andnot_si128(flat, work_a); p0 = _mm_and_si128(flat, p0); p0 = _mm_or_si128(work_a, p0); work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_loadl_epi64((__m128i *)flat_op1); + p1 = _mm_load_si128((__m128i *)flat_op1); work_a = _mm_andnot_si128(flat, work_a); p1 = 
_mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_loadl_epi64((__m128i *)flat_op2); + p2 = _mm_load_si128((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } +} + +void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + const __m128i zero = _mm_set1_epi16(0); + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i mask, hev, flat; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = 
_mm_loadu_si128((__m128i *)(s + 3 * p)); + + // filter_mask and hev_mask + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s 
- 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); } } @@ -1098,7 +1469,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i x8, x9, x10, x11, x12, x13, x14, x15; - /* Read in 16 lines */ + // Read in 16 lines x0 = 
_mm_loadl_epi64((__m128i *)in0); x8 = _mm_loadl_epi64((__m128i *)in1); x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); @@ -1136,7 +1507,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, x14 = _mm_unpacklo_epi32(x12, x13); x15 = _mm_unpackhi_epi32(x12, x13); - /* Store first 4-line result */ + // Store first 4-line result _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); @@ -1152,7 +1523,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, x14 = _mm_unpacklo_epi32(x12, x13); x15 = _mm_unpackhi_epi32(x12, x13); - /* Store second 4-line result */ + // Store second 4-line result _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); @@ -1222,61 +1593,124 @@ static INLINE void transpose(unsigned char *src[], int in_p, } while (++idx8x8 < num_8x8_to_transpose); } -void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); +void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8); unsigned char *src[2]; unsigned char *dst[2]; - (void)count; - /* Transpose 16x16 */ - transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16); - transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16); - - /* Loop filtering */ - vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, 1); - src[0] = t_dst + 3 * 16; - 
src[1] = t_dst + 3 * 16 + 8; + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - dst[0] = s - 5; - dst[1] = s - 5 + p * 8; + // Loop filtering + vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; - /* Transpose 16x8 */ + // Transpose back transpose(src, 16, dst, p, 2); } -void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); - unsigned char *src[4]; - unsigned char *dst[4]; +void vp9_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count) { + DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8); + unsigned char *src[1]; + unsigned char *dst[1]; + (void)count; + // Transpose 8x8 + src[0] = s - 4; dst[0] = t_dst; - dst[1] = t_dst + 8 * 16; - src[0] = s - 8; - src[1] = s - 8 + 8; + transpose(src, p, dst, 8, 1); + + // Loop filtering + vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1); + + src[0] = t_dst; + dst[0] = s - 4; - /* Transpose 16x16 */ - transpose(src, p, dst, 16, 2); + // Transpose back + transpose(src, 8, dst, p, 1); +} + +void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8); + unsigned char *src[2]; + unsigned char *dst[2]; - /* Loop filtering */ - vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, 1); + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + // Loop filtering + vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, 
thresh1); src[0] = t_dst; - src[1] = t_dst + 8 * 16; + src[1] = t_dst + 8; - dst[0] = s - 8; - dst[1] = s - 8 + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + // Transpose back transpose(src, 16, dst, p, 2); } + +void vp9_lpf_vertical_16_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16); + unsigned char *src[2]; + unsigned char *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + transpose(src, p, dst, 8, 2); + + // Loop filtering + mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + transpose(src, 8, dst, p, 2); +} + +void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); + + // Transpose 16x16 + transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, + thresh); + + // Transpose back + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); +} diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm b/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm index 4ebb51b..91055b9 100644 --- a/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm +++ b/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm @@ -12,7 +12,7 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp9_loop_filter_horizontal_edge_mmx +;void vp9_lpf_horizontal_4_mmx ;( ; unsigned char *src_ptr, ; int src_pixel_step, @@ -21,8 +21,8 @@ ; const char *thresh, ; int count ;) -global sym(vp9_loop_filter_horizontal_edge_mmx) PRIVATE -sym(vp9_loop_filter_horizontal_edge_mmx): +global 
sym(vp9_lpf_horizontal_4_mmx) PRIVATE +sym(vp9_lpf_horizontal_4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -224,7 +224,7 @@ sym(vp9_loop_filter_horizontal_edge_mmx): ret -;void vp9_loop_filter_vertical_edge_mmx +;void vp9_lpf_vertical_4_mmx ;( ; unsigned char *src_ptr, ; int src_pixel_step, @@ -233,8 +233,8 @@ sym(vp9_loop_filter_horizontal_edge_mmx): ; const char *thresh, ; int count ;) -global sym(vp9_loop_filter_vertical_edge_mmx) PRIVATE -sym(vp9_loop_filter_vertical_edge_mmx): +global sym(vp9_lpf_vertical_4_mmx) PRIVATE +sym(vp9_lpf_vertical_4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -527,7 +527,7 @@ sym(vp9_loop_filter_vertical_edge_mmx): pxor mm7, [GLOBAL(t80)] ; unoffset ; mm7 = q1 - ; tranpose and write back + ; transpose and write back ; mm1 = 72 62 52 42 32 22 12 02 ; mm6 = 73 63 53 43 33 23 13 03 ; mm3 = 74 64 54 44 34 24 14 04 diff --git a/libvpx/vp9/common/x86/vp9_postproc_x86.h b/libvpx/vp9/common/x86/vp9_postproc_x86.h index 8870215..cab9d34 100644 --- a/libvpx/vp9/common/x86/vp9_postproc_x86.h +++ b/libvpx/vp9/common/x86/vp9_postproc_x86.h @@ -12,6 +12,10 @@ #ifndef VP9_COMMON_X86_VP9_POSTPROC_X86_H_ #define VP9_COMMON_X86_VP9_POSTPROC_X86_H_ +#ifdef __cplusplus +extern "C" { +#endif + /* Note: * * This platform is commonly built for runtime CPU detection. If you modify @@ -61,4 +65,8 @@ extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt); #endif #endif +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_X86_VP9_POSTPROC_X86_H_ diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c new file mode 100644 index 0000000..7e9cc84 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -0,0 +1,543 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> +#include "vpx_ports/mem.h" + +// filters for 16_h8 and 16_v8 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#if defined(__clang__) +# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) +# define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +# else // clang > 3.3 +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +# endif // clang <= 3.3 +#elif defined(__GNUC__) +# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +# define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +# else // gcc > 4.7 +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +# endif // gcc <= 4.6 +#else // !(gcc || clang) +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int 
output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = 
output_height; i > 1; i-=2) { + // load the 2 strides of source + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr-3))); + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line-3)), 1); + + // filter the source buffer + srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+5))); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line+5)), 1); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = 
_mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, + srcRegFilt32b2_1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr+=dst_stride; + } + + // if the number of strides is odd. 
+ // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt2Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(secondFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt2Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + 
_mm256_castsi256_si128(secondFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt4Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(addFilterReg64)); + + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + } +} + +void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + unsigned int src_stride, 
dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr))); + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); + srcReg32b3 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); + srcReg32b4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); + srcReg32b5 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); + srcReg32b6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); + + // have 
each consecutive loads on the same 256 register + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm256_castsi256_si128(srcReg32b2), 1); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm256_castsi256_si128(srcReg32b3), 1); + srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, + _mm256_castsi256_si128(srcReg32b4), 1); + srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, + _mm256_castsi256_si128(srcReg32b5), 1); + srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, + _mm256_castsi256_si128(srcReg32b6), 1); + srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, + _mm256_castsi256_si128(srcReg32b7), 1); + + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + // save + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + + for (i = output_height; i > 1; i-=2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + 
srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8); + + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b6, srcReg32b13)); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b6, srcReg32b13)); + + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr+=dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = 
srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the last 2 results together + srcRegFilt4 = _mm_unpacklo_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = _mm_unpackhi_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + 
_mm_min_epi16(srcRegFilt5, srcRegFilt7)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_max_epi16(srcRegFilt5, srcRegFilt7)); + + + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + } +} diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c new file mode 100644 index 0000000..cf28d8d --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,490 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <tmmintrin.h> +#include "vpx_ports/mem.h" +#include "vpx_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 =_mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter into the first lane + firstFilters = _mm_shufflelo_epi16(filtersReg, 0); + // duplicate only the third 16 bit in the filter into the first lane + secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); + // duplicate only the seconds 16 bits in the filter into the second lane + firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); + // duplicate only the forth 16 bits in the filter into the second lane + secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); + + // loading the local filters + thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); + forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); + srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // extract the higher half of the lane + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + + minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + + // add and saturate all the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + src_ptr+=src_pixels_per_line; + + // save only 4 bytes + *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); + + 
output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); + + // multiply 2 adjacent elements 
with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); + + // add and saturate all the results together + minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + + srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pixels_per_line; + + // save only 8 bytes + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes. 
+ // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + src_ptr+=src_pixels_per_line; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char 
*output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + for (i = 0; i < output_height; i++) { + // load the first 8 bytes + srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); + // load the next 8 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); + srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); + srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); + + // merge the result together + srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); + + // load the next 8 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); + srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); + srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); + srcRegFilt6 = _mm_loadl_epi64((__m128i 
*)&(src_ptr+src_pitch*7)[0]); + + // merge the result together + srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); + srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); + + // add and saturate the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pitch; + + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=out_pitch; + } +} + +void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + for (i = 0; i < output_height; i++) { + // load the first 16 bytes + srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); + // load the next 16 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); + srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the result together + srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); + srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + + // load the next 16 bytes in stride of two/three src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); + + // merge the result together + srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt6 = 
_mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); + + // load the next 16 bytes in stride of four/five src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); + + // merge the result together + srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); + srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_min_epi16(srcRegFilt4, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt6, srcRegFilt8)); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_max_epi16(srcRegFilt4, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt6, srcRegFilt8)); + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); + + src_ptr+=src_pitch; + + // save 16 bytes convolve result + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + + output_ptr+=out_pitch; + } +} diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index 7a5cca0..634fa77 100644 --- 
a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -11,17 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ - - %macro VERTx4 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr @@ -81,11 +70,14 @@ pmaddubsw xmm4, k4k5 pmaddubsw xmm6, k6k7 + movdqa xmm1, xmm2 paddsw xmm0, xmm6 - paddsw xmm0, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 paddsw xmm0, xmm4 - paddsw xmm0, krd + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 @@ -166,10 +158,13 @@ pmaddubsw xmm6, k6k7 paddsw xmm0, xmm6 - paddsw xmm0, xmm2 + movdqa xmm1, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 paddsw xmm0, xmm4 - paddsw xmm0, krd + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 @@ -251,10 +246,13 @@ pmaddubsw xmm6, k6k7 paddsw xmm0, xmm6 - paddsw xmm0, xmm2 + movdqa xmm1, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 paddsw xmm0, xmm4 - paddsw xmm0, krd + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 %if %1 @@ -538,14 +536,22 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): movdqa %2, %1 pshufb %1, [GLOBAL(shuf_t0t1)] pshufb %2, [GLOBAL(shuf_t2t3)] - pmaddubsw %1, xmm6 - pmaddubsw %2, xmm7 + pmaddubsw %1, k0k1k4k5 + pmaddubsw %2, k2k3k6k7 - paddsw %1, %2 - movdqa %2, %1 + movdqa xmm4, %1 + movdqa xmm5, %2 + psrldq %1, 8 psrldq %2, 8 - paddsw %1, %2 - paddsw %1, xmm5 + movdqa xmm6, xmm5 + 
+ paddsw xmm4, %2 + pmaxsw xmm5, %1 + pminsw %1, xmm6 + paddsw %1, xmm4 + paddsw %1, xmm5 + + paddsw %1, krd psraw %1, 7 packuswb %1, %1 %endm @@ -565,6 +571,10 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 pshufd xmm5, xmm5, 0 ;rounding + movdqa k0k1k4k5, xmm6 + movdqa k2k3k6k7, xmm7 + movdqa krd, xmm5 + movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height @@ -631,9 +641,13 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): pmaddubsw %3, k4k5 pmaddubsw %4, k6k7 - paddsw %1, %2 paddsw %1, %4 + movdqa %4, %2 + pmaxsw %2, %3 + pminsw %3, %4 paddsw %1, %3 + paddsw %1, %2 + paddsw %1, krd psraw %1, 7 packuswb %1, %1 @@ -779,12 +793,19 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): pmaddubsw xmm6, k4k5 pmaddubsw xmm7, k6k7 - paddsw xmm0, xmm1 paddsw xmm0, xmm3 + movdqa xmm3, xmm1 + pmaxsw xmm1, xmm2 + pminsw xmm2, xmm3 paddsw xmm0, xmm2 - paddsw xmm4, xmm5 + paddsw xmm0, xmm1 + paddsw xmm4, xmm7 + movdqa xmm7, xmm5 + pmaxsw xmm5, xmm6 + pminsw xmm6, xmm7 paddsw xmm4, xmm6 + paddsw xmm4, xmm5 paddsw xmm0, krd paddsw xmm4, krd @@ -826,8 +847,16 @@ sym(vp9_filter_block1d4_h8_ssse3): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 3 + %define k0k1k4k5 [rsp + 16 * 0] + %define k2k3k6k7 [rsp + 16 * 1] + %define krd [rsp + 16 * 2] + HORIZx4 0 + add rsp, 16 * 3 + pop rsp ; begin epilog pop rdi pop rsi @@ -932,8 +961,16 @@ sym(vp9_filter_block1d4_h8_avg_ssse3): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 3 + %define k0k1k4k5 [rsp + 16 * 0] + %define k2k3k6k7 [rsp + 16 * 1] + %define krd [rsp + 16 * 2] + HORIZx4 1 + add rsp, 16 * 3 + pop rsp ; begin epilog pop rdi pop rsi diff --git a/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm b/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm new file mode 100644 index 0000000..d94ccf2 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm @@ -0,0 +1,448 @@ +; +; Copyright (c) 
2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + 
paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_sse2) PRIVATE +sym(vp9_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_sse2) PRIVATE +sym(vp9_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_sse2) PRIVATE +sym(vp9_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global 
sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_sse2) PRIVATE +sym(vp9_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_sse2) PRIVATE +sym(vp9_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_sse2) PRIVATE 
+sym(vp9_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm new file mode 100644 index 0000000..b5e18fe --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm @@ -0,0 +1,422 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movq xmm2, rcx ;rounding + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + paddsw xmm0, xmm2 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movq xmm6, rcx ;rounding + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + paddsw xmm0, xmm6 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + 
pmaddubsw xmm2, xmm7 + + paddsw xmm0, xmm6 ;rounding + paddsw xmm2, xmm6 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + 
push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + 
GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret |