diff options
Diffstat (limited to 'libvpx/vp9')
49 files changed, 7042 insertions, 995 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm new file mode 100644 index 0000000..d290d07 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm @@ -0,0 +1,144 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp9_idct32x32_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ;TODO(hkuang): put the following macros in a seperate + ;file so other idct function could also use them. + MACRO + LD_16x8 $src, $stride + vld1.8 {q8}, [$src], $stride + vld1.8 {q9}, [$src], $stride + vld1.8 {q10}, [$src], $stride + vld1.8 {q11}, [$src], $stride + vld1.8 {q12}, [$src], $stride + vld1.8 {q13}, [$src], $stride + vld1.8 {q14}, [$src], $stride + vld1.8 {q15}, [$src], $stride + MEND + + MACRO + ADD_DIFF_16x8 $diff + vqadd.u8 q8, q8, $diff + vqadd.u8 q9, q9, $diff + vqadd.u8 q10, q10, $diff + vqadd.u8 q11, q11, $diff + vqadd.u8 q12, q12, $diff + vqadd.u8 q13, q13, $diff + vqadd.u8 q14, q14, $diff + vqadd.u8 q15, q15, $diff + MEND + + MACRO + SUB_DIFF_16x8 $diff + vqsub.u8 q8, q8, $diff + vqsub.u8 q9, q9, $diff + vqsub.u8 q10, q10, $diff + vqsub.u8 q11, q11, $diff + vqsub.u8 q12, q12, $diff + vqsub.u8 q13, q13, $diff + vqsub.u8 q14, q14, $diff + vqsub.u8 q15, q15, $diff + MEND + + MACRO + ST_16x8 $dst, $stride + vst1.8 {q8}, [$dst], $stride + vst1.8 {q9}, [$dst], $stride + vst1.8 {q10},[$dst], $stride + vst1.8 {q11},[$dst], $stride + vst1.8 {q12},[$dst], $stride + vst1.8 {q13},[$dst], $stride + vst1.8 {q14},[$dst], $stride + vst1.8 {q15},[$dst], $stride + MEND + +;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride + +|vp9_idct32x32_1_add_neon| PROC + push {lr} + pld [r1] + add r3, r1, #16 ; r3 dest + 16 for second loop + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asrs r0, r0, #6 ; >> 6 + bge diff_positive_32_32 + +diff_negative_32_32 + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_negative_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_negative_32_32_loop + pop {pc} + +diff_positive_32_32 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_positive_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_positive_32_32_loop + pop {pc} + + ENDP ; |vp9_idct32x32_1_add_neon| + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm index f00d027..388a7d7 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm @@ -1145,7 +1145,7 @@ idct32_bands_end_1st_pass ; pass loop processing add r5, r5, #1 - B idct32_pass_loop + b idct32_pass_loop idct32_bands_end_2nd_pass STORE_COMBINE_CENTER_RESULTS diff --git a/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c new file mode 100644 index 0000000..b0dc496 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + + __asm__ __volatile__ ( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + "lb %[tmp9], 8(%[left]) \n\t" + "lb %[tmp10], 9(%[left]) \n\t" + "lb %[tmp11], 10(%[left]) \n\t" + "lb %[tmp12], 11(%[left]) \n\t" + "lb %[tmp13], 12(%[left]) \n\t" + "lb %[tmp14], 13(%[left]) \n\t" + "lb %[tmp15], 14(%[left]) \n\t" + "lb %[tmp16], 15(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + "replv.qb %[tmp9], %[tmp9] \n\t" + "replv.qb %[tmp10], %[tmp10] \n\t" + "replv.qb %[tmp11], %[tmp11] \n\t" + "replv.qb %[tmp12], %[tmp12] \n\t" + "replv.qb %[tmp13], %[tmp13] \n\t" + "replv.qb %[tmp14], %[tmp14] \n\t" + "replv.qb %[tmp15], %[tmp15] \n\t" + "replv.qb %[tmp16], %[tmp16] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "sw %[tmp1], 8(%[dst]) \n\t" + "sw %[tmp1], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "sw %[tmp2], 8(%[dst]) \n\t" + "sw %[tmp2], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "sw %[tmp3], 8(%[dst]) \n\t" + "sw %[tmp3], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "sw %[tmp4], 8(%[dst]) \n\t" + "sw %[tmp4], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "sw %[tmp5], 8(%[dst]) \n\t" + "sw %[tmp5], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "sw %[tmp6], 8(%[dst]) \n\t" + "sw %[tmp6], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "sw %[tmp7], 8(%[dst]) \n\t" + "sw %[tmp7], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + "sw %[tmp8], 8(%[dst]) \n\t" + "sw %[tmp8], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp9], (%[dst]) \n\t" + "sw %[tmp9], 4(%[dst]) \n\t" + "sw %[tmp9], 8(%[dst]) \n\t" + "sw %[tmp9], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp10], (%[dst]) \n\t" + "sw %[tmp10], 4(%[dst]) \n\t" + "sw %[tmp10], 8(%[dst]) \n\t" + "sw %[tmp10], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp11], (%[dst]) \n\t" + "sw %[tmp11], 4(%[dst]) \n\t" + "sw %[tmp11], 8(%[dst]) \n\t" + "sw %[tmp11], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp12], (%[dst]) \n\t" + "sw %[tmp12], 4(%[dst]) \n\t" + "sw %[tmp12], 8(%[dst]) \n\t" + "sw %[tmp12], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp13], (%[dst]) \n\t" + "sw %[tmp13], 4(%[dst]) \n\t" + "sw %[tmp13], 8(%[dst]) \n\t" + "sw %[tmp13], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp14], (%[dst]) \n\t" + "sw %[tmp14], 4(%[dst]) \n\t" + "sw %[tmp14], 8(%[dst]) \n\t" + "sw %[tmp14], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp15], (%[dst]) \n\t" + "sw %[tmp15], 4(%[dst]) \n\t" + "sw %[tmp15], 8(%[dst]) \n\t" + "sw %[tmp15], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp16], (%[dst]) \n\t" + "sw %[tmp16], 4(%[dst]) \n\t" + "sw %[tmp16], 8(%[dst]) \n\t" + "sw %[tmp16], 12(%[dst]) \n\t" + + : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), + [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), + [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), + [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8), + [tmp9] "=&r" (tmp9), [tmp10] "=&r" (tmp10), + [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12), + [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14), + [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16) + : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) + ); +} + +void vp9_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, left2; + + __asm__ __volatile__ ( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "lw %[above1], 8(%[above]) \n\t" + "lw %[above2], 12(%[above]) \n\t" + "lw %[left1], 8(%[left]) \n\t" + "lw %[left2], 12(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addiu %[average], %[average], 16 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 5 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + : [left1] "=&r" (left1), [above1] "=&r" (above1), + [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1), + [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1), + [above2] "=&r" (above2), [left2] "=&r" (left2), + [average] "=&r" (average), [tmp] "=&r" (tmp), + [expected_dc] "=&r" (expected_dc) + : [above] "r" (above), [left] "r" (left), + [dst] "r" (dst), [stride] "r" (stride) + ); +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c new file mode 100644 index 0000000..a53c623 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4; + + __asm__ __volatile__ ( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "sw %[tmp1], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + + : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), + [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4) + : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) + ); +} + +void vp9_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; + + __asm__ __volatile__ ( + "lw %[above_c], (%[above]) \n\t" + "lw %[left_c], (%[left]) \n\t" + + "preceu.ph.qbl %[above_l], %[above_c] \n\t" + "preceu.ph.qbr %[above_r], %[above_c] \n\t" + "preceu.ph.qbl %[left_l], %[left_c] \n\t" + "preceu.ph.qbr %[left_r], %[left_c] \n\t" + + "addu.ph %[average], %[above_r], %[above_l] \n\t" + "addu.ph %[average], %[average], %[left_l] \n\t" + "addu.ph %[average], %[average], %[left_r] \n\t" + "addiu %[average], %[average], 4 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 3 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + + : [above_c] "=&r" (above_c), [above_l] "=&r" (above_l), + [above_r] "=&r" (above_r), [left_c] "=&r" (left_c), + [left_l] "=&r" (left_l), [left_r] "=&r" (left_r), + [average] "=&r" (average), [tmp] "=&r" (tmp), + [expected_dc] "=&r" (expected_dc) + : [above] "r" (above), [left] "r" (left), + [dst] "r" (dst), [stride] "r" (stride) + ); +} + +void vp9_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t abovel, abover; + int32_t left0, left1, left2, left3; + int32_t res0, res1; + int32_t resl; + int32_t resr; + int32_t top_left; + uint8_t *cm = vp9_ff_cropTbl; + + __asm__ __volatile__ ( + "ulw %[resl], (%[above]) \n\t" + + "lbu %[left0], (%[left]) \n\t" + "lbu %[left1], 1(%[left]) \n\t" + "lbu %[left2], 2(%[left]) \n\t" + "lbu %[left3], 3(%[left]) \n\t" + + "lbu %[top_left], -1(%[above]) \n\t" + + "preceu.ph.qbl %[abovel], %[resl] \n\t" + "preceu.ph.qbr %[abover], %[resl] \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "replv.ph %[left1], %[left1] \n\t" + "replv.ph %[left2], %[left2] \n\t" + "replv.ph %[left3], %[left3] \n\t" + + "replv.ph %[top_left], %[top_left] \n\t" + + "addu.ph %[resl], %[abovel], %[left0] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left0] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left1] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left1] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sb %[res1], 1(%[dst]) \n\t" + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left2] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left2] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + + "sb %[res1], 1(%[dst]) \n\t" + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left3] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left3] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + : [abovel] "=&r" (abovel), [abover] "=&r" (abover), + [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2), + [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3), + [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left) + : [above] "r" (above), [left] "r" (left), + [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) + ); +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c new file mode 100644 index 0000000..40d93ae --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c @@ -0,0 +1,610 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + + __asm__ __volatile__ ( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + + : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), + [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), + [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), + [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8) + : [left] "r" (left), [dst] "r" (dst), + [stride] "r" (stride) + ); +} + +void vp9_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; + + __asm__ __volatile__ ( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "preceu.ph.qbl %[above_l2], %[above2] \n\t" + "preceu.ph.qbr %[above_r2], %[above2] \n\t" + "preceu.ph.qbl %[left_l2], %[left2] \n\t" + "preceu.ph.qbr %[left_r2], %[left2] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addu.ph %[average], %[average], %[above_l2] \n\t" + "addu.ph %[average], %[average], %[above_r2] \n\t" + "addu.ph %[average], %[average], %[left_l2] \n\t" + "addu.ph %[average], %[average], %[left_r2] \n\t" + + "addiu %[average], %[average], 8 \n\t" + + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 4 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + : [above1] "=&r" (above1), [above_l1] "=&r" (above_l1), + [above_r1] "=&r" (above_r1), [left1] "=&r" (left1), + [left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1), + [above2] "=&r" (above2), [above_l2] "=&r" (above_l2), + [above_r2] "=&r" (above_r2), [left2] "=&r" (left2), + [left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2), + [average] "=&r" (average), [tmp] "=&r" (tmp), + [expected_dc] "=&r" (expected_dc) + : [above] "r" (above), [left] "r" (left), [dst] "r" (dst), + [stride] "r" (stride) + ); +} + +void vp9_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t abovel, abover; + int32_t abovel_1, abover_1; + int32_t left0; + int32_t res0, res1, res2, res3; + int32_t reshw; + int32_t top_left; + uint8_t *cm = vp9_ff_cropTbl; + + __asm__ __volatile__ ( + "ulw %[reshw], (%[above]) \n\t" + "ulw %[top_left], 4(%[above]) \n\t" + + "lbu %[left0], (%[left]) \n\t" + + "preceu.ph.qbl %[abovel], %[reshw] \n\t" + "preceu.ph.qbr %[abover], %[reshw] \n\t" + "preceu.ph.qbl %[abovel_1], %[top_left] \n\t" + "preceu.ph.qbr %[abover_1], %[top_left] \n\t" + + "lbu %[top_left], -1(%[above]) \n\t" + "replv.ph %[left0], %[left0] \n\t" + + "replv.ph %[top_left], %[top_left] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 1(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 2(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 3(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 4(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 5(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 6(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 7(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + : [abovel] "=&r" (abovel), [abover] "=&r" (abover), + [abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1), + [left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3), + [res0] "=&r" (res0), [res1] "=&r" (res1), + [reshw] "=&r" (reshw), [top_left] "=&r" (top_left) + : [above] "r" (above), [left] "r" (left), + [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) + ); +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c index d3aee73..bc67594 100644 --- a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -19,7 +19,8 @@ #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" #if HAVE_DSPR2 -static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { +static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; @@ -42,7 +43,7 @@ static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { const int const_2_power_13 = 8192; const int32_t *input_int; - for (i = 32; i--; ) { + for (i = no_rows; i--; ) { input_int = (const int32_t *)input; if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | @@ -881,12 +882,74 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, ); // Rows - idct32_1d_rows_dspr2(input, outptr); + idct32_1d_rows_dspr2(input, outptr, 32); // Columns vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); } +void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + idct32_1d_rows_dspr2(input, outptr, 8); + + outptr += 8; + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + for (i = 0; i < 31; ++i) { + outptr += 32; + + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + } + + // Columns + vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride); +} + void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { int r, out; diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c new file mode 100644 index 0000000..36cfc83 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" + +#if HAVE_DSPR2 +void vp9_loop_filter_horizontal_edge_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + uint8_t i; + uint32_t mask; + uint32_t hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + /* prefetch data for store */ + vp9_prefetch_store(s); + + /* loop filter designed to work using chars so that we can make maximum use + of 8 bit simd instructions. */ + for (i = 0; i < 2; i++) { + sm1 = s - (pitch << 2); + s0 = sm1 + pitch; + s1 = s0 + pitch; + s2 = s - pitch; + s3 = s; + s4 = s + pitch; + s5 = s4 + pitch; + s6 = s5 + pitch; + + __asm__ __volatile__ ( + "lw %[p1], (%[s1]) \n\t" + "lw %[p2], (%[s2]) \n\t" + "lw %[p3], (%[s3]) \n\t" + "lw %[p4], (%[s4]) \n\t" + + : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4) + : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) + ); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + mask will be zero and filtering is not needed */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + __asm__ __volatile__ ( + "lw %[pm1], (%[sm1]) \n\t" + "lw %[p0], (%[s0]) \n\t" + "lw %[p5], (%[s5]) \n\t" + "lw %[p6], (%[s6]) \n\t" + + : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5), + [p6] "=&r" (p6) + : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6) + ); + + vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, + pm1, p0, p3, p4, p5, p6, + thresh_vec, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + __asm__ __volatile__ ( + "sw %[p1], (%[s1]) \n\t" + "sw %[p2], (%[s2]) \n\t" + "sw %[p3], (%[s3]) \n\t" + "sw %[p4], (%[s4]) \n\t" + + : + : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4), + [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) + ); + } + } + + s = s + 4; + } +} + +void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + uint8_t i; + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + /* prefetch data for store */ + vp9_prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, + p0, p3, p4, p5, p6, thresh_vec, + &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood + * don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__ ( + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + + : + : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), + [s4] "r" (s4) + ); + + __asm__ __volatile__ ( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) + : + ); + + __asm__ __volatile__ ( + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + + : [p1] "+r" (p1) + : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3) + ); + + __asm__ __volatile__ ( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) + : + ); + + __asm__ __volatile__ ( + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + + : + : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), + [s2] "r" (s2) + ); + + __asm__ __volatile__ ( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) + : + ); + + __asm__ __volatile__ ( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + + : + : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), + [s1] "r" (s1) + ); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h new file mode 100644 index 0000000..98bfcfa --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h @@ -0,0 +1,755 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ +#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_onyxc_int.h" + +#if HAVE_DSPR2 +/* inputs & outputs are quad-byte vectors */ +static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev, + uint32_t *ps1, uint32_t *ps0, + uint32_t *qs0, uint32_t *qs1) { + int32_t vp9_filter_l, vp9_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (*ps0) ^ N128; + vps1 = (*ps1) ^ N128; + vqs0 = (*qs0) ^ N128; + vqs1 = (*qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__ ( + /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[vp9_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[vp9_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* vp9_filter &= hev; */ + "and %[vp9_filter_l], %[vp9_filter_l], %[hev_l] \n\t" + "and %[vp9_filter_r], %[vp9_filter_r], %[hev_r] \n\t" + + /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + + /* vp9_filter &= mask; */ + "and %[vp9_filter_l], %[vp9_filter_l], %[mask_l] \n\t" + "and %[vp9_filter_r], %[vp9_filter_r], %[mask_r] \n\t" + + : [vp9_filter_l] "=&r" (vp9_filter_l), + [vp9_filter_r] "=&r" (vp9_filter_r), + [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), + [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) + : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), + [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), + [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), + [mask_l] "r" (mask_l), [mask_r] "r" (mask_r), + [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), + [HWM] "r" (HWM) + ); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__ ( + /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[vp9_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[vp9_filter_r], %[t2] \n\t" + + /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[vp9_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[vp9_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), + [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), + [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), + [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) + : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), + [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r) + ); + + __asm__ __volatile__ ( + /* (vp9_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* vp9_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), + [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), + [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) + : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) + ); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__ ( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), + [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) + : + ); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *ps0 = vps0 ^ N128; + *ps1 = vps1 ^ N128; + *qs0 = vqs0 ^ N128; + *qs1 = vqs1 ^ N128; +} + +static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev, + uint32_t ps1, uint32_t ps0, + uint32_t qs0, uint32_t qs1, + uint32_t *p1_f0, uint32_t *p0_f0, + uint32_t *q0_f0, uint32_t *q1_f0) { + int32_t vp9_filter_l, vp9_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (ps0) ^ N128; + vps1 = (ps1) ^ N128; + vqs0 = (qs0) ^ N128; + vqs1 = (qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__ ( + /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[vp9_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[vp9_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* vp9_filter &= hev; */ + "and %[vp9_filter_l], %[vp9_filter_l], %[hev_l] \n\t" + "and %[vp9_filter_r], %[vp9_filter_r], %[hev_r] \n\t" + + /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t" + "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t" + + /* vp9_filter &= mask; */ + "and %[vp9_filter_l], %[vp9_filter_l], %[mask_l] \n\t" + "and %[vp9_filter_r], %[vp9_filter_r], %[mask_r] \n\t" + + : [vp9_filter_l] "=&r" (vp9_filter_l), + [vp9_filter_r] "=&r" (vp9_filter_r), + [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r), + [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r) + : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l), + [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r), + [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r), + [mask_l] "r" (mask_l), [mask_r] "r" (mask_r), + [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM) + ); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__ ( + /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[vp9_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[vp9_filter_r], %[t2] \n\t" + + /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[vp9_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[vp9_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r), + [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r), + [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r), + [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r) + : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM), + [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r) + ); + + __asm__ __volatile__ ( + /* (vp9_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* vp9_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r), + [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), + [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r) + : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r) + ); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__ ( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r), + [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r) + : + ); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *p0_f0 = vps0 ^ N128; + *p1_f0 = vps1 ^ N128; + *q0_f0 = vqs0 ^ N128; + *q1_f0 = vqs1 ^ N128; +} + +static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2, + uint32_t *op1, uint32_t *op0, + uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__ ( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r" (add_p210_q012), + [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2), + [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0), + [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1), + [res_oq2] "=&r" (res_oq2) + : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1), + [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3), + [u32Four] "r" (u32Four) + ); + + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; +} + +static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2, + uint32_t p1, uint32_t p0, + uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, + uint32_t *op2_f1, + uint32_t *op1_f1, uint32_t *op0_f1, + uint32_t *oq0_f1, uint32_t *oq1_f1, + uint32_t *oq2_f1) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__ ( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp), + [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1), + [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0), + [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2) + : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1), + [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3), + [u32Four] "r" (u32Four) + ); + + *op2_f1 = res_op2; + *op1_f1 = res_op1; + *op0_f1 = res_op0; + *oq0_f1 = res_oq0; + *oq1_f1 = res_oq1; + *oq2_f1 = res_oq2; +} + +static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6, + uint32_t *op5, uint32_t *op4, + uint32_t *op3, uint32_t *op2, + uint32_t *op1, uint32_t *op0, + uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3, + uint32_t *oq4, uint32_t *oq5, + uint32_t *oq6, uint32_t *oq7) { + const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; + uint32_t tmp; + uint32_t add_p6toq6; + uint32_t u32Eight = 0x00080008; + + __asm__ __volatile__ ( + /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 + which is used most of the time */ + "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" + + : [add_p6toq6] "=&r" (add_p6toq6) + : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), + [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), + [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3), + [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), + [u32Eight] "r" (u32Eight) + ); + + __asm__ __volatile__ ( + /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + + p3 + p2 + p1 + p0 + q0, 4) */ + "shll.ph %[tmp], %[p7], 3 \n\t" + "subu.ph %[res_op6], %[tmp], %[p7] \n\t" + "addu.ph %[res_op6], %[res_op6], %[p6] \n\t" + "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q1] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q2] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q3] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q4] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q5] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q6] \n\t" + "shrl.ph %[res_op6], %[res_op6], 4 \n\t" + + /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + + p2 + p1 + p0 + q0 + q1, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op5], %[tmp], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p5] \n\t" + "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q2] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q3] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q4] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q5] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q6] \n\t" + "shrl.ph %[res_op5], %[res_op5], 4 \n\t" + + /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + + p1 + p0 + q0 + q1 + q2, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op4], %[tmp], %[p7] \n\t" + "addu.ph %[res_op4], %[res_op4], %[p4] \n\t" + "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q3] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q4] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q5] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q6] \n\t" + "shrl.ph %[res_op4], %[res_op4], 4 \n\t" + + /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + + p1 + p0 + q0 + q1 + q2 + q3, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op3], %[tmp], %[p3] \n\t" + "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q4] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q5] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q6] \n\t" + "shrl.ph %[res_op3], %[res_op3], 4 \n\t" + + /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + + p0 + q0 + q1 + q2 + q3 + q4, 4) */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p7] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q5] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q6] \n\t" + "shrl.ph %[res_op2], %[res_op2], 4 \n\t" + + /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op1], %[tmp], %[p1] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q6] \n\t" + "shrl.ph %[res_op1], %[res_op1], 4 \n\t" + + /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */ + "addu.ph %[res_op0], %[p7], %[p0] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" + "shrl.ph %[res_op0], %[res_op0], 4 \n\t" + + : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5), + [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3), + [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1), + [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp) + : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), + [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), + [q2] "r" (q2), [q1] "r" (q1), + [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), + [add_p6toq6] "r" (add_p6toq6) + ); + + *op6 = res_op6; + *op5 = res_op5; + *op4 = res_op4; + *op3 = res_op3; + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + + __asm__ __volatile__ ( + /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ + "addu.ph %[res_oq0], %[q7], %[q0] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t" + + /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq1], %[tmp], %[q1] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t" + + /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + + q3 + q4 + q5 + q6 + q7 * 3, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq2], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t" + + /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 + + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq3], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t" + "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t" + + /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 + + q4 * 2 + q5 + q6 + q7 * 5, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq4], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t" + "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t" + + /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 + + q5 * 2 + q6 + q7 * 6, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq5], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t" + "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t" + + /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + + q4 + q5 + q6 * 2 + q7 * 7, 4) */ + "shll.ph %[tmp], %[q7], 3 \n\t" + "subu.ph %[res_oq6], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" + "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" + + : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5), + [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3), + [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1), + [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp) + : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), + [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0), + [p1] "r" (p1), [p2] "r" (p2), + [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6), + [add_p6toq6] "r" (add_p6toq6) + ); + + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; + *oq3 = res_oq3; + *oq4 = res_oq4; + *oq5 = res_oq5; + *oq6 = res_oq6; +} +#endif // #if HAVE_DSPR2 +#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h new file mode 100644 index 0000000..4cb2ebb --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ +#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_onyxc_int.h" + +#if HAVE_DSPR2 +#define STORE_F0() { \ + __asm__ __volatile__ ( \ + "sb %[q1_f0], 1(%[s4]) \n\t" \ + "sb %[q0_f0], 0(%[s4]) \n\t" \ + "sb %[p0_f0], -1(%[s4]) \n\t" \ + "sb %[p1_f0], -2(%[s4]) \n\t" \ + \ + : \ + : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ + [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ + [s4] "r" (s4) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ + [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q1_f0], 1(%[s3]) \n\t" \ + "sb %[q0_f0], 0(%[s3]) \n\t" \ + "sb %[p0_f0], -1(%[s3]) \n\t" \ + "sb %[p1_f0], -2(%[s3]) \n\t" \ + \ + : [p1_f0] "+r" (p1_f0) \ + : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ + [s3] "r" (s3), [p0_f0] "r" (p0_f0) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ + [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q1_f0], 1(%[s2]) \n\t" \ + "sb %[q0_f0], 0(%[s2]) \n\t" \ + "sb %[p0_f0], -1(%[s2]) \n\t" \ + "sb %[p1_f0], -2(%[s2]) \n\t" \ + \ + : \ + : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ + [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ + [s2] "r" (s2) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \ + [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q1_f0], 1(%[s1]) \n\t" \ + "sb %[q0_f0], 0(%[s1]) \n\t" \ + "sb %[p0_f0], -1(%[s1]) \n\t" \ + "sb %[p1_f0], -2(%[s1]) \n\t" \ + \ + : \ + : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \ + [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \ + [s1] "r" (s1) \ + ); \ +} + +#define STORE_F1() { \ + __asm__ __volatile__ ( \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + \ + : \ + : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \ + [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ + [s4] "r" (s4) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + \ + : [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), [q0_r] "+r" (q0_r), \ + [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + \ + : \ + : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \ + [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ + [s3] "r" (s3) \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + \ + : \ + : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \ + [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ + [s2] "r" (s2) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + \ + : [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), [q0_l] "+r" (q0_l), \ + [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + \ + : \ + : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \ + [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ + [s1] "r" (s1) \ + ); \ +} + +#define STORE_F2() { \ + __asm__ __volatile__ ( \ + "sb %[q6_r], 6(%[s4]) \n\t" \ + "sb %[q5_r], 5(%[s4]) \n\t" \ + "sb %[q4_r], 4(%[s4]) \n\t" \ + "sb %[q3_r], 3(%[s4]) \n\t" \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + "sb %[p3_r], -4(%[s4]) \n\t" \ + "sb %[p4_r], -5(%[s4]) \n\t" \ + "sb %[p5_r], -6(%[s4]) \n\t" \ + "sb %[p6_r], -7(%[s4]) \n\t" \ + \ + : \ + : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \ + [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \ + [q0_r] "r" (q0_r), \ + [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ + [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \ + [p6_r] "r" (p6_r), \ + [s4] "r" (s4) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q6_r], %[q6_r], 16 \n\t" \ + "srl %[q5_r], %[q5_r], 16 \n\t" \ + "srl %[q4_r], %[q4_r], 16 \n\t" \ + "srl %[q3_r], %[q3_r], 16 \n\t" \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + "srl %[p3_r], %[p3_r], 16 \n\t" \ + "srl %[p4_r], %[p4_r], 16 \n\t" \ + "srl %[p5_r], %[p5_r], 16 \n\t" \ + "srl %[p6_r], %[p6_r], 16 \n\t" \ + \ + : [q6_r] "+r" (q6_r), [q5_r] "+r" (q5_r), [q4_r] "+r" (q4_r), \ + [q3_r] "+r" (q3_r), [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), \ + [q0_r] "+r" (q0_r), \ + [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r), \ + [p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r), \ + [p6_r] "+r" (p6_r) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q6_r], 6(%[s3]) \n\t" \ + "sb %[q5_r], 5(%[s3]) \n\t" \ + "sb %[q4_r], 4(%[s3]) \n\t" \ + "sb %[q3_r], 3(%[s3]) \n\t" \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + "sb %[p3_r], -4(%[s3]) \n\t" \ + "sb %[p4_r], -5(%[s3]) \n\t" \ + "sb %[p5_r], -6(%[s3]) \n\t" \ + "sb %[p6_r], -7(%[s3]) \n\t" \ + \ + : \ + : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \ + [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \ + [q0_r] "r" (q0_r), \ + [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \ + [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \ + [p6_r] "r" (p6_r), \ + [s3] "r" (s3) \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q6_l], 6(%[s2]) \n\t" \ + "sb %[q5_l], 5(%[s2]) \n\t" \ + "sb %[q4_l], 4(%[s2]) \n\t" \ + "sb %[q3_l], 3(%[s2]) \n\t" \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + "sb %[p3_l], -4(%[s2]) \n\t" \ + "sb %[p4_l], -5(%[s2]) \n\t" \ + "sb %[p5_l], -6(%[s2]) \n\t" \ + "sb %[p6_l], -7(%[s2]) \n\t" \ + \ + : \ + : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \ + [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \ + [q0_l] "r" (q0_l), \ + [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ + [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \ + [p6_l] "r" (p6_l), \ + [s2] "r" (s2) \ + ); \ + \ + __asm__ __volatile__ ( \ + "srl %[q6_l], %[q6_l], 16 \n\t" \ + "srl %[q5_l], %[q5_l], 16 \n\t" \ + "srl %[q4_l], %[q4_l], 16 \n\t" \ + "srl %[q3_l], %[q3_l], 16 \n\t" \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + "srl %[p3_l], %[p3_l], 16 \n\t" \ + "srl %[p4_l], %[p4_l], 16 \n\t" \ + "srl %[p5_l], %[p5_l], 16 \n\t" \ + "srl %[p6_l], %[p6_l], 16 \n\t" \ + \ + : [q6_l] "+r" (q6_l), [q5_l] "+r" (q5_l), [q4_l] "+r" (q4_l), \ + [q3_l] "+r" (q3_l), [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), \ + [q0_l] "+r" (q0_l), \ + [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l), \ + [p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l), \ + [p6_l] "+r" (p6_l) \ + : \ + ); \ + \ + __asm__ __volatile__ ( \ + "sb %[q6_l], 6(%[s1]) \n\t" \ + "sb %[q5_l], 5(%[s1]) \n\t" \ + "sb %[q4_l], 4(%[s1]) \n\t" \ + "sb %[q3_l], 3(%[s1]) \n\t" \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + "sb %[p3_l], -4(%[s1]) \n\t" \ + "sb %[p4_l], -5(%[s1]) \n\t" \ + "sb %[p5_l], -6(%[s1]) \n\t" \ + "sb %[p6_l], -7(%[s1]) \n\t" \ + \ + : \ + : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \ + [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \ + [q0_l] "r" (q0_l), \ + [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \ + [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \ + [p6_l] "r" (p6_l), \ + [s1] "r" (s1) \ + ); \ +} + +#define PACK_LEFT_0TO3() { \ + __asm__ __volatile__ ( \ + "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ + "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ + "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ + "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ + "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ + "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ + "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ + "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ + \ + : [p3_l] "=&r" (p3_l), [p2_l] "=&r" (p2_l), \ + [p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l), \ + [q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l), \ + [q2_l] "=&r" (q2_l), [q3_l] "=&r" (q3_l) \ + : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \ + [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \ + ); \ +} + +#define PACK_LEFT_4TO7() { \ + __asm__ __volatile__ ( \ + "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ + "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ + "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ + "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ + "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ + "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ + "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ + "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ + \ + : [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l), \ + [p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l), \ + [q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l), \ + [q6_l] "=&r" (q6_l), [q7_l] "=&r" (q7_l) \ + : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \ + [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \ + ); \ +} + +#define PACK_RIGHT_0TO3() { \ + __asm__ __volatile__ ( \ + "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ + "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ + "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ + "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ + "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ + "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ + "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ + "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ + \ + : [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r), \ + [p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r), \ + [q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r), \ + [q2_r] "=&r" (q2_r), [q3_r] "=&r" (q3_r) \ + : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \ + [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \ + ); \ +} + +#define PACK_RIGHT_4TO7() { \ + __asm__ __volatile__ ( \ + "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ + "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ + "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ + "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ + "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ + "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ + "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ + "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ + \ + : [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r), \ + [p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r), \ + [q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r), \ + [q6_r] "=&r" (q6_r), [q7_r] "=&r" (q7_r) \ + : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \ + [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \ + ); \ +} + +#define COMBINE_LEFT_RIGHT_0TO2() { \ + __asm__ __volatile__ ( \ + "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ + "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ + "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ + "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ + "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ + "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ + \ + : [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), \ + [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2) \ + : [p2_l] "r" (p2_l), [p2_r] "r" (p2_r), \ + [p1_l] "r" (p1_l), [p1_r] "r" (p1_r), \ + [p0_l] "r" (p0_l), [p0_r] "r" (p0_r), \ + [q0_l] "r" (q0_l), [q0_r] "r" (q0_r), \ + [q1_l] "r" (q1_l), [q1_r] "r" (q1_r), \ + [q2_l] "r" (q2_l), [q2_r] "r" (q2_r) \ + ); \ +} + +#define COMBINE_LEFT_RIGHT_3TO6() { \ + __asm__ __volatile__ ( \ + "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ + "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ + "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ + "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ + "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ + "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ + "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ + "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ + \ + : [p6] "=&r" (p6),[p5] "=&r" (p5), \ + [p4] "=&r" (p4),[p3] "=&r" (p3), \ + [q3] "=&r" (q3),[q4] "=&r" (q4), \ + [q5] "=&r" (q5),[q6] "=&r" (q6) \ + : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), \ + [p4_l] "r" (p4_l), [p3_l] "r" (p3_l), \ + [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), \ + [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), \ + [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), \ + [q5_l] "r" (q5_l), [q6_l] "r" (q6_l), \ + [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), \ + [q5_r] "r" (q5_r), [q6_r] "r" (q6_r) \ + ); \ +} + +#endif // #if HAVE_DSPR2 +#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h new file mode 100644 index 0000000..b9e0aca --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ +#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_onyxc_int.h" + +#if HAVE_DSPR2 +/* processing 4 pixels at the same time + * compute hev and mask in the same function */ +static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, + uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, + uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, + uint32_t thresh, uint32_t *hev, + uint32_t *mask) { + uint32_t c, r, r3, r_k; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t hev1; + + __asm__ __volatile__ ( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r" (c), [r_k] "=&r" (r_k), + [r] "=&r" (r), [r3] "=&r" (r3) + : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), + [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), + [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh) + ); + + __asm__ __volatile__ ( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1), + [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) + : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), + [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) + ); + + *hev = hev1; + *mask = s2; +} + +static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit, + uint32_t flimit, + uint32_t thresh, + uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, + uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, + uint32_t *hev, + uint32_t *mask, + uint32_t *flat) { + uint32_t c, r, r3, r_k, r_flat; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t hev1; + uint32_t flat1; + + __asm__ __volatile__ ( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + * flat |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + * flat |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + /* look at stall here */ + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3), + [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1) + : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2), + [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0), + [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh), + [flat_thresh] "r" (flat_thresh), [ones] "r" (ones) + ); + + __asm__ __volatile__ ( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1), + [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3) + : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3), + [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit) + ); + + *hev = hev1; + *mask = s2; + *flat = flat1; +} + +static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3, + uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, + uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t q4, + uint32_t *flat2) { + uint32_t c, r, r_k, r_flat; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t flat1, flat3; + + __asm__ __volatile__ ( + /* flat |= (abs(p4 - p0) > thresh) */ + "subu_s.qb %[c], %[p4], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* flat |= (abs(q4 - q0) > thresh) */ + "subu_s.qb %[c], %[q4], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + "wrdsp %[r] \n\t" + "pick.qb %[flat3], $0, %[ones] \n\t" + + /* flat |= (abs(p1 - p0) > thresh) */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* flat |= (abs(q1 - q0) > thresh) */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ + "and %[flat1], %[flat3], %[flat1] \n\t" + + : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), + [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3) + : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), + [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1), + [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4), + [flat_thresh] "r" (flat_thresh), [ones] "r" (ones) + ); + + *flat2 = flat1; +} +#endif // #if HAVE_DSPR2 +#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c new file mode 100644 index 0000000..adfd755 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c @@ -0,0 +1,652 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" + +#if HAVE_DSPR2 +void vp9_mbloop_filter_horizontal_edge_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + uint32_t mask; + uint32_t hev, flat; + uint8_t i; + uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p3, p2, p1, p0, q0, q1, q2, q3; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + /* prefetch data for store */ + vp9_prefetch_store(s); + + for (i = 0; i < 2; i++) { + sp3 = s - (pitch << 2); + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + + __asm__ __volatile__ ( + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + + : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), + [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0) + : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0) + ); + + vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, + p1, p0, p3, p2, q0, q1, q2, q3, + &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__ ( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__ ( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), + [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if ((flat != 0) && (mask != 0)) { + /* filtering */ + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), + [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), + [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), + [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + } + + s = s + 4; + } +} + +void vp9_mbloop_filter_vertical_edge_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + uint8_t i; + uint32_t mask, hev, flat; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p3, p2, p1, p0, q3, q2, q1, q0; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + vp9_prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__ ( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + + : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), + [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3) + : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) + ); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, + p1, p0, p3, p2, q0, q1, q2, q3, + &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat != 0) && (mask != 0)) { + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [s4] "r" (s4) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s4] "r" (s4) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), + [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [s3] "r" (s3) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s3] "r" (s3) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), + [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [s2] "r" (s2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s2] "r" (s2) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), + [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [s1] "r" (s1) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), + [q1_f0] "r" (q1_f0), [s1] "r" (s1) + ); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c new file mode 100644 index 0000000..0759755 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c @@ -0,0 +1,795 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" + +#if HAVE_DSPR2 +void vp9_mb_lpf_horizontal_edge_w_dspr2(unsigned char *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + uint32_t mask; + uint32_t hev, flat, flat2; + uint8_t i; + uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; + uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + /* prefetch data for store */ + vp9_prefetch_store(s); + + for (i = 0; i < (2 * count); i++) { + sp7 = s - (pitch << 3); + sp6 = sp7 + pitch; + sp5 = sp6 + pitch; + sp4 = sp5 + pitch; + sp3 = sp4 + pitch; + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + sq4 = sq3 + pitch; + sq5 = sq4 + pitch; + sq6 = sq5 + pitch; + sq7 = sq6 + pitch; + + __asm__ __volatile__ ( + "lw %[p7], (%[sp7]) \n\t" + "lw %[p6], (%[sp6]) \n\t" + "lw %[p5], (%[sp5]) \n\t" + "lw %[p4], (%[sp4]) \n\t" + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + + : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), + [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4) + : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7) + ); + + __asm__ __volatile__ ( + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + "lw %[q4], (%[sq4]) \n\t" + "lw %[q5], (%[sq5]) \n\t" + "lw %[q6], (%[sq6]) \n\t" + "lw %[q7], (%[sq7]) \n\t" + + : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0), + [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4) + : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0), + [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7) + ); + + vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, + p1, p0, p3, p2, q0, q1, q2, q3, + &hev, &mask, &flat); + + vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__ ( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, + &p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l, + &q4_l, &q5_l, &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, + &p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r, + &q4_r, &q5_r, &q6_r, &q7_r); + + COMBINE_LEFT_RIGHT_0TO2() + COMBINE_LEFT_RIGHT_3TO6() + + __asm__ __volatile__ ( + "sw %[p6], (%[sp6]) \n\t" + "sw %[p5], (%[sp5]) \n\t" + "sw %[p4], (%[sp4]) \n\t" + "sw %[p3], (%[sp3]) \n\t" + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + + : + : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), + [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), + [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) + ); + + __asm__ __volatile__ ( + "sw %[q6], (%[sq6]) \n\t" + "sw %[q5], (%[sq5]) \n\t" + "sw %[q4], (%[sq4]) \n\t" + "sw %[q3], (%[sq3]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + + : + : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3), + [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0), + [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3), + [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0) + ); + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__ ( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), + [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0+f1 */ + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), + [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), + [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 + f2 */ + /* f0 function */ + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* f1 function */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, + q0_l, q1_l, q2_l, q3_l, + &p2_l_f1, &p1_l_f1, &p0_l_f1, + &q0_l_f1, &q1_l_f1, &q2_l_f1); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, + q0_r, q1_r, q2_r, q3_r, + &p2_r_f1, &p1_r_f1, &p0_r_f1, + &q0_r_f1, &q1_r_f1, &q2_r_f1); + + /* f2 function */ + PACK_LEFT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, + &p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l, + &q4_l, &q5_l, &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, + &p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r, + &q4_r, &q5_r, &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p6_r], (%[sp6]) \n\t" + "sb %[p5_r], (%[sp5]) \n\t" + "sb %[p4_r], (%[sp4]) \n\t" + "sb %[p3_r], (%[sp3]) \n\t" + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + + : + : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), + [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), + [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), + [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), + [p0_r] "r" (p0_r), [sp0] "r" (sp0) + ); + + __asm__ __volatile__ ( + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + "sb %[q3_r], (%[sq3]) \n\t" + "sb %[q4_r], (%[sq4]) \n\t" + "sb %[q5_r], (%[sq5]) \n\t" + "sb %[q6_r], (%[sq6]) \n\t" + + : + : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), + [q6_r] "r" (q6_r), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2), + [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5), + [sq6] "r" (sq6) + ); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r_f1], (%[sp2]) \n\t" + "sb %[p1_r_f1], (%[sp1]) \n\t" + "sb %[p0_r_f1], (%[sp0]) \n\t" + "sb %[q0_r_f1], (%[sq0]) \n\t" + "sb %[q1_r_f1], (%[sq1]) \n\t" + "sb %[q2_r_f1], (%[sq2]) \n\t" + + : + : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), + [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), + [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), + [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), + [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r), + [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r), + [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), + [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r) + : + ); + + __asm__ __volatile__ ( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1), + [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1), + [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p6_r], +1(%[sp6]) \n\t" + "sb %[p5_r], +1(%[sp5]) \n\t" + "sb %[p4_r], +1(%[sp4]) \n\t" + "sb %[p3_r], +1(%[sp3]) \n\t" + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + + : + : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), + [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), + [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5), + [sp4] "r" (sp4), [sp3] "r" (sp3), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) + ); + + __asm__ __volatile__ ( + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + "sb %[q3_r], +1(%[sq3]) \n\t" + "sb %[q4_r], +1(%[sq4]) \n\t" + "sb %[q5_r], +1(%[sq5]) \n\t" + "sb %[q6_r], +1(%[sq6]) \n\t" + + : + : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), + [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1), + [sq2] "r" (sq2), [sq3] "r" (sq3), + [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6) + ); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r_f1], +1(%[sp2]) \n\t" + "sb %[p1_r_f1], +1(%[sp1]) \n\t" + "sb %[p0_r_f1], +1(%[sp0]) \n\t" + "sb %[q0_r_f1], +1(%[sq0]) \n\t" + "sb %[q1_r_f1], +1(%[sq1]) \n\t" + "sb %[q2_r_f1], +1(%[sq2]) \n\t" + + : + : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), + [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), + [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), + [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p6_l], +2(%[sp6]) \n\t" + "sb %[p5_l], +2(%[sp5]) \n\t" + "sb %[p4_l], +2(%[sp4]) \n\t" + "sb %[p3_l], +2(%[sp3]) \n\t" + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + + : + : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), + [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), + [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5), + [sp4] "r" (sp4), [sp3] "r" (sp3), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0) + ); + + __asm__ __volatile__ ( + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + "sb %[q3_l], +2(%[sq3]) \n\t" + "sb %[q4_l], +2(%[sq4]) \n\t" + "sb %[q5_l], +2(%[sq5]) \n\t" + "sb %[q6_l], +2(%[sq6]) \n\t" + + : + : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), + [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1), + [sq2] "r" (sq2), [sq3] "r" (sq3), + [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6) + ); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l_f1], +2(%[sp2]) \n\t" + "sb %[p1_l_f1], +2(%[sp1]) \n\t" + "sb %[p0_l_f1], +2(%[sp0]) \n\t" + "sb %[q0_l_f1], +2(%[sq0]) \n\t" + "sb %[q1_l_f1], +2(%[sq1]) \n\t" + "sb %[q2_l_f1], +2(%[sq2]) \n\t" + + : + : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), + [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), + [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0), + [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + + __asm__ __volatile__ ( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l), + [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l), + [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l), + [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l) + : + ); + + __asm__ __volatile__ ( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1), + [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1), + [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p6_l], +3(%[sp6]) \n\t" + "sb %[p5_l], +3(%[sp5]) \n\t" + "sb %[p4_l], +3(%[sp4]) \n\t" + "sb %[p3_l], +3(%[sp3]) \n\t" + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + + : + : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), + [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), + [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5), + [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2), + [sp1] "r" (sp1), [sp0] "r" (sp0) + ); + + __asm__ __volatile__ ( + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + "sb %[q3_l], +3(%[sq3]) \n\t" + "sb %[q4_l], +3(%[sq4]) \n\t" + "sb %[q5_l], +3(%[sq5]) \n\t" + "sb %[q6_l], +3(%[sq6]) \n\t" + + : + : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), + [q2_l] "r" (q2_l), [q3_l] "r" (q3_l), + [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2), + [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5), + [q6_l] "r" (q6_l), [sq6] "r" (sq6) + ); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l_f1], +3(%[sp2]) \n\t" + "sb %[p1_l_f1], +3(%[sp1]) \n\t" + "sb %[p0_l_f1], +3(%[sp0]) \n\t" + "sb %[q0_l_f1], +3(%[sq0]) \n\t" + "sb %[q1_l_f1], +3(%[sq1]) \n\t" + "sb %[q2_l_f1], +3(%[sq2]) \n\t" + + : + : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), + [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), + [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), + [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [sp1] "r" (sp1), [sp0] "r" (sp0), + [sq0] "r" (sq0), [sq1] "r" (sq1) + ); + } + } + + s = s + 4; + } +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c new file mode 100644 index 0000000..9e9171c --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c @@ -0,0 +1,840 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h" +#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h" + +#if HAVE_DSPR2 +void vp9_mb_lpf_vertical_edge_w_dspr2(uint8_t *s, + int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat, flat2; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__ ( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), + [limit_vec] "=r" (limit_vec) + : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) + ); + + vp9_prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__ ( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[p4], -8(%[s1]) \n\t" + "lw %[p5], -8(%[s2]) \n\t" + "lw %[p6], -8(%[s3]) \n\t" + "lw %[p7], -8(%[s4]) \n\t" + + : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), + [p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6), + [p5] "=&r" (p5), [p4] "=&r" (p4) + : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) + ); + + __asm__ __volatile__ ( + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + "lw %[q7], +4(%[s1]) \n\t" + "lw %[q6], +4(%[s2]) \n\t" + "lw %[q5], +4(%[s3]) \n\t" + "lw %[q4], +4(%[s4]) \n\t" + + : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), + [q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6), + [q5] "=&r" (q5), [q4] "=&r" (q4) + : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) + ); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* transpose p7, p6, p5, p4 + original (when loaded from memory) + register -8 -7 -6 -5 + p4 p4_0 p4_1 p4_2 p4_3 + p5 p5_0 p5_1 p5_2 p5_3 + p6 p6_0 p6_1 p6_2 p6_3 + p7 p7_0 p7_1 p7_2 p7_3 + + after transpose + register + p4 p7_3 p6_3 p5_3 p4_3 + p5 p7_2 p6_2 p5_2 p4_2 + p6 p7_1 p6_1 p5_1 p4_1 + p7 p7_0 p6_0 p5_0 p4_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" + "precr.qb.ph %[prim4], %[p6], %[p7] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p4], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p6], %[p7], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p7], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + /* transpose q4, q5, q6, q7 + original (when loaded from memory) + register +5 +6 +7 +8 + q7 q7_0 q7_1 q7_2 q7_3 + q6 q6_0 q6_1 q6_2 q6_3 + q5 q5_0 q5_1 q5_2 q5_3 + q4 q4_0 q4_1 q4_2 q4_3 + + after transpose + register + q7 q4_3 q5_3 q26_3 q7_3 + q6 q4_2 q5_2 q26_2 q7_2 + q5 q4_1 q5_1 q26_1 q7_1 + q4 q4_0 q5_0 q26_0 q7_0 + */ + __asm__ __volatile__ ( + "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" + "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" + "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" + "precr.qb.ph %[prim4], %[q5], %[q4] \n\t" + + "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q7], %[q6], %[sec3] \n\t" + "precrq.ph.w %[q5], %[q4], %[sec4] \n\t" + "append %[q6], %[sec3], 16 \n\t" + "append %[q4], %[sec4], 16 \n\t" + + : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), + [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), + [q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4), + [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) + : + ); + + vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, + p1, p0, p3, p2, q0, q1, q2, q3, + &hev, &mask, &flat); + + vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, + &p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l, + &q4_l, &q5_l, &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, + &p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r, + &q4_r, &q5_r, &q6_r, &q7_r); + + STORE_F2() + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 */ + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [s4] "r" (s4) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s4] "r" (s4) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r), + [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r), + [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r), + [s3] "r" (s3) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s3] "r" (s3) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [s2] "r" (s2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s2] "r" (s2) + ); + } + + __asm__ __volatile__ ( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l), + [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l), + [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [s1] "r" (s1) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s1] "r" (s1) + ); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0+f1+f2 */ + vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1, + &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + PACK_LEFT_0TO3() + vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, + q0_l, q1_l, q2_l, q3_l, + &p2_l_f1, &p1_l_f1, &p0_l_f1, + &q0_l_f1, &q1_l_f1, &q2_l_f1); + + PACK_RIGHT_0TO3() + vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, + q0_r, q1_r, q2_r, q3_r, + &p2_r_f1, &p1_r_f1, &p0_r_f1, + &q0_r_f1, &q1_r_f1, &q2_r_f1); + + PACK_LEFT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, + &p3_l, &p2_l, &p1_l, &p0_l, + &q0_l, &q1_l, &q2_l, &q3_l, + &q4_l, &q5_l, &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, + &p3_r, &p2_r, &p1_r, &p0_r, + &q0_r, &q1_r, &q2_r, &q3_r, + &q4_r, &q5_r, &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p6_r], -7(%[s4]) \n\t" + "sb %[p5_r], -6(%[s4]) \n\t" + "sb %[p4_r], -5(%[s4]) \n\t" + "sb %[p3_r], -4(%[s4]) \n\t" + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + + : + : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), + [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), + [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), + [p0_r] "r" (p0_r), [s4] "r" (s4) + ); + + __asm__ __volatile__ ( + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + "sb %[q3_r], +3(%[s4]) \n\t" + "sb %[q4_r], +4(%[s4]) \n\t" + "sb %[q5_r], +5(%[s4]) \n\t" + "sb %[q6_r], +6(%[s4]) \n\t" + + : + : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), + [q2_r] "r" (q2_r), [q3_r] "r" (q3_r), + [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), + [q6_r] "r" (q6_r), [s4] "r" (s4) + ); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p2_r_f1], -3(%[s4]) \n\t" + "sb %[p1_r_f1], -2(%[s4]) \n\t" + "sb %[p0_r_f1], -1(%[s4]) \n\t" + "sb %[q0_r_f1], (%[s4]) \n\t" + "sb %[q1_r_f1], +1(%[s4]) \n\t" + "sb %[q2_r_f1], +2(%[s4]) \n\t" + + : + : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), + [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), + [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), + [s4] "r" (s4) + ); + } else if (mask & 0x000000FF) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s4] "r" (s4) + ); + } + + __asm__ __volatile__ ( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), + [q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r), + [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r), + [q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r), + [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r), + [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), + [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r) + : + ); + + __asm__ __volatile__ ( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1), + [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1), + [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p6_r], -7(%[s3]) \n\t" + "sb %[p5_r], -6(%[s3]) \n\t" + "sb %[p4_r], -5(%[s3]) \n\t" + "sb %[p3_r], -4(%[s3]) \n\t" + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + + : + : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r), + [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), + [p0_r] "r" (p0_r), [s3] "r" (s3) + ); + + __asm__ __volatile__ ( + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + "sb %[q3_r], +3(%[s3]) \n\t" + "sb %[q4_r], +4(%[s3]) \n\t" + "sb %[q5_r], +5(%[s3]) \n\t" + "sb %[q6_r], +6(%[s3]) \n\t" + + : + : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), + [q2_r] "r" (q2_r), [q3_r] "r" (q3_r), + [q4_r] "r" (q4_r), [q5_r] "r" (q5_r), + [q6_r] "r" (q6_r), [s3] "r" (s3) + ); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p2_r_f1], -3(%[s3]) \n\t" + "sb %[p1_r_f1], -2(%[s3]) \n\t" + "sb %[p0_r_f1], -1(%[s3]) \n\t" + "sb %[q0_r_f1], (%[s3]) \n\t" + "sb %[q1_r_f1], +1(%[s3]) \n\t" + "sb %[q2_r_f1], +2(%[s3]) \n\t" + + : + : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1), + [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1), + [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1), + [s3] "r" (s3) + ); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s3] "r" (s3) + ); + } + + __asm__ __volatile__ ( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p6_l], -7(%[s2]) \n\t" + "sb %[p5_l], -6(%[s2]) \n\t" + "sb %[p4_l], -5(%[s2]) \n\t" + "sb %[p3_l], -4(%[s2]) \n\t" + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + + : + : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), + [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), + [p0_l] "r" (p0_l), [s2] "r" (s2) + ); + + __asm__ __volatile__ ( + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + "sb %[q3_l], +3(%[s2]) \n\t" + "sb %[q4_l], +4(%[s2]) \n\t" + "sb %[q5_l], +5(%[s2]) \n\t" + "sb %[q6_l], +6(%[s2]) \n\t" + + : + : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), + [q6_l] "r" (q6_l), [s2] "r" (s2) + ); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p2_l_f1], -3(%[s2]) \n\t" + "sb %[p1_l_f1], -2(%[s2]) \n\t" + "sb %[p0_l_f1], -1(%[s2]) \n\t" + "sb %[q0_l_f1], (%[s2]) \n\t" + "sb %[q1_l_f1], +1(%[s2]) \n\t" + "sb %[q2_l_f1], +2(%[s2]) \n\t" + + : + : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), + [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), + [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), + [s2] "r" (s2) + ); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s2] "r" (s2) + ); + } + + __asm__ __volatile__ ( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l), + [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l), + [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l), + [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l), + [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l) + : + ); + + __asm__ __volatile__ ( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1), + [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1), + [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1), + [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0), + [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0) + : + ); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p6_l], -7(%[s1]) \n\t" + "sb %[p5_l], -6(%[s1]) \n\t" + "sb %[p4_l], -5(%[s1]) \n\t" + "sb %[p3_l], -4(%[s1]) \n\t" + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + + : + : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l), + [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), + [p0_l] "r" (p0_l), + [s1] "r" (s1) + ); + + __asm__ __volatile__ ( + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], 1(%[s1]) \n\t" + "sb %[q2_l], 2(%[s1]) \n\t" + "sb %[q3_l], 3(%[s1]) \n\t" + "sb %[q4_l], 4(%[s1]) \n\t" + "sb %[q5_l], 5(%[s1]) \n\t" + "sb %[q6_l], 6(%[s1]) \n\t" + + : + : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l), + [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l), + [q6_l] "r" (q6_l), + [s1] "r" (s1) + ); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p2_l_f1], -3(%[s1]) \n\t" + "sb %[p1_l_f1], -2(%[s1]) \n\t" + "sb %[p0_l_f1], -1(%[s1]) \n\t" + "sb %[q0_l_f1], (%[s1]) \n\t" + "sb %[q1_l_f1], +1(%[s1]) \n\t" + "sb %[q2_l_f1], +2(%[s1]) \n\t" + + : + : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1), + [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1), + [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1), + [s1] "r" (s1) + ); + } else if (mask & 0xFF000000) { + __asm__ __volatile__ ( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), + [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0), + [s1] "r" (s1) + ); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c index 0d65651..d298160 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -79,6 +79,57 @@ static void setup_mi(VP9_COMMON *cm) { vp9_update_mode_info_border(cm, cm->prev_mip); } +int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) { + const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); + const int ss_x = cm->subsampling_x; + const int ss_y = cm->subsampling_y; + int mi_size; + + if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, + VP9BORDERINPIXELS) < 0) + goto fail; + + set_mb_mi(cm, aligned_width, aligned_height); + + // Allocation + mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE); + + vpx_free(cm->mip); + cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); + if (!cm->mip) + goto fail; + + vpx_free(cm->prev_mip); + cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); + if (!cm->prev_mip) + goto fail; + + vpx_free(cm->mi_grid_base); + cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); + if (!cm->mi_grid_base) + goto fail; + + vpx_free(cm->prev_mi_grid_base); + cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); + if (!cm->prev_mi_grid_base) + goto fail; + + setup_mi(cm); + + // Create the segmentation map structure and set to 0. + vpx_free(cm->last_frame_seg_map); + cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); + if (!cm->last_frame_seg_map) + goto fail; + + return 0; + + fail: + vp9_free_frame_buffers(cm); + return 1; +} + int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { int i; diff --git a/libvpx/vp9/common/vp9_alloccommon.h b/libvpx/vp9/common/vp9_alloccommon.h index 5d5fae9..cf8dca5 100644 --- a/libvpx/vp9/common/vp9_alloccommon.h +++ b/libvpx/vp9/common/vp9_alloccommon.h @@ -21,6 +21,7 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi); void vp9_create_common(VP9_COMMON *cm); void vp9_remove_common(VP9_COMMON *cm); +int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height); int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height); void vp9_free_frame_buffers(VP9_COMMON *cm); diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index d0d4852..c5da375 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -82,9 +82,8 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { #define INTER_MODES (1 + NEWMV - NEARESTMV) -static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) { - return (mode - NEARESTMV); -} +#define INTER_OFFSET(mode) ((mode) - NEARESTMV) + /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there @@ -171,9 +170,9 @@ struct buf_2d { }; struct macroblockd_plane { - DECLARE_ALIGNED(16, int16_t, qcoeff[64 * 64]); - DECLARE_ALIGNED(16, int16_t, dqcoeff[64 * 64]); - DECLARE_ALIGNED(16, uint16_t, eobs[256]); + int16_t *qcoeff; + int16_t *dqcoeff; + uint16_t *eobs; PLANE_TYPE plane_type; int subsampling_x; int subsampling_y; @@ -216,13 +215,6 @@ typedef struct macroblockd { int corrupted; - unsigned char sb_index; // index of 32x32 block inside the 64x64 block - unsigned char mb_index; // index of 16x16 block inside the 32x32 block - unsigned char b_index; // index of 8x8 block inside the 16x16 block - unsigned char ab_index; // index of 4x4 block inside the 8x8 block - - int q_index; - /* Y,U,V,(A) */ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; @@ -457,57 +449,46 @@ static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, } } } -static void set_contexts_on_border(const MACROBLOCKD *xd, - struct macroblockd_plane *pd, - BLOCK_SIZE plane_bsize, - int tx_size_in_blocks, int has_eob, - int aoff, int loff, - ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { - int mi_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize]; - int mi_blocks_high = num_4x4_blocks_high_lookup[plane_bsize]; - int above_contexts = tx_size_in_blocks; - int left_contexts = tx_size_in_blocks; - int pt; - - // xd->mb_to_right_edge is in units of pixels * 8. This converts - // it to 4x4 block sizes. - if (xd->mb_to_right_edge < 0) - mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); - - if (xd->mb_to_bottom_edge < 0) - mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - - // this code attempts to avoid copying into contexts that are outside - // our border. Any blocks that do are set to 0... - if (above_contexts + aoff > mi_blocks_wide) - above_contexts = mi_blocks_wide - aoff; - - if (left_contexts + loff > mi_blocks_high) - left_contexts = mi_blocks_high - loff; - - for (pt = 0; pt < above_contexts; pt++) - A[pt] = has_eob; - for (pt = above_contexts; pt < tx_size_in_blocks; pt++) - A[pt] = 0; - for (pt = 0; pt < left_contexts; pt++) - L[pt] = has_eob; - for (pt = left_contexts; pt < tx_size_in_blocks; pt++) - L[pt] = 0; -} static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff) { - ENTROPY_CONTEXT *const A = pd->above_context + aoff; - ENTROPY_CONTEXT *const L = pd->left_context + loff; + ENTROPY_CONTEXT *const a = pd->above_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_context + loff; const int tx_size_in_blocks = 1 << tx_size; - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - set_contexts_on_border(xd, pd, plane_bsize, tx_size_in_blocks, has_eob, - aoff, loff, A, L); + // above + if (has_eob && xd->mb_to_right_edge < 0) { + int i; + const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] + + (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + int above_contexts = tx_size_in_blocks; + if (above_contexts + aoff > blocks_wide) + above_contexts = blocks_wide - aoff; + + for (i = 0; i < above_contexts; ++i) + a[i] = has_eob; + for (i = above_contexts; i < tx_size_in_blocks; ++i) + a[i] = 0; + } else { + vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } + + // left + if (has_eob && xd->mb_to_bottom_edge < 0) { + int i; + const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] + + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + int left_contexts = tx_size_in_blocks; + if (left_contexts + loff > blocks_high) + left_contexts = blocks_high - loff; + + for (i = 0; i < left_contexts; ++i) + l[i] = has_eob; + for (i = left_contexts; i < tx_size_in_blocks; ++i) + l[i] = 0; } else { - vpx_memset(A, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - vpx_memset(L, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); } } diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index d3a867c..feceb66 100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -37,15 +37,78 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { }; DECLARE_ALIGNED(16, const uint8_t, - vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = { + vp9_coefband_trans_8x8plus[1024]) = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 5 + 4, 4, 4, 4, 4, 5, + // beyond MAXBAND_INDEX+1 all values are filled as 5 + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; DECLARE_ALIGNED(16, const uint8_t, - vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = { + vp9_coefband_trans_4x4[16]) = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 5, 5, 5, 5, 5, 5 }; DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = { @@ -332,7 +395,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, if (l >= 3 && k == 0) continue; vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct, - coef_counts[i][j][k][l], 0); + coef_counts[i][j][k][l]); branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; for (m = 0; m < UNCONSTRAINED_NODES; ++m) dst_coef_probs[i][j][k][l][m] = merge_probs( diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index c58e852..e133d65 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -51,7 +51,7 @@ extern const vp9_tree_index vp9_coefmodel_tree[]; extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; typedef struct { - vp9_tree_index *tree; + const vp9_tree_index *tree; const vp9_prob *prob; int len; int base_val; @@ -120,17 +120,15 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { // This is the index in the scan order beyond which all coefficients for // 8x8 transform and above are in the top band. -// For 4x4 blocks the index is less but to keep things common the lookup -// table for 4x4 is padded out to this index. +// This macro is currently unused but may be used by certain implementations #define MAXBAND_INDEX 21 -extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]; -extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]; +extern const uint8_t vp9_coefband_trans_8x8plus[1024]; +extern const uint8_t vp9_coefband_trans_4x4[16]; - -static int get_coef_band(const uint8_t * band_translate, int coef_index) { - return (coef_index > MAXBAND_INDEX) - ? (COEF_BANDS-1) : band_translate[coef_index]; +static const uint8_t *get_band_translate(TX_SIZE tx_size) { + return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 + : vp9_coefband_trans_8x8plus; } // 128 lists of probabilities are stored for the following ONE node probs: @@ -181,11 +179,6 @@ static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, return combine_entropy_contexts(above_ec, left_ec); } -static const uint8_t *get_band_translate(TX_SIZE tx_size) { - return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 - : vp9_coefband_trans_8x8plus; -} - static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, PLANE_TYPE type, int block_idx, const int16_t **scan, const int16_t **scan_nb) { diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index a963d55..3b2510d 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -235,9 +235,9 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = { struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = { - -ZEROMV, 2, - -NEARESTMV, 4, - -NEARMV, -NEWMV + -INTER_OFFSET(ZEROMV), 2, + -INTER_OFFSET(NEARESTMV), 4, + -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV) }; struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; @@ -343,8 +343,7 @@ void vp9_entropy_mode_init() { vp9_tokens_from_tree(vp9_switchable_interp_encodings, vp9_switchable_interp_tree); vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree); - vp9_tokens_from_tree_offset(vp9_inter_mode_encodings, - vp9_inter_mode_tree, NEARESTMV); + vp9_tokens_from_tree(vp9_inter_mode_encodings, vp9_inter_mode_tree); } #define COUNT_SAT 20 @@ -356,9 +355,9 @@ static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, - unsigned int offset, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, offset, - COUNT_SAT, MAX_UPDATE_FACTOR, probs); + vp9_prob *probs) { + tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, + probs); } void vp9_adapt_mode_probs(VP9_COMMON *cm) { @@ -383,25 +382,24 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { for (i = 0; i < INTER_MODE_CONTEXTS; i++) adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i], - counts->inter_mode[i], NEARESTMV, fc->inter_mode_probs[i]); + counts->inter_mode[i], fc->inter_mode_probs[i]); for (i = 0; i < BLOCK_SIZE_GROUPS; i++) adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i], - counts->y_mode[i], 0, fc->y_mode_prob[i]); + counts->y_mode[i], fc->y_mode_prob[i]); for (i = 0; i < INTRA_MODES; ++i) adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i], - counts->uv_mode[i], 0, fc->uv_mode_prob[i]); + counts->uv_mode[i], fc->uv_mode_prob[i]); for (i = 0; i < PARTITION_CONTEXTS; i++) adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i], - counts->partition[i], 0, fc->partition_prob[i]); + counts->partition[i], fc->partition_prob[i]); if (cm->mcomp_filter_type == SWITCHABLE) { for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i], - counts->switchable_interp[i], 0, - fc->switchable_interp_prob[i]); + counts->switchable_interp[i], fc->switchable_interp_prob[i]); } if (cm->tx_mode == TX_MODE_SELECT) { diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c index b061cdb..290dcdd 100644 --- a/libvpx/vp9/common/vp9_entropymv.c +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -196,8 +196,8 @@ static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, 0, - MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, probs); + tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, + probs); } void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { @@ -207,8 +207,7 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc; const nmv_context_counts *counts = &cm->counts.mv; - adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, - fc->joints); + adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints); for (i = 0; i < 2; ++i) { nmv_component *comp = &fc->comps[i]; diff --git a/libvpx/vp9/common/vp9_extend.c b/libvpx/vp9/common/vp9_extend.c index 07c68c8..836bf0e 100644 --- a/libvpx/vp9/common/vp9_extend.c +++ b/libvpx/vp9/common/vp9_extend.c @@ -62,7 +62,7 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int et_y = 16; const int el_y = 16; // Motion estimation may use src block variance with the block size up - // to 64x64, so the right and bottom need to be extended to 64 mulitple + // to 64x64, so the right and bottom need to be extended to 64 multiple // or up to 16, whichever is greater. const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width, 16); diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index 218e12e..ff504a1 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -32,6 +32,8 @@ typedef struct { uint16_t left_uv[TX_SIZES]; uint16_t above_uv[TX_SIZES]; uint16_t int_4x4_uv; + uint8_t lfl_y[64]; + uint8_t lfl_uv[16]; } LOOP_FILTER_MASK; // 64 bit masks for left transform size. Each 1 represents a position where @@ -322,20 +324,14 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { } } -static int build_lfi(const loop_filter_info_n *lfi_n, - const MB_MODE_INFO *mbmi, - const loop_filter_thresh **lfi) { +static uint8_t build_lfi(const loop_filter_info_n *lfi_n, + const MB_MODE_INFO *mbmi) { const int seg = mbmi->segment_id; const int ref = mbmi->ref_frame[0]; const int mode = lfi_n->mode_lf_lut[mbmi->mode]; const int filter_level = lfi_n->lvl[seg][ref][mode]; - if (filter_level > 0) { - *lfi = &lfi_n->lfthr[filter_level]; - return 1; - } else { - return 0; - } + return filter_level; } static void filter_selectively_vert(uint8_t *s, int pitch, @@ -343,12 +339,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch, unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, - const loop_filter_thresh **p_lfi) { + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { unsigned int mask; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= 1) { - const loop_filter_thresh *lfi = *p_lfi; + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; if (mask & 1) { if (mask_16x16 & 1) { @@ -373,7 +370,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch, vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); s += 8; - p_lfi++; + lfl += 1; mask_16x16 >>= 1; mask_8x8 >>= 1; mask_4x4 >>= 1; @@ -386,49 +383,112 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, - int only_4x4_1, - const loop_filter_thresh **p_lfi) { + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { unsigned int mask; int count; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= count) { - const loop_filter_thresh *lfi = *p_lfi; + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; count = 1; if (mask & 1) { - if (!only_4x4_1) { - if (mask_16x16 & 1) { - if ((mask_16x16 & 3) == 3) { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); - count = 2; + if (mask_16x16 & 1) { + if ((mask_16x16 & 3) == 3) { + vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + count = 2; + } else { + vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } + assert(!(mask_8x8 & 1)); + assert(!(mask_4x4 & 1)); + assert(!(mask_4x4_int & 1)); + } else if (mask_8x8 & 1) { + if ((mask_8x8 & 3) == 3) { + // Next block's thresholds + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); + + // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. + vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + vp9_mbloop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, 1); + + if ((mask_4x4_int & 3) == 3) { + // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. + vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, + lfin->mblim, lfin->lim, + lfin->hev_thr, 1); } else { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + if (mask_4x4_int & 1) + vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + else if (mask_4x4_int & 2) + vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, + lfin->mblim, lfin->lim, + lfin->hev_thr, 1); } - assert(!(mask_8x8 & 1)); - assert(!(mask_4x4 & 1)); - assert(!(mask_4x4_int & 1)); - } else if (mask_8x8 & 1) { + count = 2; + } else { vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - assert(!(mask_16x16 & 1)); - assert(!(mask_4x4 & 1)); - } else if (mask_4x4 & 1) { - vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - assert(!(mask_16x16 & 1)); - assert(!(mask_8x8 & 1)); + + if (mask_4x4_int & 1) + vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); } - } + assert(!(mask_16x16 & 1)); + assert(!(mask_4x4 & 1)); + } else if (mask_4x4 & 1) { + if ((mask_4x4 & 3) == 3) { + // Next block's thresholds + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); - if (mask_4x4_int & 1) + // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. + vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + vp9_loop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, lfin->lim, + lfin->hev_thr, 1); + + if ((mask_4x4_int & 3) == 3) { + // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. + vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, + lfin->mblim, lfin->lim, + lfin->hev_thr, 1); + } else { + if (mask_4x4_int & 1) + vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + else if (mask_4x4_int & 2) + vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, + lfin->mblim, lfin->lim, + lfin->hev_thr, 1); + } + count = 2; + } else { + vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + + if (mask_4x4_int & 1) + vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + } + assert(!(mask_16x16 & 1)); + assert(!(mask_8x8 & 1)); + } else if (mask_4x4_int & 1) { vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + } } s += 8 * count; - p_lfi += count; + lfl += count; mask_16x16 >>= count; mask_8x8 >>= count; mask_4x4 >>= count; @@ -461,10 +521,20 @@ static void build_masks(const loop_filter_info_n *const lfi_n, uint16_t *left_uv = &lfm->left_uv[tx_size_uv]; uint16_t *above_uv = &lfm->above_uv[tx_size_uv]; uint16_t *int_4x4_uv = &lfm->int_4x4_uv; + int i; + int w = num_8x8_blocks_wide_lookup[block_size]; + int h = num_8x8_blocks_high_lookup[block_size]; // If filter level is 0 we don't loop filter. - if (!filter_level) + if (!filter_level) { return; + } else { + int index = shift_y; + for (i = 0; i < h; i++) { + vpx_memset(&lfm->lfl_y[index], filter_level, w); + index += 8; + } + } // These set 1 in the current block size for the block size edges. // For instance if the block size is 32x16, we'll set : @@ -530,9 +600,19 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, uint64_t *left_y = &lfm->left_y[tx_size_y]; uint64_t *above_y = &lfm->above_y[tx_size_y]; uint64_t *int_4x4_y = &lfm->int_4x4_y; + int i; + int w = num_8x8_blocks_wide_lookup[block_size]; + int h = num_8x8_blocks_high_lookup[block_size]; - if (!filter_level) + if (!filter_level) { return; + } else { + int index = shift_y; + for (i = 0; i < h; i++) { + vpx_memset(&lfm->lfl_y[index], filter_level, w); + index += 8; + } + } *above_y |= above_prediction_mask[block_size] << shift_y; *left_y |= left_prediction_mask[block_size] << shift_y; @@ -785,6 +865,7 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, } } } + #if CONFIG_NON420 static void filter_block_plane_non420(VP9_COMMON *cm, struct macroblockd_plane *plane, @@ -801,7 +882,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, unsigned int mask_8x8[MI_BLOCK_SIZE] = {0}; unsigned int mask_4x4[MI_BLOCK_SIZE] = {0}; unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; - const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE]; int r, c; for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { @@ -830,7 +911,8 @@ static void filter_block_plane_non420(VP9_COMMON *cm, const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; // Filter level can vary per MI - if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x])) + if (!(lfl[(r << 3) + (c >> ss_x)] = + build_lfi(&cm->lf_info, &mi[0].mbmi))) continue; // Build masks based on the transform size of each block @@ -887,7 +969,8 @@ static void filter_block_plane_non420(VP9_COMMON *cm, mask_16x16_c & border_mask, mask_8x8_c & border_mask, mask_4x4_c & border_mask, - mask_4x4_int[r], lfi[r]); + mask_4x4_int[r], + &cm->lf_info, &lfl[r << 3]); dst->buf += 8 * dst->stride; mi_8x8 += row_step_stride; } @@ -898,11 +981,26 @@ static void filter_block_plane_non420(VP9_COMMON *cm, const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r]; + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16[r]; + mask_8x8_r = mask_8x8[r]; + mask_4x4_r = mask_4x4[r]; + } + filter_selectively_horiz(dst->buf, dst->stride, - mask_16x16[r], - mask_8x8[r], - mask_4x4[r], - mask_4x4_int_r, mi_row + r == 0, lfi[r]); + mask_16x16_r, + mask_8x8_r, + mask_4x4_r, + mask_4x4_int_r, + &cm->lf_info, &lfl[r << 3]); dst->buf += 8 * dst->stride; } } @@ -910,81 +1008,137 @@ static void filter_block_plane_non420(VP9_COMMON *cm, static void filter_block_plane(VP9_COMMON *const cm, struct macroblockd_plane *const plane, - MODE_INFO **mi_8x8, - int mi_row, int mi_col, + int mi_row, LOOP_FILTER_MASK *lfm) { - const int ss_x = plane->subsampling_x; - const int ss_y = plane->subsampling_y; - const int row_step = 1 << ss_x; - const int col_step = 1 << ss_y; - const int row_step_stride = cm->mode_info_stride * row_step; struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; - unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; - const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + unsigned int mask_4x4_int_row[MI_BLOCK_SIZE] = {0}; int r, c; - int row_shift = 3 - ss_x; - int row_mask = 0xff >> (ss_x << 2); -#define MASK_ROW(value) ((value >> (r_sampled << row_shift)) & row_mask) + if (!plane->plane_type) { + uint64_t mask_16x16 = lfm->left_y[TX_16X16]; + uint64_t mask_8x8 = lfm->left_y[TX_8X8]; + uint64_t mask_4x4 = lfm->left_y[TX_4X4]; + uint64_t mask_4x4_int = lfm->int_4x4_y; - for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { - int r_sampled = r >> ss_x; + // Vertical pass + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) { + mask_4x4_int_row[r] = mask_4x4_int & 0xff; - // Determine the vertical edges that need filtering - for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { - const MODE_INFO *mi = mi_8x8[c]; - - build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]); - } - if (!plane->plane_type) { - mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y); // Disable filtering on the leftmost column filter_selectively_vert(dst->buf, dst->stride, - MASK_ROW(lfm->left_y[TX_16X16]), - MASK_ROW(lfm->left_y[TX_8X8]), - MASK_ROW(lfm->left_y[TX_4X4]), - MASK_ROW(lfm->int_4x4_y), - lfi[r]); - } else { - mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv); + mask_16x16 & 0xff, + mask_8x8 & 0xff, + mask_4x4 & 0xff, + mask_4x4_int_row[r], + &cm->lf_info, &lfm->lfl_y[r << 3]); + + dst->buf += 8 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + mask_4x4_int >>= 8; + } + + // Horizontal pass + dst->buf = dst0; + mask_16x16 = lfm->above_y[TX_16X16]; + mask_8x8 = lfm->above_y[TX_8X8]; + mask_4x4 = lfm->above_y[TX_4X4]; + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) { + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16 & 0xff; + mask_8x8_r = mask_8x8 & 0xff; + mask_4x4_r = mask_4x4 & 0xff; + } + + filter_selectively_horiz(dst->buf, dst->stride, + mask_16x16_r, + mask_8x8_r, + mask_4x4_r, + mask_4x4_int_row[r], + &cm->lf_info, &lfm->lfl_y[r << 3]); + + dst->buf += 8 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + } + } else { + uint16_t mask_16x16 = lfm->left_uv[TX_16X16]; + uint16_t mask_8x8 = lfm->left_uv[TX_8X8]; + uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; + uint16_t mask_4x4_int = lfm->int_4x4_uv; + + // Vertical pass + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + if (plane->plane_type == 1) { + for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) + lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)]; + } + + mask_4x4_int_row[r] = mask_4x4_int & 0xf; // Disable filtering on the leftmost column filter_selectively_vert(dst->buf, dst->stride, - MASK_ROW(lfm->left_uv[TX_16X16]), - MASK_ROW(lfm->left_uv[TX_8X8]), - MASK_ROW(lfm->left_uv[TX_4X4]), - MASK_ROW(lfm->int_4x4_uv), - lfi[r]); + mask_16x16 & 0xf, + mask_8x8 & 0xf, + mask_4x4 & 0xf, + mask_4x4_int_row[r], + &cm->lf_info, &lfm->lfl_uv[r << 1]); + + dst->buf += 8 * dst->stride; + mask_16x16 >>= 4; + mask_8x8 >>= 4; + mask_4x4 >>= 4; + mask_4x4_int >>= 4; } - dst->buf += 8 * dst->stride; - mi_8x8 += row_step_stride; - } - // Now do horizontal pass - dst->buf = dst0; - for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { - const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; - const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r]; - int r_sampled = r >> ss_x; + // Horizontal pass + dst->buf = dst0; + mask_16x16 = lfm->above_uv[TX_16X16]; + mask_8x8 = lfm->above_uv[TX_8X8]; + mask_4x4 = lfm->above_uv[TX_4X4]; + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1; + const unsigned int mask_4x4_int_r = skip_border_4x4_r ? + 0 : (mask_4x4_int_row[r]); + unsigned int mask_16x16_r; + unsigned int mask_8x8_r; + unsigned int mask_4x4_r; + + if (mi_row + r == 0) { + mask_16x16_r = 0; + mask_8x8_r = 0; + mask_4x4_r = 0; + } else { + mask_16x16_r = mask_16x16 & 0xf; + mask_8x8_r = mask_8x8 & 0xf; + mask_4x4_r = mask_4x4 & 0xf; + } - if (!plane->plane_type) { filter_selectively_horiz(dst->buf, dst->stride, - MASK_ROW(lfm->above_y[TX_16X16]), - MASK_ROW(lfm->above_y[TX_8X8]), - MASK_ROW(lfm->above_y[TX_4X4]), - MASK_ROW(lfm->int_4x4_y), - mi_row + r == 0, lfi[r]); - } else { - filter_selectively_horiz(dst->buf, dst->stride, - MASK_ROW(lfm->above_uv[TX_16X16]), - MASK_ROW(lfm->above_uv[TX_8X8]), - MASK_ROW(lfm->above_uv[TX_4X4]), + mask_16x16_r, + mask_8x8_r, + mask_4x4_r, mask_4x4_int_r, - mi_row + r == 0, lfi[r]); + &cm->lf_info, &lfm->lfl_uv[r << 1]); + + dst->buf += 8 * dst->stride; + mask_16x16 >>= 4; + mask_8x8 >>= 4; + mask_4x4 >>= 4; } - dst->buf += 8 * dst->stride; } -#undef MASK_ROW } void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, @@ -1017,8 +1171,7 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, #if CONFIG_NON420 if (use_420) #endif - filter_block_plane(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row, - mi_col, &lfm); + filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); #if CONFIG_NON420 else filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col, diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h index 19032bf..9190930 100644 --- a/libvpx/vp9/common/vp9_pred_common.h +++ b/libvpx/vp9/common/vp9_pred_common.h @@ -109,32 +109,40 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd); -static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context, +static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, const struct tx_probs *tx_probs) { - if (bsize < BLOCK_16X16) - return tx_probs->p8x8[context]; - else if (bsize < BLOCK_32X32) - return tx_probs->p16x16[context]; - else - return tx_probs->p32x32[context]; + switch (max_tx_size) { + case TX_8X8: + return tx_probs->p8x8[ctx]; + case TX_16X16: + return tx_probs->p16x16[ctx]; + case TX_32X32: + return tx_probs->p32x32[ctx]; + default: + assert(!"Invalid max_tx_size."); + return NULL; + } } -static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd, - const struct tx_probs *tx_probs, - const MODE_INFO *m) { - const BLOCK_SIZE bsize = m->mbmi.sb_type; - const int context = vp9_get_pred_context_tx_size(xd); - return get_tx_probs(bsize, context, tx_probs); +static const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd, + const struct tx_probs *tx_probs) { + const int ctx = vp9_get_pred_context_tx_size(xd); + return get_tx_probs(max_tx_size, ctx, tx_probs); } -static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context, +static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, struct tx_counts *tx_counts) { - if (bsize < BLOCK_16X16) - return tx_counts->p8x8[context]; - else if (bsize < BLOCK_32X32) - return tx_counts->p16x16[context]; - else - return tx_counts->p32x32[context]; + switch (max_tx_size) { + case TX_8X8: + return tx_counts->p8x8[ctx]; + case TX_16X16: + return tx_counts->p16x16[ctx]; + case TX_32X32: + return tx_counts->p32x32[ctx]; + default: + assert(!"Invalid max_tx_size."); + return NULL; + } } #endif // VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c index 1c96788..7cc66c8 100644 --- a/libvpx/vp9/common/vp9_reconinter.c +++ b/libvpx/vp9/common/vp9_reconinter.c @@ -117,16 +117,13 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, return clamped_mv; } -struct build_inter_predictors_args { - MACROBLOCKD *xd; - int x, y; -}; - -static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, - int pred_w, int pred_h, - void *argv) { - const struct build_inter_predictors_args* const arg = argv; - MACROBLOCKD *const xd = arg->xd; + +// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could +// calculate the subsampled BLOCK_SIZE, but that type isn't defined for +// sizes smaller than 16x16 yet. +static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, + BLOCK_SIZE bsize, int pred_w, int pred_h, + int mi_x, int mi_y) { struct macroblockd_plane *const pd = &xd->plane[plane]; const int bwl = b_width_log2(bsize) - pd->subsampling_x; const int bw = 4 << bwl; @@ -172,7 +169,7 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, if (vp9_is_scaled(scale->sfc)) { pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale); - scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x); + scale->sfc->set_scaled_offsets(scale, mi_y + y, mi_x + x); scaled_mv = scale->sfc->scale_mv(&mv_q4, scale); xs = scale->sfc->x_step_q4; ys = scale->sfc->y_step_q4; @@ -190,40 +187,25 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, } } -// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could -// calculate the subsampled BLOCK_SIZE, but that type isn't defined for -// sizes smaller than 16x16 yet. -typedef void (*foreach_predicted_block_visitor)(int plane, int block, - BLOCK_SIZE bsize, - int pred_w, int pred_h, - void *arg); -static INLINE void foreach_predicted_block_in_plane( - const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane, - foreach_predicted_block_visitor visit, void *arg) { - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; - - if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) { - int i = 0, x, y; - assert(bsize == BLOCK_8X8); - for (y = 0; y < 1 << bhl; ++y) - for (x = 0; x < 1 << bwl; ++x) - visit(plane, i++, bsize, 0, 0, arg); - } else { - visit(plane, 0, bsize, bwl, bhl, arg); - } -} - static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize, int mi_row, int mi_col, int plane_from, int plane_to) { int plane; for (plane = plane_from; plane <= plane_to; ++plane) { - struct build_inter_predictors_args args = { - xd, mi_col * MI_SIZE, mi_row * MI_SIZE, - }; - foreach_predicted_block_in_plane(xd, bsize, plane, build_inter_predictors, - &args); + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; + const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; + + if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) { + int i = 0, x, y; + assert(bsize == BLOCK_8X8); + for (y = 0; y < 1 << bhl; ++y) + for (x = 0; x < 1 << bwl; ++x) + build_inter_predictors(xd, plane, i++, bsize, 0, 0, mi_x, mi_y); + } else { + build_inter_predictors(xd, plane, 0, bsize, bwl, bhl, mi_x, mi_y); + } } } diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh index debec61..2c0864e 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.sh +++ b/libvpx/vp9/common/vp9_rtcd_defs.sh @@ -41,7 +41,7 @@ prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const ui specialize vp9_d63_predictor_4x4 $ssse3_x86inc prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_4x4 $ssse3_x86inc +specialize vp9_h_predictor_4x4 $ssse3_x86inc dspr2 prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_4x4 @@ -56,10 +56,10 @@ prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint specialize vp9_v_predictor_4x4 $sse_x86inc prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_4x4 $sse_x86inc +specialize vp9_tm_predictor_4x4 $sse_x86inc dspr2 prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_4x4 $sse_x86inc +specialize vp9_dc_predictor_4x4 $sse_x86inc dspr2 prototype void vp9_dc_top_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_4x4 @@ -80,7 +80,7 @@ prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const ui specialize vp9_d63_predictor_8x8 $ssse3_x86inc prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_8x8 $ssse3_x86inc +specialize vp9_h_predictor_8x8 $ssse3_x86inc dspr2 prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_8x8 @@ -95,10 +95,10 @@ prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint specialize vp9_v_predictor_8x8 $sse_x86inc prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_8x8 $sse2_x86inc +specialize vp9_tm_predictor_8x8 $sse2_x86inc dspr2 prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_8x8 $sse_x86inc +specialize vp9_dc_predictor_8x8 $sse_x86inc dspr2 prototype void vp9_dc_top_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_8x8 @@ -119,7 +119,7 @@ prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d63_predictor_16x16 $ssse3_x86inc prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_16x16 $ssse3_x86inc +specialize vp9_h_predictor_16x16 $ssse3_x86inc dspr2 prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_16x16 @@ -137,7 +137,7 @@ prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const u specialize vp9_tm_predictor_16x16 $sse2_x86inc prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_dc_predictor_16x16 $sse2_x86inc +specialize vp9_dc_predictor_16x16 $sse2_x86inc dspr2 prototype void vp9_dc_top_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_16x16 @@ -191,22 +191,22 @@ specialize vp9_dc_128_predictor_32x32 # Loopfilter # prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" -specialize vp9_mb_lpf_vertical_edge_w sse2 neon +specialize vp9_mb_lpf_vertical_edge_w sse2 neon dspr2 prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_vertical_edge sse2 neon +specialize vp9_mbloop_filter_vertical_edge sse2 neon dspr2 prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_loop_filter_vertical_edge mmx neon +specialize vp9_loop_filter_vertical_edge mmx neon dspr2 prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon +specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_horizontal_edge sse2 neon +specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_loop_filter_horizontal_edge mmx neon +specialize vp9_loop_filter_horizontal_edge mmx neon dspr2 # # post proc @@ -296,10 +296,10 @@ prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int specialize vp9_idct32x32_1024_add sse2 neon dspr2 prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_34_add sse2 +specialize vp9_idct32x32_34_add sse2 dspr2 prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_idct32x32_1_add sse2 dspr2 +specialize vp9_idct32x32_1_add sse2 neon dspr2 prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" specialize vp9_iht4x4_16_add sse2 neon dspr2 diff --git a/libvpx/vp9/common/vp9_treecoder.c b/libvpx/vp9/common/vp9_treecoder.c index 1805fb4..e2a5b9f 100644 --- a/libvpx/vp9/common/vp9_treecoder.c +++ b/libvpx/vp9/common/vp9_treecoder.c @@ -35,28 +35,20 @@ void vp9_tokens_from_tree(struct vp9_token *p, vp9_tree t) { tree2tok(p, t, 0, 0, 0); } -void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t, - int offset) { - tree2tok(p - offset, t, 0, 0, 0); -} - static unsigned int convert_distribution(unsigned int i, vp9_tree tree, unsigned int branch_ct[][2], - const unsigned int num_events[], - unsigned int tok0_offset) { + const unsigned int num_events[]) { unsigned int left, right; - if (tree[i] <= 0) { - left = num_events[-tree[i] - tok0_offset]; - } else { - left = convert_distribution(tree[i], tree, branch_ct, num_events, - tok0_offset); - } + if (tree[i] <= 0) + left = num_events[-tree[i]]; + else + left = convert_distribution(tree[i], tree, branch_ct, num_events); + if (tree[i + 1] <= 0) - right = num_events[-tree[i + 1] - tok0_offset]; + right = num_events[-tree[i + 1]]; else - right = convert_distribution(tree[i + 1], tree, branch_ct, num_events, - tok0_offset); + right = convert_distribution(tree[i + 1], tree, branch_ct, num_events); branch_ct[i >> 1][0] = left; branch_ct[i >> 1][1] = right; @@ -65,9 +57,8 @@ static unsigned int convert_distribution(unsigned int i, vp9_tree tree, void vp9_tree_probs_from_distribution(vp9_tree tree, unsigned int branch_ct[/* n-1 */][2], - const unsigned int num_events[/* n */], - unsigned int tok0_offset) { - convert_distribution(0, tree, branch_ct, num_events, tok0_offset); + const unsigned int num_events[/* n */]) { + convert_distribution(0, tree, branch_ct, num_events); } diff --git a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h index 9c776d6..a79b156 100644 --- a/libvpx/vp9/common/vp9_treecoder.h +++ b/libvpx/vp9/common/vp9_treecoder.h @@ -42,7 +42,6 @@ struct vp9_token { /* Construct encoding array from tree. */ void vp9_tokens_from_tree(struct vp9_token*, vp9_tree); -void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset); /* Convert array of token occurrence counts into a table of probabilities for the associated binary encoding tree. Also writes count of branches @@ -51,8 +50,7 @@ void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset); void vp9_tree_probs_from_distribution(vp9_tree tree, unsigned int branch_ct[ /* n - 1 */ ][2], - const unsigned int num_events[ /* n */ ], - unsigned int tok0_offset); + const unsigned int num_events[ /* n */ ]); static INLINE vp9_prob clip_prob(int p) { @@ -116,10 +114,10 @@ static unsigned int tree_merge_probs_impl(unsigned int i, static void tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, - const unsigned int *counts, int offset, + const unsigned int *counts, unsigned int count_sat, unsigned int max_update_factor, vp9_prob *probs) { - tree_merge_probs_impl(0, tree, pre_probs, &counts[-offset], + tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat, max_update_factor, probs); } diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index ccf5aac..2a33844 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -15,6 +15,16 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" +#define RECON_AND_STORE4X4(dest, in_x) \ +{ \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)dest = _mm_cvtsi128_si32(d0); \ + dest += stride; \ +} + void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); @@ -26,21 +36,19 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { __m128i input0, input1, input2, input3; // Rows - input0 = _mm_loadl_epi64((const __m128i *)input); - input1 = _mm_loadl_epi64((const __m128i *)(input + 4)); - input2 = _mm_loadl_epi64((const __m128i *)(input + 8)); - input3 = _mm_loadl_epi64((const __m128i *)(input + 12)); + input0 = _mm_load_si128((const __m128i *)input); + input2 = _mm_load_si128((const __m128i *)(input + 8)); // Construct i3, i1, i3, i1, i2, i0, i2, i0 input0 = _mm_shufflelo_epi16(input0, 0xd8); - input1 = _mm_shufflelo_epi16(input1, 0xd8); + input0 = _mm_shufflehi_epi16(input0, 0xd8); input2 = _mm_shufflelo_epi16(input2, 0xd8); - input3 = _mm_shufflelo_epi16(input3, 0xd8); + input2 = _mm_shufflehi_epi16(input2, 0xd8); + input1 = _mm_unpackhi_epi32(input0, input0); input0 = _mm_unpacklo_epi32(input0, input0); - input1 = _mm_unpacklo_epi32(input1, input1); + input3 = _mm_unpackhi_epi32(input2, input2); input2 = _mm_unpacklo_epi32(input2, input2); - input3 = _mm_unpacklo_epi32(input3, input3); // Stage 1 input0 = _mm_madd_epi16(input0, cst); @@ -59,16 +67,14 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); // Stage 2 - input0 = _mm_packs_epi32(input0, zero); - input1 = _mm_packs_epi32(input1, zero); - input2 = _mm_packs_epi32(input2, zero); - input3 = _mm_packs_epi32(input3, zero); + input0 = _mm_packs_epi32(input0, input1); + input1 = _mm_packs_epi32(input2, input3); // Transpose - input1 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpacklo_epi16(input2, input3); - input0 = _mm_unpacklo_epi32(input1, input3); - input1 = _mm_unpackhi_epi32(input1, input3); + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); // Switch column2, column 3, and then, we got: // input2: column1, column 0; input3: column2, column 3. @@ -78,14 +84,9 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { // Columns // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input2, 0xd8); - input1 = _mm_shufflehi_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input3, 0xd8); - input3 = _mm_shufflelo_epi16(input3, 0xd8); - - input0 = _mm_unpacklo_epi32(input0, input0); - input1 = _mm_unpackhi_epi32(input1, input1); - input2 = _mm_unpackhi_epi32(input2, input2); + input0 = _mm_unpacklo_epi32(input2, input2); + input1 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpackhi_epi32(input3, input3); input3 = _mm_unpacklo_epi32(input3, input3); // Stage 1 @@ -105,16 +106,14 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); // Stage 2 - input0 = _mm_packs_epi32(input0, zero); - input1 = _mm_packs_epi32(input1, zero); - input2 = _mm_packs_epi32(input2, zero); - input3 = _mm_packs_epi32(input3, zero); + input0 = _mm_packs_epi32(input0, input2); + input1 = _mm_packs_epi32(input1, input3); // Transpose - input1 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpacklo_epi16(input2, input3); - input0 = _mm_unpacklo_epi32(input1, input3); - input1 = _mm_unpackhi_epi32(input1, input3); + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); // Switch column2, column 3, and then, we got: // input2: column1, column 0; input3: column2, column 3. @@ -129,23 +128,31 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { input2 = _mm_srai_epi16(input2, 4); input3 = _mm_srai_epi16(input3, 4); -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)dest = _mm_cvtsi128_si32(d0); \ - dest += stride; \ + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *) (dest + stride))); + d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128( + *(const int *) (dest + stride * 3)), d2); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, input2); + d2 = _mm_add_epi16(d2, input3); + d0 = _mm_packus_epi16(d0, d2); + // store input0 + *(int *)dest = _mm_cvtsi128_si32(d0); + // store input1 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store input2 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + // store input3 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); } - - input0 = _mm_srli_si128(input2, 8); - input1 = _mm_srli_si128(input3, 8); - - RECON_AND_STORE4X4(dest, input2); - RECON_AND_STORE4X4(dest, input0); - RECON_AND_STORE4X4(dest, input1); - RECON_AND_STORE4X4(dest, input3); } void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index 9792d2c..b948429 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -48,12 +48,13 @@ static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r, } static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, - uint8_t context) { - const MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree, - cm->fc.inter_mode_probs[context]); + int ctx) { + const int mode = treed_read(r, vp9_inter_mode_tree, + cm->fc.inter_mode_probs[ctx]); if (!cm->frame_parallel_decoding_mode) - ++cm->counts.inter_mode[context][inter_mode_offset(mode)]; - return mode; + ++cm->counts.inter_mode[ctx][mode]; + + return NEARESTMV + mode; } static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { @@ -61,31 +62,28 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { } static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, - BLOCK_SIZE bsize, vp9_reader *r) { - const uint8_t context = vp9_get_pred_context_tx_size(xd); - const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs); + TX_SIZE max_tx_size, vp9_reader *r) { + const int ctx = vp9_get_pred_context_tx_size(xd); + const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc.tx_probs); TX_SIZE tx_size = vp9_read(r, tx_probs[0]); - if (tx_size != TX_4X4 && bsize >= BLOCK_16X16) { + if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { tx_size += vp9_read(r, tx_probs[1]); - if (tx_size != TX_8X8 && bsize >= BLOCK_32X32) + if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) tx_size += vp9_read(r, tx_probs[2]); } if (!cm->frame_parallel_decoding_mode) - ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size]; + ++get_tx_counts(max_tx_size, ctx, &cm->counts.tx)[tx_size]; return tx_size; } -static TX_SIZE read_tx_size(VP9_COMMON *const cm, MACROBLOCKD *const xd, - TX_MODE tx_mode, BLOCK_SIZE bsize, int allow_select, - vp9_reader *r) { - if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) { - return read_selected_tx_size(cm, xd, bsize, r); - } else { - const TX_SIZE max_tx_size_block = max_txsize_lookup[bsize]; - const TX_SIZE max_tx_size_txmode = tx_mode_to_biggest_tx_size[tx_mode]; - return MIN(max_tx_size_block, max_tx_size_txmode); - } +static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_MODE tx_mode, + BLOCK_SIZE bsize, int allow_select, vp9_reader *r) { + const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; + if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) + return read_selected_tx_size(cm, xd, max_tx_size, r); + else + return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]); } static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize, @@ -260,6 +258,16 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, mv->col = ref->col + diff.col; } +static COMPPREDMODE_TYPE read_reference_mode(VP9_COMMON *cm, + const MACROBLOCKD *xd, + vp9_reader *r) { + const int ctx = vp9_get_pred_context_comp_inter_inter(cm, xd); + const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]); + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.comp_inter[ctx][mode]; + return mode; // SINGLE_PREDICTION_ONLY or COMP_PREDICTION_ONLY +} + // Read the referncence frame static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r, @@ -271,27 +279,20 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); ref_frame[1] = NONE; } else { - const int comp_ctx = vp9_get_pred_context_comp_inter_inter(cm, xd); - int is_comp; - - if (cm->comp_pred_mode == HYBRID_PREDICTION) { - is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]); - if (!cm->frame_parallel_decoding_mode) - ++counts->comp_inter[comp_ctx][is_comp]; - } else { - is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY; - } + const COMPPREDMODE_TYPE mode = (cm->comp_pred_mode == HYBRID_PREDICTION) + ? read_reference_mode(cm, xd, r) + : cm->comp_pred_mode; // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding - if (is_comp) { - const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; - const int ref_ctx = vp9_get_pred_context_comp_ref_p(cm, xd); - const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]); + if (mode == COMP_PREDICTION_ONLY) { + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd); + const int bit = vp9_read(r, fc->comp_ref_prob[ctx]); if (!cm->frame_parallel_decoding_mode) - ++counts->comp_ref[ref_ctx][b]; - ref_frame[fix_ref_idx] = cm->comp_fixed_ref; - ref_frame[!fix_ref_idx] = cm->comp_var_ref[b]; - } else { + ++counts->comp_ref[ctx][bit]; + ref_frame[idx] = cm->comp_fixed_ref; + ref_frame[!idx] = cm->comp_var_ref[bit]; + } else if (mode == SINGLE_PREDICTION_ONLY) { const int ctx0 = vp9_get_pred_context_single_ref_p1(xd); const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]); if (!cm->frame_parallel_decoding_mode) @@ -299,14 +300,16 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (bit0) { const int ctx1 = vp9_get_pred_context_single_ref_p2(xd); const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]); - ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME; if (!cm->frame_parallel_decoding_mode) ++counts->single_ref[ctx1][1][bit1]; + ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME; } else { ref_frame[0] = LAST_FRAME; } ref_frame[1] = NONE; + } else { + assert(!"Invalid prediction mode."); } } } diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c index 4746a3a..3c4781b 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.c +++ b/libvpx/vp9/decoder/vp9_decodframe.c @@ -42,6 +42,9 @@ typedef struct TileWorkerData { vp9_reader bit_reader; DECLARE_ALIGNED(16, MACROBLOCKD, xd); DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); + DECLARE_ALIGNED(16, int16_t, qcoeff[MAX_MB_PLANE][64 * 64]); + DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]); + DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]); } TileWorkerData; static int read_be32(const uint8_t *p) { @@ -153,47 +156,38 @@ static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]); } -static void update_mv(vp9_reader *r, vp9_prob *p) { - if (vp9_read(r, NMV_UPDATE_PROB)) - *p = (vp9_read_literal(r, 7) << 1) | 1; +static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) { + int i; + for (i = 0; i < n; ++i) + if (vp9_read(r, NMV_UPDATE_PROB)) + p[i] = (vp9_read_literal(r, 7) << 1) | 1; } -static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) { - int i, j, k; +static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) { + int i, j; - for (j = 0; j < MV_JOINTS - 1; ++j) - update_mv(r, &mvc->joints[j]); + update_mv_probs(ctx->joints, MV_JOINTS - 1, r); for (i = 0; i < 2; ++i) { - nmv_component *const comp = &mvc->comps[i]; - - update_mv(r, &comp->sign); - - for (j = 0; j < MV_CLASSES - 1; ++j) - update_mv(r, &comp->classes[j]); - - for (j = 0; j < CLASS0_SIZE - 1; ++j) - update_mv(r, &comp->class0[j]); - - for (j = 0; j < MV_OFFSET_BITS; ++j) - update_mv(r, &comp->bits[j]); + nmv_component *const comp_ctx = &ctx->comps[i]; + update_mv_probs(&comp_ctx->sign, 1, r); + update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r); + update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r); + update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r); } for (i = 0; i < 2; ++i) { - nmv_component *const comp = &mvc->comps[i]; - + nmv_component *const comp_ctx = &ctx->comps[i]; for (j = 0; j < CLASS0_SIZE; ++j) - for (k = 0; k < 3; ++k) - update_mv(r, &comp->class0_fp[j][k]); - - for (j = 0; j < 3; ++j) - update_mv(r, &comp->fp[j]); + update_mv_probs(comp_ctx->class0_fp[j], 3, r); + update_mv_probs(comp_ctx->fp, 3, r); } if (allow_hp) { for (i = 0; i < 2; ++i) { - update_mv(r, &mvc->comps[i].class0_hp); - update_mv(r, &mvc->comps[i].hp); + nmv_component *const comp_ctx = &ctx->comps[i]; + update_mv_probs(&comp_ctx->class0_hp, 1, r); + update_mv_probs(&comp_ctx->hp, 1, r); } } } @@ -209,20 +203,22 @@ static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) { // Allocate storage for each tile column. // TODO(jzern): when max_threads <= 1 the same storage could be used for each // tile. -static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) { +static void alloc_tile_storage(VP9D_COMP *pbi, int tile_rows, int tile_cols) { VP9_COMMON *const cm = &pbi->common; const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - int i, tile_col; + int i, tile_row, tile_col; CHECK_MEM_ERROR(cm, pbi->mi_streams, - vpx_realloc(pbi->mi_streams, tile_cols * + vpx_realloc(pbi->mi_streams, tile_rows * tile_cols * sizeof(*pbi->mi_streams))); - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileInfo tile; - - vp9_tile_init(&tile, cm, 0, tile_col); - pbi->mi_streams[tile_col] = - &cm->mi[cm->mi_rows * tile.mi_col_start]; + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileInfo tile; + vp9_tile_init(&tile, cm, tile_row, tile_col); + pbi->mi_streams[tile_row * tile_cols + tile_col] = + &cm->mi[tile.mi_row_start * cm->mode_info_stride + + tile.mi_col_start]; + } } // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm @@ -246,31 +242,30 @@ static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) { } static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + TX_SIZE tx_size, int x, int y) { struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t* const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const int stride = pd->dst.stride; const int eob = pd->eobs[block]; if (eob > 0) { TX_TYPE tx_type; - const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, - block); - uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block, - pd->dst.buf, stride); + const int plane_type = pd->plane_type; + const int stride = pd->dst.stride; + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint8_t *const dst = &pd->dst.buf[4 * y * stride + 4 * x]; + switch (tx_size) { case TX_4X4: - tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block); + tx_type = get_tx_type_4x4(plane_type, xd, block); if (tx_type == DCT_DCT) xd->itxm_add(dqcoeff, dst, stride, eob); else vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type); break; case TX_8X8: - tx_type = get_tx_type_8x8(pd->plane_type, xd); + tx_type = get_tx_type_8x8(plane_type, xd); vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); break; case TX_16X16: - tx_type = get_tx_type_16x16(pd->plane_type, xd); + tx_type = get_tx_type_16x16(plane_type, xd); vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); break; case TX_32X32: @@ -298,7 +293,7 @@ struct intra_args { VP9_COMMON *cm; MACROBLOCKD *xd; vp9_reader *r; - unsigned char* token_cache; + uint8_t *token_cache; }; static void predict_and_reconstruct_intra_block(int plane, int block, @@ -307,29 +302,28 @@ static void predict_and_reconstruct_intra_block(int plane, int block, struct intra_args *const args = arg; VP9_COMMON *const cm = args->cm; MACROBLOCKD *const xd = args->xd; - struct macroblockd_plane *const pd = &xd->plane[plane]; MODE_INFO *const mi = xd->mi_8x8[0]; - const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, - block); - uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block, - pd->dst.buf, pd->dst.stride); const MB_PREDICTION_MODE mode = (plane == 0) - ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[raster_block].as_mode - : mi->mbmi.mode) - : mi->mbmi.uv_mode; + ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[block].as_mode + : mi->mbmi.mode) + : mi->mbmi.uv_mode; + int x, y; + uint8_t *dst; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); + dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x]; if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) extend_for_intra(xd, plane_bsize, plane, block, tx_size); - vp9_predict_intra_block(xd, raster_block >> tx_size, + vp9_predict_intra_block(xd, block >> (tx_size << 1), b_width_log2(plane_bsize), tx_size, mode, dst, pd->dst.stride, dst, pd->dst.stride); if (!mi->mbmi.skip_coeff) { - vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size, + vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, tx_size, args->r, args->token_cache); - inverse_transform_block(xd, plane, block, plane_bsize, tx_size); + inverse_transform_block(xd, plane, block, tx_size, x, y); } } @@ -338,7 +332,7 @@ struct inter_args { MACROBLOCKD *xd; vp9_reader *r; int *eobtotal; - unsigned char* token_cache; + uint8_t *token_cache; }; static void reconstruct_inter_block(int plane, int block, @@ -347,11 +341,13 @@ static void reconstruct_inter_block(int plane, int block, struct inter_args *args = arg; VP9_COMMON *const cm = args->cm; MACROBLOCKD *const xd = args->xd; + int x, y; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, - plane_bsize, tx_size, + plane_bsize, x, y, tx_size, args->r, args->token_cache); - inverse_transform_block(xd, plane, block, plane_bsize, tx_size); + inverse_transform_block(xd, plane, block, tx_size, x, y); } static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, @@ -360,16 +356,15 @@ static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, const int bh = num_8x8_blocks_high_lookup[bsize]; const int bw = num_8x8_blocks_wide_lookup[bsize]; const int offset = mi_row * cm->mode_info_stride + mi_col; - - xd->mode_info_stride = cm->mode_info_stride; + const int tile_offset = tile->mi_row_start * cm->mode_info_stride + + tile->mi_col_start; xd->mi_8x8 = cm->mi_grid_visible + offset; xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset; // we are using the mode info context stream here - xd->mi_8x8[0] = xd->mi_stream; + xd->mi_8x8[0] = xd->mi_stream + offset - tile_offset; xd->mi_8x8[0]->mbmi.sb_type = bsize; - ++xd->mi_stream; // Special case: if prev_mi is NULL, the previous mode info context // cannot be used. @@ -403,7 +398,7 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r, BLOCK_SIZE bsize, - unsigned char *token_cache) { + uint8_t *token_cache) { const int less8x8 = bsize < BLOCK_8X8; MB_MODE_INFO *mbmi; @@ -425,7 +420,9 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, } if (!is_inter_block(mbmi)) { - struct intra_args arg = { cm, xd, r, token_cache }; + struct intra_args arg = { + cm, xd, r, token_cache + }; foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block, &arg); } else { @@ -443,7 +440,9 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, // Reconstruction if (!mbmi->skip_coeff) { int eobtotal = 0; - struct inter_args arg = { cm, xd, r, &eobtotal, token_cache }; + struct inter_args arg = { + cm, xd, r, &eobtotal, token_cache + }; foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); if (!less8x8 && eobtotal == 0) mbmi->skip_coeff = 1; // skip loopfilter @@ -483,7 +482,7 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader* r, BLOCK_SIZE bsize, - unsigned char *token_cache) { + uint8_t *token_cache) { const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; PARTITION_TYPE partition; BLOCK_SIZE subsize; @@ -705,20 +704,19 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { VP9_COMMON *cm = &pbi->common; if (cm->width != width || cm->height != height) { - if (!pbi->initial_width || !pbi->initial_height) { - if (vp9_alloc_frame_buffers(cm, width, height)) + // Change in frame size. + if (cm->width == 0 || cm->height == 0) { + // Assign new frame buffer on first call. + cm->new_fb_idx = NUM_YV12_BUFFERS - 1; + cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1; + } + + // TODO(agrange) Don't test width/height, check overall size. + if (width > cm->width || height > cm->height) { + // Rescale frame buffers only if they're not big enough already. + if (vp9_resize_frame_buffers(cm, width, height)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffers"); - pbi->initial_width = width; - pbi->initial_height = height; - } else { - if (width > pbi->initial_width) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Frame width too large"); - - if (height > pbi->initial_height) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Frame height too large"); } cm->width = width; @@ -768,9 +766,10 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, } static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd, - int tile_col) { + int tile_row, int tile_col) { int i; - xd->mi_stream = pbi->mi_streams[tile_col]; + const int tile_cols = 1 << pbi->common.log2_tile_cols; + xd->mi_stream = pbi->mi_streams[tile_row * tile_cols + tile_col]; for (i = 0; i < MAX_MB_PLANE; ++i) { xd->above_context[i] = pbi->above_context[i]; @@ -802,9 +801,10 @@ static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile, vp9_zero(xd->left_context); vp9_zero(xd->left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; - mi_col += MI_BLOCK_SIZE) + mi_col += MI_BLOCK_SIZE) { decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64, pbi->token_cache); + } if (pbi->do_loopfilter_inline) { const int lf_start = mi_row - MI_BLOCK_SIZE; @@ -874,81 +874,85 @@ static size_t get_tile(const uint8_t *const data_end, return size; } -static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { - vp9_reader residual_bc; +typedef struct TileBuffer { + const uint8_t *data; + size_t size; +} TileBuffer; +static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - - const uint8_t *const data_end = pbi->source + pbi->source_sz; - const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; + TileBuffer tile_buffers[4][1 << 6]; int tile_row, tile_col; + const uint8_t *const data_end = pbi->source + pbi->source_sz; + const uint8_t *end = NULL; + vp9_reader r; + + assert(tile_rows <= 4); + assert(tile_cols <= (1 << 6)); // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. vpx_memset(pbi->above_context[0], 0, - sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * - 2 * aligned_mi_cols); + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * 2 * aligned_cols); vpx_memset(pbi->above_seg_context, 0, - sizeof(*pbi->above_seg_context) * aligned_mi_cols); - - if (pbi->oxcf.inv_tile_order) { - const uint8_t *data_ptr2[4][1 << 6]; - vp9_reader bc_bak = {0}; - - // pre-initialize the offsets, we're going to decode in inverse order - data_ptr2[0][0] = data; - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - const int last_tile = - tile_row == tile_rows - 1 && tile_col == tile_cols - 1; - const size_t size = get_tile(data_end, last_tile, &cm->error, &data); - data_ptr2[tile_row][tile_col] = data; - data += size; - } + sizeof(*pbi->above_seg_context) * aligned_cols); + + // Load tile data into tile_buffers + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int last_tile = tile_row == tile_rows - 1 && + tile_col == tile_cols - 1; + const size_t size = get_tile(data_end, last_tile, &cm->error, &data); + TileBuffer *const buf = &tile_buffers[tile_row][tile_col]; + buf->data = data; + buf->size = size; + data += size; } + } - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) { - TileInfo tile; - - vp9_tile_init(&tile, cm, tile_row, tile_col); - setup_token_decoder(data_ptr2[tile_row][tile_col], data_end, - data_end - data_ptr2[tile_row][tile_col], - &cm->error, &residual_bc); - setup_tile_context(pbi, xd, tile_col); - decode_tile(pbi, &tile, &residual_bc); - if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1) - bc_bak = residual_bc; - } - } - residual_bc = bc_bak; - } else { - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - const int last_tile = - tile_row == tile_rows - 1 && tile_col == tile_cols - 1; - const size_t size = get_tile(data_end, last_tile, &cm->error, &data); - TileInfo tile; - - vp9_tile_init(&tile, cm, tile_row, tile_col); - - setup_token_decoder(data, data_end, size, &cm->error, &residual_bc); - setup_tile_context(pbi, xd, tile_col); - decode_tile(pbi, &tile, &residual_bc); - data += size; - } + // Decode tiles using data from tile_buffers + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int col = pbi->oxcf.inv_tile_order ? tile_cols - tile_col - 1 + : tile_col; + const int last_tile = tile_row == tile_rows - 1 && + col == tile_cols - 1; + const TileBuffer *const buf = &tile_buffers[tile_row][col]; + TileInfo tile; + + vp9_tile_init(&tile, cm, tile_row, col); + setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &r); + setup_tile_context(pbi, xd, tile_row, col); + decode_tile(pbi, &tile, &r); + + if (last_tile) + end = vp9_reader_find_end(&r); } } - return vp9_reader_find_end(&residual_bc); + return end; +} + +static void setup_tile_macroblockd(TileWorkerData *const tile_data) { + MACROBLOCKD *xd = &tile_data->xd; + struct macroblockd_plane *const pd = xd->plane; + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + pd[i].qcoeff = tile_data->qcoeff[i]; + pd[i].dqcoeff = tile_data->dqcoeff[i]; + pd[i].eobs = tile_data->eobs[i]; + vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t)); + } } static int tile_worker_hook(void *arg1, void *arg2) { - TileWorkerData *tile_data = (TileWorkerData*)arg1; + TileWorkerData *const tile_data = (TileWorkerData*)arg1; const TileInfo *const tile = (TileInfo*)arg2; int mi_row, mi_col; @@ -1023,7 +1027,8 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) { setup_token_decoder(data, data_end, size, &cm->error, &tile_data->bit_reader); - setup_tile_context(pbi, &tile_data->xd, tile_col); + setup_tile_context(pbi, &tile_data->xd, 0, tile_col); + setup_tile_macroblockd(tile_data); worker->had_error = 0; if (i == num_workers - 1 || tile_col == tile_cols - 1) { @@ -1227,7 +1232,7 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, for (i = 0; i < PARTITION_TYPES - 1; ++i) vp9_diff_update_prob(&r, &fc->partition_prob[j][i]); - read_mv_probs(&r, nmvc, cm->allow_high_precision_mv); + read_mv_probs(nmvc, cm->allow_high_precision_mv, &r); } return vp9_reader_has_error(&r); @@ -1323,9 +1328,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { } } - alloc_tile_storage(pbi, tile_cols); + alloc_tile_storage(pbi, tile_rows, tile_cols); - xd->mi_8x8 = cm->mi_grid_visible; xd->mode_info_stride = cm->mode_info_stride; set_prev_mi(cm); @@ -1335,7 +1339,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { cm->fc = cm->frame_contexts[cm->frame_context_idx]; vp9_zero(cm->counts); for (i = 0; i < MAX_MB_PLANE; ++i) - vp9_zero(xd->plane[i].dqcoeff); + vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t)); xd->corrupted = 0; new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size); diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c index b8d670b..05a2b42 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libvpx/vp9/decoder/vp9_detokenize.c @@ -67,27 +67,27 @@ static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = { TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, DCT_EOB_MODEL_TOKEN }; -#define INCREMENT_COUNT(token) \ - do { \ - if (!cm->frame_parallel_decoding_mode) { \ +#define INCREMENT_COUNT(token) \ + do { \ + if (!cm->frame_parallel_decoding_mode) \ ++coef_counts[band][pt][token_to_counttoken[token]]; \ - } \ - } while (0); + } while (0) + #define WRITE_COEF_CONTINUE(val, token) \ { \ - dqcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \ + dqcoeff_ptr[scan[c]] = (vp9_read_bit(r) ? -val : val) * \ dq[c > 0] / (1 + (tx_size == TX_32X32)); \ INCREMENT_COUNT(token); \ token_cache[scan[c]] = vp9_pt_energy_class[token]; \ - c++; \ + ++c; \ continue; \ } #define ADJUST_COEF(prob, bits_count) \ do { \ val += (vp9_read(r, prob) << bits_count); \ - } while (0); + } while (0) static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_reader *r, int block_idx, @@ -108,31 +108,31 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] = counts->eob_branch[tx_size][type][ref]; const int16_t *scan, *nb; - const uint8_t *const band_translate = get_band_translate(tx_size); + const uint8_t *cat6; + const uint8_t *band_translate = get_band_translate(tx_size); get_scan(xd, tx_size, type, block_idx, &scan, &nb); - while (1) { + while (c < seg_eob) { int val; - const uint8_t *cat6 = cat6_prob; - if (c >= seg_eob) - break; if (c) pt = get_coef_context(nb, token_cache, c); - band = get_coef_band(band_translate, c); + band = *band_translate++; prob = coef_probs[band][pt]; if (!cm->frame_parallel_decoding_mode) ++eob_branch_count[band][pt]; if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) break; + goto DECODE_ZERO; SKIP_START: if (c >= seg_eob) break; if (c) pt = get_coef_context(nb, token_cache, c); - band = get_coef_band(band_translate, c); + band = *band_translate++; prob = coef_probs[band][pt]; + DECODE_ZERO: if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) { INCREMENT_COUNT(ZERO_TOKEN); token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN]; @@ -200,10 +200,11 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY5); } val = 0; - while (*cat6) { + cat6 = cat6_prob; + while (*cat6) val = (val << 1) | vp9_read(r, *cat6++); - } val += CAT6_MIN_VAL; + WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6); } @@ -217,22 +218,17 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, vp9_reader *r, + int x, int y, TX_SIZE tx_size, vp9_reader *r, uint8_t *token_cache) { struct macroblockd_plane *const pd = &xd->plane[plane]; const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id, tx_size); - int aoff, loff, eob, pt; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); - pt = get_entropy_context(tx_size, pd->above_context + aoff, - pd->left_context + loff); - - eob = decode_coefs(cm, xd, r, block, - pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block), - tx_size, pd->dequant, pt, token_cache); - - set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff); - + const int pt = get_entropy_context(tx_size, pd->above_context + x, + pd->left_context + y); + const int eob = decode_coefs(cm, xd, r, block, pd->plane_type, seg_eob, + BLOCK_OFFSET(pd->dqcoeff, block), tx_size, + pd->dequant, pt, token_cache); + set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y); pd->eobs[block] = eob; return eob; } diff --git a/libvpx/vp9/decoder/vp9_detokenize.h b/libvpx/vp9/decoder/vp9_detokenize.h index 04939ea..e858a19 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.h +++ b/libvpx/vp9/decoder/vp9_detokenize.h @@ -17,7 +17,7 @@ int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, vp9_reader *r, + int x, int y, TX_SIZE tx_size, vp9_reader *r, uint8_t *token_cache); #endif // VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c index 5f970a3..cb45d37 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_if.c +++ b/libvpx/vp9/decoder/vp9_onyxd_if.c @@ -107,6 +107,18 @@ void vp9_initialize_dec() { } } +static void init_macroblockd(VP9D_COMP *const pbi) { + MACROBLOCKD *xd = &pbi->mb; + struct macroblockd_plane *const pd = xd->plane; + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + pd[i].qcoeff = pbi->qcoeff[i]; + pd[i].dqcoeff = pbi->dqcoeff[i]; + pd[i].eobs = pbi->eobs[i]; + } +} + VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { VP9D_COMP *const pbi = vpx_memalign(32, sizeof(VP9D_COMP)); VP9_COMMON *const cm = pbi ? &pbi->common : NULL; @@ -141,6 +153,8 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { cm->error.setjmp = 0; pbi->decoded_key_frame = 0; + init_macroblockd(pbi); + vp9_worker_init(&pbi->lf_worker); return pbi; diff --git a/libvpx/vp9/decoder/vp9_onyxd_int.h b/libvpx/vp9/decoder/vp9_onyxd_int.h index 7c4c9db..d3d29e9 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_int.h +++ b/libvpx/vp9/decoder/vp9_onyxd_int.h @@ -22,6 +22,10 @@ typedef struct VP9Decompressor { DECLARE_ALIGNED(16, VP9_COMMON, common); + DECLARE_ALIGNED(16, int16_t, qcoeff[MAX_MB_PLANE][64 * 64]); + DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]); + DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]); + VP9D_CONFIG oxcf; const uint8_t *source; @@ -50,7 +54,7 @@ typedef struct VP9Decompressor { ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; PARTITION_CONTEXT *above_seg_context; - DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); + DECLARE_ALIGNED(16, uint8_t, token_cache[1024]); } VP9D_COMP; #endif // VP9_DECODER_VP9_ONYXD_INT_H_ diff --git a/libvpx/vp9/decoder/vp9_treereader.h b/libvpx/vp9/decoder/vp9_treereader.h index f612497..41680d2 100644 --- a/libvpx/vp9/decoder/vp9_treereader.h +++ b/libvpx/vp9/decoder/vp9_treereader.h @@ -15,8 +15,6 @@ #include "vp9/common/vp9_treecoder.h" #include "vp9/decoder/vp9_dboolhuff.h" -#define vp9_read_and_apply_sign(r, value) (vp9_read_bit(r) ? -(value) : (value)) - // Intent of tree data structure is to make decoding trivial. static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */ vp9_tree t, diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 87bd36c..efbadba 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -169,10 +169,8 @@ static void update_mode(vp9_writer *w, int n, vp9_tree tree, const unsigned int num_events[/* n */]) { int i = 0; - vp9_tree_probs_from_distribution(tree, bct, num_events, 0); - n--; - - for (i = 0; i < n; ++i) + vp9_tree_probs_from_distribution(tree, bct, num_events); + for (i = 0; i < n - 1; ++i) vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]); } @@ -191,12 +189,14 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi, static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m, TX_SIZE tx_size, BLOCK_SIZE bsize, vp9_writer *w) { + const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs, m); + const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd, + &cpi->common.fc.tx_probs); vp9_write(w, tx_size != TX_4X4, tx_probs[0]); - if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) { + if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { vp9_write(w, tx_size != TX_8X8, tx_probs[1]); - if (bsize >= BLOCK_32X32 && tx_size != TX_8X8) + if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) vp9_write(w, tx_size != TX_16X16, tx_probs[2]); } } @@ -231,7 +231,7 @@ static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) { int i, j; for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) { vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct, - cm->counts.switchable_interp[j], 0); + cm->counts.switchable_interp[j]); for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i], @@ -250,7 +250,7 @@ static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) { for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { unsigned int branch_ct[INTER_MODES - 1][2]; vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct, - cm->counts.inter_mode[i], NEARESTMV); + cm->counts.inter_mode[i]); for (j = 0; j < INTER_MODES - 1; ++j) vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j], @@ -258,15 +258,15 @@ static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) { } } -static void pack_mb_tokens(vp9_writer* const bc, +static void pack_mb_tokens(vp9_writer* const w, TOKENEXTRA **tp, const TOKENEXTRA *const stop) { TOKENEXTRA *p = *tp; while (p < stop && p->token != EOSB_TOKEN) { const int t = p->token; - const struct vp9_token *const a = vp9_coef_encodings + t; - const vp9_extra_bit *const b = vp9_extra_bits + t; + const struct vp9_token *const a = &vp9_coef_encodings[t]; + const vp9_extra_bit *const b = &vp9_extra_bits[t]; int i = 0; const vp9_prob *pp; int v = a->value; @@ -289,7 +289,7 @@ static void pack_mb_tokens(vp9_writer* const bc, do { const int bb = (v >> --n) & 1; - vp9_write(bc, bb, pp[i >> 1]); + vp9_write(w, bb, pp[i >> 1]); i = vp9_coef_tree[i + bb]; } while (n); @@ -304,12 +304,12 @@ static void pack_mb_tokens(vp9_writer* const bc, do { const int bb = (v >> --n) & 1; - vp9_write(bc, bb, pb[i >> 1]); + vp9_write(w, bb, pb[i >> 1]); i = b->tree[i + bb]; } while (n); } - vp9_write_bit(bc, e & 1); + vp9_write_bit(w, e & 1); } ++p; } @@ -321,7 +321,7 @@ static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode, const vp9_prob *p) { assert(is_inter_mode(mode)); write_token(w, vp9_inter_mode_tree, p, - &vp9_inter_mode_encodings[inter_mode_offset(mode)]); + &vp9_inter_mode_encodings[INTER_OFFSET(mode)]); } @@ -448,7 +448,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { if (bsize >= BLOCK_8X8) { write_sb_mv_ref(bc, mode, mv_ref_p); ++cm->counts.inter_mode[mi->mode_context[rf]] - [inter_mode_offset(mode)]; + [INTER_OFFSET(mode)]; } } @@ -471,7 +471,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode; write_sb_mv_ref(bc, blockmode, mv_ref_p); ++cm->counts.inter_mode[mi->mode_context[rf]] - [inter_mode_offset(blockmode)]; + [INTER_OFFSET(blockmode)]; if (blockmode == NEWMV) { #ifdef ENTROPY_STATS @@ -545,37 +545,33 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, } static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, vp9_writer *bc, - TOKENEXTRA **tok, TOKENEXTRA *tok_end, - int mi_row, int mi_col, int index) { + vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - MODE_INFO *m = mi_8x8[0]; - - if (m->mbmi.sb_type < BLOCK_8X8) - if (index > 0) - return; + MODE_INFO *m; - xd->mi_8x8 = mi_8x8; + xd->mi_8x8 = cm->mi_grid_visible + (mi_row * cm->mode_info_stride + mi_col); + m = xd->mi_8x8[0]; set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type], mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type], cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cpi, mi_8x8, bc); + write_mb_modes_kf(cpi, xd->mi_8x8, w); #ifdef ENTROPY_STATS active_section = 8; #endif } else { - pack_inter_mode_mvs(cpi, m, bc); + pack_inter_mode_mvs(cpi, m, w); #ifdef ENTROPY_STATS active_section = 1; #endif } assert(*tok < tok_end); - pack_mb_tokens(bc, tok, tok_end); + pack_mb_tokens(w, tok, tok_end); } static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col, @@ -602,59 +598,50 @@ static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col, } static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, vp9_writer *bc, - TOKENEXTRA **tok, TOKENEXTRA *tok_end, - int mi_row, int mi_col, BLOCK_SIZE bsize, - int index) { + vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; - const int mis = cm->mode_info_stride; - int bsl = b_width_log2(bsize); - int bs = (1 << bsl) / 4; // mode_info step for subsize - int n; - PARTITION_TYPE partition = PARTITION_NONE; + const int bsl = b_width_log2(bsize); + const int bs = (1 << bsl) / 4; + PARTITION_TYPE partition; BLOCK_SIZE subsize; - MODE_INFO *m = mi_8x8[0]; + MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mode_info_stride + mi_col]; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; partition = partition_lookup[bsl][m->mbmi.sb_type]; - - if (bsize < BLOCK_8X8) { - if (index > 0) - return; - } else { - write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc); - } - + write_partition(cpi, bs, mi_row, mi_col, partition, bsize, w); subsize = get_subsize(bsize, partition); - - switch (partition) { - case PARTITION_NONE: - write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); - break; - case PARTITION_HORZ: - write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); - if ((mi_row + bs) < cm->mi_rows) - write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end, - mi_row + bs, mi_col, 1); - break; - case PARTITION_VERT: - write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); - if ((mi_col + bs) < cm->mi_cols) - write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end, - mi_row, mi_col + bs, 1); - break; - case PARTITION_SPLIT: - for (n = 0; n < 4; n++) { - const int j = n >> 1, i = n & 1; - write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc, - tok, tok_end, - mi_row + j * bs, mi_col + i * bs, subsize, n); - } - break; - default: - assert(0); + if (subsize < BLOCK_8X8) { + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + } else { + switch (partition) { + case PARTITION_NONE: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + break; + case PARTITION_HORZ: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_row + bs < cm->mi_rows) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col); + break; + case PARTITION_VERT: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_col + bs < cm->mi_cols) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs); + break; + case PARTITION_SPLIT: + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs, + subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col, + subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, + subsize); + break; + default: + assert(0); + } } // update partition context @@ -665,25 +652,15 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, } static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, - vp9_writer* const bc, - TOKENEXTRA **tok, TOKENEXTRA *tok_end) { - VP9_COMMON *const cm = &cpi->common; - const int mis = cm->mode_info_stride; + vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end) { int mi_row, mi_col; - MODE_INFO **mi_8x8 = cm->mi_grid_visible; - MODE_INFO **m_8x8; - - mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; - mi_row += 8, mi_8x8 += 8 * mis) { - m_8x8 = mi_8x8; - vp9_zero(cpi->left_seg_context); + mi_row += MI_BLOCK_SIZE) { + vp9_zero(cpi->left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; - mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) { - write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col, - BLOCK_64X64, 0); - } + mi_col += MI_BLOCK_SIZE) + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64); } } @@ -703,7 +680,7 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { continue; vp9_tree_probs_from_distribution(vp9_coef_tree, coef_branch_ct[i][j][k][l], - coef_counts[i][j][k][l], 0); + coef_counts[i][j][k][l]); coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0]; for (m = 0; m < UNCONSTRAINED_NODES; ++m) @@ -1217,7 +1194,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { for (tile_col = 0; tile_col < tile_cols; tile_col++) { TileInfo tile; - vp9_tile_init(&tile, cm, 0, tile_col); + vp9_tile_init(&tile, cm, tile_row, tile_col); tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col]; if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index 8033a4d..4445970 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -27,6 +27,18 @@ typedef struct { typedef struct { MODE_INFO mic; uint8_t *zcoeff_blk; + int16_t *coeff[MAX_MB_PLANE][3]; + int16_t *qcoeff[MAX_MB_PLANE][3]; + int16_t *dqcoeff[MAX_MB_PLANE][3]; + uint16_t *eobs[MAX_MB_PLANE][3]; + + // dual buffer pointers, 0: in use, 1: best in store + int16_t *coeff_pbuf[MAX_MB_PLANE][3]; + int16_t *qcoeff_pbuf[MAX_MB_PLANE][3]; + int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3]; + uint16_t *eobs_pbuf[MAX_MB_PLANE][3]; + + int is_coded; int num_4x4_blk; int skip; int_mv best_ref_mv; @@ -57,7 +69,7 @@ typedef struct { struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); - DECLARE_ALIGNED(16, int16_t, coeff[64 * 64]); + int16_t *coeff; struct buf_2d src; // Quantizer setings @@ -81,6 +93,10 @@ struct macroblock { MACROBLOCKD e_mbd; int skip_block; + int select_txfm_size; + int skip_recode; + int skip_optimize; + int q_index; search_site *ss; int ss_count; @@ -120,6 +136,11 @@ struct macroblock { int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; + unsigned char sb_index; // index of 32x32 block inside the 64x64 block + unsigned char mb_index; // index of 16x16 block inside the 32x32 block + unsigned char b_index; // index of 8x8 block inside the 16x16 block + unsigned char ab_index; // index of 4x4 block inside the 8x8 block + // These define limits to motion vector components to prevent them // from extending outside the UMV borders int mv_col_min; @@ -179,35 +200,33 @@ struct macroblock { // refactoring on organizing the temporary buffers, when recursive // partition down to 4x4 block size is enabled. static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - switch (bsize) { case BLOCK_64X64: return &x->sb64_context; case BLOCK_64X32: - return &x->sb64x32_context[xd->sb_index]; + return &x->sb64x32_context[x->sb_index]; case BLOCK_32X64: - return &x->sb32x64_context[xd->sb_index]; + return &x->sb32x64_context[x->sb_index]; case BLOCK_32X32: - return &x->sb32_context[xd->sb_index]; + return &x->sb32_context[x->sb_index]; case BLOCK_32X16: - return &x->sb32x16_context[xd->sb_index][xd->mb_index]; + return &x->sb32x16_context[x->sb_index][x->mb_index]; case BLOCK_16X32: - return &x->sb16x32_context[xd->sb_index][xd->mb_index]; + return &x->sb16x32_context[x->sb_index][x->mb_index]; case BLOCK_16X16: - return &x->mb_context[xd->sb_index][xd->mb_index]; + return &x->mb_context[x->sb_index][x->mb_index]; case BLOCK_16X8: - return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->sb16x8_context[x->sb_index][x->mb_index][x->b_index]; case BLOCK_8X16: - return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->sb8x16_context[x->sb_index][x->mb_index][x->b_index]; case BLOCK_8X8: - return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->sb8x8_context[x->sb_index][x->mb_index][x->b_index]; case BLOCK_8X4: - return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->sb8x4_context[x->sb_index][x->mb_index][x->b_index]; case BLOCK_4X8: - return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->sb4x8_context[x->sb_index][x->mb_index][x->b_index]; case BLOCK_4X4: - return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->ab4x4_context[x->sb_index][x->mb_index][x->b_index]; default: assert(0); return NULL; diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index a45299b..3e75f3b 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -50,25 +50,25 @@ int enc_debug = 0; #endif -static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { +static INLINE uint8_t *get_sb_index(MACROBLOCK *x, BLOCK_SIZE subsize) { switch (subsize) { case BLOCK_64X64: case BLOCK_64X32: case BLOCK_32X64: case BLOCK_32X32: - return &xd->sb_index; + return &x->sb_index; case BLOCK_32X16: case BLOCK_16X32: case BLOCK_16X16: - return &xd->mb_index; + return &x->mb_index; case BLOCK_16X8: case BLOCK_8X16: case BLOCK_8X8: - return &xd->b_index; + return &x->b_index; case BLOCK_8X4: case BLOCK_4X8: case BLOCK_4X4: - return &xd->ab_index; + return &x->ab_index; default: assert(0); return NULL; @@ -367,6 +367,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; MODE_INFO *mi = &ctx->mic; MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; MODE_INFO *mi_addr = xd->mi_8x8[0]; @@ -375,6 +377,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; + int max_plane; assert(mi->mbmi.mode < MB_MODE_COUNT); assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES); @@ -383,6 +386,21 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, *mi_addr = *mi; + max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1; + for (i = 0; i < max_plane; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + pd[i].eobs = ctx->eobs_pbuf[i][1]; + } + + for (i = max_plane; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][2]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][2]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2]; + pd[i].eobs = ctx->eobs_pbuf[i][2]; + } + // Restore the coding context of the MB to that that was in place // when the mode was picked for it for (y = 0; y < mi_height; y++) @@ -578,6 +596,9 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + int i; int orig_rdmult = x->rdmult; double rdmult_ratio; @@ -590,7 +611,7 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. - if (xd->ab_index != 0) { + if (x->ab_index != 0) { *totalrate = 0; *totaldist = 0; return; @@ -600,6 +621,15 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, set_offsets(cpi, tile, mi_row, mi_col, bsize); xd->mi_8x8[0]->mbmi.sb_type = bsize; + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][0]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][0]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0]; + pd[i].eobs = ctx->eobs_pbuf[i][0]; + } + ctx->is_coded = 0; + x->skip_recode = 0; + // Set to zero to make sure we do not use the previous encoded frame stats xd->mi_8x8[0]->mbmi.skip_coeff = 0; @@ -687,16 +717,15 @@ static void update_stats(VP9_COMP *cpi) { } static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; switch (bsize) { case BLOCK_64X64: return &x->sb64_partitioning; case BLOCK_32X32: - return &x->sb_partitioning[xd->sb_index]; + return &x->sb_partitioning[x->sb_index]; case BLOCK_16X16: - return &x->mb_partitioning[xd->sb_index][xd->mb_index]; + return &x->mb_partitioning[x->sb_index][x->mb_index]; case BLOCK_8X8: - return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index]; + return &x->b_partitioning[x->sb_index][x->mb_index][x->b_index]; default: assert(0); return NULL; @@ -769,20 +798,19 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize, int sub_index) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; if (sub_index != -1) - *get_sb_index(xd, bsize) = sub_index; + *get_sb_index(x, bsize) = sub_index; if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. - if (xd->ab_index > 0) + if (x->ab_index > 0) return; } set_offsets(cpi, tile, mi_row, mi_col, bsize); @@ -800,9 +828,8 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; BLOCK_SIZE c1 = BLOCK_8X8; const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4; int pl = 0; @@ -848,7 +875,7 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, for (i = 0; i < 4; i++) { const int x_idx = i & 1, y_idx = i >> 1; - *get_sb_index(xd, subsize) = i; + *get_sb_index(x, subsize) = i; encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, output_enabled, subsize); } @@ -975,9 +1002,8 @@ static void rd_use_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD *xd = &cpi->mb.e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; const int mis = cm->mode_info_stride; int bsl = b_width_log2(bsize); const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; @@ -1012,7 +1038,7 @@ static void rd_use_partition(VP9_COMP *cpi, if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. - if (xd->ab_index != 0) { + if (x->ab_index != 0) { *rate = 0; *dist = 0; return; @@ -1070,7 +1096,7 @@ static void rd_use_partition(VP9_COMP *cpi, bsize, get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: - *get_sb_index(xd, subsize) = 0; + *get_sb_index(x, subsize) = 0; pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && @@ -1079,7 +1105,7 @@ static void rd_use_partition(VP9_COMP *cpi, int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(xd, subsize) = 1; + *get_sb_index(x, subsize) = 1; pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { @@ -1093,7 +1119,7 @@ static void rd_use_partition(VP9_COMP *cpi, } break; case PARTITION_VERT: - *get_sb_index(xd, subsize) = 0; + *get_sb_index(x, subsize) = 0; pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && @@ -1102,7 +1128,7 @@ static void rd_use_partition(VP9_COMP *cpi, int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(xd, subsize) = 1; + *get_sb_index(x, subsize) = 1; pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { @@ -1128,7 +1154,7 @@ static void rd_use_partition(VP9_COMP *cpi, if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - *get_sb_index(xd, subsize) = i; + *get_sb_index(x, subsize) = i; rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, @@ -1169,11 +1195,10 @@ static void rd_use_partition(VP9_COMP *cpi, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; - if ((mi_row + y_idx >= cm->mi_rows) - || (mi_col + x_idx >= cm->mi_cols)) + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - *get_sb_index(xd, split_subsize) = i; + *get_sb_index(x, split_subsize) = i; *get_sb_partitioning(x, bsize) = split_subsize; *get_sb_partitioning(x, split_subsize) = split_subsize; @@ -1353,7 +1378,6 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; // Only use 8x8 result for non HD videos. // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0; @@ -1366,9 +1390,9 @@ static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { PICK_MODE_CONTEXT *block_context = NULL; if (bsize == BLOCK_16X16) { - block_context = x->sb8x8_context[xd->sb_index][xd->mb_index]; + block_context = x->sb8x8_context[x->sb_index][x->mb_index]; } else if (bsize == BLOCK_32X32) { - block_context = x->mb_context[xd->sb_index]; + block_context = x->mb_context[x->sb_index]; } else if (bsize == BLOCK_64X64) { block_context = x->sb32_context; } @@ -1456,9 +1480,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, int64_t best_rd) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; const int ms = num_8x8_blocks_wide_lookup[bsize] / 2; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; @@ -1484,7 +1507,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. - if (xd->ab_index != 0) { + if (x->ab_index != 0) { *rate = 0; *dist = 0; return; @@ -1582,7 +1605,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; - *get_sb_index(xd, subsize) = i; + *get_sb_index(x, subsize) = i; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize, @@ -1629,7 +1652,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - *get_sb_index(xd, subsize) = 0; + *get_sb_index(x, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, @@ -1640,7 +1663,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(xd, subsize) = 1; + *get_sb_index(x, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate, @@ -1674,7 +1697,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - *get_sb_index(xd, subsize) = 0; + *get_sb_index(x, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, @@ -1684,7 +1707,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(xd, subsize) = 1; + *get_sb_index(x, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, @@ -1765,7 +1788,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, } static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, TOKENEXTRA **tp, int *totalrate) { + int mi_row, TOKENEXTRA **tp) { VP9_COMMON * const cm = &cpi->common; int mi_col; @@ -1910,7 +1933,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { MACROBLOCK * const x = &cpi->mb; VP9_COMMON * const cm = &cpi->common; MACROBLOCKD * const xd = &x->e_mbd; - int totalrate; // fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", // cpi->common.current_video_frame, cpi->common.show_frame, @@ -1926,8 +1948,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { } #endif - totalrate = 0; - vp9_zero(cm->counts.switchable_interp); vp9_zero(cpi->tx_stepdown_count); @@ -1989,7 +2009,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_tile_init(&tile, cm, tile_row, tile_col); for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end; mi_row += 8) - encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate); + encode_sb_row(cpi, &tile, mi_row, &tp); cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); @@ -2015,10 +2035,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->sf.skip_encode_frame = 0; } - // 256 rate units to the bit, - // projected_frame_size in units of BYTES - cpi->projected_frame_size = totalrate >> 8; - #if 0 // Keep record of the total distortion this time around for future use cpi->last_frame_distortion = cpi->frame_distortion; @@ -2395,13 +2411,17 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, MODE_INFO **mi_8x8 = xd->mi_8x8; MODE_INFO *mi = mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; + PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize); unsigned int segment_id = mbmi->segment_id; const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; + x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8; + x->skip_optimize = ctx->is_coded; + ctx->is_coded = 1; x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame && - xd->q_index < QIDX_SKIP_THRESH); + x->q_index < QIDX_SKIP_THRESH); if (x->skip_encode) return; @@ -2487,7 +2507,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, (mbmi->skip_coeff || vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { const uint8_t context = vp9_get_pred_context_tx_size(xd); - ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size]; + ++get_tx_counts(max_txsize_lookup[bsize], + context, &cm->counts.tx)[mbmi->tx_size]; } else { int x, y; TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode]; diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 75ed8ea..a85ddee 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -136,14 +136,13 @@ static void optimize_b(MACROBLOCK *mb, const int16_t *scan, *nb; const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; - const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block); const int16_t *dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); assert((!type && !plane) || (type && plane)); dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); - get_scan(xd, tx_size, type, ib, &scan, &nb); + get_scan(xd, tx_size, type, block, &scan, &nb); assert(eob <= default_eob); /* Now set up a Viterbi trellis to evaluate alternative roundings. */ @@ -179,7 +178,7 @@ static void optimize_b(MACROBLOCK *mb, t0 = (vp9_dct_value_tokens_ptr + x)->token; /* Consider both possible successor states. */ if (next < default_eob) { - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += mb->token_costs[tx_size][type][ref][band][0][pt] @@ -230,7 +229,7 @@ static void optimize_b(MACROBLOCK *mb, t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token; } if (next < default_eob) { - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; if (t0 != DCT_EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt] @@ -264,7 +263,7 @@ static void optimize_b(MACROBLOCK *mb, /* There's no choice to make for a zero coefficient, so we don't * add a new trellis node, but we do need to update the costs. */ - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; t0 = tokens[next][0].token; t1 = tokens[next][1].token; /* Update the cost of each path if we're past the EOB token. */ @@ -284,7 +283,7 @@ static void optimize_b(MACROBLOCK *mb, } /* Now pick the best path through the whole trellis. */ - band = get_coef_band(band_translate, i + 1); + band = band_translate[i + 1]; pt = combine_entropy_contexts(*a, *l); rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; @@ -420,28 +419,30 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx *const ctx = args->ctx; struct macroblockd_plane *const pd = &xd->plane[plane]; - const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, - block); - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, - pd->dst.buf, pd->dst.stride); + int i, j; + uint8_t *dst; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i]; // TODO(jingning): per transformed block zero forcing only enabled for // luma component. will integrate chroma components as well. if (x->zcoeff_blk[tx_size][block] && plane == 0) { - int i, j; pd->eobs[block] = 0; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); ctx->ta[plane][i] = 0; ctx->tl[plane][j] = 0; return; } - vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); + if (!x->skip_recode) + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); - if (x->optimize) + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx); + } else { + ctx->ta[plane][i] = pd->eobs[block] > 0; + ctx->tl[plane][j] = pd->eobs[block] > 0; + } if (x->skip_encode || pd->eobs[block] == 0) return; @@ -505,9 +506,10 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { struct optimize_ctx ctx; struct encode_b_args arg = {x, &ctx}; - vp9_subtract_sb(x, bsize); + if (!x->skip_recode) + vp9_subtract_sb(x, bsize); - if (x->optimize) { + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { int i; for (i = 0; i < MAX_MB_PLANE; ++i) optimize_init_b(i, bsize, &arg); @@ -552,19 +554,22 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 32 * (block & twmask); yoff = 32 * (block >> twl); dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode, dst, pd->dst.stride, dst, pd->dst.stride); - vp9_subtract_block(32, 32, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - if (x->use_lp32x32fdct) - vp9_fdct32x32_rd(src_diff, coeff, bw * 4); - else - vp9_fdct32x32(src_diff, coeff, bw * 4); - vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + + if (!x->skip_recode) { + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_subtract_block(32, 32, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (x->use_lp32x32fdct) + vp9_fdct32x32_rd(src_diff, coeff, bw * 4); + else + vp9_fdct32x32(src_diff, coeff, bw * 4); + vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + } if (!x->skip_encode && *eob) vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob); break; @@ -577,16 +582,18 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 16 * (block & twmask); yoff = 16 * (block >> twl); dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode, dst, pd->dst.stride, dst, pd->dst.stride); - vp9_subtract_block(16, 16, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - vp9_fht16x16(tx_type, src_diff, coeff, bw * 4); - vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_recode) { + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_subtract_block(16, 16, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + vp9_fht16x16(tx_type, src_diff, coeff, bw * 4); + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + } if (!x->skip_encode && *eob) vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); break; @@ -599,16 +606,18 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 8 * (block & twmask); yoff = 8 * (block >> twl); dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode, dst, pd->dst.stride, dst, pd->dst.stride); - vp9_subtract_block(8, 8, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - vp9_fht8x8(tx_type, src_diff, coeff, bw * 4); - vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_recode) { + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_subtract_block(8, 8, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + vp9_fht8x8(tx_type, src_diff, coeff, bw * 4); + vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + } if (!x->skip_encode && *eob) vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); break; @@ -624,19 +633,23 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 4 * (block & twmask); yoff = 4 * (block >> twl); dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode, dst, pd->dst.stride, dst, pd->dst.stride); - vp9_subtract_block(4, 4, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type); - else - x->fwd_txm4x4(src_diff, coeff, bw * 4); - vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + + if (!x->skip_recode) { + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_subtract_block(4, 4, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (tx_type != DCT_DCT) + vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type); + else + x->fwd_txm4x4(src_diff, coeff, bw * 4); + vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + } + if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) // this is like vp9_short_idct4x4 but has a special case around eob<=1 diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c index e2c6c4c..030ca64 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libvpx/vp9/encoder/vp9_encodemv.c @@ -155,9 +155,8 @@ static void counts_to_nmv_context( unsigned int (*branch_ct_class0_hp)[2], unsigned int (*branch_ct_hp)[2]) { int i, j, k; - vp9_tree_probs_from_distribution(vp9_mv_joint_tree, - branch_ct_joint, - nmv_count->joints, 0); + vp9_tree_probs_from_distribution(vp9_mv_joint_tree, branch_ct_joint, + nmv_count->joints); for (i = 0; i < 2; ++i) { const uint32_t s0 = nmv_count->comps[i].sign[0]; const uint32_t s1 = nmv_count->comps[i].sign[1]; @@ -166,10 +165,10 @@ static void counts_to_nmv_context( branch_ct_sign[i][1] = s1; vp9_tree_probs_from_distribution(vp9_mv_class_tree, branch_ct_classes[i], - nmv_count->comps[i].classes, 0); + nmv_count->comps[i].classes); vp9_tree_probs_from_distribution(vp9_mv_class0_tree, branch_ct_class0[i], - nmv_count->comps[i].class0, 0); + nmv_count->comps[i].class0); for (j = 0; j < MV_OFFSET_BITS; ++j) { const uint32_t b0 = nmv_count->comps[i].bits[j][0]; const uint32_t b1 = nmv_count->comps[i].bits[j][1]; @@ -182,11 +181,11 @@ static void counts_to_nmv_context( for (k = 0; k < CLASS0_SIZE; ++k) { vp9_tree_probs_from_distribution(vp9_mv_fp_tree, branch_ct_class0_fp[i][k], - nmv_count->comps[i].class0_fp[k], 0); + nmv_count->comps[i].class0_fp[k]); } vp9_tree_probs_from_distribution(vp9_mv_fp_tree, branch_ct_fp[i], - nmv_count->comps[i].fp, 0); + nmv_count->comps[i].fp); } if (usehp) { for (i = 0; i < 2; ++i) { diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index 6a3555d..974c300 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -482,6 +482,10 @@ void vp9_first_pass(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; TileInfo tile; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &x->sb64_context; + int i; int recon_yoffset, recon_uvoffset; const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx]; @@ -525,6 +529,15 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_frame_init_quantizer(cpi); + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + pd[i].eobs = ctx->eobs_pbuf[i][1]; + } + x->skip_recode = 0; + + // Initialise the MV cost table to the defaults // if( cm->current_video_frame == 0) // if ( 0 ) diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c index f922f90..dd4705d 100644 --- a/libvpx/vp9/encoder/vp9_onyx_if.c +++ b/libvpx/vp9/encoder/vp9_onyx_if.c @@ -834,6 +834,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->adaptive_rd_thresh = 2; sf->recode_loop = 2; + sf->use_lp32x32fdct = 1; sf->mode_skip_start = 11; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; @@ -1436,90 +1437,121 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { } while (++i <= MV_MAX); } +static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, + PICK_MODE_CONTEXT *ctx) { + int num_pix = num_4x4_blk << 4; + int i, k; + ctx->num_4x4_blk = num_4x4_blk; + CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, + vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (k = 0; k < 3; ++k) { + CHECK_MEM_ERROR(cm, ctx->coeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->eobs[i][k], + vpx_memalign(16, num_pix * sizeof(uint16_t))); + ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; + ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; + ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k]; + ctx->eobs_pbuf[i][k] = ctx->eobs[i][k]; + } + } +} + +static void free_mode_context(PICK_MODE_CONTEXT *ctx) { + int i, k; + vpx_free(ctx->zcoeff_blk); + ctx->zcoeff_blk = 0; + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (k = 0; k < 3; ++k) { + vpx_free(ctx->coeff[i][k]); + ctx->coeff[i][k] = 0; + vpx_free(ctx->qcoeff[i][k]); + ctx->qcoeff[i][k] = 0; + vpx_free(ctx->dqcoeff[i][k]); + ctx->dqcoeff[i][k] = 0; + vpx_free(ctx->eobs[i][k]); + ctx->eobs[i][k] = 0; + } + } +} + static void init_pick_mode_context(VP9_COMP *cpi) { int i; - MACROBLOCK *x = &cpi->mb; - MACROBLOCKD *xd = &x->e_mbd; - VP9_COMMON *cm = &cpi->common; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + for (i = 0; i < BLOCK_SIZES; ++i) { const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; const int num_4x4_h = num_4x4_blocks_high_lookup[i]; const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); if (i < BLOCK_16X16) { - for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { - for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) { - for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) { + for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { + for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) { + for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - ctx->num_4x4_blk = num_4x4_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, - vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + alloc_mode_context(cm, num_4x4_blk, ctx); } } } } else if (i < BLOCK_32X32) { - for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { - for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk; - ++xd->mb_index) { + for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { + for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); ctx->num_4x4_blk = num_4x4_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, - vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + alloc_mode_context(cm, num_4x4_blk, ctx); } } } else if (i < BLOCK_64X64) { - for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) { + for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); ctx->num_4x4_blk = num_4x4_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, - vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + alloc_mode_context(cm, num_4x4_blk, ctx); } } else { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); ctx->num_4x4_blk = num_4x4_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, - vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + alloc_mode_context(cm, num_4x4_blk, ctx); } } } static void free_pick_mode_context(MACROBLOCK *x) { int i; - MACROBLOCKD *xd = &x->e_mbd; for (i = 0; i < BLOCK_SIZES; ++i) { const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; const int num_4x4_h = num_4x4_blocks_high_lookup[i]; const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); if (i < BLOCK_16X16) { - for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { - for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) { - for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) { + for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { + for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) { + for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - vpx_free(ctx->zcoeff_blk); - ctx->zcoeff_blk = 0; + free_mode_context(ctx); } } } } else if (i < BLOCK_32X32) { - for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { - for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk; - ++xd->mb_index) { + for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { + for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - vpx_free(ctx->zcoeff_blk); - ctx->zcoeff_blk = 0; + free_mode_context(ctx); } } } else if (i < BLOCK_64X64) { - for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) { + for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - vpx_free(ctx->zcoeff_blk); - ctx->zcoeff_blk = 0; + free_mode_context(ctx); } } else { PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - vpx_free(ctx->zcoeff_blk); - ctx->zcoeff_blk = 0; + free_mode_context(ctx); } } } @@ -3404,7 +3436,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Post encode loop adjustment of Q prediction. if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop) ? 2 : 0); + vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop || + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0); + cpi->last_q[cm->frame_type] = cm->base_qindex; diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h index 9429c7f..9e80212 100644 --- a/libvpx/vp9/encoder/vp9_onyx_int.h +++ b/libvpx/vp9/encoder/vp9_onyx_int.h @@ -312,7 +312,6 @@ typedef struct VP9_COMP { VP9_COMMON common; VP9_CONFIG oxcf; struct rdcost_block_args rdcost_stack; - struct lookahead_ctx *lookahead; struct lookahead_entry *source; #if CONFIG_MULTIPLE_ARF diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index fca7525..d24be96 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -22,7 +22,7 @@ extern int enc_debug; #endif -void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, +void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -30,58 +30,44 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, rc, eob; - int zbins[2], nzbins[2], zbin; - int x, y, z, sz; - int zero_flag = n_coeffs; + int i, non_zero_count = count, eob = -1; + const int zbins[2] = { zbin_ptr[0] + zbin_oq_value, + zbin_ptr[1] + zbin_oq_value }; + const int nzbins[2] = { zbins[0] * -1, + zbins[1] * -1 }; - vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); - vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); - - eob = -1; - - // Base ZBIN - zbins[0] = zbin_ptr[0] + zbin_oq_value; - zbins[1] = zbin_ptr[1] + zbin_oq_value; - nzbins[0] = zbins[0] * -1; - nzbins[1] = zbins[1] * -1; + vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); if (!skip_block) { // Pre-scan pass - for (i = n_coeffs - 1; i >= 0; i--) { - rc = scan[i]; - z = coeff_ptr[rc]; + for (i = count - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - if (z < zbins[rc != 0] && z > nzbins[rc != 0]) { - zero_flag--; - } else { + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else break; - } } // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. - for (i = 0; i < zero_flag; i++) { - rc = scan[i]; - z = coeff_ptr[rc]; - - zbin = (zbins[rc != 0]); - - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; - - if (x >= zbin) { - x += (round_ptr[rc != 0]); - x = clamp(x, INT16_MIN, INT16_MAX); - y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * - quant_shift_ptr[rc != 0]) >> 16; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value - - if (y) { - eob = i; // last nonzero coeffs - } + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) + eob = i; } } } @@ -315,17 +301,17 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { SEG_LVL_SKIP); /* save this macroblock QIndex for vp9_update_zbin_extra() */ - x->e_mbd.q_index = qindex; + x->q_index = qindex; /* R/D setup */ cpi->mb.errorperbit = rdmult >> 6; cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); - vp9_initialize_me_consts(cpi, xd->q_index); + vp9_initialize_me_consts(cpi, x->q_index); } void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { - const int qindex = x->e_mbd.q_index; + const int qindex = x->q_index; const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] * (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] * diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index 993919e..78cb06b 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -246,6 +246,10 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { vp9_set_speed_features(cpi); + cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL && + cm->frame_type != KEY_FRAME) ? + 0 : 1; + set_block_thresholds(cpi); fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs); @@ -268,10 +272,10 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { MB_PREDICTION_MODE m; for (m = NEARESTMV; m < MB_MODE_COUNT; m++) - cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] = + cpi->mb.inter_mode_cost[i][INTER_OFFSET(m)] = cost_token(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i], - &vp9_inter_mode_encodings[inter_mode_offset(m)]); + &vp9_inter_mode_encodings[INTER_OFFSET(m)]); } } } @@ -609,7 +613,7 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, // TODO(jingning): temporarily enabled only for luma component rd = MIN(rd1, rd2); - if (plane == 0) + if (!xd->lossless && plane == 0) x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block]; args->this_rate += args->rate; @@ -740,7 +744,7 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int n, m; int s0, s1; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]); + const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs); for (n = TX_4X4; n <= max_tx_size; n++) { r[n][1] = r[n][0]; @@ -845,7 +849,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00}; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]); + const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs); // for (n = TX_4X4; n <= max_txfm_size; n++) // r[n][0] = (r[n][0] * scale_r[n]); @@ -1326,6 +1330,7 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, } static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, + PICK_MODE_CONTEXT *ctx, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize) { @@ -1361,6 +1366,27 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; *skippable = s; + if (!x->select_txfm_size) { + int i; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = x->e_mbd.plane; + for (i = 1; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][2]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][2]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2]; + pd[i].eobs = ctx->eobs_pbuf[i][2]; + + ctx->coeff_pbuf[i][2] = ctx->coeff_pbuf[i][0]; + ctx->qcoeff_pbuf[i][2] = ctx->qcoeff_pbuf[i][0]; + ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0]; + ctx->eobs_pbuf[i][2] = ctx->eobs_pbuf[i][0]; + + ctx->coeff_pbuf[i][0] = p[i].coeff; + ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff; + ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff; + ctx->eobs_pbuf[i][0] = pd[i].eobs; + } + } } } @@ -1386,8 +1412,9 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, return this_rd; } -static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize, - int *rate_uv, int *rate_uv_tokenonly, +static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE bsize, int *rate_uv, + int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, MB_PREDICTION_MODE *mode_uv) { MACROBLOCK *const x = &cpi->mb; @@ -1400,7 +1427,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize, // Else do a proper rd search for each possible transform size that may // be considered in the main rd loop. } else { - rd_pick_intra_sbuv_mode(cpi, x, + rd_pick_intra_sbuv_mode(cpi, x, ctx, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); } @@ -1416,7 +1443,7 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, // Don't account for mode here if segment skip is enabled. if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { assert(is_inter_mode(mode)); - return x->inter_mode_cost[mode_context][inter_mode_offset(mode)]; + return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)]; } else { return 0; } @@ -1707,7 +1734,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, const struct buf_2d orig_src = x->plane[0].src; struct buf_2d orig_pre[2]; - mode_idx = inter_mode_offset(this_mode); + mode_idx = INTER_OFFSET(this_mode); bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; // if we're near/nearest and mv == 0,0, compare to zeromv @@ -1901,6 +1928,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, x->mvcost, cpi); + bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int; if (num_4x4_blocks_wide > 1) bsi->rdstat[i + 1][mode_idx].mvs[0].as_int = @@ -2002,7 +2030,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, return; } - mode_idx = inter_mode_offset(mode_selected); + mode_idx = INTER_OFFSET(mode_selected); vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); @@ -2078,7 +2106,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, return INT64_MAX; /* set it to the best */ for (i = 0; i < 4; i++) { - mode_idx = inter_mode_offset(bsi->modes[i]); + mode_idx = INTER_OFFSET(bsi->modes[i]); mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int; if (has_second_ref(mbmi)) mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int; @@ -2477,54 +2505,41 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize); MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - int refs[2] = { mbmi->ref_frame[0], - (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; + const int refs[2] = { mbmi->ref_frame[0], + mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] }; int_mv ref_mv[2]; const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); - int ite; + int ite, ref; // Prediction buffer from second frame. uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t)); // Do joint motion search in compound mode to get more accurate mv. - struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; - struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}}; - struct buf_2d scaled_first_yv12; + struct buf_2d backup_yv12[2][MAX_MB_PLANE]; + struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0]; int last_besterr[2] = {INT_MAX, INT_MAX}; - YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL}; - scaled_ref_frame[0] = get_scaled_ref_frame(cpi, mbmi->ref_frame[0]); - scaled_ref_frame[1] = get_scaled_ref_frame(cpi, mbmi->ref_frame[1]); - - ref_mv[0] = mbmi->ref_mvs[refs[0]][0]; - ref_mv[1] = mbmi->ref_mvs[refs[1]][0]; - - if (scaled_ref_frame[0]) { - int i; - // Swap out the reference frame for a version that's been scaled to - // match the resolution of the current frame, allowing the existing - // motion search code to be used without additional modifications. - for (i = 0; i < MAX_MB_PLANE; i++) - backup_yv12[i] = xd->plane[i].pre[0]; - setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL); - } + YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { + get_scaled_ref_frame(cpi, mbmi->ref_frame[0]), + get_scaled_ref_frame(cpi, mbmi->ref_frame[1]) + }; - if (scaled_ref_frame[1]) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) - backup_second_yv12[i] = xd->plane[i].pre[1]; + for (ref = 0; ref < 2; ++ref) { + ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0]; + + if (scaled_ref_frame[ref]) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[ref][i] = xd->plane[i].pre[ref]; + setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, NULL); + } - setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL); + xd->scale_factor[ref].sfc->set_scaled_offsets(&xd->scale_factor[ref], + mi_row, mi_col); + frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int; } - xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0], - mi_row, mi_col); - xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1], - mi_row, mi_col); - scaled_first_yv12 = xd->plane[0].pre[0]; - - // Initialize mv using single prediction mode result. - frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; - frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - // Allow joint search multiple times iteratively for each ref frame // and break out the search loop if it couldn't find better mv. for (ite = 0; ite < 4; ite++) { @@ -2604,24 +2619,20 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } } - // restore the predictor - if (scaled_ref_frame[0]) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[0] = backup_yv12[i]; - } + *rate_mv = 0; - if (scaled_ref_frame[1]) { - int i; - for (i = 0; i < MAX_MB_PLANE; i++) - xd->plane[i].pre[1] = backup_second_yv12[i]; + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + // restore the predictor + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } + + *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv, + &mbmi->ref_mvs[refs[ref]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } - *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, - &mbmi->ref_mvs[refs[0]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv, - &mbmi->ref_mvs[refs[1]][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); vpx_free(second_pred); } @@ -3046,6 +3057,30 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, return this_rd; // if 0, this will be re-calculated by caller } +static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, + int max_plane) { + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = x->e_mbd.plane; + int i; + + for (i = 0; i < max_plane; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + pd[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + pd[i].eobs = ctx->eobs_pbuf[i][1]; + + ctx->coeff_pbuf[i][1] = ctx->coeff_pbuf[i][0]; + ctx->qcoeff_pbuf[i][1] = ctx->qcoeff_pbuf[i][0]; + ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0]; + ctx->eobs_pbuf[i][1] = ctx->eobs_pbuf[i][0]; + + ctx->coeff_pbuf[i][0] = p[i].coeff; + ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff; + ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff; + ctx->eobs_pbuf[i][0] = pd[i].eobs; + } +} + void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int *returnrate, int64_t *returndist, BLOCK_SIZE bsize, @@ -3065,7 +3100,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = INT_MAX; return; } - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly, &dist_uv, &uv_skip, bsize); } else { y_skip = 0; @@ -3074,7 +3109,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = INT_MAX; return; } - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly, &dist_uv, &uv_skip, BLOCK_8X8); } @@ -3157,7 +3192,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; - x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; // Everywhere the flag is set the error is much higher than its neighbors. ctx->frames_with_high_error = 0; @@ -3196,8 +3231,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, case BLOCK_32X32: for (i = 0; i < 4; i++) { ref_frame_mask |= - x->mb_context[xd->sb_index][i].frames_with_high_error; - mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error; + x->mb_context[x->sb_index][i].frames_with_high_error; + mode_mask |= x->mb_context[x->sb_index][i].modes_with_high_error; } break; default: @@ -3440,7 +3475,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]); if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx], + choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]); @@ -3574,6 +3609,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Did this mode help.. i.e. is it the new best mode if (this_rd < best_rd || x->skip) { + int max_plane = MAX_MB_PLANE; if (!mode_excluded) { // Note index of best mode so far best_mode_index = mode_index; @@ -3581,6 +3617,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (ref_frame == INTRA_FRAME) { /* required for left and above block mv */ mbmi->mv[0].as_int = 0; + max_plane = 1; } *returnrate = rate2; @@ -3588,6 +3625,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_rd = this_rd; best_mbmode = *mbmi; best_skip2 = this_skip2; + if (!x->select_txfm_size) + swap_block_ptr(x, ctx, max_plane); vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], sizeof(uint8_t) * ctx->num_4x4_blk); @@ -3694,7 +3733,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Do Intra UV best rd mode selection if best mode choice above was intra. if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) { TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size], + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size], &rate_uv_tokenonly[uv_tx_size], &dist_uv[uv_tx_size], &skip_uv[uv_tx_size], @@ -3850,7 +3889,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, b_mode_info best_bmodes[4]; int best_skip2 = 0; - x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4); for (i = 0; i < 4; i++) { @@ -4063,7 +4102,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, distortion2 += distortion_y; if (rate_uv_intra[TX_4X4] == INT_MAX) { - choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4], + choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[TX_4X4], &rate_uv_tokenonly[TX_4X4], &dist_uv[TX_4X4], &skip_uv[TX_4X4], &mode_uv[TX_4X4]); @@ -4317,12 +4356,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // Did this mode help.. i.e. is it the new best mode if (this_rd < best_rd || x->skip) { if (!mode_excluded) { + int max_plane = MAX_MB_PLANE; // Note index of best mode so far best_mode_index = mode_index; if (ref_frame == INTRA_FRAME) { /* required for left and above block mv */ mbmi->mv[0].as_int = 0; + max_plane = 1; } *returnrate = rate2; @@ -4332,6 +4373,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); best_mbmode = *mbmi; best_skip2 = this_skip2; + if (!x->select_txfm_size) + swap_block_ptr(x, ctx, max_plane); vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], sizeof(uint8_t) * ctx->num_4x4_blk); @@ -4438,7 +4481,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // Do Intra UV best rd mode selection if best mode choice above was intra. if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) { TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size], + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size], &rate_uv_tokenonly[uv_tx_size], &dist_uv[uv_tx_size], &skip_uv[uv_tx_size], diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c index 7d4676e..c7336d0 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libvpx/vp9/encoder/vp9_tokenize.c @@ -57,7 +57,7 @@ static void fill_value_tokens() { // initialize the cost for extra bits for all possible coefficient value. { int cost = 0; - const vp9_extra_bit *p = vp9_extra_bits + t[i].token; + const vp9_extra_bit *p = &vp9_extra_bits[t[i].token]; if (p->base_val) { const int extra = t[i].extra; @@ -73,7 +73,7 @@ static void fill_value_tokens() { } while (++i < DCT_MAX_VALUE); vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE; - vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; + vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; } struct tokenize_b_args { @@ -127,7 +127,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, get_scan(xd, tx_size, type, block, &scan, &nb); c = 0; do { - const int band = get_coef_band(band_translate, c); + const int band = band_translate[c]; int token; int v = 0; rc = scan[c]; diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index dc11501..fefca66 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -206,12 +206,12 @@ void fadst4_1d_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8]; __m128i in7 = _mm_add_epi16(in[0], in[1]); - in7 = _mm_sub_epi16(in7, in[3]); u[0] = _mm_unpacklo_epi16(in[0], in[1]); u[1] = _mm_unpacklo_epi16(in[2], in[3]); u[2] = _mm_unpacklo_epi16(in7, kZero); u[3] = _mm_unpacklo_epi16(in[2], kZero); + u[4] = _mm_unpacklo_epi16(in[3], kZero); v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 @@ -219,9 +219,10 @@ void fadst4_1d_sse2(__m128i *in) { v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = v[2]; + u[1] = _mm_sub_epi32(v[2], v[6]); u[2] = _mm_add_epi32(v[3], v[4]); u[3] = _mm_sub_epi32(u[2], u[0]); u[4] = _mm_slli_epi32(v[5], 2); diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm index 533456b..1a9e4e8 100644 --- a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm +++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm @@ -118,6 +118,14 @@ SECTION .text RET %endmacro +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp +%else + add srcq, src_strideq +%endif +%endmacro + %macro SUBPEL_VARIANCE 1-2 0 ; W %if cpuflag(ssse3) %define bilin_filter_m bilin_filter_m_ssse3 @@ -129,41 +137,85 @@ SECTION .text ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses ; 11, not 13, if the registers are ordered correctly. May make a minor speed ; difference on Win64 -%ifdef PIC -%if %2 == 1 ; avg -cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse -%define sec_str sec_strideq -%else -cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \ - dst, dst_stride, height, sse -%endif -%define h heightd -%define bilin_filter sseq -%else -%if %2 == 1 ; avg -cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse -%if ARCH_X86_64 -%define h heightd -%define sec_str sec_strideq -%else -%define h dword heightm -%define sec_str sec_stridemp -%endif + +%ifdef PIC ; 64bit PIC + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %endif + %define h heightd + %define bilin_filter sseq %else -cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ - dst, dst_stride, height, sse -%define h heightd -%endif -%define bilin_filter bilin_filter_m + %if ARCH_X86=1 && CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse, g_bilin_filter, g_pw_8 + %define h dword heightm + %define sec_str sec_stridemp + + ;Store bilin_filter and pw_8 location in stack + GET_GOT eax + add esp, 4 ; restore esp + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse, \ + g_bilin_filter, g_pw_8 + %define h heightd + + ;Store bilin_filter and pw_8 location in stack + GET_GOT eax + add esp, 4 ; restore esp + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %endif + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse + %if ARCH_X86_64 + %define h heightd + %define sec_str sec_strideq + %else + %define h dword heightm + %define sec_str sec_stridemp + %endif + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %define h heightd + %endif + + %define bilin_filter bilin_filter_m + %endif %endif + ASSERT %1 <= 16 ; m6 overflows if w > 16 pxor m6, m6 ; sum pxor m7, m7 ; sse @@ -329,11 +381,22 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_y_b m9 %define filter_rnd m10 %else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] %define filter_rnd [pw_8] %endif +%endif + .x_zero_y_other_loop: %if %1 == 16 movu m0, [srcq] @@ -615,12 +678,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 +%else ;x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] %else add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] %define filter_rnd [pw_8] %endif +%endif + %if %1 == 16 movu m0, [srcq] movu m3, [srcq+1] @@ -752,12 +826,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] %else add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_rnd [pw_8] %endif +%endif + .x_other_y_zero_loop: %if %1 == 16 movu m0, [srcq] @@ -873,12 +958,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] %else add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] %define filter_rnd [pw_8] %endif +%endif + %if %1 == 16 movu m0, [srcq] movu m1, [srcq+1] @@ -1057,6 +1153,21 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_y_a m10 %define filter_y_b m11 %define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. +%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] %else add x_offsetq, bilin_filter add y_offsetq, bilin_filter @@ -1066,6 +1177,8 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %define filter_y_b [y_offsetq+16] %define filter_rnd [pw_8] %endif +%endif + ; x_offset == bilin interpolation && y_offset == bilin interpolation %if %1 == 16 movu m0, [srcq] @@ -1093,7 +1206,9 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %endif psraw m0, 4 psraw m2, 4 - add srcq, src_strideq + + INC_SRC_BY_SRC_STRIDE + packuswb m0, m2 .x_other_y_other_loop: %if cpuflag(ssse3) @@ -1163,7 +1278,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 - add srcq, src_strideq + INC_SRC_BY_SRC_STRIDE add dstq, dst_strideq %else ; %1 < 16 movh m0, [srcq] @@ -1184,12 +1299,17 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %if cpuflag(ssse3) packuswb m0, m0 %endif - add srcq, src_strideq + + INC_SRC_BY_SRC_STRIDE + .x_other_y_other_loop: movh m2, [srcq] movh m1, [srcq+1] - movh m4, [srcq+src_strideq] - movh m3, [srcq+src_strideq+1] + + INC_SRC_BY_SRC_STRIDE + movh m4, [srcq] + movh m3, [srcq+1] + %if cpuflag(ssse3) punpcklbw m2, m1 punpcklbw m4, m3 @@ -1253,7 +1373,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 - lea srcq, [srcq+src_strideq*2] + INC_SRC_BY_SRC_STRIDE lea dstq, [dstq+dst_strideq*2] %endif %if %2 == 1 ; avg diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk index 0badb08..2dd2bf0 100644 --- a/libvpx/vp9/vp9_common.mk +++ b/libvpx/vp9/vp9_common.mk @@ -103,11 +103,21 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_vert_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred4_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred8_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred16_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_cols_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_filters_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_filters_dspr2.h +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_macros_dspr2.h +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_masks_dspr2.h +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c @@ -123,6 +133,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(AS VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_1_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM) |