author | hkuang <hkuang@google.com> | 2013-11-07 15:50:31 -0800
committer | hkuang <hkuang@google.com> | 2013-11-08 11:40:06 -0800
commit | 5ae7ac49f08a179e4f054d99fcfc9dce78d26e58 (patch)
tree | 0d891d2cbbac4c3da6fd15a25bf8797b29b31994 /libvpx/vp9
parent | e6eeaaa14ccef4c0938fcce21c54979204041a30 (diff)
download | android_external_libvpx-5ae7ac49f08a179e4f054d99fcfc9dce78d26e58.tar.gz android_external_libvpx-5ae7ac49f08a179e4f054d99fcfc9dce78d26e58.tar.bz2 android_external_libvpx-5ae7ac49f08a179e4f054d99fcfc9dce78d26e58.zip
Roll latest libvpx into Android.
The latest libvpx just added multithreaded tile decoding support.
Checkout is from master: abdefeaa89a0908327518e5ca75c935c66b2e1aa
Bug:11576718
Change-Id: Icbe5430633e179b8dc6d419e280ad7ebd3cad4a0
Diffstat (limited to 'libvpx/vp9')
155 files changed, 23894 insertions, 8462 deletions
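
One recurring change in the diff below: the hand-rolled `save_neon_registers`/`restore_neon_registers` assembly helpers (which executed `vpush {d8-d15}`/`vpop {d8-d15}` from inside a callee) are replaced by `vp9_push_neon`/`vp9_pop_neon`, which spill d8-d15 into a buffer the caller provides. A minimal sketch of the new calling pattern follows; the wrapper name is hypothetical, standing in for the real idct entry points:

```c
/* d8-d15 are callee-saved on ARM, so any NEON code that clobbers them
 * must save and restore their contents. The new vp9_push_neon/vp9_pop_neon
 * helpers store them into a caller-supplied buffer instead of pushing
 * them onto the stack from inside the helper. */
#include <stdint.h>

extern void vp9_push_neon(int64_t *store);  /* vst1.i64 {d8-d15} -> store */
extern void vp9_pop_neon(int64_t *store);   /* vld1.i64 {d8-d15} <- store */

/* Hypothetical wrapper illustrating the shape of the new idct entry points. */
void example_idct_add_neon(const int16_t *input, uint8_t *dest,
                           int dest_stride) {
  int64_t store_reg[8];          /* eight 64-bit slots, one per d8..d15 */

  vp9_push_neon(store_reg);      /* save callee-saved NEON registers */
  /* ... NEON assembly passes that use d8-d15 as scratch would go here ... */
  (void)input; (void)dest; (void)dest_stride;  /* passes elided in sketch */
  vp9_pop_neon(store_reg);       /* restore them before returning */
}
```

In the diff, `vp9_idct16x16_256_add_neon` and `vp9_idct16x16_10_add_neon` follow exactly this shape, while the rewritten `vp9_idct32x32_1024_add_neon` achieves the same effect with `vpush {d8-d15}`/`vpop {d8-d15}` directly in its own prologue and epilogue.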
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c index 3e3e400..0b9fc09 100644 --- a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c @@ -11,45 +11,47 @@ #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" -extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input, - int16_t *output, - int output_stride); -extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); -extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input, - int16_t *output, - int output_stride); -extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); -extern void save_neon_registers(); -extern void restore_neon_registers(); - - -void vp9_short_idct16x16_add_neon(int16_t *input, - uint8_t *dest, int dest_stride) { +void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, + int16_t *output, + int output_stride); +void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); +void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, + int16_t *output, + int output_stride); +void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); + +/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ +extern void vp9_push_neon(int64_t *store); +extern void vp9_pop_neon(int64_t *store); + +void vp9_idct16x16_256_add_neon(const int16_t *input, + uint8_t *dest, int dest_stride) { + int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. - save_neon_registers(); + vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct16x16_add_neon_pass2(input+1, + vp9_idct16x16_256_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, @@ -59,12 +61,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input, /* Parallel idct on the lower 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct16x16_add_neon_pass2(input+8*16+1, + vp9_idct16x16_256_add_neon_pass2(input+8*16+1, row_idct_output+8, pass1_output, 0, @@ -74,12 +76,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input, /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. 
- vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, row_idct_output, pass1_output, 1, @@ -89,12 +91,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input, /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, row_idct_output+8, pass1_output, 1, @@ -102,28 +104,29 @@ void vp9_short_idct16x16_add_neon(int16_t *input, dest_stride); // restore d8-d15 register values. - restore_neon_registers(); + vp9_pop_neon(store_reg); return; } -void vp9_short_idct10_16x16_add_neon(int16_t *input, - uint8_t *dest, int dest_stride) { +void vp9_idct16x16_10_add_neon(const int16_t *input, + uint8_t *dest, int dest_stride) { + int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. - save_neon_registers(); + vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8); + vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct10_16x16_add_neon_pass2(input+1, + vp9_idct16x16_10_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, @@ -135,12 +138,12 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input, /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, row_idct_output, pass1_output, 1, @@ -150,12 +153,12 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input, /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. 
- vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, row_idct_output+8, pass1_output, 1, @@ -163,7 +166,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input, dest_stride); // restore d8-d15 register values. - restore_neon_registers(); + vp9_pop_neon(store_reg); return; } diff --git a/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c deleted file mode 100644 index ceecd6f..0000000 --- a/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/common/vp9_common.h" - -// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm -extern void idct32_transpose_and_transform(int16_t *transpose_buffer, - int16_t *output, int16_t *input); -extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); - - -// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm -extern void save_neon_registers(); -extern void restore_neon_registers(); - -void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, - int dest_stride) { - // TODO(cd): move the creation of these buffers within the ASM file - // internal buffer used to transpose 8 lines into before transforming them - int16_t transpose_buffer[32 * 8]; - // results of the first pass (transpose and transform rows) - int16_t pass1[32 * 32]; - // results of the second pass (transpose and transform columns) - int16_t pass2[32 * 32]; - - // save register we need to preserve - save_neon_registers(); - // process rows - idct32_transpose_and_transform(transpose_buffer, pass1, input); - // process columns - // TODO(cd): do these two steps/passes within the ASM file - idct32_transpose_and_transform(transpose_buffer, pass2, pass1); - // combine and add to dest - // TODO(cd): integrate this within the last storage step of the second pass - idct32_combine_add(dest, pass2, dest_stride); - // restore register we need to preserve - restore_neon_registers(); -} - -// TODO(cd): Eliminate this file altogether when everything is in ASM file diff --git a/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm new file mode 100644 index 0000000..71c3e70 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm @@ -0,0 +1,36 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp9_push_neon| + EXPORT |vp9_pop_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp9_push_neon| PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + +|vp9_pop_neon| PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + + END + diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm index cf5c8f7..b1fd21b 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm @@ -8,21 +8,21 @@ ; - EXPORT |vp9_short_idct16x16_1_add_neon| + EXPORT |vp9_idct16x16_1_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, +;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct16x16_1_add_neon| PROC +|vp9_idct16x16_1_add_neon| PROC ldrsh r0, [r0] ; generate cospi_16_64 = 11585 @@ -193,6 +193,6 @@ vst1.64 {d31}, [r12], r2 bx lr - ENDP ; |vp9_short_idct16x16_1_add_neon| + ENDP ; |vp9_idct16x16_1_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm index 7464e80..a13c0d0 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm @@ -8,12 +8,10 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_idct16x16_add_neon_pass1| - EXPORT |vp9_short_idct16x16_add_neon_pass2| - EXPORT |vp9_short_idct10_16x16_add_neon_pass1| - EXPORT |vp9_short_idct10_16x16_add_neon_pass2| - EXPORT |save_neon_registers| - EXPORT |restore_neon_registers| + EXPORT |vp9_idct16x16_256_add_neon_pass1| + EXPORT |vp9_idct16x16_256_add_neon_pass2| + EXPORT |vp9_idct16x16_10_add_neon_pass1| + EXPORT |vp9_idct16x16_10_add_neon_pass2| ARM REQUIRE8 PRESERVE8 @@ -38,7 +36,7 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input, +;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input @@ -48,7 +46,7 @@ ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct16x16_add_neon_pass1| PROC +|vp9_idct16x16_256_add_neon_pass1| PROC ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 @@ -275,9 +273,9 @@ vst1.64 {d31}, [r1], r2 bx lr - ENDP ; |vp9_short_idct16x16_add_neon_pass1| + ENDP ; |vp9_idct16x16_256_add_neon_pass1| -;void vp9_short_idct16x16_add_neon_pass2(int16_t *src, +;void vp9_idct16x16_256_add_neon_pass2(int16_t *src, ; int16_t *output, ; int16_t *pass1Output, ; int16_t skip_adding, @@ -294,7 +292,7 @@ ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct16x16_add_neon_pass2| PROC +|vp9_idct16x16_256_add_neon_pass2| PROC push {r3-r9} ; TODO(hkuang): Find a better way to load the elements. 
@@ -786,9 +784,9 @@ skip_adding_dest end_idct16x16_pass2 pop {r3-r9} bx lr - ENDP ; |vp9_short_idct16x16_add_neon_pass2| + ENDP ; |vp9_idct16x16_256_add_neon_pass2| -;void |vp9_short_idct10_16x16_add_neon_pass1|(int16_t *input, +;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input @@ -798,7 +796,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct10_16x16_add_neon_pass1| PROC +|vp9_idct16x16_10_add_neon_pass1| PROC ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 @@ -907,9 +905,9 @@ end_idct16x16_pass2 vst1.64 {d31}, [r1], r2 bx lr - ENDP ; |vp9_short_idct10_16x16_add_neon_pass1| + ENDP ; |vp9_idct16x16_10_add_neon_pass1| -;void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, +;void vp9_idct16x16_10_add_neon_pass2(int16_t *src, ; int16_t *output, ; int16_t *pass1Output, ; int16_t skip_adding, @@ -926,7 +924,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct10_16x16_add_neon_pass2| PROC +|vp9_idct16x16_10_add_neon_pass2| PROC push {r3-r9} ; TODO(hkuang): Find a better way to load the elements. @@ -1177,15 +1175,5 @@ end_idct16x16_pass2 end_idct10_16x16_pass2 pop {r3-r9} bx lr - ENDP ; |vp9_short_idct10_16x16_add_neon_pass2| -;void |save_neon_registers|() -|save_neon_registers| PROC - vpush {d8-d15} - bx lr - ENDP ; |save_registers| -;void |restore_neon_registers|() -|restore_neon_registers| PROC - vpop {d8-d15} - bx lr - ENDP ; |restore_registers| + ENDP ; |vp9_idct16x16_10_add_neon_pass2| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm index 5c097cc..f00d027 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm @@ -43,8 +43,7 @@ cospi_30_64 EQU 1606 cospi_31_64 EQU 804 - EXPORT |idct32_transpose_and_transform| - EXPORT |idct32_combine_add| + EXPORT |vp9_idct32x32_1024_add_neon| ARM REQUIRE8 PRESERVE8 @@ -100,6 +99,142 @@ cospi_31_64 EQU 804 vst1.16 {$reg2}, [r1] MEND ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10] + vst1.16 {d11}, [r9] + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results 
with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10]! + vst1.16 {d11}, [r9]! + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6] + vst1.16 {d4}, [r7] + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6]! + vst1.16 {d4}, [r7]! + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- ; Touches q8-q12, q15 (q13-q14 are preserved) ; valid output registers are anything but q8-q11 MACRO @@ -110,12 +245,12 @@ cospi_31_64 EQU 804 ; additions/substractions before the multiplies. 
; generate the constants ; generate scalar constants - mov r3, #$first_constant & 0xFF00 - add r3, #$first_constant & 0x00FF + mov r8, #$first_constant & 0xFF00 mov r12, #$second_constant & 0xFF00 + add r8, #$first_constant & 0x00FF add r12, #$second_constant & 0x00FF ; generate vector constants - vdup.16 d30, r3 + vdup.16 d30, r8 vdup.16 d31, r12 ; (used) two for inputs (regA-regD), one for constants (q15) ; do some multiplications (ordered for maximum latency hiding) @@ -153,15 +288,22 @@ cospi_31_64 EQU 804 MEND ; -------------------------------------------------------------------------- -;void idct32_transpose_and_transform(int16_t *transpose_buffer, int16_t *output, int16_t *input); +;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); ; -; r0 int16_t *transpose_buffer -; r1 int16_t *output -; r2 int16_t *input) -; TODO(cd): have more logical parameter ordering but this issue will disappear -; when functions are combined. +; r0 int16_t *input, +; r1 uint8_t *dest, +; r2 int dest_stride) +; loop counters +; r4 bands loop counter +; r5 pass loop counter +; r8 transpose loop counter +; combine-add pointers +; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) +; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) +; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) +; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) -|idct32_transpose_and_transform| PROC +|vp9_idct32x32_1024_add_neon| PROC ; This function does one pass of idct32x32 transform. ; ; This is done by transposing the input and then doing a 1d transform on @@ -171,43 +313,73 @@ cospi_31_64 EQU 804 ; The 1d transform is done by looping over bands of eight columns (the ; idct32_bands loop). For each band, the transform input transposition ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are trsnposed by pairs (the idct32_transpose_pair loop). - push {r4} - mov r4, #0 ; initialize bands loop counter + ; matrices are transposed by pairs (the idct32_transpose_pair loop). + push {r4-r11} + vpush {d8-d15} + ; stack operation + ; internal buffer used to transpose 8 lines into before transforming them + ; int16_t transpose_buffer[32 * 8]; + ; at sp + [4096, 4607] + ; results of the first pass (transpose and transform rows) + ; int16_t pass1[32 * 32]; + ; at sp + [0, 2047] + ; results of the second pass (transpose and transform columns) + ; int16_t pass2[32 * 32]; + ; at sp + [2048, 4095] + sub sp, sp, #512+2048+2048 + + ; r6 = dest + 31 * dest_stride + ; r7 = dest + 0 * dest_stride + ; r9 = dest + 15 * dest_stride + ; r10 = dest + 16 * dest_stride + rsb r6, r2, r2, lsl #5 + rsb r9, r2, r2, lsl #4 + add r10, r1, r2, lsl #4 + mov r7, r1 + add r6, r6, r1 + add r9, r9, r1 + ; r11 = -dest_stride + neg r11, r2 + ; r3 = input + mov r3, r0 + ; parameters for first pass + ; r0 = transpose_buffer[32 * 8] + add r0, sp, #4096 + ; r1 = pass1[32 * 32] + mov r1, sp + + mov r5, #0 ; initialize pass loop counter +idct32_pass_loop + mov r4, #4 ; initialize bands loop counter idct32_bands_loop - ; TODO(cd) get rid of these push/pop by properly adjusting register - ; content at end of loop - push {r0} - push {r1} - push {r2} - mov r3, #0 ; initialize transpose loop counter + mov r8, #2 ; initialize transpose loop counter idct32_transpose_pair_loop ; Load two horizontally consecutive 8x8 16bit data matrices. The first one ; into q0-q7 and the second one into q8-q15. There is a stride of 64, ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r2]! 
- vld1.s16 {q0}, [r2]! - add r2, #32 - vld1.s16 {q9}, [r2]! - vld1.s16 {q1}, [r2]! - add r2, #32 - vld1.s16 {q10}, [r2]! - vld1.s16 {q2}, [r2]! - add r2, #32 - vld1.s16 {q11}, [r2]! - vld1.s16 {q3}, [r2]! - add r2, #32 - vld1.s16 {q12}, [r2]! - vld1.s16 {q4}, [r2]! - add r2, #32 - vld1.s16 {q13}, [r2]! - vld1.s16 {q5}, [r2]! - add r2, #32 - vld1.s16 {q14}, [r2]! - vld1.s16 {q6}, [r2]! - add r2, #32 - vld1.s16 {q15}, [r2]! - vld1.s16 {q7}, [r2]! + vld1.s16 {q8}, [r3]! + vld1.s16 {q0}, [r3]! + add r3, #32 + vld1.s16 {q9}, [r3]! + vld1.s16 {q1}, [r3]! + add r3, #32 + vld1.s16 {q10}, [r3]! + vld1.s16 {q2}, [r3]! + add r3, #32 + vld1.s16 {q11}, [r3]! + vld1.s16 {q3}, [r3]! + add r3, #32 + vld1.s16 {q12}, [r3]! + vld1.s16 {q4}, [r3]! + add r3, #32 + vld1.s16 {q13}, [r3]! + vld1.s16 {q5}, [r3]! + add r3, #32 + vld1.s16 {q14}, [r3]! + vld1.s16 {q6}, [r3]! + add r3, #32 + vld1.s16 {q15}, [r3]! + vld1.s16 {q7}, [r3]! ; Transpose the two 8x8 16bit data matrices. vswp d17, d24 @@ -255,11 +427,13 @@ idct32_transpose_pair_loop vst1.16 {q7}, [r0]! ; increment pointers by adjusted stride (not necessary for r0/out) - sub r2, r2, #8*32*2-32-16*2 + ; go back by 7*32 for the seven lines moved fully by read and add + ; go back by 32 for the eigth line only read + ; advance by 16*2 to go the next pair + sub r3, r3, #7*32*2 + 32 - 16*2 ; transpose pair loop processing - add r3, r3, #1 - cmp r3, #1 - BLE idct32_transpose_pair_loop + subs r8, r8, #1 + bne idct32_transpose_pair_loop ; restore r0/input to its original value sub r0, r0, #32*8*2 @@ -815,21 +989,26 @@ idct32_transpose_pair_loop vadd.s16 q9, q5, q0 vsub.s16 q6, q5, q0 vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 17, 17, 16, q7, q6 - STORE_IN_OUTPUT 16, 15, 14, q9, q8 + + cmp r5, #0 + bgt idct32_bands_end_2nd_pass + +idct32_bands_end_1st_pass + STORE_IN_OUTPUT 17, 16, 17, q6, q7 + STORE_IN_OUTPUT 17, 14, 15, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; ;output[30 * 32] = step1b[1][i] - step1b[30][i]; ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 14, 30, 31, q0, q1 + LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 31, 30, q7, q6 - STORE_IN_OUTPUT 30, 0, 1, q4, q5 + STORE_IN_OUTPUT 31, 30, 31, q6, q7 + STORE_IN_OUTPUT 31, 0, 1, q4, q5 ; -------------------------------------------------------------------------- ; part of stage 7 ;step1[2] = step1b[2][i] + step1b[13][i]; @@ -848,25 +1027,25 @@ idct32_transpose_pair_loop ;output[18 * 32] = step1b[13][i] - step1b[18][i]; ;output[19 * 32] = step1b[12][i] - step1b[19][i]; LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 19, 19, 18, q9, q8 - STORE_IN_OUTPUT 18, 13, 12, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 19, 18, 19, q6, q7 + STORE_IN_OUTPUT 19, 12, 13, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; ;output[28 * 32] = step1b[3][i] - step1b[28][i]; ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 12, 28, 29, q0, q1 + LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 
vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 29, 28, q7, q6 - STORE_IN_OUTPUT 28, 2, 3, q4, q5 + STORE_IN_OUTPUT 29, 28, 29, q6, q7 + STORE_IN_OUTPUT 29, 2, 3, q4, q5 ; -------------------------------------------------------------------------- ; part of stage 7 ;step1[4] = step1b[4][i] + step1b[11][i]; @@ -885,25 +1064,25 @@ idct32_transpose_pair_loop ;output[20 * 32] = step1b[11][i] - step1b[20][i]; ;output[21 * 32] = step1b[10][i] - step1b[21][i]; LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 21, 21, 20, q9, q8 - STORE_IN_OUTPUT 20, 11, 10, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 21, 20, 21, q6, q7 + STORE_IN_OUTPUT 21, 10, 11, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; ;output[26 * 32] = step1b[5][i] - step1b[26][i]; ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 10, 26, 27, q0, q1 + LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 27, 26, q7, q6 - STORE_IN_OUTPUT 26, 4, 5, q4, q5 + STORE_IN_OUTPUT 27, 26, 27, q6, q7 + STORE_IN_OUTPUT 27, 4, 5, q4, q5 ; -------------------------------------------------------------------------- ; part of stage 7 ;step1[6] = step1b[6][i] + step1b[9][i]; @@ -922,92 +1101,199 @@ idct32_transpose_pair_loop ;output[22 * 32] = step1b[9][i] - step1b[22][i]; ;output[23 * 32] = step1b[8][i] - step1b[23][i]; LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 23, 23, 22, q9, q8 - STORE_IN_OUTPUT 22, 9, 8, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 23, 22, 23, q6, q7 + STORE_IN_OUTPUT 23, 8, 9, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; ;output[24 * 32] = step1b[7][i] - step1b[24][i]; ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 8, 24, 25, q0, q1 + LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 25, 24, q7, q6 - STORE_IN_OUTPUT 24, 6, 7, q4, q5 - ; -------------------------------------------------------------------------- + STORE_IN_OUTPUT 25, 24, 25, q6, q7 + STORE_IN_OUTPUT 25, 6, 7, q4, q5 - ; TODO(cd) get rid of these push/pop by properly adjusting register - ; content at end of loop - pop {r2} - pop {r1} - pop {r0} - add r1, r1, #8*2 - add r2, r2, #8*32*2 + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #7*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 ; bands loop processing - add r4, r4, #1 - cmp r4, #3 - BLE idct32_bands_loop + subs r4, r4, #1 + bne idct32_bands_loop - pop {r4} - bx lr - ENDP ; |idct32_transpose_and_transform| + ; parameters for second pass + ; the input of pass2 is the result 
of pass1. we have to remove the offset + ; of 32 columns induced by the above idct32_bands_loop + sub r3, r1, #32*2 + ; r1 = pass2[32 * 32] + add r1, sp, #2048 -;void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); -; -; r0 uint8_t *dest -; r1 int16_t *out -; r2 int dest_stride) - -|idct32_combine_add| PROC - - mov r12, r0 ; dest pointer used for stores - sub r2, r2, #32 ; adjust the stride (remove the post-increments) - mov r3, #0 ; initialize loop counter - -idct32_combine_add_loop - ; load out[j * 32 + 0-31] - vld1.s16 {q12}, [r1]! - vld1.s16 {q13}, [r1]! - vld1.s16 {q14}, [r1]! - vld1.s16 {q15}, [r1]! - ; load dest[j * dest_stride + 0-31] - vld1.s16 {q6}, [r0]! - vld1.s16 {q7}, [r0]! - ; ROUND_POWER_OF_TWO - vrshr.s16 q12, q12, #6 - vrshr.s16 q13, q13, #6 - vrshr.s16 q14, q14, #6 - vrshr.s16 q15, q15, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q12, q12, d12 - vaddw.u8 q13, q13, d13 - vaddw.u8 q14, q14, d14 - vaddw.u8 q15, q15, d15 - ; clip pixel - vqmovun.s16 d12, q12 - vqmovun.s16 d13, q13 - vqmovun.s16 d14, q14 - vqmovun.s16 d15, q15 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {q6}, [r12]! - vst1.16 {q7}, [r12]! - ; increment pointers by adjusted stride (not necessary for r1/out) - add r0, r0, r2 - add r12, r12, r2 - ; loop processing - add r3, r3, #1 - cmp r3, #31 - BLE idct32_combine_add_loop + ; pass loop processing + add r5, r5, #1 + B idct32_pass_loop - bx lr - ENDP ; |idct32_transpose| +idct32_bands_end_2nd_pass + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - 
step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; restore pointers to their initial indices for next band pass by + ; removing/adding dest_stride * 8. The actual increment by eight + ; is taken care of within the _LAST macros. 
+ add r6, r6, r2, lsl #3 + add r9, r9, r2, lsl #3 + sub r7, r7, r2, lsl #3 + sub r10, r10, r2, lsl #3 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #25*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; stack operation + add sp, sp, #512+2048+2048 + vpop {d8-d15} + pop {r4-r11} + bx lr + ENDP ; |vp9_idct32x32_1024_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm index 869ee5f..0d4a721 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm @@ -8,21 +8,21 @@ ; - EXPORT |vp9_short_idct4x4_1_add_neon| + EXPORT |vp9_idct4x4_1_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, +;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct4x4_1_add_neon| PROC +|vp9_idct4x4_1_add_neon| PROC ldrsh r0, [r0] ; generate cospi_16_64 = 11585 @@ -63,6 +63,6 @@ vst1.32 {d7[1]}, [r12] bx lr - ENDP ; |vp9_short_idct4x4_1_add_neon| + ENDP ; |vp9_idct4x4_1_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm index 640fb93..00283fc 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm @@ -8,7 +8,7 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_idct4x4_add_neon| + EXPORT |vp9_idct4x4_16_add_neon| ARM REQUIRE8 PRESERVE8 @@ -16,13 +16,13 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 AREA Block, CODE, READONLY ; name this block of code -;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct4x4_add_neon| PROC +|vp9_idct4x4_16_add_neon| PROC ; The 2D transform is done with two passes which are actually pretty ; similar. We first transform the rows. 
This is done by transposing @@ -185,6 +185,6 @@ vst1.32 {d26[1]}, [r1], r2 vst1.32 {d26[0]}, [r1] ; no post-increment bx lr - ENDP ; |vp9_short_idct4x4_add_neon| + ENDP ; |vp9_idct4x4_16_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm index 923804f..421d202 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm @@ -8,21 +8,21 @@ ; - EXPORT |vp9_short_idct8x8_1_add_neon| + EXPORT |vp9_idct8x8_1_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, +;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct8x8_1_add_neon| PROC +|vp9_idct8x8_1_add_neon| PROC ldrsh r0, [r0] ; generate cospi_16_64 = 11585 @@ -83,6 +83,6 @@ vst1.64 {d31}, [r12], r2 bx lr - ENDP ; |vp9_short_idct8x8_1_add_neon| + ENDP ; |vp9_idct8x8_1_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm index a744f59..5476400 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -8,8 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_idct8x8_add_neon| - EXPORT |vp9_short_idct10_8x8_add_neon| + EXPORT |vp9_idct8x8_64_add_neon| + EXPORT |vp9_idct8x8_10_add_neon| ARM REQUIRE8 PRESERVE8 @@ -198,13 +198,13 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct8x8_add_neon| PROC +|vp9_idct8x8_64_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! @@ -308,15 +308,15 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_short_idct8x8_add_neon| + ENDP ; |vp9_idct8x8_64_add_neon| -;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct10_8x8_add_neon| PROC +|vp9_idct8x8_10_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! @@ -514,6 +514,6 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_short_idct10_8x8_add_neon| + ENDP ; |vp9_idct8x8_10_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm index 963ef35..2f326e2 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm @@ -8,7 +8,7 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_iht4x4_add_neon| + EXPORT |vp9_iht4x4_16_add_neon| ARM REQUIRE8 PRESERVE8 @@ -139,7 +139,7 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest, +;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride, int tx_type) ; ; r0 int16_t input @@ -147,7 +147,7 @@ ; r2 int dest_stride ; r3 int tx_type) ; This function will only handle tx_type of 1,2,3. 
-|vp9_short_iht4x4_add_neon| PROC +|vp9_iht4x4_16_add_neon| PROC ; load the inputs into d16-d19 vld1.s16 {q8,q9}, [r0]! @@ -175,7 +175,7 @@ iadst_idct ; then transform columns IADST4x4_1D - b end_vp9_short_iht4x4_add_neon + b end_vp9_iht4x4_16_add_neon idct_iadst ; generate constants @@ -191,7 +191,7 @@ idct_iadst ; then transform columns IDCT4x4_1D - b end_vp9_short_iht4x4_add_neon + b end_vp9_iht4x4_16_add_neon iadst_iadst ; generate constants @@ -206,7 +206,7 @@ iadst_iadst ; then transform columns IADST4x4_1D -end_vp9_short_iht4x4_add_neon +end_vp9_iht4x4_16_add_neon ; ROUND_POWER_OF_TWO(temp_out[j], 4) vrshr.s16 q8, q8, #4 vrshr.s16 q9, q9, #4 @@ -232,6 +232,6 @@ end_vp9_short_iht4x4_add_neon vst1.32 {d26[1]}, [r1], r2 vst1.32 {d26[0]}, [r1] ; no post-increment bx lr - ENDP ; |vp9_short_iht4x4_add_neon| + ENDP ; |vp9_iht4x4_16_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm index bab9cb4..93d3af3 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm @@ -8,7 +8,7 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_iht8x8_add_neon| + EXPORT |vp9_iht8x8_64_add_neon| ARM REQUIRE8 PRESERVE8 @@ -559,7 +559,7 @@ AREA Block, CODE, READONLY ; name this block of code -;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest, +;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride, int tx_type) ; ; r0 int16_t input @@ -567,7 +567,7 @@ ; r2 int dest_stride ; r3 int tx_type) ; This function will only handle tx_type of 1,2,3. -|vp9_short_iht8x8_add_neon| PROC +|vp9_iht8x8_64_add_neon| PROC ; load the inputs into d16-d19 vld1.s16 {q8,q9}, [r0]! @@ -602,7 +602,7 @@ iadst_idct ; then transform columns IADST8X8_1D - b end_vp9_short_iht8x8_add_neon + b end_vp9_iht8x8_64_add_neon idct_iadst ; generate IADST constants @@ -620,7 +620,7 @@ idct_iadst ; then transform columns IDCT8x8_1D - b end_vp9_short_iht8x8_add_neon + b end_vp9_iht8x8_64_add_neon iadst_iadst ; generate IADST constants @@ -635,7 +635,7 @@ iadst_iadst ; then transform columns IADST8X8_1D -end_vp9_short_iht8x8_add_neon +end_vp9_iht8x8_64_add_neon pop {r0-r10} ; ROUND_POWER_OF_TWO(temp_out[j], 5) @@ -691,6 +691,6 @@ end_vp9_short_iht8x8_add_neon vst1.64 {d6}, [r0], r2 vst1.64 {d7}, [r0], r2 bx lr - ENDP ; |vp9_short_iht8x8_add_neon| + ENDP ; |vp9_iht8x8_64_add_neon| END diff --git a/libvpx/vp9/common/generic/vp9_systemdependent.c b/libvpx/vp9/common/generic/vp9_systemdependent.c index f144721..536febb 100644 --- a/libvpx/vp9/common/generic/vp9_systemdependent.c +++ b/libvpx/vp9/common/generic/vp9_systemdependent.c @@ -10,7 +10,7 @@ #include "./vpx_config.h" -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_onyxc_int.h" void vp9_machine_specific_config(VP9_COMMON *cm) { diff --git a/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h b/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h new file mode 100644 index 0000000..644264f --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_common_dspr2.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_COMMON_DSPR2_H_ +#define VP9_COMMON_VP9_COMMON_DSPR2_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" + +#if HAVE_DSPR2 +#define CROP_WIDTH 512 +extern uint8_t *vp9_ff_cropTbl; + +#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) ({ \ + \ + int32_t tmp, out; \ + int dct_cost_rounding = DCT_CONST_ROUNDING; \ + int in = input; \ + \ + __asm__ __volatile__ ( \ + /* out = dct_const_round_shift(input_dc * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac1 \n\t"\ + "mthi $zero, $ac1 \n\t"\ + "madd $ac1, %[in], %[cospi_16_64] \n\t"\ + "extp %[tmp], $ac1, 31 \n\t"\ + \ + /* out = dct_const_round_shift(out * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac2 \n\t"\ + "mthi $zero, $ac2 \n\t"\ + "madd $ac2, %[tmp], %[cospi_16_64] \n\t"\ + "extp %[out], $ac2, 31 \n\t"\ + \ + : [tmp] "=&r" (tmp), [out] "=r" (out) \ + : [in] "r" (in), \ + [dct_cost_rounding] "r" (dct_cost_rounding), \ + [cospi_16_64] "r" (cospi_16_64) \ + ); \ + out; }) + +static INLINE void vp9_prefetch_load(const unsigned char *src) { + __asm__ __volatile__ ( + "pref 0, 0(%[src]) \n\t" + : + : [src] "r" (src) + ); +} + +/* prefetch data for store */ +static INLINE void vp9_prefetch_store(unsigned char *dst) { + __asm__ __volatile__ ( + "pref 1, 0(%[dst]) \n\t" + : + : [dst] "r" (dst) + ); +} + +static INLINE void vp9_prefetch_load_streamed(const unsigned char *src) { + __asm__ __volatile__ ( + "pref 4, 0(%[src]) \n\t" + : + : [src] "r" (src) + ); +} + +/* prefetch data for store */ +static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) { + __asm__ __volatile__ ( + "pref 5, 0(%[dst]) \n\t" + : + : [dst] "r" (dst) + ); +} + +void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); + +void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, + int w, int h); + +void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +#endif // #if HAVE_DSPR2 +#endif // VP9_COMMON_VP9_COMMON_DSPR2_H_ diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c new file mode 100644 index 0000000..91d62bc --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [p1] "=&r" (p1), [p2] "=&r" (p2), + [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [filter45] "r" (filter45), [vector4a] "r" (vector4a), + [src_stride] "r" (src_stride), [cm] "r" (cm), + [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45;; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + vp9_prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [p1] "=&r" (p1), [p2] "=&r" (p2), + [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [filter45] "r" (filter45), [vector4a] "r" (vector4a), + [src_stride] "r" (src_stride), [cm] "r" (cm), + [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... 
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h) {
+  if (16 == y_step_q4) {
+    uint32_t pos = 38;
+
+    /* bit position for extract from acc */
+    __asm__ __volatile__ (
+      "wrdsp %[pos], 1 \n\t"
+      :
+      : [pos] "r" (pos)
+    );
+
+    vp9_prefetch_store(dst);
+
+    switch (w) {
+      case 4:
+      case 8:
+      case 16:
+      case 32:
+        convolve_bi_avg_vert_4_dspr2(src, src_stride,
+                                     dst, dst_stride,
+                                     filter_y, w, h);
+        break;
+      case 64:
+        vp9_prefetch_store(dst + 32);
+        convolve_bi_avg_vert_64_dspr2(src, src_stride,
+                                      dst, dst_stride,
+                                      filter_y, h);
+        break;
+      default:
+        vp9_convolve8_avg_vert_c(src, src_stride,
+                                 dst, dst_stride,
+                                 filter_x, x_step_q4,
+                                 filter_y, y_step_q4,
+                                 w, h);
+        break;
+    }
+  } else {
+    vp9_convolve8_avg_vert_c(src, src_stride,
+                             dst, dst_stride,
+                             filter_x, x_step_q4,
+                             filter_y, y_step_q4,
+                             w, h);
+  }
+}
+#endif
diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
new file mode 100644
index 0000000..148b20f
--- /dev/null
+++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
+                                          int32_t src_stride,
+                                          uint8_t *dst,
+                                          int32_t dst_stride,
+                                          const int16_t *filter_x0,
+                                          int32_t h) {
+  int32_t y;
+  uint8_t *cm = vp9_ff_cropTbl;
+  int32_t Temp1, Temp2, Temp3, Temp4;
+  uint32_t vector4a = 64;
+  uint32_t tp1, tp2;
+  uint32_t p1, p2, p3;
+  uint32_t tn1, tn2;
+  const int16_t *filter = &filter_x0[3];
+  uint32_t filter45;
+
+  filter45 = ((const int32_t *)filter)[0];
+
+  for (y = h; y--;) {
+    /* prefetch data to cache memory */
+    vp9_prefetch_load(src + src_stride);
+    vp9_prefetch_load(src + src_stride + 32);
+    vp9_prefetch_store(dst + dst_stride);
+
+    __asm__ __volatile__ (
+        "ulw %[tp1], 0(%[src]) \n\t"
+        "ulw %[tp2], 4(%[src]) \n\t"
+
+        /* even 1. pixel */
+        "mtlo %[vector4a], $ac3 \n\t"
+        "mthi $zero, $ac3 \n\t"
+        "preceu.ph.qbr %[p1], %[tp1] \n\t"
+        "preceu.ph.qbl %[p2], %[tp1] \n\t"
+        "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+        "extp %[Temp1], $ac3, 31 \n\t"
+
+        /* even 2. pixel */
+        "mtlo %[vector4a], $ac2 \n\t"
+        "mthi $zero, $ac2 \n\t"
+        "balign %[tp2], %[tp1], 3 \n\t"
+        "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+        "extp %[Temp3], $ac2, 31 \n\t"
+
+        "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+        /* odd 1.
pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) + : [filter45] "r" (filter45), [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3, tp4; + uint32_t p1, p2, p3, p4, n1; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45;; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tp4], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tp4], %[tp4], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tp4], 2(%[dst]) \n\t" + + /* even 4. 
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tp3], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp4], 5(%[dst]) \n\t" + + /* odd 4. pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tp1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tp3], %[tp3], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tp4], %[tp4], %[p2] \n\t" + + "lbux %[p1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[p1] \n\t" + + /* store bytes */ + "sb %[tp3], 3(%[dst]) \n\t" + "sb %[tp4], 5(%[dst]) \n\t" + "sb %[tp1], 7(%[dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [st0] "=&r" (st0), [st1] "=&r" (st1), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter45] "r" (filter45), [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45;; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [qload3] "=&r" (qload3), [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter45] "r" (filter45), [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45;; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_load(src_ptr + src_stride + 64); + vp9_prefetch_store(dst_ptr + dst_stride); + vp9_prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [qload3] "=&r" (qload3), [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter45] "r" (filter45), [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (16 == x_step_q4) { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + vp9_prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_avg_horiz_4_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h); + break; + case 8: + convolve_bi_avg_horiz_8_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h); + break; + case 16: + convolve_bi_avg_horiz_16_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h, 1); + break; + case 32: + convolve_bi_avg_horiz_16_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h, 2); + break; + case 64: + vp9_prefetch_load(src + 64); + vp9_prefetch_store(dst + 32); + + convolve_bi_avg_horiz_64_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h); + break; + default: + vp9_convolve8_avg_horiz_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_avg_horiz_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } +} +#endif diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c new file mode 100644 index 0000000..92644f2 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c @@ -0,0 +1,784 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_filter.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint8_t *dst_ptr; + int32_t Temp1, Temp2; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp2](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp1](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [p1] "=&r" (p1), [p2] "=&r" (p2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [dst_ptr] "+r" (dst_ptr) + : [filter45] "r" (filter45),[vector4a] "r" (vector4a), + [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride) + ); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[Temp1], %[p3](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[Temp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) + : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm), + [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) + ); + + /* Next row... 
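+       Editorial aside, not part of this change: the "_transposed" kernels
+       in this file write each filtered row down a column of dst --
+       consecutive outputs land one dst_stride apart (two cursors, dst_ptr
+       and odd_dst, each stepping dst_pitch_2 = 2 * dst_stride) and the
+       row loop advances dst by 1 -- so this horizontal pass emits its
+       output transposed, letting a second horizontal pass stand in for
+       the vertical one. Per output x of row y it is the same step as the
+       convolve_bi_horiz_transposed() fallback defined later in this file:
+
+         sum = src[x] * filter[3] + src[x + 1] * filter[4];
+         dst[x * dst_stride] =
+             clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));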
*/ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h, + int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload1], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p5], %[qload1] \n\t" + "ulw %[qload2], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload2] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload2] \n\t" + "preceu.ph.qbl %[p5], %[qload2] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) + : [filter45] "r" (filter45), [vector_64] "r" (vector_64), + [cm] "r" (cm), + [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) + ); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload1], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p5], %[qload1] \n\t" + "ulw %[qload2], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload2] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload2] \n\t" + "preceu.ph.qbl %[p5], %[qload2] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) + : [filter45] "r" (filter45), [vector_64] "r" (vector_64), + [cm] "r" (cm), + [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) + ); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
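+       Editorial aside, illustrative only: each asm pass above emits 16
+       transposed outputs through two interleaved cursors -- evens via dst,
+       odds via odd_dst, both stepping dst_pitch_2 = 2 * dst_stride. With
+       results[16] standing in for the filtered bytes of block c, the
+       store pattern is:
+
+         for (k = 0; k < 8; ++k) {
+           dst[(2 * k) * dst_stride]     = results[2 * k];
+           dst[(2 * k + 1) * dst_stride] = results[2 * k + 1];
+         }
+         dst = dst_ptr + (c + 1) * 16 * dst_stride;  // next 16 rows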
*/ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + sum += src[x] * filter[3]; + sum += src[x + 1] * filter[4]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} + +void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, + int w, int h) { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_bi_horiz_4_transposed_dspr2(src, src_stride, + dst, dst_stride, + filter, h); + break; + case 8: + convolve_bi_horiz_8_transposed_dspr2(src, src_stride, + dst, dst_stride, + filter, h); + break; + case 16: + case 32: + convolve_bi_horiz_16_transposed_dspr2(src, src_stride, + dst, dst_stride, + filter, h, + (w/16)); + break; + case 64: + vp9_prefetch_load(src + 32); + convolve_bi_horiz_64_transposed_dspr2(src, src_stride, + dst, dst_stride, + filter, h); + break; + default: + convolve_bi_horiz_transposed(src, src_stride, + dst, dst_stride, + filter, w, h); + break; + } +} +#endif diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c new file mode 100644 index 0000000..1debdb4 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c @@ -0,0 +1,713 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_bi_horiz_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45;; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. 
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[p1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[p2], 3(%[dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [p1] "=&r" (p1), [p2] "=&r" (p2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) + : [filter45] "r" (filter45), [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_8_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45;; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[p1], 7(%[dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), + [st0] "=&r" (st0), [st1] "=&r" (st1), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter45] "r" (filter45), [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45;; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter45] "r" (filter45), [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45;; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_load(src_ptr + src_stride + 64); + vp9_prefetch_store(dst_ptr + dst_stride); + vp9_prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter45] "r" (filter45), [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (16 == x_step_q4) { + uint32_t pos = 38; + + vp9_prefetch_load((const uint8_t *)filter_x); + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + vp9_prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h); + break; + case 8: + convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h); + break; + case 16: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h, 1); + break; + case 32: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h, 2); + break; + case 64: + vp9_prefetch_load(src + 64); + vp9_prefetch_store(dst + 32); + + convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h); + break; + default: + vp9_convolve8_horiz_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_horiz_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } +} +#endif diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c new file mode 100644 index 0000000..bf01f11 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c @@ -0,0 +1,266 @@ 
+/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_bi_vert_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [p1] "=&r" (p1), [p2] "=&r" (p2), + [scratch1] "=&r" (scratch1), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [filter45] "r" (filter45),[vector4a] "r" (vector4a), + [src_stride] "r" (src_stride), + [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... 
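*/

/* Scalar form of the vertical 2-tap pass above, as a sketch (FILTER_BITS
   is 7, clip_pixel() as in vp9_common.h):

       for (x = 0; x < w; x++) {
         int sum = src[x]              * filter_y[3] +
                   src[x + src_stride] * filter_y[4];
         dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
       }

   The assembly handles four such columns per iteration: precrq.ph.w and
   append pair the same column from the two rows, so each dpa.w.ph applies
   both taps at once and $ac0..$ac3 carry four columns in parallel. */

/* Next row...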
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_vert_64_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [p1] "=&r" (p1), [p2] "=&r" (p2), + [scratch1] "=&r" (scratch1), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [filter45] "r" (filter45),[vector4a] "r" (vector4a), + [src_stride] "r" (src_stride), + [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (16 == y_step_q4) { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + vp9_prefetch_store(dst); + + switch (w) { + case 4 : + case 8 : + case 16 : + case 32 : + convolve_bi_vert_4_dspr2(src, src_stride, + dst, dst_stride, + filter_y, w, h); + break; + case 64 : + vp9_prefetch_store(dst + 32); + convolve_bi_vert_64_dspr2(src, src_stride, + dst, dst_stride, + filter_y, h); + break; + default: + vp9_convolve8_vert_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_vert_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } +} +#endif diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c new file mode 100644 index 0000000..ab18490 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c @@ -0,0 +1,695 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_avg_vert_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + 
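/* preceu.ph.qbr/qbl widen the four packed bytes of each row into two
   16-bit halves; the precrq.ph.w/append pair below then puts the same
   column from two adjacent rows into one register, so each dpa.w.ph
   accumulates two vertical taps for one output pixel.  vector1b..vector4b
   are filter_y's eight taps read as four packed pairs (little-endian). */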
"precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [p1] "=&r" (p1), 
[p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), + [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_vert_64_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + vp9_prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], 
%[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), + [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... 
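*/

/* What the "avg" in these kernels adds: addqh_r.w is a halving add with
   rounding, so each filtered pixel is averaged with the byte already in
   dst.  Scalar sketch of one output (src pre-shifted up three rows, as
   the function arranges with src -= 3 * src_stride):

       int k, sum = 0;
       for (k = 0; k < 8; k++)
         sum += src[x + k * src_stride] * filter_y[k];
       px = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
       dst[x] = ROUND_POWER_OF_TWO(dst[x] + px, 1);
*/

/* Next row...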
*/ + src += src_stride; + dst += dst_stride; + } +} + +void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (((const int32_t *)filter_y)[1] == 0x800000) { + vp9_convolve_avg(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else if (((const int32_t *)filter_y)[0] == 0) { + vp9_convolve2_avg_vert_dspr2(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else { + if (16 == y_step_q4) { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + vp9_prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_avg_vert_4_dspr2(src, src_stride, + dst, dst_stride, + filter_y, w, h); + break; + case 64: + vp9_prefetch_store(dst + 32); + convolve_avg_vert_64_dspr2(src, src_stride, + dst, dst_stride, + filter_y, h); + break; + default: + vp9_convolve8_avg_vert_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_avg_vert_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } + } +} + +void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + + assert(w <= 64); + assert(h <= 64); + + if (intermediate_height < h) + intermediate_height = h; + + if (x_step_q4 != 16 || y_step_q4 != 16) + return vp9_convolve8_avg_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + + vp9_convolve8_horiz(src - (src_stride * 3), src_stride, + temp, 64, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, intermediate_height); + + vp9_convolve8_avg_vert(temp + 64 * 3, 64, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); +} + +void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + uint32_t tp1, tp2, tn1; + uint32_t tp3, tp4, tn2; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + vp9_prefetch_store(dst); + + switch (w) { + case 4: + /* 1 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + + : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1), + [tp2] "=&r" (tp2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + break; + case 8: + /* 2 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 
4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + break; + case 16: + /* 4 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + break; + case 32: + /* 8 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + break; + case 64: + vp9_prefetch_load(src + 64); + vp9_prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_load(src + src_stride + 64); + vp9_prefetch_store(dst + 
dst_stride); + vp9_prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 32(%[dst]) \n\t" + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + "ulw %[tp3], 36(%[src]) \n\t" + "ulw %[tp4], 36(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 40(%[src]) \n\t" + "ulw %[tp2], 40(%[dst]) \n\t" + "sw %[tn1], 32(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 36(%[dst]) \n\t" /* store */ + "ulw %[tp3], 44(%[src]) \n\t" + "ulw %[tp4], 44(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 48(%[src]) \n\t" + "ulw %[tp2], 48(%[dst]) \n\t" + "sw %[tn1], 40(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 44(%[dst]) \n\t" /* store */ + "ulw %[tp3], 52(%[src]) \n\t" + "ulw %[tp4], 52(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 56(%[src]) \n\t" + "ulw %[tp2], 56(%[dst]) \n\t" + "sw %[tn1], 48(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 52(%[dst]) \n\t" /* store */ + "ulw %[tp3], 60(%[src]) \n\t" + "ulw %[tp4], 60(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 56(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + break; + default: + for (y = h; y > 0; --y) { + for (x = 0; x < w; ++x) { + dst[x] = (dst[x] + src[x] + 1) >> 1; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c new file mode 100644 index 0000000..69da1cf --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c @@ -0,0 +1,1038 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_avg_horiz_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. 
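pixel */

/* vector1b..vector4b are ((const int32_t *)filter_x0)[0..3]: the eight
   int16 taps consumed as four packed pairs, one per dpa.w.ph.  Note how
   the lbu loads of the current dst bytes are threaded between the
   dpa/extp groups, so the averaging inputs are fetched while the
   accumulators are still busy and the load latency is hidden. */

/* odd 2.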
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_8_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tn3], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. 
pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tn3], %[tn3], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tn3], 2(%[dst]) \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tn2], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn3], 5(%[dst]) \n\t" + + /* odd 4. 
pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tn1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tn2], %[tn2], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tn3], %[tn3], %[p2] \n\t" + + "lbux %[n1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tn1], %[tn1], %[n1] \n\t" + + /* store bytes */ + "sb %[tn2], 3(%[dst]) \n\t" + "sb %[tn3], 5(%[dst]) \n\t" + "sb %[tn1], 7(%[dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), + [st0] "=&r" (st0), [st1] "=&r" (st1), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. 
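pixel */

filter12/filter34/filter56/filter78 are the eight 16-bit taps reread as four 32-bit words, two taps per word, which is the layout dpa.w.ph consumes: each dpa.w.ph performs two 16x16 multiply-accumulates into the selected accumulator. A scalar model of one such instruction (packing assumed little-endian, matching the ((const int32_t *)filter_x0)[n] loads):

  #include <stdint.h>

  /* Scalar model of dpa.w.ph: dot product of paired halfwords, accumulated. */
  static int64_t dpa_w_ph(int64_t acc, uint32_t px_pair, uint32_t tap_pair) {
    int16_t p0 = (int16_t)(px_pair & 0xffff), p1 = (int16_t)(px_pair >> 16);
    int16_t t0 = (int16_t)(tap_pair & 0xffff), t1 = (int16_t)(tap_pair >> 16);
    return acc + p0 * t0 + p1 * t1;
  }

Four such instructions cover all eight taps, which is why every pixel block above issues exactly four dpa.w.ph before its extp.

/* even 7.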
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. 
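pixel */

Every lbux %[stN], %[TempN](%[cm]) above clamps a filtered value with a single indexed byte load instead of compares and branches: cm points CROP_WIDTH bytes into a table whose low pad is 0, middle is the identity, and high pad is 255 (the table is built by vp9_dsputil_static_init, later in this diff). A sketch of the construction, with the pad size assumed:

  #include <stdint.h>
  #define CROP_PAD 512            /* assumed; the real constant is CROP_WIDTH */

  static uint8_t crop_a[256 + 2 * CROP_PAD];
  static uint8_t *crop = &crop_a[CROP_PAD];  /* crop[v] == clamp(v, 0, 255) */

  static void crop_init(void) {
    int i;
    for (i = 0; i < CROP_PAD; i++) crop_a[i] = 0;
    for (i = 0; i < 256; i++) crop_a[CROP_PAD + i] = (uint8_t)i;
    for (i = 0; i < CROP_PAD; i++) crop_a[CROP_PAD + 256 + i] = 255;
  }

/* odd 4.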
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. 
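pixel */

The mtlo %[vector_64]/mthi $zero pairs seed each accumulator with the rounding constant 64, and extp ..., 31 then extracts 32 bits ending at the position programmed by wrdsp (pos = 38 in the dispatchers below), i.e. bits 38..7 of the accumulator. Together that is the standard filter rounding; a minimal sketch:

  #include <stdint.h>

  /* (sum + 64) >> 7 == ROUND_POWER_OF_TWO(sum, FILTER_BITS), FILTER_BITS == 7.
   * The +64 comes from the mtlo seed; the >> 7 from extracting at bit 7. */
  static int filter_round(int64_t acc /* already includes the +64 seed */) {
    return (int)(acc >> 7);
  }

/* odd 8.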
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [qload3] "=&r" (qload3), [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_load(src_ptr + src_stride + 64); + vp9_prefetch_store(dst_ptr + dst_stride); + vp9_prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. 
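pixel */

Each row iteration of these kernels prefetches the next source row (plus +32, and +64 for 64-wide blocks) and the next destination row before any arithmetic, so the following iteration's loads and the read-modify-write stores hit cache. vp9_prefetch_load/vp9_prefetch_store wrap the MIPS pref instruction; a portable sketch of the same pattern, using the GCC builtin as a stand-in:

  #include <stdint.h>

  /* One-row-ahead prefetch, mirroring the top of each y-loop iteration. */
  static void prefetch_next_row(const uint8_t *src_next, uint8_t *dst_next,
                                int is_64_wide) {
    __builtin_prefetch(src_next, 0 /* read */);
    __builtin_prefetch(src_next + 32, 0);
    if (is_64_wide) __builtin_prefetch(src_next + 64, 0);
    __builtin_prefetch(dst_next, 1 /* write */);
  }

/* even 7.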
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [qload3] "=&r" (qload3), [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (((const int32_t *)filter_x)[1] == 0x800000) { + vp9_convolve_avg(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else if (((const int32_t *)filter_x)[0] == 0) { + vp9_convolve2_avg_horiz_dspr2(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else { + if (16 == x_step_q4) { + uint32_t pos = 38; + + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + vp9_prefetch_store(dst); + + switch (w) { + case 4: + convolve_avg_horiz_4_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h); + break; + case 8: + convolve_avg_horiz_8_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h); + break; + case 16: + convolve_avg_horiz_16_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h, 1); + break; + case 32: + convolve_avg_horiz_16_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h, 2); + break; + case 64: + vp9_prefetch_load(src + 64); + vp9_prefetch_store(dst + 32); + + convolve_avg_horiz_64_dspr2(src, src_stride, + dst, dst_stride, + filter_x, h); + break; + default: + vp9_convolve8_avg_horiz_c(src + 3, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_avg_horiz_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } + } +} +#endif diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c new file mode 100644 index 0000000..0ef9dd5 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c @@ -0,0 +1,1284 @@ +/* + * Copyright (c) 2013 The WebM project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_filter.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; +uint8_t *vp9_ff_cropTbl; + +void vp9_dsputil_static_init(void) { + int i; + + for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i; + + for (i = 0; i < CROP_WIDTH; i++) { + vp9_ff_cropTbl_a[i] = 0; + vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; + } + + vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH]; +} + +static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint8_t *dst_ptr; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
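pixel */

The *_transposed kernels in this file write their outputs down a column: dst advances by dst_stride per output pixel and by 1 per input row. That lets vp9_convolve8_dspr2 (at the end of this file) run both passes of the separable 2-D filter through the same horizontal code, writing a transposed intermediate on the first pass and transposing back on the second. The index mapping, filtering elided and helper name illustrative:

  #include <stdint.h>

  /* Output x of input row y lands at temp[x * temp_stride + y], so a second
   * horizontal pass over temp acts as a vertical pass over the original. */
  static void horiz_pass_transposed(const uint8_t *src, int src_stride,
                                    uint8_t *temp, int temp_stride,
                                    int w, int h) {
    int x, y;
    for (y = 0; y < h; ++y)
      for (x = 0; x < w; ++x)
        temp[x * temp_stride + y] = src[y * src_stride + x];
  }

This is the same mapping as copy_horiz_transposed below, which handles the identity-filter case.

/* odd 2.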
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tn1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4), + [dst_ptr] "+r" (dst_ptr) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride) + ); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4, n1; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp2], 0(%[src]) \n\t" + "ulw %[tp1], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp1] \n\t" + "preceu.ph.qbl %[p4], %[tp1] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tp3] \n\t" + "preceu.ph.qbl %[n1], %[tp3] \n\t" + "ulw %[tp2], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. 
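pixel */

The balign sequence in the 4-wide variant above synthesizes the odd-phase (src + 1) window from words already in registers instead of issuing more unaligned loads: balign concatenates two registers and re-extracts a word at a byte offset. A little-endian C model of the bp = 3 form used here (semantics hedged to this specific use):

  #include <stdint.h>

  /* Model of "balign rt, rs, 3": with rt = src[i+4..i+7] and rs = src[i..i+3]
   * (little-endian), the result is the shifted window src[i+1..i+4]. */
  static uint32_t balign3(uint32_t rt, uint32_t rs) {
    return (rt << 24) | (rs >> 8);
  }

/* even 3.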
pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tp2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "lbux %[tp3], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp3], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "ulw %[tp1], 1(%[src]) \n\t" + "ulw %[tp3], 5(%[src]) \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[tp2], %[p3](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "ulw %[tp2], 9(%[src]) \n\t" + + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[n1], %[tp2] \n\t" + "ulw %[Temp1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[Temp1] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. 
pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[n1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), [cm] "r" (cm), + [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) + ); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h, + int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "ulw %[qload2], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload2] \n\t" + "preceu.ph.qbl %[p5], %[qload2] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + + /* even 3. 
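pixel */

In the transposed 16-wide kernel, even results step dst by dst_pitch_2 (2 * dst_stride) while odd results go through odd_dst, which starts one dst_stride later, so output pixel x of the input row lands on row x of the transposed buffer. The addressing as a sketch:

  #include <stdint.h>

  /* Evens fill rows 0,2,...,14 and odds rows 1,3,...,15 of this column. */
  static void store_transposed_16(uint8_t *dst, int dst_stride,
                                  const uint8_t out[16]) {
    uint8_t *even = dst, *odd = dst + dst_stride;
    int i;
    for (i = 0; i < 8; ++i) {
      even[2 * i * dst_stride] = out[2 * i];
      odd[2 * i * dst_stride] = out[2 * i + 1];
    }
  }

/* even 3.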
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload2] \n\t" + "preceu.ph.qbl %[p5], %[qload2] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), [cm] "r" (cm), + [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) + ); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "ulw %[qload2], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload2] \n\t" + "preceu.ph.qbl %[p5], %[qload2] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 0(%[dst]) \n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) \n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload2] \n\t" + "preceu.ph.qbl %[p5], %[qload2] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "ulw %[qload1], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload1] \n\t" + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dst] "+r" (dst), [odd_dst] "+r" (odd_dst) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), [cm] "r" (cm), + [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2) + ); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
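*/

The dispatchers in this file pick a kernel by reading the taps as packed 32-bit words: ((const int32_t *)filter)[1] == 0x800000 means filter[2] == 0 and filter[3] == 128, the identity kernel, so a plain copy (or copy-with-average) suffices; ((const int32_t *)filter)[0] == 0 means the first two taps are zero, the signature of the 2-tap filters handled by vp9_convolve2_*. The same predicates on unpacked taps:

  #include <stdint.h>

  /* Equivalents of the packed-word tests (little-endian packing assumed). */
  static int is_copy_filter(const int16_t f[8]) {
    return f[2] == 0 && f[3] == 128;   /* word 1 == 0x00800000 */
  }
  static int is_2tap_filter(const int16_t f[8]) {
    return f[0] == 0 && f[1] == 0;     /* word 0 == 0 */
  }

/* Next row... 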
*/ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y, k; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + for (k = 0; k < 8; ++k) + sum += src[x + k] * filter[k]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} + +void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x * dst_stride] = src[x]; + } + + src += src_stride; + dst += 1; + } +} + +void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + if (intermediate_height < h) + intermediate_height = h; + + if (x_step_q4 != 16 || y_step_q4 != 16) + return vp9_convolve8_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + + if ((((const int32_t *)filter_x)[1] == 0x800000) + && (((const int32_t *)filter_y)[1] == 0x800000)) + return vp9_convolve_copy(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + + /* copy the src to dst */ + if (filter_x[3] == 0x80) { + copy_horiz_transposed(src - src_stride * 3, src_stride, + temp, intermediate_height, + w, intermediate_height); + } else if (((const int32_t *)filter_x)[0] == 0) { + vp9_convolve2_dspr2(src - src_stride * 3, src_stride, + temp, intermediate_height, + filter_x, + w, intermediate_height); + } else { + src -= (src_stride * 3 + 3); + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_horiz_4_transposed_dspr2(src, src_stride, + temp, intermediate_height, + filter_x, intermediate_height); + break; + case 8: + convolve_horiz_8_transposed_dspr2(src, src_stride, + temp, intermediate_height, + filter_x, intermediate_height); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(src, src_stride, + temp, intermediate_height, + filter_x, intermediate_height, + (w/16)); + break; + case 64: + vp9_prefetch_load(src + 32); + convolve_horiz_64_transposed_dspr2(src, src_stride, + temp, intermediate_height, + filter_x, intermediate_height); + break; + default: + convolve_horiz_transposed(src, src_stride, + temp, intermediate_height, + filter_x, w, intermediate_height); + break; + } + } + + /* copy the src to dst */ + if (filter_y[3] == 0x80) { + copy_horiz_transposed(temp + 3, intermediate_height, + dst, dst_stride, + h, w); + } else if (((const int32_t *)filter_y)[0] == 0) { + vp9_convolve2_dspr2(temp + 3, intermediate_height, + dst, dst_stride, + filter_y, + h, w); + } else { + switch (h) { + case 4: + convolve_horiz_4_transposed_dspr2(temp, intermediate_height, + dst, dst_stride, + filter_y, w); + break; + case 8: + convolve_horiz_8_transposed_dspr2(temp, intermediate_height, + dst, dst_stride, + filter_y, w); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(temp, intermediate_height, + dst, 
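Note that vp9_convolve8_dspr2 never runs a separate vertical kernel: both passes call the same horizontal, transpose-writing code, because filtering the rows of a transposed image is the same as filtering the columns of the original. A condensed model of the generic branch, reusing the file's own convolve_horiz_transposed and assuming unit steps (x_step_q4 == y_step_q4 == 16) and h <= 128 so the 64x135 scratch buffer suffices; convolve8_two_pass_model is a hypothetical name:

    #include <stddef.h>
    #include <stdint.h>

    /* Reuses the C reference defined above in this file. */
    void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter, int w, int h);

    static void convolve8_two_pass_model(const uint8_t *src, ptrdiff_t src_stride,
                                         uint8_t *dst, ptrdiff_t dst_stride,
                                         const int16_t *filter_x,
                                         const int16_t *filter_y, int w, int h) {
      uint8_t temp[64 * 135];       /* the real code aligns this to 32 bytes */
      const int ih = h + 7;         /* intermediate_height for 8 vertical taps */

      /* Pass 1: horizontal taps over ih rows starting 3 above and 3 left of
         dst's origin; the transposed write turns image columns into temp rows. */
      convolve_horiz_transposed(src - 3 * src_stride - 3, src_stride,
                                temp, ih, filter_x, w, ih);

      /* Pass 2: the same row kernel now walks the original columns; writing
         transposed a second time restores the normal orientation in dst. */
      convolve_horiz_transposed(temp, ih, dst, dst_stride, filter_y, h, w);
    }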
dst_stride, + filter_y, w, (h/16)); + break; + case 64: + convolve_horiz_64_transposed_dspr2(temp, intermediate_height, + dst, dst_stride, + filter_y, w); + break; + default: + convolve_horiz_transposed(temp, intermediate_height, + dst, dst_stride, + filter_y, h, w); + break; + } + } +} + +void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + vp9_prefetch_store(dst); + + switch (w) { + case 4: + { + uint32_t tp1; + + /* 1 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], (%[src]) \n\t" + "sw %[tp1], (%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + } + break; + case 8: + { + uint32_t tp1, tp2; + + /* 2 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + } + break; + case 16: + { + uint32_t tp1, tp2, tp3, tp4; + + /* 4 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + } + break; + case 32: + { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + /* 8 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), + [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + } + break; + case 64: + { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, 
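The word-wise cases of vp9_convolve_copy_dspr2 unroll what is, arithmetically, a plain row copy; the ulw/sw pairs exist to tolerate an unaligned src and to overlap the moves with the cache prefetches. The scalar equivalent (convolve_copy_model is a hypothetical name):

    #include <string.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Each ulw/sw pair in the asm moves 4 bytes of this memcpy. */
    static void convolve_copy_model(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    int w, int h) {
      int y;
      for (y = 0; y < h; ++y) {
        memcpy(dst, src, (size_t)w);
        src += src_stride;
        dst += dst_stride;
      }
    }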
tp8; + + vp9_prefetch_load(src + 64); + vp9_prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--; ) { + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_load(src + src_stride + 64); + vp9_prefetch_store(dst + dst_stride); + vp9_prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 36(%[src]) \n\t" + "ulw %[tp3], 40(%[src]) \n\t" + "ulw %[tp4], 44(%[src]) \n\t" + "ulw %[tp5], 48(%[src]) \n\t" + "ulw %[tp6], 52(%[src]) \n\t" + "ulw %[tp7], 56(%[src]) \n\t" + "ulw %[tp8], 60(%[src]) \n\t" + + "sw %[tp1], 32(%[dst]) \n\t" /* store */ + "sw %[tp2], 36(%[dst]) \n\t" /* store */ + "sw %[tp3], 40(%[dst]) \n\t" /* store */ + "sw %[tp4], 44(%[dst]) \n\t" /* store */ + "sw %[tp5], 48(%[dst]) \n\t" /* store */ + "sw %[tp6], 52(%[dst]) \n\t" /* store */ + "sw %[tp7], 56(%[dst]) \n\t" /* store */ + "sw %[tp8], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), + [tp5] "=&r" (tp5), [tp6] "=&r" (tp6), + [tp7] "=&r" (tp7), [tp8] "=&r" (tp8) + : [src] "r" (src), [dst] "r" (dst) + ); + + src += src_stride; + dst += dst_stride; + } + } + break; + default: + for (y = h; y--; ) { + for (x = 0; x < w; ++x) { + dst[x] = src[x]; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c new file mode 100644 index 0000000..0303896 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c @@ -0,0 +1,923 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[tn1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[n2], 3(%[dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_8_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_load(src + src_stride); + vp9_prefetch_load(src + src_stride + 32); + vp9_prefetch_store(dst + dst_stride); + + __asm__ __volatile__ ( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. 
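Every store in these horizontal kernels goes through the same three-instruction tail: extp pulls bits 38..7 out of the 64-bit accumulator (the earlier "wrdsp 38" set the extract position, and mtlo preloaded 64, so this is (sum + 64) >> 7 for FILTER_BITS == 7), and lbux clamps through vp9_ff_cropTbl, a pointer into the middle of a guard-banded table so out-of-range sums index valid, already-clamped entries instead of reading out of bounds. A scalar model (filter8_round_clamp is a hypothetical name):

    #include <stdint.h>

    static uint8_t filter8_round_clamp(const uint8_t *src, const int16_t *filter,
                                       const uint8_t *cm) {
      int k;
      int sum = 64;                   /* mtlo preloads the rounding constant */
      for (k = 0; k < 8; ++k)
        sum += src[k] * filter[k];    /* four dpa.w.ph dual MACs */
      return cm[sum >> 7];            /* extp ..., 31 then lbux table clamp */
    }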
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[n1], 7(%[dst]) \n\t" + + : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), + [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3), + [st0] "=&r" (st0), [st1] "=&r" (st1), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [n1] "=&r" (n1), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), + [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) + ); + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. 
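The filter12/filter34/filter56/filter78 loads here (and vector1b..vector4b earlier) reinterpret the eight int16 taps as four packed 32-bit words, the layout dpa.w.ph consumes: on the little-endian MIPS targets DSPr2 ships on, word k carries tap 2k in its low half and tap 2k+1 in its high half. That packing is also what the dispatch tests exploit: word 1 == 0x800000 means taps {2,3} == {0,128}, the identity filter, and word 0 == 0 flags the bilinear filters whose only nonzero taps sit at positions 3 and 4. A small check (hypothetical helper names, little-endian assumed):

    #include <assert.h>
    #include <stdint.h>

    /* Model of the packed view: word k = taps (2k, 2k+1), low half first. */
    static int32_t pack_taps(int16_t lo, int16_t hi) {
      return (int32_t)(((uint32_t)(uint16_t)hi << 16) | (uint16_t)lo);
    }

    static void dispatch_checks_model(void) {
      const int16_t identity[8] = { 0, 0, 0, 128, 0, 0, 0, 0 };
      /* word 1 of the identity filter is the 0x800000 the dispatch tests */
      assert(pack_taps(identity[2], identity[3]) == 0x800000);
      /* word 0 == 0 routes 2-tap (bilinear) filters to the convolve2 path */
      assert(pack_taps(identity[0], identity[1]) == 0);
    }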
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. 
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), + [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + vp9_prefetch_load(src_ptr + src_stride); + vp9_prefetch_load(src_ptr + src_stride + 32); + vp9_prefetch_load(src_ptr + src_stride + 64); + vp9_prefetch_store(dst_ptr + dst_stride); + vp9_prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__ ( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3), + [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), + [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), + [p5] "=&r" (p5), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) + : [filter12] "r" (filter12), [filter34] "r" (filter34), + [filter56] "r" (filter56), [filter78] "r" (filter78), + [vector_64] "r" (vector_64), + [cm] "r" (cm), [dst] "r" (dst), + [src] "r" (src) + ); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (((const int32_t *)filter_x)[1] == 0x800000) { + vp9_convolve_copy(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else if (((const int32_t *)filter_x)[0] == 0) { + vp9_convolve2_horiz_dspr2(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else { + if (16 == x_step_q4) { + uint32_t pos = 38; + + vp9_prefetch_load((const uint8_t *)filter_x); + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + /* prefetch data to cache memory */ + vp9_prefetch_load(src); + vp9_prefetch_load(src + 32); + vp9_prefetch_store(dst); + + switch (w) { + case 4: + convolve_horiz_4_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h); + break; + case 8: + convolve_horiz_8_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h); + break; + case 16: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h, 1); + break; + case 32: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h, 2); + break; + case 64: + vp9_prefetch_load(src + 64); + vp9_prefetch_store(dst + 32); + + convolve_horiz_64_dspr2(src, (int32_t)src_stride, + dst, (int32_t)dst_stride, + filter_x, (int32_t)h); + break; + default: + vp9_convolve8_horiz_c(src + 3, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_horiz_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } + } +} +#endif diff --git 
a/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c new file mode 100644 index 0000000..0930bb3 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void convolve_vert_4_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] 
\n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [p1] "=&r" (p1), [p2] "=&r" (p2), + [n1] "=&r" (n1), [n2] "=&r" (n2), + [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), [src_stride] "r" (src_stride), + [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert_64_dspr2(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = vp9_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + vp9_prefetch_store(dst + dst_stride); + vp9_prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__ ( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + 
"dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [p1] "=&r" (p1), [p2] "=&r" (p2), + [n1] "=&r" (n1), [n2] "=&r" (n2), + [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), + [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), + [store1] "=&r" (store1), [store2] "=&r" (store2), + [src_ptr] "+r" (src_ptr) + : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), + [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), + [vector4a] "r" (vector4a), [src_stride] "r" (src_stride), + [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) + ); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (((const int32_t *)filter_y)[1] == 0x800000) { + vp9_convolve_copy(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else if (((const int32_t *)filter_y)[0] == 0) { + vp9_convolve2_vert_dspr2(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } else { + if (16 == y_step_q4) { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + vp9_prefetch_store(dst); + + switch (w) { + case 4 : + case 8 : + case 16 : + case 32 : + convolve_vert_4_dspr2(src, src_stride, + dst, dst_stride, + filter_y, w, h); + break; + case 64 : + vp9_prefetch_store(dst + 32); + convolve_vert_64_dspr2(src, src_stride, + dst, dst_stride, + filter_y, h); + break; + default: + vp9_convolve8_vert_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + break; + } + } else { + vp9_convolve8_vert_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + } + } +} + +#endif diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c new file mode 100644 index 0000000..1b2f550 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -0,0 +1,1315 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_10, step1_11, step1_12, step1_13; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + + for (i = no_rows; i--; ) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), + [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 
31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), + [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) 
\n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), + [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) + : [const_2_power_13] "r" (const_2_power_13), + [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + 
[step2_15] "r" (step2_15), [step2_12] "r" (step2_12), + [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step2_12] \n\t" + "add %[load5], %[load5], %[step2_15] \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step2_13] \n\t" + "add %[load6], %[load6], %[step2_14] \n\t" + "sh %[load5], 0(%[output]) \n\t" + "sh %[load6], 32(%[output]) \n\t" + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "add %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + "add %[load6], %[load6], %[step2_11] \n\t" + "sh %[load5], 192(%[output]) \n\t" + "sh %[load6], 224(%[output]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "sub %[load5], %[load5], %[step2_11] \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step2_9] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "sh %[load5], 256(%[output]) \n\t" + "sh %[load6], 288(%[output]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_14] \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_15] \n\t" + "sh %[load5], 448(%[output]) \n\t" + "sh %[load6], 480(%[output]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6) + : [output] "r" (output), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), + [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), + [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), + [step2_14] "r" (step2_14), [step2_15] "r" (step2_15) + ); + + __asm__ __volatile__ ( + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sh %[load5], 64(%[output]) \n\t" + "sh %[load6], 96(%[output]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sh %[load5], 128(%[output]) \n\t" + "sh %[load6], 160(%[output]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sh %[load5], 320(%[output]) \n\t" + "sh %[load6], 352(%[output]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sh %[load5], 384(%[output]) \n\t" + "sh %[load6], 416(%[output]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6) + : [output] "r" (output), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) + ); + + input += 16; + output += 1; + } +} + +static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_8, step1_9, 
step1_10, step1_11; + int step1_12, step1_13, step1_14, step1_15; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 16; ++i) { + dest_pix = (dest + i); + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1), + [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo 
%[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15), + [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + __asm__ __volatile__ ( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, 
%[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [load7] "=&r" (load7), [load8] "=&r" (load8), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [result3] "=&r" (result3), [result4] "=&r" (result4), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), + [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11), + [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13) + : [const_2_power_13] "r" (const_2_power_13), + [step2_14] "r" (step2_14), [step2_13] "r" (step2_13), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_15] "r" (step2_15), [step2_12] "r" (step2_12), + [step2_8] "r" (step2_8), [step2_11] "r" (step2_11), + [cospi_16_64] "r" (cospi_16_64) + ); + + step1_8 = step2_8 + step2_11; + step1_9 = step2_9 + step2_10; + step1_14 = step2_13 + 
step2_14; + step1_15 = step2_12 + step2_15; + + __asm__ __volatile__ ( + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step1_15] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step1_14] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[load5], %[step1_9] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step1_8] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step1_8] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step1_9] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu 
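/* Each lbu/addi/sra/add/lbux/sb group in this store sequence is one clamped
 * reconstruction. cm points at vp9_ff_cropTbl, a lookup table arranged so
 * that cm[x] clamps x to 0..255 over the index range reachable here, making
 * lbux a branch-free clip_pixel. One group computes, schematically:
 *
 *   int residual = (step_sum + 32) >> 6;       // ROUND_POWER_OF_TWO(x, 6)
 *   dest_pix[0] = cm[dest_pix[0] + residual];  // clip_pixel(dest + residual)
 *   dest_pix += dest_stride;                   // walk down the column */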
%[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step1_14] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step1_15] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + + : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7), + [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), + [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) + ); + + input += 16; + } +} + +void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct16_1d_rows_dspr2(input, out, 16); + + // Then transform columns and add to dest + idct16_1d_cols_add_blk_dspr2(out, dest, 
dest_stride); +} + +static void iadst16_1d(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = output[8] + = output[9] = output[10] = output[11] = output[12] + = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + x15 * 
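/* iadst16_1d stays in plain C: every stage is a pair of multiplies plus a
 * rounded shift, with cospi_N_64 == round(2^14 * cos(N*pi/64)). Since
 * cos(31*pi/64) == sin(pi/64), each stage-1 pair is a Q14 plane rotation;
 * schematically, for (x0, x1) with t = pi/64:
 *
 *   s0 = x0 * cospi_1_64 + x1 * cospi_31_64;  // ~2^14 * (x0*cos t + x1*sin t)
 *   s1 = x0 * cospi_31_64 - x1 * cospi_1_64;  // ~2^14 * (x0*sin t - x1*cos t)
 *   x0 = dct_const_round_shift(s0 + s8);      // == (sum + 8192) >> 14 */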
cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int pitch, int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + int16_t temp_out[16]; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + idct16_1d_rows_dspr2(input, outptr, 16); + idct16_1d_cols_add_blk_dspr2(out, dest, pitch); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + idct16_1d_rows_dspr2(input, outptr, 16); + + outptr = out; + + for (i = 0; i < 16; ++i) { + iadst16_1d(outptr, temp_out); + + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + outptr += 16; + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + { + int16_t temp_in[16 * 16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + iadst16_1d(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) + for (j = 0; j < 16; ++j) + temp_in[j * 16 + i] = out[i * 16 + j]; + + idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch); + } + break; + case ADST_ADST: // ADST in both directions + { + int16_t temp_in[16]; + + for (i = 0; i < 16; ++i) { + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 16)); + + iadst16_1d(input, outptr); + input += 16; + outptr += 16; + } + + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + iadst16_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + dest[j * pitch + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * pitch + i]); + } + } + break; + default: + printf("vp9_iht16x16_256_add_dspr2 : Invalid tx_type\n"); + break; + } +} + +void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ 
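/* The dispatch above exploits separability: each tx_type is a row transform
 * followed by a column transform. DCT_DCT keeps both passes in DSPR2
 * assembly; the ADST variants run iadst16_1d one 16-sample vector at a time
 * in C and handle the transpose by index swapping, e.g. for ADST_DCT
 * (schematic):
 *
 *   idct16_1d_rows_dspr2(input, out, 16);  // row pass, output transposed
 *   iadst16_1d(out + 16 * i, temp_out);    // column i of the original block
 */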
( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + idct16_1d_rows_dspr2(input, outptr, 4); + + outptr += 4; + for (i = 0; i < 6; ++i) { + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 128(%[outptr]) \n\t" + "sw $zero, 160(%[outptr]) \n\t" + "sw $zero, 192(%[outptr]) \n\t" + "sw $zero, 224(%[outptr]) \n\t" + "sw $zero, 256(%[outptr]) \n\t" + "sw $zero, 288(%[outptr]) \n\t" + "sw $zero, 320(%[outptr]) \n\t" + "sw $zero, 352(%[outptr]) \n\t" + "sw $zero, 384(%[outptr]) \n\t" + "sw $zero, 416(%[outptr]) \n\t" + "sw $zero, 448(%[outptr]) \n\t" + "sw $zero, 480(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + outptr += 2; + } + + // Then transform columns + idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 16; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 16; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + 
[vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c new file mode 100644 index 0000000..5e92db3 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c @@ -0,0 +1,1073 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; + int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; + int16_t step1_27, step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; + int16_t step3_28, step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int i, temp21; + uint8_t *dest_pix, *dest_pix1; + const int const_2_power_13 = 8192; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 32; ++i) { + dest_pix = dest + i; + dest_pix1 = dest + i + 31 * dest_stride; + + __asm__ __volatile__ ( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 
\n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), + [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), + [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), + [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), + [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ 
__volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21), + [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), + [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] 
\n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), + [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), + [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), + [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + 
"extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3), + [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), + [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), + [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), + [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) + : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8), + [step2_9] "r" (step2_9), [step2_10] "r" (step2_10), + [step2_11] "r" (step2_11), [step2_12] "r" (step2_12), + [step2_13] "r" (step2_13), [step2_14] "r" (step2_14), + [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64) + ); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r" (step3_18) + : [const_2_power_13] "r" (const_2_power_13), + [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), + 
[cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r" (step3_19) + : [const_2_power_13] "r" (const_2_power_13), + [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r" (step3_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r" (step3_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub 
$ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + // stage 7 + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + 
step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) + : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20), + [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) + : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26), + [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) + : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25), + [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) + : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24), + [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_2], %[step2_29] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_3], %[step2_28] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 
32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_0] "r" (step1_0), [step1_1] "r" (step1_1), + [step1_2] "r" (step1_2), [step1_3] "r" (step1_3), + [step2_28] "r" (step2_28), [step2_29] "r" (step2_29), + [step2_30] "r" (step2_30), [step2_31] "r" (step2_31) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_6], %[step1_25] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_7], %[step1_24] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), 
[dest_stride] "r" (dest_stride), + [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), + [step1_6] "r" (step1_6), [step1_7] "r" (step1_7), + [step1_24] "r" (step1_24), [step1_25] "r" (step1_25), + [step1_26] "r" (step1_26), [step1_27] "r" (step1_27) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_10], %[step1_21] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_11], %[step1_20] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), + [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), + [step1_20] "r" (step1_20), [step1_21] "r" (step1_21), + [step1_22] "r" (step1_22), [step1_23] "r" (step1_23) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_10 - 
step1_21), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_14], %[step2_17] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_15], %[step2_16] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), + [step1_14] "r" (step1_14), [step1_15] "r" (step1_15), + [step2_16] "r" (step2_16), [step2_17] "r" (step2_17), + [step2_18] "r" (step2_18), [step2_19] "r" (step2_19) + ); + + step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); + + __asm__ __volatile__ ( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 
0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), + [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1) + : [cm] "r" (cm), [dest_stride] "r" (dest_stride), + [step3_12] "r" (step3_12), [step3_13] "r" (step3_13), + [step3_14] "r" (step3_14), [step3_15] "r" (step3_15) + ); + + input += 32; + } +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c new file mode 100644 index 0000000..d3aee73 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c @@ -0,0 +1,1013 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int16_t step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int16_t step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int temp21; + int i; + const int const_2_power_13 = 8192; + const int32_t *input_int; + + for (i = 32; i--; ) { + input_int = (const int32_t *)input; + + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | + input_int[12] | input_int[13] | input_int[14] | input_int[15])) { + input += 32; + + __asm__ __volatile__ ( + "sh $zero, 0(%[output]) \n\t" + "sh $zero, 64(%[output]) \n\t" 
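+ /* One "sh" per output row: the 64-byte step is 32 int16s, so offsets 0..1984 clear this whole column of the 32x32 output. */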
+ "sh $zero, 128(%[output]) \n\t" + "sh $zero, 192(%[output]) \n\t" + "sh $zero, 256(%[output]) \n\t" + "sh $zero, 320(%[output]) \n\t" + "sh $zero, 384(%[output]) \n\t" + "sh $zero, 448(%[output]) \n\t" + "sh $zero, 512(%[output]) \n\t" + "sh $zero, 576(%[output]) \n\t" + "sh $zero, 640(%[output]) \n\t" + "sh $zero, 704(%[output]) \n\t" + "sh $zero, 768(%[output]) \n\t" + "sh $zero, 832(%[output]) \n\t" + "sh $zero, 896(%[output]) \n\t" + "sh $zero, 960(%[output]) \n\t" + "sh $zero, 1024(%[output]) \n\t" + "sh $zero, 1088(%[output]) \n\t" + "sh $zero, 1152(%[output]) \n\t" + "sh $zero, 1216(%[output]) \n\t" + "sh $zero, 1280(%[output]) \n\t" + "sh $zero, 1344(%[output]) \n\t" + "sh $zero, 1408(%[output]) \n\t" + "sh $zero, 1472(%[output]) \n\t" + "sh $zero, 1536(%[output]) \n\t" + "sh $zero, 1600(%[output]) \n\t" + "sh $zero, 1664(%[output]) \n\t" + "sh $zero, 1728(%[output]) \n\t" + "sh $zero, 1792(%[output]) \n\t" + "sh $zero, 1856(%[output]) \n\t" + "sh $zero, 1920(%[output]) \n\t" + "sh $zero, 1984(%[output]) \n\t" + + : + : [output] "r" (output) + ); + + output += 1; + + continue; + } + + /* prefetch row */ + vp9_prefetch_load((const uint8_t *)(input + 32)); + vp9_prefetch_load((const uint8_t *)(input + 48)); + + __asm__ __volatile__ ( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17), + [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64), + [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + 
"mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19), + [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64), + [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_20] "=r" 
(step1_20), [step1_21] "=r" (step1_21), + [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64), + [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23), + [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64), + [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64), + [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], 
%[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9), + [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64), + [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11), + [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64), + [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64), + [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64) + ); + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, 
%[temp1], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9), + [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11), + [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13), + [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15) + : [const_2_power_13] "r" (const_2_power_13), + [step2_8] "r" (step2_8), [step2_9] "r" (step2_9), + [step2_10] "r" (step2_10), [step2_11] "r" (step2_11), + [step2_12] "r" (step2_12), [step2_13] "r" (step2_13), + [step2_14] "r" (step2_14), [step2_15] "r" (step2_15), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r" (step3_18) + : [const_2_power_13] "r" (const_2_power_13), + [step2_18] "r" (step2_18), [step2_29] "r" (step2_29), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r" (step3_19) + : [const_2_power_13] "r" (const_2_power_13), + [step2_19] "r" (step2_19), [step2_28] "r" (step2_28), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__ ( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r" (step3_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ 
__volatile__ ( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r" (step3_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_21] "r" (step2_21), [step2_26] "r" (step2_26), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + ); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__ ( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [result1] "=&r" (result1), [result2] "=&r" (result2), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1), + [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64) + + ); + + __asm__ __volatile__ ( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + 
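+ /* load3/load4 hold input[20] and input[12] (byte offsets 40 and 24), the second rotation pair of the embedded 8-point IDCT, rotated by cospi_12/cospi_20. */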
"madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r" (load1), [load2] "=&r" (load2), + [load3] "=&r" (load3), [load4] "=&r" (load4), + [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), + [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), + [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5), + [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7) + : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input), + [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_16_64] "r" (cospi_16_64) + ); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20) + : [const_2_power_13] "r" (const_2_power_13), + [step2_20] "r" (step2_20), [step2_27] "r" (step2_27), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21) + : [const_2_power_13] "r" (const_2_power_13), + [step2_26] "r" (step2_26), [step2_21] "r" (step2_21), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22) + : 
[const_2_power_13] "r" (const_2_power_13), + [step2_25] "r" (step2_25), [step2_22] "r" (step2_22), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__ ( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23) + : [const_2_power_13] "r" (const_2_power_13), + [step2_24] "r" (step2_24), [step2_23] "r" (step2_23), + [cospi_16_64] "r" (cospi_16_64) + ); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + // final stage + output[0 * 32] = step1_0 + step2_31; + output[1 * 32] = step1_1 + step2_30; + output[2 * 32] = step1_2 + step2_29; + output[3 * 32] = step1_3 + step2_28; + output[4 * 32] = step1_4 + step1_27; + output[5 * 32] = step1_5 + step1_26; + output[6 * 32] = step1_6 + step1_25; + output[7 * 32] = step1_7 + step1_24; + output[8 * 32] = step1_8 + step1_23; + output[9 * 32] = step1_9 + step1_22; + output[10 * 32] = step1_10 + step1_21; + output[11 * 32] = step1_11 + step1_20; + output[12 * 32] = step1_12 + step2_19; + output[13 * 32] = step1_13 + step2_18; + output[14 * 32] = step1_14 + step2_17; + output[15 * 32] = step1_15 + step2_16; + output[16 * 32] = step1_15 - step2_16; + output[17 * 32] = step1_14 - step2_17; + output[18 * 32] = step1_13 - step2_18; + output[19 * 32] = step1_12 - step2_19; + output[20 * 32] = step1_11 - step1_20; + output[21 * 32] = step1_10 - step1_21; + output[22 * 32] = step1_9 - step1_22; + output[23 * 32] = step1_8 - step1_23; + output[24 * 32] = step1_7 - step1_24; + output[25 * 32] = step1_6 - step1_25; + output[26 * 32] = step1_5 - step1_26; + output[27 * 32] = step1_4 - step1_27; + output[28 * 32] = step1_3 - step2_28; + output[29 * 32] = step1_2 - step2_29; + output[30 * 32] = step1_1 - step2_30; + output[31 * 32] = step1_0 - step2_31; + + input += 32; + output += 1; + } +} + +void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + idct32_1d_rows_dspr2(input, outptr); + + // Columns + vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + int r, out; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 32; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) 
\n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [stride] "r" (stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 32; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), + [dest] "+&r" (dest) + : [stride] "r" (stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c new file mode 100644 index 0000000..5b7aa5e --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + + for (i = 4; i--; ) { + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + + "add %[Temp1], %[step_1], %[step_2] \n\t" + "sh %[Temp1], 8(%[output]) \n\t" + + "sub %[Temp2], %[step_1], %[step_2] \n\t" + "sh %[Temp2], 16(%[output]) \n\t" + + "sub %[Temp3], %[step_0], %[step_3] \n\t" + "sh %[Temp3], 24(%[output]) \n\t" + + : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), + [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), + [output] "+r" (output) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input) + ); + + input += 4; + output += 1; + } +} + +static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 4; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = 
dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [step_0] "=&r" (step_0), [step_1] "=&r" (step_1), + [step_2] "=&r" (step_2), [step_3] "=&r" (step_3), + [dest_pix] "+r" (dest_pix) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) + ); + + input += 4; + } +} + +void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // Rows + vp9_idct4_1d_rows_dspr2(input, outptr); + + // Columns + vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); 
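+ /* The 2-D transform is separable: the row pass fills out[] transposed, and the column pass applies ROUND_POWER_OF_TWO(x, 4), clips through cm[], and adds the result into dest. */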
+} + +void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + int a1, absa1; + int r; + int32_t out; + int t2, vector_a1, vector_a; + uint32_t pos = 45; + int16_t input_dc = input[0]; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); + __asm__ __volatile__ ( + "addi %[out], %[out], 8 \n\t" + "sra %[a1], %[out], 4 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte operations; + * input and output memory are four-byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 4; r--;) { + __asm__ __volatile__ ( + "lw %[t2], 0(%[dest]) \n\t" + "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte operations; + * input and output memory are four-byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 4; r--;) { + __asm__ __volatile__ ( + "lw %[t2], 0(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} + +static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b.
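  // For reference: dct_const_round_shift(x) is
  // ROUND_POWER_OF_TWO(x, DCT_CONST_BITS), i.e. (x + (1 << 13)) >> 14 with
  // DCT_CONST_BITS == 14, dropping the 14 fractional bits carried by the
  // sinpi_*_9 fixed-point constants used above.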
+ output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} + +void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride, int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + int16_t temp_in[4 * 4], temp_out[4]; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + vp9_idct4_1d_rows_dspr2(input, outptr); + vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + vp9_idct4_1d_rows_dspr2(input, outptr); + + outptr = out; + + for (i = 0; i < 4; ++i) { + iadst4_1d_dspr2(outptr, temp_out); + + for (j = 0; j < 4; ++j) + dest[j * dest_stride + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * dest_stride + i]); + + outptr += 4; + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + for (i = 0; i < 4; ++i) { + iadst4_1d_dspr2(input, outptr); + input += 4; + outptr += 4; + } + + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) { + temp_in[i * 4 + j] = out[j * 4 + i]; + } + } + vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + break; + case ADST_ADST: // ADST in both directions + for (i = 0; i < 4; ++i) { + iadst4_1d_dspr2(input, outptr); + input += 4; + outptr += 4; + } + + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + iadst4_1d_dspr2(temp_in, temp_out); + + for (j = 0; j < 4; ++j) + dest[j * dest_stride + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * dest_stride + i]); + } + break; + default: + printf("vp9_iht4x4_16_add_dspr2 : Invalid tx_type\n"); + break; + } +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c new file mode 100644 index 0000000..93a0840 --- /dev/null +++ b/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -0,0 +1,745 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <assert.h> +#include <stdio.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" +#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" + +#if HAVE_DSPR2 +static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + const int const_2_power_13 = 8192; + int Temp0, Temp1, Temp2, Temp3, Temp4; + int i; + + for (i = no_rows; i--; ) { + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[Temp4], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[Temp4], %[Temp1] \n\t" + "sub %[step1_3], %[Temp4], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], 
%[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + "add %[Temp1], %[step1_1], %[step1_6] \n\t" + "sh %[Temp1], 16(%[output]) \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "sh %[Temp0], 32(%[output]) \n\t" + "add %[Temp1], %[step1_3], %[step1_4] \n\t" + "sh %[Temp1], 48(%[output]) \n\t" + + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "sh %[Temp0], 64(%[output]) \n\t" + "sub %[Temp1], %[step1_2], %[step1_5] \n\t" + "sh %[Temp1], 80(%[output]) \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "sh %[Temp0], 96(%[output]) \n\t" + "sub %[Temp1], %[step1_0], %[step1_7] \n\t" + "sh %[Temp1], 112(%[output]) \n\t" + + : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), + [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), + [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), + [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), + [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [Temp4] "=&r" (Temp4) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_24_64] "r" (cospi_24_64), + [output] "r" (output), [input] "r" (input) + ); + + input += 8; + output += 1; + } +} + +static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int Temp0, Temp1, Temp2, Temp3; + int i; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = vp9_ff_cropTbl; + + /* prefetch vp9_ff_cropTbl */ + vp9_prefetch_load(vp9_ff_cropTbl); + vp9_prefetch_load(vp9_ff_cropTbl + 32); + vp9_prefetch_load(vp9_ff_cropTbl + 64); + vp9_prefetch_load(vp9_ff_cropTbl + 96); + vp9_prefetch_load(vp9_ff_cropTbl + 128); + vp9_prefetch_load(vp9_ff_cropTbl + 160); + vp9_prefetch_load(vp9_ff_cropTbl + 192); + vp9_prefetch_load(vp9_ff_cropTbl + 224); + + for (i = 0; i < 8; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__ ( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add 
%[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[step1_6], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[step1_6], %[Temp1] \n\t" + "sub %[step1_3], %[step1_6], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp 
%[step1_6], $ac1, 31 \n\t" + + /* add block */ + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_0], %[step1_7] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1), + [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3), + [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5), + [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7), + [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1), + [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), + [dest_pix] "+r" (dest_pix) + : [const_2_power_13] "r" (const_2_power_13), + [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64), + [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64), + [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64), + [cospi_24_64] "r" (cospi_24_64), + [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride) + ); + + input += 8; + } +} + +void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos 
= 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct8_1d_rows_dspr2(input, outptr, 8); + + // Then transform columns and add to dest + idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3, x4, x5, x6, x7; + + x0 = input[7]; + x1 = input[0]; + x2 = input[5]; + x3 = input[2]; + x4 = input[3]; + x5 = input[4]; + x6 = input[1]; + x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); + x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); + x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); + x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); + + output[0] = x0; + output[1] = -x4; + output[2] = x6; + output[3] = -x2; + output[4] = x3; + output[5] = -x7; + output[6] = x5; + output[7] = -x1; +} + +void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride, int tx_type) { + int i, j; + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + int16_t temp_in[8 * 8], temp_out[8]; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + switch (tx_type) { + case DCT_DCT: // DCT in both horizontal and vertical + idct8_1d_rows_dspr2(input, outptr, 8); + idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); + break; + case ADST_DCT: // ADST in vertical, DCT in horizontal + idct8_1d_rows_dspr2(input, outptr, 8); + + for (i = 0; i < 8; ++i) { + iadst8_1d_dspr2(&out[i * 8], temp_out); + + for (j = 0; j < 8; ++j) + dest[j * dest_stride + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); + } + break; + case DCT_ADST: // DCT in vertical, ADST in horizontal + for (i = 0; i < 8; ++i) { + iadst8_1d_dspr2(input, 
outptr); + input += 8; + outptr += 8; + } + + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) { + temp_in[i * 8 + j] = out[j * 8 + i]; + } + } + idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + break; + case ADST_ADST: // ADST in both directions + for (i = 0; i < 8; ++i) { + iadst8_1d_dspr2(input, outptr); + input += 8; + outptr += 8; + } + + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + + iadst8_1d_dspr2(temp_in, temp_out); + + for (j = 0; j < 8; ++j) + dest[j * dest_stride + i] = + clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); + } + break; + default: + printf("vp9_iht8x8_64_add_dspr2 : Invalid tx_type\n"); + break; + } +} + +void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + : + : [pos] "r" (pos) + ); + + // First transform rows + idct8_1d_rows_dspr2(input, outptr, 4); + + outptr += 4; + + __asm__ __volatile__ ( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 48(%[outptr]) \n\t" + "sw $zero, 52(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 68(%[outptr]) \n\t" + "sw $zero, 80(%[outptr]) \n\t" + "sw $zero, 84(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 100(%[outptr]) \n\t" + "sw $zero, 112(%[outptr]) \n\t" + "sw $zero, 116(%[outptr]) \n\t" + + : + : [outptr] "r" (outptr) + ); + + + // Then transform columns and add to dest + idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t t1, t2, vector_a1, vector_1, vector_2; + + /* bit position for extract from acc */ + __asm__ __volatile__ ( + "wrdsp %[pos], 1 \n\t" + + : + : [pos] "r" (pos) + ); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__ ( + "addi %[out], %[out], 16 \n\t" + "sra %[a1], %[out], 5 \n\t" + + : [out] "+r" (out), [a1] "=r" (a1) + : + ); + + if (a1 < 0) { + /* use quad-byte operations; + * input and output memory are four-byte aligned */ + __asm__ __volatile__ ( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 8; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [dest] "+&r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } else { + /* use quad-byte operations; + * input and output memory are four-byte aligned */ + __asm__ __volatile__ ( + "replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r" (vector_a1) + : [a1] "r" (a1) + ); + + for (r = 8; r--;) { + __asm__ __volatile__ ( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] 
\n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r" (t1), [t2] "=&r" (t2), + [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), + [dest] "+r" (dest) + : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) + ); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c index 864e27e..0d65651 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -41,30 +41,25 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) { vpx_free(cm->mip); vpx_free(cm->prev_mip); - vpx_free(cm->above_seg_context); vpx_free(cm->last_frame_seg_map); vpx_free(cm->mi_grid_base); vpx_free(cm->prev_mi_grid_base); - vpx_free(cm->above_context[0]); - for (i = 0; i < MAX_MB_PLANE; i++) - cm->above_context[i] = 0; cm->mip = NULL; cm->prev_mip = NULL; - cm->above_seg_context = NULL; cm->last_frame_seg_map = NULL; cm->mi_grid_base = NULL; cm->prev_mi_grid_base = NULL; } static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { - cm->mb_cols = (aligned_width + 8) >> 4; - cm->mb_rows = (aligned_height + 8) >> 4; - cm->MBs = cm->mb_rows * cm->mb_cols; - cm->mi_cols = aligned_width >> MI_SIZE_LOG2; cm->mi_rows = aligned_height >> MI_SIZE_LOG2; cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE; + + cm->mb_cols = (cm->mi_cols + 1) >> 1; + cm->mb_rows = (cm->mi_rows + 1) >> 1; + cm->MBs = cm->mb_rows * cm->mb_cols; } static void setup_mi(VP9_COMMON *cm) { @@ -85,7 +80,7 @@ static void setup_mi(VP9_COMMON *cm) { } int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { - int i, mi_cols; + int i; const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); @@ -140,21 +135,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { setup_mi(cm); - // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling - // information is exposed at this level - mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - - // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm - // block where mi unit size is 8x8. - cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * - (2 * mi_cols), 1); - if (!cm->above_context[0]) - goto fail; - - cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1); - if (!cm->above_seg_context) - goto fail; - // Create the segmentation map structure and set to 0. 
cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); if (!cm->last_frame_seg_map) @@ -170,13 +150,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { void vp9_create_common(VP9_COMMON *cm) { vp9_machine_specific_config(cm); - vp9_init_mbmode_probs(cm); - cm->tx_mode = ONLY_4X4; cm->comp_pred_mode = HYBRID_PREDICTION; - - // Initialize reference frame sign bias structure to defaults - vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias)); } void vp9_remove_common(VP9_COMMON *cm) { @@ -184,24 +159,19 @@ void vp9_remove_common(VP9_COMMON *cm) { } void vp9_initialize_common() { + vp9_init_neighbors(); vp9_coef_tree_initialize(); vp9_entropy_mode_init(); vp9_entropy_mv_init(); } void vp9_update_frame_size(VP9_COMMON *cm) { - int i, mi_cols; const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2); set_mb_mi(cm, aligned_width, aligned_height); setup_mi(cm); - mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - for (i = 1; i < MAX_MB_PLANE; i++) - cm->above_context[i] = - cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols; - // Initialize the previous frame segment map to 0. if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index c8d677f..d0d4852 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -20,6 +20,7 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_common_data.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_seg_common.h" @@ -52,18 +53,10 @@ static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, typedef enum { KEY_FRAME = 0, INTER_FRAME = 1, - NUM_FRAME_TYPES, + FRAME_TYPES, } FRAME_TYPE; typedef enum { - EIGHTTAP = 0, - EIGHTTAP_SMOOTH = 1, - EIGHTTAP_SHARP = 2, - BILINEAR = 3, - SWITCHABLE = 4 /* should be the last one */ -} INTERPOLATIONFILTERTYPE; - -typedef enum { DC_PRED, // Average of above and left pixels V_PRED, // Vertical H_PRED, // Horizontal @@ -81,10 +74,6 @@ typedef enum { MB_MODE_COUNT } MB_PREDICTION_MODE; -static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) { - return mode <= TM_PRED; -} - static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } @@ -101,10 +90,10 @@ static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) { modes for the Y blocks to the left and above us; for interframes, there is a single probability table. 
*/ -union b_mode_info { +typedef struct { MB_PREDICTION_MODE as_mode; int_mv as_mv[2]; // first, second inter predictor motion vectors -}; +} b_mode_info; typedef enum { NONE = -1, @@ -137,7 +126,7 @@ typedef struct { TX_SIZE tx_size; int_mv mv[2]; // for each reference frame used int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; - int_mv best_mv, best_second_mv; + int_mv best_mv[2]; uint8_t mode_context[MAX_REF_FRAMES]; @@ -147,14 +136,14 @@ typedef struct { // Flags used for prediction status of various bit-stream signals unsigned char seg_id_predicted; - INTERPOLATIONFILTERTYPE interp_filter; + INTERPOLATION_TYPE interp_filter; BLOCK_SIZE sb_type; } MB_MODE_INFO; typedef struct { MB_MODE_INFO mbmi; - union b_mode_info bmi[4]; + b_mode_info bmi[4]; } MODE_INFO; static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { @@ -203,22 +192,15 @@ typedef struct macroblockd { struct scale_factors scale_factor[2]; MODE_INFO *last_mi; - MODE_INFO *this_mi; int mode_info_stride; - MODE_INFO *mic_stream_ptr; - // A NULL indicates that the 8x8 is not part of the image MODE_INFO **mi_8x8; MODE_INFO **prev_mi_8x8; + MODE_INFO *mi_stream; int up_available; int left_available; - int right_available; - - // partition contexts - PARTITION_CONTEXT *above_seg_context; - PARTITION_CONTEXT *left_seg_context; /* Distance of MB away from frame edges */ int mb_to_left_edge; @@ -228,14 +210,10 @@ typedef struct macroblockd { int lossless; /* Inverse transform function pointers. */ - void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride); - void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride); - void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob); + void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); struct subpix_fn_table subpix; - int allow_high_precision_mv; - int corrupted; unsigned char sb_index; // index of 32x32 block inside the 64x64 block @@ -245,71 +223,15 @@ typedef struct macroblockd { int q_index; -} MACROBLOCKD; + /* Y,U,V,(A) */ + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; -static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { - switch (subsize) { - case BLOCK_64X64: - case BLOCK_64X32: - case BLOCK_32X64: - case BLOCK_32X32: - return &xd->sb_index; - case BLOCK_32X16: - case BLOCK_16X32: - case BLOCK_16X16: - return &xd->mb_index; - case BLOCK_16X8: - case BLOCK_8X16: - case BLOCK_8X8: - return &xd->b_index; - case BLOCK_8X4: - case BLOCK_4X8: - case BLOCK_4X4: - return &xd->ab_index; - default: - assert(0); - return NULL; - } -} - -static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type, - BLOCK_SIZE sb_size) { - const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; - const int bwl = b_width_log2(sb_type); - const int bhl = b_height_log2(sb_type); - const int boffset = b_width_log2(BLOCK_64X64) - bsl; - const char pcval0 = ~(0xe << boffset); - const char pcval1 = ~(0xf << boffset); - const char pcvalue[2] = {pcval0, pcval1}; - - assert(MAX(bwl, bhl) <= bsl); - - // update the partition context at the end notes. set partition bits - // of block sizes larger than the current one to be one, and partition - // bits of smaller block sizes to be zero. 
- vpx_memset(xd->above_seg_context, pcvalue[bwl == bsl], bs); - vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs); -} - -static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) { - int bsl = mi_width_log2(sb_type), bs = 1 << bsl; - int above = 0, left = 0, i; - int boffset = mi_width_log2(BLOCK_64X64) - bsl; - - assert(mi_width_log2(sb_type) == mi_height_log2(sb_type)); - assert(bsl >= 0); - assert(boffset >= 0); - - for (i = 0; i < bs; i++) - above |= (xd->above_seg_context[i] & (1 << boffset)); - for (i = 0; i < bs; i++) - left |= (xd->left_seg_context[i] & (1 << boffset)); + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT left_seg_context[8]; +} MACROBLOCKD; - above = (above > 0); - left = (left > 0); - return (left * 2 + above) + bsl * PARTITION_PLOFFSET; -} static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) { const BLOCK_SIZE subsize = subsize_lookup[partition][bsize]; @@ -321,7 +243,7 @@ extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT]; static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, const MACROBLOCKD *xd, int ib) { - const MODE_INFO *const mi = xd->this_mi; + const MODE_INFO *const mi = xd->mi_8x8[0]; const MB_MODE_INFO *const mbmi = &mi->mbmi; if (plane_type != PLANE_TYPE_Y_WITH_DC || @@ -336,13 +258,13 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type, const MACROBLOCKD *xd) { return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT; + mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT; } static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type, const MACROBLOCKD *xd) { return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT; + mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT; } static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) { @@ -391,7 +313,7 @@ static INLINE void foreach_transformed_block_in_plane( const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, foreach_transformed_block_visitor visit, void *arg) { const struct macroblockd_plane *const pd = &xd->plane[plane]; - const MB_MODE_INFO* mbmi = &xd->this_mi->mbmi; + const MB_MODE_INFO* mbmi = &xd->mi_8x8[0]->mbmi; // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 // transform size varies per plane, look it up in a common way. 
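Throughout this header the removed xd->this_mi alias is replaced by xd->mi_8x8[0], the first mode-info entry covering the current block. The hunks below show only the changed lines of the get_tx_type_* helpers, so the following is a sketch (hypothetical name; body reconstructed from the helpers and tables declared above, not quoted from the diff) of the selection logic get_tx_type_4x4() implements after this change:

static INLINE TX_TYPE sketch_get_tx_type_4x4(PLANE_TYPE plane_type,
                                             const MACROBLOCKD *xd, int ib) {
  const MODE_INFO *const mi = xd->mi_8x8[0];
  const MB_MODE_INFO *const mbmi = &mi->mbmi;

  // Chroma planes, lossless coding and inter blocks keep plain DCT_DCT.
  if (plane_type != PLANE_TYPE_Y_WITH_DC || xd->lossless ||
      is_inter_block(mbmi))
    return DCT_DCT;

  // Intra luma: the prediction mode selects the row/column transform pair;
  // sub-8x8 blocks read their per-subblock mode from bmi[ib].
  return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ? mi->bmi[ib].as_mode
                                                 : mbmi->mode];
}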
@@ -495,7 +417,7 @@ static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, *y = (raster_mb >> tx_cols_log2) << tx_size; } -static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize, +static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, int plane, int block, TX_SIZE tx_size) { struct macroblockd_plane *const pd = &xd->plane[plane]; uint8_t *const buf = pd->dst.buf; @@ -520,19 +442,22 @@ static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize, } if (xd->mb_to_bottom_edge < 0) { - const int bh = 4 << b_height_log2(plane_bsize); - const int umv_border_start = bh + (xd->mb_to_bottom_edge >> - (3 + pd->subsampling_y)); - int i; - const uint8_t c = buf[(umv_border_start - 1) * stride + x]; - uint8_t *d = &buf[umv_border_start * stride + x]; - - if (y + bh > umv_border_start) - for (i = 0; i < bh; ++i, d += stride) - *d = c; + if (xd->left_available || x >= 0) { + const int bh = 4 << b_height_log2(plane_bsize); + const int umv_border_start = + bh + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)); + + if (y + bh > umv_border_start) { + const uint8_t c = buf[(umv_border_start - 1) * stride + x]; + uint8_t *d = &buf[umv_border_start * stride + x]; + int i; + for (i = 0; i < bh; ++i, d += stride) + *d = c; + } + } } } -static void set_contexts_on_border(MACROBLOCKD *xd, +static void set_contexts_on_border(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, int tx_size_in_blocks, int has_eob, @@ -570,7 +495,7 @@ static void set_contexts_on_border(MACROBLOCKD *xd, L[pt] = 0; } -static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd, +static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff) { ENTROPY_CONTEXT *const A = pd->above_context + aoff; @@ -586,7 +511,7 @@ static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd, } } -static int get_tx_eob(struct segmentation *seg, int segment_id, +static int get_tx_eob(const struct segmentation *seg, int segment_id, TX_SIZE tx_size) { const int eob_max = 16 << (tx_size << 1); return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h index 1796906..36d1cdf 100644 --- a/libvpx/vp9/common/vp9_common.h +++ b/libvpx/vp9/common/vp9_common.h @@ -40,8 +40,8 @@ vpx_memcpy(dest, src, n * sizeof(*src)); \ } -#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest)); -#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest)); +#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest)) +#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest)) static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255u : (val < 0) ? 
0u : val; @@ -84,9 +84,11 @@ static int get_unsigned_bits(unsigned int num_values) { } while (0) #endif -#define SYNC_CODE_0 0x49 -#define SYNC_CODE_1 0x83 -#define SYNC_CODE_2 0x42 +#define VP9_SYNC_CODE_0 0x49 +#define VP9_SYNC_CODE_1 0x83 +#define VP9_SYNC_CODE_2 0x42 + +#define VP9_FRAME_MARKER 0x2 #endif // VP9_COMMON_VP9_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c index dc41efd..f858900 100644 --- a/libvpx/vp9/common/vp9_common_data.c +++ b/libvpx/vp9/common/vp9_common_data.c @@ -115,6 +115,16 @@ const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = { TX_16X16, TX_16X16, TX_16X16, TX_32X32 }; +const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { + TX_4X4, // ONLY_4X4 + TX_8X8, // ALLOW_8X8 + TX_16X16, // ALLOW_16X16 + TX_32X32, // ALLOW_32X32 + TX_32X32, // TX_MODE_SELECT +}; + + + const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 @@ -133,3 +143,4 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { {{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}}, }; + diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h index 3822bfc..c1f6405 100644 --- a/libvpx/vp9/common/vp9_common_data.h +++ b/libvpx/vp9/common/vp9_common_data.h @@ -27,6 +27,7 @@ extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES]; extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES]; extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES]; +extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES]; extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2]; -#endif // VP9_COMMON_VP9_COMMON_DATA_H +#endif // VP9_COMMON_VP9_COMMON_DATA_H diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c index 94231a1..a2d864c 100644 --- a/libvpx/vp9/common/vp9_convolve.c +++ b/libvpx/vp9/common/vp9_convolve.c @@ -7,13 +7,13 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "vp9/common/vp9_convolve.h" #include <assert.h> #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_convolve.h" #include "vp9/common/vp9_filter.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -35,7 +35,7 @@ static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, for (y = 0; y < h; ++y) { /* Initial phase offset */ - int x_q4 = (filter_x0 - filter_x_base) / taps; + int x_q4 = (int)(filter_x0 - filter_x_base) / taps; for (x = 0; x < w; ++x) { /* Per-pixel src offset */ @@ -76,7 +76,7 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, for (y = 0; y < h; ++y) { /* Initial phase offset */ - int x_q4 = (filter_x0 - filter_x_base) / taps; + int x_q4 = (int)(filter_x0 - filter_x_base) / taps; for (x = 0; x < w; ++x) { /* Per-pixel src offset */ @@ -118,7 +118,7 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { /* Initial phase offset */ - int y_q4 = (filter_y0 - filter_y_base) / taps; + int y_q4 = (int)(filter_y0 - filter_y_base) / taps; for (y = 0; y < h; ++y) { /* Per-pixel src offset */ @@ -160,7 +160,7 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { /* Initial phase offset */ - int y_q4 = (filter_y0 - filter_y_base) / taps; + int y_q4 = (int)(filter_y0 - filter_y_base) / taps; for (y = 0; y < h; ++y) { /* Per-pixel src offset */ @@ -282,7 +282,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, int r; for (r = h; r > 0; --r) { - memcpy(dst, src, w); + vpx_memcpy(dst, src, w); src += src_stride; dst += dst_stride; } diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vp9/common/vp9_convolve.h index 13220e9..29d4990 100644 --- a/libvpx/vp9/common/vp9_convolve.h +++ b/libvpx/vp9/common/vp9_convolve.h @@ -7,23 +7,16 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_CONVOLVE_H_ -#define VP9_COMMON_CONVOLVE_H_ +#ifndef VP9_COMMON_VP9_CONVOLVE_H_ +#define VP9_COMMON_VP9_CONVOLVE_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#define FILTER_BITS 7 - typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -struct subpix_fn_table { - const int16_t (*filter_x)[8]; - const int16_t (*filter_y)[8]; -}; - -#endif // VP9_COMMON_CONVOLVE_H_ +#endif // VP9_COMMON_VP9_CONVOLVE_H_ diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c index 79f769e..355ac1a 100644 --- a/libvpx/vp9/common/vp9_debugmodes.c +++ b/libvpx/vp9/common/vp9_debugmodes.c @@ -63,9 +63,9 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); - log_frame_info(cm, "Vectors ",mvs); + log_frame_info(cm, "Vectors ", mvs); for (mi_row = 0; mi_row < rows; mi_row++) { - fprintf(mvs,"V "); + fprintf(mvs, "V "); for (mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row, mi_8x8[mi_index]->mbmi.mv[0].as_mv.col); diff --git a/libvpx/vp9/common/vp9_default_coef_probs.h b/libvpx/vp9/common/vp9_default_coef_probs.h index 185fced..3b512be 100644 --- a/libvpx/vp9/common/vp9_default_coef_probs.h +++ b/libvpx/vp9/common/vp9_default_coef_probs.h @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP9_COMMON_DEFAULT_COEF_PROBS_H_ +#define VP9_COMMON_DEFAULT_COEF_PROBS_H_ /*Generated file, included by vp9_entropy.c*/ static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { @@ -694,3 +696,4 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { } }; +#endif // VP9_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index 32d9e0c..d3a867c 100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -52,156 +52,11 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = { - 0, 4, 1, 5, - 8, 2, 12, 9, - 3, 6, 13, 10, - 7, 14, 11, 15, -}; - -DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = { - 0, 4, 8, 1, - 12, 5, 9, 2, - 13, 6, 10, 3, - 7, 14, 11, 15, -}; - -DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = { - 0, 1, 4, 2, - 5, 3, 6, 8, - 9, 7, 12, 10, - 13, 11, 14, 15, -}; - -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = { - 0, 8, 1, 16, 9, 2, 17, 24, - 10, 3, 18, 25, 32, 11, 4, 26, - 33, 19, 40, 12, 34, 27, 5, 41, - 20, 48, 13, 35, 42, 28, 21, 6, - 49, 56, 36, 43, 29, 7, 14, 50, - 57, 44, 22, 37, 15, 51, 58, 30, - 45, 23, 52, 59, 38, 31, 60, 53, - 46, 39, 61, 54, 47, 62, 55, 63, -}; -DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = { - 0, 8, 16, 1, 24, 9, 32, 17, - 2, 40, 25, 10, 33, 18, 48, 3, - 26, 41, 11, 56, 19, 34, 4, 49, - 27, 42, 12, 35, 20, 57, 50, 28, - 5, 43, 13, 36, 58, 51, 21, 44, - 6, 29, 59, 37, 14, 52, 22, 7, - 45, 60, 30, 15, 38, 53, 23, 46, - 31, 61, 39, 54, 47, 62, 55, 63, -}; - -DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = { - 0, 1, 2, 8, 9, 3, 16, 10, - 4, 17, 11, 24, 5, 18, 25, 12, 
- 19, 26, 32, 6, 13, 20, 33, 27, - 7, 34, 40, 21, 28, 41, 14, 35, - 48, 42, 29, 36, 49, 22, 43, 15, - 56, 37, 50, 44, 30, 57, 23, 51, - 58, 45, 38, 52, 31, 59, 53, 46, - 60, 39, 61, 47, 54, 55, 62, 63, -}; - -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = { - 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, - 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, - 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, - 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146, - 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25, - 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119, - 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194, - 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59, - 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13, - 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, - 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, - 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, - 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, - 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, - 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251, - 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255, -}; - -DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = { - 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, - 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, - 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, - 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85, - 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179, - 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24, - 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227, - 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167, - 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229, - 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, - 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, - 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, - 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, - 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, - 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236, - 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255, -}; - -DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = { - 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20, - 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, - 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, - 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100, - 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102, - 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160, - 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176, - 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136, - 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166, - 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, - 197, 62, 
154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, - 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, - 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, - 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158, - 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175, - 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255, -}; - -DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = { - 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100, - 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197, - 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136, - 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451, - 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, 453, 139, 44, 234, - 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108, 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577, 486, 77, 204, 362, - 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, 111, 238, 48, 143, - 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51, 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424, 393, 300, 269, 176, 145, - 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301, 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395, - 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737, 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241, 210, 179, 117, 86, 55, 738, 707, - 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491, 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676, 645, 552, 521, 428, 397, 304, - 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553, 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26, 864, 833, 802, 771, 740, 709, - 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741, 710, 679, 617, 586, 555, 493, - 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, 743, 619, 495, 371, 247, 123, - 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680, 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929, 898, 836, 805, 774, 712, 681, - 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527, - 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590, 559, 497, 466, 435, 373, - 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, 499, 375, 251, 127, - 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560, 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716, 685, 654, 592, 561, - 530, 468, 437, 406, 344, 313, 282, 
220, 189, 158, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345, - 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718, 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998, 967, 874, 843, 750, - 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, 564, 533, 440, 409, - 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534, 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659, - 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, 350, 319, 1002, 971, - 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631, 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568, 537, 444, 413, 972, - 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, - 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571, 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479, 1007, 883, 759, 635, 511, - 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945, 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915, 884, 853, 822, 791, - 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607, 1011, 887, 763, 639, - 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825, 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982, - 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, 1016, 985, 954, 923, - 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863, 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, -}; /* Array indices are identical to previously-existing CONTEXT_NODE indices */ -const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */ -{ +const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = { -DCT_EOB_TOKEN, 2, /* 0 = EOB */ -ZERO_TOKEN, 4, /* 1 = ZERO */ -ONE_TOKEN, 6, /* 2 = ONE */ @@ -419,7 +274,7 @@ static void init_bit_trees() { init_bit_tree(cat6, 14); } -const vp9_extra_bit vp9_extra_bits[12] = { +const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = { { 0, 0, 0, 0}, { 0, 0, 0, 1}, { 0, 0, 0, 2}, @@ -443,159 +298,7 @@ void vp9_default_coef_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32); } -// Neighborhood 5-tuples for various scans and blocksizes, -// in {top, left, topleft, topright, bottomleft} order -// for each position in raster scan order. -// -1 indicates the neighbor does not exist. 
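The scan tables and neighbor machinery are being moved out of vp9_entropy.c wholesale: vp9_initialize_common() now calls vp9_init_neighbors() directly (see the vp9_alloccommon.c hunk earlier in this diff), and the destination file is not part of this excerpt. Note that despite the removed comment's "5-tuples", MAX_NEIGHBORS is 2: each position stores only a top and a left neighbor, plus one padding entry (hence 17 * MAX_NEIGHBORS for the 4x4 scan). A sketch of how these tables are consumed, assuming the get_coef_context() form vp9_entropy.h uses in this era, where token_cache holds the energy class of each already-coded coefficient:

#define MAX_NEIGHBORS 2

// Context for the coefficient at scan position c: the rounded average of its
// two stored neighbors' energy classes.  Edge positions store the same
// neighbor twice, so (a + b + 1) >> 1 degenerates to a without a branch.
static INLINE int get_coef_context(const int16_t *neighbors,
                                   const uint8_t *token_cache, int c) {
  return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
          token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}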
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-
-static int find_in_scan(const int16_t *scan, int l, int idx) {
-  int n, l2 = l * l;
-  for (n = 0; n < l2; n++) {
-    int rc = scan[n];
-    if (rc == idx)
-      return n;
-  }
-  assert(0);
-  return -1;
-}
-static void init_scan_neighbors(const int16_t *scan,
-                                int16_t *iscan,
-                                int l, int16_t *neighbors) {
-  int l2 = l * l;
-  int n, i, j;
-
-  // dc doesn't use this type of prediction
-  neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
-  neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
-  iscan[0] = find_in_scan(scan, l, 0);
-  for (n = 1; n < l2; n++) {
-    int rc = scan[n];
-    iscan[n] = find_in_scan(scan, l, n);
-    i = rc / l;
-    j = rc % l;
-    if (i > 0 && j > 0) {
-      // col/row scan is used for adst/dct, and generally means that
-      // energy decreases to zero much faster in the dimension in
-      // which ADST is used compared to the direction in which DCT
-      // is used. Likewise, we find much higher correlation between
-      // coefficients within the direction in which DCT is used.
-      // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
-      // as a context. If ADST or DCT is used in both directions, we
-      // use the combination of the two as a context.
- int a = (i - 1) * l + j; - int b = i * l + j - 1; - if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 || - scan == vp9_col_scan_16x16) { - // in the col/row scan cases (as well as left/top edge cases), we set - // both contexts to the same value, so we can branchlessly do a+b+1>>1 - // which automatically becomes a if a == b - neighbors[MAX_NEIGHBORS * n + 0] = - neighbors[MAX_NEIGHBORS * n + 1] = a; - } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 || - scan == vp9_row_scan_16x16) { - neighbors[MAX_NEIGHBORS * n + 0] = - neighbors[MAX_NEIGHBORS * n + 1] = b; - } else { - neighbors[MAX_NEIGHBORS * n + 0] = a; - neighbors[MAX_NEIGHBORS * n + 1] = b; - } - } else if (i > 0) { - neighbors[MAX_NEIGHBORS * n + 0] = - neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j; - } else { - assert(j > 0); - neighbors[MAX_NEIGHBORS * n + 0] = - neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1; - } - assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n); - } - // one padding item so we don't have to add branches in code to handle - // calls to get_coef_context() for the token after the final dc token - neighbors[MAX_NEIGHBORS * l2 + 0] = 0; - neighbors[MAX_NEIGHBORS * l2 + 1] = 0; -} - -void vp9_init_neighbors() { - init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4, - vp9_default_scan_4x4_neighbors); - init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4, - vp9_row_scan_4x4_neighbors); - init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4, - vp9_col_scan_4x4_neighbors); - init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8, - vp9_default_scan_8x8_neighbors); - init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8, - vp9_row_scan_8x8_neighbors); - init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8, - vp9_col_scan_8x8_neighbors); - init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16, - vp9_default_scan_16x16_neighbors); - init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16, - vp9_row_scan_16x16_neighbors); - init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16, - vp9_col_scan_16x16_neighbors); - init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32, - vp9_default_scan_32x32_neighbors); -} - -const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) { - if (scan == vp9_default_scan_4x4) { - return vp9_default_scan_4x4_neighbors; - } else if (scan == vp9_row_scan_4x4) { - return vp9_row_scan_4x4_neighbors; - } else if (scan == vp9_col_scan_4x4) { - return vp9_col_scan_4x4_neighbors; - } else if (scan == vp9_default_scan_8x8) { - return vp9_default_scan_8x8_neighbors; - } else if (scan == vp9_row_scan_8x8) { - return vp9_row_scan_8x8_neighbors; - } else if (scan == vp9_col_scan_8x8) { - return vp9_col_scan_8x8_neighbors; - } else if (scan == vp9_default_scan_16x16) { - return vp9_default_scan_16x16_neighbors; - } else if (scan == vp9_row_scan_16x16) { - return vp9_row_scan_16x16_neighbors; - } else if (scan == vp9_col_scan_16x16) { - return vp9_col_scan_16x16_neighbors; - } else { - assert(scan == vp9_default_scan_32x32); - return vp9_default_scan_32x32_neighbors; - } -} - void vp9_coef_tree_initialize() { - vp9_init_neighbors(); init_bit_trees(); vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); } @@ -612,16 +315,15 @@ void vp9_coef_tree_initialize() { static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, unsigned int count_sat, unsigned int update_factor) { - FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + const FRAME_CONTEXT 
*pre_fc = &cm->frame_contexts[cm->frame_context_idx]; vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size]; - vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size]; + const vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size]; vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size]; unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = cm->counts.eob_branch[tx_size]; - int t, i, j, k, l; + int i, j, k, l, m; unsigned int branch_ct[UNCONSTRAINED_NODES][2]; - vp9_prob coef_probs[UNCONSTRAINED_NODES]; for (i = 0; i < BLOCK_TYPES; ++i) for (j = 0; j < REF_TYPES; ++j) @@ -629,15 +331,14 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { if (l >= 3 && k == 0) continue; - vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs, - branch_ct, coef_counts[i][j][k][l], - 0); + vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct, + coef_counts[i][j][k][l], 0); branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; - coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - for (t = 0; t < UNCONSTRAINED_NODES; ++t) - dst_coef_probs[i][j][k][l][t] = merge_probs( - pre_coef_probs[i][j][k][l][t], coef_probs[t], - branch_ct[t], count_sat, update_factor); + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + dst_coef_probs[i][j][k][l][m] = merge_probs( + pre_coef_probs[i][j][k][l][m], + branch_ct[m], + count_sat, update_factor); } } @@ -645,7 +346,7 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) { TX_SIZE t; unsigned int count_sat, update_factor; - if (cm->frame_type == KEY_FRAME || cm->intra_only) { + if (frame_is_intra_only(cm)) { update_factor = COEF_MAX_UPDATE_FACTOR_KEY; count_sat = COEF_COUNT_SAT_KEY; } else if (cm->last_frame_type == KEY_FRAME) { diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index f138c09..c58e852 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -12,9 +12,13 @@ #define VP9_COMMON_VP9_ENTROPY_H_ #include "vpx/vpx_integer.h" -#include "vp9/common/vp9_treecoder.h" + #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/common/vp9_treecoder.h" + +#define DIFF_UPDATE_PROB 252 /* Coefficient token alphabet */ @@ -36,7 +40,10 @@ #define INTER_MODE_CONTEXTS 7 -extern const vp9_tree_index vp9_coef_tree[]; +extern DECLARE_ALIGNED(16, const uint8_t, + vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); + +extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)]; #define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */ extern const vp9_tree_index vp9_coefmodel_tree[]; @@ -44,13 +51,14 @@ extern const vp9_tree_index vp9_coefmodel_tree[]; extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; typedef struct { - vp9_tree_p tree; + vp9_tree_index *tree; const vp9_prob *prob; int len; int base_val; } vp9_extra_bit; -extern const vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */ +// indexed by token value +extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS]; #define MAX_PROB 255 #define DCT_MAX_VALUE 16384 @@ -88,72 +96,14 @@ typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [ENTROPY_NODES][2]; -typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [ENTROPY_NODES]; #define SUBEXP_PARAM 4 /* Subexponential code parameter */ 
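The adapt_coef_probs() rewrite above drops the intermediate coef_probs[] array: rather than first converting branch counts into probabilities and then merging those against the previous frame context, the new merge_probs() signature consumes the branch counts directly. merge_probs() itself is not part of this diff, so the following is a minimal self-contained sketch of the count-saturated update it is expected to perform; the helper bodies are reconstructions, not the library's code:

typedef unsigned char vp9_prob;

static vp9_prob clip_prob(int p) {
  return (vp9_prob)(p > 255 ? 255 : (p < 1 ? 1 : p));
}

/* Maximum-likelihood probability of taking the 0-branch, rounded. */
static vp9_prob get_binary_prob(unsigned int n0, unsigned int n1) {
  const unsigned int den = n0 + n1;
  if (den == 0)
    return 128;  /* no observations: stay at the midpoint */
  return clip_prob((int)((255ULL * n0 + den / 2) / den));
}

/* Blend the previous probability toward the measured one. The blend
 * factor grows linearly with the observed event count, saturating at
 * count_sat, and is scaled by max_update_factor/256 (the COEF_COUNT_SAT_*
 * and COEF_MAX_UPDATE_FACTOR_* constants chosen in the hunk above). */
static vp9_prob merge_probs(vp9_prob pre_prob, const unsigned int ct[2],
                            unsigned int count_sat,
                            unsigned int max_update_factor) {
  const vp9_prob prob = get_binary_prob(ct[0], ct[1]);
  const unsigned int count =
      ct[0] + ct[1] < count_sat ? ct[0] + ct[1] : count_sat;
  const unsigned int factor = max_update_factor * count / count_sat;
  return (vp9_prob)((pre_prob * (256 - factor) + prob * factor + 128) >> 8);
}

With factor == 0 this leaves pre_prob untouched, while factor == 256 would jump all the way to the new estimate; the update-factor caps seen at the call sites keep the adaptation gradual across frames.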
#define MODULUS_PARAM 13 /* Modulus parameter */ struct VP9Common; void vp9_default_coef_probs(struct VP9Common *cm); -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]); -extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]); - -extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); -extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); -extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]); -extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]); - -extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); -#define MAX_NEIGHBORS 2 - -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); -extern DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); - -void vp9_coef_tree_initialize(void); +void vp9_coef_tree_initialize(); void vp9_adapt_coef_probs(struct VP9Common *cm); static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { @@ -183,16 +133,6 @@ static int get_coef_band(const uint8_t * band_translate, int coef_index) { ? 
(COEF_BANDS-1) : band_translate[coef_index]; } -static INLINE int get_coef_context(const int16_t *neighbors, - uint8_t *token_cache, - int c) { - return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + - token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; -} - -const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan); - - // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly @@ -210,171 +150,62 @@ typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS] typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] [PREV_COEF_CONTEXTS] [UNCONSTRAINED_NODES + 1]; -typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] - [UNCONSTRAINED_NODES][2]; void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_scan_4x4; - case DCT_ADST: - return vp9_col_scan_4x4; - default: - return vp9_default_scan_4x4; - } -} +static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { + ENTROPY_CONTEXT above_ec = 0, left_ec = 0; -static INLINE void get_scan_nb_4x4(TX_TYPE tx_type, - const int16_t **scan, const int16_t **nb) { - switch (tx_type) { - case ADST_DCT: - *scan = vp9_row_scan_4x4; - *nb = vp9_row_scan_4x4_neighbors; - break; - case DCT_ADST: - *scan = vp9_col_scan_4x4; - *nb = vp9_col_scan_4x4_neighbors; - break; - default: - *scan = vp9_default_scan_4x4; - *nb = vp9_default_scan_4x4_neighbors; + switch (tx_size) { + case TX_4X4: + above_ec = a[0] != 0; + left_ec = l[0] != 0; break; - } -} - -static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_iscan_4x4; - case DCT_ADST: - return vp9_col_iscan_4x4; - default: - return vp9_default_iscan_4x4; - } -} - -static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_scan_8x8; - case DCT_ADST: - return vp9_col_scan_8x8; - default: - return vp9_default_scan_8x8; - } -} - -static INLINE void get_scan_nb_8x8(TX_TYPE tx_type, - const int16_t **scan, const int16_t **nb) { - switch (tx_type) { - case ADST_DCT: - *scan = vp9_row_scan_8x8; - *nb = vp9_row_scan_8x8_neighbors; + case TX_8X8: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint16_t *)l; break; - case DCT_ADST: - *scan = vp9_col_scan_8x8; - *nb = vp9_col_scan_8x8_neighbors; + case TX_16X16: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint32_t *)l; break; - default: - *scan = vp9_default_scan_8x8; - *nb = vp9_default_scan_8x8_neighbors; + case TX_32X32: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint64_t *)l; break; - } -} - -static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_iscan_8x8; - case DCT_ADST: - return vp9_col_iscan_8x8; default: - return vp9_default_iscan_8x8; - } -} - -static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_scan_16x16; - case DCT_ADST: - return vp9_col_scan_16x16; - default: - return vp9_default_scan_16x16; + assert(!"Invalid transform size."); } -} -static INLINE void get_scan_nb_16x16(TX_TYPE tx_type, - const int16_t **scan, const int16_t **nb) { - switch (tx_type) { - case ADST_DCT: - *scan = vp9_row_scan_16x16; - *nb = vp9_row_scan_16x16_neighbors; - break; - 
case DCT_ADST: - *scan = vp9_col_scan_16x16; - *nb = vp9_col_scan_16x16_neighbors; - break; - default: - *scan = vp9_default_scan_16x16; - *nb = vp9_default_scan_16x16_neighbors; - break; - } + return combine_entropy_contexts(above_ec, left_ec); } -static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) { - switch (tx_type) { - case ADST_DCT: - return vp9_row_iscan_16x16; - case DCT_ADST: - return vp9_col_iscan_16x16; - default: - return vp9_default_iscan_16x16; - } +static const uint8_t *get_band_translate(TX_SIZE tx_size) { + return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 + : vp9_coefband_trans_8x8plus; } -static int get_entropy_context(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx, - ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, - const int16_t **scan, - const uint8_t **band_translate) { - ENTROPY_CONTEXT above_ec = 0, left_ec = 0; - +static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx, + const int16_t **scan, const int16_t **scan_nb) { switch (tx_size) { case TX_4X4: - *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); - *band_translate = vp9_coefband_trans_4x4; - above_ec = A[0] != 0; - left_ec = L[0] != 0; + get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb); break; case TX_8X8: - *scan = get_scan_8x8(get_tx_type_8x8(type, xd)); - *band_translate = vp9_coefband_trans_8x8plus; - above_ec = !!*(uint16_t *)A; - left_ec = !!*(uint16_t *)L; + get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb); break; case TX_16X16: - *scan = get_scan_16x16(get_tx_type_16x16(type, xd)); - *band_translate = vp9_coefband_trans_8x8plus; - above_ec = !!*(uint32_t *)A; - left_ec = !!*(uint32_t *)L; + get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb); break; case TX_32X32: *scan = vp9_default_scan_32x32; - *band_translate = vp9_coefband_trans_8x8plus; - above_ec = !!*(uint64_t *)A; - left_ec = !!*(uint64_t *)L; + *scan_nb = vp9_default_scan_32x32_neighbors; break; default: assert(!"Invalid transform size."); } - - return combine_entropy_contexts(above_ec, left_ec); } -enum { VP9_COEF_UPDATE_PROB = 252 }; - #endif // VP9_COMMON_VP9_ENTROPY_H_ diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index 93c89b0..a963d55 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -14,204 +14,199 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_seg_common.h" -const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES] - [INTRA_MODES - 1] = { - { 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */, - { 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */, - { 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */, - { 120, 11, 50, 123, 163, 135, 64, 77, 103 } /* y = d45 */, - { 113, 9, 36, 155, 111, 157, 32, 44, 161 } /* y = d135 */, - { 116, 9, 55, 176, 76, 96, 37, 61, 149 } /* y = d117 */, - { 115, 9, 28, 141, 161, 167, 21, 25, 193 } /* y = d153 */, - { 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d207 */, - { 116, 12, 64, 120, 140, 125, 49, 115, 121 } /* y = d63 */, - { 102, 19, 66, 162, 182, 122, 35, 59, 128 } /* y = tm */ +const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1] = { + { // above = dc + { 137, 30, 42, 148, 151, 207, 70, 52, 91 }, // left = dc + { 92, 45, 102, 136, 116, 180, 74, 90, 100 }, // left = v + { 73, 32, 19, 187, 222, 215, 46, 34, 100 }, // left = h + { 91, 30, 32, 116, 121, 186, 93, 86, 94 }, // left = d45 + { 72, 35, 36, 149, 68, 206, 68, 63, 105 }, // left = d135 + { 73, 
31, 28, 138, 57, 124, 55, 122, 151 }, // left = d117 + { 67, 23, 21, 140, 126, 197, 40, 37, 171 }, // left = d153 + { 86, 27, 28, 128, 154, 212, 45, 43, 53 }, // left = d207 + { 74, 32, 27, 107, 86, 160, 63, 134, 102 }, // left = d63 + { 59, 67, 44, 140, 161, 202, 78, 67, 119 } // left = tm + }, { // above = v + { 63, 36, 126, 146, 123, 158, 60, 90, 96 }, // left = dc + { 43, 46, 168, 134, 107, 128, 69, 142, 92 }, // left = v + { 44, 29, 68, 159, 201, 177, 50, 57, 77 }, // left = h + { 58, 38, 76, 114, 97, 172, 78, 133, 92 }, // left = d45 + { 46, 41, 76, 140, 63, 184, 69, 112, 57 }, // left = d135 + { 38, 32, 85, 140, 46, 112, 54, 151, 133 }, // left = d117 + { 39, 27, 61, 131, 110, 175, 44, 75, 136 }, // left = d153 + { 52, 30, 74, 113, 130, 175, 51, 64, 58 }, // left = d207 + { 47, 35, 80, 100, 74, 143, 64, 163, 74 }, // left = d63 + { 36, 61, 116, 114, 128, 162, 80, 125, 82 } // left = tm + }, { // above = h + { 82, 26, 26, 171, 208, 204, 44, 32, 105 }, // left = dc + { 55, 44, 68, 166, 179, 192, 57, 57, 108 }, // left = v + { 42, 26, 11, 199, 241, 228, 23, 15, 85 }, // left = h + { 68, 42, 19, 131, 160, 199, 55, 52, 83 }, // left = d45 + { 58, 50, 25, 139, 115, 232, 39, 52, 118 }, // left = d135 + { 50, 35, 33, 153, 104, 162, 64, 59, 131 }, // left = d117 + { 44, 24, 16, 150, 177, 202, 33, 19, 156 }, // left = d153 + { 55, 27, 12, 153, 203, 218, 26, 27, 49 }, // left = d207 + { 53, 49, 21, 110, 116, 168, 59, 80, 76 }, // left = d63 + { 38, 72, 19, 168, 203, 212, 50, 50, 107 } // left = tm + }, { // above = d45 + { 103, 26, 36, 129, 132, 201, 83, 80, 93 }, // left = dc + { 59, 38, 83, 112, 103, 162, 98, 136, 90 }, // left = v + { 62, 30, 23, 158, 200, 207, 59, 57, 50 }, // left = h + { 67, 30, 29, 84, 86, 191, 102, 91, 59 }, // left = d45 + { 60, 32, 33, 112, 71, 220, 64, 89, 104 }, // left = d135 + { 53, 26, 34, 130, 56, 149, 84, 120, 103 }, // left = d117 + { 53, 21, 23, 133, 109, 210, 56, 77, 172 }, // left = d153 + { 77, 19, 29, 112, 142, 228, 55, 66, 36 }, // left = d207 + { 61, 29, 29, 93, 97, 165, 83, 175, 162 }, // left = d63 + { 47, 47, 43, 114, 137, 181, 100, 99, 95 } // left = tm + }, { // above = d135 + { 69, 23, 29, 128, 83, 199, 46, 44, 101 }, // left = dc + { 53, 40, 55, 139, 69, 183, 61, 80, 110 }, // left = v + { 40, 29, 19, 161, 180, 207, 43, 24, 91 }, // left = h + { 60, 34, 19, 105, 61, 198, 53, 64, 89 }, // left = d45 + { 52, 31, 22, 158, 40, 209, 58, 62, 89 }, // left = d135 + { 44, 31, 29, 147, 46, 158, 56, 102, 198 }, // left = d117 + { 35, 19, 12, 135, 87, 209, 41, 45, 167 }, // left = d153 + { 55, 25, 21, 118, 95, 215, 38, 39, 66 }, // left = d207 + { 51, 38, 25, 113, 58, 164, 70, 93, 97 }, // left = d63 + { 47, 54, 34, 146, 108, 203, 72, 103, 151 } // left = tm + }, { // above = d117 + { 64, 19, 37, 156, 66, 138, 49, 95, 133 }, // left = dc + { 46, 27, 80, 150, 55, 124, 55, 121, 135 }, // left = v + { 36, 23, 27, 165, 149, 166, 54, 64, 118 }, // left = h + { 53, 21, 36, 131, 63, 163, 60, 109, 81 }, // left = d45 + { 40, 26, 35, 154, 40, 185, 51, 97, 123 }, // left = d135 + { 35, 19, 34, 179, 19, 97, 48, 129, 124 }, // left = d117 + { 36, 20, 26, 136, 62, 164, 33, 77, 154 }, // left = d153 + { 45, 18, 32, 130, 90, 157, 40, 79, 91 }, // left = d207 + { 45, 26, 28, 129, 45, 129, 49, 147, 123 }, // left = d63 + { 38, 44, 51, 136, 74, 162, 57, 97, 121 } // left = tm + }, { // above = d153 + { 75, 17, 22, 136, 138, 185, 32, 34, 166 }, // left = dc + { 56, 39, 58, 133, 117, 173, 48, 53, 187 }, // left = v + { 35, 21, 12, 161, 212, 207, 20, 23, 145 }, // left = h + { 
56, 29, 19, 117, 109, 181, 55, 68, 112 }, // left = d45 + { 47, 29, 17, 153, 64, 220, 59, 51, 114 }, // left = d135 + { 46, 16, 24, 136, 76, 147, 41, 64, 172 }, // left = d117 + { 34, 17, 11, 108, 152, 187, 13, 15, 209 }, // left = d153 + { 51, 24, 14, 115, 133, 209, 32, 26, 104 }, // left = d207 + { 55, 30, 18, 122, 79, 179, 44, 88, 116 }, // left = d63 + { 37, 49, 25, 129, 168, 164, 41, 54, 148 } // left = tm + }, { // above = d207 + { 82, 22, 32, 127, 143, 213, 39, 41, 70 }, // left = dc + { 62, 44, 61, 123, 105, 189, 48, 57, 64 }, // left = v + { 47, 25, 17, 175, 222, 220, 24, 30, 86 }, // left = h + { 68, 36, 17, 106, 102, 206, 59, 74, 74 }, // left = d45 + { 57, 39, 23, 151, 68, 216, 55, 63, 58 }, // left = d135 + { 49, 30, 35, 141, 70, 168, 82, 40, 115 }, // left = d117 + { 51, 25, 15, 136, 129, 202, 38, 35, 139 }, // left = d153 + { 68, 26, 16, 111, 141, 215, 29, 28, 28 }, // left = d207 + { 59, 39, 19, 114, 75, 180, 77, 104, 42 }, // left = d63 + { 40, 61, 26, 126, 152, 206, 61, 59, 93 } // left = tm + }, { // above = d63 + { 78, 23, 39, 111, 117, 170, 74, 124, 94 }, // left = dc + { 48, 34, 86, 101, 92, 146, 78, 179, 134 }, // left = v + { 47, 22, 24, 138, 187, 178, 68, 69, 59 }, // left = h + { 56, 25, 33, 105, 112, 187, 95, 177, 129 }, // left = d45 + { 48, 31, 27, 114, 63, 183, 82, 116, 56 }, // left = d135 + { 43, 28, 37, 121, 63, 123, 61, 192, 169 }, // left = d117 + { 42, 17, 24, 109, 97, 177, 56, 76, 122 }, // left = d153 + { 58, 18, 28, 105, 139, 182, 70, 92, 63 }, // left = d207 + { 46, 23, 32, 74, 86, 150, 67, 183, 88 }, // left = d63 + { 36, 38, 48, 92, 122, 165, 88, 137, 91 } // left = tm + }, { // above = tm + { 65, 70, 60, 155, 159, 199, 61, 60, 81 }, // left = dc + { 44, 78, 115, 132, 119, 173, 71, 112, 93 }, // left = v + { 39, 38, 21, 184, 227, 206, 42, 32, 64 }, // left = h + { 58, 47, 36, 124, 137, 193, 80, 82, 78 }, // left = d45 + { 49, 50, 35, 144, 95, 205, 63, 78, 59 }, // left = d135 + { 41, 53, 52, 148, 71, 142, 65, 128, 51 }, // left = d117 + { 40, 36, 28, 143, 143, 202, 40, 55, 137 }, // left = d153 + { 52, 34, 29, 129, 183, 227, 42, 35, 43 }, // left = d207 + { 42, 44, 44, 104, 105, 164, 64, 130, 80 }, // left = d63 + { 43, 81, 53, 140, 169, 204, 68, 84, 72 } // left = tm + } }; -static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS] - [INTRA_MODES - 1] = { - { 65, 32, 18, 144, 162, 194, 41, 51, 98 } /* block_size < 8x8 */, - { 132, 68, 18, 165, 217, 196, 45, 40, 78 } /* block_size < 16x16 */, - { 173, 80, 19, 176, 240, 193, 64, 35, 46 } /* block_size < 32x32 */, - { 221, 135, 38, 194, 248, 121, 96, 85, 29 } /* block_size >= 32x32 */ +const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = { + { 144, 11, 54, 157, 195, 130, 46, 58, 108 }, // y = dc + { 118, 15, 123, 148, 131, 101, 44, 93, 131 }, // y = v + { 113, 12, 23, 188, 226, 142, 26, 32, 125 }, // y = h + { 120, 11, 50, 123, 163, 135, 64, 77, 103 }, // y = d45 + { 113, 9, 36, 155, 111, 157, 32, 44, 161 }, // y = d135 + { 116, 9, 55, 176, 76, 96, 37, 61, 149 }, // y = d117 + { 115, 9, 28, 141, 161, 167, 21, 25, 193 }, // y = d153 + { 120, 12, 32, 145, 195, 142, 32, 38, 86 }, // y = d207 + { 116, 12, 64, 120, 140, 125, 49, 115, 121 }, // y = d63 + { 102, 19, 66, 162, 182, 122, 35, 59, 128 } // y = tm }; -static const vp9_prob default_if_uv_probs[INTRA_MODES] - [INTRA_MODES - 1] = { - { 120, 7, 76, 176, 208, 126, 28, 54, 103 } /* y = dc */, - { 48, 12, 154, 155, 139, 90, 34, 117, 119 } /* y = v */, - { 67, 6, 25, 204, 243, 158, 13, 21, 96 } /* y = h */, - { 97, 5, 44, 131, 176, 139, 
48, 68, 97 } /* y = d45 */, - { 83, 5, 42, 156, 111, 152, 26, 49, 152 } /* y = d135 */, - { 80, 5, 58, 178, 74, 83, 33, 62, 145 } /* y = d117 */, - { 86, 5, 32, 154, 192, 168, 14, 22, 163 } /* y = d153 */, - { 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d207 */, - { 77, 7, 64, 116, 132, 122, 37, 126, 120 } /* y = d63 */, - { 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */ +static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = { + { 65, 32, 18, 144, 162, 194, 41, 51, 98 }, // block_size < 8x8 + { 132, 68, 18, 165, 217, 196, 45, 40, 78 }, // block_size < 16x16 + { 173, 80, 19, 176, 240, 193, 64, 35, 46 }, // block_size < 32x32 + { 221, 135, 38, 194, 248, 121, 96, 85, 29 } // block_size >= 32x32 }; -static const vp9_prob default_partition_probs[NUM_FRAME_TYPES] - [NUM_PARTITION_CONTEXTS] - [PARTITION_TYPES - 1] = { - { /* frame_type = keyframe */ - /* 8x8 -> 4x4 */ - { 158, 97, 94 } /* a/l both not split */, - { 93, 24, 99 } /* a split, l not split */, - { 85, 119, 44 } /* l split, a not split */, - { 62, 59, 67 } /* a/l both split */, - /* 16x16 -> 8x8 */ - { 149, 53, 53 } /* a/l both not split */, - { 94, 20, 48 } /* a split, l not split */, - { 83, 53, 24 } /* l split, a not split */, - { 52, 18, 18 } /* a/l both split */, - /* 32x32 -> 16x16 */ - { 150, 40, 39 } /* a/l both not split */, - { 78, 12, 26 } /* a split, l not split */, - { 67, 33, 11 } /* l split, a not split */, - { 24, 7, 5 } /* a/l both split */, - /* 64x64 -> 32x32 */ - { 174, 35, 49 } /* a/l both not split */, - { 68, 11, 27 } /* a split, l not split */, - { 57, 15, 9 } /* l split, a not split */, - { 12, 3, 3 } /* a/l both split */ - }, { /* frame_type = interframe */ - /* 8x8 -> 4x4 */ - { 199, 122, 141 } /* a/l both not split */, - { 147, 63, 159 } /* a split, l not split */, - { 148, 133, 118 } /* l split, a not split */, - { 121, 104, 114 } /* a/l both split */, - /* 16x16 -> 8x8 */ - { 174, 73, 87 } /* a/l both not split */, - { 92, 41, 83 } /* a split, l not split */, - { 82, 99, 50 } /* l split, a not split */, - { 53, 39, 39 } /* a/l both split */, - /* 32x32 -> 16x16 */ - { 177, 58, 59 } /* a/l both not split */, - { 68, 26, 63 } /* a split, l not split */, - { 52, 79, 25 } /* l split, a not split */, - { 17, 14, 12 } /* a/l both split */, - /* 64x64 -> 32x32 */ - { 222, 34, 30 } /* a/l both not split */, - { 72, 16, 44 } /* a split, l not split */, - { 58, 32, 12 } /* l split, a not split */, - { 10, 7, 6 } /* a/l both split */ - } +static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { + { 120, 7, 76, 176, 208, 126, 28, 54, 103 }, // y = dc + { 48, 12, 154, 155, 139, 90, 34, 117, 119 }, // y = v + { 67, 6, 25, 204, 243, 158, 13, 21, 96 }, // y = h + { 97, 5, 44, 131, 176, 139, 48, 68, 97 }, // y = d45 + { 83, 5, 42, 156, 111, 152, 26, 49, 152 }, // y = d135 + { 80, 5, 58, 178, 74, 83, 33, 62, 145 }, // y = d117 + { 86, 5, 32, 154, 192, 168, 14, 22, 163 }, // y = d153 + { 85, 5, 32, 156, 216, 148, 19, 29, 73 }, // y = d207 + { 77, 7, 64, 116, 132, 122, 37, 126, 120 }, // y = d63 + { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm }; -const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES] - [INTRA_MODES] - [INTRA_MODES - 1] = { - { /* above = dc */ - { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */, - { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */, - { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */, - { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */, - { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */, - { 
73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */, - { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */, - { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d207 */, - { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */, - { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */ - }, { /* above = v */ - { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */, - { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */, - { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */, - { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */, - { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */, - { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */, - { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */, - { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d207 */, - { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */, - { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */ - }, { /* above = h */ - { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */, - { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */, - { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */, - { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */, - { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */, - { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */, - { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */, - { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d207 */, - { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */, - { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */ - }, { /* above = d45 */ - { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */, - { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */, - { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */, - { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */, - { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */, - { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */, - { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */, - { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d207 */, - { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */, - { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */ - }, { /* above = d135 */ - { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */, - { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */, - { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */, - { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */, - { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */, - { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */, - { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */, - { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d207 */, - { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */, - { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */ - }, { /* above = d117 */ - { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */, - { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */, - { 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */, - { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */, - { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */, - { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */, - { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */, - { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d207 */, - { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */, - { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */ - }, { /* above = 
d153 */ - { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */, - { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */, - { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */, - { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */, - { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */, - { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */, - { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */, - { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d207 */, - { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */, - { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */ - }, { /* above = d207 */ - { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */, - { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */, - { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */, - { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */, - { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */, - { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */, - { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */, - { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d207 */, - { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */, - { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */ - }, { /* above = d63 */ - { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */, - { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */, - { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */, - { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */, - { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */, - { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */, - { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */, - { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d207 */, - { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */, - { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */ - }, { /* above = tm */ - { 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */, - { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */, - { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */, - { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */, - { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */, - { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */, - { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */, - { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d207 */, - { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */, - { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */ - } +const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 158, 97, 94 }, // a/l both not split + { 93, 24, 99 }, // a split, l not split + { 85, 119, 44 }, // l split, a not split + { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 + { 149, 53, 53 }, // a/l both not split + { 94, 20, 48 }, // a split, l not split + { 83, 53, 24 }, // l split, a not split + { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 + { 150, 40, 39 }, // a/l both not split + { 78, 12, 26 }, // a split, l not split + { 67, 33, 11 }, // l split, a not split + { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 + { 174, 35, 49 }, // a/l both not split + { 68, 11, 27 }, // a split, l not split + { 57, 15, 9 }, // l split, a not split + { 12, 3, 3 }, // a/l both split +}; + +static const vp9_prob default_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 199, 122, 141 }, // a/l both not split + { 147, 63, 159 }, // a 
split, l not split + { 148, 133, 118 }, // l split, a not split + { 121, 104, 114 }, // a/l both split + // 16x16 -> 8x8 + { 174, 73, 87 }, // a/l both not split + { 92, 41, 83 }, // a split, l not split + { 82, 99, 50 }, // l split, a not split + { 53, 39, 39 }, // a/l both split + // 32x32 -> 16x16 + { 177, 58, 59 }, // a/l both not split + { 68, 26, 63 }, // a split, l not split + { 52, 79, 25 }, // l split, a not split + { 17, 14, 12 }, // a/l both split + // 64x64 -> 32x32 + { 222, 34, 30 }, // a/l both not split + { 72, 16, 44 }, // a split, l not split + { 58, 32, 12 }, // l split, a not split + { 10, 7, 6 }, // a/l both split }; static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] @@ -226,7 +221,7 @@ static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] }; /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ -const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = { +const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = { -DC_PRED, 2, /* 0 = DC_NODE */ -TM_PRED, 4, /* 1 = TM_NODE */ -V_PRED, 6, /* 2 = V_NODE */ @@ -237,22 +232,20 @@ const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = { -D63_PRED, 16, /* 7 = D63_NODE */ -D153_PRED, -D207_PRED /* 8 = D153_NODE */ }; +struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; -const vp9_tree_index vp9_inter_mode_tree[6] = { +const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = { -ZEROMV, 2, -NEARESTMV, 4, -NEARMV, -NEWMV }; +struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; -const vp9_tree_index vp9_partition_tree[6] = { +const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT }; - -struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; -struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; - struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { @@ -286,7 +279,7 @@ static const struct tx_probs default_tx_probs = { { 66 } } }; -void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, +void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p, unsigned int (*ct_32x32p)[2]) { ct_32x32p[0][0] = tx_count_32x32p[TX_4X4]; ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] + @@ -299,7 +292,7 @@ void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, ct_32x32p[2][1] = tx_count_32x32p[TX_32X32]; } -void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, +void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, unsigned int (*ct_16x16p)[2]) { ct_16x16p[0][0] = tx_count_16x16p[TX_4X4]; ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16]; @@ -307,7 +300,7 @@ void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, ct_16x16p[1][1] = tx_count_16x16p[TX_16X16]; } -void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, +void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]) { ct_8x8p[0][0] = tx_count_8x8p[TX_4X4]; ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; @@ -317,8 +310,8 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = { 192, 128, 64 }; -static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1] - [SWITCHABLE_FILTERS-1] = { +static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS - 1] = { { 235, 162, }, { 36, 255, }, { 34, 3, }, @@ -338,7 +331,8 @@ void 
vp9_init_mbmode_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs); } -const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = { +const vp9_tree_index vp9_switchable_interp_tree + [TREE_SIZE(SWITCHABLE_FILTERS)] = { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; @@ -356,76 +350,58 @@ void vp9_entropy_mode_init() { #define COUNT_SAT 20 #define MAX_UPDATE_FACTOR 128 -static int update_ct(vp9_prob pre_prob, vp9_prob prob, unsigned int ct[2]) { - return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); -} - -static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) { - return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); +static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { + return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); } -static void update_mode_probs(int n_modes, - const vp9_tree_index *tree, unsigned int *cnt, - vp9_prob *pre_probs, vp9_prob *dst_probs, - unsigned int tok0_offset) { -#define MAX_PROBS 32 - vp9_prob probs[MAX_PROBS]; - unsigned int branch_ct[MAX_PROBS][2]; - int t; - - assert(n_modes - 1 < MAX_PROBS); - vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset); - for (t = 0; t < n_modes - 1; ++t) - dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]); +static void adapt_probs(const vp9_tree_index *tree, + const vp9_prob *pre_probs, const unsigned int *counts, + unsigned int offset, vp9_prob *probs) { + tree_merge_probs(tree, pre_probs, counts, offset, + COUNT_SAT, MAX_UPDATE_FACTOR, probs); } void vp9_adapt_mode_probs(VP9_COMMON *cm) { int i, j; FRAME_CONTEXT *fc = &cm->fc; - FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - FRAME_COUNTS *counts = &cm->counts; + const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + const FRAME_COUNTS *counts = &cm->counts; for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i], + fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i], counts->intra_inter[i]); for (i = 0; i < COMP_INTER_CONTEXTS; i++) - fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i], + fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i], counts->comp_inter[i]); for (i = 0; i < REF_CONTEXTS; i++) - fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i], + fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i], counts->comp_ref[i]); for (i = 0; i < REF_CONTEXTS; i++) for (j = 0; j < 2; j++) - fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j], + fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]); for (i = 0; i < INTER_MODE_CONTEXTS; i++) - update_mode_probs(INTER_MODES, vp9_inter_mode_tree, - counts->inter_mode[i], pre_fc->inter_mode_probs[i], - fc->inter_mode_probs[i], NEARESTMV); + adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i], + counts->inter_mode[i], NEARESTMV, fc->inter_mode_probs[i]); for (i = 0; i < BLOCK_SIZE_GROUPS; i++) - update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, - counts->y_mode[i], pre_fc->y_mode_prob[i], - fc->y_mode_prob[i], 0); + adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i], + counts->y_mode[i], 0, fc->y_mode_prob[i]); for (i = 0; i < INTRA_MODES; ++i) - update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, - counts->uv_mode[i], pre_fc->uv_mode_prob[i], - fc->uv_mode_prob[i], 0); + adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i], + counts->uv_mode[i], 0, fc->uv_mode_prob[i]); - for (i 
= 0; i < NUM_PARTITION_CONTEXTS; i++) - update_mode_probs(PARTITION_TYPES, vp9_partition_tree, - counts->partition[i], - pre_fc->partition_prob[INTER_FRAME][i], - fc->partition_prob[INTER_FRAME][i], 0); + for (i = 0; i < PARTITION_CONTEXTS; i++) + adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i], + counts->partition[i], 0, fc->partition_prob[i]); if (cm->mcomp_filter_type == SWITCHABLE) { - for (i = 0; i <= SWITCHABLE_FILTERS; i++) - update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree, - counts->switchable_interp[i], - pre_fc->switchable_interp_prob[i], - fc->switchable_interp_prob[i], 0); + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i], + counts->switchable_interp[i], 0, + fc->switchable_interp_prob[i]); } if (cm->tx_mode == TX_MODE_SELECT) { @@ -437,23 +413,23 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); for (j = 0; j < TX_SIZES - 3; ++j) - fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j], + fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); for (j = 0; j < TX_SIZES - 2; ++j) - fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j], + fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); for (j = 0; j < TX_SIZES - 1; ++j) - fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j], + fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); } } for (i = 0; i < MBSKIP_CONTEXTS; ++i) - fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i], + fc->mbskip_probs[i] = adapt_prob(pre_fc->mbskip_probs[i], counts->mbskip[i]); } diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h index 4cf4c03..38b4199 100644 --- a/libvpx/vp9/common/vp9_entropymode.h +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -14,10 +14,9 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_treecoder.h" -#define SUBMVREF_COUNT 5 #define TX_SIZE_CONTEXTS 2 -#define MODE_UPDATE_PROB 252 #define SWITCHABLE_FILTERS 3 // number of switchable filters +#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) // #define MODE_STATS @@ -39,19 +38,20 @@ extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] [INTRA_MODES - 1]; -extern const vp9_tree_index vp9_intra_mode_tree[]; -extern const vp9_tree_index vp9_inter_mode_tree[]; +extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1]; +extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)]; extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; + +extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)]; extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; -// probability models for partition information -extern const vp9_tree_index vp9_partition_tree[]; +extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)]; extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; extern const vp9_tree_index vp9_switchable_interp_tree - [2 * (SWITCHABLE_FILTERS - 1)]; - + [TREE_SIZE(SWITCHABLE_FILTERS)]; extern struct vp9_token 
vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; void vp9_entropy_mode_init(); @@ -62,11 +62,11 @@ void vp9_init_mbmode_probs(struct VP9Common *cm); void vp9_adapt_mode_probs(struct VP9Common *cm); -void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, +void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p, unsigned int (*ct_32x32p)[2]); -void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, +void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, unsigned int (*ct_16x16p)[2]); -void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, +void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]); #endif // VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c index 2e973e5..b061cdb 100644 --- a/libvpx/vp9/common/vp9_entropymv.c +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -18,14 +18,14 @@ /* Integer pel reference mv threshold for use of high-precision 1/8 mv */ #define COMPANDED_MVREF_THRESH 8 -const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = { +const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = { -MV_JOINT_ZERO, 2, -MV_JOINT_HNZVZ, 4, -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ }; struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; -const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = { +const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { -MV_CLASS_0, 2, -MV_CLASS_1, 4, 6, 8, @@ -39,12 +39,12 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = { }; struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; -const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = { +const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1, }; struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; -const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = { +const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = { -0, 2, -1, 4, -2, -3 @@ -53,8 +53,8 @@ struct vp9_token vp9_mv_fp_encodings[4]; static const nmv_context default_nmv_context = { {32, 64, 96}, - { - { /* vert component */ + { // NOLINT + { /* vert component */ // NOLINT 128, /* sign */ {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */ {216}, /* class0 */ @@ -64,7 +64,7 @@ static const nmv_context default_nmv_context = { 160, /* class0_hp bit */ 128, /* hp */ }, - { /* hor component */ + { /* hor component */ // NOLINT 128, /* sign */ {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */ {208}, /* class0 */ @@ -149,7 +149,7 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr, int usehp) { int s, z, c, o, d, e, f; - assert (v != 0); /* should not be zero */ + assert(v != 0); /* should not be zero */ s = v < 0; comp_counts->sign[s] += incr; z = (s ? 
-v : v) - 1; /* magnitude - 1 */ @@ -175,77 +175,63 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts, } } +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { + if (counts != NULL) { + const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); + ++counts->joints[j]; -void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { - const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); - ++counts->joints[j]; - - if (mv_joint_vertical(j)) { - inc_mv_component(mv->row, &counts->comps[0], 1, 1); - } + if (mv_joint_vertical(j)) { + inc_mv_component(mv->row, &counts->comps[0], 1, 1); + } - if (mv_joint_horizontal(j)) { - inc_mv_component(mv->col, &counts->comps[1], 1, 1); + if (mv_joint_horizontal(j)) { + inc_mv_component(mv->col, &counts->comps[1], 1, 1); + } } } static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { - return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); + return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); } -static unsigned int adapt_probs(unsigned int i, - vp9_tree tree, - vp9_prob this_probs[], - const vp9_prob last_probs[], - const unsigned int num_events[]) { - - - const unsigned int left = tree[i] <= 0 - ? num_events[-tree[i]] - : adapt_probs(tree[i], tree, this_probs, last_probs, num_events); - - const unsigned int right = tree[i + 1] <= 0 - ? num_events[-tree[i + 1]] - : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events); - const unsigned int ct[2] = { left, right }; - this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct); - return left + right; +static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, vp9_prob *probs) { + tree_merge_probs(tree, pre_probs, counts, 0, + MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, probs); } - void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { int i, j; - FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + nmv_context *fc = &cm->fc.nmvc; + const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc; + const nmv_context_counts *counts = &cm->counts.mv; - nmv_context *ctx = &cm->fc.nmvc; - nmv_context *pre_ctx = &pre_fc->nmvc; - nmv_context_counts *cts = &cm->counts.mv; - - adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints); + adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, + fc->joints); for (i = 0; i < 2; ++i) { - ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign); - adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes, - pre_ctx->comps[i].classes, cts->comps[i].classes); - adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0, - pre_ctx->comps[i].class0, cts->comps[i].class0); + nmv_component *comp = &fc->comps[i]; + const nmv_component *pre_comp = &pre_fc->comps[i]; + const nmv_component_counts *c = &counts->comps[i]; + + comp->sign = adapt_prob(pre_comp->sign, c->sign); + adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes, + comp->classes); + adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0); for (j = 0; j < MV_OFFSET_BITS; ++j) - ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j], - cts->comps[i].bits[j]); + comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]); for (j = 0; j < CLASS0_SIZE; ++j) - adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j], - pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]); + adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j], + comp->class0_fp[j]); - adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, 
pre_ctx->comps[i].fp, - cts->comps[i].fp); + adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp); if (allow_hp) { - ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp, - cts->comps[i].class0_hp); - ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp); + comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp); + comp->hp = adapt_prob(pre_comp->hp, c->hp); } } } diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h index a10c933..d843f5b 100644 --- a/libvpx/vp9/common/vp9_entropymv.h +++ b/libvpx/vp9/common/vp9_entropymv.h @@ -13,7 +13,7 @@ #define VP9_COMMON_VP9_ENTROPYMV_H_ #include "vp9/common/vp9_treecoder.h" -#include "vpx_config.h" +#include "./vpx_config.h" #include "vp9/common/vp9_blockd.h" struct VP9Common; @@ -43,9 +43,6 @@ static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ; } -extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2]; -extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; - /* Symbols for coding magnitude class of nonzero components */ #define MV_CLASSES 11 typedef enum { @@ -62,9 +59,6 @@ typedef enum { MV_CLASS_10 = 10, /* (1024,2048] integer pel */ } MV_CLASS_TYPE; -extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2]; -extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; - #define CLASS0_BITS 1 /* bits at integer precision for class 0 */ #define CLASS0_SIZE (1 << CLASS0_BITS) #define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) @@ -73,10 +67,20 @@ extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; #define MV_MAX ((1 << MV_MAX_BITS) - 1) #define MV_VALS ((MV_MAX << 1) + 1) -extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2]; +#define MV_IN_USE_BITS 14 +#define MV_UPP ((1 << MV_IN_USE_BITS) - 1) +#define MV_LOW (-(1 << MV_IN_USE_BITS)) + +extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)]; +extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; + +extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)]; +extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; + +extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)]; extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; -extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2]; +extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)]; extern struct vp9_token vp9_mv_fp_encodings[4]; typedef struct { @@ -108,7 +112,6 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset); typedef struct { - unsigned int mvcount[MV_VALS]; unsigned int sign[2]; unsigned int classes[MV_CLASSES]; unsigned int class0[CLASS0_SIZE]; diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h index 1bf0742..1651b90 100644 --- a/libvpx/vp9/common/vp9_enums.h +++ b/libvpx/vp9/common/vp9_enums.h @@ -50,7 +50,7 @@ typedef enum PARTITION_TYPE { } PARTITION_TYPE; #define PARTITION_PLOFFSET 4 // number of probability models per block size -#define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) +#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) typedef enum { TX_4X4 = 0, // 4x4 dct transform @@ -76,4 +76,15 @@ typedef enum { ADST_ADST = 3 // ADST in both directions } TX_TYPE; +typedef enum { + UNKNOWN = 0, + BT_601 = 1, // YUV + BT_709 = 2, // YUV + SMPTE_170 = 3, // YUV + SMPTE_240 = 4, // YUV + RESERVED_1 = 5, + RESERVED_2 = 6, + SRGB = 7 // RGB +} COLOR_SPACE; + #endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/libvpx/vp9/common/vp9_filter.c b/libvpx/vp9/common/vp9_filter.c 
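The vp9_entropymode.c and vp9_entropymv.c hunks above retire two hand-rolled tree walks (update_mode_probs() with its fixed-size scratch arrays, and the recursive adapt_probs()) in favor of a shared tree_merge_probs() helper. Its body is not shown in this diff; the sketch below reconstructs the walk from the deleted vp9_entropymv.c recursion, reusing vp9_prob and merge_probs() from the earlier sketch. The leaf convention and the omission of the token offset are assumptions: the real helper also takes the offset visible at the call sites (e.g. NEARESTMV for the inter-mode tree).

typedef signed char vp9_tree_index;  /* assumed: entries <= 0 are leaves */

/* Post-order walk of the implicit binary tree: a leaf contributes the
 * event count of its (negated) token; each internal node adapts its
 * probability from the left/right subtree totals, which serve as the
 * branch counts. Returns the total count under node i (start at i = 0). */
static unsigned int tree_merge_probs_impl(const vp9_tree_index *tree,
                                          unsigned int i,
                                          const vp9_prob *pre_probs,
                                          const unsigned int *counts,
                                          unsigned int count_sat,
                                          unsigned int max_update_factor,
                                          vp9_prob *probs) {
  const unsigned int left =
      tree[i] <= 0 ? counts[-tree[i]]
                   : tree_merge_probs_impl(tree, tree[i], pre_probs, counts,
                                           count_sat, max_update_factor,
                                           probs);
  const unsigned int right =
      tree[i + 1] <= 0 ? counts[-tree[i + 1]]
                       : tree_merge_probs_impl(tree, tree[i + 1], pre_probs,
                                               counts, count_sat,
                                               max_update_factor, probs);
  const unsigned int ct[2] = { left, right };
  probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
                              count_sat, max_update_factor);
  return left + right;
}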
index 4ac2bc9..79ace14 100644 --- a/libvpx/vp9/common/vp9_filter.c +++ b/libvpx/vp9/common/vp9_filter.c @@ -8,12 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> + #include "vpx_ports/mem.h" #include "vp9/common/vp9_filter.h" -DECLARE_ALIGNED(256, const int16_t, - vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { +DECLARE_ALIGNED(256, const subpel_kernel, + vp9_bilinear_filters[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -33,8 +35,8 @@ DECLARE_ALIGNED(256, const int16_t, }; // Lagrangian interpolation filter -DECLARE_ALIGNED(256, const int16_t, - vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { +DECLARE_ALIGNED(256, const subpel_kernel, + vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -54,8 +56,8 @@ DECLARE_ALIGNED(256, const int16_t, }; // DCT based filter -DECLARE_ALIGNED(256, const int16_t, - vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { +DECLARE_ALIGNED(256, const subpel_kernel, + vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -75,8 +77,8 @@ DECLARE_ALIGNED(256, const int16_t, }; // freqmultiplier = 0.5 -DECLARE_ALIGNED(256, const int16_t, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { +DECLARE_ALIGNED(256, const subpel_kernel, + vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, @@ -94,3 +96,16 @@ DECLARE_ALIGNED(256, const int16_t, { 0, -3, 2, 41, 63, 29, -2, -2}, { 0, -3, 1, 38, 64, 32, -1, -3} }; + + +static const subpel_kernel* vp9_filter_kernels[4] = { + vp9_sub_pel_filters_8, + vp9_sub_pel_filters_8lp, + vp9_sub_pel_filters_8s, + vp9_bilinear_filters +}; + +const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) { + return vp9_filter_kernels[type]; +} + diff --git a/libvpx/vp9/common/vp9_filter.h b/libvpx/vp9/common/vp9_filter.h index 7b1ffae..b1e7e64 100644 --- a/libvpx/vp9/common/vp9_filter.h +++ b/libvpx/vp9/common/vp9_filter.h @@ -11,19 +11,37 @@ #ifndef VP9_COMMON_VP9_FILTER_H_ #define VP9_COMMON_VP9_FILTER_H_ -#include "vpx_config.h" +#include "./vpx_config.h" #include "vpx/vpx_integer.h" +#define FILTER_BITS 7 + #define SUBPEL_BITS 4 #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) #define SUBPEL_SHIFTS (1 << SUBPEL_BITS) #define SUBPEL_TAPS 8 -extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]; -extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][SUBPEL_TAPS]; -extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]; -extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]; -extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]; +typedef enum { + EIGHTTAP = 0, + EIGHTTAP_SMOOTH = 1, + EIGHTTAP_SHARP = 2, + BILINEAR = 3, + SWITCHABLE = 4 /* should be the last one */ +} INTERPOLATION_TYPE; + +typedef int16_t subpel_kernel[SUBPEL_TAPS]; + +struct subpix_fn_table { + const subpel_kernel *filter_x; + const subpel_kernel *filter_y; +}; + +const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type); + +extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS]; +extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS]; +extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]; +extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]; // The 
VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // filter kernel as a 2 tap filter. diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c index 49a731f..b91c501 100644 --- a/libvpx/vp9/common/vp9_findnearmv.c +++ b/libvpx/vp9/common/vp9_findnearmv.c @@ -22,14 +22,12 @@ static void lower_mv_precision(MV *mv, int allow_hp) { } -void vp9_find_best_ref_mvs(MACROBLOCKD *xd, - int_mv *mvlist, - int_mv *nearest, - int_mv *near) { +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, + int_mv *mvlist, int_mv *nearest, int_mv *near) { int i; // Make sure all the candidates are properly clamped etc for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { - lower_mv_precision(&mvlist[i].as_mv, xd->allow_high_precision_mv); + lower_mv_precision(&mvlist[i].as_mv, allow_hp); clamp_mv2(&mvlist[i].as_mv, xd); } *nearest = mvlist[0]; @@ -37,27 +35,28 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, } void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, int_mv *dst_nearest, int_mv *dst_near, int block_idx, int ref_idx, int mi_row, int mi_col) { int_mv dst_list[MAX_MV_REF_CANDIDATES]; int_mv mv_list[MAX_MV_REF_CANDIDATES]; - MODE_INFO *const mi = xd->this_mi; + MODE_INFO *const mi = xd->mi_8x8[0]; assert(ref_idx == 0 || ref_idx == 1); assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier - vp9_find_mv_refs_idx(cm, xd, mi, xd->last_mi, + vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref_idx], mv_list, block_idx, mi_row, mi_col); dst_list[1].as_int = 0; if (block_idx == 0) { - memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); + vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); } else if (block_idx == 1 || block_idx == 2) { int dst = 0, n; - union b_mode_info *bmi = mi->bmi; + b_mode_info *bmi = mi->bmi; dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int; for (n = 0; dst < MAX_MV_REF_CANDIDATES && @@ -66,7 +65,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, dst_list[dst++].as_int = mv_list[n].as_int; } else { int dst = 0, n; - union b_mode_info *bmi = mi->bmi; + b_mode_info *bmi = mi->bmi; assert(block_idx == 3); dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int; diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h index ad0d882..2362caa 100644 --- a/libvpx/vp9/common/vp9_findnearmv.h +++ b/libvpx/vp9/common/vp9_findnearmv.h @@ -23,10 +23,8 @@ // check a list of motion vectors by sad score using a number rows of pixels // above and a number cols of pixels in the left to select the one with best // score to use as ref motion vector -void vp9_find_best_ref_mvs(MACROBLOCKD *xd, - int_mv *mvlist, - int_mv *nearest, - int_mv *near); +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, + int_mv *mvlist, int_mv *nearest, int_mv *near); // TODO(jingning): this mv clamping function should be block size dependent. 
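[Editor's note] vp9_find_best_ref_mvs() now receives allow_hp explicitly instead of reading xd->allow_high_precision_mv, matching the allow_high_precision_mv field moved into VP9Common later in this change. The body of lower_mv_precision() is not shown in this hunk, so the following is a hedged paraphrase of what lowering precision means, not the verbatim libvpx routine: MVs are stored in 1/8-pel units, and when high precision is disallowed each component is forced to the nearest even (1/4-pel) value, moving toward zero.

#include <stdint.h>

typedef struct { int16_t row, col; } MV;

/* Round one 1/8-pel component down to 1/4-pel precision, toward zero. */
static int16_t lower_component(int16_t v) {
  return (v & 1) ? (int16_t)(v + (v > 0 ? -1 : 1)) : v;
}

static void lower_mv_precision_sketch(MV *mv, int allow_hp) {
  if (!allow_hp) {
    mv->row = lower_component(mv->row);
    mv->col = lower_component(mv->col);
  }
}

Passing allow_hp as a parameter keeps the helper usable from both decoder and encoder paths without a MACROBLOCKD in scope.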
static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { @@ -36,57 +34,39 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, - MACROBLOCKD *xd, +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, int_mv *dst_nearest, int_mv *dst_near, int block_idx, int ref_idx, int mi_row, int mi_col); -static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, - const MODE_INFO *left_mb, int b) { - // FIXME(rbultje, jingning): temporary hack because jenkins doesn't - // understand this condition. This will go away soon. - const MODE_INFO *mi = cur_mb; - +static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b) { if (b == 0 || b == 2) { - /* On L edge, get from MB to left of us */ - mi = left_mb; - if (!mi) + if (!left_mi || is_inter_block(&left_mi->mbmi)) return DC_PRED; - if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { - return DC_PRED; - } else if (mi->mbmi.sb_type < BLOCK_8X8) { - return ((mi->bmi + 1 + b)->as_mode); - } else { - return mi->mbmi.mode; - } + return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode + : left_mi->mbmi.mode; + } else { + assert(b == 1 || b == 3); + return cur_mi->bmi[b - 1].as_mode; } - assert(b == 1 || b == 3); - return (mi->bmi + b - 1)->as_mode; } -static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, - const MODE_INFO *above_mb, int b) { - const MODE_INFO *mi = cur_mb; - - if (!(b >> 1)) { - /* On top edge, get from MB above us */ - mi = above_mb; - if (!mi) +static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b) { + if (b == 0 || b == 1) { + if (!above_mi || is_inter_block(&above_mi->mbmi)) return DC_PRED; - if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { - return DC_PRED; - } else if (mi->mbmi.sb_type < BLOCK_8X8) { - return ((mi->bmi + 2 + b)->as_mode); - } else { - return mi->mbmi.mode; - } + return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode + : above_mi->mbmi.mode; + } else { + assert(b == 2 || b == 3); + return cur_mi->bmi[b - 2].as_mode; } - - return (mi->bmi + b - 2)->as_mode; } #endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c index a224525..ea8683e 100644 --- a/libvpx/vp9/common/vp9_idct.c +++ b/libvpx/vp9/common/vp9_idct.c @@ -18,20 +18,20 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. 
*/ int i; int16_t output[16]; int a1, b1, c1, d1, e1; - int16_t *ip = input; + const int16_t *ip = input; int16_t *op = output; for (i = 0; i < 4; i++) { - a1 = ip[0] >> WHT_UPSCALE_FACTOR; - c1 = ip[1] >> WHT_UPSCALE_FACTOR; - d1 = ip[2] >> WHT_UPSCALE_FACTOR; - b1 = ip[3] >> WHT_UPSCALE_FACTOR; + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; a1 += c1; d1 -= b1; e1 = (a1 - d1) >> 1; @@ -60,24 +60,24 @@ void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { c1 = e1 - c1; a1 -= b1; d1 += c1; - dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); - dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1); - dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1); - dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1); + dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); + dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); + dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); + dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); ip++; dest++; } } -void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { +void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { int i; int a1, e1; int16_t tmp[4]; - int16_t *ip = in; + const int16_t *ip = in; int16_t *op = tmp; - a1 = ip[0] >> WHT_UPSCALE_FACTOR; + a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; a1 -= e1; op[0] = a1; @@ -96,7 +96,7 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { } } -void vp9_idct4_1d_c(int16_t *input, int16_t *output) { +static void idct4_1d(const int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; // stage 1 @@ -116,7 +116,7 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) { output[3] = step[0] - step[3]; } -void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[4 * 4]; int16_t *outptr = out; int i, j; @@ -124,7 +124,7 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { // Rows for (i = 0; i < 4; ++i) { - vp9_idct4_1d(input, outptr); + idct4_1d(input, outptr); input += 4; outptr += 4; } @@ -133,14 +133,14 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - vp9_idct4_1d(temp_in, temp_out); + idct4_1d(temp_in, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); } } -void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { int i; int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); @@ -156,7 +156,7 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { } } -static void idct8_1d(int16_t *input, int16_t *output) { +static void idct8_1d(const int16_t *input, int16_t *output) { int16_t step1[8], step2[8]; int temp1, temp2; // stage 1 @@ -174,7 +174,7 @@ static void idct8_1d(int16_t *input, int16_t *output) { step1[6] = dct_const_round_shift(temp2); // stage 2 & stage 3 - even half - vp9_idct4_1d(step1, step1); + idct4_1d(step1, step1); // stage 2 - odd half step2[4] = step1[4] + step1[5]; @@ 
-201,7 +201,7 @@ static void idct8_1d(int16_t *input, int16_t *output) { output[7] = step1[0] - step1[7]; } -void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[8 * 8]; int16_t *outptr = out; int i, j; @@ -220,12 +220,12 @@ void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) { temp_in[j] = out[j * 8 + i]; idct8_1d(temp_in, temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } } -void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { int i, j; int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); @@ -234,11 +234,11 @@ void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) dest[i] = clip_pixel(dest[i] + a1); - dest += dest_stride; + dest += stride; } } -static void iadst4_1d(int16_t *input, int16_t *output) { +static void iadst4_1d(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[0]; @@ -280,13 +280,13 @@ static void iadst4_1d(int16_t *input, int16_t *output) { output[3] = dct_const_round_shift(s3); } -void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, - int tx_type) { +void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { const transform_2d IHT_4[] = { - { vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0 - { iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1 - { vp9_idct4_1d, iadst4_1d }, // DCT_ADST = 2 - { iadst4_1d, iadst4_1d } // ADST_ADST = 3 + { idct4_1d, idct4_1d }, // DCT_DCT = 0 + { iadst4_1d, idct4_1d }, // ADST_DCT = 1 + { idct4_1d, iadst4_1d }, // DCT_ADST = 2 + { iadst4_1d, iadst4_1d } // ADST_ADST = 3 }; int i, j; @@ -307,11 +307,11 @@ void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, temp_in[j] = out[j * 4 + i]; IHT_4[tx_type].cols(temp_in, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); } } -static void iadst8_1d(int16_t *input, int16_t *output) { +static void iadst8_1d(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[7]; @@ -395,8 +395,8 @@ static const transform_2d IHT_8[] = { { iadst8_1d, iadst8_1d } // ADST_ADST = 3 }; -void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, - int tx_type) { +void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; int16_t out[8 * 8]; int16_t *outptr = out; @@ -416,12 +416,12 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, temp_in[j] = out[j * 8 + i]; ht.cols(temp_in, temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) - + dest[j * dest_stride + i]); } + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); + } } -void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t 
out[8 * 8] = { 0 }; int16_t *outptr = out; int i, j; @@ -441,12 +441,12 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, temp_in[j] = out[j * 8 + i]; idct8_1d(temp_in, temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } } -static void idct16_1d(int16_t *input, int16_t *output) { +static void idct16_1d(const int16_t *input, int16_t *output) { int16_t step1[16], step2[16]; int temp1, temp2; @@ -611,7 +611,7 @@ static void idct16_1d(int16_t *input, int16_t *output) { output[15] = step2[0] - step2[15]; } -void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[16 * 16]; int16_t *outptr = out; int i, j; @@ -630,12 +630,12 @@ void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { temp_in[j] = out[j * 16 + i]; idct16_1d(temp_in, temp_out); for (j = 0; j < 16; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); } } -void iadst16_1d(int16_t *input, int16_t *output) { +static void iadst16_1d(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -813,8 +813,8 @@ static const transform_2d IHT_16[] = { { iadst16_1d, iadst16_1d } // ADST_ADST = 3 }; -void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride, - int tx_type) { +void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; int16_t out[16 * 16]; int16_t *outptr = out; @@ -834,12 +834,11 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride, temp_in[j] = out[j * 16 + i]; ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * dest_stride + i]); } + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); } } -void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[16 * 16] = { 0 }; int16_t *outptr = out; int i, j; @@ -859,13 +858,12 @@ void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, temp_in[j] = out[j*16 + i]; idct16_1d(temp_in, temp_out); for (j = 0; j < 16; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); } } -void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, - int dest_stride) { +void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { int i, j; int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); @@ -874,11 +872,11 @@ void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) dest[i] = clip_pixel(dest[i] + a1); - dest += dest_stride; + dest += stride; } } -static void idct32_1d(int16_t *input, int16_t *output) { +static void idct32_1d(const int16_t *input, int16_t *output) { int16_t step1[32], step2[32]; int temp1, temp2; @@ -1245,7 +1243,7 @@ 
static void idct32_1d(int16_t *input, int16_t *output) { output[31] = step1[0] - step1[31]; } -void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[32 * 32]; int16_t *outptr = out; int i, j; @@ -1253,6 +1251,44 @@ void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) { // Rows for (i = 0; i < 32; ++i) { + int16_t zero_coeff[16]; + for (j = 0; j < 16; ++j) + zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 8; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 4; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 2; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + + if (zero_coeff[0] | zero_coeff[1]) + idct32_1d(input, outptr); + else + vpx_memset(outptr, 0, sizeof(int16_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_1d(temp_in, temp_out); + for (j = 0; j < 32; ++j) + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); + } +} + +void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { + int16_t out[32 * 32] = {0}; + int16_t *outptr = out; + int i, j; + int16_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { idct32_1d(input, outptr); input += 32; outptr += 32; @@ -1264,13 +1300,116 @@ void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) { temp_in[j] = out[j * 32 + i]; idct32_1d(temp_in, temp_out); for (j = 0; j < 32; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * stride + i]); } } -void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) { +void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { + int i, j; + int a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); - output[0] = ROUND_POWER_OF_TWO(out, 6); + a1 = ROUND_POWER_OF_TWO(out, 6); + + for (j = 0; j < 32; ++j) { + for (i = 0; i < 32; ++i) + dest[i] = clip_pixel(dest[i] + a1); + dest += stride; + } +} + +// idct +void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { + if (eob > 1) + vp9_idct4x4_16_add(input, dest, stride); + else + vp9_idct4x4_1_add(input, dest, stride); +} + + +void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { + if (eob > 1) + vp9_iwht4x4_16_add(input, dest, stride); + else + vp9_iwht4x4_1_add(input, dest, stride); +} + +void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { + // If dc is 1, then input[0] is the reconstructed value, do not need + // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. + + // The calculation can be simplified if there are not many non-zero dct + // coefficients. Use eobs to decide what to do. + // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. + // Combine that with code here. 
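[Editor's note] The rows loop added to vp9_idct32x32_1024_add_c above ORs coefficient pairs together in four halving passes, so a row that is entirely zero skips idct32_1d() and is memset instead. A self-contained sketch of that reduction follows; the function name is illustrative.

#include <stdint.h>

/* Detect an all-zero 32-coefficient row with a log2 OR-reduction,
 * mirroring the zero_coeff[] logic in vp9_idct32x32_1024_add_c. */
static int row_is_zero(const int16_t *input) {
  int16_t acc[16];
  int j;
  for (j = 0; j < 16; ++j)              /* 32 -> 16 */
    acc[j] = input[2 * j] | input[2 * j + 1];
  for (j = 0; j < 8; ++j)               /* 16 -> 8 */
    acc[j] = acc[2 * j] | acc[2 * j + 1];
  for (j = 0; j < 4; ++j)               /* 8 -> 4 */
    acc[j] = acc[2 * j] | acc[2 * j + 1];
  for (j = 0; j < 2; ++j)               /* 4 -> 2 */
    acc[j] = acc[2 * j] | acc[2 * j + 1];
  return (acc[0] | acc[1]) == 0;
}

This spends 31 ORs per row to save a full 1-D 32-point transform whenever the row is empty, which is the common case for high-frequency rows of a sparsely coded 32x32 block; the eob-based dispatch below applies the same idea at whole-block granularity.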
+ if (eob) { + if (eob == 1) + // DC only DCT coefficient + vp9_idct8x8_1_add(input, dest, stride); + else if (eob <= 10) + vp9_idct8x8_10_add(input, dest, stride); + else + vp9_idct8x8_64_add(input, dest, stride); + } +} + +void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, + int eob) { + /* The calculation can be simplified if there are not many non-zero dct + * coefficients. Use eobs to separate different cases. */ + if (eob) { + if (eob == 1) + /* DC only DCT coefficient. */ + vp9_idct16x16_1_add(input, dest, stride); + else if (eob <= 10) + vp9_idct16x16_10_add(input, dest, stride); + else + vp9_idct16x16_256_add(input, dest, stride); + } +} + +void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, + int eob) { + if (eob) { + if (eob == 1) + vp9_idct32x32_1_add(input, dest, stride); + else if (eob <= 34) + // non-zero coeff only in upper-left 8x8 + vp9_idct32x32_34_add(input, dest, stride); + else + vp9_idct32x32_1024_add(input, dest, stride); + } +} + +// iht +void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) + vp9_idct4x4_add(input, dest, stride, eob); + else + vp9_iht4x4_16_add(input, dest, stride, tx_type); +} + +void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) { + vp9_idct8x8_add(input, dest, stride, eob); + } else { + if (eob > 0) { + vp9_iht8x8_64_add(input, dest, stride, tx_type); + } + } +} + +void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, + int stride, int eob) { + if (tx_type == DCT_DCT) { + vp9_idct16x16_add(input, dest, stride, eob); + } else { + if (eob > 0) { + vp9_iht16x16_256_add(input, dest, stride, tx_type); + } + } } diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h index 0c47da6..2b3f35f 100644 --- a/libvpx/vp9/common/vp9_idct.h +++ b/libvpx/vp9/common/vp9_idct.h @@ -16,16 +16,18 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" // Constants and Macros used by all idct/dct functions #define DCT_CONST_BITS 14 #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) -#define WHT_UPSCALE_FACTOR 2 +#define UNIT_QUANT_SHIFT 2 +#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) #define pair_set_epi16(a, b) \ - _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16)) + _mm_set_epi16(b, a, b, a, b, a, b, a) #define pair_set_epi32(a, b) \ _mm_set_epi32(b, a, b, a) @@ -79,10 +81,27 @@ static INLINE int dct_const_round_shift(int input) { return rv; } -typedef void (*transform_1d)(int16_t*, int16_t*); +typedef void (*transform_1d)(const int16_t*, int16_t*); typedef struct { transform_1d cols, rows; // vertical and horizontal } transform_2d; +void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob); + +void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob); +void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int + eob); +void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, + int eob); + +void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, + int stride, int eob); +void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, + int stride, int eob); +void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, + int stride, int eob); + + #endif // 
VP9_COMMON_VP9_IDCT_H_ diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index cfb5cd4..218e12e 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" +#include "./vpx_config.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_reconinter.h" @@ -16,12 +16,6 @@ #include "vp9/common/vp9_seg_common.h" -struct loop_filter_info { - const uint8_t *mblim; - const uint8_t *lim; - const uint8_t *hev_thr; -}; - // This structure holds bit masks for all 8x8 blocks in a 64x64 region. // Each 1 bit represents a position in which we want to apply the loop filter. // Left_ entries refer to whether we apply a filter on the border to the @@ -259,8 +253,8 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { if (block_inside_limit < 1) block_inside_limit = 1; - vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH); - vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit), + vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); + vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), SIMD_WIDTH); } } @@ -268,7 +262,7 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { void vp9_loop_filter_init(VP9_COMMON *cm) { loop_filter_info_n *lfi = &cm->lf_info; struct loopfilter *lf = &cm->lf; - int i; + int lvl; // init limits for given sharpness update_sharpness(lfi, lf->sharpness_level); @@ -278,8 +272,8 @@ void vp9_loop_filter_init(VP9_COMMON *cm) { lf_init_lut(lfi); // init hev threshold const vectors - for (i = 0; i < 4; i++) - vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) + vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); } void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { @@ -316,13 +310,13 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { continue; } - intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift); + intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * (1 << n_shift); lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { - const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift) - + (lf->mode_deltas[mode] << n_shift); + const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * (1 << n_shift) + + lf->mode_deltas[mode] * (1 << n_shift); lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); } } @@ -330,16 +324,14 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { static int build_lfi(const loop_filter_info_n *lfi_n, const MB_MODE_INFO *mbmi, - struct loop_filter_info *lfi) { + const loop_filter_thresh **lfi) { const int seg = mbmi->segment_id; const int ref = mbmi->ref_frame[0]; const int mode = lfi_n->mode_lf_lut[mbmi->mode]; const int filter_level = lfi_n->lvl[seg][ref][mode]; if (filter_level > 0) { - lfi->mblim = lfi_n->mblim[filter_level]; - lfi->lim = lfi_n->lim[filter_level]; - lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4]; + *lfi = &lfi_n->lfthr[filter_level]; return 1; } else { return 0; @@ -351,11 +343,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch, unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, - const struct loop_filter_info *lfi) { + const loop_filter_thresh 
**p_lfi) { unsigned int mask; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= 1) { + const loop_filter_thresh *lfi = *p_lfi; + if (mask & 1) { if (mask_16x16 & 1) { vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim, @@ -379,7 +373,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch, vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); s += 8; - lfi++; + p_lfi++; mask_16x16 >>= 1; mask_8x8 >>= 1; mask_4x4 >>= 1; @@ -393,12 +387,14 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, unsigned int mask_4x4, unsigned int mask_4x4_int, int only_4x4_1, - const struct loop_filter_info *lfi) { + const loop_filter_thresh **p_lfi) { unsigned int mask; int count; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= count) { + const loop_filter_thresh *lfi = *p_lfi; + count = 1; if (mask & 1) { if (!only_4x4_1) { @@ -432,7 +428,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, lfi->lim, lfi->hev_thr, 1); } s += 8 * count; - lfi += count; + p_lfi += count; mask_16x16 >>= count; mask_8x8 >>= count; mask_4x4 >>= count; @@ -805,7 +801,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, unsigned int mask_8x8[MI_BLOCK_SIZE] = {0}; unsigned int mask_4x4[MI_BLOCK_SIZE] = {0}; unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; - struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; int r, c; for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { @@ -834,7 +830,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; // Filter level can vary per MI - if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x))) + if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x])) continue; // Build masks based on the transform size of each block @@ -925,7 +921,7 @@ static void filter_block_plane(VP9_COMMON *const cm, struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; - struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; int r, c; int row_shift = 3 - ss_x; int row_mask = 0xff >> (ss_x << 2); @@ -938,8 +934,8 @@ static void filter_block_plane(VP9_COMMON *const cm, // Determine the vertical edges that need filtering for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const MODE_INFO *mi = mi_8x8[c]; - if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x))) - continue; + + build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]); } if (!plane->plane_type) { mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y); diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index 91d40ac..62389ea 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -12,7 +12,7 @@ #define VP9_COMMON_VP9_LOOPFILTER_H_ #include "vpx_ports/mem.h" -#include "vpx_config.h" +#include "./vpx_config.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_seg_common.h" @@ -46,12 +46,13 @@ struct loopfilter { // Need to align this structure so when it is declared and // passed it can be loaded into vector registers. 
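[Editor's note] The per-level threshold tables (mblim, lim, hev_thr) that filter_selectively_vert/horiz used to receive as three separate pointers are consolidated into the loop_filter_thresh struct declared just below, so build_lfi() hands back a single pointer per filter level. A minimal sketch of the lookup; the SIMD_WIDTH and MAX_LOOP_FILTER values are assumptions for the sketch, not quoted from this header.

#include <stdint.h>

#define SIMD_WIDTH 16        /* assumed value, for the sketch only */
#define MAX_LOOP_FILTER 63   /* assumed value, for the sketch only */

typedef struct {
  uint8_t mblim[SIMD_WIDTH];
  uint8_t lim[SIMD_WIDTH];
  uint8_t hev_thr[SIMD_WIDTH];
} lf_thresh;

typedef struct {
  lf_thresh lfthr[MAX_LOOP_FILTER + 1];
} lf_info;

/* Mirrors build_lfi(): returns 1 and a threshold pointer iff filtering
 * is enabled for this filter level. */
static int select_thresh(const lf_info *info, int filter_level,
                         const lf_thresh **out) {
  if (filter_level > 0) {
    *out = &info->lfthr[filter_level];
    return 1;
  }
  return 0;
}

Folding the tables also lets the hev threshold be derived directly from the level (the new init writes lvl >> 4 into each per-level record) instead of being indexed through the old separate 4-entry table.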
typedef struct { - DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, - mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, - lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, - hev_thr[4][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]); +} loop_filter_thresh; + +typedef struct { + loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; diff --git a/libvpx/vp9/common/vp9_loopfilter_filters.c b/libvpx/vp9/common/vp9_loopfilter_filters.c index 88130d8..2c4bf6c 100644 --- a/libvpx/vp9/common/vp9_loopfilter_filters.c +++ b/libvpx/vp9/common/vp9_loopfilter_filters.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" +#include "./vpx_config.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_onyxc_int.h" diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c index bfeeb57..8df8aec 100644 --- a/libvpx/vp9/common/vp9_mvref_common.c +++ b/libvpx/vp9/common/vp9_mvref_common.c @@ -108,7 +108,7 @@ static const int idx_n_column_to_subblock[4][2] = { }; // clamp_mv_ref -#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER, @@ -119,10 +119,9 @@ static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { // This function returns either the appropriate sub block or block's mv // on whether the block_size < 8x8 and we have check_sub_blocks set. -static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, - int check_sub_blocks, int which_mv, +static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv, int search_col, int block_idx) { - return check_sub_blocks && candidate->mbmi.sb_type < BLOCK_8X8 + return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8 ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] .as_mv[which_mv] : candidate->mbmi.mv[which_mv]; @@ -171,17 +170,19 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. -static INLINE int is_inside(const VP9_COMMON *cm, int mi_col, int mi_row, +static INLINE int is_inside(const TileInfo *const tile, + int mi_col, int mi_row, int mi_rows, const MV *mv) { return !(mi_row + mv->row < 0 || - mi_col + mv->col < cm->cur_tile_mi_col_start || - mi_row + mv->row >= cm->mi_rows || - mi_col + mv->col >= cm->cur_tile_mi_col_end); + mi_col + mv->col < tile->mi_col_start || + mi_row + mv->row >= mi_rows || + mi_col + mv->col >= tile->mi_col_end); } // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, @@ -202,8 +203,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // and we also need to keep a mode count. 
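[Editor's note] is_inside() above now takes a TileInfo instead of reaching into VP9_COMMON, matching the removal of the cur_tile_mi_* fields from VP9Common later in this change; candidate scanning therefore depends only on per-tile state plus the frame's row count. A compilable sketch of the predicate, with stand-in struct definitions (the real TileInfo lives in vp9_tile_common.h):

typedef struct {
  int mi_row_start, mi_row_end;
  int mi_col_start, mi_col_end;
} TileInfoSketch;

typedef struct { int row, col; } MVOffset;

static int is_inside_sketch(const TileInfoSketch *tile, int mi_col,
                            int mi_row, int mi_rows, const MVOffset *mv) {
  return !(mi_row + mv->row < 0 ||
           mi_col + mv->col < tile->mi_col_start ||
           mi_row + mv->row >= mi_rows ||
           mi_col + mv->col >= tile->mi_col_end);
}

Note that rows are clamped against the frame (0 .. mi_rows) while columns are clamped against the tile, since VP9 tiles split the frame column-wise.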
for (i = 0; i < 2; ++i) { const MV *const mv_ref = &mv_ref_search[i]; - if (is_inside(cm, mi_col, mi_row, mv_ref)) { - const int check_sub_blocks = block_idx >= 0; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]; const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; @@ -212,13 +212,13 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // Check if the candidate comes from the same reference frame. if (candidate->ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 0, + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block_idx)); different_ref_found = candidate->ref_frame[1] != ref_frame; } else { if (candidate->ref_frame[1] == ref_frame) // Add second motion vector if it has the same ref_frame. - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 1, + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block_idx)); different_ref_found = 1; } @@ -230,7 +230,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // mode counts. for (; i < MVREF_NEIGHBOURS; ++i) { const MV *const mv_ref = &mv_ref_search[i]; - if (is_inside(cm, mi_col, mi_row, mv_ref)) { + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]->mbmi; @@ -260,7 +260,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, if (different_ref_found) { for (i = 0; i < MVREF_NEIGHBOURS; ++i) { const MV *mv_ref = &mv_ref_search[i]; - if (is_inside(cm, mi_col, mi_row, mv_ref)) { + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + mv_ref->row * xd->mode_info_stride]->mbmi; diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h index 39ebdb0..ce4c559 100644 --- a/libvpx/vp9/common/vp9_mvref_common.h +++ b/libvpx/vp9/common/vp9_mvref_common.h @@ -15,6 +15,7 @@ #define VP9_COMMON_VP9_MVREF_COMMON_H_ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, @@ -22,11 +23,12 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col); static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, int mi_row, int mi_col) { - vp9_find_mv_refs_idx(cm, xd, mi, prev_mi, ref_frame, + vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, mv_ref_list, -1, mi_row, mi_col); } diff --git a/libvpx/vp9/common/vp9_onyx.h b/libvpx/vp9/common/vp9_onyx.h index f424e6a..452dd6b 100644 --- a/libvpx/vp9/common/vp9_onyx.h +++ b/libvpx/vp9/common/vp9_onyx.h @@ -13,7 +13,7 @@ #ifdef __cplusplus extern "C" -{ +{ // NOLINT #endif #include "./vpx_config.h" @@ -33,7 +33,6 @@ extern "C" FOURFIVE = 1, THREEFIVE = 2, ONETWO = 3 - } VPX_SCALING; typedef enum { @@ -71,42 +70,48 @@ extern "C" // 3 - lowest quality/fastest decode int width; // width of data passed to the compressor int height; // height of data passed to the compressor - double framerate; // set to passed in framerate - int64_t target_bandwidth; // bandwidth to be used in kilobits per second + double framerate; // set to passed 
in framerate + int64_t target_bandwidth; // bandwidth to be used in kilobits per second - int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0 - int Sharpness; // parameter used for sharpening output: recommendation 0: + int noise_sensitivity; // pre processing blur: recommendation 0 + int Sharpness; // sharpening output: recommendation 0: int cpu_used; unsigned int rc_max_intra_bitrate_pct; // mode -> - // (0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing - // a television signal or feed from a live camera). ( speed setting controls how fast ) - // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to - // encode the output. ( speed setting controls how fast ) - // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding - // speed. The output is compressed at the highest possible quality. This option takes the longest - // amount of time to encode. ( speed setting ignored ) - // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding - // pass. ( speed setting controls how fast ) - // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding - // pass to create the compressed output. ( speed setting controls how fast ) - // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that were generated in the first - // encoding pass to create the compressed output using the highest possible quality, and taking a + // (0)=Realtime/Live Encoding. This mode is optimized for realtime + // encoding (for example, capturing a television signal or feed from + // a live camera). ( speed setting controls how fast ) + // (1)=Good Quality Fast Encoding. The encoder balances quality with the + // amount of time it takes to encode the output. ( speed setting + // controls how fast ) + // (2)=One Pass - Best Quality. The encoder places priority on the + // quality of the output over encoding speed. The output is compressed + // at the highest possible quality. This option takes the longest + // amount of time to encode. ( speed setting ignored ) + // (3)=Two Pass - First Pass. The encoder generates a file of statistics + // for use in the second encoding pass. ( speed setting controls how + // fast ) + // (4)=Two Pass - Second Pass. The encoder uses the statistics that were + // generated in the first encoding pass to create the compressed + // output. ( speed setting controls how fast ) + // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that + // were generated in the first encoding pass to create the compressed + // output using the highest possible quality, and taking a // longer amount of time to encode.. ( speed setting ignored ) - int Mode; // + int Mode; // Key Framing Operations - int auto_key; // automatically detect cut scenes and set the keyframes - int key_freq; // maximum distance to key frame. + int auto_key; // autodetect cut scenes and set the keyframes + int key_freq; // maximum distance to key frame. 
- int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) - int lag_in_frames; // how many frames lag before we start encoding + int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) + int lag_in_frames; // how many frames lag before we start encoding // ---------------------------------------------------------------- // DATARATE CONTROL OPTIONS - int end_usage; // vbr or cbr + int end_usage; // vbr or cbr // buffer targeting aggressiveness int under_shoot_pct; @@ -138,7 +143,7 @@ extern "C" int play_alternate; int alt_freq; - int encode_breakout; // early breakout encode threshold : for video conf recommend 800 + int encode_breakout; // early breakout : for video conf recommend 800 /* Bitfield defining the error resiliency features to enable. * Can provide decodable frames after losses in previous @@ -173,8 +178,8 @@ extern "C" void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf); -// receive a frames worth of data caller can assume that a copy of this frame is made -// and not just a copy of the pointer.. + // receive a frames worth of data. caller can assume that a copy of this + // frame is made and not just a copy of the pointer.. int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); @@ -216,8 +221,6 @@ extern "C" int vp9_set_size_literal(VP9_PTR comp, unsigned int width, unsigned int height); - int vp9_switch_layer(VP9_PTR comp, int layer); - void vp9_set_svc(VP9_PTR comp, int use_svc); int vp9_get_quantizer(VP9_PTR c); diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index 0431e14..a2af57a 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -11,14 +11,15 @@ #ifndef VP9_COMMON_VP9_ONYXC_INT_H_ #define VP9_COMMON_VP9_ONYXC_INT_H_ -#include "vpx_config.h" +#include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_tile_common.h" #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" @@ -40,10 +41,9 @@ typedef struct frame_contexts { vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; - vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] - [PARTITION_TYPES - 1]; + vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1] + vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS - 1]; vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; @@ -58,11 +58,11 @@ typedef struct frame_contexts { typedef struct { unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; - unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; + unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES]; vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES]; unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES] [COEF_BANDS][PREV_COEF_CONTEXTS]; - unsigned int switchable_interp[SWITCHABLE_FILTERS + 1] + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS]; unsigned int 
inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; @@ -91,6 +91,8 @@ typedef struct VP9Common { DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]); #endif + COLOR_SPACE color_space; + int width; int height; int display_width; @@ -116,11 +118,12 @@ typedef struct VP9Common { // Each frame can reference ALLOWED_REFS_PER_FRAME buffers int active_ref_idx[ALLOWED_REFS_PER_FRAME]; struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME]; + struct scale_factors_common active_ref_scale_comm[ALLOWED_REFS_PER_FRAME]; int new_fb_idx; YV12_BUFFER_CONFIG post_proc_buffer; - FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ + FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/ FRAME_TYPE frame_type; int show_frame; @@ -129,6 +132,8 @@ typedef struct VP9Common { // Flag signaling that the frame is encoded using only INTRA modes. int intra_only; + int allow_high_precision_mv; + // Flag signaling that the frame context should be reset to default values. // 0 or 1 implies don't reset, 2 reset just the context specified in the // frame header, 3 reset all contexts. @@ -146,8 +151,6 @@ typedef struct VP9Common { TX_MODE tx_mode; int base_qindex; - int last_kf_gf_q; /* Q used on the last GF or KF */ - int y_dc_delta_q; int uv_dc_delta_q; int uv_ac_delta_q; @@ -172,7 +175,7 @@ typedef struct VP9Common { // Persistent mb segment id map used in prediction. unsigned char *last_frame_seg_map; - INTERPOLATIONFILTERTYPE mcomp_filter_type; + INTERPOLATION_TYPE mcomp_filter_type; loop_filter_info_n lf_info; @@ -183,14 +186,6 @@ typedef struct VP9Common { struct loopfilter lf; struct segmentation seg; - /* Y,U,V */ - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; - ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; - - // partition contexts - PARTITION_CONTEXT *above_seg_context; - PARTITION_CONTEXT left_seg_context[8]; - // Context probabilities for reference frame prediction int allow_comp_inter_inter; MV_REFERENCE_FRAME comp_fixed_ref; @@ -213,10 +208,19 @@ typedef struct VP9Common { int frame_parallel_decoding_mode; int log2_tile_cols, log2_tile_rows; - int cur_tile_mi_col_start, cur_tile_mi_col_end; - int cur_tile_mi_row_start, cur_tile_mi_row_end; } VP9_COMMON; +// ref == 0 => LAST_FRAME +// ref == 1 => GOLDEN_FRAME +// ref == 2 => ALTREF_FRAME +static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) { + return &cm->yv12_fb[cm->active_ref_idx[ref]]; +} + +static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { + return &cm->yv12_fb[cm->new_fb_idx]; +} + static int get_free_fb(VP9_COMMON *cm) { int i; for (i = 0; i < NUM_YV12_BUFFERS; i++) @@ -241,58 +245,38 @@ static int mi_cols_aligned_to_sb(int n_mis) { return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); } -static INLINE void set_skip_context(VP9_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col) { +static INLINE const vp9_prob* get_partition_probs(VP9_COMMON *cm, int ctx) { + return cm->frame_type == KEY_FRAME ? 
vp9_kf_partition_probs[ctx] + : cm->fc.partition_prob[ctx]; +} + +static INLINE void set_skip_context( + MACROBLOCKD *xd, + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE], + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16], + int mi_row, int mi_col) { const int above_idx = mi_col * 2; const int left_idx = (mi_row * 2) & 15; int i; for (i = 0; i < MAX_MB_PLANE; i++) { struct macroblockd_plane *const pd = &xd->plane[i]; - pd->above_context = cm->above_context[i] + (above_idx >> pd->subsampling_x); - pd->left_context = cm->left_context[i] + (left_idx >> pd->subsampling_y); + pd->above_context = above_context[i] + (above_idx >> pd->subsampling_x); + pd->left_context = left_context[i] + (left_idx >> pd->subsampling_y); } } -static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col) { - xd->above_seg_context = cm->above_seg_context + mi_col; - xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); -} - -// return the node index in the prob tree for binary coding -static int check_bsize_coverage(int bs, int mi_rows, int mi_cols, - int mi_row, int mi_col) { - const int r = (mi_row + bs < mi_rows); - const int c = (mi_col + bs < mi_cols); - - if (r && c) - return 0; - - if (c && !r) - return 1; // only allow horizontal/split partition types - - if (r && !c) - return 2; // only allow vertical/split partition types - - return -1; -} - -static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int bh, - int mi_col, int bw) { - xd->mb_to_top_edge = -((mi_row * MI_SIZE) << 3); - xd->mb_to_bottom_edge = ((cm->mi_rows - bh - mi_row) * MI_SIZE) << 3; - xd->mb_to_left_edge = -((mi_col * MI_SIZE) << 3); - xd->mb_to_right_edge = ((cm->mi_cols - bw - mi_col) * MI_SIZE) << 3; +static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, + int mi_col, int bw, + int mi_rows, int mi_cols) { + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8; // Are edges available for intra prediction? xd->up_available = (mi_row != 0); - xd->left_available = (mi_col > cm->cur_tile_mi_col_start); - xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end); -} - -static int get_token_alloc(int mb_rows, int mb_cols) { - return mb_rows * mb_cols * (48 * 16 + 4); + xd->left_available = (mi_col > tile->mi_col_start); } static void set_prev_mi(VP9_COMMON *cm) { @@ -306,4 +290,62 @@ static void set_prev_mi(VP9_COMMON *cm) { cm->prev_mi = use_prev_in_find_mv_refs ? 
cm->prev_mip + cm->mode_info_stride + 1 : NULL; } + +static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { + return cm->frame_type == KEY_FRAME || cm->intra_only; +} + +static INLINE void update_partition_context( + PARTITION_CONTEXT *above_seg_context, + PARTITION_CONTEXT left_seg_context[8], + int mi_row, int mi_col, + BLOCK_SIZE sb_type, + BLOCK_SIZE sb_size) { + PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; + PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); + + const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; + const int bwl = b_width_log2(sb_type); + const int bhl = b_height_log2(sb_type); + const int boffset = b_width_log2(BLOCK_64X64) - bsl; + const char pcval0 = ~(0xe << boffset); + const char pcval1 = ~(0xf << boffset); + const char pcvalue[2] = {pcval0, pcval1}; + + assert(MAX(bwl, bhl) <= bsl); + + // update the partition context at the end notes. set partition bits + // of block sizes larger than the current one to be one, and partition + // bits of smaller block sizes to be zero. + vpx_memset(above_ctx, pcvalue[bwl == bsl], bs); + vpx_memset(left_ctx, pcvalue[bhl == bsl], bs); +} + +static INLINE int partition_plane_context( + const PARTITION_CONTEXT *above_seg_context, + const PARTITION_CONTEXT left_seg_context[8], + int mi_row, int mi_col, + BLOCK_SIZE sb_type) { + const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; + const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); + + int bsl = mi_width_log2(sb_type), bs = 1 << bsl; + int above = 0, left = 0, i; + int boffset = mi_width_log2(BLOCK_64X64) - bsl; + + assert(mi_width_log2(sb_type) == mi_height_log2(sb_type)); + assert(bsl >= 0); + assert(boffset >= 0); + + for (i = 0; i < bs; i++) + above |= (above_ctx[i] & (1 << boffset)); + for (i = 0; i < bs; i++) + left |= (left_ctx[i] & (1 << boffset)); + + above = (above > 0); + left = (left > 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + #endif // VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index 955e676..212a28a 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -8,6 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <math.h> +#include <stdlib.h> +#include <stdio.h> #include "./vpx_config.h" #include "vpx_scale/yv12config.h" @@ -18,11 +21,6 @@ #include "./vp9_rtcd.h" #include "./vpx_scale_rtcd.h" - -#include <math.h> -#include <stdlib.h> -#include <stdio.h> - #define RGB_TO_YUV(t) \ ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \ (0.098*(float)(t & 0xff)) + 16), \ @@ -155,7 +153,6 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr, p_dst = dst_ptr; for (col = 0; col < cols; col++) { - int kernel = 4; int v = p_src[col]; @@ -257,7 +254,7 @@ void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch, void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch, int rows, int cols, int flimit) { int r, c, i; - const short *rv3 = &vp9_rv[63 & rand()]; + const short *rv3 = &vp9_rv[63 & rand()]; // NOLINT for (c = 0; c < cols; c++) { uint8_t *s = &dst[c]; @@ -408,7 +405,6 @@ static void fillrd(struct postproc_state *state, int q, int a) { next = next + j; } - } for (; next < 256; next++) @@ -416,7 +412,7 @@ static void fillrd(struct postproc_state *state, int q, int a) { } for (i = 0; i < 3072; i++) { - state->noise[i] = char_dist[rand() & 0xff]; + state->noise[i] = char_dist[rand() & 0xff]; // NOLINT } for (i = 0; i < 16; i++) { @@ -680,13 +676,14 @@ int vp9_post_proc_frame(struct VP9Common *cm, #if 0 && CONFIG_POSTPROC_VISUALIZER if (flags & VP9D_DEBUG_TXT_FRAME_INFO) { char message[512]; - sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d", - (cm->frame_type == KEY_FRAME), - cm->refresh_golden_frame, - cm->base_qindex, - cm->filter_level, - flags, - cm->mb_cols, cm->mb_rows); + snprintf(message, sizeof(message) -1, + "F%1dG%1dQ%3dF%3dP%d_s%dx%d", + (cm->frame_type == KEY_FRAME), + cm->refresh_golden_frame, + cm->base_qindex, + cm->filter_level, + flags, + cm->mb_cols, cm->mb_rows); vp9_blit_text(message, cm->post_proc_buffer.y_buffer, cm->post_proc_buffer.y_stride); } @@ -707,7 +704,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, for (j = 0; j < mb_cols; j++) { char zz[4]; - sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a'); + snprintf(zz, sizeof(zz) - 1, "%c", mi[mb_index].mbmi.mode + 'a'); vp9_blit_text(zz, y_ptr, post->y_stride); mb_index++; @@ -716,7 +713,6 @@ int vp9_post_proc_frame(struct VP9Common *cm, mb_index++; /* border */ y_ptr += post->y_stride * 16 - post->y_width; - } } @@ -740,9 +736,9 @@ int vp9_post_proc_frame(struct VP9Common *cm, mi[mb_index].mbmi.skip_coeff); if (cm->frame_type == KEY_FRAME) - sprintf(zz, "a"); + snprintf(zz, sizeof(zz) - 1, "a"); else - sprintf(zz, "%c", dc_diff + '0'); + snprintf(zz, sizeof(zz) - 1, "%c", dc_diff + '0'); vp9_blit_text(zz, y_ptr, post->y_stride); mb_index++; @@ -751,7 +747,6 @@ int vp9_post_proc_frame(struct VP9Common *cm, mb_index++; /* border */ y_ptr += post->y_stride * 16 - post->y_width; - } } @@ -894,8 +889,9 @@ int vp9_post_proc_frame(struct VP9Common *cm, constrain_line(lx0, &x1, ly0 + 1, &y1, width, height); vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride); - } else + } else { vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride); + } } mi++; diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c index 81fbf1f..6018e17 100644 --- a/libvpx/vp9/common/vp9_pred_common.c +++ b/libvpx/vp9/common/vp9_pred_common.c @@ -16,25 +16,33 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_treecoder.h" +static INLINE const MB_MODE_INFO *get_above_mbmi(const MODE_INFO *const above) { + return (above != NULL) ? 
&above->mbmi : NULL; +} + +static INLINE const MB_MODE_INFO *get_left_mbmi(const MODE_INFO *const left) { + return (left != NULL) ? &left->mbmi : NULL; +} + // Returns a context number for the given MB prediction signal unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { - const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride]; - const MODE_INFO * const left_mi = xd->mi_8x8[-1]; - const int left_in_image = xd->left_available && left_mi; - const int above_in_image = xd->up_available && above_mi; + const MODE_INFO *const above_mi = get_above_mi(xd); + const MODE_INFO *const left_mi = get_left_mi(xd); + const int above_in_image = above_mi != NULL; + const int left_in_image = left_mi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. // left - const int left_mv_pred = left_in_image ? is_inter_mode(left_mi->mbmi.mode) + const int left_mv_pred = left_in_image ? is_inter_block(&left_mi->mbmi) : 0; const int left_interp = left_in_image && left_mv_pred ? left_mi->mbmi.interp_filter : SWITCHABLE_FILTERS; // above - const int above_mv_pred = above_in_image ? is_inter_mode(above_mi->mbmi.mode) + const int above_mv_pred = above_in_image ? is_inter_block(&above_mi->mbmi) : 0; const int above_interp = above_in_image && above_mv_pred ? above_mi->mbmi.interp_filter @@ -53,14 +61,14 @@ unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { } // Returns a context number for the given MB prediction signal unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) { - const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride]; - const MODE_INFO * const left_mi = xd->mi_8x8[-1]; - const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0; - const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0; - const int left_in_image = xd->left_available && left_mi; - const int above_in_image = xd->up_available && above_mi; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MODE_INFO *const above_mi = get_above_mi(xd); + const MODE_INFO *const left_mi = get_left_mi(xd); + const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); + const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); + const int above_in_image = above_mi != NULL; + const int left_in_image = left_mi != NULL; const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; + const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; // The mode info data structure has a one element border above and to the // left of the entries corresponding to real macroblocks. @@ -81,12 +89,12 @@ unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) { unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride]; - const MODE_INFO * const left_mi = xd->mi_8x8[-1]; - const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0; - const MB_MODE_INFO *const left_mbmi = left_mi ? 
&left_mi->mbmi : 0; - const int left_in_image = xd->left_available && left_mi; - const int above_in_image = xd->up_available && above_mi; + const MODE_INFO *const above_mi = get_above_mi(xd); + const MODE_INFO *const left_mi = get_left_mi(xd); + const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); + const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); + const int above_in_image = above_mi != NULL; + const int left_in_image = left_mi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. @@ -126,14 +134,14 @@ unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO * const above_mi = xd->mi_8x8[-cm->mode_info_stride]; - const MODE_INFO * const left_mi = xd->mi_8x8[-1]; - const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0; - const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0; - const int left_in_image = xd->left_available && left_mi; - const int above_in_image = xd->up_available && above_mi; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MODE_INFO *const above_mi = get_above_mi(xd); + const MODE_INFO *const left_mi = get_left_mi(xd); + const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); + const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); + const int above_in_image = above_mi != NULL; + const int left_in_image = left_mi != NULL; const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; + const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. @@ -206,14 +214,14 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, } unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride]; - const MODE_INFO * const left_mi = xd->mi_8x8[-1]; - const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0; - const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0; - const int left_in_image = xd->left_available && left_mi; - const int above_in_image = xd->up_available && above_mi; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MODE_INFO *const above_mi = get_above_mi(xd); + const MODE_INFO *const left_mi = get_left_mi(xd); + const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); + const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); + const int above_in_image = above_mi != NULL; + const int left_in_image = left_mi != NULL; const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; + const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. @@ -272,14 +280,14 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride]; - const MODE_INFO * const left_mi = xd->mi_8x8[-1]; - const MB_MODE_INFO *const above_mbmi = above_mi ? 
&above_mi->mbmi : 0; - const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0; - const int left_in_image = xd->left_available && left_mi; - const int above_in_image = xd->up_available && above_mi; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MODE_INFO *const above_mi = get_above_mi(xd); + const MODE_INFO *const left_mi = get_left_mi(xd); + const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); + const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); + const int above_in_image = above_mi != NULL; + const int left_in_image = left_mi != NULL; const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; + const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; // Note: // The mode info data structure has a one element border above and to the @@ -361,12 +369,12 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { // left of the entries corresponding to real blocks. // The prediction flags in these dummy entries are initialized to 0. unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) { - const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride]; - const MODE_INFO * const left_mi = xd->mi_8x8[-1]; - const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0; - const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0; - const int left_in_image = xd->left_available && left_mi; - const int above_in_image = xd->up_available && above_mi; + const MODE_INFO *const above_mi = get_above_mi(xd); + const MODE_INFO *const left_mi = get_left_mi(xd); + const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); + const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); + const int above_in_image = above_mi != NULL; + const int left_in_image = left_mi != NULL; const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type]; int above_context = max_tx_size; int left_context = max_tx_size; @@ -389,19 +397,14 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) { } void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) { - xd->this_mi->mbmi.seg_id_predicted = pred_flag; -} - -void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize, - uint8_t pred_flag) { - xd->this_mi->mbmi.skip_coeff = pred_flag; + xd->mi_8x8[0]->mbmi.seg_id_predicted = pred_flag; } int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; const int xmis = MIN(cm->mi_cols - mi_col, bw); const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y, segment_id = INT_MAX; diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h index 47ca8ab..19032bf 100644 --- a/libvpx/vp9/common/vp9_pred_common.h +++ b/libvpx/vp9/common/vp9_pred_common.h @@ -14,17 +14,25 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" +static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) { + return xd->up_available ? xd->mi_8x8[-xd->mode_info_stride] : NULL; +} + +static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) { + return xd->left_available ? 
xd->mi_8x8[-1] : NULL; +} + int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col); - static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { - const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride]; - const MODE_INFO * const left_mi = xd->mi_8x8[-1]; - const int above_sip = above_mi ? above_mi->mbmi.seg_id_predicted : 0; - const int left_sip = left_mi ? left_mi->mbmi.seg_id_predicted : 0; + const MODE_INFO *const above_mi = get_above_mi(xd); + const MODE_INFO *const left_mi = get_left_mi(xd); + const int above_sip = (above_mi != NULL) ? + above_mi->mbmi.seg_id_predicted : 0; + const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0; - return above_sip + (xd->left_available ? left_sip : 0); + return above_sip + left_sip; } static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg, @@ -35,12 +43,13 @@ static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg, void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag); static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) { - const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride]; - const MODE_INFO * const left_mi = xd->mi_8x8[-1]; - const int above_skip_coeff = above_mi ? above_mi->mbmi.skip_coeff : 0; - const int left_skip_coeff = left_mi ? left_mi->mbmi.skip_coeff : 0; + const MODE_INFO *const above_mi = get_above_mi(xd); + const MODE_INFO *const left_mi = get_left_mi(xd); + const int above_skip_coeff = (above_mi != NULL) ? + above_mi->mbmi.skip_coeff : 0; + const int left_skip_coeff = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0; - return above_skip_coeff + (xd->left_available ? left_skip_coeff : 0); + return above_skip_coeff + left_skip_coeff; } static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm, @@ -49,12 +58,9 @@ static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm, } static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) { - return xd->this_mi->mbmi.skip_coeff; + return xd->mi_8x8[0]->mbmi.skip_coeff; } -void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize, - uint8_t pred_flag); - unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd); @@ -69,8 +75,9 @@ unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, const MACROBLOCKD *xd); -static INLINE vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { +static INLINE +vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd); return cm->fc.comp_inter_prob[pred_context]; } @@ -120,14 +127,14 @@ static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd, return get_tx_probs(bsize, context, tx_probs); } -static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context, - TX_SIZE tx_size, struct tx_counts *tx_counts) { - if (bsize >= BLOCK_32X32) - tx_counts->p32x32[context][tx_size]++; - else if (bsize >= BLOCK_16X16) - tx_counts->p16x16[context][tx_size]++; +static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context, + struct tx_counts *tx_counts) { + if (bsize < BLOCK_16X16) + return tx_counts->p8x8[context]; + else if (bsize < BLOCK_32X32) + return tx_counts->p16x16[context]; else - tx_counts->p8x8[context][tx_size]++; + return tx_counts->p32x32[context]; } #endif // 
VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c index bc40854..6dbdb42 100644 --- a/libvpx/vp9/common/vp9_quant_common.c +++ b/libvpx/vp9/common/vp9_quant_common.c @@ -14,69 +14,69 @@ #if 1 static const int16_t dc_qlookup[QINDEX_RANGE] = { - 4, 8, 8, 9, 10, 11, 12, 12, - 13, 14, 15, 16, 17, 18, 19, 19, - 20, 21, 22, 23, 24, 25, 26, 26, - 27, 28, 29, 30, 31, 32, 32, 33, - 34, 35, 36, 37, 38, 38, 39, 40, - 41, 42, 43, 43, 44, 45, 46, 47, - 48, 48, 49, 50, 51, 52, 53, 53, - 54, 55, 56, 57, 57, 58, 59, 60, - 61, 62, 62, 63, 64, 65, 66, 66, - 67, 68, 69, 70, 70, 71, 72, 73, - 74, 74, 75, 76, 77, 78, 78, 79, - 80, 81, 81, 82, 83, 84, 85, 85, - 87, 88, 90, 92, 93, 95, 96, 98, - 99, 101, 102, 104, 105, 107, 108, 110, - 111, 113, 114, 116, 117, 118, 120, 121, - 123, 125, 127, 129, 131, 134, 136, 138, - 140, 142, 144, 146, 148, 150, 152, 154, - 156, 158, 161, 164, 166, 169, 172, 174, - 177, 180, 182, 185, 187, 190, 192, 195, - 199, 202, 205, 208, 211, 214, 217, 220, - 223, 226, 230, 233, 237, 240, 243, 247, - 250, 253, 257, 261, 265, 269, 272, 276, - 280, 284, 288, 292, 296, 300, 304, 309, - 313, 317, 322, 326, 330, 335, 340, 344, - 349, 354, 359, 364, 369, 374, 379, 384, - 389, 395, 400, 406, 411, 417, 423, 429, - 435, 441, 447, 454, 461, 467, 475, 482, - 489, 497, 505, 513, 522, 530, 539, 549, - 559, 569, 579, 590, 602, 614, 626, 640, - 654, 668, 684, 700, 717, 736, 755, 775, - 796, 819, 843, 869, 896, 925, 955, 988, + 4, 8, 8, 9, 10, 11, 12, 12, + 13, 14, 15, 16, 17, 18, 19, 19, + 20, 21, 22, 23, 24, 25, 26, 26, + 27, 28, 29, 30, 31, 32, 32, 33, + 34, 35, 36, 37, 38, 38, 39, 40, + 41, 42, 43, 43, 44, 45, 46, 47, + 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, + 61, 62, 62, 63, 64, 65, 66, 66, + 67, 68, 69, 70, 70, 71, 72, 73, + 74, 74, 75, 76, 77, 78, 78, 79, + 80, 81, 81, 82, 83, 84, 85, 85, + 87, 88, 90, 92, 93, 95, 96, 98, + 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, + 123, 125, 127, 129, 131, 134, 136, 138, + 140, 142, 144, 146, 148, 150, 152, 154, + 156, 158, 161, 164, 166, 169, 172, 174, + 177, 180, 182, 185, 187, 190, 192, 195, + 199, 202, 205, 208, 211, 214, 217, 220, + 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, + 280, 284, 288, 292, 296, 300, 304, 309, + 313, 317, 322, 326, 330, 335, 340, 344, + 349, 354, 359, 364, 369, 374, 379, 384, + 389, 395, 400, 406, 411, 417, 423, 429, + 435, 441, 447, 454, 461, 467, 475, 482, + 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, + 654, 668, 684, 700, 717, 736, 755, 775, + 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336, }; static const int16_t ac_qlookup[QINDEX_RANGE] = { - 4, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 26, 27, 28, 29, 30, - 31, 32, 33, 34, 35, 36, 37, 38, - 39, 40, 41, 42, 43, 44, 45, 46, - 47, 48, 49, 50, 51, 52, 53, 54, - 55, 56, 57, 58, 59, 60, 61, 62, - 63, 64, 65, 66, 67, 68, 69, 70, - 71, 72, 73, 74, 75, 76, 77, 78, - 79, 80, 81, 82, 83, 84, 85, 86, - 87, 88, 89, 90, 91, 92, 93, 94, - 95, 96, 97, 98, 99, 100, 101, 102, - 104, 106, 108, 110, 112, 114, 116, 118, - 120, 122, 124, 126, 128, 130, 132, 134, - 136, 138, 140, 142, 144, 146, 148, 150, - 152, 155, 158, 161, 164, 167, 170, 173, - 176, 179, 182, 185, 188, 191, 194, 197, - 200, 203, 207, 211, 215, 219, 223, 227, - 231, 235, 239, 243, 247, 251, 255, 260, - 265, 270, 275, 280, 285, 290, 295, 300, - 305, 311, 
317, 323, 329, 335, 341, 347, - 353, 359, 366, 373, 380, 387, 394, 401, - 408, 416, 424, 432, 440, 448, 456, 465, - 474, 483, 492, 501, 510, 520, 530, 540, - 550, 560, 571, 582, 593, 604, 615, 627, - 639, 651, 663, 676, 689, 702, 715, 729, - 743, 757, 771, 786, 801, 816, 832, 848, - 864, 881, 898, 915, 933, 951, 969, 988, + 4, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, + 79, 80, 81, 82, 83, 84, 85, 86, + 87, 88, 89, 90, 91, 92, 93, 94, + 95, 96, 97, 98, 99, 100, 101, 102, + 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, + 136, 138, 140, 142, 144, 146, 148, 150, + 152, 155, 158, 161, 164, 167, 170, 173, + 176, 179, 182, 185, 188, 191, 194, 197, + 200, 203, 207, 211, 215, 219, 223, 227, + 231, 235, 239, 243, 247, 251, 255, 260, + 265, 270, 275, 280, 285, 290, 295, 300, + 305, 311, 317, 323, 329, 335, 341, 347, + 353, 359, 366, 373, 380, 387, 394, 401, + 408, 416, 424, 432, 440, 448, 456, 465, + 474, 483, 492, 501, 510, 520, 530, 540, + 550, 560, 571, 582, 593, 604, 615, 627, + 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, + 864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c index dc1d46c..1c96788 100644 --- a/libvpx/vp9/common/vp9_reconinter.c +++ b/libvpx/vp9/common/vp9_reconinter.c @@ -20,37 +20,44 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" - void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATIONFILTERTYPE mcomp_filter_type, + INTERPOLATION_TYPE mcomp_filter_type, VP9_COMMON *cm) { - if (xd->mi_8x8 && xd->this_mi) { - MB_MODE_INFO * mbmi = &xd->this_mi->mbmi; + if (xd->mi_8x8 && xd->mi_8x8[0]) { + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1, - cm->active_ref_scale); + set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME, + mbmi->ref_frame[1] - LAST_FRAME, + cm->active_ref_scale); } else { set_scale_factors(xd, -1, -1, cm->active_ref_scale); } - switch (mcomp_filter_type) { - case EIGHTTAP: - case SWITCHABLE: - xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8; - break; - case EIGHTTAP_SMOOTH: - xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp; - break; - case EIGHTTAP_SHARP: - xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s; - break; - case BILINEAR: - xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters; - break; - } + xd->subpix.filter_x = xd->subpix.filter_y = + vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ? 
+ EIGHTTAP : mcomp_filter_type); + assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); } +static void inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const MV32 *mv, + const struct scale_factors *scale, + int w, int h, int ref, + const struct subpix_fn_table *subpix, + int xs, int ys) { + const int subpel_x = mv->col & SUBPEL_MASK; + const int subpel_y = mv->row & SUBPEL_MASK; + + src += (mv->row >> SUBPEL_BITS) * src_stride + (mv->col >> SUBPEL_BITS); + scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref]( + src, src_stride, dst, dst_stride, + subpix->filter_x[subpel_x], xs, + subpix->filter_y[subpel_y], ys, + w, h); +} + void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *src_mv, @@ -59,18 +66,13 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, const struct subpix_fn_table *subpix, enum mv_precision precision) { const int is_q4 = precision == MV_PRECISION_Q4; - const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row << 1, - is_q4 ? src_mv->col : src_mv->col << 1 }; - const MV32 mv = scale->scale_mv(&mv_q4, scale); - const int subpel_x = mv.col & SUBPEL_MASK; - const int subpel_y = mv.row & SUBPEL_MASK; - - src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); - scale->predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, - subpix->filter_x[subpel_x], scale->x_step_q4, - subpix->filter_y[subpel_y], scale->y_step_q4, - w, h); + const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, + is_q4 ? src_mv->col : src_mv->col * 2 }; + const struct scale_factors_common *sfc = scale->sfc; + const MV32 mv = sfc->scale_mv(&mv_q4, scale); + + inter_predictor(src, src_stride, dst, dst_stride, &mv, scale, + w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4); } static INLINE int round_mv_comp_q4(int value) { @@ -100,16 +102,17 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS; const int spel_bottom = spel_top - SUBPEL_SHIFTS; MV clamped_mv = { - src_mv->row << (1 - ss_y), - src_mv->col << (1 - ss_x) + src_mv->row * (1 << (1 - ss_y)), + src_mv->col * (1 << (1 - ss_x)) }; assert(ss_x <= 1); assert(ss_y <= 1); - clamp_mv(&clamped_mv, (xd->mb_to_left_edge << (1 - ss_x)) - spel_left, - (xd->mb_to_right_edge << (1 - ss_x)) + spel_right, - (xd->mb_to_top_edge << (1 - ss_y)) - spel_top, - (xd->mb_to_bottom_edge << (1 - ss_y)) + spel_bottom); + clamp_mv(&clamped_mv, + xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, + xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, + xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, + xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom); return clamped_mv; } @@ -130,8 +133,8 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, const int bh = plane_block_height(bsize, pd); const int x = 4 * (block & ((1 << bwl) - 1)); const int y = 4 * (block >> bwl); - const MODE_INFO *mi = xd->this_mi; - const int use_second_ref = mi->mbmi.ref_frame[1] > 0; + const MODE_INFO *mi = xd->mi_8x8[0]; + const int is_compound = has_second_ref(&mi->mbmi); int ref; assert(x < bw); @@ -139,14 +142,10 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw); assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh); - for (ref = 0; ref < 1 + use_second_ref; ++ref) { + for (ref = 0; ref < 1 + is_compound; ++ref) { struct scale_factors *const scale = 
&xd->scale_factor[ref]; struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; - - const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y, - pre_buf->stride, scale); - uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the @@ -162,15 +161,32 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize, // scaling case. It needs to be done on the scaled MV, not the pre-scaling // MV. Note however that it performs the subsampling aware scaling so // that the result is always q4. - const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, - pd->subsampling_x, - pd->subsampling_y); - - scale->set_scaled_offsets(scale, arg->y + y, arg->x + x); - vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, - &res_mv, scale, - 4 << pred_w, 4 << pred_h, ref, - &xd->subpix, MV_PRECISION_Q4); + // mv_precision precision is MV_PRECISION_Q4. + const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + + uint8_t *pre; + MV32 scaled_mv; + int xs, ys; + + if (vp9_is_scaled(scale->sfc)) { + pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale); + scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x); + scaled_mv = scale->sfc->scale_mv(&mv_q4, scale); + xs = scale->sfc->x_step_q4; + ys = scale->sfc->y_step_q4; + } else { + pre = pre_buf->buf + (y * pre_buf->stride + x); + scaled_mv.row = mv_q4.row; + scaled_mv.col = mv_q4.col; + xs = ys = 16; + } + + inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + &scaled_mv, scale, + 4 << pred_w, 4 << pred_h, ref, + &xd->subpix, xs, ys); } } @@ -184,36 +200,17 @@ typedef void (*foreach_predicted_block_visitor)(int plane, int block, static INLINE void foreach_predicted_block_in_plane( const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane, foreach_predicted_block_visitor visit, void *arg) { - int i, x, y; - - // block sizes in number of 4x4 blocks log 2 ("*_b") - // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 - // subsampled size of the block const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; - // size of the predictor to use. 
- int pred_w, pred_h; - - if (xd->this_mi->mbmi.sb_type < BLOCK_8X8) { + if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) { + int i = 0, x, y; assert(bsize == BLOCK_8X8); - pred_w = 0; - pred_h = 0; + for (y = 0; y < 1 << bhl; ++y) + for (x = 0; x < 1 << bwl; ++x) + visit(plane, i++, bsize, 0, 0, arg); } else { - pred_w = bwl; - pred_h = bhl; - } - assert(pred_w <= bwl); - assert(pred_h <= bhl); - - // visit each subblock in raster order - i = 0; - for (y = 0; y < 1 << bhl; y += 1 << pred_h) { - for (x = 0; x < 1 << bwl; x += 1 << pred_w) { - visit(plane, i, bsize, pred_w, pred_h, arg); - i += 1 << pred_w; - } - i += (1 << (bwl + pred_h)) - (1 << bwl); + visit(plane, 0, bsize, bwl, bhl, arg); } } @@ -249,15 +246,17 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { const int ref = cm->active_ref_idx[i]; struct scale_factors *const sf = &cm->active_ref_scale[i]; + struct scale_factors_common *const sfc = &cm->active_ref_scale_comm[i]; if (ref >= NUM_YV12_BUFFERS) { vp9_zero(*sf); + vp9_zero(*sfc); } else { YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref]; - vp9_setup_scale_factors_for_frame(sf, + vp9_setup_scale_factors_for_frame(sf, sfc, fb->y_crop_width, fb->y_crop_height, cm->width, cm->height); - if (vp9_is_scaled(sf)) + if (vp9_is_scaled(sfc)) vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y); } } diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index 504b793..2c8a6e4 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -25,7 +25,7 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATIONFILTERTYPE filter, + INTERPOLATION_TYPE filter, VP9_COMMON *cm); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, @@ -38,8 +38,10 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, static int scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *scale) { - const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset; - const int y = scale ? scale->scale_value_y(y_offset, scale) : y_offset; + const int x = scale ? scale->sfc->scale_value_x(x_offset, scale->sfc) : + x_offset; + const int y = scale ? 
scale->sfc->scale_value_y(y_offset, scale->sfc) : + y_offset; return y * stride + x; } diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c index 4a451b9..eb643b0 100644 --- a/libvpx/vp9/common/vp9_reconintra.c +++ b/libvpx/vp9/common/vp9_reconintra.c @@ -13,7 +13,7 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_once.h" -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_onyxc_int.h" @@ -369,7 +369,7 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride, } } -void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, +void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, TX_SIZE tx_size, int mode, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride) { diff --git a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h index e9d0dbf..6e3f55c 100644 --- a/libvpx/vp9/common/vp9_reconintra.h +++ b/libvpx/vp9/common/vp9_reconintra.h @@ -14,8 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" -void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, - TX_SIZE tx_size, int mode, - const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride); +void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, + TX_SIZE tx_size, int mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride); #endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/libvpx/vp9/common/vp9_rtcd.c b/libvpx/vp9/common/vp9_rtcd.c index 72613ae..dc15a84 100644 --- a/libvpx/vp9/common/vp9_rtcd.c +++ b/libvpx/vp9/common/vp9_rtcd.c @@ -7,9 +7,9 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" +#include "./vpx_config.h" #define RTCD_C -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vpx_ports/vpx_once.h" void vpx_scale_rtcd(void); diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh index 042afbb..debec61 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.sh +++ b/libvpx/vp9/common/vp9_rtcd_defs.sh @@ -22,38 +22,23 @@ forward_decls vp9_common_forward_decls # x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. [ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse && - sse2_x86inc=sse2 && ssse3_x86inc=ssse3 + sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2 # this variable is for functions that are 64 bit only. 
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3 - -# -# Dequant -# - -prototype void vp9_idct_add_16x16 "int16_t *input, uint8_t *dest, int stride, int eob" -specialize vp9_idct_add_16x16 - -prototype void vp9_idct_add_8x8 "int16_t *input, uint8_t *dest, int stride, int eob" -specialize vp9_idct_add_8x8 - -prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob" -specialize vp9_idct_add - -prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob" -specialize vp9_idct_add_32x32 +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && + ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2 # # RECON # prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_4x4 +specialize vp9_d207_predictor_4x4 $ssse3_x86inc prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d45_predictor_4x4 $ssse3_x86inc prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_4x4 +specialize vp9_d63_predictor_4x4 $ssse3_x86inc prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_4x4 $ssse3_x86inc @@ -65,7 +50,7 @@ prototype void vp9_d135_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const u specialize vp9_d135_predictor_4x4 prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d153_predictor_4x4 +specialize vp9_d153_predictor_4x4 $ssse3_x86inc prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_v_predictor_4x4 $sse_x86inc @@ -86,13 +71,13 @@ prototype void vp9_dc_128_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_dc_128_predictor_4x4 prototype void vp9_d207_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_8x8 +specialize vp9_d207_predictor_8x8 $ssse3_x86inc prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d45_predictor_8x8 $ssse3_x86inc prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_8x8 +specialize vp9_d63_predictor_8x8 $ssse3_x86inc prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_8x8 $ssse3_x86inc @@ -104,7 +89,7 @@ prototype void vp9_d135_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const u specialize vp9_d135_predictor_8x8 prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d153_predictor_8x8 +specialize vp9_d153_predictor_8x8 $ssse3_x86inc prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_v_predictor_8x8 $sse_x86inc @@ -125,13 +110,13 @@ prototype void vp9_dc_128_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_dc_128_predictor_8x8 prototype void vp9_d207_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_16x16 +specialize 
vp9_d207_predictor_16x16 $ssse3_x86inc prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d45_predictor_16x16 $ssse3_x86inc prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_16x16 +specialize vp9_d63_predictor_16x16 $ssse3_x86inc prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_h_predictor_16x16 $ssse3_x86inc @@ -143,7 +128,7 @@ prototype void vp9_d135_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const specialize vp9_d135_predictor_16x16 prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d153_predictor_16x16 +specialize vp9_d153_predictor_16x16 $ssse3_x86inc prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_v_predictor_16x16 $sse2_x86inc @@ -164,16 +149,16 @@ prototype void vp9_dc_128_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, con specialize vp9_dc_128_predictor_16x16 prototype void vp9_d207_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d207_predictor_32x32 +specialize vp9_d207_predictor_32x32 $ssse3_x86inc prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d45_predictor_32x32 $ssse3_x86inc prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_d63_predictor_32x32 +specialize vp9_d63_predictor_32x32 $ssse3_x86inc prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_h_predictor_32x32 $ssse3 x86inc +specialize vp9_h_predictor_32x32 $ssse3_x86inc prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_32x32 @@ -202,17 +187,6 @@ specialize vp9_dc_left_predictor_32x32 prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_128_predictor_32x32 -if [ "$CONFIG_VP9_DECODER" = "yes" ]; then -prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_8x8 sse2 neon - -prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_16x16 sse2 neon - -prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_32x32 sse2 neon -fi - # # Loopfilter # @@ -226,7 +200,7 @@ prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8 specialize vp9_loop_filter_vertical_edge mmx neon prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mb_lpf_horizontal_edge_w sse2 neon +specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mbloop_filter_horizontal_edge sse2 neon @@ -268,80 +242,81 @@ specialize vp9_blend_b # Sub 
Pixel Filters # prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_copy $sse2_x86inc neon +specialize vp9_convolve_copy $sse2_x86inc neon dspr2 prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_avg $sse2_x86inc neon +specialize vp9_convolve_avg $sse2_x86inc neon dspr2 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8 ssse3 neon +specialize vp9_convolve8 sse2 ssse3 neon dspr2 prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_horiz ssse3 neon +specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2 prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_vert ssse3 neon +specialize vp9_convolve8_vert sse2 ssse3 neon dspr2 prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg ssse3 neon +specialize vp9_convolve8_avg sse2 ssse3 neon dspr2 prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_horiz ssse3 neon +specialize vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2 prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_vert ssse3 neon +specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 # # dct # -prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct4x4_1_add sse2 neon +prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct4x4_1_add sse2 neon dspr2 -prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct4x4_add sse2 neon +prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct4x4_16_add sse2 neon dspr2 -prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct8x8_1_add sse2 neon +prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct8x8_1_add sse2 neon dspr2 -prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct8x8_add sse2 neon +prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct8x8_64_add sse2 neon dspr2 -prototype void 
vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_8x8_add sse2 neon +prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct8x8_10_add sse2 neon dspr2 -prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct16x16_1_add sse2 neon +prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct16x16_1_add sse2 neon dspr2 -prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct16x16_add sse2 neon +prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct16x16_256_add sse2 neon dspr2 -prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_16x16_add sse2 neon +prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct16x16_10_add sse2 neon dspr2 -prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct32x32_add sse2 neon +prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct32x32_1024_add sse2 neon dspr2 -prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" -specialize vp9_short_idct1_32x32 +prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct32x32_34_add sse2 -prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht4x4_add sse2 neon +prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_idct32x32_1_add sse2 dspr2 -prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht8x8_add sse2 neon +prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" +specialize vp9_iht4x4_16_add sse2 neon dspr2 -prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type" -specialize vp9_short_iht16x16_add sse2 +prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type" +specialize vp9_iht8x8_64_add sse2 neon dspr2 + +prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type" +specialize vp9_iht16x16_256_add sse2 dspr2 -prototype void vp9_idct4_1d "int16_t *input, int16_t *output" -specialize vp9_idct4_1d sse2 # dct and add -prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_iwalsh4x4_1_add +prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_iwht4x4_1_add -prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_iwalsh4x4_add +prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_iwht4x4_16_add # # Encoder functions below this point. 
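Note on the renames in the dct section above: the new inverse-transform entry points encode in their numeric suffix how many coefficients each variant processes. Reading the names, vp9_idct16x16_1_add handles a DC-only block, vp9_idct16x16_10_add a block whose nonzero coefficients all sit in the low-frequency corner, and vp9_idct16x16_256_add the full 16x16 block; the _34/_1024 pair plays the same roles for 32x32. Below is a minimal sketch of how a caller might pick a variant from the end-of-block (eob) count. The wrapper and its thresholds are assumptions inferred from the new names, not code taken from this patch; the declarations mirror the prototypes registered above.

    #include <stdint.h>

    /* Hypothetical dispatch helper; thresholds are assumptions read off
     * the new function names. Declarations match the rtcd prototypes. */
    extern void vp9_idct16x16_1_add(const int16_t *input, uint8_t *dest,
                                    int dest_stride);
    extern void vp9_idct16x16_10_add(const int16_t *input, uint8_t *dest,
                                     int dest_stride);
    extern void vp9_idct16x16_256_add(const int16_t *input, uint8_t *dest,
                                      int dest_stride);

    static void idct16x16_add(const int16_t *input, uint8_t *dest,
                              int dest_stride, int eob) {
      if (eob == 1)          /* only the DC coefficient is nonzero */
        vp9_idct16x16_1_add(input, dest, dest_stride);
      else if (eob <= 10)    /* nonzero coeffs confined to the low band */
        vp9_idct16x16_10_add(input, dest, dest_stride);
      else                   /* general case: all 256 coefficients */
        vp9_idct16x16_256_add(input, dest, dest_stride);
    }

Dispatching on eob this way lets the common sparse-coefficient cases skip most of the transform work, which is presumably why the count was promoted into the function names in place of the old short_idct/idct10 naming.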
@@ -697,10 +672,10 @@ specialize vp9_block_error $sse2_x86inc prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" specialize vp9_subtract_block $sse2_x86inc -prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" specialize vp9_quantize_b $ssse3_x86_64 -prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" specialize vp9_quantize_b_32x32 $ssse3_x86_64 # @@ -715,38 +690,32 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then fi # fdct functions -prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht4x4 sse2 -prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht8x8 sse2 -prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type" specialize vp9_short_fht16x16 sse2 -prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct8x8 sse2 - -prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct4x4 sse2 - -prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct8x4 sse2 +prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fwht4x4 -prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct32x32 sse2 +prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct4x4 sse2 -prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct32x32_rd sse2 +prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct8x8 sse2 -prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, 
int pitch" -specialize vp9_short_fdct16x16 sse2 +prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct16x16 sse2 -prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_walsh4x4 +prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct32x32 sse2 -prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_walsh8x4 +prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride" +specialize vp9_fdct32x32_rd sse2 # # Motion search diff --git a/libvpx/vp9/common/vp9_scale.c b/libvpx/vp9/common/vp9_scale.c index 989206c..3f0994f 100644 --- a/libvpx/vp9/common/vp9_scale.c +++ b/libvpx/vp9/common/vp9_scale.c @@ -12,23 +12,23 @@ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_scale.h" -static INLINE int scaled_x(int val, const struct scale_factors *scale) { - return val * scale->x_scale_fp >> REF_SCALE_SHIFT; +static INLINE int scaled_x(int val, const struct scale_factors_common *sfc) { + return val * sfc->x_scale_fp >> REF_SCALE_SHIFT; } -static INLINE int scaled_y(int val, const struct scale_factors *scale) { - return val * scale->y_scale_fp >> REF_SCALE_SHIFT; +static INLINE int scaled_y(int val, const struct scale_factors_common *sfc) { + return val * sfc->y_scale_fp >> REF_SCALE_SHIFT; } -static int unscaled_value(int val, const struct scale_factors *scale) { - (void) scale; +static int unscaled_value(int val, const struct scale_factors_common *sfc) { + (void) sfc; return val; } static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) { const MV32 res = { - scaled_y(mv->row, scale) + scale->y_offset_q4, - scaled_x(mv->col, scale) + scale->x_offset_q4 + scaled_y(mv->row, scale->sfc) + scale->y_offset_q4, + scaled_x(mv->col, scale->sfc) + scale->x_offset_q4 }; return res; } @@ -43,8 +43,8 @@ static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) { static void set_offsets_with_scaling(struct scale_factors *scale, int row, int col) { - scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale) & SUBPEL_MASK; - scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale) & SUBPEL_MASK; + scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK; + scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK; } static void set_offsets_without_scaling(struct scale_factors *scale, @@ -70,31 +70,30 @@ static int check_scale_factors(int other_w, int other_h, } void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + struct scale_factors_common *scale_comm, int other_w, int other_h, int this_w, int this_h) { if (!check_scale_factors(other_w, other_h, this_w, this_h)) { - scale->x_scale_fp = REF_INVALID_SCALE; - scale->y_scale_fp = REF_INVALID_SCALE; + scale_comm->x_scale_fp = REF_INVALID_SCALE; + scale_comm->y_scale_fp = REF_INVALID_SCALE; return; } - scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); - scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); - scale->x_step_q4 = scaled_x(16, scale); - scale->y_step_q4 = scaled_y(16, scale); - scale->x_offset_q4 = 0; // calculated per block - scale->y_offset_q4 = 0; // calculated per block + scale_comm->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + scale_comm->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + scale_comm->x_step_q4 = scaled_x(16, scale_comm); + scale_comm->y_step_q4 = scaled_y(16, 
scale_comm); - if (vp9_is_scaled(scale)) { - scale->scale_value_x = scaled_x; - scale->scale_value_y = scaled_y; - scale->set_scaled_offsets = set_offsets_with_scaling; - scale->scale_mv = scaled_mv; + if (vp9_is_scaled(scale_comm)) { + scale_comm->scale_value_x = scaled_x; + scale_comm->scale_value_y = scaled_y; + scale_comm->set_scaled_offsets = set_offsets_with_scaling; + scale_comm->scale_mv = scaled_mv; } else { - scale->scale_value_x = unscaled_value; - scale->scale_value_y = unscaled_value; - scale->set_scaled_offsets = set_offsets_without_scaling; - scale->scale_mv = unscaled_mv; + scale_comm->scale_value_x = unscaled_value; + scale_comm->scale_value_y = unscaled_value; + scale_comm->set_scaled_offsets = set_offsets_without_scaling; + scale_comm->scale_mv = unscaled_mv; } // TODO(agrange): Investigate the best choice of functions to use here @@ -103,44 +102,48 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, // applied in one direction only, and not at all for 0,0, seems to give the // best quality, but it may be worth trying an additional mode that does // do the filtering on full-pel. - if (scale->x_step_q4 == 16) { - if (scale->y_step_q4 == 16) { + if (scale_comm->x_step_q4 == 16) { + if (scale_comm->y_step_q4 == 16) { // No scaling in either direction. - scale->predict[0][0][0] = vp9_convolve_copy; - scale->predict[0][0][1] = vp9_convolve_avg; - scale->predict[0][1][0] = vp9_convolve8_vert; - scale->predict[0][1][1] = vp9_convolve8_avg_vert; - scale->predict[1][0][0] = vp9_convolve8_horiz; - scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + scale_comm->predict[0][0][0] = vp9_convolve_copy; + scale_comm->predict[0][0][1] = vp9_convolve_avg; + scale_comm->predict[0][1][0] = vp9_convolve8_vert; + scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert; + scale_comm->predict[1][0][0] = vp9_convolve8_horiz; + scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz; } else { // No scaling in x direction. Must always scale in the y direction. - scale->predict[0][0][0] = vp9_convolve8_vert; - scale->predict[0][0][1] = vp9_convolve8_avg_vert; - scale->predict[0][1][0] = vp9_convolve8_vert; - scale->predict[0][1][1] = vp9_convolve8_avg_vert; - scale->predict[1][0][0] = vp9_convolve8; - scale->predict[1][0][1] = vp9_convolve8_avg; + scale_comm->predict[0][0][0] = vp9_convolve8_vert; + scale_comm->predict[0][0][1] = vp9_convolve8_avg_vert; + scale_comm->predict[0][1][0] = vp9_convolve8_vert; + scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert; + scale_comm->predict[1][0][0] = vp9_convolve8; + scale_comm->predict[1][0][1] = vp9_convolve8_avg; } } else { - if (scale->y_step_q4 == 16) { + if (scale_comm->y_step_q4 == 16) { // No scaling in the y direction. Must always scale in the x direction. - scale->predict[0][0][0] = vp9_convolve8_horiz; - scale->predict[0][0][1] = vp9_convolve8_avg_horiz; - scale->predict[0][1][0] = vp9_convolve8; - scale->predict[0][1][1] = vp9_convolve8_avg; - scale->predict[1][0][0] = vp9_convolve8_horiz; - scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + scale_comm->predict[0][0][0] = vp9_convolve8_horiz; + scale_comm->predict[0][0][1] = vp9_convolve8_avg_horiz; + scale_comm->predict[0][1][0] = vp9_convolve8; + scale_comm->predict[0][1][1] = vp9_convolve8_avg; + scale_comm->predict[1][0][0] = vp9_convolve8_horiz; + scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz; } else { // Must always scale in both directions. 
- scale->predict[0][0][0] = vp9_convolve8; - scale->predict[0][0][1] = vp9_convolve8_avg; - scale->predict[0][1][0] = vp9_convolve8; - scale->predict[0][1][1] = vp9_convolve8_avg; - scale->predict[1][0][0] = vp9_convolve8; - scale->predict[1][0][1] = vp9_convolve8_avg; + scale_comm->predict[0][0][0] = vp9_convolve8; + scale_comm->predict[0][0][1] = vp9_convolve8_avg; + scale_comm->predict[0][1][0] = vp9_convolve8; + scale_comm->predict[0][1][1] = vp9_convolve8_avg; + scale_comm->predict[1][0][0] = vp9_convolve8; + scale_comm->predict[1][0][1] = vp9_convolve8_avg; } } // 2D subpel motion always gets filtered in both directions - scale->predict[1][1][0] = vp9_convolve8; - scale->predict[1][1][1] = vp9_convolve8_avg; + scale_comm->predict[1][1][0] = vp9_convolve8; + scale_comm->predict[1][1][1] = vp9_convolve8_avg; + + scale->sfc = scale_comm; + scale->x_offset_q4 = 0; // calculated per block + scale->y_offset_q4 = 0; // calculated per block } diff --git a/libvpx/vp9/common/vp9_scale.h b/libvpx/vp9/common/vp9_scale.h index 7a720d0..1437fcd 100644 --- a/libvpx/vp9/common/vp9_scale.h +++ b/libvpx/vp9/common/vp9_scale.h @@ -18,34 +18,40 @@ #define REF_NO_SCALE (1 << REF_SCALE_SHIFT) #define REF_INVALID_SCALE -1 -struct scale_factors { +struct scale_factors; +struct scale_factors_common { int x_scale_fp; // horizontal fixed point scale factor int y_scale_fp; // vertical fixed point scale factor - int x_offset_q4; int x_step_q4; - int y_offset_q4; int y_step_q4; - int (*scale_value_x)(int val, const struct scale_factors *scale); - int (*scale_value_y)(int val, const struct scale_factors *scale); + int (*scale_value_x)(int val, const struct scale_factors_common *sfc); + int (*scale_value_y)(int val, const struct scale_factors_common *sfc); void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale); convolve_fn_t predict[2][2][2]; // horiz, vert, avg }; +struct scale_factors { + int x_offset_q4; + int y_offset_q4; + const struct scale_factors_common *sfc; +}; + void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + struct scale_factors_common *scale_comm, int other_w, int other_h, int this_w, int this_h); -static int vp9_is_valid_scale(const struct scale_factors *sf) { - return sf->x_scale_fp != REF_INVALID_SCALE && - sf->y_scale_fp != REF_INVALID_SCALE; +static int vp9_is_valid_scale(const struct scale_factors_common *sfc) { + return sfc->x_scale_fp != REF_INVALID_SCALE && + sfc->y_scale_fp != REF_INVALID_SCALE; } -static int vp9_is_scaled(const struct scale_factors *sf) { - return sf->x_scale_fp != REF_NO_SCALE || - sf->y_scale_fp != REF_NO_SCALE; +static int vp9_is_scaled(const struct scale_factors_common *sfc) { + return sfc->x_scale_fp != REF_NO_SCALE || + sfc->y_scale_fp != REF_NO_SCALE; } -#endif // VP9_COMMON_VP9_SCALE_H_ +#endif // VP9_COMMON_VP9_SCALE_H_ diff --git a/libvpx/vp9/common/vp9_scan.c b/libvpx/vp9/common/vp9_scan.c new file mode 100644 index 0000000..f17da91 --- /dev/null +++ b/libvpx/vp9/common/vp9_scan.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "vp9/common/vp9_scan.h" + +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = { + 0, 4, 1, 5, + 8, 2, 12, 9, + 3, 6, 13, 10, + 7, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = { + 0, 4, 8, 1, + 12, 5, 9, 2, + 13, 6, 10, 3, + 7, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = { + 0, 1, 4, 2, + 5, 3, 6, 8, + 9, 7, 12, 10, + 13, 11, 14, 15, +}; + +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = { + 0, 8, 1, 16, 9, 2, 17, 24, + 10, 3, 18, 25, 32, 11, 4, 26, + 33, 19, 40, 12, 34, 27, 5, 41, + 20, 48, 13, 35, 42, 28, 21, 6, + 49, 56, 36, 43, 29, 7, 14, 50, + 57, 44, 22, 37, 15, 51, 58, 30, + 45, 23, 52, 59, 38, 31, 60, 53, + 46, 39, 61, 54, 47, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = { + 0, 8, 16, 1, 24, 9, 32, 17, + 2, 40, 25, 10, 33, 18, 48, 3, + 26, 41, 11, 56, 19, 34, 4, 49, + 27, 42, 12, 35, 20, 57, 50, 28, + 5, 43, 13, 36, 58, 51, 21, 44, + 6, 29, 59, 37, 14, 52, 22, 7, + 45, 60, 30, 15, 38, 53, 23, 46, + 31, 61, 39, 54, 47, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = { + 0, 1, 2, 8, 9, 3, 16, 10, + 4, 17, 11, 24, 5, 18, 25, 12, + 19, 26, 32, 6, 13, 20, 33, 27, + 7, 34, 40, 21, 28, 41, 14, 35, + 48, 42, 29, 36, 49, 22, 43, 15, + 56, 37, 50, 44, 30, 57, 23, 51, + 58, 45, 38, 52, 31, 59, 53, 46, + 60, 39, 61, 47, 54, 55, 62, 63, +}; + +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, + 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, + 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, + 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146, + 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25, + 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119, + 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194, + 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59, + 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13, + 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, + 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, + 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, + 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, + 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, + 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, + 251, + 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, + 255, +}; + +DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = { + 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, + 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, + 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, + 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85, + 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179, + 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24, + 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227, + 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167, + 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229, + 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, + 200, 138, 185, 
246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, + 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, + 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, + 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, + 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, + 236, + 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, + 255, +}; + +DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = { + 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20, + 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, + 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, + 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100, + 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102, + 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160, + 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176, + 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136, + 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166, + 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, + 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, + 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, + 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, + 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, + 158, + 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, + 175, + 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, + 255, +}; + +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, + 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, + 68, 131, 37, 100, + 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, + 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, + 102, 352, 8, 197, + 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, + 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, + 41, 417, 199, 136, + 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105, + 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169, + 295, 420, 106, 451, + 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, + 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, + 453, 139, 44, 234, + 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108, + 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577, + 486, 77, 204, 362, + 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, + 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, + 111, 238, 48, 143, + 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51, + 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424, + 393, 300, 269, 176, 145, + 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301, + 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581, + 550, 519, 488, 457, 426, 395, + 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737, + 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241, + 210, 179, 117, 86, 55, 738, 707, + 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491, + 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676, + 645, 552, 521, 428, 397, 304, + 273, 
180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553, + 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26, + 864, 833, 802, 771, 740, 709, + 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, + 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741, + 710, 679, 617, 586, 555, 493, + 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, + 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, + 743, 619, 495, 371, 247, 123, + 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680, + 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929, + 898, 836, 805, 774, 712, 681, + 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, + 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, + 651, 620, 589, 558, 527, + 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, + 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590, + 559, 497, 466, 435, 373, + 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, + 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, + 499, 375, 251, 127, + 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560, + 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716, + 685, 654, 592, 561, + 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, + 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, + 438, 407, 376, 345, + 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718, + 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998, + 967, 874, 843, 750, + 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, + 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, + 564, 533, 440, 409, + 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534, + 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783, + 752, 721, 690, 659, + 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, + 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, + 350, 319, 1002, 971, + 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631, + 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568, + 537, 444, 413, 972, + 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414, + 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, + 570, 539, 508, 477, + 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571, + 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479, + 1007, 883, 759, 635, 511, + 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945, + 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915, + 884, 853, 822, 791, + 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, + 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607, + 1011, 887, 763, 639, + 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825, + 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, + 702, 671, 1013, 982, + 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, + 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, + 1016, 985, 954, 923, + 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863, + 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, + 990, 959, 1022, 991, 1023, +}; + +// Neighborhood 5-tuples for various scans and blocksizes, +// in 
{top, left, topleft, topright, bottomleft} order +// for each position in raster scan order. +// -1 indicates the neighbor does not exist. +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); + + +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); +DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); +DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); +DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); +DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]); +DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]); +DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]); +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); + +static int find_in_scan(const int16_t *scan, int l, int idx) { + int n, l2 = l * l; + for (n = 0; n < l2; n++) { + int rc = scan[n]; + if (rc == idx) + return n; + } + assert(0); + return -1; +} +static void init_scan_neighbors(const int16_t *scan, + int16_t *iscan, + int l, int16_t *neighbors) { + int l2 = l * l; + int n, i, j; + + // dc doesn't use this type of prediction + neighbors[MAX_NEIGHBORS * 0 + 0] = 0; + neighbors[MAX_NEIGHBORS * 0 + 1] = 0; + iscan[0] = find_in_scan(scan, l, 0); + for (n = 1; n < l2; n++) { + int rc = scan[n]; + iscan[n] = find_in_scan(scan, l, n); + i = rc / l; + j = rc % l; + if (i > 0 && j > 0) { + // col/row scan is used for adst/dct, and generally means that + // energy decreases to zero much faster in the dimension in + // which ADST is used compared to the direction in which DCT + // is used. Likewise, we find much higher correlation between + // coefficients within the direction in which DCT is used. + // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff + // as a context. If ADST or DCT is used in both directions, we + // use the combination of the two as a context. 
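+      // Worked example for this branch: in vp9_default_scan_4x4, scan
+      // position n = 7 holds raster index rc = 9 (i = 2, j = 1), so the
+      // stored contexts are a = 5 (the coefficient above rc) and b = 8
+      // (the coefficient to its left); both occur earlier in scan order
+      // (positions 3 and 4), as the assert below verifies for a.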
+ int a = (i - 1) * l + j; + int b = i * l + j - 1; + if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 || + scan == vp9_col_scan_16x16) { + // in the col/row scan cases (as well as left/top edge cases), we set + // both contexts to the same value, so we can branchlessly do a+b+1>>1 + // which automatically becomes a if a == b + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = a; + } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 || + scan == vp9_row_scan_16x16) { + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = b; + } else { + neighbors[MAX_NEIGHBORS * n + 0] = a; + neighbors[MAX_NEIGHBORS * n + 1] = b; + } + } else if (i > 0) { + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j; + } else { + assert(j > 0); + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1; + } + assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n); + } + // one padding item so we don't have to add branches in code to handle + // calls to get_coef_context() for the token after the final dc token + neighbors[MAX_NEIGHBORS * l2 + 0] = 0; + neighbors[MAX_NEIGHBORS * l2 + 1] = 0; +} + +void vp9_init_neighbors() { + init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4, + vp9_default_scan_4x4_neighbors); + init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4, + vp9_row_scan_4x4_neighbors); + init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4, + vp9_col_scan_4x4_neighbors); + init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8, + vp9_default_scan_8x8_neighbors); + init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8, + vp9_row_scan_8x8_neighbors); + init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8, + vp9_col_scan_8x8_neighbors); + init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16, + vp9_default_scan_16x16_neighbors); + init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16, + vp9_row_scan_16x16_neighbors); + init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16, + vp9_col_scan_16x16_neighbors); + init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32, + vp9_default_scan_32x32_neighbors); +} diff --git a/libvpx/vp9/common/vp9_scan.h b/libvpx/vp9/common/vp9_scan.h new file mode 100644 index 0000000..14a1a7e --- /dev/null +++ b/libvpx/vp9/common/vp9_scan.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VP9_COMMON_VP9_SCAN_H_
+#define VP9_COMMON_VP9_SCAN_H_
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_enums.h"
+
+#define MAX_NEIGHBORS 2
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+                       vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+
+void vp9_init_neighbors();
+
+static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_4x4;
+    case DCT_ADST:
+      return vp9_col_scan_4x4;
+    default:
+      return vp9_default_scan_4x4;
+  }
+}
+
+static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
+                                   const int16_t **scan, const int16_t **nb) {
+  switch (tx_type) {
+    case ADST_DCT:
+      *scan = vp9_row_scan_4x4;
+      *nb = vp9_row_scan_4x4_neighbors;
+      break;
+    case DCT_ADST:
+      *scan = vp9_col_scan_4x4;
+      *nb = vp9_col_scan_4x4_neighbors;
+      break;
+    default:
+      *scan = vp9_default_scan_4x4;
+      *nb = vp9_default_scan_4x4_neighbors;
+      break;
+  }
+}
+
+static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_iscan_4x4;
+    case DCT_ADST:
+      return vp9_col_iscan_4x4;
+    default:
+      return vp9_default_iscan_4x4;
+  }
+}
+
+static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_8x8;
+    case DCT_ADST:
+      return vp9_col_scan_8x8;
+    default:
+      return vp9_default_scan_8x8;
+  }
+}
+
+static INLINE void get_scan_nb_8x8(TX_TYPE tx_type, + const int16_t **scan, const int16_t **nb) { + switch (tx_type) { + case ADST_DCT: + *scan = vp9_row_scan_8x8; + *nb = vp9_row_scan_8x8_neighbors; + break; + case DCT_ADST: + *scan = vp9_col_scan_8x8; + *nb = vp9_col_scan_8x8_neighbors; + break; + default: + *scan = vp9_default_scan_8x8; + *nb = vp9_default_scan_8x8_neighbors; + break; + } +} + +static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_iscan_8x8; + case DCT_ADST: + return vp9_col_iscan_8x8; + default: + return vp9_default_iscan_8x8; + } +} + +static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_scan_16x16; + case DCT_ADST: + return vp9_col_scan_16x16; + default: + return vp9_default_scan_16x16; + } +} + +static INLINE void get_scan_nb_16x16(TX_TYPE tx_type, + const int16_t **scan, const int16_t **nb) { + switch (tx_type) { + case ADST_DCT: + *scan = vp9_row_scan_16x16; + *nb = vp9_row_scan_16x16_neighbors; + break; + case DCT_ADST: + *scan = vp9_col_scan_16x16; + *nb = vp9_col_scan_16x16_neighbors; + break; + default: + *scan = vp9_default_scan_16x16; + *nb = vp9_default_scan_16x16_neighbors; + break; + } +} + +static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_iscan_16x16; + case DCT_ADST: + return vp9_col_iscan_16x16; + default: + return vp9_default_iscan_16x16; + } +} + +static INLINE int get_coef_context(const int16_t *neighbors, + const uint8_t *token_cache, int c) { + return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + + token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; +} + +#endif // VP9_COMMON_VP9_SCAN_H_ diff --git a/libvpx/vp9/common/vp9_seg_common.c b/libvpx/vp9/common/vp9_seg_common.c index 6bfd8f8..ef30404 100644 --- a/libvpx/vp9/common/vp9_seg_common.c +++ b/libvpx/vp9/common/vp9_seg_common.c @@ -76,7 +76,7 @@ int vp9_get_segdata(const struct segmentation *seg, int segment_id, } -const vp9_tree_index vp9_segment_tree[14] = { +const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = { 2, 4, 6, 8, 10, 12, 0, -1, -2, -3, -4, -5, -6, -7 }; diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h index f22239b..eb38c06 100644 --- a/libvpx/vp9/common/vp9_seg_common.h +++ b/libvpx/vp9/common/vp9_seg_common.h @@ -76,7 +76,7 @@ int vp9_get_segdata(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -extern const vp9_tree_index vp9_segment_tree[14]; +extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)]; #endif // VP9_COMMON_VP9_SEG_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_subpelvar.h b/libvpx/vp9/common/vp9_subpelvar.h deleted file mode 100644 index fe75481..0000000 --- a/libvpx/vp9/common/vp9_subpelvar.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_SUBPELVAR_H_ -#define VP9_COMMON_VP9_SUBPELVAR_H_ - -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_convolve.h" - -static void variance(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse, - int *sum) { - int i, j; - int diff; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - diff = src_ptr[j] - ref_ptr[j]; - *sum += diff; - *sse += diff * diff; - } - - src_ptr += source_stride; - ref_ptr += recon_stride; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : int32_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement first-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. - * - ****************************************************************************/ -static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - - src_ptr++; - } - - // Next row... - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : int32_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement second-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. 
- * - ****************************************************************************/ -static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - src_ptr++; - } - - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -#endif // VP9_COMMON_VP9_SUBPELVAR_H_ diff --git a/libvpx/vp9/common/vp9_systemdependent.h b/libvpx/vp9/common/vp9_systemdependent.h index cc909e2..254a431 100644 --- a/libvpx/vp9/common/vp9_systemdependent.h +++ b/libvpx/vp9/common/vp9_systemdependent.h @@ -13,6 +13,7 @@ #ifdef _MSC_VER #include <math.h> +#define snprintf _snprintf #endif #include "./vpx_config.h" @@ -23,8 +24,8 @@ void vpx_reset_mmx_state(void); #define vp9_clear_system_state() #endif -#ifdef _MSC_VER -// round is not defined in MSVC +#if defined(_MSC_VER) && _MSC_VER < 1800 +// round is not defined in MSVC before VS2013. static int round(double x) { if (x < 0) return (int)ceil(x - 0.5); diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c index 1791c1a..e3035d0 100644 --- a/libvpx/vp9/common/vp9_tile_common.c +++ b/libvpx/vp9/common/vp9_tile_common.c @@ -10,6 +10,8 @@ #include "vp9/common/vp9_tile_common.h" +#include "vp9/common/vp9_onyxc_int.h" + #define MIN_TILE_WIDTH_B64 4 #define MAX_TILE_WIDTH_B64 64 @@ -17,8 +19,8 @@ static int to_sbs(n_mis) { return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2; } -static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off, - int tile_idx, int log2_n_tiles, int n_mis) { +static void get_tile_offsets(int *min_tile_off, int *max_tile_off, + int tile_idx, int log2_n_tiles, int n_mis) { const int n_sbs = to_sbs(n_mis); const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles; const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles; @@ -27,17 +29,14 @@ static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off, *max_tile_off = MIN(sb_off2 << 3, n_mis); } -void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) { - vp9_get_tile_offsets(&cm->cur_tile_mi_col_start, &cm->cur_tile_mi_col_end, - tile_col_idx, cm->log2_tile_cols, cm->mi_cols); -} - -void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) { - vp9_get_tile_offsets(&cm->cur_tile_mi_row_start, &cm->cur_tile_mi_row_end, - tile_row_idx, cm->log2_tile_rows, cm->mi_rows); +void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, + int row_idx, int col_idx) { + get_tile_offsets(&tile->mi_row_start, &tile->mi_row_end, + row_idx, cm->log2_tile_rows, cm->mi_rows); + get_tile_offsets(&tile->mi_col_start, &tile->mi_col_end, + col_idx, cm->log2_tile_cols, cm->mi_cols); } - void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols) { const int sb_cols = to_sbs(mi_cols); diff --git a/libvpx/vp9/common/vp9_tile_common.h b/libvpx/vp9/common/vp9_tile_common.h index 6d14560..a110abb 100644 --- a/libvpx/vp9/common/vp9_tile_common.h +++ b/libvpx/vp9/common/vp9_tile_common.h @@ -11,11 +11,17 @@ #ifndef VP9_COMMON_VP9_TILE_COMMON_H_ #define VP9_COMMON_VP9_TILE_COMMON_H_ -#include "vp9/common/vp9_onyxc_int.h" +struct VP9Common; -void vp9_get_tile_col_offsets(VP9_COMMON *cm, int 
tile_col_idx); +typedef struct TileInfo { + int mi_row_start, mi_row_end; + int mi_col_start, mi_col_end; +} TileInfo; -void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx); +// initializes 'tile->mi_(row|col)_(start|end)' for (row_idx, col_idx) based on +// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)' +void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm, + int row_idx, int col_idx); void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols); diff --git a/libvpx/vp9/common/vp9_treecoder.c b/libvpx/vp9/common/vp9_treecoder.c index 2e21a5b..1805fb4 100644 --- a/libvpx/vp9/common/vp9_treecoder.c +++ b/libvpx/vp9/common/vp9_treecoder.c @@ -25,8 +25,9 @@ static void tree2tok(struct vp9_token *const p, vp9_tree t, if (j <= 0) { p[-j].value = v; p[-j].len = l; - } else + } else { tree2tok(p, t, j, v, l); + } } while (++v & 1); } @@ -39,9 +40,7 @@ void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t, tree2tok(p - offset, t, 0, 0, 0); } -static unsigned int convert_distribution(unsigned int i, - vp9_tree tree, - vp9_prob probs[], +static unsigned int convert_distribution(unsigned int i, vp9_tree tree, unsigned int branch_ct[][2], const unsigned int num_events[], unsigned int tok0_offset) { @@ -50,26 +49,25 @@ static unsigned int convert_distribution(unsigned int i, if (tree[i] <= 0) { left = num_events[-tree[i] - tok0_offset]; } else { - left = convert_distribution(tree[i], tree, probs, branch_ct, - num_events, tok0_offset); + left = convert_distribution(tree[i], tree, branch_ct, num_events, + tok0_offset); } if (tree[i + 1] <= 0) right = num_events[-tree[i + 1] - tok0_offset]; else - right = convert_distribution(tree[i + 1], tree, probs, branch_ct, - num_events, tok0_offset); + right = convert_distribution(tree[i + 1], tree, branch_ct, num_events, + tok0_offset); - probs[i>>1] = get_binary_prob(left, right); - branch_ct[i>>1][0] = left; - branch_ct[i>>1][1] = right; + branch_ct[i >> 1][0] = left; + branch_ct[i >> 1][1] = right; return left + right; } -void vp9_tree_probs_from_distribution( - vp9_tree tree, - vp9_prob probs [ /* n-1 */ ], - unsigned int branch_ct [ /* n-1 */ ] [2], - const unsigned int num_events[ /* n */ ], - unsigned int tok0_offset) { - convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset); +void vp9_tree_probs_from_distribution(vp9_tree tree, + unsigned int branch_ct[/* n-1 */][2], + const unsigned int num_events[/* n */], + unsigned int tok0_offset) { + convert_distribution(0, tree, branch_ct, num_events, tok0_offset); } + + diff --git a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h index 31182c3..9c776d6 100644 --- a/libvpx/vp9/common/vp9_treecoder.h +++ b/libvpx/vp9/common/vp9_treecoder.h @@ -21,6 +21,8 @@ typedef uint8_t vp9_prob; typedef int8_t vp9_tree_index; +#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2) + #define vp9_complement(x) (255 - x) /* We build coding trees compactly in arrays. @@ -30,7 +32,7 @@ typedef int8_t vp9_tree_index; Index > 0 means need another bit, specification at index. Nonnegative indices are always even; processing begins at node 0. */ -typedef const vp9_tree_index vp9_tree[], *vp9_tree_p; +typedef const vp9_tree_index vp9_tree[]; struct vp9_token { int value; @@ -48,11 +50,11 @@ void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset); probability updates. 
*/ void vp9_tree_probs_from_distribution(vp9_tree tree, - vp9_prob probs[ /* n - 1 */ ], unsigned int branch_ct[ /* n - 1 */ ][2], const unsigned int num_events[ /* n */ ], unsigned int tok0_offset); + static INLINE vp9_prob clip_prob(int p) { return (p > 255) ? 255u : (p < 1) ? 1u : p; } @@ -79,21 +81,46 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); } -static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob, +static INLINE vp9_prob merge_probs(vp9_prob pre_prob, const unsigned int ct[2], unsigned int count_sat, unsigned int max_update_factor) { + const vp9_prob prob = get_binary_prob(ct[0], ct[1]); const unsigned int count = MIN(ct[0] + ct[1], count_sat); const unsigned int factor = max_update_factor * count / count_sat; return weighted_prob(pre_prob, prob, factor); } -static INLINE vp9_prob merge_probs2(vp9_prob pre_prob, - const unsigned int ct[2], - unsigned int count_sat, - unsigned int max_update_factor) { - return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat, - max_update_factor); +static unsigned int tree_merge_probs_impl(unsigned int i, + const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, + unsigned int count_sat, + unsigned int max_update_factor, + vp9_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = (l <= 0) + ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, + count_sat, max_update_factor, probs); + const int r = tree[i + 1]; + const unsigned int right_count = (r <= 0) + ? counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, + count_sat, max_update_factor, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, + count_sat, max_update_factor); + return left_count + right_count; +} + +static void tree_merge_probs(const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, int offset, + unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, &counts[-offset], + count_sat, max_update_factor, probs); } diff --git a/libvpx/vp9/common/x86/vp9_asm_stubs.c b/libvpx/vp9/common/x86/vp9_asm_stubs.c index 3f1c198..106e6d4 100644 --- a/libvpx/vp9/common/x86/vp9_asm_stubs.c +++ b/libvpx/vp9/common/x86/vp9_asm_stubs.c @@ -36,90 +36,28 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { { 8, 8, 8, 8, 120, 120, 120, 120 } }; -#if HAVE_SSSE3 -void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int 
out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); +typedef void filter8_1dfunction ( + const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter +); -void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); +#if HAVE_SSSE3 +filter8_1dfunction vp9_filter_block1d16_v8_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -279,7 +217,7 @@ void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71); + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); assert(w <= 64); assert(h <= 64); @@ -300,7 +238,7 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71); + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); assert(w <= 64); assert(h <= 64); @@ -317,3 +255,214 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, } } #endif + +#if HAVE_SSE2 +filter8_1dfunction vp9_filter_block1d16_v8_sse2; +filter8_1dfunction vp9_filter_block1d16_h8_sse2; +filter8_1dfunction vp9_filter_block1d8_v8_sse2; +filter8_1dfunction vp9_filter_block1d8_h8_sse2; +filter8_1dfunction vp9_filter_block1d4_v8_sse2; 
+filter8_1dfunction vp9_filter_block1d4_h8_sse2; +filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; + +void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Ensure the filter can be compressed to int16_t. */ + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_sse2(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_h8_sse2(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_h8_sse2(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_avg_sse2(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_h8_avg_sse2(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_h8_avg_sse2(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { 
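+      // Narrowest SIMD pass: remaining width is covered in 4-pixel
+      // columns. Blocks that skip this fast path entirely (fractional
+      // y_step_q4, or a filter whose center tap is 128, i.e. a pure
+      // copy) reach the C fallback below with w still at its full value.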
+ vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); + + assert(w <= 64); + assert(h <= 64); + if (x_step_q4 == 16 && y_step_q4 == 16) { + vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h + 7); + vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + } else { + vp9_convolve8_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + } +} + +void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); + + assert(w <= 64); + assert(h <= 64); + if (x_step_q4 == 16 && y_step_q4 == 16) { + vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h + 7); + vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } else { + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + } +} +#endif diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index 8f740f4..ccf5aac 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -15,7 +15,7 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, @@ -26,10 +26,10 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { __m128i input0, input1, input2, input3; // Rows - input0 = _mm_loadl_epi64((__m128i *)input); - input1 = _mm_loadl_epi64((__m128i *)(input + 4)); - input2 = _mm_loadl_epi64((__m128i *)(input + 8)); - input3 = _mm_loadl_epi64((__m128i *)(input + 12)); + input0 = _mm_loadl_epi64((const __m128i *)input); + input1 = _mm_loadl_epi64((const __m128i *)(input + 4)); + input2 = _mm_loadl_epi64((const __m128i *)(input + 8)); + input3 = _mm_loadl_epi64((const __m128i *)(input + 12)); // Construct i3, i1, i3, i1, i2, i0, i2, i0 input0 = _mm_shufflelo_epi16(input0, 0xd8); @@ -148,7 +148,7 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE4X4(dest, input3); } -void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a; @@ -165,41 +165,6 @@ void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE4X4(dest, dc_value); } -void 
vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { - const __m128i zero = _mm_setzero_si128(); - const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)cospi_16_64, (int16_t)-cospi_16_64, - (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1); - - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i in, temp; - - // Load input data. - in = _mm_loadl_epi64((__m128i *)input); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - in = _mm_shufflelo_epi16(in, 0xd8); - in = _mm_unpacklo_epi32(in, in); - - // Stage 1 - in = _mm_madd_epi16(in, c1); - in = _mm_add_epi32(in, rounding); - in = _mm_srai_epi32(in, DCT_CONST_BITS); - in = _mm_packs_epi32(in, zero); - - // Stage 2 - temp = _mm_shufflelo_epi16(in, 0x9c); - in = _mm_shufflelo_epi16(in, 0xc9); - in = _mm_unpacklo_epi64(temp, in); - in = _mm_madd_epi16(in, c2); - in = _mm_packs_epi32(in, zero); - - // Store results - _mm_storel_epi64((__m128i *)output, in); -} - static INLINE void transpose_4x4(__m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); @@ -210,7 +175,7 @@ static INLINE void transpose_4x4(__m128i *res) { res[3] = _mm_unpackhi_epi64(res[2], res[2]); } -void idct4_1d_sse2(__m128i *in) { +static void idct4_1d_sse2(__m128i *in) { const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); @@ -249,7 +214,7 @@ void idct4_1d_sse2(__m128i *in) { in[3] = _mm_sub_epi16(u[0], u[3]); } -void iadst4_1d_sse2(__m128i *in) { +static void iadst4_1d_sse2(__m128i *in) { const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); @@ -299,16 +264,16 @@ void iadst4_1d_sse2(__m128i *in) { in[3] = _mm_unpackhi_epi64(in[1], in[1]); } -void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { __m128i in[4]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = _mm_loadl_epi64((__m128i *)input); - in[1] = _mm_loadl_epi64((__m128i *)(input + 4)); - in[2] = _mm_loadl_epi64((__m128i *)(input + 8)); - in[3] = _mm_loadl_epi64((__m128i *)(input + 12)); + in[0] = _mm_loadl_epi64((const __m128i *)input); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 4)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 8)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 12)); switch (tx_type) { case 0: // DCT_DCT @@ -450,7 +415,7 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, res3 = _mm_packs_epi32(tmp6, tmp7); \ } -#define IDCT8x8_1D \ +#define IDCT8_1D \ /* Stage1 */ \ { \ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ @@ -529,7 +494,7 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, dest += stride; \ } -void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = 
_mm_set1_epi16(1<<4); @@ -549,23 +514,23 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { int i; // Load input data. - in0 = _mm_load_si128((__m128i *)input); - in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); - in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); - in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); - in4 = _mm_load_si128((__m128i *)(input + 8 * 4)); - in5 = _mm_load_si128((__m128i *)(input + 8 * 5)); - in6 = _mm_load_si128((__m128i *)(input + 8 * 6)); - in7 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in0 = _mm_load_si128((const __m128i *)input); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in4 = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in5 = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in6 = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in7 = _mm_load_si128((const __m128i *)(input + 8 * 7)); // 2-D for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + // 8x8 Transpose is copied from vp9_fdct8x8_sse2() TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8x8_1D + IDCT8_1D } // Final rounding and shift @@ -597,7 +562,7 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } -void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a; @@ -648,7 +613,7 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); } -void idct8_1d_sse2(__m128i *in) { +static void idct8_1d_sse2(__m128i *in) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); @@ -673,12 +638,12 @@ void idct8_1d_sse2(__m128i *in) { in6 = in[6]; in7 = in[7]; - // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + // 8x8 Transpose is copied from vp9_fdct8x8_sse2() TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8x8_1D + IDCT8_1D in[0] = in0; in[1] = in1; in[2] = in2; @@ -689,7 +654,7 @@ void idct8_1d_sse2(__m128i *in) { in[7] = in7; } -void iadst8_1d_sse2(__m128i *in) { +static void iadst8_1d_sse2(__m128i *in) { const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); @@ -918,21 +883,21 @@ void iadst8_1d_sse2(__m128i *in) { } -void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { __m128i in[8]; const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1<<4); // load input data - in[0] = _mm_load_si128((__m128i *)input); - in[1] = _mm_load_si128((__m128i *)(input + 8 * 1)); - in[2] = _mm_load_si128((__m128i *)(input + 8 * 2)); - in[3] = _mm_load_si128((__m128i *)(input + 8 * 3)); - in[4] = _mm_load_si128((__m128i *)(input + 8 * 4)); - in[5] = _mm_load_si128((__m128i *)(input + 8 * 5)); - in[6] = _mm_load_si128((__m128i 
*)(input + 8 * 6)); - in[7] = _mm_load_si128((__m128i *)(input + 8 * 7)); + in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); switch (tx_type) { case 0: // DCT_DCT @@ -985,7 +950,7 @@ void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest, in[7]); } -void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); @@ -1005,16 +970,16 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; // Rows. Load 4-row input data. - in0 = _mm_load_si128((__m128i *)input); - in1 = _mm_load_si128((__m128i *)(input + 8 * 1)); - in2 = _mm_load_si128((__m128i *)(input + 8 * 2)); - in3 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in0 = _mm_load_si128((const __m128i *)input); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); // 8x4 Transpose TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) // Stage1 - { + { //NOLINT const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); @@ -1039,7 +1004,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { } // Stage2 - { + { //NOLINT const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); @@ -1069,7 +1034,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { } // Stage3 - { + { //NOLINT const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); @@ -1103,7 +1068,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { in4, in5, in6, in7) // 1D idct8x8 - IDCT8x8_1D + IDCT8_1D // Final rounding and shift in0 = _mm_adds_epi16(in0, final_rounding); @@ -1134,7 +1099,7 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } -#define IDCT16x16_1D \ +#define IDCT16_1D \ /* Stage2 */ \ { \ const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ @@ -1263,7 +1228,8 @@ void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { stp2_10, stp2_13, stp2_11, stp2_12) \ } -void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); const __m128i zero = _mm_setzero_si128(); @@ -1318,22 +1284,22 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { if (i == 1) input += 128; // Load input data. 
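    // Each 16-sample row of this half is split across two 8-lane
    // registers: in0-in7 take the left halves of the eight rows and
    // in8-in15 the right halves, which is why the loads below alternate
    // between the two banks.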
- in0 = _mm_load_si128((__m128i *)input); - in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); - in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); - in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); - in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); - in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); - in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); - in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); - in4 = _mm_load_si128((__m128i *)(input + 8 * 8)); - in12 = _mm_load_si128((__m128i *)(input + 8 * 9)); - in5 = _mm_load_si128((__m128i *)(input + 8 * 10)); - in13 = _mm_load_si128((__m128i *)(input + 8 * 11)); - in6 = _mm_load_si128((__m128i *)(input + 8 * 12)); - in14 = _mm_load_si128((__m128i *)(input + 8 * 13)); - in7 = _mm_load_si128((__m128i *)(input + 8 * 14)); - in15 = _mm_load_si128((__m128i *)(input + 8 * 15)); + in0 = _mm_load_si128((const __m128i *)input); + in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); + in4 = _mm_load_si128((const __m128i *)(input + 8 * 8)); + in12 = _mm_load_si128((const __m128i *)(input + 8 * 9)); + in5 = _mm_load_si128((const __m128i *)(input + 8 * 10)); + in13 = _mm_load_si128((const __m128i *)(input + 8 * 11)); + in6 = _mm_load_si128((const __m128i *)(input + 8 * 12)); + in14 = _mm_load_si128((const __m128i *)(input + 8 * 13)); + in7 = _mm_load_si128((const __m128i *)(input + 8 * 14)); + in15 = _mm_load_si128((const __m128i *)(input + 8 * 15)); TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); @@ -1355,7 +1321,7 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { in12, in13, in14, in15); } - IDCT16x16_1D + IDCT16_1D // Stage7 if (i == 0) { @@ -1470,7 +1436,7 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { } } -void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { +void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { __m128i dc_value; const __m128i zero = _mm_setzero_si128(); int a, i; @@ -1519,7 +1485,7 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { res0[15] = tbuf[7]; } -void iadst16_1d_8col(__m128i *in) { +static void iadst16_1d_8col(__m128i *in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -1989,7 +1955,7 @@ void iadst16_1d_8col(__m128i *in) { in[15] = _mm_sub_epi16(kZero, s[1]); } -void idct16_1d_8col(__m128i *in) { +static void idct16_1d_8col(__m128i *in) { const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); @@ -2333,36 +2299,36 @@ void idct16_1d_8col(__m128i *in) { in[15] = _mm_sub_epi16(s[0], s[15]); } -void idct16_1d_sse2(__m128i *in0, __m128i *in1) { +static void idct16_1d_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); idct16_1d_8col(in0); idct16_1d_8col(in1); } -void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { +static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); 
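
idct16_1d_sse2() and iadst16_1d_sse2() hold the 16x16 block as two 16x8 halves, one __m128i per row; the transpose lets the same 8-column kernel serve both dimensions of the separable 2-D transform (the two column passes follow just below). A scalar sketch of that structure, where transform1d() stands in for any 16-point 1-D kernel:

#include <stdint.h>

/* Apply a 16-point 1-D transform separably: once over rows, once over
   columns. The SIMD code realizes the row pass as transpose + column
   pass; transform1d() is an assumed stand-in for the kernel. */
static void transform_2d_16x16(int16_t block[16][16],
                               void (*transform1d)(int16_t vec[16])) {
  int r, c;
  int16_t tmp[16];
  for (r = 0; r < 16; ++r) {  /* row pass */
    for (c = 0; c < 16; ++c) tmp[c] = block[r][c];
    transform1d(tmp);
    for (c = 0; c < 16; ++c) block[r][c] = tmp[c];
  }
  for (c = 0; c < 16; ++c) {  /* column pass */
    for (r = 0; r < 16; ++r) tmp[r] = block[r][c];
    transform1d(tmp);
    for (r = 0; r < 16; ++r) block[r][c] = tmp[r];
  }
}
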
iadst16_1d_8col(in0); iadst16_1d_8col(in1); } -static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) { - in[0] = _mm_load_si128((__m128i *)(input + 0 * 16)); - in[1] = _mm_load_si128((__m128i *)(input + 1 * 16)); - in[2] = _mm_load_si128((__m128i *)(input + 2 * 16)); - in[3] = _mm_load_si128((__m128i *)(input + 3 * 16)); - in[4] = _mm_load_si128((__m128i *)(input + 4 * 16)); - in[5] = _mm_load_si128((__m128i *)(input + 5 * 16)); - in[6] = _mm_load_si128((__m128i *)(input + 6 * 16)); - in[7] = _mm_load_si128((__m128i *)(input + 7 * 16)); - - in[8] = _mm_load_si128((__m128i *)(input + 8 * 16)); - in[9] = _mm_load_si128((__m128i *)(input + 9 * 16)); - in[10] = _mm_load_si128((__m128i *)(input + 10 * 16)); - in[11] = _mm_load_si128((__m128i *)(input + 11 * 16)); - in[12] = _mm_load_si128((__m128i *)(input + 12 * 16)); - in[13] = _mm_load_si128((__m128i *)(input + 13 * 16)); - in[14] = _mm_load_si128((__m128i *)(input + 14 * 16)); - in[15] = _mm_load_si128((__m128i *)(input + 15 * 16)); +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); + + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); + in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); + in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); + in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); + in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); + in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); + in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); + in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); } static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { @@ -2421,8 +2387,8 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { RECON_AND_STORE(dest, in[15]); } -void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { __m128i in0[16], in1[16]; load_buffer_8x16(input, in0); @@ -2456,8 +2422,8 @@ void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride, write_buffer_8x16(dest, in1, stride); } -void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, - int stride) { +void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); const __m128i zero = _mm_setzero_si128(); @@ -2503,14 +2469,14 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; // 1-D idct. Load input data. 
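
The _10 suffix in the new names counts the nonzero coefficients the fast path must handle; with at most 10 of them, the nonzero values are confined to the top-left of the block, so only the first four rows are loaded and transformed while the remaining rows are known zeros. A scalar sketch of that shortcut, with row_idct16() as an assumed stand-in for the SIMD row pass:

#include <string.h>
#include <stdint.h>

/* Reduced-coefficient row pass: rows 4-15 of the input are all zero, so
   only rows 0-3 go through the transform and the rest of the
   intermediate buffer is simply cleared (the transform of a zero row is
   zero). */
static void idct16x16_10_rows(const int16_t *input, int16_t out[16 * 16],
                              void (*row_idct16)(const int16_t *in,
                                                 int16_t *out)) {
  int r;
  memset(out, 0, 16 * 16 * sizeof(out[0]));
  for (r = 0; r < 4; ++r)
    row_idct16(input + r * 16, out + r * 16);
}
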
- in0 = _mm_load_si128((__m128i *)input); - in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); - in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); - in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); - in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); - in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); - in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); - in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in0 = _mm_load_si128((const __m128i *)input); + in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); @@ -2737,7 +2703,7 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; - IDCT16x16_1D + IDCT16_1D // Stage7 in0 = _mm_add_epi16(stp2_0, stp1_15); @@ -2815,11 +2781,704 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, #define LOAD_DQCOEFF(reg, input) \ { \ - reg = _mm_load_si128((__m128i *) input); \ + reg = _mm_load_si128((const __m128i *) input); \ input += 8; \ } \ -void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { +#define IDCT32_1D \ +/* Stage1 */ \ +{ \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ + const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ + \ + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ + stp1_17, stp1_30) \ + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ + \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ + const __m128i lo_26_6 = 
_mm_unpacklo_epi16(in26, in6); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ + stp2_14) \ + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ + stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + \ + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ + stp1_6) \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, 
stg4_0, \ + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ + stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ 
+ stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + +// 
Only upper-left 8x8 has non-zero coeff +void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, + in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, + in24, in25, in26, in27, in28, in29, in30, in31; + __m128i col[128]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, 
stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j, i32; + + // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. + for (i = 0; i < 8; i++) { + i32 = (i << 5); + if (i == 0) { + // First 1-D idct: first 8 rows + // Load input data. + LOAD_DQCOEFF(in0, input); + LOAD_DQCOEFF(in8, input); + LOAD_DQCOEFF(in16, input); + LOAD_DQCOEFF(in24, input); + LOAD_DQCOEFF(in1, input); + LOAD_DQCOEFF(in9, input); + LOAD_DQCOEFF(in17, input); + LOAD_DQCOEFF(in25, input); + LOAD_DQCOEFF(in2, input); + LOAD_DQCOEFF(in10, input); + LOAD_DQCOEFF(in18, input); + LOAD_DQCOEFF(in26, input); + LOAD_DQCOEFF(in3, input); + LOAD_DQCOEFF(in11, input); + LOAD_DQCOEFF(in19, input); + LOAD_DQCOEFF(in27, input); + + LOAD_DQCOEFF(in4, input); + LOAD_DQCOEFF(in12, input); + LOAD_DQCOEFF(in20, input); + LOAD_DQCOEFF(in28, input); + LOAD_DQCOEFF(in5, input); + LOAD_DQCOEFF(in13, input); + LOAD_DQCOEFF(in21, input); + LOAD_DQCOEFF(in29, input); + LOAD_DQCOEFF(in6, input); + LOAD_DQCOEFF(in14, input); + LOAD_DQCOEFF(in22, input); + LOAD_DQCOEFF(in30, input); + LOAD_DQCOEFF(in7, input); + LOAD_DQCOEFF(in15, input); + LOAD_DQCOEFF(in23, input); + LOAD_DQCOEFF(in31, input); + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, + in18, in19, in20, in21, in22, in23); + TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, + in26, in27, in28, in29, in30, in31); + } else if (i < 4) { + // First 1-D idct: next 24 zero-coeff rows + col[i32 + 0] = _mm_setzero_si128(); + col[i32 + 1] = _mm_setzero_si128(); + col[i32 + 2] = _mm_setzero_si128(); + col[i32 + 3] = _mm_setzero_si128(); + col[i32 + 4] = _mm_setzero_si128(); + col[i32 + 5] = _mm_setzero_si128(); + col[i32 + 6] = _mm_setzero_si128(); + col[i32 + 7] = _mm_setzero_si128(); + col[i32 + 8] = _mm_setzero_si128(); + col[i32 + 9] = _mm_setzero_si128(); + col[i32 + 10] = _mm_setzero_si128(); + col[i32 + 11] = _mm_setzero_si128(); + col[i32 + 12] = _mm_setzero_si128(); + col[i32 + 13] = _mm_setzero_si128(); + col[i32 + 14] = _mm_setzero_si128(); + col[i32 + 15] = _mm_setzero_si128(); + col[i32 + 16] = _mm_setzero_si128(); + col[i32 + 17] = _mm_setzero_si128(); + col[i32 + 18] = _mm_setzero_si128(); + col[i32 + 19] = _mm_setzero_si128(); + col[i32 + 20] = _mm_setzero_si128(); + col[i32 + 21] = _mm_setzero_si128(); + col[i32 + 22] = _mm_setzero_si128(); + col[i32 + 23] = _mm_setzero_si128(); + col[i32 + 24] = _mm_setzero_si128(); + col[i32 + 25] = _mm_setzero_si128(); + col[i32 + 26] = _mm_setzero_si128(); + col[i32 + 27] = _mm_setzero_si128(); + col[i32 + 28] = _mm_setzero_si128(); + col[i32 + 29] = _mm_setzero_si128(); + col[i32 + 30] = _mm_setzero_si128(); + col[i32 + 31] = _mm_setzero_si128(); + continue; + } else { + // Second 1-D idct + j = i - 4; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 
8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, + in5, in6, in7); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, + in11, in12, in13, in14, in15); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, + in19, in20, in21, in22, in23); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, + in28, in29, in30, in31); + } + + IDCT32_1D + + // final stage + if (i < 4) { + // 1_D: Store 32 intermediate results for each 8x32 block. + col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + } else { + const __m128i zero = _mm_setzero_si128(); + + // 2_D: Calculate the results and store them to destination. 
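
The final stage pairs output k with output 31-k: sums for the top half, differences for the bottom, exactly as in the block that follows. The values then take the same rounding, shift, and reconstruction path as the smaller transforms. A scalar model of that last step for one pixel, assuming RECON_AND_STORE performs a saturating add of the shifted residual into the destination row:

#include <stdint.h>

/* Final rounding and shift as in the code below (+ (1 << 5), then
   arithmetic >> 6), followed by add-to-predictor and clamp to
   [0, 255]. */
static uint8_t recon_pixel(uint8_t pred, int16_t residual) {
  const int v = pred + ((residual + (1 << 5)) >> 6);
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
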
+ in0 = _mm_add_epi16(stp1_0, stp1_31); + in1 = _mm_add_epi16(stp1_1, stp1_30); + in2 = _mm_add_epi16(stp1_2, stp1_29); + in3 = _mm_add_epi16(stp1_3, stp1_28); + in4 = _mm_add_epi16(stp1_4, stp1_27); + in5 = _mm_add_epi16(stp1_5, stp1_26); + in6 = _mm_add_epi16(stp1_6, stp1_25); + in7 = _mm_add_epi16(stp1_7, stp1_24); + in8 = _mm_add_epi16(stp1_8, stp1_23); + in9 = _mm_add_epi16(stp1_9, stp1_22); + in10 = _mm_add_epi16(stp1_10, stp1_21); + in11 = _mm_add_epi16(stp1_11, stp1_20); + in12 = _mm_add_epi16(stp1_12, stp1_19); + in13 = _mm_add_epi16(stp1_13, stp1_18); + in14 = _mm_add_epi16(stp1_14, stp1_17); + in15 = _mm_add_epi16(stp1_15, stp1_16); + in16 = _mm_sub_epi16(stp1_15, stp1_16); + in17 = _mm_sub_epi16(stp1_14, stp1_17); + in18 = _mm_sub_epi16(stp1_13, stp1_18); + in19 = _mm_sub_epi16(stp1_12, stp1_19); + in20 = _mm_sub_epi16(stp1_11, stp1_20); + in21 = _mm_sub_epi16(stp1_10, stp1_21); + in22 = _mm_sub_epi16(stp1_9, stp1_22); + in23 = _mm_sub_epi16(stp1_8, stp1_23); + in24 = _mm_sub_epi16(stp1_7, stp1_24); + in25 = _mm_sub_epi16(stp1_6, stp1_25); + in26 = _mm_sub_epi16(stp1_5, stp1_26); + in27 = _mm_sub_epi16(stp1_4, stp1_27); + in28 = _mm_sub_epi16(stp1_3, stp1_28); + in29 = _mm_sub_epi16(stp1_2, stp1_29); + in30 = _mm_sub_epi16(stp1_1, stp1_30); + in31 = _mm_sub_epi16(stp1_0, stp1_31); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + in16 = _mm_adds_epi16(in16, final_rounding); + in17 = _mm_adds_epi16(in17, final_rounding); + in18 = _mm_adds_epi16(in18, final_rounding); + in19 = _mm_adds_epi16(in19, final_rounding); + in20 = _mm_adds_epi16(in20, final_rounding); + in21 = _mm_adds_epi16(in21, final_rounding); + in22 = _mm_adds_epi16(in22, final_rounding); + in23 = _mm_adds_epi16(in23, final_rounding); + in24 = _mm_adds_epi16(in24, final_rounding); + in25 = _mm_adds_epi16(in25, final_rounding); + in26 = _mm_adds_epi16(in26, final_rounding); + in27 = _mm_adds_epi16(in27, final_rounding); + in28 = _mm_adds_epi16(in28, final_rounding); + in29 = _mm_adds_epi16(in29, final_rounding); + in30 = _mm_adds_epi16(in30, final_rounding); + in31 = _mm_adds_epi16(in31, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + in16 = _mm_srai_epi16(in16, 6); + in17 = _mm_srai_epi16(in17, 6); + in18 = _mm_srai_epi16(in18, 6); + in19 = _mm_srai_epi16(in19, 6); + in20 = _mm_srai_epi16(in20, 6); + in21 = 
_mm_srai_epi16(in21, 6); + in22 = _mm_srai_epi16(in22, 6); + in23 = _mm_srai_epi16(in23, 6); + in24 = _mm_srai_epi16(in24, 6); + in25 = _mm_srai_epi16(in25, 6); + in26 = _mm_srai_epi16(in26, 6); + in27 = _mm_srai_epi16(in27, 6); + in28 = _mm_srai_epi16(in28, 6); + in29 = _mm_srai_epi16(in29, 6); + in30 = _mm_srai_epi16(in30, 6); + in31 = _mm_srai_epi16(in31, 6); + + RECON_AND_STORE(dest, in0); + RECON_AND_STORE(dest, in1); + RECON_AND_STORE(dest, in2); + RECON_AND_STORE(dest, in3); + RECON_AND_STORE(dest, in4); + RECON_AND_STORE(dest, in5); + RECON_AND_STORE(dest, in6); + RECON_AND_STORE(dest, in7); + RECON_AND_STORE(dest, in8); + RECON_AND_STORE(dest, in9); + RECON_AND_STORE(dest, in10); + RECON_AND_STORE(dest, in11); + RECON_AND_STORE(dest, in12); + RECON_AND_STORE(dest, in13); + RECON_AND_STORE(dest, in14); + RECON_AND_STORE(dest, in15); + RECON_AND_STORE(dest, in16); + RECON_AND_STORE(dest, in17); + RECON_AND_STORE(dest, in18); + RECON_AND_STORE(dest, in19); + RECON_AND_STORE(dest, in20); + RECON_AND_STORE(dest, in21); + RECON_AND_STORE(dest, in22); + RECON_AND_STORE(dest, in23); + RECON_AND_STORE(dest, in24); + RECON_AND_STORE(dest, in25); + RECON_AND_STORE(dest, in26); + RECON_AND_STORE(dest, in27); + RECON_AND_STORE(dest, in28); + RECON_AND_STORE(dest, in29); + RECON_AND_STORE(dest, in30); + RECON_AND_STORE(dest, in31); + + dest += 8 - (stride * 32); + } + } +} + +void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, + int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); @@ -3042,336 +3701,7 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { in28, in29, in30, in31); } - // Stage1 - { - const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); - const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); - const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); - const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); - - const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); - const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); - const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); - const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); - - const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); - const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); - const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); - const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); - - const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); - const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); - const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); - const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); - - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, - stp1_17, stp1_30) - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, - stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, - stp1_19, stp1_28) - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, - stp1_21, stp1_26) - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, - stp1_23, stp1_24) - } - - // Stage2 - { - const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); - const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); - const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); - const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); - - const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); - const __m128i hi_10_22 
= _mm_unpackhi_epi16(in10, in22); - const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); - const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); - - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, - stp2_14) - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, - stp2_11, stp2_12) - - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); - - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); - - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); - - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); - } - - // Stage3 - { - const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); - const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); - const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); - const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); - - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); - - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); - - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, - stp1_6) - - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); - - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, - stp1_18, stp1_29) - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, - stp1_22, stp1_25) - - stp1_16 = stp2_16; - stp1_31 = stp2_31; - stp1_19 = stp2_19; - stp1_20 = stp2_20; - stp1_23 = stp2_23; - stp1_24 = stp2_24; - stp1_27 = stp2_27; - stp1_28 = stp2_28; - } - - // Stage4 - { - const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); - const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); - const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); - const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); - - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); - - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, - stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, - stp2_2, stp2_3) - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = 
_mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, - stp2_10, stp2_13) - - stp2_8 = stp1_8; - stp2_15 = stp1_15; - stp2_11 = stp1_11; - stp2_12 = stp1_12; - - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); - - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); - } - - // Stage5 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); - - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); - - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); - - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp1); - stp1_6 = _mm_packs_epi32(tmp2, tmp3); - - stp1_4 = stp2_4; - stp1_7 = stp2_7; - - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); - - stp1_16 = stp2_16; - stp1_17 = stp2_17; - - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, - stp1_19, stp1_28) - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, - stp1_21, stp1_26) - - stp1_22 = stp2_22; - stp1_23 = stp2_23; - stp1_24 = stp2_24; - stp1_25 = stp2_25; - stp1_30 = stp2_30; - stp1_31 = stp2_31; - } - - // Stage6 - { - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, 
stp1_12); - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); - - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); - - stp2_8 = stp1_8; - stp2_9 = stp1_9; - stp2_14 = stp1_14; - stp2_15 = stp1_15; - - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, - stp2_13, stp2_11, stp2_12) - - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); - - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); - } - - // Stage7 - { - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); - - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); - - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); - - stp1_16 = stp2_16; - stp1_17 = stp2_17; - stp1_18 = stp2_18; - stp1_19 = stp2_19; - - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, - stp1_21, stp1_26) - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, - stp1_23, stp1_24) - - stp1_28 = stp2_28; - stp1_29 = stp2_29; - stp1_30 = stp2_30; - stp1_31 = stp2_31; - } + IDCT32_1D // final stage if (i < 4) { @@ -3548,4 +3878,52 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { dest += 8 - (stride * 32); } } +} //NOLINT + +void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = 
_mm_set1_epi16(a); + + for (i = 0; i < 4; ++i) { + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + dest += 8 - (stride * 32); + } } diff --git a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm index 980b8b9..69b07f6 100644 --- a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm +++ b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm @@ -19,12 +19,14 @@ pw_32: times 8 dw 32 SECTION .text INIT_MMX sse -cglobal dc_predictor_4x4, 4, 4, 2, dst, stride, above, left +cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + pxor m1, m1 movd m0, [aboveq] punpckldq m0, [leftq] psadbw m0, m1 - paddw m0, [pw_4] + paddw m0, [GLOBAL(pw_4)] psraw m0, 3 pshufw m0, m0, 0x0 packuswb m0, m0 @@ -33,10 +35,14 @@ cglobal dc_predictor_4x4, 4, 4, 2, dst, stride, above, left lea dstq, [dstq+strideq*2] movd [dstq ], m0 movd [dstq+strideq], m0 + + RESTORE_GOT RET INIT_MMX sse -cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left +cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + pxor m1, m1 movq m0, [aboveq] movq m2, [leftq] @@ -45,7 +51,7 @@ cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left psadbw m0, m1 psadbw m2, m1 paddw m0, m2 - paddw m0, [pw_8] + paddw m0, [GLOBAL(pw_8)] psraw m0, 4 pshufw m0, m0, 0x0 packuswb m0, m0 @@ -58,10 +64,14 @@ cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left movq [dstq+strideq ], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 + + RESTORE_GOT RET INIT_XMM sse2 -cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left +cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + pxor m1, m1 mova m0, [aboveq] mova m2, [leftq] @@ -73,7 +83,7 @@ cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left paddw m0, m2 movhlps m2, m0 paddw m0, m2 - paddw m0, [pw_16] + paddw m0, [GLOBAL(pw_16)] psraw m0, 5 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 @@ -86,10 +96,14 @@ cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left lea dstq, [dstq+strideq*4] dec lines4d jnz .loop + + RESTORE_GOT REP_RET INIT_XMM sse2 -cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left +cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + pxor m1, m1 mova m0, [aboveq] mova m2, [aboveq+16] @@ -107,7 +121,7 @@ cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left paddw m0, m4 movhlps m2, m0 paddw m0, m2 - paddw m0, [pw_32] + paddw 
m0, [GLOBAL(pw_32)] psraw m0, 6 pshuflw m0, m0, 0x0 punpcklqdq m0, m0 @@ -124,6 +138,8 @@ cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left lea dstq, [dstq+strideq*4] dec lines4d jnz .loop + + RESTORE_GOT REP_RET INIT_MMX sse diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm index 8ba26f3..88df9b2 100644 --- a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm +++ b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm @@ -13,27 +13,23 @@ SECTION_RODATA pb_1: times 16 db 1 -pw_2: times 8 dw 2 -pb_7m1: times 8 db 7, -1 -pb_15: times 16 db 15 - -sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7 -sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7 +sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1 -sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1 -sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1 -sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1 -sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 -sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1 -sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1 -sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1 -sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1 sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 +sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 +sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 SECTION .text @@ -112,14 +108,16 @@ cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left REP_RET INIT_MMX ssse3 -cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above +cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset + GET_GOT goffsetq + movq m0, [aboveq] - pshufb m2, m0, [sh_b23456777] - pshufb m1, m0, [sh_b01234577] - pshufb m0, [sh_b12345677] + pshufb m2, m0, [GLOBAL(sh_b23456777)] + pshufb m1, m0, [GLOBAL(sh_b01234577)] + pshufb m0, [GLOBAL(sh_b12345677)] pavgb m3, m2, m1 pxor m2, m1 - pand m2, [pb_1] + pand m2, [GLOBAL(pb_1)] psubb m3, m2 pavgb m0, m3 @@ -132,19 +130,23 @@ cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above movd [dstq ], m0 psrlq m0, 8 movd [dstq+strideq], m0 + + RESTORE_GOT RET INIT_MMX ssse3 -cglobal d45_predictor_8x8, 3, 3, 4, dst, stride, above +cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset + GET_GOT goffsetq + movq m0, [aboveq] - mova m1, [sh_b12345677] - DEFINE_ARGS dst, stride, stride3, line + mova m1, 
[GLOBAL(sh_b12345677)] + DEFINE_ARGS dst, stride, stride3 lea stride3q, [strideq*3] - pshufb m2, m0, [sh_b23456777] + pshufb m2, m0, [GLOBAL(sh_b23456777)] pavgb m3, m2, m0 pxor m2, m0 pshufb m0, m1 - pand m2, [pb_1] + pand m2, [GLOBAL(pb_1)] psubb m3, m2 pavgb m0, m3 @@ -167,20 +169,24 @@ cglobal d45_predictor_8x8, 3, 3, 4, dst, stride, above movq [dstq+strideq*2], m0 pshufb m0, m1 movq [dstq+stride3q ], m0 + + RESTORE_GOT RET INIT_XMM ssse3 -cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line +cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset + GET_GOT goffsetq + mova m0, [aboveq] DEFINE_ARGS dst, stride, stride3, dst8, line lea stride3q, [strideq*3] lea dst8q, [dstq+strideq*8] - mova m1, [sh_b123456789abcdeff] - pshufb m2, m0, [sh_b23456789abcdefff] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] pavgb m3, m2, m0 pxor m2, m0 pshufb m0, m1 - pand m2, [pb_1] + pand m2, [GLOBAL(pb_1)] psubb m3, m2 pavgb m0, m3 @@ -214,29 +220,33 @@ cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line movhps [dstq+strideq +8], m0 movhps [dstq+strideq*2+8], m0 movhps [dstq+stride3q +8], m0 + + RESTORE_GOT RET INIT_XMM ssse3 -cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line +cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset + GET_GOT goffsetq + mova m0, [aboveq] mova m4, [aboveq+16] DEFINE_ARGS dst, stride, stride3, dst16, line lea stride3q, [strideq*3] lea dst16q, [dstq +strideq*8] lea dst16q, [dst16q+strideq*8] - mova m1, [sh_b123456789abcdeff] - pshufb m2, m4, [sh_b23456789abcdefff] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)] pavgb m3, m2, m4 pxor m2, m4 palignr m5, m4, m0, 1 palignr m6, m4, m0, 2 pshufb m4, m1 - pand m2, [pb_1] + pand m2, [GLOBAL(pb_1)] psubb m3, m2 pavgb m4, m3 pavgb m3, m0, m6 pxor m0, m6 - pand m0, [pb_1] + pand m0, [GLOBAL(pb_1)] psubb m3, m0 pavgb m5, m3 @@ -288,4 +298,739 @@ cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line mova [dstq +strideq +16], m4 mova [dstq +strideq*2+16], m4 mova [dstq +stride3q +16], m4 + + RESTORE_GOT + RET + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM ssse3 +cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + pshufb m1, m3, [GLOBAL(sh_b23456777)] + pshufb m2, m3, [GLOBAL(sh_b12345677)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movd [dstq ], m3 + movd [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m3, 1 + psrldq m4, 1 + movd [dstq ], m3 + movd [dstq+strideq], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] + pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] + pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] + pshufb m3, [GLOBAL(sh_b0123456777777777)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + 
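
For orientation while reading the d63 shuffles (the 8x8 store sequence continues just below): each even output row is the 2-tap average of the row above, each odd row the 3-tap average, and every pair of rows slides one sample to the left. A scalar model of the 4x4 case under that reading, with the averaging inlined and assuming above[] holds enough valid samples:

#include <stdint.h>

/* Scalar model of the D63 4x4 pattern: even rows use (a + b + 1) >> 1,
   odd rows (a + 2*b + c + 2) >> 2, shifting left one sample per row
   pair. Assumes above[] holds at least 7 valid samples. */
static void d63_predictor_4x4_scalar(uint8_t *dst, int stride,
                                     const uint8_t *above) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int i = c + (r >> 1);
      dst[r * stride + c] = (r & 1)
          ? (uint8_t)((above[i] + 2 * above[i + 1] + above[i + 2] + 2) >> 2)
          : (uint8_t)((above[i] + above[i + 1] + 1) >> 1);
    }
  }
}
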
psrldq m3, 1 + psrldq m4, 1 + movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + lea dstq, [dstq+strideq*4] + psrldq m3, 1 + psrldq m4, 1 + + ; store 4 lines + movq [dstq ], m3 + movq [dstq+strideq], m4 + psrldq m3, 1 + psrldq m4, 1 + movq [dstq+strideq*2], m3 + movq [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, line + lea stride3q, [strideq*3] + mova m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m0, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 + pavgb m0, m3 + + mov lined, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m4 + pshufb m0, m1 + pshufb m4, m1 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m4 + pshufb m0, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m7, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, line + mova m1, [GLOBAL(sh_b123456789abcdeff)] + lea stride3q, [strideq*3] + pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] + pshufb m3, m7, m1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 + palignr m6, m7, m0, 1 + palignr m5, m7, m0, 2 + pavgb m7, m3 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 + pavgb m0, m6 + + mov lined, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m7 + mova [dstq+strideq ], m2 + mova [dstq+strideq +16], m4 + palignr m3, m7, m0, 1 + palignr m5, m4, m2, 1 + pshufb m7, m1 + pshufb m4, m1 + + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m5 + mova [dstq+stride3q +16], m4 + palignr m0, m7, m3, 1 + palignr m2, m4, m5, 1 + pshufb m7, m1 + pshufb m4, m1 + lea dstq, [dstq+strideq*4] + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + movd m0, [leftq] ; l1, l2, l3, l4 + movd m1, [aboveq-1] ; tl, t1, t2, t3 + punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 + pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 + psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 + psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 + ; comments below are for a predictor like this + ; A1 B1 C1 D1 + ; A2 B2 A1 B1 + ; A3 B3 A2 B2 + ; A4 B4 A3 B3 + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 + pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 + + punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+stride3q ], m3 + psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq*2], m3 + psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq ], m3 + psrldq m3, 2 ; A1 B1 C1 D1 .. 
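; Rows are emitted bottom-up: each psrldq above discards one packed
; (A, B) pair, so the same register supplies rows 3, 2, 1 and, below,
; row 0 of the 4x4 d153 prediction.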
+ movd [dstq ], m3 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + movq m0, [leftq] ; [0- 7] l1-8 [byte] + movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] + pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] + pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] + pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] + pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] + psrldq m4, m0, 1 ; t1-7 [word] + psrldq m5, m0, 2 ; t2-7 [word] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 + ; A2 B2 A1 B1 C1 D1 E1 F1 + ; A3 B3 A2 B2 A1 B1 C1 D1 + ; A4 B4 A3 B3 A2 B2 A1 B1 + ; A5 B5 A4 B4 A3 B3 A2 B2 + ; A6 B6 A5 B5 A4 B4 A3 B3 + ; A7 B7 A6 B6 A5 B5 A4 B4 + ; A8 B8 A7 B7 A6 B6 A5 B5 + pavgb m6, m1, m2 ; 2-tap avg A8-A1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 + + punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1 + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 + palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 + movq [dstq+strideq*2], m0 + psrldq m0, 2 ; A-B2, A-B1, C-H1 + movq [dstq+strideq ], m0 + psrldq m0, 2 ; A-H1 + movq [dstq ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 + psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 + movq [dstq+strideq*2], m6 + psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 + movq [dstq+strideq ], m6 + psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 + movq [dstq ], m6 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 + ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 + ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 + ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 + ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 + ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 + ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 + ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 + ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 + ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 + ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 + ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 + ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 + ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 + ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 + ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 + pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m6, 15 + palignr m3, m0, m6, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] + pavgb m5, m0 ; A1 - Ag + + punpcklbw m0, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... 
A-Bg + + pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] + pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 + + pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + palignr m2, m1, m6, 14 + mova [dstq ], m2 + palignr m2, m1, m6, 12 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 10 + mova [dstq+strideq*2], m2 + palignr m2, m1, m6, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m1, m6, 6 + mova [dstq ], m2 + palignr m2, m1, m6, 4 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 2 + mova [dstq+strideq*2], m2 + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + mova [dstq+stride3q ], m6 + lea dstq, [dstq+strideq*4] + + palignr m2, m6, m4, 14 + mova [dstq ], m2 + palignr m2, m6, m4, 12 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 10 + mova [dstq+strideq*2], m2 + palignr m2, m6, m4, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m6, m4, 6 + mova [dstq ], m2 + palignr m2, m6, m4, 4 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 2 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + movu m1, [aboveq+15] + + pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] + + palignr m3, m1, m7, 1 + palignr m5, m1, m7, 2 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] + + pshufb m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m7, 15 + palignr m3, m0, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pavgb m5, m0 ; A1 - Ag + punpcklbw m6, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... 
A-Bg + pshufb m6, [GLOBAL(sh_bfedcba9876543210)] + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + + DEFINE_ARGS dst, stride, stride3, left, line + lea stride3q, [strideq*3] + + palignr m5, m2, m1, 14 + palignr m7, m1, m6, 14 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 12 + palignr m7, m1, m6, 12 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 10 + palignr m7, m1, m6, 10 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + palignr m5, m2, m1, 8 + palignr m7, m1, m6, 8 + mova [dstq+stride3q ], m7 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m2, m1, 6 + palignr m7, m1, m6, 6 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 4 + palignr m7, m1, m6, 4 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 2 + palignr m7, m1, m6, 2 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m6 + mova [dstq+stride3q+16 ], m1 + lea dstq, [dstq+strideq*4] + + palignr m5, m1, m6, 14 + palignr m3, m6, m4, 14 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 12 + palignr m3, m6, m4, 12 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 10 + palignr m3, m6, m4, 10 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + palignr m5, m1, m6, 8 + palignr m3, m6, m4, 8 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m1, m6, 6 + palignr m3, m6, m4, 6 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 4 + palignr m3, m6, m4, 4 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 2 + palignr m3, m6, m4, 2 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m4 + mova [dstq+stride3q+16 ], m6 + lea dstq, [dstq+strideq*4] + + mova m7, [leftq] + mova m3, [leftq+16] + palignr m5, m3, m7, 15 + palignr m0, m3, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - + pavgb m5, m3 ; Ah - + punpcklbw m3, m2, m5 ; A-B8 ... A-B1 + punpckhbw m2, m5 ; A-B9 ... 
A-Bg + pshufb m3, [GLOBAL(sh_bfedcba9876543210)] + pshufb m2, [GLOBAL(sh_bfedcba9876543210)] + + palignr m7, m6, m4, 14 + palignr m0, m4, m3, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 12 + palignr m0, m4, m3, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 10 + palignr m0, m4, m3, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m6, m4, 8 + palignr m0, m4, m3, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m6, m4, 6 + palignr m0, m4, m3, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 4 + palignr m0, m4, m3, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 2 + palignr m0, m4, m3, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m4 + lea dstq, [dstq+strideq*4] + + palignr m7, m4, m3, 14 + palignr m0, m3, m2, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 12 + palignr m0, m3, m2, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 10 + palignr m0, m3, m2, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m4, m3, 8 + palignr m0, m3, m2, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m4, m3, 6 + palignr m0, m3, m2, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 4 + palignr m0, m3, m2, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 2 + palignr m0, m3, m2, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m3 + + RESTORE_GOT + RET + +INIT_MMX ssse3 +cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset + GET_GOT goffsetq + movd m0, [leftq] ; abcd [byte] + pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte] + pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2 + pavgb m1, m0 ; ab, bc, cd, d [byte] + + punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d + movd [dstq ], m1 + psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d + movd [dstq+strideq], m1 + lea dstq, [dstq+strideq*2] + psrlq m1, 16 ; cd, c3d, d, d + movd [dstq ], m1 + pshufw m1, m1, q1111 ; d, d, d, d + movd [dstq+strideq], m1 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset + GET_GOT goffsetq + movq m3, [leftq] ; abcdefgh [byte] + lea stride3q, [strideq*3] + + pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] + pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] + pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 + pavgb m0, m2 + punpcklbw m0, m3 ; interleaved output + + movq [dstq ], m0 + psrldq m0, 2 + movq [dstq+strideq ], m0 + psrldq m0, 2 + movq [dstq+strideq*2], m0 + psrldq m0, 2 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh + psrldq m0, 2 + movq [dstq ], m0 + psrldq m0, 2 + movq [dstq+strideq ], m0 + psrldq m0, 2 + movq [dstq+strideq*2], m0 + psrldq m0, 2 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset + GET_GOT goffsetq + lea stride3q, [strideq*3] + mova m0, [leftq] ; abcdefghijklmnop [byte] + pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp + pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] + + 
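; m0 holds the 16 left-column pixels, and m1/m2 hold that column
; shifted down by one and by two, so the macro call below forms the
; 3-tap (p[i] + 2*p[i+1] + p[i+2] + 2) >> 2 terms and the pavgb that
; follows forms the 2-tap (p[i] + p[i+1] + 1) >> 1 terms which d207
; interleaves across each output row.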
X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 + pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte] + + punpckhbw m4, m1, m3 ; interleaved input + punpcklbw m1, m3 ; interleaved output + mova [dstq ], m1 + palignr m3, m4, m1, 2 + mova [dstq+strideq ], m3 + palignr m3, m4, m1, 4 + mova [dstq+strideq*2], m3 + palignr m3, m4, m1, 6 + mova [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + palignr m3, m4, m1, 8 + mova [dstq ], m3 + palignr m3, m4, m1, 10 + mova [dstq+strideq ], m3 + palignr m3, m4, m1, 12 + mova [dstq+strideq*2], m3 + palignr m3, m4, m1, 14 + mova [dstq+stride3q ], m3 + DEFINE_ARGS dst, stride, stride3, line + mov lined, 2 + mova m0, [GLOBAL(sh_b23456789abcdefff)] +.loop: + lea dstq, [dstq+strideq*4] + mova [dstq ], m4 + pshufb m4, m0 + mova [dstq+strideq ], m4 + pshufb m4, m0 + mova [dstq+strideq*2], m4 + pshufb m4, m0 + mova [dstq+stride3q ], m4 + pshufb m4, m0 + dec lined + jnz .loop + RESTORE_GOT + REP_RET + +INIT_XMM ssse3 +cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset + GET_GOT goffsetq + lea stride3q, [strideq*3] + mova m1, [leftq] ; 0-15 [byte] + mova m2, [leftq+16] ; 16-31 [byte] + pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] + pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 + palignr m6, m2, m1, 1 + palignr m5, m2, m1, 2 + pavgb m2, m4 ; high 16px even lines + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 + pavgb m1, m6 ; low 16px even lines + + punpckhbw m6, m1, m0 ; interleaved output 2 + punpcklbw m1, m0 ; interleaved output 1 + + punpckhbw m7, m2, m3 ; interleaved output 4 + punpcklbw m2, m3 ; interleaved output 3 + + ; output 1st 8 lines (and half of 2nd 8 lines) + DEFINE_ARGS dst, stride, stride3, dst8 + lea dst8q, [dstq+strideq*8] + mova [dstq ], m1 + mova [dstq +16], m6 + mova [dst8q ], m6 + palignr m0, m6, m1, 2 + palignr m4, m2, m6, 2 + mova [dstq +strideq ], m0 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m0, m6, m1, 4 + palignr m4, m2, m6, 4 + mova [dstq +strideq*2 ], m0 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m0, m6, m1, 6 + palignr m4, m2, m6, 6 + mova [dstq +stride3q ], m0 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq +strideq*4] + lea dst8q, [dst8q+strideq*4] + palignr m0, m6, m1, 8 + palignr m4, m2, m6, 8 + mova [dstq ], m0 + mova [dstq +16], m4 + mova [dst8q ], m4 + palignr m0, m6, m1, 10 + palignr m4, m2, m6, 10 + mova [dstq +strideq ], m0 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m0, m6, m1, 12 + palignr m4, m2, m6, 12 + mova [dstq +strideq*2 ], m0 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m0, m6, m1, 14 + palignr m4, m2, m6, 14 + mova [dstq +stride3q ], m0 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + + ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines + mova [dstq +16], m2 + mova [dst8q ], m2 + palignr m4, m7, m2, 2 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m4, m7, m2, 4 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m4, m7, m2, 6 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + palignr m4, m7, m2, 8 + mova [dstq +16], m4 + mova [dst8q ], m4 + palignr m4, m7, m2, 10 + mova [dstq +strideq +16], m4 + mova [dst8q+strideq ], m4 + palignr m4, m7, m2, 12 + mova [dstq +strideq*2+16], m4 + mova [dst8q+strideq*2 ], m4 + palignr m4, 
m7, m2, 14 + mova [dstq +stride3q +16], m4 + mova [dst8q+stride3q ], m4 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + + ; output 2nd half of 3rd 8 lines and half of 4th 8 lines + mova m0, [GLOBAL(sh_b23456789abcdefff)] + mova [dstq +16], m7 + mova [dst8q ], m7 + pshufb m7, m0 + mova [dstq +strideq +16], m7 + mova [dst8q+strideq ], m7 + pshufb m7, m0 + mova [dstq +strideq*2+16], m7 + mova [dst8q+strideq*2 ], m7 + pshufb m7, m0 + mova [dstq +stride3q +16], m7 + mova [dst8q+stride3q ], m7 + pshufb m7, m0 + lea dstq, [dstq+strideq*4] + lea dst8q, [dst8q+strideq*4] + mova [dstq +16], m7 + mova [dst8q ], m7 + pshufb m7, m0 + mova [dstq +strideq +16], m7 + mova [dst8q+strideq ], m7 + pshufb m7, m0 + mova [dstq +strideq*2+16], m7 + mova [dst8q+strideq*2 ], m7 + pshufb m7, m0 + mova [dstq +stride3q +16], m7 + mova [dst8q+stride3q ], m7 + pshufb m7, m0 + lea dstq, [dstq+strideq*4] + + ; output last half of 4th 8 lines + mova [dstq +16], m7 + mova [dstq +strideq +16], m7 + mova [dstq +strideq*2+16], m7 + mova [dstq +stride3q +16], m7 + lea dstq, [dstq+strideq*4] + mova [dstq +16], m7 + mova [dstq +strideq +16], m7 + mova [dstq +strideq*2+16], m7 + mova [dstq +stride3q +16], m7 + + ; done! + RESTORE_GOT RET diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c new file mode 100644 index 0000000..3c5cb8f --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c @@ -0,0 +1,943 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> /* AVX2 */ + +static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* 
(vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128( + _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), + (__m64 *) (s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), + (__m64 *) (s + 6 * p))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), + _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), + _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), + (__m64 *) (s + 7 * p))); + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), + _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), + _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = 
_mm_add_epi16(p0_16, + _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, + _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, + _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), + 4); + flat2_q4p4 = 
_mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), + 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + + 
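  /* This 16-pixel-wide variant keeps p0-p7 and q0-q7 in separate
   * 128-bit registers instead of packing the p and q halves of eight
   * pixels into one register as the 8-pixel variant above does; only
   * the 16-bit arithmetic of the wide-flat filter further below is
   * widened to 256-bit registers via _mm256_cvtepu8_epi16. */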
const __m128i thresh = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _thresh[0])); + const __m128i limit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _limit[0])); + const __m128i blimit = _mm_broadcastb_epi8( + _mm_cvtsi32_si128((int) _blimit[0])); + + p4 = _mm_loadu_si128((__m128i *) (s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *) (s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *) (s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *) (s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *) (s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *) (s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *) (s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *) (s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *) (s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *) (s + 4 * p)); + + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, + flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, + flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = 
_mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p5 = _mm_loadu_si128((__m128i *) (s - 6 * p)); + q5 = _mm_loadu_si128((__m128i *) (s + 5 * p)); + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); + + flat2 = _mm_max_epu8(work, flat2); + p6 = _mm_loadu_si128((__m128i *) (s - 7 * p)); + q6 = _mm_loadu_si128((__m128i *) (s + 6 * p)); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); + + flat2 = _mm_max_epu8(work, flat2); + + p7 = _mm_loadu_si128((__m128i *) (s - 8 * p)); + q7 = _mm_loadu_si128((__m128i *) (s + 7 * p)); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, + q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, + p256_0, q256_0; + __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, + pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, + res_q; + + p256_7 = _mm256_cvtepu8_epi16(p7); + p256_6 = _mm256_cvtepu8_epi16(p6); + p256_5 = _mm256_cvtepu8_epi16(p5); + p256_4 = _mm256_cvtepu8_epi16(p4); + p256_3 = _mm256_cvtepu8_epi16(p3); + p256_2 = _mm256_cvtepu8_epi16(p2); + p256_1 = _mm256_cvtepu8_epi16(p1); + p256_0 = _mm256_cvtepu8_epi16(p0); + q256_0 = _mm256_cvtepu8_epi16(q0); + q256_1 = _mm256_cvtepu8_epi16(q1); + q256_2 = _mm256_cvtepu8_epi16(q2); + q256_3 = _mm256_cvtepu8_epi16(q3); + q256_4 = _mm256_cvtepu8_epi16(q4); + q256_5 = _mm256_cvtepu8_epi16(q5); + q256_6 = _mm256_cvtepu8_epi16(q6); + q256_7 = _mm256_cvtepu8_epi16(q7); + + pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), + _mm256_add_epi16(p256_4, p256_3)); + 
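      /* pixelFilter_p/q build the wide-filter running sums once, and
       * every later output row only nudges them. As a scalar sketch of
       * the data flow (not the exact register scheduling): with
       *   S = p7 + p6 + ... + p0 + q0 + ... + q6 + 8
       * the p-side outputs are
       *   out_p0 = (S + p0) >> 4
       *   for n = 1..6: S += p7 - q[7 - n]; out_pn = (S + p[n]) >> 4
       * and symmetrically for the q side, instead of re-summing all
       * taps per row. */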
pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), + _mm256_add_epi16(q256_4, q256_3)); + + pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, + _mm256_add_epi16(p256_2, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, + _mm256_add_epi16(q256_2, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + + pixelFilter_p = _mm256_add_epi16(eight, + _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); + + pixetFilter_p2p1p0 = _mm256_add_epi16(four, + _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(p256_7, p256_0)), 4); + + flat2_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(q256_7, q256_0)), 4); + + flat2_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(p256_3, p256_0)), 3); + + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(q256_3, q256_0)), 3); + + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(p256_7, p256_7); + + sum_q7 = _mm256_add_epi16(q256_7, q256_7); + + sum_p3 = _mm256_add_epi16(p256_3, p256_3); + + sum_q3 = _mm256_add_epi16(q256_3, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_1)), 4); + + flat2_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_1)), 4); + + flat2_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_1)), 3); + + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_1)), 3); + + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + sum_p3 = _mm256_add_epi16(sum_p3, p256_3); + + sum_q3 = _mm256_add_epi16(sum_q3, q256_3); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_2)), 4); + + flat2_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_2)), 4); + + flat2_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 
168)); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_2)), 3); + + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_2)), 3); + + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_3)), 4); + + flat2_p3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_3)), 4); + + flat2_q3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_4)), 4); + + flat2_p4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_4)), 4); + + flat2_q4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_5)), 4); + + flat2_p5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_5)), 4); + + flat2_q5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, + _mm256_add_epi16(sum_p7, p256_6)), 4); + + flat2_p6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), + 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, + _mm256_add_epi16(sum_q7, q256_6)), 4); + + flat2_q6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), + 168)); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + p2 = _mm_andnot_si128(flat, p2); + flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, 
flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + p6 = _mm_andnot_si128(flat2, p6); + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + p6 = _mm_or_si128(flat2_p6, p6); + _mm_storeu_si128((__m128i *) (s - 7 * p), p6); + + p5 = _mm_andnot_si128(flat2, p5); + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + p5 = _mm_or_si128(flat2_p5, p5); + _mm_storeu_si128((__m128i *) (s - 6 * p), p5); + + p4 = _mm_andnot_si128(flat2, p4); + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + p4 = _mm_or_si128(flat2_p4, p4); + _mm_storeu_si128((__m128i *) (s - 5 * p), p4); + + p3 = _mm_andnot_si128(flat2, p3); + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + p3 = _mm_or_si128(flat2_p3, p3); + _mm_storeu_si128((__m128i *) (s - 4 * p), p3); + + p2 = _mm_andnot_si128(flat2, p2); + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + p2 = _mm_or_si128(flat2_p2, p2); + _mm_storeu_si128((__m128i *) (s - 3 * p), p2); + + p1 = _mm_andnot_si128(flat2, p1); + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + p1 = _mm_or_si128(flat2_p1, p1); + _mm_storeu_si128((__m128i *) (s - 2 * p), p1); + + p0 = _mm_andnot_si128(flat2, p0); + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + p0 = _mm_or_si128(flat2_p0, p0); + _mm_storeu_si128((__m128i *) (s - 1 * p), p0); + + q0 = _mm_andnot_si128(flat2, q0); + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + q0 = _mm_or_si128(flat2_q0, q0); + _mm_storeu_si128((__m128i *) (s - 0 * p), q0); + + q1 = _mm_andnot_si128(flat2, q1); + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + q1 = _mm_or_si128(flat2_q1, q1); + _mm_storeu_si128((__m128i *) (s + 1 * p), q1); + + q2 = _mm_andnot_si128(flat2, q2); + flat2_q2 = _mm_and_si128(flat2, flat2_q2); + q2 = _mm_or_si128(flat2_q2, q2); + _mm_storeu_si128((__m128i *) (s + 2 * p), q2); + + q3 = _mm_andnot_si128(flat2, q3); + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + q3 = _mm_or_si128(flat2_q3, q3); + _mm_storeu_si128((__m128i *) (s + 3 * p), q3); + + q4 = _mm_andnot_si128(flat2, q4); + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + q4 = _mm_or_si128(flat2_q4, q4); + _mm_storeu_si128((__m128i *) (s + 4 * p), q4); + + q5 = _mm_andnot_si128(flat2, q5); + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + q5 = _mm_or_si128(flat2_q5, q5); + _mm_storeu_si128((__m128i *) (s + 5 * p), q5); + + q6 = _mm_andnot_si128(flat2, q6); + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + q6 = _mm_or_si128(flat2_q6, q6); + _mm_storeu_si128((__m128i *) (s + 6 * p), q6); + } +} + +void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh, int count) { + if (count == 1) + mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh); + else + mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh); +} diff --git a/libvpx/vp9/common/x86/vp9_postproc_x86.h b/libvpx/vp9/common/x86/vp9_postproc_x86.h index b0e8b18..8870215 100644 --- a/libvpx/vp9/common/x86/vp9_postproc_x86.h +++ b/libvpx/vp9/common/x86/vp9_postproc_x86.h @@ -61,4 +61,4 @@ extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt); #endif #endif -#endif +#endif // VP9_COMMON_X86_VP9_POSTPROC_X86_H_ diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm b/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm new file 
mode 100644 index 0000000..9dc8d0a --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm @@ -0,0 +1,987 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklqdq xmm0, xmm1 + punpcklqdq xmm2, xmm3 + punpcklqdq xmm5, xmm4 + punpcklqdq xmm6, xmm7 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm2 + movdqa k5k4, xmm5 + movdqa k6k7, xmm6 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro APPLY_FILTER_4 1 + punpckldq xmm0, xmm1 ;two row in one register + punpckldq xmm6, xmm7 + punpckldq xmm2, xmm3 + punpckldq xmm5, xmm4 + + punpcklbw xmm0, zero ;unpack to word + punpcklbw xmm6, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + + pmullw xmm0, k0k1 ;multiply the filter factors + pmullw xmm6, k6k7 + pmullw xmm2, k2k3 + pmullw xmm5, k5k4 + + paddsw xmm0, xmm6 ;sum + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + paddsw xmm0, xmm2 + psrldq xmm2, 8 + paddsw xmm0, xmm5 + psrldq xmm5, 8 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 +%endm + +%macro GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm0 + punpcklwd xmm1, xmm1 + punpcklwd xmm2, xmm2 + punpcklwd xmm3, xmm3 + punpckhwd xmm4, xmm4 + punpckhwd xmm5, xmm5 + punpckhwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movdqa k0, xmm0 ;store filter factors on stack + movdqa k1, xmm1 + movdqa k2, xmm2 + movdqa k3, xmm3 + movdqa k4, xmm4 + movdqa k5, xmm5 + movdqa k6, xmm6 + movdqa k7, xmm7 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro LOAD_VERT_8 1 + movq xmm0, [rsi + %1] ;0 + movq xmm1, [rsi + rax + %1] ;1 + movq xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2 + %1] ;7 + movq xmm2, [rsi + rax + %1] ;2 + movq xmm3, [rsi + rax * 2 + %1] ;3 + movq xmm4, [rsi + rdx + %1] ;4 + movq xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro APPLY_FILTER_8 2 + punpcklbw xmm0, zero + punpcklbw xmm1, zero + punpcklbw xmm6, zero + punpcklbw xmm7, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + punpcklbw xmm3, zero + punpcklbw xmm4, zero + + 
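; All eight input rows are now zero-extended to words; each is scaled
; by its tap and accumulated with saturating adds, taps 3 and 4 last
; (see the overflow note at the top of this file), before krd is added
; for rounding and psraw by 7 rescales back to the pixel range.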
pmullw xmm0, k0 + pmullw xmm1, k1 + pmullw xmm6, k6 + pmullw xmm7, k7 + pmullw xmm2, k2 + pmullw xmm5, k5 + pmullw xmm3, k3 + pmullw xmm4, k4 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm6 + paddsw xmm0, xmm7 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + paddsw xmm0, xmm3 + paddsw xmm0, xmm4 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi + %2] + pavgb xmm0, xmm1 +%endif + movq [rdi + %2], xmm0 +%endm + +;void vp9_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_v8_sse2) PRIVATE +sym(vp9_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_v8_sse2) PRIVATE +sym(vp9_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_v8_sse2) PRIVATE +sym(vp9_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define 
k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 0, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 1, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop 
rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_h8_sse2) PRIVATE +sym(vp9_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_h8_sse2) PRIVATE +sym(vp9_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_h8_sse2) PRIVATE +sym(vp9_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + 
movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 
+ %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index bbf9888..7a5cca0 100644 --- a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -534,6 +534,21 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): ret ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%macro HORIZx4_ROW 2 + movdqa %2, %1 + pshufb %1, [GLOBAL(shuf_t0t1)] + pshufb %2, [GLOBAL(shuf_t2t3)] + pmaddubsw %1, xmm6 + pmaddubsw %2, xmm7 + + paddsw %1, %2 + movdqa %2, %1 + psrldq %2, 8 + paddsw %1, %2 + paddsw %1, xmm5 + psraw %1, 7 + packuswb %1, %1 +%endm %macro HORIZx4 1 mov rdx, arg(5) ;filter ptr @@ -544,64 +559,84 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): movdqa xmm4, [rdx] ;load filters movq xmm5, rcx packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 + pshuflw xmm6, xmm4, 0b ;k0_k1 + pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5 + pshuflw xmm7, xmm4, 01010101b ;k2_k3 + pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 + pshufd xmm5, xmm5, 0 ;rounding movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height - + shr rcx, 1 .loop: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 - - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 - punpcklqdq xmm0, xmm3 + ;Do two rows once + movq xmm0, [rsi - 3] ;load src + movq xmm1, [rsi + 5] + movq xmm2, [rsi + rax - 3] + movq xmm3, [rsi + rax + 5] + punpcklqdq xmm0, xmm1 + punpcklqdq xmm2, xmm3 + + HORIZx4_ROW xmm0, xmm1 + HORIZx4_ROW xmm2, xmm3 +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 + movd xmm3, [rdi + rdx] + pavgb xmm2, xmm3 +%endif + movd [rdi], xmm0 + movd [rdi +rdx], xmm2 - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 + lea rsi, [rsi + rax] + prefetcht0 [rsi + 4 * rax - 3] + lea rsi, [rsi + rax] + lea rdi, [rdi + 2 * rdx] + prefetcht0 [rsi + 2 * rax - 3] - movdqa xmm2, xmm1 - pshufb 
xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 + dec rcx + jnz .loop - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 + ; Do last row if output_height is odd + movsxd rcx, dword ptr arg(4) ;output_height + and rcx, 1 + je .done - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 + movq xmm0, [rsi - 3] ; load src + movq xmm1, [rsi + 5] + punpcklqdq xmm0, xmm1 - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, xmm0 + HORIZx4_ROW xmm0, xmm1 %if %1 movd xmm1, [rdi] pavgb xmm0, xmm1 %endif - lea rsi, [rsi + rax] movd [rdi], xmm0 +.done +%endm - lea rdi, [rdi + rdx] - dec rcx - jnz .loop +%macro HORIZx8_ROW 4 + movdqa %2, %1 + movdqa %3, %1 + movdqa %4, %1 + + pshufb %1, [GLOBAL(shuf_t0t1)] + pshufb %2, [GLOBAL(shuf_t2t3)] + pshufb %3, [GLOBAL(shuf_t4t5)] + pshufb %4, [GLOBAL(shuf_t6t7)] + + pmaddubsw %1, k0k1 + pmaddubsw %2, k2k3 + pmaddubsw %3, k4k5 + pmaddubsw %4, k6k7 + + paddsw %1, %2 + paddsw %1, %4 + paddsw %1, %3 + paddsw %1, krd + psraw %1, 7 + packuswb %1, %1 %endm %macro HORIZx8 1 @@ -633,45 +668,51 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height + shr rcx, 1 .loop: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 - - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 + movq xmm0, [rsi - 3] ;load src + movq xmm3, [rsi + 5] + movq xmm4, [rsi + rax - 3] + movq xmm7, [rsi + rax + 5] punpcklqdq xmm0, xmm3 + punpcklqdq xmm4, xmm7 - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 + HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 + HORIZx8_ROW xmm4, xmm5, xmm6, xmm7 +%if %1 + movq xmm1, [rdi] + movq xmm2, [rdi + rdx] + pavgb xmm0, xmm1 + pavgb xmm4, xmm2 +%endif + movq [rdi], xmm0 + movq [rdi + rdx], xmm4 - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 + lea rsi, [rsi + rax] + prefetcht0 [rsi + 4 * rax - 3] + lea rsi, [rsi + rax] + lea rdi, [rdi + 2 * rdx] + prefetcht0 [rsi + 2 * rax - 3] + dec rcx + jnz .loop - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 + ;Do last row if output_height is odd + movsxd rcx, dword ptr arg(4) ;output_height + and rcx, 1 + je .done - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 + movq xmm0, [rsi - 3] + movq xmm3, [rsi + 5] + punpcklqdq xmm0, xmm3 - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, xmm0 + HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 %if %1 movq xmm1, [rdi] pavgb xmm0, xmm1 %endif - - lea rsi, [rsi + rax] movq [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .loop +.done %endm %macro HORIZx16 1 @@ -705,60 +746,53 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): movsxd rcx, dword ptr arg(4) ;output_height .loop: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + prefetcht0 [rsi + 2 * rax -3] - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 - punpcklqdq xmm0, xmm3 + movq xmm0, [rsi - 3] ;load src data + movq xmm4, [rsi + 5] + movq xmm7, [rsi + 13] + punpcklqdq xmm0, xmm4 + punpcklqdq xmm4, xmm7 movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 - movdqa xmm2, xmm1 + pshufb xmm0, [GLOBAL(shuf_t0t1)] pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 + pshufb xmm3, [GLOBAL(shuf_t6t7)] + pshufb xmm4, 
[GLOBAL(shuf_t0t1)] + pshufb xmm5, [GLOBAL(shuf_t2t3)] + pshufb xmm6, [GLOBAL(shuf_t4t5)] + pshufb xmm7, [GLOBAL(shuf_t6t7)] - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 + pmaddubsw xmm0, k0k1 + pmaddubsw xmm1, k2k3 + pmaddubsw xmm2, k4k5 + pmaddubsw xmm3, k6k7 + pmaddubsw xmm4, k0k1 + pmaddubsw xmm5, k2k3 + pmaddubsw xmm6, k4k5 + pmaddubsw xmm7, k6k7 paddsw xmm0, xmm1 - paddsw xmm0, xmm4 + paddsw xmm0, xmm3 paddsw xmm0, xmm2 + paddsw xmm4, xmm5 + paddsw xmm4, xmm7 + paddsw xmm4, xmm6 + paddsw xmm0, krd + paddsw xmm4, krd psraw xmm0, 7 + psraw xmm4, 7 packuswb xmm0, xmm0 - - - movq xmm3, [rsi + 5] - movq xmm7, [rsi + 13] - punpcklqdq xmm3, xmm7 - - movdqa xmm1, xmm3 - pshufb xmm3, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm3, k0k1 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm3, xmm1 - paddsw xmm3, xmm4 - paddsw xmm3, xmm2 - paddsw xmm3, krd - psraw xmm3, 7 - packuswb xmm3, xmm3 - punpcklqdq xmm0, xmm3 + packuswb xmm4, xmm4 + punpcklqdq xmm0, xmm4 %if %1 movdqa xmm1, [rdi] pavgb xmm0, xmm1 @@ -792,19 +826,8 @@ sym(vp9_filter_block1d4_h8_ssse3): push rdi ; end prolog - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - HORIZx4 0 - add rsp, 16*5 - pop rsp - ; begin epilog pop rdi pop rsi @@ -909,19 +932,8 @@ sym(vp9_filter_block1d4_h8_avg_ssse3): push rdi ; end prolog - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - HORIZx4 1 - add rsp, 16*5 - pop rsp - ; begin epilog pop rdi pop rsi diff --git a/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm b/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm deleted file mode 100644 index 174e747..0000000 --- a/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm +++ /dev/null @@ -1,230 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - EXPORT |vp9_add_constant_residual_8x8_neon| - EXPORT |vp9_add_constant_residual_16x16_neon| - EXPORT |vp9_add_constant_residual_32x32_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - MACRO - LD_16x8 $src, $stride - vld1.8 {q8}, [$src], $stride - vld1.8 {q9}, [$src], $stride - vld1.8 {q10}, [$src], $stride - vld1.8 {q11}, [$src], $stride - vld1.8 {q12}, [$src], $stride - vld1.8 {q13}, [$src], $stride - vld1.8 {q14}, [$src], $stride - vld1.8 {q15}, [$src], $stride - MEND - - MACRO - ADD_DIFF_16x8 $diff - vqadd.u8 q8, q8, $diff - vqadd.u8 q9, q9, $diff - vqadd.u8 q10, q10, $diff - vqadd.u8 q11, q11, $diff - vqadd.u8 q12, q12, $diff - vqadd.u8 q13, q13, $diff - vqadd.u8 q14, q14, $diff - vqadd.u8 q15, q15, $diff - MEND - - MACRO - SUB_DIFF_16x8 $diff - vqsub.u8 q8, q8, $diff - vqsub.u8 q9, q9, $diff - vqsub.u8 q10, q10, $diff - vqsub.u8 q11, q11, $diff - vqsub.u8 q12, q12, $diff - vqsub.u8 q13, q13, $diff - vqsub.u8 q14, q14, $diff - vqsub.u8 q15, q15, $diff - MEND - - MACRO - ST_16x8 $dst, $stride - vst1.8 {q8}, [$dst], $stride - vst1.8 {q9}, [$dst], $stride - vst1.8 {q10}, [$dst], $stride - vst1.8 {q11}, [$dst], $stride - vst1.8 {q12}, [$dst], $stride - vst1.8 {q13}, [$dst], $stride - vst1.8 {q14}, [$dst], $stride - vst1.8 {q15}, [$dst], $stride - MEND - -; void add_constant_residual(const int16_t diff, uint8_t *dest, int stride, -; int width, int height) { -; int r, c; -; -; for (r = 0; r < height; r++) { -; for (c = 0; c < width; c++) -; dest[c] = clip_pixel(diff + dest[c]); -; -; dest += stride; -; } -;} -;void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, -; int stride) { -; add_constant_residual(diff, dest, stride, 8, 8); -;} -; r0 : const int16_t diff -; r1 : const uint8_t *dest -; r2 : int stride -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -|vp9_add_constant_residual_8x8_neon| PROC - mov r3, r1 ; r3: save dest to r3 - vld1.8 {d0}, [r1], r2 - vld1.8 {d1}, [r1], r2 - vld1.8 {d2}, [r1], r2 - vld1.8 {d3}, [r1], r2 - vld1.8 {d4}, [r1], r2 - vld1.8 {d5}, [r1], r2 - vld1.8 {d6}, [r1], r2 - vld1.8 {d7}, [r1], r2 - cmp r0, #0 - bge DIFF_POSITIVE_8x8 - -DIFF_NEGATIVE_8x8 ; diff < 0 - neg r0, r0 - usat r0, #8, r0 - vdup.u8 q8, r0 - - vqsub.u8 q0, q0, q8 - vqsub.u8 q1, q1, q8 - vqsub.u8 q2, q2, q8 - vqsub.u8 q3, q3, q8 - b DIFF_SAVE_8x8 - -DIFF_POSITIVE_8x8 ; diff >= 0 - usat r0, #8, r0 - vdup.u8 q8, r0 - - vqadd.u8 q0, q0, q8 - vqadd.u8 q1, q1, q8 - vqadd.u8 q2, q2, q8 - vqadd.u8 q3, q3, q8 - -DIFF_SAVE_8x8 - vst1.8 {d0}, [r3], r2 - vst1.8 {d1}, [r3], r2 - vst1.8 {d2}, [r3], r2 - vst1.8 {d3}, [r3], r2 - vst1.8 {d4}, [r3], r2 - vst1.8 {d5}, [r3], r2 - vst1.8 {d6}, [r3], r2 - vst1.8 {d7}, [r3], r2 - - bx lr - ENDP - -;void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest, -; int stride) { -; add_constant_residual(diff, dest, stride, 16, 16); -;} -; r0 : const int16_t diff -; r1 : const uint8_t *dest -; r2 : int stride -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -|vp9_add_constant_residual_16x16_neon| PROC - mov r3, r1 - LD_16x8 r1, r2 - cmp r0, #0 - bge DIFF_POSITIVE_16x16 - -|DIFF_NEGATIVE_16x16| - neg r0, r0 - usat r0, #8, r0 - vdup.u8 q0, r0 - - SUB_DIFF_16x8 q0 - ST_16x8 r3, r2 - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - b DIFF_SAVE_16x16 - -|DIFF_POSITIVE_16x16| - usat r0, #8, r0 - vdup.u8 q0, r0 - - ADD_DIFF_16x8 q0 - ST_16x8 r3, r2 - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - -|DIFF_SAVE_16x16| - ST_16x8 r3, r2 - bx lr - ENDP - -;void vp9_add_constant_residual_32x32_c(const int16_t diff, 
uint8_t *dest, -; int stride) { -; add_constant_residual(diff, dest, stride, 32, 32); -;} -; r0 : const int16_t diff -; r1 : const uint8_t *dest -; r2 : int stride -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -|vp9_add_constant_residual_32x32_neon| PROC - push {r4,lr} - pld [r1] - mov r3, r1 - add r4, r1, #16 ; r4 dest + 16 for second loop - cmp r0, #0 - bge DIFF_POSITIVE_32x32 - -|DIFF_NEGATIVE_32x32| - neg r0, r0 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -|DIFF_NEGATIVE_32x32_LOOP| - sub r0, #1 - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r3, r2 - - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r3, r2 - cmp r0, #2 - moveq r1, r4 - moveq r3, r4 - cmp r0, #0 - bne DIFF_NEGATIVE_32x32_LOOP - pop {r4,pc} - -|DIFF_POSITIVE_32x32| - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -|DIFF_POSITIVE_32x32_LOOP| - sub r0, #1 - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r3, r2 - - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r3, r2 - cmp r0, #2 - moveq r1, r4 - moveq r3, r4 - cmp r0, #0 - bne DIFF_POSITIVE_32x32_LOOP - pop {r4,pc} - ENDP - - END diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.h b/libvpx/vp9/decoder/vp9_dboolhuff.h index c864516..fd8e74c 100644 --- a/libvpx/vp9/decoder/vp9_dboolhuff.h +++ b/libvpx/vp9/decoder/vp9_dboolhuff.h @@ -44,7 +44,7 @@ static int vp9_read(vp9_reader *br, int probability) { VP9_BD_VALUE bigsplit; int count; unsigned int range; - unsigned int split = 1 + (((br->range - 1) * probability) >> 8); + unsigned int split = ((br->range * probability) + (256 - probability)) >> 8; if (br->count < 0) vp9_reader_fill(br); diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index 84a29b1..9792d2c 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -23,18 +23,36 @@ #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decodframe.h" #include "vp9/decoder/vp9_onyxd_int.h" -#include "vp9/decoder/vp9_dsubexp.h" #include "vp9/decoder/vp9_treereader.h" static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p); } +static MB_PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r, + int size_group) { + const MB_PREDICTION_MODE y_mode = read_intra_mode(r, + cm->fc.y_mode_prob[size_group]); + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.y_mode[size_group][y_mode]; + return y_mode; +} + +static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r, + MB_PREDICTION_MODE y_mode) { + const MB_PREDICTION_MODE uv_mode = read_intra_mode(r, + cm->fc.uv_mode_prob[y_mode]); + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.uv_mode[y_mode][uv_mode]; + return uv_mode; +} + static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, uint8_t context) { - MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree, - cm->fc.inter_mode_probs[context]); - ++cm->counts.inter_mode[context][inter_mode_offset(mode)]; + const MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree, + cm->fc.inter_mode_probs[context]); + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.inter_mode[context][inter_mode_offset(mode)]; return mode; } @@ -53,33 +71,28 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, tx_size += vp9_read(r, tx_probs[2]); } - update_tx_counts(bsize, context, tx_size, &cm->counts.tx); + if (!cm->frame_parallel_decoding_mode) + ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size]; return tx_size; } -static TX_SIZE 
read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode, - BLOCK_SIZE bsize, int allow_select, +static TX_SIZE read_tx_size(VP9_COMMON *const cm, MACROBLOCKD *const xd, + TX_MODE tx_mode, BLOCK_SIZE bsize, int allow_select, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - - if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) + if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) { return read_selected_tx_size(cm, xd, bsize, r); - else if (tx_mode >= ALLOW_32X32 && bsize >= BLOCK_32X32) - return TX_32X32; - else if (tx_mode >= ALLOW_16X16 && bsize >= BLOCK_16X16) - return TX_16X16; - else if (tx_mode >= ALLOW_8X8 && bsize >= BLOCK_8X8) - return TX_8X8; - else - return TX_4X4; + } else { + const TX_SIZE max_tx_size_block = max_txsize_lookup[bsize]; + const TX_SIZE max_tx_size_txmode = tx_mode_to_biggest_tx_size[tx_mode]; + return MIN(max_tx_size_block, max_tx_size_txmode); + } } static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize, int mi_row, int mi_col, int segment_id) { const int mi_offset = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; const int xmis = MIN(cm->mi_cols - mi_col, bw); const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y; @@ -91,11 +104,11 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize, cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; } -static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, +static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, vp9_reader *r) { - MACROBLOCKD *const xd = &pbi->mb; - struct segmentation *const seg = &pbi->common.seg; - const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; + struct segmentation *const seg = &cm->seg; + const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type; int segment_id; if (!seg->enabled) @@ -105,16 +118,14 @@ static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, return 0; segment_id = read_segment_id(r, seg); - set_segment_id(&pbi->common, bsize, mi_row, mi_col, segment_id); + set_segment_id(cm, bsize, mi_row, mi_col, segment_id); return segment_id; } -static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, vp9_reader *r) { struct segmentation *const seg = &cm->seg; - const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type; int pred_segment_id, segment_id; if (!seg->enabled) @@ -138,37 +149,37 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, return segment_id; } -static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - int skip_coeff = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); - if (!skip_coeff) { +static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, vp9_reader *r) { + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { const int ctx = vp9_get_pred_context_mbskip(xd); - skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd)); - cm->counts.mbskip[ctx][skip_coeff]++; + const int skip 
= vp9_read(r, cm->fc.mbskip_probs[ctx]); + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.mbskip[ctx][skip]; + return skip; } - return skip_coeff; } -static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m, +static void read_intra_frame_mode_info(VP9_COMMON *const cm, + MACROBLOCKD *const xd, + MODE_INFO *const m, int mi_row, int mi_col, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; MB_MODE_INFO *const mbmi = &m->mbmi; const BLOCK_SIZE bsize = mbmi->sb_type; const MODE_INFO *above_mi = xd->mi_8x8[-cm->mode_info_stride]; - const MODE_INFO *left_mi = xd->mi_8x8[-1]; + const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL; - mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r); - mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); - mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r); + mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r); + mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r); + mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, bsize, 1, r); mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; if (bsize >= BLOCK_8X8) { const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0); - const MB_PREDICTION_MODE L = xd->left_available ? - left_block_mode(m, left_mi, 0) : DC_PRED; + const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, 0); mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); } else { // Only 4x4, 4x8, 8x4 blocks @@ -180,8 +191,7 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m, for (idx = 0; idx < 2; idx += num_4x4_w) { const int ib = idy * 2 + idx; const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, ib); - const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
- left_block_mode(m, left_mi, ib) : DC_PRED; + const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, ib); const MB_PREDICTION_MODE b_mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); m->bmi[ib].as_mode = b_mode; @@ -200,7 +210,6 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m, static int read_mv_component(vp9_reader *r, const nmv_component *mvcomp, int usehp) { - int mag, d, fr, hp; const int sign = vp9_read(r, mvcomp->sign); const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes); @@ -251,56 +260,10 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, mv->col = ref->col + diff.col; } -static void update_mv(vp9_reader *r, vp9_prob *p) { - if (vp9_read(r, NMV_UPDATE_PROB)) - *p = (vp9_read_literal(r, 7) << 1) | 1; -} - -static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) { - int i, j, k; - - for (j = 0; j < MV_JOINTS - 1; ++j) - update_mv(r, &mvc->joints[j]); - - for (i = 0; i < 2; ++i) { - nmv_component *const comp = &mvc->comps[i]; - - update_mv(r, &comp->sign); - - for (j = 0; j < MV_CLASSES - 1; ++j) - update_mv(r, &comp->classes[j]); - - for (j = 0; j < CLASS0_SIZE - 1; ++j) - update_mv(r, &comp->class0[j]); - - for (j = 0; j < MV_OFFSET_BITS; ++j) - update_mv(r, &comp->bits[j]); - } - - for (i = 0; i < 2; ++i) { - nmv_component *const comp = &mvc->comps[i]; - - for (j = 0; j < CLASS0_SIZE; ++j) - for (k = 0; k < 3; ++k) - update_mv(r, &comp->class0_fp[j][k]); - - for (j = 0; j < 3; ++j) - update_mv(r, &comp->fp[j]); - } - - if (allow_hp) { - for (i = 0; i < 2; ++i) { - update_mv(r, &mvc->comps[i].class0_hp); - update_mv(r, &mvc->comps[i].hp); - } - } -} - // Read the referncence frame -static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r, +static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, + vp9_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; @@ -313,7 +276,8 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r, if (cm->comp_pred_mode == HYBRID_PREDICTION) { is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]); - counts->comp_inter[comp_ctx][is_comp]++; + if (!cm->frame_parallel_decoding_mode) + ++counts->comp_inter[comp_ctx][is_comp]; } else { is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY; } @@ -323,18 +287,21 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r, const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; const int ref_ctx = vp9_get_pred_context_comp_ref_p(cm, xd); const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]); - counts->comp_ref[ref_ctx][b]++; + if (!cm->frame_parallel_decoding_mode) + ++counts->comp_ref[ref_ctx][b]; ref_frame[fix_ref_idx] = cm->comp_fixed_ref; ref_frame[!fix_ref_idx] = cm->comp_var_ref[b]; } else { const int ctx0 = vp9_get_pred_context_single_ref_p1(xd); const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]); - ++counts->single_ref[ctx0][0][bit0]; + if (!cm->frame_parallel_decoding_mode) + ++counts->single_ref[ctx0][0][bit0]; if (bit0) { const int ctx1 = vp9_get_pred_context_single_ref_p2(xd); const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]); ref_frame[0] = bit1 ? 
ALTREF_FRAME : GOLDEN_FRAME; - ++counts->single_ref[ctx1][1][bit1]; + if (!cm->frame_parallel_decoding_mode) + ++counts->single_ref[ctx1][1][bit1]; } else { ref_frame[0] = LAST_FRAME; } @@ -344,43 +311,19 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r, } } -static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) { - int i, j; - for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j) - for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); -} -static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) { - int i, j; - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - for (j = 0; j < INTER_MODES - 1; ++j) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); -} - -static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) { - COMPPREDMODE_TYPE mode = vp9_read_bit(r); - if (mode) - mode += vp9_read_bit(r); - return mode; -} - -static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( - VP9D_COMP *pbi, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static INLINE INTERPOLATION_TYPE read_switchable_filter_type( + VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) { const int ctx = vp9_get_pred_context_switchable_interp(xd); const int type = treed_read(r, vp9_switchable_interp_tree, cm->fc.switchable_interp_prob[ctx]); - ++cm->counts.switchable_interp[ctx][type]; + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.switchable_interp[ctx][type]; return type; } -static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, - vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; +static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi, + vp9_reader *r) { MB_MODE_INFO *const mbmi = &mi->mbmi; const BLOCK_SIZE bsize = mi->mbmi.sb_type; @@ -388,9 +331,7 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, mbmi->ref_frame[1] = NONE; if (bsize >= BLOCK_8X8) { - const int size_group = size_group_lookup[bsize]; - mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]); - cm->counts.y_mode[size_group][mbmi->mode]++; + mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]); } else { // Only 4x4, 4x8, 8x4 blocks const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 @@ -400,10 +341,8 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { const int ib = idy * 2 + idx; - const int b_mode = read_intra_mode(r, cm->fc.y_mode_prob[0]); + const int b_mode = read_intra_mode_y(cm, r, 0); mi->bmi[ib].as_mode = b_mode; - cm->counts.y_mode[0][b_mode]++; - if (num_4x4_h == 2) mi->bmi[ib + 2].as_mode = b_mode; if (num_4x4_w == 2) @@ -413,55 +352,98 @@ static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, mbmi->mode = mi->bmi[3].as_mode; } - mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]); - cm->counts.uv_mode[mbmi->mode][mbmi->uv_mode]++; + mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode); } -static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, + int_mv mv[2], int_mv best_mv[2], + int_mv nearest_mv[2], int_mv near_mv[2], + int is_compound, int allow_hp, vp9_reader *r) { + int i; + int ret = 1; + + switch (mode) { 
+ case NEWMV: { + nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ? + NULL : &cm->counts.mv; + read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv, + &cm->fc.nmvc, mv_counts, allow_hp); + if (is_compound) + read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv, + &cm->fc.nmvc, mv_counts, allow_hp); + for (i = 0; i < 1 + is_compound; ++i) { + ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW; + ret = ret && mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW; + } + break; + } + case NEARESTMV: { + mv[0].as_int = nearest_mv[0].as_int; + if (is_compound) mv[1].as_int = nearest_mv[1].as_int; + break; + } + case NEARMV: { + mv[0].as_int = near_mv[0].as_int; + if (is_compound) mv[1].as_int = near_mv[1].as_int; + break; + } + case ZEROMV: { + mv[0].as_int = 0; + if (is_compound) mv[1].as_int = 0; + break; + } + default: { + return 0; + } + } + return ret; +} +static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int segment_id, vp9_reader *r) { if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; } else { const int ctx = vp9_get_pred_context_intra_inter(xd); const int is_inter = vp9_read(r, vp9_get_pred_prob_intra_inter(cm, xd)); - ++cm->counts.intra_inter[ctx][is_inter]; + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.intra_inter[ctx][is_inter]; return is_inter; } } -static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, +static void read_inter_block_mode_info(VP9_COMMON *const cm, + MACROBLOCKD *const xd, + const TileInfo *const tile, + MODE_INFO *const mi, int mi_row, int mi_col, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - nmv_context *const nmvc = &cm->fc.nmvc; MB_MODE_INFO *const mbmi = &mi->mbmi; - int_mv *const mv0 = &mbmi->mv[0]; - int_mv *const mv1 = &mbmi->mv[1]; const BLOCK_SIZE bsize = mbmi->sb_type; - const int allow_hp = xd->allow_high_precision_mv; + const int allow_hp = cm->allow_high_precision_mv; - int_mv nearest, nearby, best_mv; - int_mv nearest_second, nearby_second, best_mv_second; + int_mv nearest[2], nearmv[2], best[2]; uint8_t inter_mode_ctx; MV_REFERENCE_FRAME ref0; int is_compound; mbmi->uv_mode = DC_PRED; - read_ref_frames(pbi, r, mbmi->segment_id, mbmi->ref_frame); + read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); ref0 = mbmi->ref_frame[0]; is_compound = has_second_ref(mbmi); - vp9_find_mv_refs(cm, xd, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0], + vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0], mi_row, mi_col); inter_mode_ctx = mbmi->mode_context[ref0]; if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { mbmi->mode = ZEROMV; - assert(bsize >= BLOCK_8X8); + if (bsize < BLOCK_8X8) { + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid usage of segement feature on small blocks"); + return; + } } else { if (bsize >= BLOCK_8X8) mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx); @@ -469,222 +451,119 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, // nearest, nearby if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { - vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby); - best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int; + vp9_find_best_ref_mvs(xd, allow_hp, + mbmi->ref_mvs[ref0], &nearest[0], &nearmv[0]); + best[0].as_int = nearest[0].as_int; } if (is_compound) { const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; - vp9_find_mv_refs(cm, xd, mi, 
xd->last_mi, + vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref1, mbmi->ref_mvs[ref1], mi_row, mi_col); if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { - vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1], - &nearest_second, &nearby_second); - best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int; + vp9_find_best_ref_mvs(xd, allow_hp, + mbmi->ref_mvs[ref1], &nearest[1], &nearmv[1]); + best[1].as_int = nearest[1].as_int; } } - mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE - ? read_switchable_filter_type(pbi, r) - : cm->mcomp_filter_type; + mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE) + ? read_switchable_filter_type(cm, xd, r) + : cm->mcomp_filter_type; if (bsize < BLOCK_8X8) { const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; + int b_mode; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { - int_mv blockmv, secondmv; + int_mv block[2]; const int j = idy * 2 + idx; - const int b_mode = read_inter_mode(cm, r, inter_mode_ctx); + b_mode = read_inter_mode(cm, r, inter_mode_ctx); if (b_mode == NEARESTMV || b_mode == NEARMV) { - vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0, + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[0], + &nearmv[0], j, 0, mi_row, mi_col); if (is_compound) - vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second, - &nearby_second, j, 1, - mi_row, mi_col); + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[1], + &nearmv[1], j, 1, + mi_row, mi_col); } - switch (b_mode) { - case NEWMV: - read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc, - &cm->counts.mv, allow_hp); - - if (is_compound) - read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, - &cm->counts.mv, allow_hp); - break; - case NEARESTMV: - blockmv.as_int = nearest.as_int; - if (is_compound) - secondmv.as_int = nearest_second.as_int; - break; - case NEARMV: - blockmv.as_int = nearby.as_int; - if (is_compound) - secondmv.as_int = nearby_second.as_int; - break; - case ZEROMV: - blockmv.as_int = 0; - if (is_compound) - secondmv.as_int = 0; - break; - default: - assert(!"Invalid inter mode value"); - } - mi->bmi[j].as_mv[0].as_int = blockmv.as_int; + if (!assign_mv(cm, b_mode, block, best, nearest, nearmv, + is_compound, allow_hp, r)) { + xd->corrupted |= 1; + break; + }; + + + mi->bmi[j].as_mv[0].as_int = block[0].as_int; if (is_compound) - mi->bmi[j].as_mv[1].as_int = secondmv.as_int; + mi->bmi[j].as_mv[1].as_int = block[1].as_int; if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j]; if (num_4x4_w == 2) mi->bmi[j + 1] = mi->bmi[j]; - mi->mbmi.mode = b_mode; } } - mv0->as_int = mi->bmi[3].as_mv[0].as_int; - mv1->as_int = mi->bmi[3].as_mv[1].as_int; - } else { - switch (mbmi->mode) { - case NEARMV: - mv0->as_int = nearby.as_int; - if (is_compound) - mv1->as_int = nearby_second.as_int; - break; + mi->mbmi.mode = b_mode; - case NEARESTMV: - mv0->as_int = nearest.as_int; - if (is_compound) - mv1->as_int = nearest_second.as_int; - break; - - case ZEROMV: - mv0->as_int = 0; - if (is_compound) - mv1->as_int = 0; - break; - - case NEWMV: - read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, allow_hp); - if (is_compound) - read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, &cm->counts.mv, - allow_hp); - break; - default: - assert(!"Invalid inter mode value"); - } + mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; + } else { + xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, + best, 
nearest, nearmv, + is_compound, allow_hp, r); } } -static void read_inter_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, +static void read_inter_frame_mode_info(VP9_COMMON *const cm, + MACROBLOCKD *const xd, + const TileInfo *const tile, + MODE_INFO *const mi, int mi_row, int mi_col, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; MB_MODE_INFO *const mbmi = &mi->mbmi; int inter_block; mbmi->mv[0].as_int = 0; mbmi->mv[1].as_int = 0; - mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r); - mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); - inter_block = read_is_inter_block(pbi, mbmi->segment_id, r); - mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type, + mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r); + mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r); + inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); + mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type, !mbmi->skip_coeff || !inter_block, r); if (inter_block) - read_inter_block_mode_info(pbi, mi, mi_row, mi_col, r); + read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r); else - read_intra_block_mode_info(pbi, mi, r); + read_intra_block_mode_info(cm, mi, r); } -static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { - int i; - - cm->comp_pred_mode = cm->allow_comp_inter_inter ? read_comp_pred_mode(r) - : SINGLE_PREDICTION_ONLY; - - if (cm->comp_pred_mode == HYBRID_PREDICTION) - for (i = 0; i < COMP_INTER_CONTEXTS; i++) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]); - - if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) - for (i = 0; i < REF_CONTEXTS; i++) { - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]); - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]); - } - - if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) - for (i = 0; i < REF_CONTEXTS; i++) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]); -} - -void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - int k; - - // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove. 
- // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs)); - for (k = 0; k < MBSKIP_CONTEXTS; ++k) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]); - - if (cm->frame_type != KEY_FRAME && !cm->intra_only) { - nmv_context *const nmvc = &pbi->common.fc.nmvc; - MACROBLOCKD *const xd = &pbi->mb; - int i, j; - - read_inter_mode_probs(&cm->fc, r); - - if (cm->mcomp_filter_type == SWITCHABLE) - read_switchable_interp_probs(&cm->fc, r); - - for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]); - - read_comp_pred(cm, r); - - for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - for (i = 0; i < INTRA_MODES - 1; ++i) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]); - - for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) - for (i = 0; i < PARTITION_TYPES - 1; ++i) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]); - - read_mv_probs(r, nmvc, xd->allow_high_precision_mv); - } -} - -void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - MODE_INFO *mi = xd->this_mi; +void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, + int mi_row, int mi_col, vp9_reader *r) { + MODE_INFO *const mi = xd->mi_8x8[0]; const BLOCK_SIZE bsize = mi->mbmi.sb_type; - const int bw = 1 << mi_width_log2(bsize); - const int bh = 1 << mi_height_log2(bsize); + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; const int y_mis = MIN(bh, cm->mi_rows - mi_row); const int x_mis = MIN(bw, cm->mi_cols - mi_col); int x, y, z; - if (cm->frame_type == KEY_FRAME || cm->intra_only) - read_intra_frame_mode_info(pbi, mi, mi_row, mi_col, r); + if (frame_is_intra_only(cm)) + read_intra_frame_mode_info(cm, xd, mi, mi_row, mi_col, r); else - read_inter_frame_mode_info(pbi, mi, mi_row, mi_col, r); + read_inter_frame_mode_info(cm, xd, tile, mi, mi_row, mi_col, r); - for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) + for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) { for (x = !y; x < x_mis; x++) { - xd->mi_8x8[z + x] = mi; - } + xd->mi_8x8[z + x] = mi; + } + } } diff --git a/libvpx/vp9/decoder/vp9_decodemv.h b/libvpx/vp9/decoder/vp9_decodemv.h index 462d2e3..8e9ae4a 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.h +++ b/libvpx/vp9/decoder/vp9_decodemv.h @@ -14,8 +14,10 @@ #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_dboolhuff.h" -void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r); +struct TileInfo; -void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r); +void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, + const struct TileInfo *const tile, + int mi_row, int mi_col, vp9_reader *r); #endif // VP9_DECODER_VP9_DECODEMV_H_ diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c index dbba28e..4746a3a 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.c +++ b/libvpx/vp9/decoder/vp9_decodframe.c @@ -19,6 +19,7 @@ #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_extend.h" +#include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconintra.h" @@ -31,16 +32,49 @@ #include "vp9/decoder/vp9_detokenize.h" #include 
"vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_dsubexp.h" -#include "vp9/decoder/vp9_idct_blk.h" #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_read_bit_buffer.h" #include "vp9/decoder/vp9_thread.h" #include "vp9/decoder/vp9_treereader.h" +typedef struct TileWorkerData { + VP9_COMMON *cm; + vp9_reader bit_reader; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); + DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); +} TileWorkerData; + static int read_be32(const uint8_t *p) { return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; } +static int is_compound_prediction_allowed(const VP9_COMMON *cm) { + int i; + for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) + return 1; + + return 0; +} + +static void setup_compound_prediction(VP9_COMMON *cm) { + if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) { + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } else if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[ALTREF_FRAME]) { + cm->comp_fixed_ref = GOLDEN_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } else { + cm->comp_fixed_ref = LAST_FRAME; + cm->comp_var_ref[0] = GOLDEN_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } +} + // len == 0 is not allowed static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return start + len > start && start + len <= end; @@ -63,18 +97,105 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) { for (i = 0; i < TX_SIZE_CONTEXTS; ++i) for (j = 0; j < TX_SIZES - 3; ++j) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]); + vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]); for (i = 0; i < TX_SIZE_CONTEXTS; ++i) for (j = 0; j < TX_SIZES - 2; ++j) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]); + vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]); for (i = 0; i < TX_SIZE_CONTEXTS; ++i) for (j = 0; j < TX_SIZES - 1; ++j) - if (vp9_read(r, MODE_UPDATE_PROB)) - vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); + vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); +} + +static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) { + int i, j; + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) + for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) + vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); +} + +static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) { + int i, j; + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + for (j = 0; j < INTER_MODES - 1; ++j) + vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); +} + +static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) { + COMPPREDMODE_TYPE mode = vp9_read_bit(r); + if (mode) + mode += vp9_read_bit(r); + return mode; +} + +static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { + int i; + + const int compound_allowed = is_compound_prediction_allowed(cm); + cm->comp_pred_mode = compound_allowed ? 
read_comp_pred_mode(r) + : SINGLE_PREDICTION_ONLY; + if (compound_allowed) + setup_compound_prediction(cm); + + if (cm->comp_pred_mode == HYBRID_PREDICTION) + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]); + + if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) + for (i = 0; i < REF_CONTEXTS; i++) { + vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]); + vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]); + } + + if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) + for (i = 0; i < REF_CONTEXTS; i++) + vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]); +} + +static void update_mv(vp9_reader *r, vp9_prob *p) { + if (vp9_read(r, NMV_UPDATE_PROB)) + *p = (vp9_read_literal(r, 7) << 1) | 1; +} + +static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) { + int i, j, k; + + for (j = 0; j < MV_JOINTS - 1; ++j) + update_mv(r, &mvc->joints[j]); + + for (i = 0; i < 2; ++i) { + nmv_component *const comp = &mvc->comps[i]; + + update_mv(r, &comp->sign); + + for (j = 0; j < MV_CLASSES - 1; ++j) + update_mv(r, &comp->classes[j]); + + for (j = 0; j < CLASS0_SIZE - 1; ++j) + update_mv(r, &comp->class0[j]); + + for (j = 0; j < MV_OFFSET_BITS; ++j) + update_mv(r, &comp->bits[j]); + } + + for (i = 0; i < 2; ++i) { + nmv_component *const comp = &mvc->comps[i]; + + for (j = 0; j < CLASS0_SIZE; ++j) + for (k = 0; k < 3; ++k) + update_mv(r, &comp->class0_fp[j][k]); + + for (j = 0; j < 3; ++j) + update_mv(r, &comp->fp[j]); + } + + if (allow_hp) { + for (i = 0; i < 2; ++i) { + update_mv(r, &mvc->comps[i].class0_hp); + update_mv(r, &mvc->comps[i].hp); + } + } } static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) { @@ -85,47 +206,110 @@ static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) { xd->plane[i].dequant = cm->uv_dequant[q_index]; } -static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - MACROBLOCKD* const xd = arg; +// Allocate storage for each tile column. +// TODO(jzern): when max_threads <= 1 the same storage could be used for each +// tile. +static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) { + VP9_COMMON *const cm = &pbi->common; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + int i, tile_col; + + CHECK_MEM_ERROR(cm, pbi->mi_streams, + vpx_realloc(pbi->mi_streams, tile_cols * + sizeof(*pbi->mi_streams))); + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileInfo tile; + + vp9_tile_init(&tile, cm, 0, tile_col); + pbi->mi_streams[tile_col] = + &cm->mi[cm->mi_rows * tile.mi_col_start]; + } + + // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm + // block where mi unit size is 8x8. + CHECK_MEM_ERROR(cm, pbi->above_context[0], + vpx_realloc(pbi->above_context[0], + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * + 2 * aligned_mi_cols)); + for (i = 1; i < MAX_MB_PLANE; ++i) { + pbi->above_context[i] = pbi->above_context[0] + + i * sizeof(*pbi->above_context[0]) * + 2 * aligned_mi_cols; + } + + // This is sized based on the entire frame. Each tile operates within its + // column bounds. 
+ CHECK_MEM_ERROR(cm, pbi->above_seg_context, + vpx_realloc(pbi->above_seg_context, + sizeof(*pbi->above_seg_context) * + aligned_mi_cols)); +} + +static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block); + int16_t* const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int stride = pd->dst.stride; const int eob = pd->eobs[block]; - const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, - block); - uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block, - pd->dst.buf, stride); - switch (tx_size) { - case TX_4X4: { - const TX_TYPE tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block); - if (tx_type == DCT_DCT) - xd->itxm_add(qcoeff, dst, stride, eob); + if (eob > 0) { + TX_TYPE tx_type; + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, + block); + uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block, + pd->dst.buf, stride); + switch (tx_size) { + case TX_4X4: + tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block); + if (tx_type == DCT_DCT) + xd->itxm_add(dqcoeff, dst, stride, eob); + else + vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type); + break; + case TX_8X8: + tx_type = get_tx_type_8x8(pd->plane_type, xd); + vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_16X16: + tx_type = get_tx_type_16x16(pd->plane_type, xd); + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_32X32: + tx_type = DCT_DCT; + vp9_idct32x32_add(dqcoeff, dst, stride, eob); + break; + default: + assert(!"Invalid transform size"); + } + + if (eob == 1) { + vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0])); + } else { + if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) + vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); + else if (tx_size == TX_32X32 && eob <= 34) + vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); else - vp9_iht_add_c(tx_type, qcoeff, dst, stride, eob); - break; + vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); } - case TX_8X8: - vp9_iht_add_8x8_c(get_tx_type_8x8(pd->plane_type, xd), qcoeff, dst, - stride, eob); - break; - case TX_16X16: - vp9_iht_add_16x16_c(get_tx_type_16x16(pd->plane_type, xd), qcoeff, dst, - stride, eob); - break; - case TX_32X32: - vp9_idct_add_32x32(qcoeff, dst, stride, eob); - break; - default: - assert(!"Invalid transform size"); } } -static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - MACROBLOCKD* const xd = arg; +struct intra_args { + VP9_COMMON *cm; + MACROBLOCKD *xd; + vp9_reader *r; + unsigned char* token_cache; +}; + +static void predict_and_reconstruct_intra_block(int plane, int block, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct intra_args *const args = arg; + VP9_COMMON *const cm = args->cm; + MACROBLOCKD *const xd = args->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; - MODE_INFO *const mi = xd->this_mi; + MODE_INFO *const mi = xd->mi_8x8[0]; const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, block); uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block, @@ -142,32 +326,37 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, b_width_log2(plane_bsize), tx_size, mode, dst, pd->dst.stride, dst, pd->dst.stride); - if (!mi->mbmi.skip_coeff) - 
decode_block(plane, block, plane_bsize, tx_size, arg); + if (!mi->mbmi.skip_coeff) { + vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size, + args->r, args->token_cache); + inverse_transform_block(xd, plane, block, plane_bsize, tx_size); + } } -static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE bsize, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; - - if (mbmi->skip_coeff) { - reset_skip_context(xd, bsize); - return -1; - } else { - if (cm->seg.enabled) - setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, - cm->base_qindex)); - - // TODO(dkovalev) if (!vp9_reader_has_error(r)) - return vp9_decode_tokens(pbi, r, bsize); - } +struct inter_args { + VP9_COMMON *cm; + MACROBLOCKD *xd; + vp9_reader *r; + int *eobtotal; + unsigned char* token_cache; +}; + +static void reconstruct_inter_block(int plane, int block, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct inter_args *args = arg; + VP9_COMMON *const cm = args->cm; + MACROBLOCKD *const xd = args->xd; + + *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, + plane_bsize, tx_size, + args->r, args->token_cache); + inverse_transform_block(xd, plane, block, plane_bsize, tx_size); } -static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + BLOCK_SIZE bsize, int mi_row, int mi_col) { const int bh = num_8x8_blocks_high_lookup[bsize]; const int bw = num_8x8_blocks_wide_lookup[bsize]; const int offset = mi_row * cm->mode_info_stride + mi_col; @@ -178,178 +367,187 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize, xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset; // we are using the mode info context stream here - xd->this_mi = - xd->mi_8x8[0] = xd->mic_stream_ptr; - xd->this_mi->mbmi.sb_type = bsize; - xd->mic_stream_ptr++; + xd->mi_8x8[0] = xd->mi_stream; + xd->mi_8x8[0]->mbmi.sb_type = bsize; + ++xd->mi_stream; // Special case: if prev_mi is NULL, the previous mode info context // cannot be used. xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; - set_skip_context(cm, xd, mi_row, mi_col); - set_partition_seg_context(cm, xd, mi_row, mi_col); + set_skip_context(xd, xd->above_context, xd->left_context, mi_row, mi_col); // Distance of Mb to the various image edges. 
These are specified to 8th pel // as they are always compared to values that are in 1/8th pel units - set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); - setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col); + setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col); } -static void set_ref(VP9D_COMP *pbi, int i, int mi_row, int mi_col) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; - const int ref = mbmi->ref_frame[i] - LAST_FRAME; - const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]]; - const struct scale_factors *sf = &cm->active_ref_scale[ref]; - if (!vp9_is_valid_scale(sf)) +static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int idx, int mi_row, int mi_col) { + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + const int ref = mbmi->ref_frame[idx] - LAST_FRAME; + const YV12_BUFFER_CONFIG *cfg = get_frame_ref_buffer(cm, ref); + const struct scale_factors_common *sfc = &cm->active_ref_scale_comm[ref]; + if (!vp9_is_valid_scale(sfc)) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid scale factors"); - xd->scale_factor[i] = *sf; - setup_pre_planes(xd, i, cfg, mi_row, mi_col, sf); + xd->scale_factor[idx].sfc = sfc; + setup_pre_planes(xd, idx, cfg, mi_row, mi_col, &xd->scale_factor[idx]); xd->corrupted |= cfg->corrupted; } -static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + int mi_row, int mi_col, + vp9_reader *r, BLOCK_SIZE bsize, + unsigned char *token_cache) { const int less8x8 = bsize < BLOCK_8X8; MB_MODE_INFO *mbmi; - if (less8x8) - if (xd->ab_index > 0) - return; - - set_offsets(pbi, bsize, mi_row, mi_col); - vp9_read_mode_info(pbi, mi_row, mi_col, r); + set_offsets(cm, xd, tile, bsize, mi_row, mi_col); + vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r); if (less8x8) bsize = BLOCK_8X8; // Has to be called after set_offsets - mbmi = &xd->this_mi->mbmi; + mbmi = &xd->mi_8x8[0]->mbmi; - if (!is_inter_block(mbmi)) { - // Intra reconstruction - decode_tokens(pbi, bsize, r); - foreach_transformed_block(xd, bsize, decode_block_intra, xd); + if (mbmi->skip_coeff) { + reset_skip_context(xd, bsize); } else { - // Inter reconstruction - int eobtotal; + if (cm->seg.enabled) + setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, + cm->base_qindex)); + } - set_ref(pbi, 0, mi_row, mi_col); + if (!is_inter_block(mbmi)) { + struct intra_args arg = { cm, xd, r, token_cache }; + foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block, + &arg); + } else { + // Setup + set_ref(cm, xd, 0, mi_row, mi_col); if (has_second_ref(mbmi)) - set_ref(pbi, 1, mi_row, mi_col); + set_ref(cm, xd, 1, mi_row, mi_col); - vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); + xd->subpix.filter_x = xd->subpix.filter_y = + vp9_get_filter_kernel(mbmi->interp_filter); + + // Prediction vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - eobtotal = decode_tokens(pbi, bsize, r); - if (less8x8) { - if (eobtotal >= 0) - foreach_transformed_block(xd, bsize, decode_block, xd); - } else { - assert(mbmi->sb_type == bsize); - if (eobtotal == 0) - // skip loopfilter - vp9_set_pred_flag_mbskip(xd, bsize, 1); - else if (eobtotal > 0) - 
foreach_transformed_block(xd, bsize, decode_block, xd); + + // Reconstruction + if (!mbmi->skip_coeff) { + int eobtotal = 0; + struct inter_args arg = { cm, xd, r, &eobtotal, token_cache }; + foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); + if (!less8x8 && eobtotal == 0) + mbmi->skip_coeff = 1; // skip loopfilter } } + xd->corrupted |= vp9_reader_has_error(r); } -static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader* r, BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs, + int mi_row, int mi_col, BLOCK_SIZE bsize, + vp9_reader *r) { + const int ctx = partition_plane_context(xd->above_seg_context, + xd->left_seg_context, + mi_row, mi_col, bsize); + const vp9_prob *const probs = get_partition_probs(cm, ctx); + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + PARTITION_TYPE p; + + if (has_rows && has_cols) + p = treed_read(r, vp9_partition_tree, probs); + else if (!has_rows && has_cols) + p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ; + else if (has_rows && !has_cols) + p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT; + else + p = PARTITION_SPLIT; + + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.partition[ctx][p]; + + return p; +} + +static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + int mi_row, int mi_col, + vp9_reader* r, BLOCK_SIZE bsize, + unsigned char *token_cache) { const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; - PARTITION_TYPE partition = PARTITION_NONE; + PARTITION_TYPE partition; BLOCK_SIZE subsize; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - if (bsize < BLOCK_8X8) { - if (xd->ab_index != 0) - return; - } else { - int pl; - const int idx = check_bsize_coverage(hbs, cm->mi_rows, cm->mi_cols, - mi_row, mi_col); - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - - if (idx == 0) - partition = treed_read(r, vp9_partition_tree, - cm->fc.partition_prob[cm->frame_type][pl]); - else if (idx > 0 && - !vp9_read(r, cm->fc.partition_prob[cm->frame_type][pl][idx])) - partition = (idx == 1) ? 
PARTITION_HORZ : PARTITION_VERT; - else - partition = PARTITION_SPLIT; - - cm->counts.partition[pl][partition]++; - } - + partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r); subsize = get_subsize(bsize, partition); - *get_sb_index(xd, subsize) = 0; - - switch (partition) { - case PARTITION_NONE: - decode_modes_b(pbi, mi_row, mi_col, r, subsize); - break; - case PARTITION_HORZ: - decode_modes_b(pbi, mi_row, mi_col, r, subsize); - *get_sb_index(xd, subsize) = 1; - if (mi_row + hbs < cm->mi_rows) - decode_modes_b(pbi, mi_row + hbs, mi_col, r, subsize); - break; - case PARTITION_VERT: - decode_modes_b(pbi, mi_row, mi_col, r, subsize); - *get_sb_index(xd, subsize) = 1; - if (mi_col + hbs < cm->mi_cols) - decode_modes_b(pbi, mi_row, mi_col + hbs, r, subsize); - break; - case PARTITION_SPLIT: { - int n; - for (n = 0; n < 4; n++) { - const int j = n >> 1, i = n & 1; - *get_sb_index(xd, subsize) = n; - decode_modes_sb(pbi, mi_row + j * hbs, mi_col + i * hbs, r, subsize); - } - } break; - default: - assert(!"Invalid partition type"); + if (subsize < BLOCK_8X8) { + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + } else { + switch (partition) { + case PARTITION_NONE: + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + break; + case PARTITION_HORZ: + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + if (mi_row + hbs < cm->mi_rows) + decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, + token_cache); + break; + case PARTITION_VERT: + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); + if (mi_col + hbs < cm->mi_cols) + decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, + token_cache); + break; + case PARTITION_SPLIT: + decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize, + token_cache); + break; + default: + assert(!"Invalid partition type"); + } } // update partition context if (bsize >= BLOCK_8X8 && - (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) { - set_partition_seg_context(cm, xd, mi_row, mi_col); - update_partition_context(xd, subsize, bsize); - } + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) + update_partition_context(xd->above_seg_context, xd->left_seg_context, + mi_row, mi_col, subsize, bsize); } -static void setup_token_decoder(VP9D_COMP *pbi, - const uint8_t *data, size_t read_size, +static void setup_token_decoder(const uint8_t *data, + const uint8_t *data_end, + size_t read_size, + struct vpx_internal_error_info *error_info, vp9_reader *r) { - VP9_COMMON *cm = &pbi->common; - const uint8_t *data_end = pbi->source + pbi->source_sz; - // Validate the calculated partition length. If the buffer // described by the partition can't be fully read, then restrict // it to the portion that can be (for EC mode) or throw an error. 
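/* Simplified sketch of the boundary handling in the new read_partition()
 * above: a block whose bottom half falls outside the frame can only be
 * HORZ or SPLIT, one whose right half falls outside can only be VERT or
 * SPLIT, and one with both halves outside must be SPLIT (no bit is read
 * at all). read_full_tree() and read_bit() are stand-ins for
 * treed_read()/vp9_read() with the context probabilities. */
typedef enum { P_NONE, P_HORZ, P_VERT, P_SPLIT } PartitionSketch;

static PartitionSketch read_partition_sketch(
    int has_rows, int has_cols,
    PartitionSketch (*read_full_tree)(void),
    int (*read_bit)(void)) {
  if (has_rows && has_cols)
    return read_full_tree();        /* all four partition types are codable */
  if (!has_rows && has_cols)
    return read_bit() ? P_SPLIT : P_HORZ;
  if (has_rows && !has_cols)
    return read_bit() ? P_SPLIT : P_VERT;
  return P_SPLIT;                   /* implied split; consumes no bits */
}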
if (!read_is_valid(data, read_size, data_end)) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); if (vp9_reader_init(r, data, read_size)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder %d", 1); } @@ -364,22 +562,15 @@ static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs, for (l = 0; l < PREV_COEF_CONTEXTS; l++) if (k > 0 || l < 3) for (m = 0; m < UNCONSTRAINED_NODES; m++) - if (vp9_read(r, VP9_COEF_UPDATE_PROB)) - vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]); + vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]); } static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, vp9_reader *r) { - read_coef_probs_common(fc->coef_probs[TX_4X4], r); - - if (tx_mode > ONLY_4X4) - read_coef_probs_common(fc->coef_probs[TX_8X8], r); - - if (tx_mode > ALLOW_8X8) - read_coef_probs_common(fc->coef_probs[TX_16X16], r); - - if (tx_mode > ALLOW_16X16) - read_coef_probs_common(fc->coef_probs[TX_32X32], r); + const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + TX_SIZE tx_size; + for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) + read_coef_probs_common(fc->coef_probs[tx_size], r); } static void setup_segmentation(struct segmentation *seg, @@ -436,7 +627,6 @@ static void setup_segmentation(struct segmentation *seg, static void setup_loopfilter(struct loopfilter *lf, struct vp9_read_bit_buffer *rb) { - lf->filter_level = vp9_rb_read_literal(rb, 6); lf->sharpness_level = vp9_rb_read_literal(rb, 3); @@ -467,9 +657,8 @@ static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) { return old != *delta_q; } -static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { - MACROBLOCKD *const xd = &pbi->mb; - VP9_COMMON *const cm = &pbi->common; +static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, + struct vp9_read_bit_buffer *rb) { int update = 0; cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS); @@ -484,16 +673,15 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - xd->itxm_add = xd->lossless ? vp9_idct_add_lossless_c - : vp9_idct_add; + xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; } -static INTERPOLATIONFILTERTYPE read_interp_filter_type( - struct vp9_read_bit_buffer *rb) { - const INTERPOLATIONFILTERTYPE literal_to_type[] = { EIGHTTAP_SMOOTH, - EIGHTTAP, - EIGHTTAP_SHARP, - BILINEAR }; +static INTERPOLATION_TYPE read_interp_filter_type( + struct vp9_read_bit_buffer *rb) { + const INTERPOLATION_TYPE literal_to_type[] = { EIGHTTAP_SMOOTH, + EIGHTTAP, + EIGHTTAP_SHARP, + BILINEAR }; return vp9_rb_read_bit(rb) ? 
SWITCHABLE : literal_to_type[vp9_rb_read_literal(rb, 2)]; } @@ -539,7 +727,7 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { vp9_update_frame_size(cm); } - vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], cm->width, cm->height, + vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9BORDERINPIXELS); } @@ -560,7 +748,7 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, int found = 0, i; for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { if (vp9_rb_read_bit(rb)) { - YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[i]]; + YV12_BUFFER_CONFIG *const cfg = get_frame_ref_buffer(cm, i); width = cfg->y_crop_width; height = cfg->y_crop_height; found = 1; @@ -579,67 +767,73 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, setup_display_size(cm, rb); } -static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { +static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd, + int tile_col) { + int i; + xd->mi_stream = pbi->mi_streams[tile_col]; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + xd->above_context[i] = pbi->above_context[i]; + } + // see note in alloc_tile_storage(). + xd->above_seg_context = pbi->above_seg_context; +} + +static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile, + vp9_reader *r) { const int num_threads = pbi->oxcf.max_threads; VP9_COMMON *const cm = &pbi->common; int mi_row, mi_col; - YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[cm->new_fb_idx]; + MACROBLOCKD *xd = &pbi->mb; if (pbi->do_loopfilter_inline) { - if (num_threads > 1) { - LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - lf_data->frame_buffer = fb; - lf_data->cm = cm; - lf_data->xd = pbi->mb; - lf_data->stop = 0; - lf_data->y_only = 0; - } + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + lf_data->frame_buffer = get_frame_new_buffer(cm); + lf_data->cm = cm; + lf_data->xd = pbi->mb; + lf_data->stop = 0; + lf_data->y_only = 0; vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } - for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { // For a SB there are 2 left contexts, each pertaining to a MB row within - vp9_zero(cm->left_context); - vp9_zero(cm->left_seg_context); - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + vp9_zero(xd->left_context); + vp9_zero(xd->left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) - decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64); + decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64, + pbi->token_cache); if (pbi->do_loopfilter_inline) { - // delay the loopfilter by 1 macroblock row. const int lf_start = mi_row - MI_BLOCK_SIZE; - if (lf_start < 0) continue; + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - if (num_threads > 1) { - LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + // delay the loopfilter by 1 macroblock row. + if (lf_start < 0) continue; - // decoding has completed: finish up the loop filter in this thread. - if (mi_row + MI_BLOCK_SIZE >= cm->cur_tile_mi_row_end) continue; + // decoding has completed: finish up the loop filter in this thread. 
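/* Standalone sketch of the one-superblock-row-delayed filtering schedule
 * that decode_tile() drives through the loop-filter worker: row N is
 * filtered while row N+1 is decoded, and the trailing rows are flushed at
 * the end. "launch"/"sync" are modeled as direct calls here so the
 * ordering is visible; in the decoder they run on a VP9Worker thread. */
#include <stdio.h>

#define MI_BLOCK_SIZE 8

static void decode_row(int mi_row) {
  printf("decode rows %d..%d\n", mi_row, mi_row + MI_BLOCK_SIZE - 1);
}
static void filter_rows(int start, int stop) {
  printf("filter rows %d..%d\n", start, stop - 1);
}

int main(void) {
  const int mi_rows = 32;
  int mi_row;
  for (mi_row = 0; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) {
    decode_row(mi_row);
    /* filtering lags decoding by one superblock row so that a row's
     * bottom neighbours are already reconstructed when it is filtered */
    if (mi_row >= MI_BLOCK_SIZE)
      filter_rows(mi_row - MI_BLOCK_SIZE, mi_row);
  }
  filter_rows(mi_rows - MI_BLOCK_SIZE, mi_rows);  /* flush trailing rows */
  return 0;
}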
+ if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue; - vp9_worker_sync(&pbi->lf_worker); - lf_data->start = lf_start; - lf_data->stop = mi_row; - pbi->lf_worker.hook = vp9_loop_filter_worker; + vp9_worker_sync(&pbi->lf_worker); + lf_data->start = lf_start; + lf_data->stop = mi_row; + if (num_threads > 1) { vp9_worker_launch(&pbi->lf_worker); } else { - vp9_loop_filter_rows(fb, cm, &pbi->mb, lf_start, mi_row, 0); + vp9_worker_execute(&pbi->lf_worker); } } } if (pbi->do_loopfilter_inline) { - int lf_start; - if (num_threads > 1) { - LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - vp9_worker_sync(&pbi->lf_worker); - lf_start = lf_data->stop; - } else { - lf_start = mi_row - MI_BLOCK_SIZE; - } - vp9_loop_filter_rows(fb, cm, &pbi->mb, - lf_start, cm->mi_rows, 0); + vp9_worker_sync(&pbi->lf_worker); + lf_data->start = lf_data->stop; + lf_data->stop = cm->mi_rows; + vp9_worker_execute(&pbi->lf_worker); } } @@ -659,10 +853,32 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { cm->log2_tile_rows += vp9_rb_read_bit(rb); } +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'. +static size_t get_tile(const uint8_t *const data_end, + int is_last, + struct vpx_internal_error_info *error_info, + const uint8_t **data) { + size_t size; + + if (!is_last) { + if (!read_is_valid(*data, 4, data_end)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + size = read_be32(*data); + *data += 4; + } else { + size = data_end - *data; + } + return size; +} + static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { vp9_reader residual_bc; VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; const uint8_t *const data_end = pbi->source + pbi->source_sz; const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); @@ -672,70 +888,57 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. 
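/* Standalone sketch of the tile framing that get_tile()/read_be32()
 * implement above: every tile except the last is prefixed by a 4-byte
 * big-endian length, and the last tile simply runs to the end of the
 * buffer. Where the real code raises VPX_CODEC_CORRUPT_FRAME, this
 * simplified version returns 0. */
#include <stddef.h>
#include <stdint.h>

static size_t next_tile_size(const uint8_t **data, const uint8_t *data_end,
                             int is_last) {
  size_t size;
  if (is_last)
    return (size_t)(data_end - *data);
  if (data_end - *data < 4)
    return 0;  /* truncated length field */
  size = ((size_t)(*data)[0] << 24) | ((size_t)(*data)[1] << 16) |
         ((size_t)(*data)[2] << 8) | (size_t)(*data)[3];
  *data += 4;
  if (size > (size_t)(data_end - *data))
    return 0;  /* length points past the end of the buffer */
  return size;
}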
- vpx_memset(cm->above_context[0], 0, - sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * (2 * aligned_mi_cols)); + vpx_memset(pbi->above_context[0], 0, + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * + 2 * aligned_mi_cols); - vpx_memset(cm->above_seg_context, 0, - sizeof(PARTITION_CONTEXT) * aligned_mi_cols); + vpx_memset(pbi->above_seg_context, 0, + sizeof(*pbi->above_seg_context) * aligned_mi_cols); if (pbi->oxcf.inv_tile_order) { const uint8_t *data_ptr2[4][1 << 6]; vp9_reader bc_bak = {0}; - // pre-initialize the offsets, we're going to read in inverse order + // pre-initialize the offsets, we're going to decode in inverse order data_ptr2[0][0] = data; for (tile_row = 0; tile_row < tile_rows; tile_row++) { - if (tile_row) { - const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]); - data_ptr2[tile_row - 1][tile_cols - 1] += 4; - data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size; - } - - for (tile_col = 1; tile_col < tile_cols; tile_col++) { - const int size = read_be32(data_ptr2[tile_row][tile_col - 1]); - data_ptr2[tile_row][tile_col - 1] += 4; - data_ptr2[tile_row][tile_col] = - data_ptr2[tile_row][tile_col - 1] + size; + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + const int last_tile = + tile_row == tile_rows - 1 && tile_col == tile_cols - 1; + const size_t size = get_tile(data_end, last_tile, &cm->error, &data); + data_ptr2[tile_row][tile_col] = data; + data += size; } } for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) { - vp9_get_tile_col_offsets(cm, tile_col); - setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], + TileInfo tile; + + vp9_tile_init(&tile, cm, tile_row, tile_col); + setup_token_decoder(data_ptr2[tile_row][tile_col], data_end, data_end - data_ptr2[tile_row][tile_col], - &residual_bc); - decode_tile(pbi, &residual_bc); + &cm->error, &residual_bc); + setup_tile_context(pbi, xd, tile_col); + decode_tile(pbi, &tile, &residual_bc); if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1) bc_bak = residual_bc; } } residual_bc = bc_bak; } else { - int has_more; - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = 0; tile_col < tile_cols; tile_col++) { - size_t size; - - vp9_get_tile_col_offsets(cm, tile_col); + const int last_tile = + tile_row == tile_rows - 1 && tile_col == tile_cols - 1; + const size_t size = get_tile(data_end, last_tile, &cm->error, &data); + TileInfo tile; - has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1; - if (has_more) { - if (!read_is_valid(data, 4, data_end)) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Truncated packet or corrupt tile length"); + vp9_tile_init(&tile, cm, tile_row, tile_col); - size = read_be32(data); - data += 4; - } else { - size = data_end - data; - } - - setup_token_decoder(pbi, data, size, &residual_bc); - decode_tile(pbi, &residual_bc); + setup_token_decoder(data, data_end, size, &cm->error, &residual_bc); + setup_tile_context(pbi, xd, tile_col); + decode_tile(pbi, &tile, &residual_bc); data += size; } } @@ -744,10 +947,113 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { return vp9_reader_find_end(&residual_bc); } +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *tile_data = (TileWorkerData*)arg1; + const TileInfo *const tile = (TileInfo*)arg2; + int mi_row, mi_col; + + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; + 
mi_row += MI_BLOCK_SIZE) { + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + decode_modes_sb(tile_data->cm, &tile_data->xd, tile, + mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64, + tile_data->token_cache); + } + } + return !tile_data->xd.corrupted; +} + +static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) { + VP9_COMMON *const cm = &pbi->common; + const uint8_t *const data_end = pbi->source + pbi->source_sz; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols); + int tile_col = 0; + + assert(tile_rows == 1); + (void)tile_rows; + + if (num_workers > pbi->num_tile_workers) { + int i; + CHECK_MEM_ERROR(cm, pbi->tile_workers, + vpx_realloc(pbi->tile_workers, + num_workers * sizeof(*pbi->tile_workers))); + for (i = pbi->num_tile_workers; i < num_workers; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + ++pbi->num_tile_workers; + + vp9_worker_init(worker); + worker->hook = (VP9WorkerHook)tile_worker_hook; + CHECK_MEM_ERROR(cm, worker->data1, + vpx_memalign(32, sizeof(TileWorkerData))); + CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo))); + if (i < num_workers - 1 && !vp9_worker_reset(worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + } + } + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + vpx_memset(pbi->above_context[0], 0, + sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * + 2 * aligned_mi_cols); + vpx_memset(pbi->above_seg_context, 0, + sizeof(*pbi->above_seg_context) * aligned_mi_cols); + + while (tile_col < tile_cols) { + int i; + for (i = 0; i < num_workers && tile_col < tile_cols; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; + TileInfo *const tile = (TileInfo*)worker->data2; + const size_t size = + get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data); + + tile_data->cm = cm; + tile_data->xd = pbi->mb; + tile_data->xd.corrupted = 0; + vp9_tile_init(tile, tile_data->cm, 0, tile_col); + + setup_token_decoder(data, data_end, size, &cm->error, + &tile_data->bit_reader); + setup_tile_context(pbi, &tile_data->xd, tile_col); + + worker->had_error = 0; + if (i == num_workers - 1 || tile_col == tile_cols - 1) { + vp9_worker_execute(worker); + } else { + vp9_worker_launch(worker); + } + + data += size; + ++tile_col; + } + + for (; i > 0; --i) { + VP9Worker *const worker = &pbi->tile_workers[i - 1]; + pbi->mb.corrupted |= !vp9_worker_sync(worker); + } + } + + { + const int final_worker = (tile_cols + num_workers - 1) % num_workers; + TileWorkerData *const tile_data = + (TileWorkerData*)pbi->tile_workers[final_worker].data1; + return vp9_reader_find_end(&tile_data->bit_reader); + } +} + static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - if (vp9_rb_read_literal(rb, 8) != SYNC_CODE_0 || - vp9_rb_read_literal(rb, 8) != SYNC_CODE_1 || - vp9_rb_read_literal(rb, 8) != SYNC_CODE_2) { + if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 || + vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 || + vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_2) { vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync 
code"); } @@ -758,34 +1064,6 @@ static void error_handler(void *data, size_t bit_offset) { vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet"); } -static void setup_inter_inter(VP9_COMMON *cm) { - int i; - - cm->allow_comp_inter_inter = 0; - for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i) - cm->allow_comp_inter_inter |= - cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]; - - if (cm->allow_comp_inter_inter) { - // which one is always-on in comp inter-inter? - if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) { - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; - } else if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[ALTREF_FRAME]) { - cm->comp_fixed_ref = GOLDEN_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } else { - cm->comp_fixed_ref = LAST_FRAME; - cm->comp_var_ref[0] = GOLDEN_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } - } -} - #define RESERVED \ if (vp9_rb_read_bit(rb)) \ vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, \ @@ -794,12 +1072,12 @@ static void setup_inter_inter(VP9_COMMON *cm) { static size_t read_uncompressed_header(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; + size_t sz; int i; cm->last_frame_type = cm->frame_type; - if (vp9_rb_read_literal(rb, 2) != 0x2) + if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame marker"); @@ -820,12 +1098,10 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->error_resilient_mode = vp9_rb_read_bit(rb); if (cm->frame_type == KEY_FRAME) { - int csp; - check_sync_code(cm, rb); - csp = vp9_rb_read_literal(rb, 3); // colorspace - if (csp != 7) { // != sRGB + cm->color_space = vp9_rb_read_literal(rb, 3); // colorspace + if (cm->color_space != SRGB) { vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range if (cm->version == 1) { cm->subsampling_x = vp9_rb_read_bit(rb); @@ -872,13 +1148,11 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, setup_frame_size_with_refs(pbi, rb); - xd->allow_high_precision_mv = vp9_rb_read_bit(rb); + cm->allow_high_precision_mv = vp9_rb_read_bit(rb); cm->mcomp_filter_type = read_interp_filter_type(rb); for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) vp9_setup_scale_factors(cm, i); - - setup_inter_inter(cm); } } @@ -890,25 +1164,34 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->frame_parallel_decoding_mode = 1; } + // This flag will be overridden by the call to vp9_setup_past_independence + // below, forcing the use of context 0 for those frame types. 
cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LOG2); - if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->intra_only) + if (frame_is_intra_only(cm) || cm->error_resilient_mode) vp9_setup_past_independence(cm); setup_loopfilter(&cm->lf, rb); - setup_quantization(pbi, rb); + setup_quantization(cm, &pbi->mb, rb); setup_segmentation(&cm->seg, rb); setup_tile_info(cm, rb); + sz = vp9_rb_read_literal(rb, 16); + + if (sz == 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid header size"); - return vp9_rb_read_literal(rb, 16); + return sz; } static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, size_t partition_size) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; + FRAME_CONTEXT *const fc = &cm->fc; vp9_reader r; + int k; if (vp9_reader_init(&r, data, partition_size)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, @@ -916,10 +1199,36 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r); if (cm->tx_mode == TX_MODE_SELECT) - read_tx_probs(&cm->fc.tx_probs, &r); - read_coef_probs(&cm->fc, cm->tx_mode, &r); + read_tx_probs(&fc->tx_probs, &r); + read_coef_probs(fc, cm->tx_mode, &r); + + for (k = 0; k < MBSKIP_CONTEXTS; ++k) + vp9_diff_update_prob(&r, &fc->mbskip_probs[k]); + + if (!frame_is_intra_only(cm)) { + nmv_context *const nmvc = &fc->nmvc; + int i, j; + + read_inter_mode_probs(fc, &r); + + if (cm->mcomp_filter_type == SWITCHABLE) + read_switchable_interp_probs(fc, &r); + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]); + + read_comp_pred(cm, &r); + + for (j = 0; j < BLOCK_SIZE_GROUPS; j++) + for (i = 0; i < INTRA_MODES - 1; ++i) + vp9_diff_update_prob(&r, &fc->y_mode_prob[j][i]); + + for (j = 0; j < PARTITION_CONTEXTS; ++j) + for (i = 0; i < PARTITION_TYPES - 1; ++i) + vp9_diff_update_prob(&r, &fc->partition_prob[j][i]); - vp9_prepare_read_mode_info(pbi, &r); + read_mv_probs(&r, nmvc, cm->allow_high_precision_mv); + } return vp9_reader_has_error(&r); } @@ -936,59 +1245,109 @@ void vp9_init_dequantizer(VP9_COMMON *cm) { } } +#ifdef NDEBUG +#define debug_check_frame_counts(cm) (void)0 +#else // !NDEBUG +// Counts should only be incremented when frame_parallel_decoding_mode and +// error_resilient_mode are disabled. 
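/* A minimal sketch of the count gating added throughout this change (see
 * INCREMENT_COUNT, eob_branch_count and read_partition): no symbol
 * statistics accumulate in frame-parallel mode, so frames can be decoded
 * without sharing backward-adaptation state. Since error-resilient mode
 * forces frame_parallel_decoding_mode to 1, this single guard also covers
 * the invariant asserted by debug_check_frame_counts() below. */
static void increment_count(int frame_parallel_decoding_mode,
                            unsigned int *counter) {
  if (!frame_parallel_decoding_mode)
    ++*counter;  /* adapt probabilities only on the serial decode path */
}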
+static void debug_check_frame_counts(const VP9_COMMON *const cm) { + FRAME_COUNTS zero_counts; + vp9_zero(zero_counts); + assert(cm->frame_parallel_decoding_mode || cm->error_resilient_mode); + assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode, + sizeof(cm->counts.y_mode))); + assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode, + sizeof(cm->counts.uv_mode))); + assert(!memcmp(cm->counts.partition, zero_counts.partition, + sizeof(cm->counts.partition))); + assert(!memcmp(cm->counts.coef, zero_counts.coef, + sizeof(cm->counts.coef))); + assert(!memcmp(cm->counts.eob_branch, zero_counts.eob_branch, + sizeof(cm->counts.eob_branch))); + assert(!memcmp(cm->counts.switchable_interp, zero_counts.switchable_interp, + sizeof(cm->counts.switchable_interp))); + assert(!memcmp(cm->counts.inter_mode, zero_counts.inter_mode, + sizeof(cm->counts.inter_mode))); + assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter, + sizeof(cm->counts.intra_inter))); + assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter, + sizeof(cm->counts.comp_inter))); + assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref, + sizeof(cm->counts.single_ref))); + assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref, + sizeof(cm->counts.comp_ref))); + assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx))); + assert(!memcmp(cm->counts.mbskip, zero_counts.mbskip, + sizeof(cm->counts.mbskip))); + assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv))); +} +#endif // NDEBUG + int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { int i; VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; const uint8_t *data = pbi->source; - const uint8_t *data_end = pbi->source + pbi->source_sz; + const uint8_t *const data_end = pbi->source + pbi->source_sz; - struct vp9_read_bit_buffer rb = { data, data_end, 0, - cm, error_handler }; + struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler }; const size_t first_partition_size = read_uncompressed_header(pbi, &rb); const int keyframe = cm->frame_type == KEY_FRAME; - YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx]; + const int tile_rows = 1 << cm->log2_tile_rows; + const int tile_cols = 1 << cm->log2_tile_cols; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); if (!first_partition_size) { - // showing a frame directly - *p_data_end = data + 1; - return 0; + // showing a frame directly + *p_data_end = data + 1; + return 0; } - data += vp9_rb_bytes_read(&rb); - xd->corrupted = 0; - new_fb->corrupted = 0; - pbi->do_loopfilter_inline = - (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level; if (!pbi->decoded_key_frame && !keyframe) return -1; + data += vp9_rb_bytes_read(&rb); if (!read_is_valid(data, first_partition_size, data_end)) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt header length"); - setup_plane_dequants(cm, &pbi->mb, cm->base_qindex); + pbi->do_loopfilter_inline = + (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level; + if (pbi->do_loopfilter_inline && pbi->lf_worker.data1 == NULL) { + CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_malloc(sizeof(LFWorkerData))); + pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; + if (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Loop filter thread creation failed"); + } + } + + alloc_tile_storage(pbi, tile_cols); xd->mi_8x8 = cm->mi_grid_visible; - xd->mic_stream_ptr = 
cm->mi; xd->mode_info_stride = cm->mode_info_stride; + set_prev_mi(cm); - cm->fc = cm->frame_contexts[cm->frame_context_idx]; - - vp9_zero(cm->counts); - - new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size); - + setup_plane_dequants(cm, xd, cm->base_qindex); setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y); - // clear out the coeff buffer + cm->fc = cm->frame_contexts[cm->frame_context_idx]; + vp9_zero(cm->counts); for (i = 0; i < MAX_MB_PLANE; ++i) - vp9_zero(xd->plane[i].qcoeff); + vp9_zero(xd->plane[i].dqcoeff); - set_prev_mi(cm); + xd->corrupted = 0; + new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size); - *p_data_end = decode_tiles(pbi, data + first_partition_size); + // TODO(jzern): remove frame_parallel_decoding_mode restriction for + // single-frame tile decoding. + if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 && + cm->frame_parallel_decoding_mode) { + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size); + } else { + *p_data_end = decode_tiles(pbi, data + first_partition_size); + } cm->last_width = cm->width; cm->last_height = cm->height; @@ -1006,10 +1365,12 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { vp9_adapt_coef_probs(cm); - if (!keyframe && !cm->intra_only) { + if (!frame_is_intra_only(cm)) { vp9_adapt_mode_probs(cm); - vp9_adapt_mv_probs(cm, xd->allow_high_precision_mv); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); } + } else { + debug_check_frame_counts(cm); } if (cm->refresh_frame_context) diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c index cd74a0b..b8d670b 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libvpx/vp9/decoder/vp9_detokenize.c @@ -61,53 +61,55 @@ static const vp9_prob cat6_prob[15] = { 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 }; -DECLARE_ALIGNED(16, extern const uint8_t, - vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); -#define INCREMENT_COUNT(token) \ - do { \ - coef_counts[type][ref][band][pt] \ - [token >= TWO_TOKEN ? \ - (token == DCT_EOB_TOKEN ? 
DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \ - token]++; \ - token_cache[scan[c]] = vp9_pt_energy_class[token]; \ - } while (0) +static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = { + ZERO_TOKEN, ONE_TOKEN, TWO_TOKEN, TWO_TOKEN, + TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, + TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, DCT_EOB_MODEL_TOKEN +}; + +#define INCREMENT_COUNT(token) \ + do { \ + if (!cm->frame_parallel_decoding_mode) { \ + ++coef_counts[band][pt][token_to_counttoken[token]]; \ + } \ + } while (0); #define WRITE_COEF_CONTINUE(val, token) \ { \ - qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \ + dqcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \ dq[c > 0] / (1 + (tx_size == TX_32X32)); \ INCREMENT_COUNT(token); \ + token_cache[scan[c]] = vp9_pt_energy_class[token]; \ c++; \ continue; \ } -#define ADJUST_COEF(prob, bits_count) \ - do { \ - if (vp9_read(r, prob)) \ - val += 1 << bits_count; \ +#define ADJUST_COEF(prob, bits_count) \ + do { \ + val += (vp9_read(r, prob) << bits_count); \ } while (0); static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_reader *r, int block_idx, - PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr, - TX_SIZE tx_size, const int16_t *dq, - ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { - FRAME_CONTEXT *const fc = &cm->fc; + PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr, + TX_SIZE tx_size, const int16_t *dq, int pt, + uint8_t *token_cache) { + const FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; - const int ref = is_inter_block(&xd->this_mi->mbmi); + const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); int band, c = 0; - vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] = + const vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] = fc->coef_probs[tx_size][type][ref]; vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; uint8_t load_map[COEF_BANDS][PREV_COEF_CONTEXTS] = { { 0 } }; - vp9_prob *prob; - vp9_coeff_count_model *coef_counts = counts->coef[tx_size]; + const vp9_prob *prob; + unsigned int (*coef_counts)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES + 1] = + counts->coef[tx_size][type][ref]; + unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] = + counts->eob_branch[tx_size][type][ref]; const int16_t *scan, *nb; - const uint8_t *band_translate; - uint8_t token_cache[1024]; - int pt = get_entropy_context(xd, tx_size, type, block_idx, A, L, - &scan, &band_translate); - nb = vp9_get_coef_neighbors_handle(scan); + const uint8_t *const band_translate = get_band_translate(tx_size); + get_scan(xd, tx_size, type, block_idx, &scan, &nb); while (1) { int val; @@ -118,11 +120,12 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, pt = get_coef_context(nb, token_cache, c); band = get_coef_band(band_translate, c); prob = coef_probs[band][pt]; - counts->eob_branch[tx_size][type][ref][band][pt]++; + if (!cm->frame_parallel_decoding_mode) + ++eob_branch_count[band][pt]; if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) break; -SKIP_START: + SKIP_START: if (c >= seg_eob) break; if (c) @@ -132,6 +135,7 @@ SKIP_START: if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) { INCREMENT_COUNT(ZERO_TOKEN); + token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN]; ++c; goto SKIP_START; } @@ -203,47 +207,34 @@ SKIP_START: WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6); } - if (c < seg_eob) - coef_counts[type][ref][band][pt][DCT_EOB_MODEL_TOKEN]++; - + if (c < seg_eob) { + if (!cm->frame_parallel_decoding_mode) + ++coef_counts[band][pt][DCT_EOB_MODEL_TOKEN]; + } return c; } -struct 
decode_block_args { - VP9D_COMP *pbi; - vp9_reader *r; - int *eobtotal; -}; - -static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *argv) { - const struct decode_block_args* const arg = argv; - - // find the maximum eob for this transform size, adjusted by segment - MACROBLOCKD *xd = &arg->pbi->mb; - struct segmentation *seg = &arg->pbi->common.seg; - struct macroblockd_plane* pd = &xd->plane[plane]; - const int segment_id = xd->this_mi->mbmi.segment_id; - const int seg_eob = get_tx_eob(seg, segment_id, tx_size); - int aoff, loff, eob; - +int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, + int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, vp9_reader *r, + uint8_t *token_cache) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id, + tx_size); + int aoff, loff, eob, pt; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); + pt = get_entropy_context(tx_size, pd->above_context + aoff, + pd->left_context + loff); - eob = decode_coefs(&arg->pbi->common, xd, arg->r, block, - pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block), - tx_size, pd->dequant, - pd->above_context + aoff, pd->left_context + loff); + eob = decode_coefs(cm, xd, r, block, + pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block), + tx_size, pd->dequant, pt, token_cache); set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff); pd->eobs[block] = eob; - *arg->eobtotal += eob; + return eob; } -int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE bsize) { - int eobtotal = 0; - struct decode_block_args args = {pbi, r, &eobtotal}; - foreach_transformed_block(&pbi->mb, bsize, decode_block, &args); - return eobtotal; -} + diff --git a/libvpx/vp9/decoder/vp9_detokenize.h b/libvpx/vp9/decoder/vp9_detokenize.h index cf07c56..04939ea 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.h +++ b/libvpx/vp9/decoder/vp9_detokenize.h @@ -15,6 +15,9 @@ #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_dboolhuff.h" -int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE bsize); +int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, + int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, vp9_reader *r, + uint8_t *token_cache); #endif // VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/libvpx/vp9/decoder/vp9_dsubexp.c b/libvpx/vp9/decoder/vp9_dsubexp.c index 8cc64f7..fcca017 100644 --- a/libvpx/vp9/decoder/vp9_dsubexp.c +++ b/libvpx/vp9/decoder/vp9_dsubexp.c @@ -48,8 +48,6 @@ static int merge_index(int v, int n, int modulus) { static int inv_remap_prob(int v, int m) { static int inv_map_table[MAX_PROB - 1] = { - // generated by: - // inv_map_table[j] = merge_index(j, MAX_PROB - 1, MODULUS_PARAM); 6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188, 201, 214, 227, 240, 253, 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, @@ -66,10 +64,11 @@ static int inv_remap_prob(int v, int m) { 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, - 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, - + 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252 }; - // v = merge_index(v, MAX_PROBS - 1, MODULUS_PARAM); + // The clamp is not necessary for conforming VP9 stream, it is added to + 
// prevent out of bound access for bad input data + v = clamp(v, 0, 253); v = inv_map_table[v]; m--; if ((m << 1) <= MAX_PROB) { @@ -101,6 +100,8 @@ static int decode_term_subexp(vp9_reader *r, int k, int num_syms) { } void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) { - int delp = decode_term_subexp(r, SUBEXP_PARAM, 255); - *p = (vp9_prob)inv_remap_prob(delp, *p); + if (vp9_read(r, DIFF_UPDATE_PROB)) { + const int delp = decode_term_subexp(r, SUBEXP_PARAM, 255); + *p = (vp9_prob)inv_remap_prob(delp, *p); + } } diff --git a/libvpx/vp9/decoder/vp9_idct_blk.c b/libvpx/vp9/decoder/vp9_idct_blk.c deleted file mode 100644 index 395e636..0000000 --- a/libvpx/vp9/decoder/vp9_idct_blk.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9_rtcd.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/decoder/vp9_idct_blk.h" - -static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride, - int width, int height) { - int r, c; - - for (r = 0; r < height; r++) { - for (c = 0; c < width; c++) - dest[c] = clip_pixel(diff + dest[c]); - - dest += stride; - } -} - -void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, - int stride) { - add_constant_residual(diff, dest, stride, 8, 8); -} - -void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest, - int stride) { - add_constant_residual(diff, dest, stride, 16, 16); -} - -void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest, - int stride) { - add_constant_residual(diff, dest, stride, 32, 32); -} - -void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride, - int eob) { - if (tx_type == DCT_DCT) { - vp9_idct_add(input, dest, stride, eob); - } else { - vp9_short_iht4x4_add(input, dest, stride, tx_type); - vpx_memset(input, 0, 32); - } -} - -void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, - int stride, int eob) { - if (tx_type == DCT_DCT) { - vp9_idct_add_8x8(input, dest, stride, eob); - } else { - if (eob > 0) { - vp9_short_iht8x8_add(input, dest, stride, tx_type); - vpx_memset(input, 0, 128); - } - } -} - -void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) { - if (eob > 1) { - vp9_short_idct4x4_add(input, dest, stride); - vpx_memset(input, 0, 32); - } else { - vp9_short_idct4x4_1_add(input, dest, stride); - ((int *)input)[0] = 0; - } -} - -void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride, - int eob) { - if (eob > 1) { - vp9_short_iwalsh4x4_add(input, dest, stride); - vpx_memset(input, 0, 32); - } else { - vp9_short_iwalsh4x4_1_add_c(input, dest, stride); - ((int *)input)[0] = 0; - } -} - -void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) { - // If dc is 1, then input[0] is the reconstructed value, do not need - // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. - - // The calculation can be simplified if there are not many non-zero dct - // coefficients. Use eobs to decide what to do. - // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. - // Combine that with code here. 
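/* Standalone sketch of the eob-driven coefficient clearing shared by the
 * deleted vp9_idct_add_* helpers in this file and the new
 * inverse_transform_block(): after the inverse transform, only the
 * dequantized coefficients that could have been written are zeroed for
 * the next block. tx_size uses the TX_4X4..TX_32X32 = 0..3 numbering. */
#include <stdint.h>
#include <string.h>

static void clear_dqcoeff(int16_t *dqcoeff, int tx_size, int eob,
                          int is_dct) {
  if (eob == 1)                                    /* DC-only block */
    memset(dqcoeff, 0, 2 * sizeof(*dqcoeff));
  else if (is_dct && tx_size <= 2 && eob <= 10)    /* <= TX_16X16, short eob */
    memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(*dqcoeff));
  else if (tx_size == 3 && eob <= 34)              /* TX_32X32, short eob */
    memset(dqcoeff, 0, 256 * sizeof(*dqcoeff));
  else                                             /* whole transform block */
    memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(*dqcoeff));
}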
- if (eob) { - if (eob == 1) { - // DC only DCT coefficient - vp9_short_idct8x8_1_add(input, dest, stride); - input[0] = 0; - } else if (eob <= 10) { - vp9_short_idct10_8x8_add(input, dest, stride); - vpx_memset(input, 0, 128); - } else { - vp9_short_idct8x8_add(input, dest, stride); - vpx_memset(input, 0, 128); - } - } -} - -void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, - int stride, int eob) { - if (tx_type == DCT_DCT) { - vp9_idct_add_16x16(input, dest, stride, eob); - } else { - if (eob > 0) { - vp9_short_iht16x16_add(input, dest, stride, tx_type); - vpx_memset(input, 0, 512); - } - } -} - -void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) { - /* The calculation can be simplified if there are not many non-zero dct - * coefficients. Use eobs to separate different cases. */ - if (eob) { - if (eob == 1) { - /* DC only DCT coefficient. */ - vp9_short_idct16x16_1_add(input, dest, stride); - input[0] = 0; - } else if (eob <= 10) { - vp9_short_idct10_16x16_add(input, dest, stride); - vpx_memset(input, 0, 512); - } else { - vp9_short_idct16x16_add(input, dest, stride); - vpx_memset(input, 0, 512); - } - } -} - -void vp9_idct_add_32x32_c(int16_t *input, uint8_t *dest, int stride, int eob) { - DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024); - - if (eob) { - if (eob == 1) { - vp9_short_idct1_32x32(input, output); - vp9_add_constant_residual_32x32(output[0], dest, stride); - input[0] = 0; - } else { - vp9_short_idct32x32_add(input, dest, stride); - vpx_memset(input, 0, 2048); - } - } -} - diff --git a/libvpx/vp9/decoder/vp9_idct_blk.h b/libvpx/vp9/decoder/vp9_idct_blk.h deleted file mode 100644 index 1810bd0..0000000 --- a/libvpx/vp9/decoder/vp9_idct_blk.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_DECODER_VP9_IDCT_BLK_H_ -#define VP9_DECODER_VP9_IDCT_BLK_H_ - -#include "vp9/common/vp9_blockd.h" - - -void vp9_idct_add_lossless_c(int16_t *input, unsigned char *dest, int stride, - int eob); - -void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest, - int stride, int eob); - -void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest, - int stride, int eob); - -void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest, - int stride, int eob); - -#endif // VP9_DECODER_VP9_IDCT_BLK_H_ diff --git a/libvpx/vp9/decoder/vp9_onyxd.h b/libvpx/vp9/decoder/vp9_onyxd.h index cd5b750..a4b9c24 100644 --- a/libvpx/vp9/decoder/vp9_onyxd.h +++ b/libvpx/vp9/decoder/vp9_onyxd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_ONYXD_H_ -#define VP9_COMMON_VP9_ONYXD_H_ +#ifndef VP9_DECODER_VP9_ONYXD_H_ +#define VP9_DECODER_VP9_ONYXD_H_ #ifdef __cplusplus extern "C" { @@ -40,7 +40,7 @@ typedef enum { void vp9_initialize_dec(); int vp9_receive_compressed_data(VP9D_PTR comp, - uint64_t size, const uint8_t **dest, + size_t size, const uint8_t **dest, int64_t time_stamp); int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd, @@ -66,4 +66,4 @@ void vp9_remove_decompressor(VP9D_PTR comp); } #endif -#endif // VP9_COMMON_VP9_ONYXD_H_ +#endif // VP9_DECODER_VP9_ONYXD_H_ diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c index 17d5def..5f970a3 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_if.c +++ b/libvpx/vp9/decoder/vp9_onyxd_if.c @@ -65,13 +65,12 @@ static void recon_write_yuv_frame(const char *name, #endif #if WRITE_RECON_BUFFER == 2 void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { - // write the frame FILE *yframe; int i; char filename[255]; - sprintf(filename, "dx\\y%04d.raw", this_frame); + snprintf(filename, sizeof(filename)-1, "dx\\y%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->y_height; i++) @@ -79,7 +78,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { frame->y_width, 1, yframe); fclose(yframe); - sprintf(filename, "dx\\u%04d.raw", this_frame); + snprintf(filename, sizeof(filename)-1, "dx\\u%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->uv_height; i++) @@ -87,7 +86,7 @@ void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { frame->uv_width, 1, yframe); fclose(yframe); - sprintf(filename, "dx\\v%04d.raw", this_frame); + snprintf(filename, sizeof(filename)-1, "dx\\v%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->uv_height; i++) @@ -142,20 +141,13 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { cm->error.setjmp = 0; pbi->decoded_key_frame = 0; - if (pbi->oxcf.max_threads > 1) { - vp9_worker_init(&pbi->lf_worker); - pbi->lf_worker.data1 = vpx_malloc(sizeof(LFWorkerData)); - pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; - if (pbi->lf_worker.data1 == NULL || !vp9_worker_reset(&pbi->lf_worker)) { - vp9_remove_decompressor(pbi); - return NULL; - } - } + vp9_worker_init(&pbi->lf_worker); return pbi; } void vp9_remove_decompressor(VP9D_PTR ptr) { + int i; VP9D_COMP *const pbi = (VP9D_COMP *)ptr; if (!pbi) @@ -164,6 +156,16 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { vp9_remove_common(&pbi->common); vp9_worker_end(&pbi->lf_worker); vpx_free(pbi->lf_worker.data1); + for (i = 0; i < pbi->num_tile_workers; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + vp9_worker_end(worker); + vpx_free(worker->data1); + vpx_free(worker->data2); + } + vpx_free(pbi->tile_workers); + vpx_free(pbi->mi_streams); + vpx_free(pbi->above_context[0]); + vpx_free(pbi->above_seg_context); vpx_free(pbi); } @@ -177,7 +179,6 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd) { VP9D_COMP *pbi = (VP9D_COMP *) ptr; VP9_COMMON *cm = &pbi->common; - int ref_fb_idx; /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the * encoder is using the frame buffers for. This is just a stub to keep the @@ -185,18 +186,15 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr, * later commit that adds VP9-specific controls for this functionality. 
*/ if (ref_frame_flag == VP9_LAST_FLAG) { - ref_fb_idx = cm->ref_frame_map[0]; + YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[0]]; + if (!equal_dimensions(cfg, sd)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + vp8_yv12_copy_frame(cfg, sd); } else { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); - return cm->error.error_code; - } - - if (!equal_dimensions(&cm->yv12_fb[ref_fb_idx], sd)) { - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Incorrect buffer dimensions"); - } else { - vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd); } return cm->error.error_code; @@ -214,13 +212,13 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag, * vpxenc --test-decode functionality working, and will be replaced in a * later commit that adds VP9-specific controls for this functionality. */ - if (ref_frame_flag == VP9_LAST_FLAG) + if (ref_frame_flag == VP9_LAST_FLAG) { ref_fb_ptr = &pbi->common.active_ref_idx[0]; - else if (ref_frame_flag == VP9_GOLD_FLAG) + } else if (ref_frame_flag == VP9_GOLD_FLAG) { ref_fb_ptr = &pbi->common.active_ref_idx[1]; - else if (ref_frame_flag == VP9_ALT_FLAG) + } else if (ref_frame_flag == VP9_ALT_FLAG) { ref_fb_ptr = &pbi->common.active_ref_idx[2]; - else { + } else { vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, "Invalid reference frame"); return pbi->common.error.error_code; @@ -268,7 +266,7 @@ static void swap_frame_buffers(VP9D_COMP *pbi) { ++ref_index; } - cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + cm->frame_to_show = get_frame_new_buffer(cm); cm->fb_idx_ref_cnt[cm->new_fb_idx]--; // Invalidate these references until the next frame starts. @@ -277,7 +275,7 @@ static void swap_frame_buffers(VP9D_COMP *pbi) { } int vp9_receive_compressed_data(VP9D_PTR ptr, - uint64_t size, const uint8_t **psource, + size_t size, const uint8_t **psource, int64_t time_stamp) { VP9D_COMP *pbi = (VP9D_COMP *) ptr; VP9_COMMON *cm = &pbi->common; @@ -306,7 +304,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, * thing to do here. */ if (cm->active_ref_idx[0] != INT_MAX) - cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1; + get_frame_ref_buffer(cm, 0)->corrupted = 1; } cm->new_fb_idx = get_free_fb(cm); @@ -323,7 +321,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, * thing to do here. */ if (cm->active_ref_idx[0] != INT_MAX) - cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1; + get_frame_ref_buffer(cm, 0)->corrupted = 1; if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) cm->fb_idx_ref_cnt[cm->new_fb_idx]--; @@ -343,36 +341,33 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, return retcode; } - { - swap_frame_buffers(pbi); + swap_frame_buffers(pbi); #if WRITE_RECON_BUFFER == 2 - if (cm->show_frame) - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame); - else - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 1000); + if (cm->show_frame) + write_dx_frame_to_file(cm->frame_to_show, + cm->current_video_frame); + else + write_dx_frame_to_file(cm->frame_to_show, + cm->current_video_frame + 1000); #endif - if (!pbi->do_loopfilter_inline) { - /* Apply the loop filter if appropriate. 
*/ - vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0); - } + if (!pbi->do_loopfilter_inline) { + vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0); + } #if WRITE_RECON_BUFFER == 2 - if (cm->show_frame) - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 2000); - else - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 3000); + if (cm->show_frame) + write_dx_frame_to_file(cm->frame_to_show, + cm->current_video_frame + 2000); + else + write_dx_frame_to_file(cm->frame_to_show, + cm->current_video_frame + 3000); #endif - vp9_extend_frame_inner_borders(cm->frame_to_show, - cm->subsampling_x, - cm->subsampling_y); - } + vp9_extend_frame_inner_borders(cm->frame_to_show, + cm->subsampling_x, + cm->subsampling_y); #if WRITE_RECON_BUFFER == 1 if (cm->show_frame) @@ -398,6 +393,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; + pbi->mb.mi_8x8 = cm->mi_grid_visible; + pbi->mb.mi_8x8[0] = cm->mi; + cm->current_video_frame++; } diff --git a/libvpx/vp9/decoder/vp9_onyxd_int.h b/libvpx/vp9/decoder/vp9_onyxd_int.h index a051971..7c4c9db 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_int.h +++ b/libvpx/vp9/decoder/vp9_onyxd_int.h @@ -25,7 +25,7 @@ typedef struct VP9Decompressor { VP9D_CONFIG oxcf; const uint8_t *source; - uint32_t source_sz; + size_t source_sz; int64_t last_time_stamp; int ready_for_new_data; @@ -39,6 +39,18 @@ typedef struct VP9Decompressor { int do_loopfilter_inline; // apply loopfilter to available rows immediately VP9Worker lf_worker; + + VP9Worker *tile_workers; + int num_tile_workers; + + /* Each tile column has its own MODE_INFO stream. This array indexes them by + tile column index. */ + MODE_INFO **mi_streams; + + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + PARTITION_CONTEXT *above_seg_context; + + DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); } VP9D_COMP; -#endif // VP9_DECODER_VP9_TREEREADER_H_ +#endif // VP9_DECODER_VP9_ONYXD_INT_H_ diff --git a/libvpx/vp9/decoder/vp9_read_bit_buffer.h b/libvpx/vp9/decoder/vp9_read_bit_buffer.h index c7fa3aa..41a6868 100644 --- a/libvpx/vp9/decoder/vp9_read_bit_buffer.h +++ b/libvpx/vp9/decoder/vp9_read_bit_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_READ_BIT_BUFFER_ -#define VP9_READ_BIT_BUFFER_ +#ifndef VP9_DECODER_VP9_READ_BIT_BUFFER_H_ +#define VP9_DECODER_VP9_READ_BIT_BUFFER_H_ #include <limits.h> @@ -57,4 +57,4 @@ static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, return vp9_rb_read_bit(rb) ? 
-value : value; } -#endif // VP9_READ_BIT_BUFFER_ +#endif // VP9_DECODER_VP9_READ_BIT_BUFFER_H_ diff --git a/libvpx/vp9/decoder/vp9_thread.c b/libvpx/vp9/decoder/vp9_thread.c index dc3b681..d953e72 100644 --- a/libvpx/vp9/decoder/vp9_thread.c +++ b/libvpx/vp9/decoder/vp9_thread.c @@ -29,7 +29,7 @@ extern "C" { //------------------------------------------------------------------------------ // simplistic pthread emulation layer -#include <process.h> +#include <process.h> // NOLINT // _beginthreadex requires __stdcall #define THREADFN unsigned int __stdcall @@ -145,9 +145,7 @@ static THREADFN thread_loop(void *ptr) { // thread loop pthread_cond_wait(&worker->condition_, &worker->mutex_); } if (worker->status_ == WORK) { - if (worker->hook) { - worker->had_error |= !worker->hook(worker->data1, worker->data2); - } + vp9_worker_execute(worker); worker->status_ = OK; } else if (worker->status_ == NOT_OK) { // finish the worker done = 1; @@ -178,7 +176,7 @@ static void change_state(VP9Worker* const worker, pthread_mutex_unlock(&worker->mutex_); } -#endif +#endif // CONFIG_MULTITHREAD //------------------------------------------------------------------------------ @@ -218,12 +216,17 @@ int vp9_worker_reset(VP9Worker* const worker) { return ok; } +void vp9_worker_execute(VP9Worker* const worker) { + if (worker->hook != NULL) { + worker->had_error |= !worker->hook(worker->data1, worker->data2); + } +} + void vp9_worker_launch(VP9Worker* const worker) { #if CONFIG_MULTITHREAD change_state(worker, WORK); #else - if (worker->hook) - worker->had_error |= !worker->hook(worker->data1, worker->data2); + vp9_worker_execute(worker); #endif } diff --git a/libvpx/vp9/decoder/vp9_thread.h b/libvpx/vp9/decoder/vp9_thread.h index a8f7e04..a624f3c 100644 --- a/libvpx/vp9/decoder/vp9_thread.h +++ b/libvpx/vp9/decoder/vp9_thread.h @@ -17,7 +17,7 @@ #ifndef VP9_DECODER_VP9_THREAD_H_ #define VP9_DECODER_VP9_THREAD_H_ -#include "vpx_config.h" +#include "./vpx_config.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { @@ -27,7 +27,7 @@ extern "C" { #if defined(_WIN32) -#include <windows.h> +#include <windows.h> // NOLINT typedef HANDLE pthread_t; typedef CRITICAL_SECTION pthread_mutex_t; typedef struct { @@ -38,7 +38,7 @@ typedef struct { #else -#include <pthread.h> +#include <pthread.h> // NOLINT #endif /* _WIN32 */ #endif /* CONFIG_MULTITHREAD */ @@ -80,6 +80,11 @@ int vp9_worker_sync(VP9Worker* const worker); // hook/data1/data2 can be changed at any time before calling this function, // but not be changed afterward until the next call to vp9_worker_sync(). void vp9_worker_launch(VP9Worker* const worker); +// This function is similar to vp9_worker_launch() except that it calls the +// hook directly instead of using a thread. Convenient to bypass the thread +// mechanism while still using the VP9Worker structs. vp9_worker_sync() must +// still be called afterward (for error reporting). +void vp9_worker_execute(VP9Worker* const worker); // Kill the thread and terminate the object. To use the object again, one // must call vp9_worker_reset() again. 
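The vp9_worker_execute() entry point added here completes a small API whose intended calling sequence is only implied by the comments above. The sketch below is illustrative only and is not part of the patch: run_job, job, result and worker_example are hypothetical names, and it assumes vp9_worker_sync() returns non-zero when the hook reported no error, as the surrounding comments and the libwebp-style worker design suggest.

// Hypothetical hook: data1/data2 are the two opaque job pointers.
static int run_job(void *data1, void *data2) {
  int *const job = (int *)data1;     // hypothetical input
  int *const result = (int *)data2;  // hypothetical output
  *result = *job * 2;                // stand-in for real work
  return 1;                          // non-zero = success; 0 sets had_error
}

static int worker_example(void) {
  VP9Worker worker;
  int job = 21, result = 0, ok;
  vp9_worker_init(&worker);
  worker.hook = run_job;
  worker.data1 = &job;
  worker.data2 = &result;
  if (!vp9_worker_reset(&worker))    // spawns the thread when
    return 0;                        // CONFIG_MULTITHREAD is enabled
  vp9_worker_launch(&worker);        // asynchronous; vp9_worker_execute()
                                     // would run the hook in-place instead
  ok = vp9_worker_sync(&worker);     // wait and collect the error status
  vp9_worker_end(&worker);
  return ok;
}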
void vp9_worker_end(VP9Worker* const worker); @@ -90,4 +95,4 @@ void vp9_worker_end(VP9Worker* const worker); } // extern "C" #endif -#endif /* VP9_DECODER_VP9_THREAD_H_ */ +#endif // VP9_DECODER_VP9_THREAD_H_ diff --git a/libvpx/vp9/decoder/vp9_treereader.h b/libvpx/vp9/decoder/vp9_treereader.h index 710cc4c..f612497 100644 --- a/libvpx/vp9/decoder/vp9_treereader.h +++ b/libvpx/vp9/decoder/vp9_treereader.h @@ -23,7 +23,8 @@ static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */ const vp9_prob *const p) { register vp9_tree_index i = 0; - while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0); + while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0) + continue; return -i; } diff --git a/libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c b/libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c deleted file mode 100644 index 54ec67f..0000000 --- a/libvpx/vp9/decoder/x86/vp9_dequantize_sse2.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> -#include <emmintrin.h> // SSE2 -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_idct.h" - -void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest, - int stride) { - uint8_t abs_diff; - __m128i d; - - // Prediction data. - __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); - __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride)); - __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride)); - __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride)); - __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride)); - __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride)); - __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride)); - __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride)); - - p0 = _mm_unpacklo_epi64(p0, p1); - p2 = _mm_unpacklo_epi64(p2, p3); - p4 = _mm_unpacklo_epi64(p4, p5); - p6 = _mm_unpacklo_epi64(p6, p7); - - // Clip diff value to [0, 255] range. Then, do addition or subtraction - // according to its sign. - if (diff >= 0) { - abs_diff = (diff > 255) ? 255 : diff; - d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); - - p0 = _mm_adds_epu8(p0, d); - p2 = _mm_adds_epu8(p2, d); - p4 = _mm_adds_epu8(p4, d); - p6 = _mm_adds_epu8(p6, d); - } else { - abs_diff = (diff < -255) ? 
255 : -diff; - d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); - - p0 = _mm_subs_epu8(p0, d); - p2 = _mm_subs_epu8(p2, d); - p4 = _mm_subs_epu8(p4, d); - p6 = _mm_subs_epu8(p6, d); - } - - _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0); - p0 = _mm_srli_si128(p0, 8); - _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0); - - _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2); - p2 = _mm_srli_si128(p2, 8); - _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2); - - _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4); - p4 = _mm_srli_si128(p4, 8); - _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4); - - _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6); - p6 = _mm_srli_si128(p6, 8); - _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6); -} - -void vp9_add_constant_residual_16x16_sse2(const int16_t diff, uint8_t *dest, - int stride) { - uint8_t abs_diff; - __m128i d; - - // Prediction data. - __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride)); - __m128i p1 = _mm_load_si128((const __m128i *)(dest + 1 * stride)); - __m128i p2 = _mm_load_si128((const __m128i *)(dest + 2 * stride)); - __m128i p3 = _mm_load_si128((const __m128i *)(dest + 3 * stride)); - __m128i p4 = _mm_load_si128((const __m128i *)(dest + 4 * stride)); - __m128i p5 = _mm_load_si128((const __m128i *)(dest + 5 * stride)); - __m128i p6 = _mm_load_si128((const __m128i *)(dest + 6 * stride)); - __m128i p7 = _mm_load_si128((const __m128i *)(dest + 7 * stride)); - __m128i p8 = _mm_load_si128((const __m128i *)(dest + 8 * stride)); - __m128i p9 = _mm_load_si128((const __m128i *)(dest + 9 * stride)); - __m128i p10 = _mm_load_si128((const __m128i *)(dest + 10 * stride)); - __m128i p11 = _mm_load_si128((const __m128i *)(dest + 11 * stride)); - __m128i p12 = _mm_load_si128((const __m128i *)(dest + 12 * stride)); - __m128i p13 = _mm_load_si128((const __m128i *)(dest + 13 * stride)); - __m128i p14 = _mm_load_si128((const __m128i *)(dest + 14 * stride)); - __m128i p15 = _mm_load_si128((const __m128i *)(dest + 15 * stride)); - - // Clip diff value to [0, 255] range. Then, do addition or subtraction - // according to its sign. - if (diff >= 0) { - abs_diff = (diff > 255) ? 255 : diff; - d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); - - p0 = _mm_adds_epu8(p0, d); - p1 = _mm_adds_epu8(p1, d); - p2 = _mm_adds_epu8(p2, d); - p3 = _mm_adds_epu8(p3, d); - p4 = _mm_adds_epu8(p4, d); - p5 = _mm_adds_epu8(p5, d); - p6 = _mm_adds_epu8(p6, d); - p7 = _mm_adds_epu8(p7, d); - p8 = _mm_adds_epu8(p8, d); - p9 = _mm_adds_epu8(p9, d); - p10 = _mm_adds_epu8(p10, d); - p11 = _mm_adds_epu8(p11, d); - p12 = _mm_adds_epu8(p12, d); - p13 = _mm_adds_epu8(p13, d); - p14 = _mm_adds_epu8(p14, d); - p15 = _mm_adds_epu8(p15, d); - } else { - abs_diff = (diff < -255) ? 
255 : -diff; - d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); - - p0 = _mm_subs_epu8(p0, d); - p1 = _mm_subs_epu8(p1, d); - p2 = _mm_subs_epu8(p2, d); - p3 = _mm_subs_epu8(p3, d); - p4 = _mm_subs_epu8(p4, d); - p5 = _mm_subs_epu8(p5, d); - p6 = _mm_subs_epu8(p6, d); - p7 = _mm_subs_epu8(p7, d); - p8 = _mm_subs_epu8(p8, d); - p9 = _mm_subs_epu8(p9, d); - p10 = _mm_subs_epu8(p10, d); - p11 = _mm_subs_epu8(p11, d); - p12 = _mm_subs_epu8(p12, d); - p13 = _mm_subs_epu8(p13, d); - p14 = _mm_subs_epu8(p14, d); - p15 = _mm_subs_epu8(p15, d); - } - - // Store results - _mm_store_si128((__m128i *)(dest + 0 * stride), p0); - _mm_store_si128((__m128i *)(dest + 1 * stride), p1); - _mm_store_si128((__m128i *)(dest + 2 * stride), p2); - _mm_store_si128((__m128i *)(dest + 3 * stride), p3); - _mm_store_si128((__m128i *)(dest + 4 * stride), p4); - _mm_store_si128((__m128i *)(dest + 5 * stride), p5); - _mm_store_si128((__m128i *)(dest + 6 * stride), p6); - _mm_store_si128((__m128i *)(dest + 7 * stride), p7); - _mm_store_si128((__m128i *)(dest + 8 * stride), p8); - _mm_store_si128((__m128i *)(dest + 9 * stride), p9); - _mm_store_si128((__m128i *)(dest + 10 * stride), p10); - _mm_store_si128((__m128i *)(dest + 11 * stride), p11); - _mm_store_si128((__m128i *)(dest + 12 * stride), p12); - _mm_store_si128((__m128i *)(dest + 13 * stride), p13); - _mm_store_si128((__m128i *)(dest + 14 * stride), p14); - _mm_store_si128((__m128i *)(dest + 15 * stride), p15); -} - -void vp9_add_constant_residual_32x32_sse2(const int16_t diff, uint8_t *dest, - int stride) { - uint8_t abs_diff; - __m128i d; - int i = 8; - - if (diff >= 0) { - abs_diff = (diff > 255) ? 255 : diff; - d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); - } else { - abs_diff = (diff < -255) ? 255 : -diff; - d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); - } - - do { - // Prediction data. - __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride)); - __m128i p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16)); - __m128i p2 = _mm_load_si128((const __m128i *)(dest + 1 * stride)); - __m128i p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16)); - __m128i p4 = _mm_load_si128((const __m128i *)(dest + 2 * stride)); - __m128i p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride + 16)); - __m128i p6 = _mm_load_si128((const __m128i *)(dest + 3 * stride)); - __m128i p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride + 16)); - - // Clip diff value to [0, 255] range. Then, do addition or subtraction - // according to its sign. 
- if (diff >= 0) { - p0 = _mm_adds_epu8(p0, d); - p1 = _mm_adds_epu8(p1, d); - p2 = _mm_adds_epu8(p2, d); - p3 = _mm_adds_epu8(p3, d); - p4 = _mm_adds_epu8(p4, d); - p5 = _mm_adds_epu8(p5, d); - p6 = _mm_adds_epu8(p6, d); - p7 = _mm_adds_epu8(p7, d); - } else { - p0 = _mm_subs_epu8(p0, d); - p1 = _mm_subs_epu8(p1, d); - p2 = _mm_subs_epu8(p2, d); - p3 = _mm_subs_epu8(p3, d); - p4 = _mm_subs_epu8(p4, d); - p5 = _mm_subs_epu8(p5, d); - p6 = _mm_subs_epu8(p6, d); - p7 = _mm_subs_epu8(p7, d); - } - - // Store results - _mm_store_si128((__m128i *)(dest + 0 * stride), p0); - _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1); - _mm_store_si128((__m128i *)(dest + 1 * stride), p2); - _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3); - _mm_store_si128((__m128i *)(dest + 2 * stride), p4); - _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5); - _mm_store_si128((__m128i *)(dest + 3 * stride), p6); - _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7); - - dest += 4 * stride; - } while (--i); -} diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 957cfd2..87bd36c 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -22,7 +22,6 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_entropy.h" -#include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_treecoder.h" #include "vp9/common/vp9_systemdependent.h" @@ -54,8 +53,7 @@ extern unsigned int active_section; int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES]; int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1]; int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2]; -int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1] - [SWITCHABLE_FILTERS]; +int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; void init_tx_count_stats() { vp9_zero(tx_count_32x32p_stats); @@ -88,10 +86,9 @@ static void update_tx_count_stats(VP9_COMMON *cm) { static void update_switchable_interp_stats(VP9_COMMON *cm) { int i, j; - for (i = 0; i < SWITCHABLE_FILTERS+1; ++i) - for (j = 0; j < SWITCHABLE_FILTERS; ++j) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + for (j = 0; j < SWITCHABLE_FILTERS; ++j) switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j]; - } } void write_tx_count_stats() { @@ -141,9 +138,9 @@ void write_switchable_interp_stats() { fclose(fp); printf( - "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]" + "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]" "[SWITCHABLE_FILTERS] = {\n"); - for (i = 0; i < SWITCHABLE_FILTERS+1; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { printf(" { "); for (j = 0; j < SWITCHABLE_FILTERS; j++) { printf("%"PRId64", ", switchable_interp_stats[i][j]); @@ -166,34 +163,27 @@ void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb, vp9_wb_write_literal(wb, data, get_unsigned_bits(max)); } -static void update_mode( - vp9_writer *w, - int n, - vp9_tree tree, - vp9_prob Pnew[/* n-1 */], - vp9_prob Pcur[/* n-1 */], - unsigned int bct[/* n-1 */] [2], - const unsigned int num_events[/* n */] -) { +static void update_mode(vp9_writer *w, int n, vp9_tree tree, + vp9_prob Pcur[/* n-1 */], + unsigned int bct[/* n-1 */][2], + const unsigned int num_events[/* n */]) { int i = 0; - vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0); + vp9_tree_probs_from_distribution(tree, bct, num_events, 0); n--; - for (i = 0; i < n; ++i) { - 
vp9_cond_prob_diff_update(w, &Pcur[i], MODE_UPDATE_PROB, bct[i]); - } + for (i = 0; i < n; ++i) + vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]); } static void update_mbintra_mode_probs(VP9_COMP* const cpi, vp9_writer* const bc) { VP9_COMMON *const cm = &cpi->common; int j; - vp9_prob pnew[INTRA_MODES - 1]; unsigned int bct[INTRA_MODES - 1][2]; for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew, + update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, cm->fc.y_mode_prob[j], bct, (unsigned int *)cpi->y_mode_count[j]); } @@ -228,53 +218,43 @@ void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) { int k; for (k = 0; k < MBSKIP_CONTEXTS; ++k) - vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], - MODE_UPDATE_PROB, cm->counts.mbskip[k]); + vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]); } static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) { write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m); } -static void update_switchable_interp_probs(VP9_COMP *const cpi, - vp9_writer* const bc) { +static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) { VP9_COMMON *const cm = &cpi->common; - unsigned int branch_ct[SWITCHABLE_FILTERS + 1] - [SWITCHABLE_FILTERS - 1][2]; - vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1]; + unsigned int branch_ct[SWITCHABLE_FILTERS - 1][2]; int i, j; - for (j = 0; j <= SWITCHABLE_FILTERS; ++j) { - vp9_tree_probs_from_distribution( - vp9_switchable_interp_tree, - new_prob[j], branch_ct[j], - cm->counts.switchable_interp[j], 0); - } - for (j = 0; j <= SWITCHABLE_FILTERS; ++j) { - for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) { - vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i], - MODE_UPDATE_PROB, branch_ct[j][i]); - } + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) { + vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct, + cm->counts.switchable_interp[j], 0); + + for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) + vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i], + branch_ct[i]); } + #ifdef MODE_STATS if (!cpi->dummy_packing) update_switchable_interp_stats(cm); #endif } -static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) { +static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) { int i, j; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { unsigned int branch_ct[INTER_MODES - 1][2]; - vp9_prob new_prob[INTER_MODES - 1]; - - vp9_tree_probs_from_distribution(vp9_inter_mode_tree, - new_prob, branch_ct, + vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct, cm->counts.inter_mode[i], NEARESTMV); for (j = 0; j < INTER_MODES - 1; ++j) - vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j], - MODE_UPDATE_PROB, branch_ct[j]); + vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j], + branch_ct[j]); } } @@ -283,7 +263,7 @@ static void pack_mb_tokens(vp9_writer* const bc, const TOKENEXTRA *const stop) { TOKENEXTRA *p = *tp; - while (p < stop) { + while (p < stop && p->token != EOSB_TOKEN) { const int t = p->token; const struct vp9_token *const a = vp9_coef_encodings + t; const vp9_extra_bit *const b = vp9_extra_bits + t; @@ -293,10 +273,6 @@ static void pack_mb_tokens(vp9_writer* const bc, int n = a->len; vp9_prob probs[ENTROPY_NODES]; - if (t == EOSB_TOKEN) { - ++p; - break; - } if (t >= TWO_TOKEN) { vp9_model_to_full_probs(p->context_tree, probs); pp = probs; @@ -338,14 +314,14 @@ static void pack_mb_tokens(vp9_writer* const bc, ++p; } 
- *tp = p; + *tp = p + (p->token == EOSB_TOKEN); } static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode, const vp9_prob *p) { assert(is_inter_mode(mode)); write_token(w, vp9_inter_mode_tree, p, - &vp9_inter_mode_encodings[mode - NEARESTMV]); + &vp9_inter_mode_encodings[inter_mode_offset(mode)]); } @@ -360,7 +336,7 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mi = &xd->this_mi->mbmi; + MB_MODE_INFO *mi = &xd->mi_8x8[0]->mbmi; const int segment_id = mi->segment_id; int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); @@ -393,8 +369,8 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { mi->ref_frame[0]); } - // if using the prediction mdoel we have nothing further to do because - // the reference frame is fully coded by the segment + // If using the prediction model we have nothing further to do because + // the reference frame is fully coded by the segment. } static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { @@ -409,9 +385,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { const int segment_id = mi->segment_id; int skip_coeff; const BLOCK_SIZE bsize = mi->sb_type; - const int allow_hp = xd->allow_high_precision_mv; - - x->partition_info = x->pi + (m - cm->mi); + const int allow_hp = cm->allow_high_precision_mv; #ifdef ENTROPY_STATS active_section = 9; @@ -488,17 +462,13 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { } if (bsize < BLOCK_8X8) { - int j; - MB_PREDICTION_MODE blockmode; - int_mv blockmv; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - j = idy * 2 + idx; - blockmode = x->partition_info->bmi[j].mode; - blockmv = m->bmi[j].as_mv[0]; + const int j = idy * 2 + idx; + const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode; write_sb_mv_ref(bc, blockmode, mv_ref_p); ++cm->counts.inter_mode[mi->mode_context[rf]] [inter_mode_offset(blockmode)]; @@ -507,14 +477,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { #ifdef ENTROPY_STATS active_section = 11; #endif - vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv, - nmvc, allow_hp); - - if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(cpi, bc, - &m->bmi[j].as_mv[1].as_mv, - &mi->best_second_mv.as_mv, - nmvc, allow_hp); + vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv, + &mi->best_mv[0].as_mv, nmvc, allow_hp); + + if (has_second_ref(mi)) + vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv, + &mi->best_mv[1].as_mv, nmvc, allow_hp); } } } @@ -522,12 +490,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { #ifdef ENTROPY_STATS active_section = 5; #endif - vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv.as_mv, - nmvc, allow_hp); + vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, + &mi->best_mv[0].as_mv, nmvc, allow_hp); - if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv.as_mv, - nmvc, allow_hp); + if (has_second_ref(mi)) + vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, + &mi->best_mv[1].as_mv, nmvc, allow_hp); } } } @@ -541,7 +509,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, const int ym = m->mbmi.mode; 
const int segment_id = m->mbmi.segment_id; MODE_INFO *above_mi = mi_8x8[-xd->mode_info_stride]; - MODE_INFO *left_mi = mi_8x8[-1]; + MODE_INFO *left_mi = xd->left_available ? mi_8x8[-1] : NULL; if (seg->update_map) write_segment_id(bc, seg, m->mbmi.segment_id); @@ -553,8 +521,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, if (m->mbmi.sb_type >= BLOCK_8X8) { const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0); - const MB_PREDICTION_MODE L = xd->left_available ? - left_block_mode(m, left_mi, 0) : DC_PRED; + const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, 0); write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]); } else { int idx, idy; @@ -564,8 +531,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { int i = idy * 2 + idx; const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i); - const MB_PREDICTION_MODE L = (xd->left_available || idx) ? - left_block_mode(m, left_mi, i) : DC_PRED; + const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, i); const int bm = m->bmi[i].as_mode; #ifdef ENTROPY_STATS ++intra_mode_stats[A][L][bm]; @@ -578,24 +544,25 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]); } -static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, +static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, - int mi_row, int mi_col) { + int mi_row, int mi_col, int index) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; MODE_INFO *m = mi_8x8[0]; if (m->mbmi.sb_type < BLOCK_8X8) - if (xd->ab_index > 0) + if (index > 0) return; - xd->this_mi = mi_8x8[0]; xd->mi_8x8 = mi_8x8; - set_mi_row_col(&cpi->common, xd, + set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type], - mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]); - if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { + mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type], + cm->mi_rows, cm->mi_cols); + if (frame_is_intra_only(cm)) { write_mb_modes_kf(cpi, mi_8x8, bc); #ifdef ENTROPY_STATS active_section = 8; @@ -611,11 +578,35 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, pack_mb_tokens(bc, tok, tok_end); } -static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, +static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col, + PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) { + VP9_COMMON *const cm = &cpi->common; + const int ctx = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); + const vp9_prob *const probs = get_partition_probs(cm, ctx); + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + + if (has_rows && has_cols) { + write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]); + } else if (!has_rows && has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); + vp9_write(w, p == PARTITION_SPLIT, probs[1]); + } else if (has_rows && !has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_VERT); + vp9_write(w, p == PARTITION_SPLIT, probs[2]); + } else { + assert(p == PARTITION_SPLIT); + } +} + +static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, - int mi_row, int 
mi_col, BLOCK_SIZE bsize) { + int mi_row, int mi_col, BLOCK_SIZE bsize, + int index) { VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; int bsl = b_width_log2(bsize); int bs = (1 << bsl) / 4; // mode_info step for subsize @@ -629,52 +620,37 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, partition = partition_lookup[bsl][m->mbmi.sb_type]; - if (bsize < BLOCK_8X8) - if (xd->ab_index > 0) + if (bsize < BLOCK_8X8) { + if (index > 0) return; - - if (bsize >= BLOCK_8X8) { - int pl; - const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols, - mi_row, mi_col); - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - // encode the partition information - if (idx == 0) - write_token(bc, vp9_partition_tree, - cm->fc.partition_prob[cm->frame_type][pl], - vp9_partition_encodings + partition); - else if (idx > 0) - vp9_write(bc, partition == PARTITION_SPLIT, - cm->fc.partition_prob[cm->frame_type][pl][idx]); + } else { + write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc); } subsize = get_subsize(bsize, partition); - *(get_sb_index(xd, subsize)) = 0; switch (partition) { case PARTITION_NONE: - write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); break; case PARTITION_HORZ: - write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col); - *(get_sb_index(xd, subsize)) = 1; + write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); if ((mi_row + bs) < cm->mi_rows) - write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs, - mi_col); + write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end, + mi_row + bs, mi_col, 1); break; case PARTITION_VERT: - write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col); - *(get_sb_index(xd, subsize)) = 1; + write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0); if ((mi_col + bs) < cm->mi_cols) - write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs); + write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end, + mi_row, mi_col + bs, 1); break; case PARTITION_SPLIT: for (n = 0; n < 4; n++) { - int j = n >> 1, i = n & 0x01; - *(get_sb_index(xd, subsize)) = n; - write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end, - mi_row + j * bs, mi_col + i * bs, subsize); + const int j = n >> 1, i = n & 1; + write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc, + tok, tok_end, + mi_row + j * bs, mi_col + i * bs, subsize, n); } break; default: @@ -683,13 +659,13 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, // update partition context if (bsize >= BLOCK_8X8 && - (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) { - set_partition_seg_context(cm, xd, mi_row, mi_col); - update_partition_context(xd, subsize, bsize); - } + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) + update_partition_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, subsize, bsize); } -static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, +static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, + vp9_writer* const bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end) { VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; @@ -697,57 +673,27 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, MODE_INFO **mi_8x8 = cm->mi_grid_visible; MODE_INFO **m_8x8; - mi_8x8 += cm->cur_tile_mi_col_start + 
cm->cur_tile_mi_row_start * mis; + mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis; - for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += 8, mi_8x8 += 8 * mis) { m_8x8 = mi_8x8; - vp9_zero(cm->left_seg_context); - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + vp9_zero(cpi->left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) { - write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col, - BLOCK_64X64); + write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col, + BLOCK_64X64, 0); } } } -/* This function is used for debugging probability trees. */ -static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) { - /* print coef probability tree */ - int i, j, k, l, m; - FILE *f = fopen("enc_tree_probs.txt", "a"); - fprintf(f, "{\n"); - for (i = 0; i < block_types; i++) { - fprintf(f, " {\n"); - for (j = 0; j < REF_TYPES; ++j) { - fprintf(f, " {\n"); - for (k = 0; k < COEF_BANDS; k++) { - fprintf(f, " {\n"); - for (l = 0; l < PREV_COEF_CONTEXTS; l++) { - fprintf(f, " {"); - for (m = 0; m < ENTROPY_NODES; m++) { - fprintf(f, "%3u, ", - (unsigned int)(coef_probs[i][j][k][l][m])); - } - } - fprintf(f, " }\n"); - } - fprintf(f, " }\n"); - } - fprintf(f, " }\n"); - } - fprintf(f, "}\n"); - fclose(f); -} - static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[tx_size]; vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size]; unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = cpi->common.counts.eob_branch[tx_size]; vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size]; - vp9_prob full_probs[ENTROPY_NODES]; - int i, j, k, l; + int i, j, k, l, m; for (i = 0; i < BLOCK_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { @@ -756,16 +702,14 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { if (l >= 3 && k == 0) continue; vp9_tree_probs_from_distribution(vp9_coef_tree, - full_probs, coef_branch_ct[i][j][k][l], coef_counts[i][j][k][l], 0); - vpx_memcpy(coef_probs[i][j][k][l], full_probs, - sizeof(vp9_prob) * UNCONSTRAINED_NODES); coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0]; - coef_probs[i][j][k][l][0] = - get_binary_prob(coef_branch_ct[i][j][k][l][0][0], - coef_branch_ct[i][j][k][l][0][1]); + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + coef_probs[i][j][k][l][m] = get_binary_prob( + coef_branch_ct[i][j][k][l][m][0], + coef_branch_ct[i][j][k][l][m][1]); #ifdef ENTROPY_STATS if (!cpi->dummy_packing) { int t; @@ -794,7 +738,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, vp9_coeff_probs_model *old_frame_coef_probs = cpi->common.fc.coef_probs[tx_size]; vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size]; - const vp9_prob upd = VP9_COEF_UPDATE_PROB; + const vp9_prob upd = DIFF_UPDATE_PROB; const int entropy_nodes_update = UNCONSTRAINED_NODES; int i, j, k, l, t; switch (cpi->sf.use_fast_coef_updates) { @@ -849,7 +793,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; - const vp9_prob upd = VP9_COEF_UPDATE_PROB; + const vp9_prob upd = DIFF_UPDATE_PROB; int s; int u = 0; if (l >= 3 && k 
== 0) @@ -1132,26 +1076,23 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], - ct_8x8p); + tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p); for (j = 0; j < TX_SIZES - 3; j++) - vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], - MODE_UPDATE_PROB, ct_8x8p[j]); + vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]); } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], - ct_16x16p); + tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p); for (j = 0; j < TX_SIZES - 2; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j], - MODE_UPDATE_PROB, ct_16x16p[j]); + ct_16x16p[j]); } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p); for (j = 0; j < TX_SIZES - 1; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j], - MODE_UPDATE_PROB, ct_32x32p[j]); + ct_32x32p[j]); } #ifdef MODE_STATS if (!cpi->dummy_packing) @@ -1160,9 +1101,9 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { } } -static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type, +static void write_interp_filter_type(INTERPOLATION_TYPE type, struct vp9_write_bit_buffer *wb) { - const int type_to_literal[] = { 1, 0, 2 }; + const int type_to_literal[] = { 1, 0, 2, 3 }; vp9_wb_write_bit(wb, type == SWITCHABLE); if (type != SWITCHABLE) @@ -1178,7 +1119,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) { int i, j, c = 0; for (i = 0; i < SWITCHABLE_FILTERS; ++i) { count[i] = 0; - for (j = 0; j <= SWITCHABLE_FILTERS; ++j) + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) count[i] += cm->counts.switchable_interp[j][i]; c += (count[i] > 0); } @@ -1258,7 +1199,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; - vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * + vpx_memset(cpi->above_seg_context, 0, sizeof(*cpi->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols)); tok[0][0] = cpi->tok; @@ -1273,9 +1214,10 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { } for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = 0; tile_col < tile_cols; tile_col++) { - vp9_get_tile_col_offsets(cm, tile_col); + TileInfo tile; + + vp9_tile_init(&tile, cm, 0, tile_col); tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col]; if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) @@ -1283,7 +1225,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { else vp9_start_encode(&residual_bc, data_ptr + total_size); - write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end); + write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col], tok_end); assert(tok[tile_row][tile_col] == tok_end); vp9_stop_encode(&residual_bc); if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { @@ -1352,18 +1294,16 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, } static void write_sync_code(struct vp9_write_bit_buffer *wb) { - vp9_wb_write_literal(wb, SYNC_CODE_0, 8); - vp9_wb_write_literal(wb, SYNC_CODE_1, 8); - vp9_wb_write_literal(wb, SYNC_CODE_2, 8); + vp9_wb_write_literal(wb, VP9_SYNC_CODE_0, 8); + vp9_wb_write_literal(wb, VP9_SYNC_CODE_1, 8); + vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8); } static void 
write_uncompressed_header(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; - // frame marker bits - vp9_wb_write_literal(wb, 0x2, 2); + vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2); // bitstream version. // 00 - profile 0. 4:2:0 only @@ -1377,18 +1317,10 @@ static void write_uncompressed_header(VP9_COMP *cpi, vp9_wb_write_bit(wb, cm->error_resilient_mode); if (cm->frame_type == KEY_FRAME) { + const COLOR_SPACE cs = UNKNOWN; write_sync_code(wb); - // colorspaces - // 000 - Unknown - // 001 - BT.601 - // 010 - BT.709 - // 011 - SMPTE-170 - // 100 - SMPTE-240 - // 101 - Reserved - // 110 - Reserved - // 111 - sRGB (RGB) - vp9_wb_write_literal(wb, 0, 3); - if (1 /* colorspace != sRGB */) { + vp9_wb_write_literal(wb, cs, 3); + if (cs != SRGB) { vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] if (cm->version == 1) { vp9_wb_write_bit(wb, cm->subsampling_x); @@ -1425,7 +1357,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, write_frame_size_with_refs(cpi, wb); - vp9_wb_write_bit(wb, xd->allow_high_precision_mv); + vp9_wb_write_bit(wb, cm->allow_high_precision_mv); fix_mcomp_filter_type(cpi); write_interp_filter_type(cm->mcomp_filter_type, wb); @@ -1467,7 +1399,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { vp9_update_skip_probs(cpi, &header_bc); - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { int i; #ifdef ENTROPY_STATS active_section = 1; @@ -1481,7 +1413,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { for (i = 0; i < INTRA_INTER_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i], - MODE_UPDATE_PROB, cpi->intra_inter_count[i]); if (cm->allow_comp_inter_inter) { @@ -1495,7 +1426,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (use_hybrid_pred) for (i = 0; i < COMP_INTER_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i], - MODE_UPDATE_PROB, cpi->comp_inter_count[i]); } } @@ -1503,10 +1433,8 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) { for (i = 0; i < REF_CONTEXTS; i++) { vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0], - MODE_UPDATE_PROB, cpi->single_ref_count[i][0]); vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1], - MODE_UPDATE_PROB, cpi->single_ref_count[i][1]); } } @@ -1514,21 +1442,18 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) for (i = 0; i < REF_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i], - MODE_UPDATE_PROB, cpi->comp_ref_count[i]); update_mbintra_mode_probs(cpi, &header_bc); - for (i = 0; i < NUM_PARTITION_CONTEXTS; ++i) { - vp9_prob pnew[PARTITION_TYPES - 1]; + for (i = 0; i < PARTITION_CONTEXTS; ++i) { unsigned int bct[PARTITION_TYPES - 1][2]; - update_mode(&header_bc, PARTITION_TYPES, - vp9_partition_tree, pnew, - fc->partition_prob[cm->frame_type][i], bct, + update_mode(&header_bc, PARTITION_TYPES, vp9_partition_tree, + fc->partition_prob[i], bct, (unsigned int *)cpi->partition_count[i]); } - vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc); + vp9_write_nmv_probs(cpi, cm->allow_high_precision_mv, &header_bc); } vp9_stop_encode(&header_bc); diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index 013047e..8033a4d 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ 
b/libvpx/vp9/encoder/vp9_block.h
@@ -23,17 +23,11 @@ typedef struct {
   int offset;
 } search_site;

-typedef struct {
-  struct {
-    MB_PREDICTION_MODE mode;
-  } bmi[4];
-} PARTITION_INFO;
-
 // Structure to hold snapshot of coding context during the mode picking process
-// TODO Do we need all of these?
 typedef struct {
   MODE_INFO mic;
-  PARTITION_INFO partition_info;
+  uint8_t *zcoeff_blk;
+  int num_4x4_blk;
   int skip;
   int_mv best_ref_mv;
   int_mv second_best_ref_mv;
@@ -48,7 +42,7 @@ typedef struct {
   int comp_pred_diff;
   int single_pred_diff;
   int64_t tx_rd_diff[TX_MODES];
-  int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];

   // motion vector cache for adaptive motion search control in partition
   // search loop
@@ -62,8 +56,8 @@ typedef struct {
 } PICK_MODE_CONTEXT;

 struct macroblock_plane {
-  DECLARE_ALIGNED(16, int16_t, src_diff[64*64]);
-  DECLARE_ALIGNED(16, int16_t, coeff[64*64]);
+  DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+  DECLARE_ALIGNED(16, int16_t, coeff[64 * 64]);
   struct buf_2d src;

   // Quantizer settings
@@ -87,9 +81,6 @@ struct macroblock {
   MACROBLOCKD e_mbd;
   int skip_block;

-  PARTITION_INFO *partition_info; /* work pointer */
-  PARTITION_INFO *pi;  /* Corresponds to upper left visible macroblock */
-  PARTITION_INFO *pip; /* Base of allocated array */
   search_site *ss;
   int ss_count;
@@ -100,6 +91,7 @@ struct macroblock {
   int sadperbit4;
   int rddiv;
   int rdmult;
+  unsigned int mb_energy;
   unsigned int *mb_activity_ptr;
   int *mb_norm_activity_ptr;
   signed int act_zbin_adj;
@@ -123,11 +115,10 @@ struct macroblock {
   int **mvsadcost;

   int mbmode_cost[MB_MODE_COUNT];
-  unsigned inter_mode_cost[INTER_MODE_CONTEXTS][MB_MODE_COUNT - NEARESTMV];
+  unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
-  int switchable_interp_costs[SWITCHABLE_FILTERS + 1]
-                             [SWITCHABLE_FILTERS];
+  int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];

   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
@@ -136,6 +127,7 @@ struct macroblock {
   int mv_row_min;
   int mv_row_max;

+  uint8_t zcoeff_blk[TX_SIZES][256];
   int skip;

   int encode_breakout;
@@ -144,6 +136,7 @@ struct macroblock {

   // note that token_costs is the cost when eob node is skipped
   vp9_coeff_cost token_costs[TX_SIZES];
+  DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);

   int optimize;

@@ -172,19 +165,72 @@ struct macroblock {
   PICK_MODE_CONTEXT sb32x64_context[2];
   PICK_MODE_CONTEXT sb64x32_context[2];
   PICK_MODE_CONTEXT sb64_context;
-  int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+  int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];

   BLOCK_SIZE b_partitioning[4][4][4];
   BLOCK_SIZE mb_partitioning[4][4];
   BLOCK_SIZE sb_partitioning[4];
   BLOCK_SIZE sb64_partitioning;

-  void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
-  void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
-  void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
-  void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
-  void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
-                         int y_blocks);
+  void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
+};
+
+// TODO(jingning): the variables used here are a little complicated; needs
+// further refactoring to organize the temporary buffers once recursive
+// partitioning down to 4x4 block size is enabled.
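As a reader's aid for the TODO above, a hypothetical call site for the helper defined next might look like the sketch below. This is illustrative only and not part of the patch; store_mode_decision is an invented name, and the snapshot fields shown are just the ones visible in this header.

static void store_mode_decision(MACROBLOCK *x, BLOCK_SIZE bsize) {
  PICK_MODE_CONTEXT *const ctx = get_block_context(x, bsize);
  // ... rate-distortion mode search for this block size runs here ...
  ctx->mic = *x->e_mbd.mi_8x8[0];  // snapshot the chosen mode info
  ctx->skip = x->skip;             // and the per-block skip flag
}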
+static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; + + switch (bsize) { + case BLOCK_64X64: + return &x->sb64_context; + case BLOCK_64X32: + return &x->sb64x32_context[xd->sb_index]; + case BLOCK_32X64: + return &x->sb32x64_context[xd->sb_index]; + case BLOCK_32X32: + return &x->sb32_context[xd->sb_index]; + case BLOCK_32X16: + return &x->sb32x16_context[xd->sb_index][xd->mb_index]; + case BLOCK_16X32: + return &x->sb16x32_context[xd->sb_index][xd->mb_index]; + case BLOCK_16X16: + return &x->mb_context[xd->sb_index][xd->mb_index]; + case BLOCK_16X8: + return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index]; + case BLOCK_8X16: + return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index]; + case BLOCK_8X8: + return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index]; + case BLOCK_8X4: + return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index]; + case BLOCK_4X8: + return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index]; + case BLOCK_4X4: + return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index]; + default: + assert(0); + return NULL; + } +} + +struct rdcost_block_args { + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[16]; + ENTROPY_CONTEXT t_left[16]; + TX_SIZE tx_size; + int bw; + int bh; + int rate; + int64_t dist; + int64_t sse; + int this_rate; + int64_t this_dist; + int64_t this_sse; + int64_t this_rd; + int64_t best_rd; + int skip; + const int16_t *scan, *nb; }; #endif // VP9_ENCODER_VP9_BLOCK_H_ diff --git a/libvpx/vp9/encoder/vp9_boolhuff.c b/libvpx/vp9/encoder/vp9_boolhuff.c index 0f1aa59..32c136e 100644 --- a/libvpx/vp9/encoder/vp9_boolhuff.c +++ b/libvpx/vp9/encoder/vp9_boolhuff.c @@ -22,23 +22,28 @@ unsigned int active_section = 0; #endif const unsigned int vp9_prob_cost[256] = { - 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046, - 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778, - 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625, - 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516, - 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433, - 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365, - 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307, - 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257, - 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214, - 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174, - 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139, - 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, - 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77, - 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50, - 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24, - 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1 -}; + 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, + 1129, 1099, 1072, 1046, 1023, 1000, 979, 959, 940, 922, 905, 889, + 873, 858, 843, 829, 816, 803, 790, 778, 767, 755, 744, 733, + 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625, + 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, + 534, 528, 522, 516, 511, 505, 499, 494, 488, 483, 477, 472, + 467, 462, 457, 452, 447, 442, 437, 433, 428, 424, 419, 415, + 
410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365, + 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, + 317, 314, 311, 307, 304, 301, 297, 294, 291, 288, 285, 281, + 278, 275, 272, 269, 266, 263, 260, 257, 255, 252, 249, 246, + 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214, + 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, + 181, 179, 177, 174, 172, 170, 168, 165, 163, 161, 159, 156, + 154, 152, 150, 148, 145, 143, 141, 139, 137, 135, 133, 131, + 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, + 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, + 82, 81, 79, 77, 75, 73, 72, 70, 68, 66, 65, 63, + 61, 60, 58, 56, 55, 53, 51, 50, 48, 46, 45, 43, + 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24, + 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, + 4, 3, 1, 1}; void vp9_start_encode(vp9_writer *br, uint8_t *source) { br->lowvalue = 0; diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c index 4f4ad04..065992a 100644 --- a/libvpx/vp9/encoder/vp9_dct.c +++ b/libvpx/vp9/encoder/vp9_dct.c @@ -8,16 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include <assert.h> #include <math.h> + #include "./vpx_config.h" -#include "vp9/common/vp9_systemdependent.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" +#include "vp9/common/vp9_systemdependent.h" -static void fdct4_1d(int16_t *input, int16_t *output) { +#include "vp9/encoder/vp9_dct.h" + +static void fdct4(const int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; @@ -36,18 +39,17 @@ static void fdct4_1d(int16_t *input, int16_t *output) { output[3] = dct_const_round_shift(temp2); } -void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { +void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we tranpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). - const int stride = pitch >> 1; int pass; // We need an intermediate buffer between passes. int16_t intermediate[4 * 4]; - int16_t *in = input; + const int16_t *in = input; int16_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { @@ -58,10 +60,10 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { for (i = 0; i < 4; ++i) { // Load inputs. 
if (0 == pass) { - input[0] = in[0 * stride] << 4; - input[1] = in[1 * stride] << 4; - input[2] = in[2 * stride] << 4; - input[3] = in[3 * stride] << 4; + input[0] = in[0 * stride] * 16; + input[1] = in[1 * stride] * 16; + input[2] = in[2 * stride] * 16; + input[3] = in[3 * stride] * 16; if (i == 0 && input[0]) { input[0] += 1; } @@ -102,7 +104,7 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { } } -static void fadst4_1d(int16_t *input, int16_t *output) { +static void fadst4(const int16_t *input, int16_t *output) { int x0, x1, x2, x3; int s0, s1, s2, s3, s4, s5, s6, s7; @@ -143,14 +145,14 @@ static void fadst4_1d(int16_t *input, int16_t *output) { } static const transform_2d FHT_4[] = { - { fdct4_1d, fdct4_1d }, // DCT_DCT = 0 - { fadst4_1d, fdct4_1d }, // ADST_DCT = 1 - { fdct4_1d, fadst4_1d }, // DCT_ADST = 2 - { fadst4_1d, fadst4_1d } // ADST_ADST = 3 + { fdct4, fdct4 }, // DCT_DCT = 0 + { fadst4, fdct4 }, // ADST_DCT = 1 + { fdct4, fadst4 }, // DCT_ADST = 2 + { fadst4, fadst4 } // ADST_ADST = 3 }; -void vp9_short_fht4x4_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { int16_t out[4 * 4]; int16_t *outptr = &out[0]; int i, j; @@ -160,7 +162,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) - temp_in[j] = input[j * pitch + i] << 4; + temp_in[j] = input[j * stride + i] * 16; if (i == 0 && temp_in[0]) temp_in[0] += 1; ht.cols(temp_in, temp_out); @@ -178,12 +180,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output, } } -void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) { - vp9_short_fdct4x4_c(input, output, pitch); - vp9_short_fdct4x4_c(input + 4, output + 16, pitch); -} - -static void fdct8_1d(int16_t *input, int16_t *output) { +static void fdct8(const int16_t *input, int16_t *output) { /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; /*needs32*/ int t0, t1, t2, t3; /*canbe16*/ int x0, x1, x2, x3; @@ -198,7 +195,7 @@ static void fdct8_1d(int16_t *input, int16_t *output) { s6 = input[1] - input[6]; s7 = input[0] - input[7]; - // fdct4_1d(step, step); + // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -235,8 +232,7 @@ static void fdct8_1d(int16_t *input, int16_t *output) { output[7] = dct_const_round_shift(t3); } -void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { - const int stride = pitch >> 1; +void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { int i, j; int16_t intermediate[64]; @@ -250,16 +246,16 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { int i; for (i = 0; i < 8; i++) { // stage 1 - s0 = (input[0 * stride] + input[7 * stride]) << 2; - s1 = (input[1 * stride] + input[6 * stride]) << 2; - s2 = (input[2 * stride] + input[5 * stride]) << 2; - s3 = (input[3 * stride] + input[4 * stride]) << 2; - s4 = (input[3 * stride] - input[4 * stride]) << 2; - s5 = (input[2 * stride] - input[5 * stride]) << 2; - s6 = (input[1 * stride] - input[6 * stride]) << 2; - s7 = (input[0 * stride] - input[7 * stride]) << 2; - - // fdct4_1d(step, step); + s0 = (input[0 * stride] + input[7 * stride]) * 4; + s1 = (input[1 * stride] + input[6 * stride]) * 4; + s2 = (input[2 * stride] + input[5 * stride]) * 4; + s3 = (input[3 * stride] + input[4 * stride]) * 4; + s4 = (input[3 * stride] - input[4 * stride]) * 4; + s5 = (input[2 * stride] - input[5 * stride]) * 4; + s6 = (input[1 * 
stride] - input[6 * stride]) * 4; + s7 = (input[0 * stride] - input[7 * stride]) * 4; + + // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -301,24 +297,23 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { // Rows for (i = 0; i < 8; ++i) { - fdct8_1d(&intermediate[i * 8], &final_output[i * 8]); + fdct8(&intermediate[i * 8], &final_output[i * 8]); for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; } } -void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { +void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we tranpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). - const int stride = pitch >> 1; int pass; // We need an intermediate buffer between passes. int16_t intermediate[256]; - int16_t *in = input; + const int16_t *in = input; int16_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { @@ -331,23 +326,23 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { for (i = 0; i < 16; i++) { if (0 == pass) { // Calculate input for the first 8 results. - input[0] = (in[0 * stride] + in[15 * stride]) << 2; - input[1] = (in[1 * stride] + in[14 * stride]) << 2; - input[2] = (in[2 * stride] + in[13 * stride]) << 2; - input[3] = (in[3 * stride] + in[12 * stride]) << 2; - input[4] = (in[4 * stride] + in[11 * stride]) << 2; - input[5] = (in[5 * stride] + in[10 * stride]) << 2; - input[6] = (in[6 * stride] + in[ 9 * stride]) << 2; - input[7] = (in[7 * stride] + in[ 8 * stride]) << 2; + input[0] = (in[0 * stride] + in[15 * stride]) * 4; + input[1] = (in[1 * stride] + in[14 * stride]) * 4; + input[2] = (in[2 * stride] + in[13 * stride]) * 4; + input[3] = (in[3 * stride] + in[12 * stride]) * 4; + input[4] = (in[4 * stride] + in[11 * stride]) * 4; + input[5] = (in[5 * stride] + in[10 * stride]) * 4; + input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; + input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; // Calculate input for the next 8 results. - step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2; - step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2; - step1[2] = (in[5 * stride] - in[10 * stride]) << 2; - step1[3] = (in[4 * stride] - in[11 * stride]) << 2; - step1[4] = (in[3 * stride] - in[12 * stride]) << 2; - step1[5] = (in[2 * stride] - in[13 * stride]) << 2; - step1[6] = (in[1 * stride] - in[14 * stride]) << 2; - step1[7] = (in[0 * stride] - in[15 * stride]) << 2; + step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; + step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; + step1[2] = (in[5 * stride] - in[10 * stride]) * 4; + step1[3] = (in[4 * stride] - in[11 * stride]) * 4; + step1[4] = (in[3 * stride] - in[12 * stride]) * 4; + step1[5] = (in[2 * stride] - in[13 * stride]) * 4; + step1[6] = (in[1 * stride] - in[14 * stride]) * 4; + step1[7] = (in[0 * stride] - in[15 * stride]) * 4; } else { // Calculate input for the first 8 results. 
input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); @@ -368,7 +363,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); } - // Work on the first eight values; fdct8_1d(input, even_results); + // Work on the first eight values; fdct8(input, even_results); { /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; /*needs32*/ int t0, t1, t2, t3; @@ -384,7 +379,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { s6 = input[1] - input[6]; s7 = input[0] - input[7]; - // fdct4_1d(step, step); + // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -486,7 +481,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { } } -static void fadst8_1d(int16_t *input, int16_t *output) { +static void fadst8(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[7]; @@ -558,14 +553,14 @@ static void fadst8_1d(int16_t *input, int16_t *output) { } static const transform_2d FHT_8[] = { - { fdct8_1d, fdct8_1d }, // DCT_DCT = 0 - { fadst8_1d, fdct8_1d }, // ADST_DCT = 1 - { fdct8_1d, fadst8_1d }, // DCT_ADST = 2 - { fadst8_1d, fadst8_1d } // ADST_ADST = 3 + { fdct8, fdct8 }, // DCT_DCT = 0 + { fadst8, fdct8 }, // ADST_DCT = 1 + { fdct8, fadst8 }, // DCT_ADST = 2 + { fadst8, fadst8 } // ADST_ADST = 3 }; -void vp9_short_fht8x8_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { int16_t out[64]; int16_t *outptr = &out[0]; int i, j; @@ -575,7 +570,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) - temp_in[j] = input[j * pitch + i] << 2; + temp_in[j] = input[j * stride + i] * 4; ht.cols(temp_in, temp_out); for (j = 0; j < 8; ++j) outptr[j * 8 + i] = temp_out[j]; @@ -593,18 +588,17 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ -void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { +void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { int i; int a1, b1, c1, d1, e1; - short *ip = input; - short *op = output; - int pitch_short = pitch >> 1; + const int16_t *ip = input; + int16_t *op = output; for (i = 0; i < 4; i++) { - a1 = ip[0 * pitch_short]; - b1 = ip[1 * pitch_short]; - c1 = ip[2 * pitch_short]; - d1 = ip[3 * pitch_short]; + a1 = ip[0 * stride]; + b1 = ip[1 * stride]; + c1 = ip[2 * stride]; + d1 = ip[3 * stride]; a1 += b1; d1 = d1 - c1; @@ -637,24 +631,18 @@ void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { c1 = e1 - c1; a1 -= c1; d1 += b1; - op[0] = a1 << WHT_UPSCALE_FACTOR; - op[1] = c1 << WHT_UPSCALE_FACTOR; - op[2] = d1 << WHT_UPSCALE_FACTOR; - op[3] = b1 << WHT_UPSCALE_FACTOR; + op[0] = a1 * UNIT_QUANT_FACTOR; + op[1] = c1 * UNIT_QUANT_FACTOR; + op[2] = d1 * UNIT_QUANT_FACTOR; + op[3] = b1 * UNIT_QUANT_FACTOR; ip += 4; op += 4; } } -void vp9_short_walsh8x4_c(short *input, short *output, int pitch) { - vp9_short_walsh4x4_c(input, output, pitch); - vp9_short_walsh4x4_c(input + 4, output + 16, pitch); -} - - // Rewrote to use same algorithm as others. 
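A pattern that repeats throughout vp9_dct.c above: every `x << 4`, `x << 2` and `op[i] << WHT_UPSCALE_FACTOR` on residual data becomes `x * 16`, `x * 4` and `op[i] * UNIT_QUANT_FACTOR`. The likely reason (an inference from the code, not stated in the commit message) is that left-shifting a negative signed integer is undefined behavior in C, whereas the equivalent multiplication is fully defined; compilers emit the same shift instruction either way. A minimal sketch of the idiom:

    #include <stdint.h>

    /* Pre-transform scaling of a residual sample.  For negative x,
     * (x << 4) is undefined behavior in C, while (x * 16) is well
     * defined and compiles to the same instruction in practice. */
    static int16_t scale_residual(int16_t x) {
      return (int16_t)(x * 16);     /* preferred: defined for x < 0 */
      /* return (int16_t)(x << 4);     undefined if x is negative   */
    }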
-static void fdct16_1d(int16_t in[16], int16_t out[16]) { +static void fdct16(const int16_t in[16], int16_t out[16]) { /*canbe16*/ int step1[8]; /*canbe16*/ int step2[8]; /*canbe16*/ int step3[8]; @@ -680,7 +668,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) { step1[6] = in[1] - in[14]; step1[7] = in[0] - in[15]; - // fdct8_1d(step, step); + // fdct8(step, step); { /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; /*needs32*/ int t0, t1, t2, t3; @@ -696,7 +684,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) { s6 = input[1] - input[6]; s7 = input[0] - input[7]; - // fdct4_1d(step, step); + // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -795,7 +783,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) { out[15] = dct_const_round_shift(temp2); } -void fadst16_1d(int16_t *input, int16_t *output) { +static void fadst16(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -958,14 +946,14 @@ void fadst16_1d(int16_t *input, int16_t *output) { } static const transform_2d FHT_16[] = { - { fdct16_1d, fdct16_1d }, // DCT_DCT = 0 - { fadst16_1d, fdct16_1d }, // ADST_DCT = 1 - { fdct16_1d, fadst16_1d }, // DCT_ADST = 2 - { fadst16_1d, fadst16_1d } // ADST_ADST = 3 + { fdct16, fdct16 }, // DCT_DCT = 0 + { fadst16, fdct16 }, // ADST_DCT = 1 + { fdct16, fadst16 }, // DCT_ADST = 2 + { fadst16, fadst16 } // ADST_ADST = 3 }; -void vp9_short_fht16x16_c(int16_t *input, int16_t *output, - int pitch, TX_TYPE tx_type) { +void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { int16_t out[256]; int16_t *outptr = &out[0]; int i, j; @@ -975,7 +963,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output, // Columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) - temp_in[j] = input[j * pitch + i] << 2; + temp_in[j] = input[j * stride + i] * 4; ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; @@ -1003,7 +991,7 @@ static INLINE int half_round_shift(int input) { return rv; } -static void dct32_1d(int *input, int *output, int round) { +static void dct32_1d(const int *input, int *output, int round) { int step[32]; // Stage 1 step[0] = input[0] + input[(32 - 1)]; @@ -1326,8 +1314,7 @@ static void dct32_1d(int *input, int *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { - int shortpitch = pitch >> 1; +void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; @@ -1335,7 +1322,7 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { for (i = 0; i < 32; ++i) { int temp_in[32], temp_out[32]; for (j = 0; j < 32; ++j) - temp_in[j] = input[j * shortpitch + i] << 2; + temp_in[j] = input[j * stride + i] * 4; dct32_1d(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; @@ -1355,8 +1342,7 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { // Note that although we use dct_32_round in dct32_1d computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. 
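Both vp9_short_fht16x16_c and vp9_fdct32x32_c above run the 2-D transform as two 1-D passes with a transpose in between, and rescale the intermediate with an expression such as `(temp_out[j] + 1 + (temp_out[j] < 0)) >> 2`. Because an arithmetic right shift rounds toward minus infinity, the extra `(x < 0)` term makes the divide-by-4 behave symmetrically for positive and negative values. A self-contained illustration of the idiom (assuming arithmetic right shift on signed values, as the codec itself does):

    #include <assert.h>

    /* Divide by 4, rounding positive and negative inputs symmetrically.
     * A plain shift rounds toward minus infinity (-3 >> 2 == -1 but
     * 3 >> 2 == 0); the (x < 0) correction rebalances the negative side. */
    static int round_shift2(int x) {
      return (x + 1 + (x < 0)) >> 2;
    }

    int main(void) {
      assert(round_shift2(3)  ==  1);   /*  0.75 ->  1 */
      assert(round_shift2(-3) == -1);   /* -0.75 -> -1 */
      assert(round_shift2(2)  ==  0);   /*  0.50 ->  0, ties go toward zero */
      assert(round_shift2(-2) ==  0);   /* -0.50 ->  0 */
      return 0;
    }

(vp9_fdct32x32_c uses `(temp_out[j] > 0)` in the same spot, biasing the opposite way.)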
-void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { - int shortpitch = pitch >> 1; +void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; @@ -1364,7 +1350,7 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { for (i = 0; i < 32; ++i) { int temp_in[32], temp_out[32]; for (j = 0; j < 32; ++j) - temp_in[j] = input[j * shortpitch + i] << 2; + temp_in[j] = input[j * stride + i] * 4; dct32_1d(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) // TODO(cd): see quality impact of only doing @@ -1383,3 +1369,27 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { out[j + i * 32] = temp_out[j]; } } + +void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride) { + if (tx_type == DCT_DCT) + vp9_fdct4x4(input, output, stride); + else + vp9_short_fht4x4(input, output, stride, tx_type); +} + +void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride) { + if (tx_type == DCT_DCT) + vp9_fdct8x8(input, output, stride); + else + vp9_short_fht8x8(input, output, stride, tx_type); +} + +void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride) { + if (tx_type == DCT_DCT) + vp9_fdct16x16(input, output, stride); + else + vp9_short_fht16x16(input, output, stride, tx_type); +} diff --git a/libvpx/vp9/encoder/vp9_dct.h b/libvpx/vp9/encoder/vp9_dct.h new file mode 100644 index 0000000..aaf976d --- /dev/null +++ b/libvpx/vp9/encoder/vp9_dct.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP9_ENCODER_VP9_DCT_H_ +#define VP9_ENCODER_VP9_DCT_H_ + +void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride); + +void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride); + +void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, + int stride); + +#endif // VP9_ENCODER_VP9_DCT_H_ diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index 44ab02d..a45299b 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -22,6 +22,7 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_extend.h" #include "vp9/common/vp9_findnearmv.h" +#include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_quant_common.h" @@ -29,7 +30,6 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_tile_common.h" - #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encodeintra.h" #include "vp9/encoder/vp9_encodemb.h" @@ -37,24 +37,44 @@ #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" +#include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_tokenize.h" +#include "vp9/encoder/vp9_vaq.h" -#define DBG_PRNT_SEGMAP 0 +#define DBG_PRNT_SEGMAP 0 -static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { - TX_4X4, // ONLY_4X4 - TX_8X8, // ONLY_8X8 - TX_16X16, // ONLY_16X16 - TX_32X32, // ONLY_32X32 - TX_32X32, // TX_MODE_SELECT -}; // #define ENC_DEBUG #ifdef ENC_DEBUG int enc_debug = 0; #endif +static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { + switch (subsize) { + case BLOCK_64X64: + case BLOCK_64X32: + case BLOCK_32X64: + case BLOCK_32X32: + return &xd->sb_index; + case BLOCK_32X16: + case BLOCK_16X32: + case BLOCK_16X16: + return &xd->mb_index; + case BLOCK_16X8: + case BLOCK_8X16: + case BLOCK_8X8: + return &xd->b_index; + case BLOCK_8X4: + case BLOCK_4X8: + case BLOCK_4X4: + return &xd->ab_index; + default: + assert(0); + return NULL; + } +} + static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, int mi_row, int mi_col, BLOCK_SIZE bsize); @@ -122,7 +142,6 @@ static unsigned int tt_activity_measure(MACROBLOCK *x) { static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) { return vp9_encode_intra(x, use_dc_pred); } -DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0}; // Measure the activity of the current macroblock // What we measure here is TBD so abstracted to this function @@ -173,8 +192,9 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { tmp = sortlist[j - 1]; sortlist[j - 1] = sortlist[j]; sortlist[j] = tmp; - } else - break; + } else { + break; + } } } @@ -246,13 +266,11 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { #if OUTPUT_NORM_ACT_STATS fprintf(f, "\n"); #endif - } #if OUTPUT_NORM_ACT_STATS fclose(f); #endif - } #endif // USE_ACT_INDEX @@ -264,7 +282,7 @@ static void build_activity_map(VP9_COMP *cpi) { VP9_COMMON * const cm = &cpi->common; #if ALT_ACT_MEASURE - YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; + YV12_BUFFER_CONFIG *new_yv12 = get_frame_new_buffer(cm); int recon_yoffset; int recon_y_stride = new_yv12->y_stride; #endif @@ -317,7 +335,6 @@ static void build_activity_map(VP9_COMP *cpi) { // Calculate an activity index number of each mb 
calc_activity_index(cpi, x); #endif - } // Macroblock activity masking @@ -351,8 +368,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO * const mbmi = &xd->this_mi->mbmi; - MODE_INFO *mi_addr = xd->this_mi; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + MODE_INFO *mi_addr = xd->mi_8x8[0]; int mb_mode_index = ctx->best_mode_index; const int mis = cm->mode_info_stride; @@ -360,7 +377,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, const int mi_height = num_8x8_blocks_high_lookup[bsize]; assert(mi->mbmi.mode < MB_MODE_COUNT); - assert(mb_mode_index < MAX_MODES); assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES); assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES); assert(mi->mbmi.sb_type == bsize); @@ -375,6 +391,10 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) xd->mi_8x8[x_idx + y * mis] = mi_addr; + if (cpi->sf.variance_adaptive_quantization) { + vp9_mb_init_quantizer(cpi, x); + } + // FIXME(rbultje) I'm pretty sure this should go to the end of this block // (i.e. after the output_enabled) if (bsize < BLOCK_32X32) { @@ -384,12 +404,14 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) { - *x->partition_info = ctx->partition_info; mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; } x->skip = ctx->skip; + vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk, + sizeof(uint8_t) * ctx->num_4x4_blk); + if (!output_enabled) return; @@ -398,15 +420,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i]; } - if (cm->frame_type == KEY_FRAME) { - // Restore the coding modes to that held in the coding context - // if (mb_mode == I4X4_PRED) - // for (i = 0; i < 16; i++) - // { - // xd->block[i].bmi.as_mode = - // xd->mode_info_context->bmi[i].as_mode; - // assert(xd->mode_info_context->bmi[i].as_mode < MB_MODE_COUNT); - // } + if (frame_is_intra_only(cm)) { #if CONFIG_INTERNAL_STATS static const int kf_mode_index[] = { THR_DC /*DC_PRED*/, @@ -419,7 +433,6 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, THR_D207_PRED /*D207_PRED*/, THR_D63_PRED /*D63_PRED*/, THR_TM /*TM_PRED*/, - THR_B_PRED /*I4X4_PRED*/, }; cpi->mode_chosen_counts[kf_mode_index[mi->mbmi.mode]]++; #endif @@ -428,18 +441,19 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, cpi->mode_chosen_counts[mb_mode_index]++; if (is_inter_block(mbmi) && (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { - int_mv best_mv, best_second_mv; + int_mv best_mv[2]; const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; - best_mv.as_int = ctx->best_ref_mv.as_int; - best_second_mv.as_int = ctx->second_best_ref_mv.as_int; + best_mv[0].as_int = ctx->best_ref_mv.as_int; + best_mv[1].as_int = ctx->second_best_ref_mv.as_int; if (mbmi->mode == NEWMV) { - best_mv.as_int = mbmi->ref_mvs[rf1][0].as_int; - best_second_mv.as_int = mbmi->ref_mvs[rf2][0].as_int; + best_mv[0].as_int = mbmi->ref_mvs[rf1][0].as_int; + if (rf2 > 0) + best_mv[1].as_int = mbmi->ref_mvs[rf2][0].as_int; } - mbmi->best_mv.as_int = best_mv.as_int; - mbmi->best_second_mv.as_int = best_second_mv.as_int; - vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv); + mbmi->best_mv[0].as_int = 
best_mv[0].as_int; + mbmi->best_mv[1].as_int = best_mv[1].as_int; + vp9_update_mv_count(cpi, x, best_mv); } if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) { @@ -451,28 +465,27 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; } } void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, - int mb_row, int mb_col) { - uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, src - ->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, src - ->alpha_stride}; + int mi_row, int mi_col) { + uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; int i; - for (i = 0; i < MAX_MB_PLANE; i++) { - setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mb_row, mb_col, + for (i = 0; i < MAX_MB_PLANE; i++) + setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mi_row, mi_col, NULL, x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); - } } -static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, - BLOCK_SIZE bsize) { +static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, BLOCK_SIZE bsize) { MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; @@ -486,16 +499,12 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, const int idx_map = mb_row * cm->mb_cols + mb_col; const struct segmentation *const seg = &cm->seg; - set_skip_context(cm, xd, mi_row, mi_col); - set_partition_seg_context(cm, xd, mi_row, mi_col); + set_skip_context(xd, cpi->above_context, cpi->left_context, mi_row, mi_col); // Activity map pointer x->mb_activity_ptr = &cpi->mb_activity_map[idx_map]; x->active_ptr = cpi->active_map + idx_map; - /* pointers to mode info contexts */ - x->partition_info = x->pi + idx_str; - xd->mi_8x8 = cm->mi_grid_visible + idx_str; xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; @@ -503,10 +512,9 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, // cannot be used. xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; - xd->this_mi = xd->mi_8x8[0] = cm->mi + idx_str; - mbmi = &xd->this_mi->mbmi; + mbmi = &xd->mi_8x8[0]->mbmi; // Set up destination pointers setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col); @@ -520,7 +528,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, // Set up distance of MB to edge of frame in 1/8th pel units assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); - set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, + cm->mi_rows, cm->mi_cols); /* set up source buffers */ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); @@ -531,10 +540,11 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, /* segment ID */ if (seg->enabled) { - uint8_t *map = seg->update_map ? cpi->segmentation_map - : cm->last_frame_seg_map; - mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); - + if (!cpi->sf.variance_adaptive_quantization) { + uint8_t *map = seg->update_map ? 
cpi->segmentation_map + : cm->last_frame_seg_map; + mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + } vp9_mb_init_quantizer(cpi, x); if (seg->enabled && cpi->seg0_cnt > 0 @@ -546,9 +556,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, const int x = mb_col & ~3; const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1); - const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1; - const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) - >> 1; + const int tile_progress = tile->mi_col_start * cm->mb_rows >> 1; + const int mb_cols = (tile->mi_col_end - tile->mi_col_start) >> 1; cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs; @@ -561,13 +570,19 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, } } -static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, +static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, int *totalrate, int64_t *totaldist, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + int orig_rdmult = x->rdmult; + double rdmult_ratio; + + vp9_clear_system_state(); // __asm emms; + rdmult_ratio = 1.0; // avoid uninitialized warnings // Use the lower precision, but faster, 32x32 fdct for mode selection. x->use_lp32x32fdct = 1; @@ -582,35 +597,66 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, } } - set_offsets(cpi, mi_row, mi_col, bsize); - xd->this_mi->mbmi.sb_type = bsize; + set_offsets(cpi, tile, mi_row, mi_col, bsize); + xd->mi_8x8[0]->mbmi.sb_type = bsize; // Set to zero to make sure we do not use the previous encoded frame stats - xd->this_mi->mbmi.skip_coeff = 0; + xd->mi_8x8[0]->mbmi.skip_coeff = 0; x->source_variance = get_sby_perpixel_variance(cpi, x, bsize); + if (cpi->sf.variance_adaptive_quantization) { + int energy; + if (bsize <= BLOCK_16X16) { + energy = x->mb_energy; + } else { + energy = vp9_block_energy(cpi, x, bsize); + } + + xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy); + rdmult_ratio = vp9_vaq_rdmult_ratio(energy); + vp9_mb_init_quantizer(cpi, x); + } + if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); + if (cpi->sf.variance_adaptive_quantization) { + vp9_clear_system_state(); // __asm emms; + x->rdmult = round(x->rdmult * rdmult_ratio); + } + // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) { vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx, best_rd); - else - vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist, - bsize, ctx, best_rd); + } else { + if (bsize >= BLOCK_8X8) + vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col, + totalrate, totaldist, bsize, ctx, best_rd); + else + vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate, + totaldist, bsize, ctx, best_rd); + } + + if (cpi->sf.variance_adaptive_quantization) { + x->rdmult = orig_rdmult; + if (*totalrate != INT_MAX) { + vp9_clear_system_state(); // __asm emms; + *totalrate = round(*totalrate * rdmult_ratio); + } + } } static void update_stats(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi = xd->this_mi; + MODE_INFO *mi = xd->mi_8x8[0]; 
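The new variance-adaptive quantization path in pick_sb_modes (gated on cpi->sf.variance_adaptive_quantization) measures a block energy, maps it to a segment id and an RD-multiplier ratio, runs the mode search with the scaled multiplier, then restores the original and rescales the returned rate so callers compare costs on a common scale. A sketch of that scale/unscale pattern; vp9_vaq_segment_id() and vp9_vaq_rdmult_ratio() are the real hooks from vp9_vaq.h, everything else here is illustrative:

    #include <limits.h>
    #include <math.h>

    /* Scale the RD multiplier for the duration of the mode search, then
     * undo it.  INT_MAX marks "no usable mode found", as in the source. */
    static void mode_search_with_vaq(int *rdmult, int *rate,
                                     double rdmult_ratio) {
      const int orig_rdmult = *rdmult;

      *rdmult = (int)round(*rdmult * rdmult_ratio);  /* bias the search */

      /* ... rate-distortion mode selection runs here ... */

      *rdmult = orig_rdmult;                         /* restore */
      if (*rate != INT_MAX)
        *rate = (int)round(*rate * rdmult_ratio);    /* rescale the rate */
    }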
MB_MODE_INFO *const mbmi = &mi->mbmi; - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); @@ -637,49 +683,6 @@ static void update_stats(VP9_COMP *cpi) { [mbmi->ref_frame[0] != GOLDEN_FRAME]++; } } - - // Count of last ref frame 0,0 usage - if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME) - cpi->inter_zz_count++; - } -} - -// TODO(jingning): the variables used here are little complicated. need further -// refactoring on organizing the temporary buffers, when recursive -// partition down to 4x4 block size is enabled. -static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - - switch (bsize) { - case BLOCK_64X64: - return &x->sb64_context; - case BLOCK_64X32: - return &x->sb64x32_context[xd->sb_index]; - case BLOCK_32X64: - return &x->sb32x64_context[xd->sb_index]; - case BLOCK_32X32: - return &x->sb32_context[xd->sb_index]; - case BLOCK_32X16: - return &x->sb32x16_context[xd->sb_index][xd->mb_index]; - case BLOCK_16X32: - return &x->sb16x32_context[xd->sb_index][xd->mb_index]; - case BLOCK_16X16: - return &x->mb_context[xd->sb_index][xd->mb_index]; - case BLOCK_16X8: - return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index]; - case BLOCK_8X16: - return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index]; - case BLOCK_8X8: - return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index]; - case BLOCK_8X4: - return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index]; - case BLOCK_4X8: - return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index]; - case BLOCK_4X4: - return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index]; - default: - assert(0); - return NULL ; } } @@ -696,7 +699,7 @@ static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) { return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index]; default: assert(0); - return NULL ; + return NULL; } } @@ -705,7 +708,6 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; int p; @@ -715,28 +717,27 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, int mi_height = num_8x8_blocks_high_lookup[bsize]; for (p = 0; p < MAX_MB_PLANE; p++) { vpx_memcpy( - cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), + cpi->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), a + num_4x4_blocks_wide * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); vpx_memcpy( - cm->left_context[p] + cpi->left_context[p] + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), l + num_4x4_blocks_high * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); } - vpx_memcpy(cm->above_seg_context + mi_col, sa, - sizeof(PARTITION_CONTEXT) * mi_width); - vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl, - sizeof(PARTITION_CONTEXT) * mi_height); + vpx_memcpy(cpi->above_seg_context + mi_col, sa, + sizeof(*cpi->above_seg_context) * mi_width); + vpx_memcpy(cpi->left_seg_context + (mi_row & MI_MASK), sl, + sizeof(cpi->left_seg_context[0]) * mi_height); } static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * 
MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; const MACROBLOCK *const x = &cpi->mb; const MACROBLOCKD *const xd = &x->e_mbd; int p; @@ -749,23 +750,24 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, for (p = 0; p < MAX_MB_PLANE; ++p) { vpx_memcpy( a + num_4x4_blocks_wide * p, - cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), + cpi->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); vpx_memcpy( l + num_4x4_blocks_high * p, - cm->left_context[p] + cpi->left_context[p] + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); } - vpx_memcpy(sa, cm->above_seg_context + mi_col, - sizeof(PARTITION_CONTEXT) * mi_width); - vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK), - sizeof(PARTITION_CONTEXT) * mi_height); + vpx_memcpy(sa, cpi->above_seg_context + mi_col, + sizeof(*cpi->above_seg_context) * mi_width); + vpx_memcpy(sl, cpi->left_seg_context + (mi_row & MI_MASK), + sizeof(cpi->left_seg_context[0]) * mi_height); } -static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, +static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize, int sub_index) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; @@ -783,7 +785,7 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, if (xd->ab_index > 0) return; } - set_offsets(cpi, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); update_state(cpi, get_block_context(x, bsize), bsize, output_enabled); encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize); @@ -795,7 +797,8 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, } } -static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, +static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; @@ -812,8 +815,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, c1 = BLOCK_4X4; if (bsize >= BLOCK_8X8) { - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, bsize); c1 = *(get_sb_partitioning(x, bsize)); } partition = partition_lookup[bsl][c1]; @@ -822,19 +825,19 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, case PARTITION_NONE: if (output_enabled && bsize >= BLOCK_8X8) cpi->partition_count[pl][PARTITION_NONE]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, -1); break; case PARTITION_VERT: if (output_enabled) cpi->partition_count[pl][PARTITION_VERT]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tile, tp, mi_row, mi_col + bs, output_enabled, c1, 1); break; case PARTITION_HORZ: if (output_enabled) cpi->partition_count[pl][PARTITION_HORZ]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 
0); - encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tile, tp, mi_row + bs, mi_col, output_enabled, c1, 1); break; case PARTITION_SPLIT: subsize = get_subsize(bsize, PARTITION_SPLIT); @@ -846,7 +849,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, const int x_idx = i & 1, y_idx = i >> 1; *get_sb_index(xd, subsize) = i; - encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, + encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, output_enabled, subsize); } break; @@ -855,10 +858,9 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, break; } - if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) { - set_partition_seg_context(cm, xd, mi_row, mi_col); - update_partition_context(xd, c1, bsize); - } + if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) + update_partition_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, c1, bsize); } // Check to see if the given partition size is allowed for a specified number @@ -886,13 +888,13 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, // However, at the bottom and right borders of the image the requested size // may not be allowed in which case this code attempts to choose the largest // allowable partition. -static void set_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, - int mi_row, int mi_col) { +static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; BLOCK_SIZE bsize = cpi->sf.always_this_block_size; const int mis = cm->mode_info_stride; - int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row; - int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col; + int row8x8_remaining = tile->mi_row_end - mi_row; + int col8x8_remaining = tile->mi_col_end - mi_col; int block_row, block_col; MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col; int bh = num_8x8_blocks_high_lookup[bsize]; @@ -936,7 +938,7 @@ static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, for (block_col = 0; block_col < 8; ++block_col) { MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col]; BLOCK_SIZE sb_type = prev_mi ? 
prev_mi->mbmi.sb_type : 0; - int offset; + ptrdiff_t offset; if (prev_mi) { offset = prev_mi - cm->prev_mi; @@ -947,324 +949,29 @@ static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, } } -static void set_block_size(VP9_COMMON * const cm, MODE_INFO **mi_8x8, - BLOCK_SIZE bsize, int mis, int mi_row, - int mi_col) { - int r, c; - const int bs = MAX(num_8x8_blocks_wide_lookup[bsize], - num_8x8_blocks_high_lookup[bsize]); - const int idx_str = mis * mi_row + mi_col; - MODE_INFO **const mi2 = &mi_8x8[idx_str]; - - mi2[0] = cm->mi + idx_str; - mi2[0]->mbmi.sb_type = bsize; - - for (r = 0; r < bs; r++) - for (c = 0; c < bs; c++) - if (mi_row + r < cm->mi_rows && mi_col + c < cm->mi_cols) - mi2[r * mis + c] = mi2[0]; -} - -typedef struct { - int64_t sum_square_error; - int64_t sum_error; - int count; - int variance; -} var; - -typedef struct { - var none; - var horz[2]; - var vert[2]; -} partition_variance; - -#define VT(TYPE, BLOCKSIZE) \ - typedef struct { \ - partition_variance vt; \ - BLOCKSIZE split[4]; } TYPE; - -VT(v8x8, var) -VT(v16x16, v8x8) -VT(v32x32, v16x16) -VT(v64x64, v32x32) - -typedef struct { - partition_variance *vt; - var *split[4]; -} vt_node; - -typedef enum { - V16X16, - V32X32, - V64X64, -} TREE_LEVEL; - -static void tree_to_node(void *data, BLOCK_SIZE bsize, vt_node *node) { - int i; - switch (bsize) { - case BLOCK_64X64: { - v64x64 *vt = (v64x64 *) data; - node->vt = &vt->vt; - for (i = 0; i < 4; i++) - node->split[i] = &vt->split[i].vt.none; - break; - } - case BLOCK_32X32: { - v32x32 *vt = (v32x32 *) data; - node->vt = &vt->vt; - for (i = 0; i < 4; i++) - node->split[i] = &vt->split[i].vt.none; - break; - } - case BLOCK_16X16: { - v16x16 *vt = (v16x16 *) data; - node->vt = &vt->vt; - for (i = 0; i < 4; i++) - node->split[i] = &vt->split[i].vt.none; - break; - } - case BLOCK_8X8: { - v8x8 *vt = (v8x8 *) data; - node->vt = &vt->vt; - for (i = 0; i < 4; i++) - node->split[i] = &vt->split[i]; - break; - } - default: - node->vt = 0; - for (i = 0; i < 4; i++) - node->split[i] = 0; - assert(-1); - } -} - -// Set variance values given sum square error, sum error, count. -static void fill_variance(var *v, int64_t s2, int64_t s, int c) { - v->sum_square_error = s2; - v->sum_error = s; - v->count = c; - if (c > 0) - v->variance = 256 - * (v->sum_square_error - v->sum_error * v->sum_error / v->count) - / v->count; - else - v->variance = 0; -} - -// Combine 2 variance structures by summing the sum_error, sum_square_error, -// and counts and then calculating the new variance. 
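The variance containers above (this block is being removed by the commit) keep only (sum_error, sum_square_error, count) per region, because that triple is closed under merging: two child regions combine by adding fields, and the variance is re-derived on demand via the identity Var(X) = E[X^2] - E[X]^2, scaled by 256 to keep 8 fractional bits in integer arithmetic. Restated compactly:

    #include <stdint.h>

    /* Var(X) = E[X^2] - E[X]^2 in integer arithmetic with 8 fractional
     * bits; mirrors the fill_variance() helper removed above. */
    static int block_variance(int64_t sum_sq, int64_t sum, int count) {
      if (count <= 0)
        return 0;
      return (int)(256 * (sum_sq - sum * sum / count) / count);
    }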
-void sum_2_variances(var *r, var *a, var*b) { - fill_variance(r, a->sum_square_error + b->sum_square_error, - a->sum_error + b->sum_error, a->count + b->count); -} - -static void fill_variance_tree(void *data, BLOCK_SIZE bsize) { - vt_node node; - tree_to_node(data, bsize, &node); - sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]); - sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]); - sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]); - sum_2_variances(&node.vt->vert[1], node.split[1], node.split[3]); - sum_2_variances(&node.vt->none, &node.vt->vert[0], &node.vt->vert[1]); -} - -#if PERFORM_RANDOM_PARTITIONING -static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, - BLOCK_SIZE block_size, int mi_row, - int mi_col, int mi_size) { - VP9_COMMON * const cm = &cpi->common; - vt_node vt; - const int mis = cm->mode_info_stride; - int64_t threshold = 4 * cpi->common.base_qindex * cpi->common.base_qindex; - - tree_to_node(data, block_size, &vt); - - // split none is available only if we have more than half a block size - // in width and height inside the visible image - if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows && - (rand() & 3) < 1) { - set_block_size(cm, m, block_size, mis, mi_row, mi_col); - return 1; - } - - // vertical split is available on all but the bottom border - if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold - && (rand() & 3) < 1) { - set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row, - mi_col); - return 1; - } - - // horizontal split is available on all but the right border - if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold - && (rand() & 3) < 1) { - set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row, - mi_col); - return 1; - } - - return 0; -} - -#else // !PERFORM_RANDOM_PARTITIONING - -static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO **m, - BLOCK_SIZE bsize, int mi_row, - int mi_col, int mi_size) { - VP9_COMMON * const cm = &cpi->common; - vt_node vt; - const int mis = cm->mode_info_stride; - int64_t threshold = 50 * cpi->common.base_qindex; - - tree_to_node(data, bsize, &vt); - - // split none is available only if we have more than half a block size - // in width and height inside the visible image - if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows - && vt.vt->none.variance < threshold) { - set_block_size(cm, m, bsize, mis, mi_row, mi_col); - return 1; - } - - // vertical split is available on all but the bottom border - if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold - && vt.vt->vert[1].variance < threshold) { - set_block_size(cm, m, get_subsize(bsize, PARTITION_VERT), mis, mi_row, - mi_col); - return 1; - } - - // horizontal split is available on all but the right border - if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold - && vt.vt->horz[1].variance < threshold) { - set_block_size(cm, m, get_subsize(bsize, PARTITION_HORZ), mis, mi_row, - mi_col); - return 1; - } - - return 0; -} -#endif // PERFORM_RANDOM_PARTITIONING - -static void choose_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, - int mi_row, int mi_col) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK *x = &cpi->mb; - MACROBLOCKD *xd = &cpi->mb.e_mbd; +static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) { + VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; - // TODO(JBB): More experimentation or 
testing of this threshold; - int64_t threshold = 4; - int i, j, k; - v64x64 vt; - unsigned char * s; - int sp; - const unsigned char * d; - int dp; - int pixels_wide = 64, pixels_high = 64; - - vp9_zero(vt); - set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - - if (xd->mb_to_right_edge < 0) - pixels_wide += (xd->mb_to_right_edge >> 3); - - if (xd->mb_to_bottom_edge < 0) - pixels_high += (xd->mb_to_bottom_edge >> 3); - - s = x->plane[0].src.buf; - sp = x->plane[0].src.stride; - - // TODO(JBB): Clearly the higher the quantizer the fewer partitions we want - // but this needs more experimentation. - threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex; - - d = vp9_64x64_zeros; - dp = 64; - if (cm->frame_type != KEY_FRAME) { - int_mv nearest_mv, near_mv; - const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, LAST_FRAME)]; - YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx]; - YV12_BUFFER_CONFIG *second_ref_fb = NULL; - - setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col, - &xd->scale_factor[0]); - setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, - &xd->scale_factor[1]); + int block_row, block_col; - xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME; - xd->this_mi->mbmi.sb_type = BLOCK_64X64; - vp9_find_best_ref_mvs(xd, - mi_8x8[0]->mbmi.ref_mvs[mi_8x8[0]->mbmi.ref_frame[0]], - &nearest_mv, &near_mv); - - xd->this_mi->mbmi.mv[0] = nearest_mv; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64); - - d = xd->plane[0].dst.buf; - dp = xd->plane[0].dst.stride; - } - - // Fill in the entire tree of 8x8 variances for splits. - for (i = 0; i < 4; i++) { - const int x32_idx = ((i & 1) << 5); - const int y32_idx = ((i >> 1) << 5); - for (j = 0; j < 4; j++) { - const int x16_idx = x32_idx + ((j & 1) << 4); - const int y16_idx = y32_idx + ((j >> 1) << 4); - v16x16 *vst = &vt.split[i].split[j]; - for (k = 0; k < 4; k++) { - int x_idx = x16_idx + ((k & 1) << 3); - int y_idx = y16_idx + ((k >> 1) << 3); - unsigned int sse = 0; - int sum = 0; - if (x_idx < pixels_wide && y_idx < pixels_high) - vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp, - d + y_idx * dp + x_idx, dp, &sse, &sum); - fill_variance(&vst->split[k].vt.none, sse, sum, 64); - } - } - } - // Fill the rest of the variance tree by summing the split partition - // values. - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16); - } - fill_variance_tree(&vt.split[i], BLOCK_32X32); - } - fill_variance_tree(&vt, BLOCK_64X64); - // Now go through the entire structure, splitting every block size until - // we get to one that's got a variance lower than our threshold, or we - // hit 8x8. 
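The removed choose_partitioning() path (the PERFORM_RANDOM_PARTITIONING variant aside) works top-down, as the comment above and the cascade below show: accept a block size as soon as its measured variance falls under a quantizer-scaled threshold, otherwise try the one-sided splits, otherwise recurse, bottoming out at 8x8. A toy outline of that control flow, with the variance source and the threshold scaling abstracted away:

    /* Toy outline of the removed top-down walk.  The real code starts
     * at 64x64, also tries PARTITION_HORZ/VERT before recursing, and
     * scales the threshold by base_qindex. */
    typedef int (*variance_fn)(int row, int col, int size);

    static void partition_walk(variance_fn var, int row, int col,
                               int size, int threshold) {
      if (size == 8 || var(row, col, size) < threshold)
        return;  /* emit one size x size block at (row, col) */
      partition_walk(var, row,            col,            size / 2, threshold);
      partition_walk(var, row,            col + size / 2, size / 2, threshold);
      partition_walk(var, row + size / 2, col,            size / 2, threshold);
      partition_walk(var, row + size / 2, col + size / 2, size / 2, threshold);
    }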
- if (!set_vt_partitioning(cpi, &vt, mi_8x8, BLOCK_64X64, mi_row, mi_col, - 4)) { - for (i = 0; i < 4; ++i) { - const int x32_idx = ((i & 1) << 2); - const int y32_idx = ((i >> 1) << 2); - if (!set_vt_partitioning(cpi, &vt.split[i], mi_8x8, BLOCK_32X32, - (mi_row + y32_idx), (mi_col + x32_idx), 2)) { - for (j = 0; j < 4; ++j) { - const int x16_idx = ((j & 1) << 1); - const int y16_idx = ((j >> 1) << 1); - if (!set_vt_partitioning(cpi, &vt.split[i].split[j], mi_8x8, - BLOCK_16X16, - (mi_row + y32_idx + y16_idx), - (mi_col + x32_idx + x16_idx), 1)) { - for (k = 0; k < 4; ++k) { - const int x8_idx = (k & 1); - const int y8_idx = (k >> 1); - set_block_size(cm, mi_8x8, BLOCK_8X8, mis, - (mi_row + y32_idx + y16_idx + y8_idx), - (mi_col + x32_idx + x16_idx + x8_idx)); - } - } + if (cm->prev_mi) { + for (block_row = 0; block_row < 8; ++block_row) { + for (block_col = 0; block_col < 8; ++block_col) { + MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col]; + if (prev_mi) { + if (abs(prev_mi->mbmi.mv[0].as_mv.row) >= 8 || + abs(prev_mi->mbmi.mv[0].as_mv.col) >= 8) + return 1; } } } } + return 0; } -static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, +static void rd_use_partition(VP9_COMP *cpi, + const TileInfo *const tile, + MODE_INFO **mi_8x8, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon) { @@ -1315,6 +1022,11 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, } save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + if (bsize == BLOCK_16X16) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + x->mb_energy = vp9_block_energy(cpi, x, bsize); + } + x->fast_ms = 0; x->subblock_ref = 0; @@ -1338,11 +1050,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, mi_row + (ms >> 1) < cm->mi_rows && mi_col + (ms >> 1) < cm->mi_cols) { *(get_sb_partitioning(x, bsize)) = bsize; - pick_sb_modes(cpi, mi_row, mi_col, &none_rate, &none_dist, bsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize, get_block_context(x, bsize), INT64_MAX); - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); none_rate += x->partition_cost[pl][PARTITION_NONE]; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1353,12 +1066,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, switch (partition) { case PARTITION_NONE: - pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, bsize, get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: *get_sb_index(xd, subsize) = 0; - pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) { @@ -1367,7 +1080,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(xd, subsize) = 1; - pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, + pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; 
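In place of the variance-based partitioner, the newly added sb_has_motion() gates reuse of the previous frame's partitioning (see its call in encode_sb_row further down): with use_lastframe_partitioning set to LAST_FRAME_PARTITION_LOW_MOTION, a 64x64 block only inherits last frame's layout when every co-located 8x8 unit moved by less than 8 motion-vector units, i.e. less than one full pixel at the codec's 1/8-pel precision. A condensed restatement with the mode-info plumbing stripped out:

    #include <stdlib.h>

    typedef struct { int row, col; } MV;  /* stands in for the real int_mv */

    /* Returns 1 when any previous-frame 8x8 unit in the superblock moved
     * a full pixel or more (8 units at 1/8-pel precision); the caller
     * then re-runs the partition search instead of copying last frame's. */
    static int sb_motion_check(const MV *prev_mvs, int count) {
      int i;
      for (i = 0; i < count; ++i)
        if (abs(prev_mvs[i].row) >= 8 || abs(prev_mvs[i].col) >= 8)
          return 1;
      return 0;
    }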
@@ -1381,7 +1094,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, break; case PARTITION_VERT: *get_sb_index(xd, subsize) = 0; - pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) { @@ -1390,7 +1103,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(xd, subsize) = 1; - pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, + pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; @@ -1417,7 +1130,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, *get_sb_index(xd, subsize) = i; - rd_use_partition(cpi, mi_8x8 + jj * bss * mis + ii * bss, tp, + rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, i != 3); if (rt == INT_MAX || dt == INT_MAX) { @@ -1432,8 +1145,9 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, default: assert(0); } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); + + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, bsize); if (last_part_rate < INT_MAX) last_part_rate += x->partition_cost[pl][partition]; @@ -1465,7 +1179,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, &rt, &dt, + pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt, split_subsize, get_block_context(x, split_subsize), INT64_MAX); @@ -1478,17 +1192,18 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, } if (i != 3) - encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0, + encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0, split_subsize); split_rate += rt; split_dist += dt; - set_partition_seg_context(cm, xd, mi_row + y_idx, mi_col + x_idx); - pl = partition_plane_context(xd, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row + y_idx, mi_col + x_idx, bsize); split_rate += x->partition_cost[pl][PARTITION_NONE]; } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, bsize); if (split_rate < INT_MAX) { split_rate += x->partition_cost[pl][PARTITION_SPLIT]; @@ -1523,7 +1238,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX); if (do_recon) - encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); + encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); *rate = chosen_rate; *dist = chosen_dist; @@ -1571,66 +1286,68 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, // Look at neighboring blocks and set a min and max partition size based on // what they chose. 
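rd_use_partition() above prices each candidate (keep the inherited partition, or force a 4-way split) as its rate plus the partition signalling cost from x->partition_cost[pl][...], then compares candidates through the rate-distortion lagrangian. The RDCOST macro itself lives in vp9_rdopt.h, not in this diff; the form below is the usual libvpx definition of this era and should be read as an assumption:

    #include <stdint.h>

    /* Assumed form of RDCOST: rate weighted by rdmult/256 plus
     * distortion scaled by 2^rddiv. */
    static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
      return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
    }

    /* rd_use_partition-style choice between the inherited layout and a
     * forced split; both rates already include the signalling cost. */
    static int prefer_split(int rdmult, int rddiv,
                            int last_rate, int64_t last_dist,
                            int split_rate, int64_t split_dist) {
      return rd_cost(rdmult, rddiv, split_rate, split_dist) <
             rd_cost(rdmult, rddiv, last_rate, last_dist);
    }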
-static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col, +static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, + int row, int col, BLOCK_SIZE *min_block_size, BLOCK_SIZE *max_block_size) { + VP9_COMMON * const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; MODE_INFO ** mi_8x8 = xd->mi_8x8; + MODE_INFO ** prev_mi_8x8 = xd->prev_mi_8x8; + const int left_in_image = xd->left_available && mi_8x8[-1]; const int above_in_image = xd->up_available && mi_8x8[-xd->mode_info_stride]; MODE_INFO ** above_sb64_mi_8x8; MODE_INFO ** left_sb64_mi_8x8; - // Frequency check - if (cpi->sf.auto_min_max_partition_count <= 0) { - cpi->sf.auto_min_max_partition_count = - cpi->sf.auto_min_max_partition_interval; + int row8x8_remaining = tile->mi_row_end - row; + int col8x8_remaining = tile->mi_col_end - col; + int bh, bw; + + // Trap case where we do not have a prediction. + if (!left_in_image && !above_in_image && + ((cm->frame_type == KEY_FRAME) || !cm->prev_mi)) { *min_block_size = BLOCK_4X4; *max_block_size = BLOCK_64X64; } else { - --cpi->sf.auto_min_max_partition_count; - - // Set default values if no left or above neighbour - if (!left_in_image && !above_in_image) { - *min_block_size = BLOCK_4X4; - *max_block_size = BLOCK_64X64; - } else { - VP9_COMMON *const cm = &cpi->common; - int row8x8_remaining = cm->cur_tile_mi_row_end - row; - int col8x8_remaining = cm->cur_tile_mi_col_end - col; - int bh, bw; - - // Default "min to max" and "max to min" - *min_block_size = BLOCK_64X64; - *max_block_size = BLOCK_4X4; - - // Find the min and max partition sizes used in the left SB64 - if (left_in_image) { - left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE]; - get_sb_partition_size_range(cpi, left_sb64_mi_8x8, - min_block_size, max_block_size); - } - - // Find the min and max partition sizes used in the above SB64 taking - // the values found for left as a starting point. - if (above_in_image) { - above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE]; - get_sb_partition_size_range(cpi, above_sb64_mi_8x8, - min_block_size, max_block_size); - } + // Default "min to max" and "max to min" + *min_block_size = BLOCK_64X64; + *max_block_size = BLOCK_4X4; + + // NOTE: each call to get_sb_partition_size_range() uses the previous + // passed in values for min and max as a starting point. + // + // Find the min and max partition used in previous frame at this location + if (cm->prev_mi && (cm->frame_type != KEY_FRAME)) { + get_sb_partition_size_range(cpi, prev_mi_8x8, + min_block_size, max_block_size); + } - // Give a bit of leaway either side of the observed min and max - *min_block_size = min_partition_size[*min_block_size]; - *max_block_size = max_partition_size[*max_block_size]; + // Find the min and max partition sizes used in the left SB64 + if (left_in_image) { + left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE]; + get_sb_partition_size_range(cpi, left_sb64_mi_8x8, + min_block_size, max_block_size); + } - // Check border cases where max and min from neighbours may not be legal. - *max_block_size = find_partition_size(*max_block_size, - row8x8_remaining, col8x8_remaining, - &bh, &bw); - *min_block_size = MIN(*min_block_size, *max_block_size); + // Find the min and max partition sizes used in the above SB64. 
+ if (above_in_image) { + above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE]; + get_sb_partition_size_range(cpi, above_sb64_mi_8x8, + min_block_size, max_block_size); } } + + // Give a bit of leaway either side of the observed min and max + *min_block_size = min_partition_size[*min_block_size]; + *max_block_size = max_partition_size[*max_block_size]; + + // Check border cases where max and min from neighbours may not be legal. + *max_block_size = find_partition_size(*max_block_size, + row8x8_remaining, col8x8_remaining, + &bh, &bw); + *min_block_size = MIN(*min_block_size, *max_block_size); } static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { @@ -1735,7 +1452,8 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. -static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, +static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, int64_t best_rd) { VP9_COMMON * const cm = &cpi->common; @@ -1772,7 +1490,13 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, return; } } - assert(mi_height_log2(bsize) == mi_width_log2(bsize)); + assert(num_8x8_blocks_wide_lookup[bsize] == + num_8x8_blocks_high_lookup[bsize]); + + if (bsize == BLOCK_16X16) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + x->mb_energy = vp9_block_energy(cpi, x, bsize); + } // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. 
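The reworked rd_auto_partition_range() above seeds the legal [min, max] block-size range from up to three neighbourhoods (the co-located SB64 of the previous frame, the left SB64, and the above SB64), widens the observed range through the min_partition_size/max_partition_size lookup tables, then clamps the max so it still fits in the rows and columns remaining in the tile; the old frequency-counter gating is dropped. A sketch over log2 block sizes (2 = 4x4 up to 6 = 64x64), approximating the lookup tables with one step of leeway each way:

    /* Seed-widen-clamp flow of the auto partition range.  observed[]
     * holds the log2 sizes seen in the neighbouring superblocks. */
    static void auto_partition_range(const int *observed, int n,
                                     int largest_fitting,
                                     int *min_bsl, int *max_bsl) {
      int i;
      *min_bsl = 6;  /* "min to max": start at 64x64 and pull down */
      *max_bsl = 2;  /* "max to min": start at 4x4 and pull up */
      for (i = 0; i < n; ++i) {
        if (observed[i] < *min_bsl) *min_bsl = observed[i];
        if (observed[i] > *max_bsl) *max_bsl = observed[i];
      }
      if (*min_bsl > 2) --*min_bsl;    /* leeway below the observed min */
      if (*max_bsl < 6) ++*max_bsl;    /* leeway above the observed max */
      if (*max_bsl > largest_fitting)  /* border clamp, as in        */
        *max_bsl = largest_fitting;    /* find_partition_size()      */
      if (*min_bsl > *max_bsl) *min_bsl = *max_bsl;
    }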
@@ -1807,12 +1531,13 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, // PARTITION_NONE if (partition_none_allowed) { - pick_sb_modes(cpi, mi_row, mi_col, &this_rate, &this_dist, bsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, get_block_context(x, bsize), best_rd); if (this_rate != INT_MAX) { if (bsize >= BLOCK_8X8) { - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); this_rate += x->partition_cost[pl][PARTITION_NONE]; } sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); @@ -1860,7 +1585,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = i; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, + rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize, &this_rate, &this_dist, i != 3, best_rd - sum_rd); if (this_rate == INT_MAX) { @@ -1872,8 +1597,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } if (sum_rd < best_rd && i == 4) { - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); sum_rate += x->partition_cost[pl][PARTITION_SPLIT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { @@ -1881,12 +1607,12 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, best_dist = sum_dist; best_rd = sum_rd; *(get_sb_partitioning(x, bsize)) = subsize; - } else { - // skip rectangular partition test when larger block size - // gives better rd cost - if (cpi->sf.less_rectangular_check) - do_rect &= !partition_none_allowed; } + } else { + // skip rectangular partition test when larger block size + // gives better rd cost + if (cpi->sf.less_rectangular_check) + do_rect &= !partition_none_allowed; } partition_split_done = 1; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1906,7 +1632,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, get_block_context(x, subsize), best_rd); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); @@ -1917,7 +1643,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate, + pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate, &this_dist, subsize, get_block_context(x, subsize), best_rd - sum_rd); if (this_rate == INT_MAX) { @@ -1929,8 +1655,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } if (sum_rd < best_rd) { - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); sum_rate += x->partition_cost[pl][PARTITION_HORZ]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { @@ 
-1950,7 +1677,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 0; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, get_block_context(x, subsize), best_rd); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { @@ -1960,7 +1687,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *get_sb_index(xd, subsize) = 1; if (cpi->sf.adaptive_motion_search) load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate, + pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, &this_dist, subsize, get_block_context(x, subsize), best_rd - sum_rd); if (this_rate == INT_MAX) { @@ -1972,8 +1699,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } if (sum_rd < best_rd) { - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); + pl = partition_plane_context(cpi->above_seg_context, + cpi->left_seg_context, + mi_row, mi_col, bsize); sum_rate += x->partition_cost[pl][PARTITION_VERT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { @@ -1991,7 +1719,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, *dist = best_dist; if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) - encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); + encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); if (bsize == BLOCK_64X64) { assert(tp_orig < *tp); assert(best_rate < INT_MAX); @@ -2002,10 +1730,10 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } // Examines 64x64 block and chooses a best reference frame -static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { +static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl; int ms = bs / 2; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; @@ -2023,10 +1751,10 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { if ((mi_row + (ms >> 1) < cm->mi_rows) && (mi_col + (ms >> 1) < cm->mi_cols)) { cpi->set_ref_frame_mask = 1; - pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_64X64, + pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, get_block_context(x, BLOCK_64X64), INT64_MAX); - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, BLOCK_64X64); + pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, + mi_row, mi_col, BLOCK_64X64); r += x->partition_cost[pl][PARTITION_NONE]; *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64; @@ -2036,27 +1764,27 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); } -static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, - int *totalrate) { +static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, TOKENEXTRA **tp, int *totalrate) { VP9_COMMON * const cm = &cpi->common; int mi_col; // Initialize the left context for the new SB row - 
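[Editor's sketch] Each candidate partition trial above mutates the above/left entropy and segment contexts as a side effect of trial encoding, which is why every branch ends in restore_context(). A minimal sketch of the snapshot pattern, assuming a save_context() counterpart with the same argument layout as the restore_context() calls visible above:

    ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], l[16 * MAX_MB_PLANE];
    PARTITION_CONTEXT sa[8], sl[8];

    save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);     /* snapshot */
    /* ... trial-encode one candidate partitioning ... */
    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);  /* rewind */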
vpx_memset(&cm->left_context, 0, sizeof(cm->left_context)); - vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context)); + vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context)); + vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context)); // Code each SB in the row - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) { int dummy_rate; int64_t dummy_dist; - vpx_memset(cpi->mb.pred_mv, 0, sizeof(cpi->mb.pred_mv)); + vp9_zero(cpi->mb.pred_mv); if (cpi->sf.reference_masking) - rd_pick_reference_frame(cpi, mi_row, mi_col); + rd_pick_reference_frame(cpi, tile, mi_row, mi_col); - if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning || + if (cpi->sf.use_lastframe_partitioning || cpi->sf.use_one_partition_size_always ) { const int idx_str = cm->mode_info_stride * mi_row + mi_col; MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str; @@ -2064,13 +1792,9 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, cpi->mb.source_variance = UINT_MAX; if (cpi->sf.use_one_partition_size_always) { - set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - set_partitioning(cpi, mi_8x8, mi_row, mi_col); - rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1); - } else if (cpi->sf.partition_by_variance) { - choose_partitioning(cpi, cm->mi_grid_visible, mi_row, mi_col); - rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col); + rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } else { if ((cpi->common.current_video_frame @@ -2078,31 +1802,34 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, || cm->prev_mi == 0 || cpi->common.show_frame == 0 || cpi->common.frame_type == KEY_FRAME - || cpi->is_src_frame_alt_ref) { + || cpi->is_src_frame_alt_ref + || ((cpi->sf.use_lastframe_partitioning == + LAST_FRAME_PARTITION_LOW_MOTION) && + sb_has_motion(cpi, prev_mi_8x8))) { // If required set upper and lower partition size limits if (cpi->sf.auto_min_max_partition_size) { - set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - rd_auto_partition_range(cpi, mi_row, mi_col, + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + rd_auto_partition_range(cpi, tile, mi_row, mi_col, &cpi->sf.min_partition_size, &cpi->sf.max_partition_size); } - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64, + rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } else { copy_partitioning(cpi, mi_8x8, prev_mi_8x8); - rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, + rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } } } else { // If required set upper and lower partition size limits if (cpi->sf.auto_min_max_partition_size) { - set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); - rd_auto_partition_range(cpi, mi_row, mi_col, + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + rd_auto_partition_range(cpi, tile, mi_row, mi_col, &cpi->sf.min_partition_size, &cpi->sf.max_partition_size); } - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64, + rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } } @@ -2120,7 +1847,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { 
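[Editor's sketch] The gating above decides, per 64x64 superblock, between a fresh partition search and reusing the previous frame's partitioning. A condensed sketch of the decision, with condition names as in the hunk and argument lists abbreviated (the frame-count modulus term at the top of the condition is elided here):

    /* Re-search when the previous frame's partitioning is unusable or,
     * in LOW_MOTION mode, when this superblock actually moved. */
    const int must_search =
        cm->prev_mi == 0 || !cm->show_frame ||
        cm->frame_type == KEY_FRAME || cpi->is_src_frame_alt_ref ||
        (cpi->sf.use_lastframe_partitioning ==
             LAST_FRAME_PARTITION_LOW_MOTION &&
         sb_has_motion(cpi, prev_mi_8x8));

    if (must_search) {
      rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, ...);
    } else {
      copy_partitioning(cpi, mi_8x8, prev_mi_8x8);
      rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, ...);
    }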
xd->mode_info_stride = cm->mode_info_stride; // reset intra mode contexts - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) vp9_init_mbmode_probs(cm); // Copy data over into macro block data structures. @@ -2129,16 +1856,16 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { // TODO(jkoleszar): are these initializations required? setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], 0, 0, NULL); - setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0); + setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0); setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); - xd->this_mi->mbmi.mode = DC_PRED; - xd->this_mi->mbmi.uv_mode = DC_PRED; + xd->mi_8x8[0]->mbmi.mode = DC_PRED; + xd->mi_8x8[0]->mbmi.uv_mode = DC_PRED; - vp9_zero(cpi->y_mode_count) - vp9_zero(cpi->y_uv_mode_count) - vp9_zero(cm->counts.inter_mode) + vp9_zero(cpi->y_mode_count); + vp9_zero(cpi->y_uv_mode_count); + vp9_zero(cm->counts.inter_mode); vp9_zero(cpi->partition_count); vp9_zero(cpi->intra_inter_count); vp9_zero(cpi->comp_inter_count); @@ -2149,29 +1876,26 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. - vpx_memset(cm->above_context[0], 0, - sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols); - vpx_memset(cm->above_seg_context, 0, - sizeof(PARTITION_CONTEXT) * aligned_mi_cols); + vpx_memset(cpi->above_context[0], 0, + sizeof(*cpi->above_context[0]) * + 2 * aligned_mi_cols * MAX_MB_PLANE); + vpx_memset(cpi->above_seg_context, 0, + sizeof(*cpi->above_seg_context) * aligned_mi_cols); } static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { // printf("Switching to lossless\n"); - cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; - cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; - cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; + cpi->mb.fwd_txm4x4 = vp9_fwht4x4; + cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; cpi->mb.optimize = 0; cpi->common.lf.filter_level = 0; cpi->zbin_mode_boost_enabled = 0; cpi->common.tx_mode = ONLY_4X4; } else { // printf("Not lossless\n"); - cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; - cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; - cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; - cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add; + cpi->mb.fwd_txm4x4 = vp9_fdct4x4; + cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; } } @@ -2204,21 +1928,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { totalrate = 0; - // Reset frame count of inter 0,0 motion vector usage. 
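[Editor's sketch] For context on the switch_lossless_mode() hunk above: lossless coding needs a transform whose integer inverse reconstructs the input exactly, which the 4x4 Walsh-Hadamard pair (vp9_fwht4x4 / vp9_iwht4x4_add) provides and the DCT does not. Restating the swap with the rationale spelled out:

    if (lossless) {
      /* WHT: bit-exact round trip, residual survives quantize at q=0 */
      cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
      cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
      cpi->common.tx_mode = ONLY_4X4;   /* larger sizes have no WHT */
    } else {
      cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
      cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
    }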
- cpi->inter_zz_count = 0; - vp9_zero(cm->counts.switchable_interp); - vp9_zero(cpi->txfm_stepdown_count); + vp9_zero(cpi->tx_stepdown_count); xd->mi_8x8 = cm->mi_grid_visible; // required for vp9_frame_init_quantizer - xd->this_mi = xd->mi_8x8[0] = cm->mi; - xd->mic_stream_ptr = cm->mi; xd->last_mi = cm->prev_mi; - vp9_zero(cpi->NMVcount); vp9_zero(cpi->coef_counts); vp9_zero(cm->counts.eob_branch); @@ -2229,7 +1947,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_frame_init_quantizer(cpi); - vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q); + vp9_initialize_rd_consts(cpi); vp9_initialize_me_consts(cpi, cm->base_qindex); switch_tx_mode(cpi); @@ -2263,16 +1981,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { const int tile_rows = 1 << cm->log2_tile_rows; for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(cm, tile_row); - for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileInfo tile; TOKENEXTRA *tp_old = tp; // For each row of SBs in the frame - vp9_get_tile_col_offsets(cm, tile_col); - for (mi_row = cm->cur_tile_mi_row_start; - mi_row < cm->cur_tile_mi_row_end; mi_row += 8) - encode_sb_row(cpi, mi_row, &tp, &totalrate); + vp9_tile_init(&tile, cm, tile_row, tile_col); + for (mi_row = tile.mi_row_start; + mi_row < tile.mi_row_end; mi_row += 8) + encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate); cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); @@ -2306,7 +2023,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { // Keep record of the total distortion this time around for future use cpi->last_frame_distortion = cpi->frame_distortion; #endif - } static int check_dual_ref_flags(VP9_COMP *cpi) { @@ -2347,18 +2063,19 @@ static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, int mis, TX_SIZE max_tx_size, int bw, int bh, int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; - MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi; - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) { return; - - if (mbmi->tx_size > max_tx_size) { - const int ymbs = MIN(bh, cm->mi_rows - mi_row); - const int xmbs = MIN(bw, cm->mi_cols - mi_col); - - assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || - get_skip_flag(mi_8x8, mis, ymbs, xmbs)); - set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size); + } else { + MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi; + if (mbmi->tx_size > max_tx_size) { + const int ymbs = MIN(bh, cm->mi_rows - mi_row); + const int xmbs = MIN(bw, cm->mi_cols - mi_col); + + assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || + get_skip_flag(mi_8x8, mis, ymbs, xmbs)); + set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size); + } } } @@ -2424,7 +2141,7 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) { static int get_frame_type(VP9_COMP *cpi) { int frame_type; - if (cpi->common.frame_type == KEY_FRAME) + if (frame_is_intra_only(&cpi->common)) frame_type = 0; else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) frame_type = 3; @@ -2453,9 +2170,9 @@ static void select_tx_mode(VP9_COMP *cpi) { unsigned int total = 0; int i; for (i = 0; i < TX_SIZES; ++i) - total += cpi->txfm_stepdown_count[i]; + total += cpi->tx_stepdown_count[i]; if (total) { - double fraction = (double)cpi->txfm_stepdown_count[0] / total; + double fraction = (double)cpi->tx_stepdown_count[0] / total; 
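[Editor's sketch] The tiling rewrite visible above replaces the cm->cur_tile_* globals with an explicit TileInfo passed down the call chain, which is what makes per-tile encoding reentrant (and is the encoder-side half of the multithreaded tile support this roll brings in). A minimal sketch of the new walk, arguments abbreviated:

    TileInfo tile;
    int tile_row, tile_col, mi_row;

    for (tile_row = 0; tile_row < tile_rows; tile_row++) {
      for (tile_col = 0; tile_col < tile_cols; tile_col++) {
        /* fills tile.mi_{row,col}_{start,end} from cm's tile log2 sizes */
        vp9_tile_init(&tile, cm, tile_row, tile_col);
        for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
             mi_row += MI_BLOCK_SIZE)
          encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate);
      }
    }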
cpi->common.tx_mode = fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT; // printf("fraction = %f\n", fraction); } // else keep unchanged @@ -2472,21 +2189,23 @@ void vp9_encode_frame(VP9_COMP *cpi) { // requires further work in the rd loop. For now the only supported encoder // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] - == cm->ref_frame_sign_bias[GOLDEN_FRAME]) - || (cm->ref_frame_sign_bias[ALTREF_FRAME] - == cm->ref_frame_sign_bias[LAST_FRAME])) { - cm->allow_comp_inter_inter = 0; - } else { - cm->allow_comp_inter_inter = 1; - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; + if (!frame_is_intra_only(cm)) { + if ((cm->ref_frame_sign_bias[ALTREF_FRAME] + == cm->ref_frame_sign_bias[GOLDEN_FRAME]) + || (cm->ref_frame_sign_bias[ALTREF_FRAME] + == cm->ref_frame_sign_bias[LAST_FRAME])) { + cm->allow_comp_inter_inter = 0; + } else { + cm->allow_comp_inter_inter = 1; + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } } if (cpi->sf.RD) { int i, pred_type; - INTERPOLATIONFILTERTYPE filter_type; + INTERPOLATION_TYPE filter_type; /* * This code does a single RD pass over the whole frame assuming * either compound, single or hybrid prediction as per whatever has @@ -2554,7 +2273,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { cpi->rd_prediction_type_threshes[frame_type][i] >>= 1; } - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs; cpi->rd_filter_threshes[frame_type][i] = (cpi->rd_filter_threshes[frame_type][i] + diff) / 2; @@ -2627,7 +2346,6 @@ void vp9_encode_frame(VP9_COMP *cpi) { } else { encode_frame_internal(cpi); } - } static void sum_intra_stats(VP9_COMP *cpi, const MODE_INFO *mi) { @@ -2732,7 +2450,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])]; YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx]; YV12_BUFFER_CONFIG *second_ref_fb = NULL; - if (mbmi->ref_frame[1] > 0) { + if (has_second_ref(mbmi)) { idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[1])]; second_ref_fb = &cm->yv12_fb[idx]; } @@ -2744,7 +2462,6 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, &xd->scale_factor[1]); - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); } @@ -2770,7 +2487,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, (mbmi->skip_coeff || vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { const uint8_t context = vp9_get_pred_context_tx_size(xd); - update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx); + ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size]; } else { int x, y; TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode]; diff --git a/libvpx/vp9/encoder/vp9_encodeframe.h b/libvpx/vp9/encoder/vp9_encodeframe.h index 3991969..3e9f538 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/libvpx/vp9/encoder/vp9_encodeframe.h @@ -17,6 +17,6 @@ struct yv12_buffer_config; void vp9_setup_src_planes(struct macroblock *x, const struct yv12_buffer_config *src, - int mb_row, int mb_col); + int mi_row, int mi_col); #endif // VP9_ENCODER_VP9_ENCODEFRAME_H_ diff --git a/libvpx/vp9/encoder/vp9_encodeintra.c 
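[Editor's sketch] On the allow_comp_inter_inter hunk above, now correctly skipped for intra-only frames: compound prediction pairs one "fixed" reference with one "variable" reference, and it is only enabled when the alt-ref frame's sign bias differs from both of the other references, i.e. it sits on the opposite temporal side. Condensed:

    const int alt = cm->ref_frame_sign_bias[ALTREF_FRAME];
    cm->allow_comp_inter_inter =
        alt != cm->ref_frame_sign_bias[GOLDEN_FRAME] &&
        alt != cm->ref_frame_sign_bias[LAST_FRAME];
    if (cm->allow_comp_inter_inter) {
      cm->comp_fixed_ref = ALTREF_FRAME;   /* the lone opposite-side ref */
      cm->comp_var_ref[0] = LAST_FRAME;
      cm->comp_var_ref[1] = GOLDEN_FRAME;
    }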
b/libvpx/vp9/encoder/vp9_encodeintra.c index c5e5dff..32b4593 100644 --- a/libvpx/vp9/encoder/vp9_encodeintra.c +++ b/libvpx/vp9/encoder/vp9_encodeintra.c @@ -9,7 +9,7 @@ */ #include "./vpx_config.h" -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/encoder/vp9_encodemb.h" diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 8dd80a5..75ed8ea 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -8,19 +8,22 @@ * be found in the AUTHORS file in the root of the source tree. */ + +#include "./vp9_rtcd.h" #include "./vpx_config.h" -#include "vp9/encoder/vp9_encodemb.h" + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_reconinter.h" -#include "vp9/encoder/vp9_quantize.h" -#include "vp9/encoder/vp9_tokenize.h" #include "vp9/common/vp9_reconintra.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/encoder/vp9_rdopt.h" #include "vp9/common/vp9_systemdependent.h" -#include "vp9_rtcd.h" -DECLARE_ALIGNED(16, extern const uint8_t, - vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); +#include "vp9/encoder/vp9_dct.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_tokenize.h" void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, @@ -38,37 +41,6 @@ void vp9_subtract_block_c(int rows, int cols, } } -static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, - int16_t *dqcoeff, uint8_t *dest, - int stride) { - if (eob <= 1) - xd->inv_txm4x4_1_add(dqcoeff, dest, stride); - else - xd->inv_txm4x4_add(dqcoeff, dest, stride); -} - -static void inverse_transform_b_8x8_add(int eob, - int16_t *dqcoeff, uint8_t *dest, - int stride) { - if (eob <= 1) - vp9_short_idct8x8_1_add(dqcoeff, dest, stride); - else if (eob <= 10) - vp9_short_idct10_8x8_add(dqcoeff, dest, stride); - else - vp9_short_idct8x8_add(dqcoeff, dest, stride); -} - -static void inverse_transform_b_16x16_add(int eob, - int16_t *dqcoeff, uint8_t *dest, - int stride) { - if (eob <= 1) - vp9_short_idct16x16_1_add(dqcoeff, dest, stride); - else if (eob <= 10) - vp9_short_idct10_16x16_add(dqcoeff, dest, stride); - else - vp9_short_idct16x16_add(dqcoeff, dest, stride); -} - static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; const MACROBLOCKD *const xd = &x->e_mbd; @@ -97,8 +69,7 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { vp9_subtract_sbuv(x, bsize); } - -#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF ) +#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF) typedef struct vp9_token_state vp9_token_state; struct vp9_token_state { @@ -109,7 +80,7 @@ struct vp9_token_state { short qc; }; -// TODO: experiments to find optimal multiple numbers +// TODO(jimbankoski): experiment to find optimal RD numbers. 
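[Editor's sketch] The deleted inverse_transform_b_{8x8,16x16}_add() helpers above encode a useful fact: the cheapest inverse transform depends on the end-of-block position. The new vp9_idct*_add() entry points take eob directly and presumably keep the same internal shortcuts, so callers shrink to one line:

    /* eob <= 1  -> DC-only reconstruction
     * eob <= 10 -> low-frequency (top-left corner) fast path
     * else      -> full inverse transform
     * (thresholds as in the removed helpers above) */
    vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);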
#define Y1_RD_MULT 4 #define UV_RD_MULT 2 @@ -147,7 +118,7 @@ static void optimize_b(MACROBLOCK *mb, TX_SIZE tx_size) { MACROBLOCKD *const xd = &mb->e_mbd; struct macroblockd_plane *pd = &xd->plane[plane]; - const int ref = is_inter_block(&xd->this_mi->mbmi); + const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); vp9_token_state tokens[1025][2]; unsigned best_index[1025][2]; const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block); @@ -161,40 +132,18 @@ static void optimize_b(MACROBLOCK *mb, int best, band, pt; PLANE_TYPE type = pd->plane_type; int err_mult = plane_rd_mult[type]; - int default_eob; + const int default_eob = 16 << (tx_size << 1); const int16_t *scan, *nb; const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block); const int16_t *dequant_ptr = pd->dequant; - const uint8_t * band_translate; + const uint8_t *const band_translate = get_band_translate(tx_size); assert((!type && !plane) || (type && plane)); dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); - switch (tx_size) { - default: - case TX_4X4: - default_eob = 16; - scan = get_scan_4x4(get_tx_type_4x4(type, xd, ib)); - band_translate = vp9_coefband_trans_4x4; - break; - case TX_8X8: - scan = get_scan_8x8(get_tx_type_8x8(type, xd)); - default_eob = 64; - band_translate = vp9_coefband_trans_8x8plus; - break; - case TX_16X16: - scan = get_scan_16x16(get_tx_type_16x16(type, xd)); - default_eob = 256; - band_translate = vp9_coefband_trans_8x8plus; - break; - case TX_32X32: - scan = vp9_default_scan_32x32; - default_eob = 1024; - band_translate = vp9_coefband_trans_8x8plus; - break; - } + get_scan(xd, tx_size, type, ib, &scan, &nb); assert(eob <= default_eob); /* Now set up a Viterbi trellis to evaluate alternative roundings. */ @@ -213,7 +162,6 @@ static void optimize_b(MACROBLOCK *mb, for (i = 0; i < eob; i++) token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ qcoeff_ptr[scan[i]]].token]; - nb = vp9_get_coef_neighbors_handle(scan); for (i = eob; i-- > i0;) { int base_bits, d2, dx; @@ -312,11 +260,10 @@ static void optimize_b(MACROBLOCK *mb, best_index[i][1] = best; /* Finally, make this the new head of the trellis. */ next = i; - } - /* There's no choice to make for a zero coefficient, so we don't - * add a new trellis node, but we do need to update the costs. - */ - else { + } else { + /* There's no choice to make for a zero coefficient, so we don't + * add a new trellis node, but we do need to update the costs. + */ band = get_coef_band(band_translate, i + 1); t0 = tokens[next][0].token; t1 = tokens[next][1].token; @@ -385,38 +332,12 @@ static void optimize_init_b(int plane, BLOCK_SIZE bsize, const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi) : mbmi->tx_size; - int i; - switch (tx_size) { - case TX_4X4: - vpx_memcpy(args->ctx->ta[plane], pd->above_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_w); - vpx_memcpy(args->ctx->tl[plane], pd->left_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_8X8: - for (i = 0; i < num_4x4_w; i += 2) - args->ctx->ta[plane][i] = !!*(uint16_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_h; i += 2) - args->ctx->tl[plane][i] = !!*(uint16_t *)&pd->left_context[i]; - break; - case TX_16X16: - for (i = 0; i < num_4x4_w; i += 4) - args->ctx->ta[plane][i] = !!*(uint32_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_h; i += 4) - args->ctx->tl[plane][i] = !!*(uint32_t *)&pd->left_context[i]; - break; - case TX_32X32: - for (i = 0; i < num_4x4_w; i += 8) - args->ctx->ta[plane][i] = !!*(uint64_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_h; i += 8) - args->ctx->tl[plane][i] = !!*(uint64_t *)&pd->left_context[i]; - break; - default: - assert(0); - } + vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane], + pd->above_context, pd->left_context, + num_4x4_w, num_4x4_h); } void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, @@ -445,9 +366,9 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, yoff = 32 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; if (x->use_lp32x32fdct) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); + vp9_fdct32x32_rd(src_diff, coeff, bw * 4); else - vp9_short_fdct32x32(src_diff, coeff, bw * 8); + vp9_fdct32x32(src_diff, coeff, bw * 4); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -459,7 +380,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 16 * (block & twmask); yoff = 16 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; - x->fwd_txm16x16(src_diff, coeff, bw * 8); + vp9_fdct16x16(src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -471,7 +392,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 8 * (block & twmask); yoff = 8 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; - x->fwd_txm8x8(src_diff, coeff, bw * 8); + vp9_fdct8x8(src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -482,7 +403,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, xoff = 4 * (block & twmask); yoff = 4 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; - x->fwd_txm4x4(src_diff, coeff, bw * 8); + x->fwd_txm4x4(src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -497,6 +418,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, struct encode_b_args *const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; + struct optimize_ctx *const ctx = args->ctx; struct macroblockd_plane *const pd = &xd->plane[plane]; const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, block); @@ -504,38 +426,68 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); 
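[Editor's sketch] Two pieces of optimize_b()/optimize_init_b() setup above got condensed. First, the per-size scan/eob switch collapses to a closed form: with TX_4X4..TX_32X32 numbered 0..3, 16 << (tx_size << 1) yields 16, 64, 256, 1024 coefficients. Second, vp9_get_entropy_contexts() takes over collapsing per-4x4 coded-ness flags into one flag per transform block, which the removed switch did with uint16/32/64 type punning. A portable sketch of that collapse, assuming it matches the removed code's semantics:

    /* One output context per tx block: nonzero iff any covered 4x4
     * context is nonzero. step = 1 << tx_size (1, 2, 4 or 8). */
    static void collapse_ctx(const ENTROPY_CONTEXT *in, ENTROPY_CONTEXT *out,
                             int num_4x4, int step) {
      int i, j;
      for (i = 0; i < num_4x4; i += step) {
        ENTROPY_CONTEXT any = 0;
        for (j = 0; j < step; ++j)
          any |= in[i + j];
        out[i] = !!any;
      }
    }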
uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, pd->dst.buf, pd->dst.stride); + + // TODO(jingning): per transformed block zero forcing only enabled for + // luma component. will integrate chroma components as well. + if (x->zcoeff_blk[tx_size][block] && plane == 0) { + int i, j; + pd->eobs[block] = 0; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + ctx->ta[plane][i] = 0; + ctx->tl[plane][j] = 0; + return; + } + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); if (x->optimize) - vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); + vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx); if (x->skip_encode || pd->eobs[block] == 0) return; switch (tx_size) { case TX_32X32: - vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); + vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); break; case TX_16X16: - inverse_transform_b_16x16_add(pd->eobs[block], dqcoeff, dst, - pd->dst.stride); + vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); break; case TX_8X8: - inverse_transform_b_8x8_add(pd->eobs[block], dqcoeff, dst, - pd->dst.stride); + vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); break; case TX_4X4: // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff, - dst, pd->dst.stride); + xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); break; default: assert(!"Invalid transform size"); } } +static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct encode_b_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, + block); + + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, + pd->dst.buf, pd->dst.stride); + + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); + + if (pd->eobs[block] == 0) + return; + + xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); +} + void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; @@ -545,7 +497,7 @@ void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { if (x->optimize) optimize_init_b(0, bsize, &arg); - foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg); + foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, &arg); } void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { @@ -569,7 +521,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, struct encode_b_args* const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; int16_t *coeff = BLOCK_OFFSET(p->coeff, block); @@ -607,14 +559,14 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, vp9_subtract_block(32, 32, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); if (x->use_lp32x32fdct) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); + vp9_fdct32x32_rd(src_diff, coeff, bw * 4); else - vp9_short_fdct32x32(src_diff, coeff, bw * 8); + 
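[Editor's sketch] On the new zcoeff_blk early-out above: when the RD loop has already decided a luma transform block quantizes to all zeros, encode_block() can skip transform and quantization entirely. It only needs to publish eob = 0 and clear the block's above/left entropy contexts so later blocks see the same neighbour state the decoder will infer:

    pd->eobs[block] = 0;                     /* decoder sees "no coeffs" */
    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
    ctx->ta[plane][i] = 0;                   /* above ctx cleared */
    ctx->tl[plane][j] = 0;                   /* left ctx cleared */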
vp9_fdct32x32(src_diff, coeff, bw * 4); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); if (!x->skip_encode && *eob) - vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); + vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob); break; case TX_16X16: tx_type = get_tx_type_16x16(pd->plane_type, xd); @@ -631,19 +583,12 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, pd->dst.stride, dst, pd->dst.stride); vp9_subtract_block(16, 16, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); - if (tx_type != DCT_DCT) - vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type); - else - x->fwd_txm16x16(src_diff, coeff, bw * 8); + vp9_fht16x16(tx_type, src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); - if (!x->skip_encode && *eob) { - if (tx_type == DCT_DCT) - inverse_transform_b_16x16_add(*eob, dqcoeff, dst, pd->dst.stride); - else - vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type); - } + if (!x->skip_encode && *eob) + vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); break; case TX_8X8: tx_type = get_tx_type_8x8(pd->plane_type, xd); @@ -660,26 +605,19 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, pd->dst.stride, dst, pd->dst.stride); vp9_subtract_block(8, 8, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); - if (tx_type != DCT_DCT) - vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type); - else - x->fwd_txm8x8(src_diff, coeff, bw * 8); + vp9_fht8x8(tx_type, src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); - if (!x->skip_encode && *eob) { - if (tx_type == DCT_DCT) - inverse_transform_b_8x8_add(*eob, dqcoeff, dst, pd->dst.stride); - else - vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type); - } + if (!x->skip_encode && *eob) + vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); break; case TX_4X4: tx_type = get_tx_type_4x4(pd->plane_type, xd, block); scan = get_scan_4x4(tx_type); iscan = get_iscan_4x4(tx_type); if (mbmi->sb_type < BLOCK_8X8 && plane == 0) - mode = xd->this_mi->bmi[block].as_mode; + mode = xd->mi_8x8[0]->bmi[block].as_mode; else mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; @@ -695,7 +633,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, if (tx_type != DCT_DCT) vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type); else - x->fwd_txm4x4(src_diff, coeff, bw * 8); + x->fwd_txm4x4(src_diff, coeff, bw * 4); vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -704,9 +642,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
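[Editor's sketch] The intra path above used to branch on tx_type, routing DCT_DCT through the fixed-point fdct and everything else through the hybrid transform. The new vp9_fht*() / vp9_iht*() entry points take tx_type as an argument and are assumed to fold the DCT_DCT case internally, so each size collapses to:

    vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);   /* DCT or ADST per axis */
    /* ... quantize ... */
    if (!x->skip_encode && *eob)
      vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);

The accompanying bw * 8 -> bw * 4 changes are consistent with the old short_fdct API taking a byte pitch (2 bytes per int16 element) while the new one takes an element stride; that reading is an inference from the patch, not stated in it.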
- inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride); + xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob); else - vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type); + vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type); } break; default: diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h index 54e69fd..61dd735 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.h +++ b/libvpx/vp9/encoder/vp9_encodemb.h @@ -16,32 +16,17 @@ #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/common/vp9_onyxc_int.h" -typedef enum { - RD_DC_PRED = DC_PRED, - RD_V_PRED = V_PRED, - RD_H_PRED = H_PRED, - RD_D45_PRED = D45_PRED, - RD_D135_PRED = D135_PRED, - RD_D117_PRED = D117_PRED, - RD_D153_PRED = D153_PRED, - RD_D207_PRED = D207_PRED, - RD_D63_PRED = D63_PRED, - RD_TM_PRED = TM_PRED, - RD_NEARESTMV = NEARESTMV, - RD_NEARMV = NEARMV, - RD_ZEROMV = ZEROMV, - RD_NEWMV = NEWMV, - RD_I4X4_PRED, - RD_SPLITMV, - RD_MODE_COUNT -} RD_PREDICTION_MODE; - typedef struct { - RD_PREDICTION_MODE mode; + MB_PREDICTION_MODE mode; MV_REFERENCE_FRAME ref_frame; MV_REFERENCE_FRAME second_ref_frame; } MODE_DEFINITION; +typedef struct { + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME second_ref_frame; +} REF_DEFINITION; + struct optimize_ctx { ENTROPY_CONTEXT ta[MAX_MB_PLANE][16]; ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c index ed3a2bb..e2c6c4c 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libvpx/vp9/encoder/vp9_encodemv.c @@ -8,13 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <math.h> #include "vp9/common/vp9_common.h" -#include "vp9/encoder/vp9_encodemv.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_systemdependent.h" +#include "vp9/encoder/vp9_encodemv.h" -#include <math.h> #ifdef ENTROPY_STATS extern unsigned int active_section; @@ -124,8 +124,9 @@ static void build_nmv_component_cost_table(int *mvcost, } } -static int update_mv(vp9_writer *w, const unsigned int ct[2], - vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) { +static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p, + vp9_prob upd_p) { + const vp9_prob new_p = get_binary_prob(ct[0], ct[1]); vp9_prob mod_p = new_p | 1; const int cur_b = cost_branch256(ct, *cur_p); const int mod_b = cost_branch256(ct, mod_p); @@ -143,7 +144,6 @@ static int update_mv(vp9_writer *w, const unsigned int ct[2], static void counts_to_nmv_context( nmv_context_counts *nmv_count, - nmv_context *prob, int usehp, unsigned int (*branch_ct_joint)[2], unsigned int (*branch_ct_sign)[2], @@ -156,29 +156,24 @@ static void counts_to_nmv_context( unsigned int (*branch_ct_hp)[2]) { int i, j, k; vp9_tree_probs_from_distribution(vp9_mv_joint_tree, - prob->joints, branch_ct_joint, nmv_count->joints, 0); for (i = 0; i < 2; ++i) { const uint32_t s0 = nmv_count->comps[i].sign[0]; const uint32_t s1 = nmv_count->comps[i].sign[1]; - prob->comps[i].sign = get_binary_prob(s0, s1); branch_ct_sign[i][0] = s0; branch_ct_sign[i][1] = s1; vp9_tree_probs_from_distribution(vp9_mv_class_tree, - prob->comps[i].classes, - branch_ct_classes[i], - nmv_count->comps[i].classes, 0); + branch_ct_classes[i], + nmv_count->comps[i].classes, 0); vp9_tree_probs_from_distribution(vp9_mv_class0_tree, - prob->comps[i].class0, branch_ct_class0[i], nmv_count->comps[i].class0, 0); for (j = 0; j < MV_OFFSET_BITS; ++j) { const uint32_t b0 = nmv_count->comps[i].bits[j][0]; const uint32_t b1 = 
nmv_count->comps[i].bits[j][1]; - prob->comps[i].bits[j] = get_binary_prob(b0, b1); branch_ct_bits[i][j][0] = b0; branch_ct_bits[i][j][1] = b1; } @@ -186,12 +181,10 @@ static void counts_to_nmv_context( for (i = 0; i < 2; ++i) { for (k = 0; k < CLASS0_SIZE; ++k) { vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - prob->comps[i].class0_fp[k], branch_ct_class0_fp[i][k], nmv_count->comps[i].class0_fp[k], 0); } vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - prob->comps[i].fp, branch_ct_fp[i], nmv_count->comps[i].fp, 0); } @@ -202,11 +195,9 @@ static void counts_to_nmv_context( const uint32_t hp0 = nmv_count->comps[i].hp[0]; const uint32_t hp1 = nmv_count->comps[i].hp[1]; - prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1); branch_ct_class0_hp[i][0] = c0_hp0; branch_ct_class0_hp[i][1] = c0_hp1; - prob->comps[i].hp = get_binary_prob(hp0, hp1); branch_ct_hp[i][0] = hp0; branch_ct_hp[i][1] = hp1; } @@ -215,7 +206,6 @@ static void counts_to_nmv_context( void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { int i, j; - nmv_context prob; unsigned int branch_ct_joint[MV_JOINTS - 1][2]; unsigned int branch_ct_sign[2][2]; unsigned int branch_ct_classes[2][MV_CLASSES - 1][2]; @@ -227,30 +217,28 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { unsigned int branch_ct_hp[2][2]; nmv_context *mvc = &cpi->common.fc.nmvc; - counts_to_nmv_context(&cpi->NMVcount, &prob, usehp, + counts_to_nmv_context(&cpi->NMVcount, usehp, branch_ct_joint, branch_ct_sign, branch_ct_classes, branch_ct_class0, branch_ct_bits, branch_ct_class0_fp, branch_ct_fp, branch_ct_class0_hp, branch_ct_hp); for (j = 0; j < MV_JOINTS - 1; ++j) - update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j], - NMV_UPDATE_PROB); + update_mv(bc, branch_ct_joint[j], &mvc->joints[j], NMV_UPDATE_PROB); for (i = 0; i < 2; ++i) { - update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, - prob.comps[i].sign, NMV_UPDATE_PROB); + update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, NMV_UPDATE_PROB); for (j = 0; j < MV_CLASSES - 1; ++j) update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j], - prob.comps[i].classes[j], NMV_UPDATE_PROB); + NMV_UPDATE_PROB); for (j = 0; j < CLASS0_SIZE - 1; ++j) update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j], - prob.comps[i].class0[j], NMV_UPDATE_PROB); + NMV_UPDATE_PROB); for (j = 0; j < MV_OFFSET_BITS; ++j) update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j], - prob.comps[i].bits[j], NMV_UPDATE_PROB); + NMV_UPDATE_PROB); } for (i = 0; i < 2; ++i) { @@ -258,21 +246,19 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { int k; for (k = 0; k < 3; ++k) update_mv(bc, branch_ct_class0_fp[i][j][k], - &mvc->comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB); + &mvc->comps[i].class0_fp[j][k], NMV_UPDATE_PROB); } for (j = 0; j < 3; ++j) - update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], - prob.comps[i].fp[j], NMV_UPDATE_PROB); + update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], NMV_UPDATE_PROB); } if (usehp) { for (i = 0; i < 2; ++i) { update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp, - prob.comps[i].class0_hp, NMV_UPDATE_PROB); + NMV_UPDATE_PROB); update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp, - prob.comps[i].hp, NMV_UPDATE_PROB); + NMV_UPDATE_PROB); } } } @@ -314,44 +300,34 @@ void vp9_build_nmv_cost_table(int *mvjoint, build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp); } -void vp9_update_nmv_count(VP9_COMP *cpi, 
MACROBLOCK *x, - int_mv *best_ref_mv, int_mv *second_best_ref_mv) { +static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound, + nmv_context_counts *counts) { + int i; + for (i = 0; i < 1 + is_compound; ++i) { + const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row, + mv[i].as_mv.col - ref[i].as_mv.col }; + vp9_inc_mv(&diff, counts); + } +} + +void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) { MODE_INFO *mi = x->e_mbd.mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; - MV diff; - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; - int idx, idy; + const int is_compound = has_second_ref(mbmi); if (mbmi->sb_type < BLOCK_8X8) { - PARTITION_INFO *pi = x->partition_info; - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type]; + int idx, idy; + + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { const int i = idy * 2 + idx; - if (pi->bmi[i].mode == NEWMV) { - diff.row = mi->bmi[i].as_mv[0].as_mv.row - best_ref_mv->as_mv.row; - diff.col = mi->bmi[i].as_mv[0].as_mv.col - best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - - if (mi->mbmi.ref_frame[1] > INTRA_FRAME) { - diff.row = mi->bmi[i].as_mv[1].as_mv.row - - second_best_ref_mv->as_mv.row; - diff.col = mi->bmi[i].as_mv[1].as_mv.col - - second_best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - } - } + if (mi->bmi[i].as_mode == NEWMV) + inc_mvs(mi->bmi[i].as_mv, best_ref_mv, is_compound, &cpi->NMVcount); } } } else if (mbmi->mode == NEWMV) { - diff.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row; - diff.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - - if (mbmi->ref_frame[1] > INTRA_FRAME) { - diff.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row; - diff.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col; - vp9_inc_mv(&diff, &cpi->NMVcount); - } + inc_mvs(mbmi->mv, best_ref_mv, is_compound, &cpi->NMVcount); } } diff --git a/libvpx/vp9/encoder/vp9_encodemv.h b/libvpx/vp9/encoder/vp9_encodemv.h index 2789ce1..6331778 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.h +++ b/libvpx/vp9/encoder/vp9_encodemv.h @@ -25,7 +25,7 @@ void vp9_build_nmv_cost_table(int *mvjoint, int usehp, int mvc_flag_v, int mvc_flag_h); -void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, - int_mv *best_ref_mv, int_mv *second_best_ref_mv); + +void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]); #endif // VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index 9cf7b83..6a3555d 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -8,8 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. 
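[Editor's sketch] Two related simplifications land in vp9_encodemv.c above: counts_to_nmv_context() no longer precomputes probabilities, since update_mv() now derives the candidate probability on the spot from the branch counts, and the NEWMV accounting is factored into inc_mvs(). A sketch of both halves; binary_prob() is written from memory of vp9_common's get_binary_prob and its clamping details may differ:

    /* Candidate branch probability from counts, in 1/256 units. */
    static vp9_prob binary_prob(unsigned int c0, unsigned int c1) {
      const unsigned int den = c0 + c1;
      if (den == 0) return 128;                        /* no data: neutral */
      return (vp9_prob)clamp((int)((255 * c0 + (den >> 1)) / den), 1, 255);
    }

    /* Per NEWMV block: one residual per reference in a compound pair;
     * vp9_inc_mv classifies it into the joint/sign/class count tables. */
    for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
      const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row,
                        mv[i].as_mv.col - ref[i].as_mv.col };
      vp9_inc_mv(&diff, &cpi->NMVcount);
    }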
*/ -#include "math.h" -#include "limits.h" +#include <math.h> +#include <limits.h> +#include <stdio.h> #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_variance.h" @@ -23,13 +24,13 @@ #include "vp9/common/vp9_systemdependent.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/yv12config.h" -#include <stdio.h> #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_vaq.h" #include "./vpx_scale_rtcd.h" // TODO(jkoleszar): for setup_dst_planes #include "vp9/common/vp9_reconinter.h" @@ -77,7 +78,8 @@ static int select_cq_level(int qindex) { } -// Resets the first pass file to the given position using a relative seek from the current position +// Resets the first pass file to the given position using a relative seek from +// the current position. static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) { cpi->twopass.stats_in = position; } @@ -250,8 +252,10 @@ static void avg_stats(FIRSTPASS_STATS *section) { section->duration /= section->count; } -// Calculate a modified Error used in distributing bits between easier and harder frames -static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { +// Calculate a modified Error used in distributing bits between easier and +// harder frames. +static double calculate_modified_err(VP9_COMP *cpi, + FIRSTPASS_STATS *this_frame) { const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats; const double av_err = stats->ssim_weighted_pred_err / stats->count; const double this_err = this_frame->ssim_weighted_pred_err; @@ -260,38 +264,43 @@ static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) } static const double weight_table[256] = { - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750, - 0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, - 0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750, - 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 
1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000 + 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, + 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, + 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, + 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, + 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.031250, 0.062500, + 0.093750, 0.125000, 0.156250, 0.187500, 0.218750, 0.250000, 0.281250, + 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, 0.500000, + 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750, + 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, + 0.968750, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000, 
1.000000, 1.000000, 1.000000, + 1.000000, 1.000000, 1.000000, 1.000000 }; static double simple_weight(YV12_BUFFER_CONFIG *source) { @@ -300,7 +309,8 @@ static double simple_weight(YV12_BUFFER_CONFIG *source) { uint8_t *src = source->y_buffer; double sum_weights = 0.0; - // Loop throught the Y plane raw examining levels and creating a weight for the image + // Loop through the Y plane examining levels and creating a weight for + // the image. i = source->y_height; do { j = source->y_width; @@ -340,13 +350,15 @@ void vp9_end_first_pass(VP9_COMP *cpi) { output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats); } -static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) { +static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + YV12_BUFFER_CONFIG *recon_buffer, + int *best_motion_err, int recon_yoffset) { MACROBLOCKD *const xd = &x->e_mbd; // Set up pointers for this macro block recon buffer xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset; - switch (xd->this_mi->mbmi.sb_type) { + switch (xd->mi_8x8[0]->mbmi.sb_type) { case BLOCK_8X8: vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, @@ -385,7 +397,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; int n; vp9_variance_fn_ptr_t v_fn_ptr = - cpi->fn_ptr[xd->this_mi->mbmi.sb_type]; + cpi->fn_ptr[xd->mi_8x8[0]->mbmi.sb_type]; int new_mv_mode_penalty = 256; int sr = 0; @@ -402,7 +414,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, further_steps -= sr; // override the default variance function to use MSE - switch (xd->this_mi->mbmi.sb_type) { + switch (xd->mi_8x8[0]->mbmi.sb_type) { case BLOCK_8X8: v_fn_ptr.vf = vp9_mse8x8; break; @@ -444,9 +456,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, while (n < further_steps) { n++; - if (num00) + if (num00) { num00--; - else { + } else { tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param + n, x->sadperbit16, &num00, &v_fn_ptr, @@ -469,13 +481,14 @@ void vp9_first_pass(VP9_COMP *cpi) { MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + TileInfo tile; int recon_yoffset, recon_uvoffset; const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx]; const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx]; YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx]; - YV12_BUFFER_CONFIG *const new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx]; + YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); const int recon_y_stride = lst_yv12->y_stride; const int recon_uv_stride = lst_yv12->uv_stride; int64_t intra_error = 0; @@ -504,12 +517,9 @@ void vp9_first_pass(VP9_COMP *cpi) { setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL); setup_dst_planes(xd, new_yv12, 0, 0); - x->partition_info = x->pi; xd->mi_8x8 = cm->mi_grid_visible; // required for vp9_frame_init_quantizer - xd->this_mi = xd->mi_8x8[0] = cm->mi; - xd->mic_stream_ptr = cm->mi; setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -520,9 +530,12 @@ void vp9_first_pass(VP9_COMP *cpi) { // if ( 0 ) { vp9_init_mv_probs(cm); - vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q); + vp9_initialize_rd_consts(cpi); } + // tiling is ignored in the first pass + vp9_tile_init(&tile, cm, 0, 0); + // for each 
macroblock row in image for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { int_mv best_ref_mv; @@ -534,16 +547,21 @@ void vp9_first_pass(VP9_COMP *cpi) { recon_yoffset = (mb_row * recon_y_stride * 16); recon_uvoffset = (mb_row * recon_uv_stride * 8); - // Set up limit values for motion vectors to prevent them extending outside the UMV borders - x->mv_row_min = -((mb_row * 16) + (VP9BORDERINPIXELS - 8)); + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders + x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) - + (VP9BORDERINPIXELS - 8); + + BORDER_MV_PIXELS_B16; // for each macroblock col in image for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { int this_error; int gf_motion_error = INT_MAX; int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + double error_weight; + + vp9_clear_system_state(); // __asm emms; + error_weight = 1.0; // avoid uninitialized warnings xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; @@ -552,40 +570,54 @@ void vp9_first_pass(VP9_COMP *cpi) { if (mb_col * 2 + 1 < cm->mi_cols) { if (mb_row * 2 + 1 < cm->mi_rows) { - xd->this_mi->mbmi.sb_type = BLOCK_16X16; + xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X16; } else { - xd->this_mi->mbmi.sb_type = BLOCK_16X8; + xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X8; } } else { if (mb_row * 2 + 1 < cm->mi_rows) { - xd->this_mi->mbmi.sb_type = BLOCK_8X16; + xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X16; } else { - xd->this_mi->mbmi.sb_type = BLOCK_8X8; + xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X8; } } - xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME; - set_mi_row_col(cm, xd, + xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, &tile, mb_row << 1, - 1 << mi_height_log2(xd->this_mi->mbmi.sb_type), + num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type], mb_col << 1, - 1 << mi_height_log2(xd->this_mi->mbmi.sb_type)); + num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type], + cm->mi_rows, cm->mi_cols); + + if (cpi->sf.variance_adaptive_quantization) { + int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type); + error_weight = vp9_vaq_inv_q_ratio(energy); + } // do intra 16x16 prediction this_error = vp9_encode_intra(x, use_dc_pred); + if (cpi->sf.variance_adaptive_quantization) { + vp9_clear_system_state(); // __asm emms; + this_error *= error_weight; + } - // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame) - // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv. - // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames. + // intrapenalty below deals with situations where the intra and inter + // error scores are very low (eg a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. // This penalty adds a cost matching that of a 0,0 mv to the intra case. 
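[Editor's sketch] The variance_adaptive_quantization hooks above are new in this drop (vp9_vaq.h is now included): first-pass intra and inter error scores are scaled by a weight derived from the block's energy before being accumulated, so blocks the final encode will quantize coarsely do not dominate the first-pass statistics. The pattern repeated after each error computation, condensed and using only names from the hunk:

    if (cpi->sf.variance_adaptive_quantization) {
      const int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type);
      vp9_clear_system_state();   /* __asm emms before float math */
      error_weight = vp9_vaq_inv_q_ratio(energy);
      this_error = (int)(this_error * error_weight);
    }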
this_error += intrapenalty; // Cumulative intra error total intra_error += (int64_t)this_error; - // Set up limit values for motion vectors to prevent them extending outside the UMV borders - x->mv_col_min = -((mb_col * 16) + (VP9BORDERINPIXELS - 8)); + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) - + (VP9BORDERINPIXELS - 8); + + BORDER_MV_PIXELS_B16; // Other than for the first frame do a motion search if (cm->current_video_frame > 0) { @@ -602,12 +634,21 @@ void vp9_first_pass(VP9_COMP *cpi) { first_pass_motion_search(cpi, x, &best_ref_mv, &mv.as_mv, lst_yv12, &motion_error, recon_yoffset); + if (cpi->sf.variance_adaptive_quantization) { + vp9_clear_system_state(); // __asm emms; + motion_error *= error_weight; + } - // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well + // If the current best reference mv is not centered on 0,0 then do a 0,0 + // based search as well. if (best_ref_mv.as_int) { tmp_err = INT_MAX; first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv, lst_yv12, &tmp_err, recon_yoffset); + if (cpi->sf.variance_adaptive_quantization) { + vp9_clear_system_state(); // __asm emms; + tmp_err *= error_weight; + } if (tmp_err < motion_error) { motion_error = tmp_err; @@ -624,6 +665,10 @@ void vp9_first_pass(VP9_COMP *cpi) { first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv, gld_yv12, &gf_motion_error, recon_yoffset); + if (cpi->sf.variance_adaptive_quantization) { + vp9_clear_system_state(); // __asm emms; + gf_motion_error *= error_weight; + } if ((gf_motion_error < motion_error) && (gf_motion_error < this_error)) { @@ -643,9 +688,9 @@ void vp9_first_pass(VP9_COMP *cpi) { sr_coded_error += gf_motion_error; else sr_coded_error += this_error; - } else + } else { sr_coded_error += motion_error; - + } /* Intra assumed best */ best_ref_mv.as_int = 0; @@ -660,17 +705,17 @@ void vp9_first_pass(VP9_COMP *cpi) { neutral_count++; } - mv.as_mv.row <<= 3; - mv.as_mv.col <<= 3; + mv.as_mv.row *= 8; + mv.as_mv.col *= 8; this_error = motion_error; vp9_set_mbmode_and_mvs(x, NEWMV, &mv); - xd->this_mi->mbmi.tx_size = TX_4X4; - xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME; - xd->this_mi->mbmi.ref_frame[1] = NONE; + xd->mi_8x8[0]->mbmi.tx_size = TX_4X4; + xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME; + xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE; vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, - xd->this_mi->mbmi.sb_type); - vp9_encode_sby(x, xd->this_mi->mbmi.sb_type); + xd->mi_8x8[0]->mbmi.sb_type); + vp9_encode_sby(x, xd->mi_8x8[0]->mbmi.sb_type); sum_mvr += mv.as_mv.row; sum_mvr_abs += abs(mv.as_mv.row); sum_mvc += mv.as_mv.col; @@ -717,9 +762,9 @@ void vp9_first_pass(VP9_COMP *cpi) { } } } - } else + } else { sr_coded_error += (int64_t)this_error; - + } coded_error += (int64_t)this_error; // adjust to the next column of macroblocks @@ -778,16 +823,19 @@ void vp9_first_pass(VP9_COMP *cpi) { fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount; fps.MVc = (double)sum_mvc / (double)mvcount; fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount; - fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount; - fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount; + fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / + (double)mvcount; + fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / 
(double)mvcount)) / + (double)mvcount; fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2); fps.new_mv_count = new_mv_count; fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs; } - // TODO: handle the case when duration is set to 0, or something less - // than the full time between subsequent values of cpi->source_time_stamp. + // TODO(paulwilkins): Handle the case when duration is set to 0, or + // something less than the full time between subsequent values of + // cpi->source_time_stamp. fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start); @@ -807,15 +855,16 @@ void vp9_first_pass(VP9_COMP *cpi) { 2.0))) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); cpi->twopass.sr_update_lag = 1; - } else + } else { cpi->twopass.sr_update_lag++; - + } // swap frame pointers so last frame refers to the frame we just compressed swap_yv12(lst_yv12, new_yv12); vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y); - // Special case for the first frame. Copy into the GF buffer as a second reference. + // Special case for the first frame. Copy into the GF buffer as a second + // reference. if (cm->current_video_frame == 0) vp8_yv12_copy_frame(lst_yv12, gld_yv12); @@ -823,7 +872,8 @@ void vp9_first_pass(VP9_COMP *cpi) { if (0) { char filename[512]; FILE *recon_file; - sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame); + snprintf(filename, sizeof(filename), "enc%04d.yuv", + (int)cm->current_video_frame); if (cm->current_video_frame == 0) recon_file = fopen(filename, "wb"); @@ -835,7 +885,6 @@ void vp9_first_pass(VP9_COMP *cpi) { } cm->current_video_frame++; - } // Estimate a cost per mb attributable to overheads such as the coding of @@ -878,7 +927,7 @@ static int64_t estimate_modemvcost(VP9_COMP *cpi, (av_intra * intra_cost)) * cpi->common.MBs) << 9; // return mv_cost + mode_cost; - // TODO PGW Fix overhead costs for extended Q range + // TODO(paulwilkins): Fix overhead costs for extended Q range. #endif return 0; } @@ -1102,8 +1151,8 @@ void vp9_init_second_pass(VP9_COMP *cpi) { FIRSTPASS_STATS *start_pos; double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate; - double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth - * cpi->oxcf.two_pass_vbrmin_section / 100); + double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * + cpi->oxcf.two_pass_vbrmin_section / 100); if (two_pass_min_rate < lower_bounds_min_rate) two_pass_min_rate = lower_bounds_min_rate; @@ -1141,15 +1190,17 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // This variable monitors how far behind the second ref update is lagging cpi->twopass.sr_update_lag = 1; - // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence + // Scan the first pass file and calculate an average Intra / Inter error score + // ratio for the sequence. { double sum_iiratio = 0.0; double IIRatio; - start_pos = cpi->twopass.stats_in; // Note starting "file" position + start_pos = cpi->twopass.stats_in; // Note the starting "file" position. while (input_stats(cpi, &this_frame) != EOF) { - IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error); + IIRatio = this_frame.intra_error + / DOUBLE_DIVIDE_CHECK(this_frame.coded_error); IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 
20.0 : IIRatio; sum_iiratio += IIRatio; } @@ -1161,21 +1212,21 @@ void vp9_init_second_pass(VP9_COMP *cpi) { reset_fpf_position(cpi, start_pos); } - // Scan the first pass file and calculate a modified total error based upon the bias/power function - // used to allocate bits + // Scan the first pass file and calculate a modified total error based upon + // the bias/power function used to allocate bits. { - start_pos = cpi->twopass.stats_in; // Note starting "file" position + start_pos = cpi->twopass.stats_in; // Note starting "file" position cpi->twopass.modified_error_total = 0.0; cpi->twopass.modified_error_used = 0.0; while (input_stats(cpi, &this_frame) != EOF) { - cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame); + cpi->twopass.modified_error_total += + calculate_modified_err(cpi, &this_frame); } cpi->twopass.modified_error_left = cpi->twopass.modified_error_total; - reset_fpf_position(cpi, start_pos); // Reset file position - + reset_fpf_position(cpi, start_pos); // Reset file position } } @@ -1321,7 +1372,6 @@ static void accumulate_frame_motion_stats( (this_frame_mvc_ratio < this_frame->mvc_abs) ? (this_frame_mvc_ratio * motion_pct) : this_frame->mvc_abs * motion_pct; - } } @@ -1380,7 +1430,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, // Update the motion related elements to the boost calculation accumulate_frame_motion_stats(&this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, - &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + &abs_mv_in_out_accumulator, + &mv_ratio_accumulator); // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. @@ -1416,7 +1467,8 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, // Update the motion related elements to the boost calculation accumulate_frame_motion_stats(&this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, - &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + &abs_mv_in_out_accumulator, + &mv_ratio_accumulator); // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. 
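The two calc_arf_boost() passes around this point share one accumulation pattern: each frame contributes its boost scaled by a decay accumulator that tracks how prediction quality falls off with distance from the ARF. A minimal standalone sketch of that pattern follows; all names are illustrative, not the libvpx implementation.

    #include <stddef.h>

    /* Decayed boost accumulation, in the spirit of the
     * "boost_score += decay_accumulator * calc_frame_boost(...)"
     * lines in the hunks nearby. Hypothetical helper. */
    static double accumulate_boost(const double *frame_boost,
                                   const double *decay_rate, size_t n) {
      double decay_accumulator = 1.0;
      double boost_score = 0.0;
      size_t i;
      for (i = 0; i < n; ++i) {
        boost_score += decay_accumulator * frame_boost[i];
        /* Prediction quality decays frame by frame, so later frames
         * contribute less to the total boost. */
        decay_accumulator *= decay_rate[i];
      }
      return boost_score;
    }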
@@ -1432,7 +1484,6 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, boost_score += (decay_accumulator * calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out)); - } *b_boost = (int)boost_score; @@ -1666,7 +1717,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Update the motion related elements to the boost calculation accumulate_frame_motion_stats(&next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, - &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + &abs_mv_in_out_accumulator, + &mv_ratio_accumulator); // Cumulative effect of prediction quality decay if (!flash_detected) { @@ -1709,8 +1761,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || (abs_mv_in_out_accumulator > 3.0) || (mv_in_out_accumulator < -2.0) || - ((boost_score - old_boost_score) < IIFACTOR)) - )) { + ((boost_score - old_boost_score) < IIFACTOR)))) { boost_score = old_boost_score; break; } @@ -1764,7 +1815,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { (mv_in_out_accumulator > -2.0)) && (boost_score > 100)) { // Alternative boost calculation for alt ref - cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); + cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, + &b_boost); cpi->source_alt_ref_pending = 1; #if CONFIG_MULTIPLE_ARF @@ -1841,9 +1893,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->twopass.gf_group_bits = (int64_t)(cpi->twopass.kf_group_bits * (gf_group_err / cpi->twopass.kf_group_error_left)); - } else + } else { cpi->twopass.gf_group_bits = 0; - + } cpi->twopass.gf_group_bits = (cpi->twopass.gf_group_bits < 0) ? 0 @@ -1907,11 +1959,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (gf_bits > alt_gf_bits) gf_bits = alt_gf_bits; - } - // Else if it is harder than other frames in the group make sure it at - // least receives an allocation in keeping with its relative error - // score, otherwise it may be worse off than an "un-boosted" frame - else { + } else { + // If it is harder than other frames in the group make sure it at + // least receives an allocation in keeping with its relative error + // score, otherwise it may be worse off than an "un-boosted" frame. int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits * mod_frame_err / DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left)); @@ -2023,9 +2074,9 @@ static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at // the top end. - if (target_frame_size < 0) + if (target_frame_size < 0) { target_frame_size = 0; - else { + } else { if (target_frame_size > max_bits) target_frame_size = max_bits; @@ -2093,14 +2144,19 @@ void vp9_second_pass(VP9_COMP *cpi) { cpi->twopass.est_max_qcorrection_factor = 1.0; // Set a cq_level in constrained quality mode. + // Commenting this code out for now since it does not seem to be + // working well. 
+ /* if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats, - section_target_bandwidth); + section_target_bandwidth); - cpi->cq_target_quality = cpi->oxcf.cq_level; if (est_cq > cpi->cq_target_quality) cpi->cq_target_quality = est_cq; + else + cpi->cq_target_quality = cpi->oxcf.cq_level; } + */ // guess at maxq needed in 2nd pass cpi->twopass.maxq_max_limit = cpi->worst_quality; @@ -2113,17 +2169,14 @@ void vp9_second_pass(VP9_COMP *cpi) { cpi->ni_av_qi = tmp_q; cpi->avg_q = vp9_convert_qindex_to_q(tmp_q); -#ifndef ONE_SHOT_Q_ESTIMATE // Limit the maxq value returned subsequently. // This increases the risk of overspend or underspend if the initial // estimate for the clip is bad, but helps prevent excessive // variation in Q, especially near the end of a clip // where for example a small overspend may cause Q to crash adjust_maxq_qrange(cpi); -#endif } -#ifndef ONE_SHOT_Q_ESTIMATE // The last few frames of a clip almost always have too few or too many // bits and for the sake of over-exact rate control we don't want to make // radical adjustments to the allowed quantizer range just to use up a @@ -2146,7 +2199,6 @@ void vp9_second_pass(VP9_COMP *cpi) { cpi->active_worst_quality = adjust_active_maxq(cpi->active_worst_quality, tmp_q); } -#endif } vp9_zero(this_frame); if (EOF == input_stats(cpi, &this_frame)) @@ -2243,16 +2295,17 @@ static int test_candidate_kf(VP9_COMP *cpi, if ((this_frame->pcnt_second_ref < 0.10) && (next_frame->pcnt_second_ref < 0.10) && ((this_frame->pcnt_inter < 0.05) || - ( - ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) && - ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) && - ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) || - (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) || - ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5) - ) - ) - ) - ) { + (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) && + ((this_frame->intra_error / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) && + ((fabs(last_frame->coded_error - this_frame->coded_error) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > + .40) || + (fabs(last_frame->intra_error - this_frame->intra_error) / + DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > + .40) || + ((next_frame->intra_error / + DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) { int i; FIRSTPASS_STATS *start_pos; @@ -2270,7 +2323,8 @@ static int test_candidate_kf(VP9_COMP *cpi, // Examine how well the key frame predicts subsequent frames for (i = 0; i < 16; i++) { - next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); + next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / + DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); if (next_iiratio > RMAX) next_iiratio = RMAX; @@ -2279,7 +2333,8 @@ static int test_candidate_kf(VP9_COMP *cpi, if (local_next_frame.pcnt_inter > 0.85) decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter; else - decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0); + decay_accumulator = + decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0); // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter; @@ -2307,9 +2362,9 @@ static int test_candidate_kf(VP9_COMP *cpi, // If there is tolerable prediction for 
at least the next 3 frames then // break out else discard this potential key frame and move on - if (boost_score > 30.0 && (i > 3)) + if (boost_score > 30.0 && (i > 3)) { is_viable_kf = 1; - else { + } else { // Reset the file position reset_fpf_position(cpi, start_pos); @@ -2369,8 +2424,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Accumulate kf group error kf_group_err += calculate_modified_err(cpi, this_frame); - // These figures keep intra and coded error counts for all frames including key frames in the group. - // The effect of the key frame itself can be subtracted out using the first_frame data collected above + // These figures keep intra and coded error counts for all frames including + // key frames in the group. The effect of the key frame itself can be + // subtracted out using the first_frame data collected above. kf_group_intra_err += this_frame->intra_error; kf_group_coded_err += this_frame->coded_error; @@ -2410,9 +2466,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // forcekeyframeevery intervals then break out of the loop. if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency) break; - } else + } else { cpi->twopass.frames_to_key++; - + } i++; } @@ -2452,22 +2508,24 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { reset_fpf_position(cpi, current_pos); cpi->next_key_frame_forced = 1; - } else + } else { cpi->next_key_frame_forced = 0; - + } // Special case for the last frame of the file if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) { // Accumulate kf group error kf_group_err += calculate_modified_err(cpi, this_frame); - // These figures keep intra and coded error counts for all frames including key frames in the group. - // The effect of the key frame itself can be subtracted out using the first_frame data collected above + // These figures keep intra and coded error counts for all frames including + // key frames in the group. The effect of the key frame itself can be + // subtracted out using the first_frame data collected above. kf_group_intra_err += this_frame->intra_error; kf_group_coded_err += this_frame->coded_error; } // Calculate the number of bits that should be assigned to the kf group. - if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) { + if ((cpi->twopass.bits_left > 0) && + (cpi->twopass.modified_error_left > 0.0)) { // Max for a single normal frame (not key frame) int max_bits = frame_max_bits(cpi); @@ -2484,13 +2542,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key; if (cpi->twopass.kf_group_bits > max_grp_bits) cpi->twopass.kf_group_bits = max_grp_bits; - } else + } else { cpi->twopass.kf_group_bits = 0; - + } // Reset the first pass file position reset_fpf_position(cpi, start_position); - // determine how big to make this keyframe based on how well the subsequent frames use inter blocks + // Determine how big to make this keyframe based on how well the subsequent + // frames use inter blocks. 
decay_accumulator = 1.0; boost_score = 0.0; loop_decay_rate = 1.00; // Starting decay rate @@ -2563,7 +2622,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (kf_boost < (cpi->twopass.frames_to_key * 3)) kf_boost = (cpi->twopass.frames_to_key * 3); - if (kf_boost < 300) // Min KF boost + if (kf_boost < 300) // Min KF boost kf_boost = 300; // Make a note of baseline boost and the zero motion @@ -2598,10 +2657,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { allocation_chunks /= divisor; } - cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits; + cpi->twopass.kf_group_bits = + (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits; // Calculate the number of bits to be spent on the key frame - cpi->twopass.kf_bits = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks)); + cpi->twopass.kf_bits = + (int)((double)kf_boost * + ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks)); // If the key frame is actually easier than the average for the // kf group (which does sometimes happen... eg a blank intro frame) @@ -2619,11 +2681,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (cpi->twopass.kf_bits > alt_kf_bits) { cpi->twopass.kf_bits = alt_kf_bits; } - } + } else { // Else if it is much harder than other frames in the group make sure // it at least receives an allocation in keeping with its relative // error score - else { alt_kf_bits = (int)((double)cpi->twopass.bits_left * (kf_mod_err / @@ -2649,6 +2710,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err); // Adjust the count of total modified error left. - // The count of bits left is adjusted elsewhere based on real coded frame sizes + // The count of bits left is adjusted elsewhere based on real coded frame + // sizes. 
cpi->twopass.modified_error_left -= kf_group_err; } diff --git a/libvpx/vp9/encoder/vp9_firstpass.h b/libvpx/vp9/encoder/vp9_firstpass.h index 2296a66..c18d11e 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.h +++ b/libvpx/vp9/encoder/vp9_firstpass.h @@ -10,6 +10,7 @@ #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ #define VP9_ENCODER_VP9_FIRSTPASS_H_ +#include "vp9/encoder/vp9_onyx_int.h" void vp9_init_first_pass(VP9_COMP *cpi); void vp9_first_pass(VP9_COMP *cpi); diff --git a/libvpx/vp9/encoder/vp9_lookahead.c b/libvpx/vp9/encoder/vp9_lookahead.c index 81445a9..c28c868 100644 --- a/libvpx/vp9/encoder/vp9_lookahead.c +++ b/libvpx/vp9/encoder/vp9_lookahead.c @@ -10,7 +10,7 @@ #include <assert.h> #include <stdlib.h> -#include "vpx_config.h" +#include "./vpx_config.h" #include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/common/vp9_extend.h" @@ -77,7 +77,7 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width, goto bail; } return ctx; -bail: + bail: vp9_lookahead_destroy(ctx); return NULL; } diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c index 5a671f2..7b605b2 100644 --- a/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libvpx/vp9/encoder/vp9_mbgraph.c @@ -10,14 +10,17 @@ #include <limits.h> -#include <vpx_mem/vpx_mem.h> -#include <vp9/encoder/vp9_encodeintra.h> -#include <vp9/encoder/vp9_rdopt.h> -#include <vp9/common/vp9_blockd.h> -#include <vp9/common/vp9_reconinter.h> -#include <vp9/common/vp9_reconintra.h> -#include <vp9/common/vp9_systemdependent.h> -#include <vp9/encoder/vp9_segmentation.h> +#include "vpx_mem/vpx_mem.h" +#include "vp9/encoder/vp9_encodeintra.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_systemdependent.h" + + static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, int_mv *ref_mv, @@ -46,9 +49,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, ref_full.as_mv.row = ref_mv->as_mv.row >> 3; /*cpi->sf.search_method == HEX*/ - best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit, + best_err = vp9_hex_search(x, &ref_full.as_mv, step_param, x->errorperbit, 0, &v_fn_ptr, - 0, ref_mv, dst_mv); + 0, &ref_mv->as_mv, &dst_mv->as_mv); // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) @@ -57,7 +60,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, unsigned int sse; best_err = cpi->find_fractional_mv_step( x, - dst_mv, ref_mv, + &dst_mv->as_mv, &ref_mv->as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, &v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, NULL, NULL, & distortion, &sse); @@ -100,7 +104,8 @@ static int do_16x16_motion_search(VP9_COMP *cpi, int_mv *ref_mv, int_mv *dst_mv, dst_mv->as_int = tmp_mv.as_int; } - // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well + // If the current best reference mv is not centered on 0,0 then do a 0,0 + // based search as well. 
if (ref_mv->as_int) { unsigned int tmp_err; int_mv zero_ref_mv, tmp_mv; @@ -145,7 +150,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi, for (mode = DC_PRED; mode <= TM_PRED; mode++) { unsigned int err; - xd->this_mi->mbmi.mode = mode; + xd->mi_8x8[0]->mbmi.mode = mode; vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode, x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride); @@ -189,8 +194,8 @@ static void update_mbgraph_mb_stats x->plane[0].src.buf = buf->y_buffer + mb_y_offset; x->plane[0].src.stride = buf->y_stride; - xd->plane[0].dst.buf = cm->yv12_fb[cm->new_fb_idx].y_buffer + mb_y_offset; - xd->plane[0].dst.stride = cm->yv12_fb[cm->new_fb_idx].y_stride; + xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset; + xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride; // do intra 16x16 prediction intra_error = find_best_16x16_intra(cpi, mb_y_offset, @@ -214,7 +219,8 @@ static void update_mbgraph_mb_stats stats->ref[GOLDEN_FRAME].m.mv.as_int = 0; } - // Alt-ref frame MV search, if it exists and is different than last/golden frame + // Do an Alt-ref frame MV search, if it exists and is different than + // last/golden frame. if (alt_ref) { int a_motion_error; xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset; @@ -243,17 +249,17 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, int_mv arf_top_mv, gld_top_mv; MODE_INFO mi_local = { { 0 } }; - // Set up limit values for motion vectors to prevent them extending outside the UMV borders + // Set up limit values for motion vectors to prevent them extending outside + // the UMV borders. arf_top_mv.as_int = 0; gld_top_mv.as_int = 0; - x->mv_row_min = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND); - x->mv_row_max = (cm->mb_rows - 1) * 8 + VP9BORDERINPIXELS - - 8 - VP9_INTERP_EXTEND; + x->mv_row_min = -BORDER_MV_PIXELS_B16; + x->mv_row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16; xd->up_available = 0; xd->plane[0].dst.stride = buf->y_stride; xd->plane[0].pre[0].stride = buf->y_stride; xd->plane[1].dst.stride = buf->uv_stride; - xd->this_mi = &mi_local; + xd->mi_8x8[0] = &mi_local; mi_local.mbmi.sb_type = BLOCK_16X16; mi_local.mbmi.ref_frame[0] = LAST_FRAME; mi_local.mbmi.ref_frame[1] = NONE; @@ -264,12 +270,12 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, int arf_y_in_offset = arf_y_offset; int gld_y_in_offset = gld_y_offset; - // Set up limit values for motion vectors to prevent them extending outside the UMV borders + // Set up limit values for motion vectors to prevent them extending outside + // the UMV borders. 
arf_left_mv.as_int = arf_top_mv.as_int; gld_left_mv.as_int = gld_top_mv.as_int; - x->mv_col_min = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND); - x->mv_col_max = (cm->mb_cols - 1) * 8 + VP9BORDERINPIXELS - - 8 - VP9_INTERP_EXTEND; + x->mv_col_min = -BORDER_MV_PIXELS_B16; + x->mv_col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16; xd->left_available = 0; for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { @@ -307,6 +313,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, static void separate_arf_mbs(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int mb_col, mb_row, offset, i; + int mi_row, mi_col; int ncnt[4] = { 0 }; int n_frames = cpi->mbgraph_n_frames; @@ -343,22 +350,17 @@ static void separate_arf_mbs(VP9_COMP *cpi) { } } - for (offset = 0, mb_row = 0; mb_row < cm->mb_rows; - offset += cm->mb_cols, mb_row++) { - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out + // of bound access in segmentation_map + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { // If any of the blocks in the sequence failed then the MB // goes in segment 0 - if (arf_not_zz[offset + mb_col]) { + if (arf_not_zz[mi_row/2*cm->mb_cols + mi_col/2]) { ncnt[0]++; - cpi->segmentation_map[offset * 4 + 2 * mb_col] = 0; - cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 0; - cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 0; - cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 0; + cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0; } else { - cpi->segmentation_map[offset * 4 + 2 * mb_col] = 1; - cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 1; - cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 1; - cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 1; + cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1; ncnt[1]++; } } @@ -369,7 +371,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { if (1) { // Note % of blocks that are marked as static if (cm->MBs) - cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs; + cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); // This error case should not be reachable as this function should // never be called with the common data structure uninitialized. 
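The rewritten separate_arf_mbs() loop above walks the 8x8 mode-info (MI) grid and maps each MI unit back to its 16x16 macroblock flag, so every MB flag covers a 2x2 group of MI units and the segmentation map is never indexed out of bounds. A minimal sketch of that index mapping, with hypothetical names standing in for the cpi/cm fields:

    /* Map per-macroblock (16x16) flags onto a per-mode-info (8x8) map,
     * as in the separate_arf_mbs() loop above. Names are illustrative. */
    static void mb_flags_to_mi_map(const signed char *mb_flags, /* mb grid */
                                   unsigned char *mi_map,       /* mi grid */
                                   int mb_cols, int mi_rows, int mi_cols) {
      int mi_row, mi_col;
      for (mi_row = 0; mi_row < mi_rows; ++mi_row) {
        for (mi_col = 0; mi_col < mi_cols; ++mi_col) {
          /* Each MI unit inherits the flag of the MB containing it:
           * integer division by 2 collapses the 2x2 MI group. */
          const int mb_index = (mi_row / 2) * mb_cols + (mi_col / 2);
          mi_map[mi_row * mi_cols + mi_col] = mb_flags[mb_index] ? 0 : 1;
        }
      }
    }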
@@ -406,7 +408,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) { for (i = 0; i < n_frames; i++) { MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i]; vpx_memset(frame_stats->mb_stats, 0, - cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats)); + cm->mb_rows * cm->mb_cols * + sizeof(*cpi->mbgraph_stats[i].mb_stats)); } // do motion search to find contribution of each reference to data diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c index 1360088..a52f5b1 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libvpx/vp9/encoder/vp9_mcomp.c @@ -59,38 +59,39 @@ int vp9_init_search_range(VP9_COMP *cpi, int size) { return sr; } -int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], - int weight) { - MV v; - v.row = mv->as_mv.row - ref->as_mv.row; - v.col = mv->as_mv.col - ref->as_mv.col; - return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] + - mvcost[0][v.row] + - mvcost[1][v.col]) * weight, 7); +static INLINE int mv_cost(const MV *mv, + const int *joint_cost, int *comp_cost[2]) { + return joint_cost[vp9_get_mv_joint(mv)] + + comp_cost[0][mv->row] + comp_cost[1][mv->col]; } -static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], +int vp9_mv_bit_cost(const MV *mv, const MV *ref, + const int *mvjcost, int *mvcost[2], int weight) { + const MV diff = { mv->row - ref->row, + mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7); +} + +static int mv_err_cost(const MV *mv, const MV *ref, + const int *mvjcost, int *mvcost[2], int error_per_bit) { if (mvcost) { - MV v; - v.row = mv->as_mv.row - ref->as_mv.row; - v.col = mv->as_mv.col - ref->as_mv.col; - return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] + - mvcost[0][v.row] + - mvcost[1][v.col]) * error_per_bit, 13); + const MV diff = { mv->row - ref->row, + mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * + error_per_bit, 13); } return 0; } -static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost, - int *mvsadcost[2], int error_per_bit) { +static int mvsad_err_cost(const MV *mv, const MV *ref, + const int *mvjsadcost, int *mvsadcost[2], + int error_per_bit) { if (mvsadcost) { - MV v; - v.row = mv->as_mv.row - ref->as_mv.row; - v.col = mv->as_mv.col - ref->as_mv.col; - return ROUND_POWER_OF_TWO((mvjsadcost[vp9_get_mv_joint(&v)] + - mvsadcost[0][v.row] + - mvsadcost[1][v.col]) * error_per_bit, 8); + const MV diff = { mv->row - ref->row, + mv->col - ref->col }; + return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjsadcost, mvsadcost) * + error_per_bit, 8); } return 0; } @@ -136,66 +137,26 @@ void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) { } void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { - int len; - int search_site_count = 0; + int len, ss_count = 1; - // Generate offsets for 8 search sites per step. - x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = 0; - search_site_count++; + x->ss[0].mv.col = x->ss[0].mv.row = 0; + x->ss[0].offset = 0; for (len = MAX_FIRST_STEP; len > 0; len /= 2) { - // Compute offsets for search sites. - x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = -len; - x->ss[search_site_count].offset = -len * stride; - search_site_count++; - - // Compute offsets for search sites. 
- x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = len; - x->ss[search_site_count].offset = len * stride; - search_site_count++; - - // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -len; - x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = -len; - search_site_count++; - - // Compute offsets for search sites. - x->ss[search_site_count].mv.col = len; - x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = len; - search_site_count++; - - // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -len; - x->ss[search_site_count].mv.row = -len; - x->ss[search_site_count].offset = -len * stride - len; - search_site_count++; - - // Compute offsets for search sites. - x->ss[search_site_count].mv.col = len; - x->ss[search_site_count].mv.row = -len; - x->ss[search_site_count].offset = -len * stride + len; - search_site_count++; - - // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -len; - x->ss[search_site_count].mv.row = len; - x->ss[search_site_count].offset = len * stride - len; - search_site_count++; - - // Compute offsets for search sites. - x->ss[search_site_count].mv.col = len; - x->ss[search_site_count].mv.row = len; - x->ss[search_site_count].offset = len * stride + len; - search_site_count++; + // Generate offsets for 8 search sites per step. + const MV ss_mvs[8] = { + {-len, 0 }, {len, 0 }, { 0, -len}, {0, len}, + {-len, -len}, {-len, len}, {len, -len}, {len, len} + }; + int i; + for (i = 0; i < 8; ++i) { + search_site *const ss = &x->ss[ss_count++]; + ss->mv = ss_mvs[i]; + ss->offset = ss->mv.row * stride + ss->mv.col; + } } - x->ss_count = search_site_count; + x->ss_count = ss_count; x->searches_per_step = 8; } @@ -313,7 +274,8 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { } int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, + int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -333,38 +295,34 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, unsigned int eighthiters = iters_per_step; int thismse; - uint8_t *y = xd->plane[0].pre[0].buf + - (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + - bestmv->as_mv.col; - const int y_stride = xd->plane[0].pre[0].stride; + const int offset = bestmv->row * y_stride + bestmv->col; + uint8_t *y = xd->plane[0].pre[0].buf + offset; - int rr = ref_mv->as_mv.row; - int rc = ref_mv->as_mv.col; - int br = bestmv->as_mv.row << 3; - int bc = bestmv->as_mv.col << 3; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; int hstep = 4; - const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX); - const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX); - const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX); - const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX); + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); int tr = br; int tc = bc; - const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; - // central mv - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; + bestmv->row <<= 3; + bestmv->col <<= 3; // calculate central point error besterr = vfp->vf(y, y_stride, z, src_stride, 
sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - // TODO: Each subsequent iteration checks at least one point in - // common with the last iteration could be 2 ( if diag selected) + // TODO(jbb): Each subsequent iteration checks at least one point in + // common with the last iteration; this could be 2 if diagonal is selected. while (halfiters--) { // 1/2 pel FIRST_LEVEL_CHECKS; @@ -375,8 +333,8 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, tc = bc; } - // TODO: Each subsequent iteration checks at least one point in common with - // the last iteration could be 2 ( if diag selected) 1/4 pel + // TODO(yaowu): Each subsequent iteration checks at least one point in common + // with the last iteration; this could be 2 if diagonal is selected. // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only if (forced_stop != 2) { @@ -391,8 +349,7 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, } } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && - forced_stop == 0) { + if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; while (eighthiters--) { FIRST_LEVEL_CHECKS; @@ -404,18 +361,19 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, } } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)) + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; } int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, + int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -424,49 +382,36 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, int *distortion, unsigned int *sse1) { uint8_t *z = x->plane[0].src.buf; - int src_stride = x->plane[0].src.stride; + const int src_stride = x->plane[0].src.stride; MACROBLOCKD *xd = &x->e_mbd; - int rr, rc, br, bc, hstep; - int tr, tc; unsigned int besterr = INT_MAX; unsigned int sse; unsigned int whichdir; int thismse; - int maxc, minc, maxr, minr; - int y_stride; - int offset; unsigned int halfiters = iters_per_step; unsigned int quarteriters = iters_per_step; unsigned int eighthiters = iters_per_step; - uint8_t *y = xd->plane[0].pre[0].buf + - (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + - bestmv->as_mv.col; - - y_stride = xd->plane[0].pre[0].stride; - - rr = ref_mv->as_mv.row; - rc = ref_mv->as_mv.col; - br = bestmv->as_mv.row << 3; - bc = bestmv->as_mv.col << 3; - hstep = 4; - minc = MAX(x->mv_col_min << 3, - (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1)); - maxc = MIN(x->mv_col_max << 3, - (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1)); - minr = MAX(x->mv_row_min << 3, - (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1)); - maxr = MIN(x->mv_row_max << 3, - (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1)); + const int y_stride = xd->plane[0].pre[0].stride; + const int offset = bestmv->row * y_stride + bestmv->col; + uint8_t *y = xd->plane[0].pre[0].buf + offset; - tr = br; - tc = bc; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; + int hstep = 4; + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - 
MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); - offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + int tr = br; + int tc = bc; // central mv - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; + bestmv->row *= 8; + bestmv->col *= 8; // calculate central point error besterr = vfp->vf(y, y_stride, z, src_stride, sse1); @@ -492,8 +437,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, tc = bc; } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && - forced_stop == 0) { + if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; FIRST_LEVEL_CHECKS; if (eighthiters > 1) { @@ -503,11 +447,11 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, tc = bc; } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; @@ -520,7 +464,8 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, z, src_stride, &sse, second_pred) int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, + int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -543,30 +488,26 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, int thismse; DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); - uint8_t *const y = xd->plane[0].pre[0].buf + - (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + - bestmv->as_mv.col; - const int y_stride = xd->plane[0].pre[0].stride; + const int offset = bestmv->row * y_stride + bestmv->col; + uint8_t *const y = xd->plane[0].pre[0].buf + offset; - int rr = ref_mv->as_mv.row; - int rc = ref_mv->as_mv.col; - int br = bestmv->as_mv.row << 3; - int bc = bestmv->as_mv.col << 3; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; int hstep = 4; - const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX); - const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX); - const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX); - const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX); + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); int tr = br; int tc = bc; - const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; - // central mv - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; + bestmv->row *= 8; + bestmv->col *= 8; // calculate central point error // TODO(yunqingwang): central pointer error was already calculated in full- @@ -604,8 +545,7 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, } } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && - forced_stop == 0) { + if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; while (eighthiters--) { FIRST_LEVEL_CHECKS; @@ -616,18 +556,19 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, tc = bc; } } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 
3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; } int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, + int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, @@ -638,51 +579,37 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, const uint8_t *second_pred, int w, int h) { uint8_t *z = x->plane[0].src.buf; - int src_stride = x->plane[0].src.stride; + const int src_stride = x->plane[0].src.stride; MACROBLOCKD *xd = &x->e_mbd; - int rr, rc, br, bc, hstep; - int tr, tc; unsigned int besterr = INT_MAX; unsigned int sse; unsigned int whichdir; int thismse; - int maxc, minc, maxr, minr; - int y_stride; - int offset; unsigned int halfiters = iters_per_step; unsigned int quarteriters = iters_per_step; unsigned int eighthiters = iters_per_step; DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); - uint8_t *y = xd->plane[0].pre[0].buf + - (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + - bestmv->as_mv.col; - - y_stride = xd->plane[0].pre[0].stride; - - rr = ref_mv->as_mv.row; - rc = ref_mv->as_mv.col; - br = bestmv->as_mv.row << 3; - bc = bestmv->as_mv.col << 3; - hstep = 4; - minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - - ((1 << MV_MAX_BITS) - 1)); - maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + - ((1 << MV_MAX_BITS) - 1)); - minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - - ((1 << MV_MAX_BITS) - 1)); - maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + - ((1 << MV_MAX_BITS) - 1)); - - tr = br; - tc = bc; + const int y_stride = xd->plane[0].pre[0].stride; + const int offset = bestmv->row * y_stride + bestmv->col; + uint8_t *y = xd->plane[0].pre[0].buf + offset; + int rr = ref_mv->row; + int rc = ref_mv->col; + int br = bestmv->row * 8; + int bc = bestmv->col * 8; + int hstep = 4; + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); - offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + int tr = br; + int tc = bc; // central mv - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; + bestmv->row *= 8; + bestmv->col *= 8; // calculate central point error // TODO(yunqingwang): central pointer error was already calculated in full- @@ -716,8 +643,7 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, tc = bc; } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && - forced_stop == 0) { + if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; FIRST_LEVEL_CHECKS; if (eighthiters > 1) { @@ -726,11 +652,11 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, tr = br; tc = bc; } - bestmv->as_mv.row = br; - bestmv->as_mv.col = bc; + bestmv->row = br; + bestmv->col = bc; - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; return besterr; @@ -754,10 +680,10 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, #define CHECK_POINT \ {\ - if (this_mv.as_mv.col < x->mv_col_min) continue;\ - if 
(this_mv.as_mv.col > x->mv_col_max) continue;\ - if (this_mv.as_mv.row < x->mv_row_min) continue;\ - if (this_mv.as_mv.row > x->mv_row_max) continue;\ + if (this_mv.col < x->mv_col_min) continue;\ + if (this_mv.col > x->mv_col_max) continue;\ + if (this_mv.row < x->mv_row_min) continue;\ + if (this_mv.row > x->mv_row_max) continue;\ } #define CHECK_BETTER \ {\ if (thissad < bestsad)\ {\ if (use_mvcost) \ - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, \ + thissad += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, \ mvjsadcost, mvsadcost, \ sad_per_bit);\ if (thissad < bestsad)\ @@ -790,14 +716,14 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, // candidates as indicated in the num_candidates and candidates arrays // passed into this function static int vp9_pattern_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, int do_refine, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, int_mv *best_mv, + const MV *center_mv, MV *best_mv, const int num_candidates[MAX_PATTERN_SCALES], const MV candidates[MAX_PATTERN_SCALES] [MAX_PATTERN_CANDIDATES]) { @@ -810,7 +736,7 @@ static int vp9_pattern_search(MACROBLOCK *x, int what_stride = x->plane[0].src.stride; int in_what_stride = xd->plane[0].pre[0].stride; int br, bc; - int_mv this_mv; + MV this_mv; int bestsad = INT_MAX; int thissad; uint8_t *base_offset; @@ -823,24 +749,22 @@ static int vp9_pattern_search(MACROBLOCK *x, int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + fcenter_mv.as_mv.row = center_mv->row >> 3; + fcenter_mv.as_mv.col = center_mv->col >> 3; // adjust ref_mv to make sure it is within MV range - clamp_mv(&ref_mv->as_mv, - x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - br = ref_mv->as_mv.row; - bc = ref_mv->as_mv.col; + clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + br = ref_mv->row; + bc = ref_mv->col; // Work out the start point for the search base_offset = (uint8_t *)(xd->plane[0].pre[0].buf); this_offset = base_offset + (br * in_what_stride) + bc; - this_mv.as_mv.row = br; - this_mv.as_mv.col = bc; - bestsad = vfp->sdf(what, what_stride, this_offset, - in_what_stride, 0x7fffffff) - + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + this_mv.row = br; + this_mv.col = bc; + bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of @@ -853,21 +777,21 @@ static int vp9_pattern_search(MACROBLOCK *x, CHECK_BOUNDS((1 << t)) if (all_in) { for (i = 0; i < num_candidates[t]; i++) { - this_mv.as_mv.row = br + candidates[t][i].row; 
- this_mv.as_mv.col = bc + candidates[t][i].col; + this_mv.row = br + candidates[t][i].row; + this_mv.col = bc + candidates[t][i].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -897,21 +821,21 @@ static int vp9_pattern_search(MACROBLOCK *x, CHECK_BOUNDS((1 << s)) if (all_in) { for (i = 0; i < num_candidates[s]; i++) { - this_mv.as_mv.row = br + candidates[s][i].row; - this_mv.as_mv.col = bc + candidates[s][i].col; - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_mv.row = br + candidates[s][i].row; + this_mv.col = bc + candidates[s][i].col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER } } else { for (i = 0; i < num_candidates[s]; i++) { - this_mv.as_mv.row = br + candidates[s][i].row; - this_mv.as_mv.col = bc + candidates[s][i].col; + this_mv.row = br + candidates[s][i].row; + this_mv.col = bc + candidates[s][i].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * in_what_stride) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -935,25 +859,21 @@ static int vp9_pattern_search(MACROBLOCK *x, get_next_chkpts(next_chkpts_indices, k, num_candidates[s]); if (all_in) { for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { - this_mv.as_mv.row = br + - candidates[s][next_chkpts_indices[i]].row; - this_mv.as_mv.col = bc + - candidates[s][next_chkpts_indices[i]].col; - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; + this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER } } else { for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { - this_mv.as_mv.row = br + - candidates[s][next_chkpts_indices[i]].row; - this_mv.as_mv.col = bc + - candidates[s][next_chkpts_indices[i]].col; + this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; + this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -980,21 +900,21 @@ static int vp9_pattern_search(MACROBLOCK *x, CHECK_BOUNDS(1) if (all_in) { for (i = 0; i < 4; i++) { - this_mv.as_mv.row = br + neighbors[i].row; - this_mv.as_mv.col = bc + neighbors[i].col; - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_mv.row = br + neighbors[i].row; + this_mv.col = bc + neighbors[i].col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER } } else { for (i = 0; i < 4; i++) { - this_mv.as_mv.row = br + neighbors[i].row; - this_mv.as_mv.col = bc + neighbors[i].col; + this_mv.row = br + neighbors[i].row; + 
this_mv.col = bc + neighbors[i].col; CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + - this_mv.as_mv.col; + this_offset = base_offset + (this_mv.row * (in_what_stride)) + + this_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); CHECK_BETTER @@ -1010,31 +930,32 @@ static int vp9_pattern_search(MACROBLOCK *x, } } - best_mv->as_mv.row = br; - best_mv->as_mv.col = bc; + best_mv->row = br; + best_mv->col = bc; - this_offset = base_offset + (best_mv->as_mv.row * (in_what_stride)) + - best_mv->as_mv.col; - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_offset = base_offset + (best_mv->row * in_what_stride) + + best_mv->col; + this_mv.row = best_mv->row * 8; + this_mv.col = best_mv->col * 8; if (bestsad == INT_MAX) return INT_MAX; - return - vfp->vf(what, what_stride, this_offset, in_what_stride, - (unsigned int *)(&bestsad)) + - use_mvcost ? mv_err_cost(&this_mv, center_mv, x->nmvjointcost, x->mvcost, - x->errorperbit) : 0; + + return vfp->vf(what, what_stride, this_offset, in_what_stride, + (unsigned int *)&bestsad) + + use_mvcost ? mv_err_cost(&this_mv, center_mv, + x->nmvjointcost, x->mvcost, x->errorperbit) + : 0; } int vp9_hex_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, int_mv *best_mv) { + const MV *center_mv, MV *best_mv) { // First scale has 8-closest points, the rest have 6 points in hex shape // at increasing scales static const int hex_num_candidates[MAX_PATTERN_SCALES] = { @@ -1063,14 +984,14 @@ int vp9_hex_search(MACROBLOCK *x, } int vp9_bigdia_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv) { + const MV *center_mv, + MV *best_mv) { // First scale has 4-closest points, the rest have 8 points in diamond // shape at increasing scales static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { @@ -1097,22 +1018,21 @@ int vp9_bigdia_search(MACROBLOCK *x, {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024}, {-512, 512}, {-1024, 0}}, }; - return - vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, - center_mv, best_mv, - bigdia_num_candidates, bigdia_candidates); + return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + bigdia_num_candidates, bigdia_candidates); } int vp9_square_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv) { + const MV *center_mv, + MV *best_mv) { // All scales have 8 closest points in square shape static const int square_num_candidates[MAX_PATTERN_SCALES] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, @@ -1139,11 +1059,10 @@ int vp9_square_search(MACROBLOCK *x, {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024}, {0, 1024}, {-1024, 1024}, {-1024, 0}}, }; - return - vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, - center_mv, best_mv, - square_num_candidates, square_candidates); + return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + square_num_candidates, 
square_candidates); }; #undef CHECK_BOUNDS @@ -1199,13 +1118,14 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, best_address = in_what; // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, - in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); - // search_param determines the length of the initial step and hence the number of iterations - // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. + // search_param determines the length of the initial step and hence the number + // of iterations + // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = + // (MAX_FIRST_STEP/4) pel... etc. ss = &x->ss[search_param * x->searches_per_step]; tot_steps = (x->ss_count / x->searches_per_step) - search_param; @@ -1228,7 +1148,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1260,7 +1180,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1274,19 +1194,21 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, break; }; #endif - } else if (best_address == in_what) + } else if (best_address == in_what) { (*num00)++; + } } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad == INT_MAX) return INT_MAX; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + mv_err_cost(&this_mv, center_mv, mvjcost, - mvcost, x->errorperbit); + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, &center_mv->as_mv, + mvjcost, mvcost, x->errorperbit); } int vp9_diamond_search_sadx4(MACROBLOCK *x, @@ -1340,13 +1262,15 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, best_address = in_what; // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, - in_what, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); - // search_param determines the length of the initial step and hence the number of iterations - // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc. + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel...
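To make the step-size comment above concrete: the initial step is simply MAX_FIRST_STEP halved once per increment of search_param. A minimal sketch, not part of the change, assuming the MAX_FIRST_STEP definition from vp9_mcomp.h:

/* Hypothetical helper: first diamond step length in full pels.
 * search_param = 0 -> MAX_FIRST_STEP, 1 -> MAX_FIRST_STEP / 2, ... */
static int first_step_size(int search_param) {
  return MAX_FIRST_STEP >> search_param;
}

Small values of search_param therefore buy both a wider initial sweep and more refinement iterations, since tot_steps below also shrinks as search_param grows.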
ss = &x->ss[search_param * x->searches_per_step]; tot_steps = (x->ss_count / x->searches_per_step) - search_param; @@ -1355,13 +1279,16 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, for (step = 0; step < tot_steps; step++) { int all_in = 1, t; - // To know if all neighbor points are within the bounds, 4 bounds checking are enough instead of - // checking 4 bounds for each points. + // All_in is true if every one of the points we are checking is within + // the bounds of the image. all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min); all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max); all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min); all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max); + // If all the pixels are within the bounds we don't check whether the + // search point is valid in this loop, otherwise we check each point + // for validity. if (all_in) { unsigned int sad_array[4]; @@ -1378,7 +1305,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, if (sad_array[t] < bestsad) { this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row; this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col; - sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv, + sad_array[t] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (sad_array[t] < bestsad) { @@ -1394,15 +1321,18 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, this_row_offset = best_mv->as_mv.row + ss[i].mv.row; this_col_offset = best_mv->as_mv.col + ss[i].mv.col; - if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) { + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { check_here = ss[i].offset + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1433,7 +1363,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1447,19 +1377,21 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, break; }; #endif - } else if (best_address == in_what) + } else if (best_address == in_what) { (*num00)++; + } } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad == INT_MAX) return INT_MAX; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + mv_err_cost(&this_mv, - center_mv, mvjcost, mvcost, x->errorperbit); + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, &center_mv->as_mv, + mvjcost, mvcost, x->errorperbit); } /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond refining search */ @@ -1482,16 +1414,17 @@ int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, n = num00;
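One pattern repeated throughout these hunks: shifting motion-vector components with "<< 3" becomes multiplying by 8. Both convert a full-pel vector into the eighth-pel units that mv_err_cost expects, but in C99 left-shifting a negative signed value is undefined behavior, while the multiply is well defined (and compilers emit the same shift instruction when it is safe). Side by side, as an illustration only:

/* full-pel -> 1/8-pel units; row may be negative */
this_mv.as_mv.row = best_mv->as_mv.row * 8;   /* well defined */
this_mv.as_mv.row = best_mv->as_mv.row << 3;  /* UB for negative row */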
num00 = 0; - /* If there won't be more n-step search, check to see if refining search is needed. */ + /* If there won't be more n-step search, check to see if refining search is + * needed. */ if (n > further_steps) do_refine = 0; while (n < further_steps) { n++; - if (num00) + if (num00) { num00--; - else { + } else { thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv, step_param + n, sadpb, &num00, fn_ptr, x->nmvjointcost, x->mvcost, @@ -1570,8 +1503,8 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, // Baseline value at the centre bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Apply further limits to prevent us looking using vectors that stretch // beyond the UMV border @@ -1585,11 +1518,12 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, check_here = r * mv_stride + in_what + col_min; for (c = col_min; c < col_max; c++) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1602,14 +1536,14 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, } } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, &center_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1660,8 +1594,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, // Baseline value at the centre bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Apply further limits to prevent us looking using vectors that stretch // beyond the UMV border @@ -1685,8 +1619,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1702,11 +1636,12 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, } while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1720,17 +1655,16 @@ int vp9_full_search_sadx3(MACROBLOCK *x,
int_mv *ref_mv, check_here++; c++; } - } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, &center_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1783,8 +1717,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, // Baseline value at the centre bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, - sad_per_bit); + + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); // Apply further limits to prevent us looking using vectors that stretch // beyond the UMV border @@ -1808,8 +1742,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1834,7 +1768,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { @@ -1851,12 +1785,13 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, } while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); if (thissad < bestsad) { this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1871,14 +1806,14 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, } } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, &center_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1909,8 +1844,10 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, best_address, + in_what_stride, 0x7fffffff) + + mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; @@
-1919,16 +1856,20 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, this_row_offset = ref_mv->as_mv.row + neighbors[j].row; this_col_offset = ref_mv->as_mv.col + neighbors[j].col; - if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) { - check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -1938,23 +1879,24 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, } } - if (best_site == -1) + if (best_site == -1) { break; - else { + } else { ref_mv->as_mv.row += neighbors[best_site].row; ref_mv->as_mv.col += neighbors[best_site].col; - best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col; + best_address += (neighbors[best_site].row) * in_what_stride + + neighbors[best_site].col; } } - this_mv.as_mv.row = ref_mv->as_mv.row << 3; - this_mv.as_mv.col = ref_mv->as_mv.col << 3; + this_mv.as_mv.row = ref_mv->as_mv.row * 8; + this_mv.as_mv.col = ref_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, &center_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -1986,8 +1928,10 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + bestsad = fn_ptr->sdf(what, what_stride, best_address, + in_what_stride, 0x7fffffff) + + mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; @@ -2004,14 +1948,15 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, block_offset[2] = best_address + 1; block_offset[3] = best_address + in_what_stride; - fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array); + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); for (j = 0; j < 4; j++) { if (sad_array[j] < bestsad) { this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row; this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col; - sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); + sad_array[j] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (sad_array[j] < bestsad) { bestsad = sad_array[j]; @@ -2024,16 +1969,20 @@ int
vp9_refining_search_sadx4(MACROBLOCK *x, this_row_offset = ref_mv->as_mv.row + neighbors[j].row; this_col_offset = ref_mv->as_mv.col + neighbors[j].col; - if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) { - check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (thissad < bestsad) { bestsad = thissad; @@ -2044,23 +1993,24 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, } } - if (best_site == -1) + if (best_site == -1) { break; - else { + } else { ref_mv->as_mv.row += neighbors[best_site].row; ref_mv->as_mv.col += neighbors[best_site].col; - best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col; + best_address += (neighbors[best_site].row) * in_what_stride + + neighbors[best_site].col; } } - this_mv.as_mv.row = ref_mv->as_mv.row << 3; - this_mv.as_mv.col = ref_mv->as_mv.col << 3; + this_mv.as_mv.row = ref_mv->as_mv.row * 8; + this_mv.as_mv.col = ref_mv->as_mv.col * 8; if (bestsad < INT_MAX) - return - fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv.as_mv, &center_mv->as_mv, + mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -2100,7 +2050,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, /* Get compound pred by averaging two pred blocks. */ bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride, second_pred, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; @@ -2123,9 +2074,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, - mvsadcost, error_per_bit); - + thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, + mvjsadcost, mvsadcost, error_per_bit); if (thissad < bestsad) { bestsad = thissad; best_site = j; @@ -2144,16 +2094,16 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, } } - this_mv.as_mv.row = ref_mv->as_mv.row << 3; - this_mv.as_mv.col = ref_mv->as_mv.col << 3; + this_mv.as_mv.row = ref_mv->as_mv.row * 8; + this_mv.as_mv.col = ref_mv->as_mv.col * 8; if (bestsad < INT_MAX) { // FIXME(rbultje, yunqing): add full-pixel averaging variance functions // so we don't have to use the subpixel with xoff=0,yoff=0 here.
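The sdaf/svaf hooks used in this function fold compound prediction into the distortion call: the reference block is first averaged, pixel by pixel, with second_pred. The averaging itself is only a rounded mean; a plain-C sketch of the idea (hypothetical helper assuming 8-bit samples; the library's real prototypes live in its variance code and may differ):

#include <stdint.h>

/* Average two predictors with rounding before SAD/variance is taken. */
static void avg_pred(uint8_t *comp, const uint8_t *pred, int width,
                     int height, const uint8_t *ref, int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j)
      comp[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
    comp += width;
    pred += width;
    ref += ref_stride;
  }
}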
- return fn_ptr->svaf(best_address, in_what_stride, 0, 0, - what, what_stride, (unsigned int *)(&thissad), - second_pred) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); + return fn_ptr->svaf(best_address, in_what_stride, 0, 0, what, what_stride, + (unsigned int *)(&thissad), second_pred) + + mv_err_cost(&this_mv.as_mv, &center_mv->as_mv, + mvjcost, mvcost, x->errorperbit); } else { return INT_MAX; } diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h index 3598fa0..bcab679 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libvpx/vp9/encoder/vp9_mcomp.h @@ -22,10 +22,14 @@ #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) // Maximum size of the first step in full pel units #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) +// Allowed motion vector pixel distance outside image border +// for Block_16x16 +#define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND) + void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv); -int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, - int *mvcost[2], int weight); +int vp9_mv_bit_cost(const MV *mv, const MV *ref, + const int *mvjcost, int *mvcost[2], int weight); void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); @@ -40,37 +44,37 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, int_mv *ref_mv, int_mv *dst_mv); int vp9_hex_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int error_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vf, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv); + const MV *center_mv, + MV *best_mv); int vp9_bigdia_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int error_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vf, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv); + const MV *center_mv, + MV *best_mv); int vp9_square_search(MACROBLOCK *x, - int_mv *ref_mv, + MV *ref_mv, int search_param, int error_per_bit, int do_init_search, const vp9_variance_fn_ptr_t *vf, int use_mvcost, - int_mv *center_mv, - int_mv *best_mv); + const MV *center_mv, + MV *best_mv); typedef int (fractional_mv_step_fp) ( MACROBLOCK *x, - int_mv *bestmv, - int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, + int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only @@ -84,7 +88,8 @@ extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; typedef int (fractional_mv_step_comp_fp) ( MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, + MV *bestmv, const MV *ref_mv, + int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only diff --git a/libvpx/vp9/encoder/vp9_modecosts.c b/libvpx/vp9/encoder/vp9_modecosts.c index a5dfaed..7eb6592 100644 --- a/libvpx/vp9/encoder/vp9_modecosts.c +++ b/libvpx/vp9/encoder/vp9_modecosts.c @@ -17,7 +17,7 @@ void vp9_init_mode_costs(VP9_COMP *c) { VP9_COMMON *const cm = &c->common; - const vp9_tree_p KT = vp9_intra_mode_tree; + const vp9_tree_index *KT = vp9_intra_mode_tree; int i, j; for (i = 0; i < INTRA_MODES; i++) { @@ -36,7 +36,7 @@ void vp9_init_mode_costs(VP9_COMP *c) { vp9_kf_uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree); - for (i = 0; i <= SWITCHABLE_FILTERS; ++i) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i], cm->fc.switchable_interp_prob[i], vp9_switchable_interp_tree); diff --git
a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c index 883b31e..f922f90 100644 --- a/libvpx/vp9/encoder/vp9_onyx_if.c +++ b/libvpx/vp9/encoder/vp9_onyx_if.c @@ -8,44 +8,35 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <math.h> +#include <stdio.h> +#include <limits.h> + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" -#include "vpx_config.h" +#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_filter.h" -#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_idct.h" +#if CONFIG_VP9_POSTPROC +#include "vp9/common/vp9_postproc.h" +#endif #include "vp9/common/vp9_reconinter.h" -#include "vp9/encoder/vp9_onyx_int.h" #include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_quantize.h" -#include "vp9/common/vp9_alloccommon.h" -#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/common/vp9_tile_common.h" #include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_mbgraph.h" +#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_picklpf.h" #include "vp9/encoder/vp9_psnr.h" -#include "vpx_scale/vpx_scale.h" -#include "vp9/common/vp9_extend.h" #include "vp9/encoder/vp9_ratectrl.h" -#include "vp9/common/vp9_quant_common.h" -#include "vp9/common/vp9_tile_common.h" -#include "vp9/encoder/vp9_segmentation.h" -#include "./vp9_rtcd.h" -#include "./vpx_scale_rtcd.h" -#if CONFIG_VP9_POSTPROC -#include "vp9/common/vp9_postproc.h" -#endif -#include "vpx_mem/vpx_mem.h" -#include "vpx_ports/vpx_timer.h" - -#include "vp9/common/vp9_seg_common.h" -#include "vp9/encoder/vp9_mbgraph.h" -#include "vp9/common/vp9_pred_common.h" #include "vp9/encoder/vp9_rdopt.h" -#include "vp9/encoder/vp9_bitstream.h" -#include "vp9/encoder/vp9_picklpf.h" -#include "vp9/common/vp9_mvref_common.h" +#include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_vaq.h" + +#include "vpx_ports/vpx_timer.h" -#include <math.h> -#include <stdio.h> -#include <limits.h> extern void print_tree_update_probs(); @@ -55,16 +46,20 @@ static void set_default_lf_deltas(struct loopfilter *lf); #define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */ -#define ALTREF_HIGH_PRECISION_MV 1 /* whether to use high precision mv - for altref computation */ -#define HIGH_PRECISION_MV_QTHRESH 200 /* Q threshold for use of high precision - mv. Choose a very high value for - now so that HIGH_PRECISION is always - chosen */ +#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv + // for altref computation. +#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision + // mv. Choose a very high value for + // now so that HIGH_PRECISION is always + // chosen. 
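Much of this patch is the mechanical half of one refactor: int_mv parameters become plain MV (or const MV *) wherever only the row/col fields are read. The two types alias the same 32 bits; the union exists so a vector can be copied or compared as a single integer. For reference, the declarations from vp9/common/vp9_mv.h:

typedef struct mv {
  int16_t row;
  int16_t col;
} MV;

typedef union int_mv {
  uint32_t as_int;  /* whole-vector copy/compare in one operation */
  MV as_mv;
} int_mv;

Dropping the union from the search signatures removes the .as_mv noise deleted above while leaving the as_int trick available to call sites that still want it. The components stay in eighth-pel units either way; allow_high_precision_mv only decides whether the lowest bit is actually coded.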
-#if CONFIG_INTERNAL_STATS -#include "math.h" +// Masks for partially or completely disabling split mode +#define DISABLE_ALL_SPLIT 0x3F +#define DISABLE_ALL_INTER_SPLIT 0x1F +#define DISABLE_COMPOUND_SPLIT 0x18 +#define LAST_AND_INTRA_SPLIT_ONLY 0x1E +#if CONFIG_INTERNAL_STATS extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, int lumamask, double *weight); @@ -107,7 +102,8 @@ extern void write_switchable_interp_stats(); #endif #ifdef SPEEDSTATS -unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0}; #endif #if defined(SECTIONBITS_OUTPUT) @@ -122,6 +118,8 @@ static int kf_high_motion_minq[QINDEX_RANGE]; static int gf_low_motion_minq[QINDEX_RANGE]; static int gf_high_motion_minq[QINDEX_RANGE]; static int inter_minq[QINDEX_RANGE]; +static int afq_low_motion_minq[QINDEX_RANGE]; +static int afq_high_motion_minq[QINDEX_RANGE]; static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { @@ -193,24 +191,55 @@ static void init_minq_luts(void) { gf_low_motion_minq[i] = calculate_minq_index(maxq, 0.0000015, -0.0009, - 0.33, + 0.32, 0.0); gf_high_motion_minq[i] = calculate_minq_index(maxq, 0.0000021, -0.00125, - 0.45, + 0.50, 0.0); inter_minq[i] = calculate_minq_index(maxq, 0.00000271, -0.00113, - 0.697, + 0.75, 0.0); + afq_low_motion_minq[i] = calculate_minq_index(maxq, + 0.0000015, + -0.0009, + 0.33, + 0.0); + afq_high_motion_minq[i] = calculate_minq_index(maxq, + 0.0000021, + -0.00125, + 0.55, + 0.0); + } +} +static int get_active_quality(int q, + int gfu_boost, + int low, + int high, + int *low_motion_minq, + int *high_motion_minq) { + int active_best_quality; + if (gfu_boost > high) { + active_best_quality = low_motion_minq[q]; + } else if (gfu_boost < low) { + active_best_quality = high_motion_minq[q]; + } else { + const int gap = high - low; + const int offset = high - gfu_boost; + const int qdiff = high_motion_minq[q] - low_motion_minq[q]; + const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; + active_best_quality = low_motion_minq[q] + adjustment; } + return active_best_quality; } -static void set_mvcost(MACROBLOCK *mb) { - if (mb->e_mbd.allow_high_precision_mv) { +static void set_mvcost(VP9_COMP *cpi) { + MACROBLOCK *const mb = &cpi->mb; + if (cpi->common.allow_high_precision_mv) { mb->mvcost = mb->nmvcost_hp; mb->mvsadcost = mb->nmvsadcost_hp; } else { @@ -284,14 +313,17 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->mb_norm_activity_map); cpi->mb_norm_activity_map = 0; - vpx_free(cpi->mb.pip); - cpi->mb.pip = 0; + vpx_free(cpi->above_context[0]); + cpi->above_context[0] = NULL; + + vpx_free(cpi->above_seg_context); + cpi->above_seg_context = NULL; } // Computes a q delta (in "q index" terms) to get from a starting q value // to a target value // target q value -static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) { +int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) { int i; int start_index = cpi->worst_quality; int target_index = cpi->worst_quality; @@ -355,7 +387,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) { seg->update_map = 1; seg->update_data = 1; - qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875)); + qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875)); vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2)); vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); @@ -364,7 +396,6 @@ static void 
configure_static_seg_features(VP9_COMP *cpi) { // Where relevant assume segment data is delta data seg->abs_delta = SEGMENT_DELTADATA; - } } else if (seg->enabled) { // All other frames if segmentation has been enabled @@ -377,8 +408,8 @@ static void configure_static_seg_features(VP9_COMP *cpi) { seg->update_data = 1; seg->abs_delta = SEGMENT_DELTADATA; - qi_delta = compute_qdelta(cpi, cpi->avg_q, - (cpi->avg_q * 1.125)); + qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, + (cpi->avg_q * 1.125)); vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2)); vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); @@ -421,8 +452,8 @@ static void configure_static_seg_features(VP9_COMP *cpi) { // Skip all MBs if high Q (0,0 mv and skip coeffs) if (high_q) { - vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP); - vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP); + vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP); + vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP); } // Enable data update seg->update_data = 1; @@ -565,16 +596,16 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) { sf->thresh_mult[THR_NEARESTG] = 0; sf->thresh_mult[THR_NEARESTA] = 0; - sf->thresh_mult[THR_NEWMV] += 1000; - sf->thresh_mult[THR_COMP_NEARESTLA] += 1000; - sf->thresh_mult[THR_NEARMV] += 1000; - sf->thresh_mult[THR_COMP_NEARESTGA] += 1000; - sf->thresh_mult[THR_DC] += 1000; - sf->thresh_mult[THR_NEWG] += 1000; + sf->thresh_mult[THR_NEWMV] += 1000; sf->thresh_mult[THR_NEWA] += 1000; + sf->thresh_mult[THR_NEWG] += 1000; + + sf->thresh_mult[THR_NEARMV] += 1000; sf->thresh_mult[THR_NEARA] += 1000; + sf->thresh_mult[THR_COMP_NEARESTLA] += 1000; + sf->thresh_mult[THR_COMP_NEARESTGA] += 1000; sf->thresh_mult[THR_TM] += 1000; @@ -584,19 +615,12 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) { sf->thresh_mult[THR_COMP_NEARGA] += 1500; sf->thresh_mult[THR_COMP_NEWGA] += 2000; - sf->thresh_mult[THR_SPLITMV] += 2500; - sf->thresh_mult[THR_SPLITG] += 2500; - sf->thresh_mult[THR_SPLITA] += 2500; - sf->thresh_mult[THR_COMP_SPLITLA] += 4500; - sf->thresh_mult[THR_COMP_SPLITGA] += 4500; - sf->thresh_mult[THR_ZEROMV] += 2000; sf->thresh_mult[THR_ZEROG] += 2000; sf->thresh_mult[THR_ZEROA] += 2000; sf->thresh_mult[THR_COMP_ZEROLA] += 2500; sf->thresh_mult[THR_COMP_ZEROGA] += 2500; - sf->thresh_mult[THR_B_PRED] += 2500; sf->thresh_mult[THR_H_PRED] += 2000; sf->thresh_mult[THR_V_PRED] += 2000; sf->thresh_mult[THR_D45_PRED ] += 2500; @@ -606,49 +630,24 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) { sf->thresh_mult[THR_D207_PRED] += 2500; sf->thresh_mult[THR_D63_PRED] += 2500; - if (cpi->sf.skip_lots_of_modes) { - for (i = 0; i < MAX_MODES; ++i) - sf->thresh_mult[i] = INT_MAX; - - sf->thresh_mult[THR_DC] = 2000; - sf->thresh_mult[THR_TM] = 2000; - sf->thresh_mult[THR_NEWMV] = 4000; - sf->thresh_mult[THR_NEWG] = 4000; - sf->thresh_mult[THR_NEWA] = 4000; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG] = 0; - sf->thresh_mult[THR_NEARESTA] = 0; - sf->thresh_mult[THR_NEARMV] = 2000; - sf->thresh_mult[THR_NEARG] = 2000; - sf->thresh_mult[THR_NEARA] = 2000; - sf->thresh_mult[THR_COMP_NEARESTLA] = 2000; - sf->thresh_mult[THR_SPLITMV] = 2500; - sf->thresh_mult[THR_SPLITG] = 2500; - sf->thresh_mult[THR_SPLITA] = 2500; - sf->recode_loop = 0; - } - /* disable frame modes if flags not set */ if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) { sf->thresh_mult[THR_NEWMV ] = INT_MAX; sf->thresh_mult[THR_NEARESTMV] = INT_MAX; sf->thresh_mult[THR_ZEROMV ] = INT_MAX; sf->thresh_mult[THR_NEARMV ] = INT_MAX; - 
sf->thresh_mult[THR_SPLITMV ] = INT_MAX; } if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) { sf->thresh_mult[THR_NEARESTG ] = INT_MAX; sf->thresh_mult[THR_ZEROG ] = INT_MAX; sf->thresh_mult[THR_NEARG ] = INT_MAX; sf->thresh_mult[THR_NEWG ] = INT_MAX; - sf->thresh_mult[THR_SPLITG ] = INT_MAX; } if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) { sf->thresh_mult[THR_NEARESTA ] = INT_MAX; sf->thresh_mult[THR_ZEROA ] = INT_MAX; sf->thresh_mult[THR_NEARA ] = INT_MAX; sf->thresh_mult[THR_NEWA ] = INT_MAX; - sf->thresh_mult[THR_SPLITA ] = INT_MAX; } if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != @@ -657,7 +656,6 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) { sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX; sf->thresh_mult[THR_COMP_NEARLA ] = INT_MAX; sf->thresh_mult[THR_COMP_NEWLA ] = INT_MAX; - sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX; } if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) { @@ -665,17 +663,42 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) { sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX; sf->thresh_mult[THR_COMP_NEARGA ] = INT_MAX; sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX; - sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX; } +} + +static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi, int mode) { + SPEED_FEATURES *sf = &cpi->sf; + int i; + + for (i = 0; i < MAX_REFS; ++i) + sf->thresh_mult_sub8x8[i] = mode == 0 ? -500 : 0; - if (sf->disable_splitmv == 1) { - sf->thresh_mult[THR_SPLITMV ] = INT_MAX; - sf->thresh_mult[THR_SPLITG ] = INT_MAX; - sf->thresh_mult[THR_SPLITA ] = INT_MAX; + sf->thresh_mult_sub8x8[THR_LAST] += 2500; + sf->thresh_mult_sub8x8[THR_GOLD] += 2500; + sf->thresh_mult_sub8x8[THR_ALTR] += 2500; + sf->thresh_mult_sub8x8[THR_INTRA] += 2500; + sf->thresh_mult_sub8x8[THR_COMP_LA] += 4500; + sf->thresh_mult_sub8x8[THR_COMP_GA] += 4500; - sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX; - sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX; + // Check for masked out split cases. 
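For reference, here is how the four masks defined earlier decode, assuming the sub8x8 reference order THR_LAST = 0, THR_GOLD, THR_ALTR, THR_COMP_LA, THR_COMP_GA, THR_INTRA (one bit per entry; a set bit disables split for that reference):

/* DISABLE_ALL_SPLIT          0x3F  111111  every entry
 * DISABLE_ALL_INTER_SPLIT    0x1F  011111  all inter references
 * DISABLE_COMPOUND_SPLIT     0x18  011000  the two compound references
 * LAST_AND_INTRA_SPLIT_ONLY  0x1E  011110  keep only LAST and INTRA */
const int split_disabled = (sf->disable_split_mask >> i) & 1;

The loop below applies exactly this per-reference test, pricing masked-out entries at INT_MAX so they are never searched.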
+ for (i = 0; i < MAX_REFS; i++) { + if (sf->disable_split_mask & (1 << i)) + sf->thresh_mult_sub8x8[i] = INT_MAX; } + + // disable mode test if frame flag is not set + if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) + sf->thresh_mult_sub8x8[THR_LAST] = INT_MAX; + if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) + sf->thresh_mult_sub8x8[THR_GOLD] = INT_MAX; + if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) + sf->thresh_mult_sub8x8[THR_ALTR] = INT_MAX; + if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != + (VP9_LAST_FLAG | VP9_ALT_FLAG)) + sf->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX; + if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != + (VP9_GOLD_FLAG | VP9_ALT_FLAG)) + sf->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX; } void vp9_set_speed_features(VP9_COMP *cpi) { @@ -688,12 +711,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { if (mode > 1) mode = 1; - // Initialise default mode frequency sampling variables - for (i = 0; i < MAX_MODES; i ++) { - cpi->mode_check_freq[i] = 0; - cpi->mode_test_hit_counts[i] = 0; + for (i = 0; i < MAX_MODES; ++i) cpi->mode_chosen_counts[i] = 0; - } // best quality defaults sf->RD = 1; @@ -708,30 +727,28 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->max_step_search_steps = MAX_MVSEARCH_STEPS; sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->adaptive_rd_thresh = 0; - sf->use_lastframe_partitioning = 0; + sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF; sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; sf->use_avoid_tested_higherror = 0; sf->reference_masking = 0; - sf->skip_lots_of_modes = 0; - sf->partition_by_variance = 0; sf->use_one_partition_size_always = 0; sf->less_rectangular_check = 0; sf->use_square_partition_only = 0; sf->auto_min_max_partition_size = 0; - sf->auto_min_max_partition_interval = 0; - sf->auto_min_max_partition_count = 0; sf->max_partition_size = BLOCK_64X64; sf->min_partition_size = BLOCK_4X4; sf->adjust_partitioning_from_last_frame = 0; sf->last_partitioning_redo_frequency = 4; - sf->disable_splitmv = 0; + sf->disable_split_mask = 0; sf->mode_search_skip_flags = 0; sf->disable_split_var_thresh = 0; sf->disable_filter_search_var_thresh = 0; - sf->intra_y_mode_mask = ALL_INTRA_MODES; - sf->intra_uv_mode_mask = ALL_INTRA_MODES; + for (i = 0; i < TX_SIZES; i++) { + sf->intra_y_mode_mask[i] = ALL_INTRA_MODES; + sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES; + } sf->use_rd_breakout = 0; sf->skip_encode_sb = 0; sf->use_uv_intra_rd_estimate = 0; @@ -747,8 +764,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->static_segmentation = 0; #endif + sf->variance_adaptive_quantization = 0; + switch (mode) { - case 0: // best quality mode + case 0: // This is the best quality mode. break; case 1: @@ -759,121 +778,148 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->static_segmentation = 0; #endif sf->use_avoid_tested_higherror = 1; - sf->adaptive_rd_thresh = MIN((speed + 1), 4); + sf->adaptive_rd_thresh = 1; + sf->recode_loop = (speed < 1); if (speed == 1) { - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->use_square_partition_only = !frame_is_intra_only(&cpi->common); + sf->less_rectangular_check = 1; + sf->tx_size_search_method = frame_is_intra_only(&cpi->common) + ? USE_FULL_RD : USE_LARGESTALL; + + if (MIN(cpi->common.width, cpi->common.height) >= 720) + sf->disable_split_mask = cpi->common.show_frame ? 
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + else + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + + sf->use_rd_breakout = 1; + sf->adaptive_motion_search = 1; + sf->auto_mv_step_size = 1; + sf->adaptive_rd_thresh = 2; + sf->recode_loop = 2; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + } + if (speed == 2) { + sf->use_square_partition_only = !frame_is_intra_only(&cpi->common); sf->less_rectangular_check = 1; - sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only || - cpi->common.show_frame == 0) ? - USE_FULL_RD : - USE_LARGESTALL); - sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only || - cpi->common.show_frame == 0); - sf->disable_splitmv = - (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; + sf->tx_size_search_method = frame_is_intra_only(&cpi->common) + ? USE_FULL_RD : USE_LARGESTALL; + + if (MIN(cpi->common.width, cpi->common.height) >= 720) + sf->disable_split_mask = cpi->common.show_frame ? + DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + else + sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; + + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; - sf->use_uv_intra_rd_estimate = 1; + sf->use_rd_breakout = 1; - sf->skip_encode_sb = 1; - sf->use_lp32x32fdct = 1; sf->adaptive_motion_search = 1; sf->auto_mv_step_size = 1; - sf->auto_min_max_partition_size = 1; - sf->auto_min_max_partition_interval = 1; - // FIXME(jingning): temporarily turn off disable_split_var_thresh - // during refactoring process. will get this back after finishing - // the main framework of partition search type. - sf->disable_split_var_thresh = 0; sf->disable_filter_search_var_thresh = 16; - - sf->intra_y_mode_mask = INTRA_DC_TM_H_V; - sf->intra_uv_mode_mask = INTRA_DC_TM_H_V; - sf->use_fast_coef_updates = 1; - sf->mode_skip_start = 9; - } - if (speed == 2) { - sf->less_rectangular_check = 1; - sf->use_square_partition_only = 1; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - sf->use_lastframe_partitioning = 1; + + sf->auto_min_max_partition_size = 1; + sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; sf->adjust_partitioning_from_last_frame = 1; sf->last_partitioning_redo_frequency = 3; - sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only || - cpi->common.show_frame == 0) ? 
- USE_FULL_RD : - USE_LARGESTALL); + + sf->adaptive_rd_thresh = 2; + sf->recode_loop = 2; + sf->mode_skip_start = 11; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + } + if (speed == 3) { + sf->use_square_partition_only = 1; + sf->tx_size_search_method = USE_LARGESTALL; + + if (MIN(cpi->common.width, cpi->common.height) >= 720) + sf->disable_split_mask = DISABLE_ALL_SPLIT; + else + sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_COMP_REFMISMATCH | - FLAG_SKIP_INTRA_LOWVAR | - FLAG_EARLY_TERMINATE; - sf->intra_y_mode_mask = INTRA_DC_TM; - sf->intra_uv_mode_mask = INTRA_DC_TM; - sf->use_uv_intra_rd_estimate = 1; + FLAG_SKIP_INTRA_LOWVAR; + sf->use_rd_breakout = 1; - sf->skip_encode_sb = 1; - sf->use_lp32x32fdct = 1; sf->adaptive_motion_search = 1; - sf->using_small_partition_info = 0; - sf->disable_splitmv = - (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; sf->auto_mv_step_size = 1; - sf->search_method = SQUARE; - sf->subpel_iters_per_step = 1; - sf->use_fast_lpf_pick = 1; + + sf->disable_filter_search_var_thresh = 16; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->auto_min_max_partition_size = 1; - sf->auto_min_max_partition_interval = 2; - sf->disable_split_var_thresh = 32; - sf->disable_filter_search_var_thresh = 32; + sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; + sf->adjust_partitioning_from_last_frame = 1; + sf->last_partitioning_redo_frequency = 3; + + sf->use_uv_intra_rd_estimate = 1; + sf->skip_encode_sb = 1; + sf->use_lp32x32fdct = 1; + sf->subpel_iters_per_step = 1; sf->use_fast_coef_updates = 2; - sf->mode_skip_start = 9; + + sf->adaptive_rd_thresh = 4; + sf->mode_skip_start = 6; } - if (speed == 3) { - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - sf->partition_by_variance = 1; - sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only || - cpi->common.show_frame == 0) ? 
- USE_FULL_RD : - USE_LARGESTALL); + if (speed == 4) { + sf->use_square_partition_only = 1; + sf->tx_size_search_method = USE_LARGESTALL; + sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_COMP_REFMISMATCH | FLAG_SKIP_INTRA_LOWVAR | FLAG_EARLY_TERMINATE; + sf->use_rd_breakout = 1; + sf->adaptive_motion_search = 1; + sf->auto_mv_step_size = 1; + + sf->disable_filter_search_var_thresh = 16; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + + sf->auto_min_max_partition_size = 1; + sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; + sf->adjust_partitioning_from_last_frame = 1; + sf->last_partitioning_redo_frequency = 3; + + sf->use_uv_intra_rd_estimate = 1; sf->skip_encode_sb = 1; sf->use_lp32x32fdct = 1; - sf->disable_splitmv = 1; - sf->auto_mv_step_size = 1; - sf->search_method = BIGDIA; sf->subpel_iters_per_step = 1; - sf->disable_split_var_thresh = 64; - sf->disable_filter_search_var_thresh = 64; - sf->intra_y_mode_mask = INTRA_DC_ONLY; - sf->intra_uv_mode_mask = INTRA_DC_ONLY; sf->use_fast_coef_updates = 2; - sf->mode_skip_start = 9; + + sf->adaptive_rd_thresh = 4; + sf->mode_skip_start = 6; + + /* sf->intra_y_mode_mask = INTRA_DC_ONLY; + sf->intra_uv_mode_mask = INTRA_DC_ONLY; + sf->search_method = BIGDIA; + sf->disable_split_var_thresh = 64; + sf->disable_filter_search_var_thresh = 64; */ } - if (speed == 4) { + if (speed == 5) { sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->use_one_partition_size_always = 1; sf->always_this_block_size = BLOCK_16X16; - sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only || - cpi->common.show_frame == 0) ? - USE_FULL_RD : - USE_LARGESTALL); + sf->tx_size_search_method = frame_is_intra_only(&cpi->common) ? + USE_FULL_RD : USE_LARGESTALL; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | @@ -887,22 +933,25 @@ void vp9_set_speed_features(VP9_COMP *cpi) { // sf->reduce_first_step_size = 1; // sf->reference_masking = 1; - sf->disable_splitmv = 1; + sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->search_method = HEX; sf->subpel_iters_per_step = 1; sf->disable_split_var_thresh = 64; sf->disable_filter_search_var_thresh = 96; - sf->intra_y_mode_mask = INTRA_DC_ONLY; - sf->intra_uv_mode_mask = INTRA_DC_ONLY; + for (i = 0; i < TX_SIZES; i++) { + sf->intra_y_mode_mask[i] = INTRA_DC_ONLY; + sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; + } sf->use_fast_coef_updates = 2; - sf->mode_skip_start = 9; + sf->adaptive_rd_thresh = 4; + sf->mode_skip_start = 6; } break; - }; /* switch */ // Set rd thresholds based on mode and speed setting set_rd_speed_thresholds(cpi, mode); + set_rd_speed_thresholds_sub8x8(cpi, mode); // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. @@ -910,16 +959,16 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->optimize_coefficients = 0; } - cpi->mb.fwd_txm16x16 = vp9_short_fdct16x16; - cpi->mb.fwd_txm8x8 = vp9_short_fdct8x8; - cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; - cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; - if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { - cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; + // No recode for 1 pass. 
+ if (cpi->pass == 0) { + sf->recode_loop = 0; + sf->optimize_coefficients = 0; } - cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4; + cpi->mb.fwd_txm4x4 = vp9_fdct4x4; + if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { + cpi->mb.fwd_txm4x4 = vp9_fwht4x4; + } if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_iterative; @@ -954,20 +1003,6 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) { "Failed to allocate altref buffer"); } -static int alloc_partition_data(VP9_COMP *cpi) { - vpx_free(cpi->mb.pip); - - cpi->mb.pip = vpx_calloc(cpi->common.mode_info_stride * - (cpi->common.mi_rows + MI_BLOCK_SIZE), - sizeof(PARTITION_INFO)); - if (!cpi->mb.pip) - return 1; - - cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1; - - return 0; -} - void vp9_alloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; @@ -975,10 +1010,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffers"); - if (alloc_partition_data(cpi)) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to allocate partition data"); - if (vp9_alloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1001,11 +1032,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); } - // Data used for real time vc mode to see if gf needs refreshing - cpi->inter_zz_count = 0; - cpi->gf_bad_count = 0; - cpi->gf_update_recommended = 0; - vpx_free(cpi->mb_activity_map); CHECK_MEM_ERROR(cm, cpi->mb_activity_map, vpx_calloc(sizeof(unsigned int), @@ -1015,6 +1041,19 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map, vpx_calloc(sizeof(unsigned int), cm->mb_rows * cm->mb_cols)); + + // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm + // block where mi unit size is 8x8. + vpx_free(cpi->above_context[0]); + CHECK_MEM_ERROR(cm, cpi->above_context[0], + vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) * + MAX_MB_PLANE, + sizeof(*cpi->above_context[0]))); + + vpx_free(cpi->above_seg_context); + CHECK_MEM_ERROR(cm, cpi->above_seg_context, + vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols), + sizeof(*cpi->above_seg_context))); } @@ -1047,13 +1086,18 @@ static void update_frame_size(VP9_COMP *cpi) { vp9_init_dsmotion_compensation(&cpi->mb, y_stride); } } + + { + int i; + for (i = 1; i < MAX_MB_PLANE; ++i) { + cpi->above_context[i] = cpi->above_context[0] + + i * sizeof(*cpi->above_context[0]) * 2 * + mi_cols_aligned_to_sb(cm->mi_cols); + } + } } -// TODO perhaps change number of steps expose to outside world when setting -// max and min limits. Also this will likely want refining for the extended Q -// range. -// // Table that converts 0-63 Q range values passed in outside to the Qindex // range used internally. 
static const int q_trans[] = { @@ -1080,11 +1124,14 @@ void vp9_new_framerate(VP9_COMP *cpi, double framerate) { if (framerate < 0.1) framerate = 30; - cpi->oxcf.framerate = framerate; - cpi->output_framerate = cpi->oxcf.framerate; - cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); - cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); - cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); + cpi->oxcf.framerate = framerate; + cpi->output_framerate = cpi->oxcf.framerate; + cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth + / cpi->output_framerate); + cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth + / cpi->output_framerate); + cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * + cpi->oxcf.two_pass_vbrmin_section / 100); cpi->min_frame_bandwidth = MAX(cpi->min_frame_bandwidth, FRAME_OVERHEAD_BITS); @@ -1133,7 +1180,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { int i; cpi->oxcf = *oxcf; - cpi->goldfreq = 7; cm->version = oxcf->version; @@ -1195,6 +1241,12 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { switch (cpi->oxcf.Mode) { // Real time and one pass deprecated in test code base + case MODE_GOODQUALITY: + cpi->pass = 0; + cpi->compressor_speed = 2; + cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5); + break; + case MODE_FIRSTPASS: cpi->pass = 1; cpi->compressor_speed = 1; @@ -1217,14 +1269,8 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level]; cpi->oxcf.lossless = oxcf->lossless; - if (cpi->oxcf.lossless) { - cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; - cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; - } else { - cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; - cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add; - } - + cpi->mb.e_mbd.itxm_add = cpi->oxcf.lossless ? 
vp9_iwht4x4_add + : vp9_idct4x4_add; cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; @@ -1237,8 +1283,8 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cm->reset_frame_context = 0; setup_features(cm); - cpi->mb.e_mbd.allow_high_precision_mv = 0; // Default mv precision adaptation - set_mvcost(&cpi->mb); + cpi->common.allow_high_precision_mv = 0; // Default mv precision + set_mvcost(cpi); { int i; @@ -1390,6 +1436,94 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { } while (++i <= MV_MAX); } +static void init_pick_mode_context(VP9_COMP *cpi) { + int i; + MACROBLOCK *x = &cpi->mb; + MACROBLOCKD *xd = &x->e_mbd; + VP9_COMMON *cm = &cpi->common; + + for (i = 0; i < BLOCK_SIZES; ++i) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; + const int num_4x4_h = num_4x4_blocks_high_lookup[i]; + const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); + if (i < BLOCK_16X16) { + for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { + for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) { + for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) { + PICK_MODE_CONTEXT *ctx = get_block_context(x, i); + ctx->num_4x4_blk = num_4x4_blk; + CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, + vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + } + } + } + } else if (i < BLOCK_32X32) { + for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { + for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk; + ++xd->mb_index) { + PICK_MODE_CONTEXT *ctx = get_block_context(x, i); + ctx->num_4x4_blk = num_4x4_blk; + CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, + vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + } + } + } else if (i < BLOCK_64X64) { + for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) { + PICK_MODE_CONTEXT *ctx = get_block_context(x, i); + ctx->num_4x4_blk = num_4x4_blk; + CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, + vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + } + } else { + PICK_MODE_CONTEXT *ctx = get_block_context(x, i); + ctx->num_4x4_blk = num_4x4_blk; + CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, + vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + } + } +} + +static void free_pick_mode_context(MACROBLOCK *x) { + int i; + MACROBLOCKD *xd = &x->e_mbd; + + for (i = 0; i < BLOCK_SIZES; ++i) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; + const int num_4x4_h = num_4x4_blocks_high_lookup[i]; + const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); + if (i < BLOCK_16X16) { + for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { + for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) { + for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) { + PICK_MODE_CONTEXT *ctx = get_block_context(x, i); + vpx_free(ctx->zcoeff_blk); + ctx->zcoeff_blk = 0; + } + } + } + } else if (i < BLOCK_32X32) { + for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) { + for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk; + ++xd->mb_index) { + PICK_MODE_CONTEXT *ctx = get_block_context(x, i); + vpx_free(ctx->zcoeff_blk); + ctx->zcoeff_blk = 0; + } + } + } else if (i < BLOCK_64X64) { + for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) { + PICK_MODE_CONTEXT *ctx = get_block_context(x, i); + vpx_free(ctx->zcoeff_blk); + ctx->zcoeff_blk = 0; + } + } else { + PICK_MODE_CONTEXT *ctx = get_block_context(x, i); + vpx_free(ctx->zcoeff_blk); + ctx->zcoeff_blk = 0; + } + } +} + VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { int i, j; volatile union { @@ -1426,6 +1560,8 @@ VP9_PTR 
vp9_create_compressor(VP9_CONFIG *oxcf) { init_config((VP9_PTR)cpi, oxcf); + init_pick_mode_context(cpi); + cm->current_video_frame = 0; cpi->kf_overspend_bits = 0; cpi->kf_bitrate_adjustment = 0; @@ -1478,7 +1614,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { /*Initialize the feed-forward activity masking.*/ cpi->activity_avg = 90 << 12; - cpi->frames_since_key = 8; // Give a sensible default for the first frame. + cpi->frames_since_key = 8; // Sensible default for first frame. cpi->key_frame_frequency = cpi->oxcf.key_freq; cpi->this_key_frame_forced = 0; cpi->next_key_frame_forced = 0; @@ -1599,9 +1735,12 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { vp9_set_speed_features(cpi); // Default rd threshold factors for mode selection - for (i = 0; i < BLOCK_SIZES; ++i) + for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) cpi->rd_thresh_freq_fact[i][j] = 32; + for (j = 0; j < MAX_REFS; ++j) + cpi->rd_thresh_freq_sub8x8[i][j] = 32; + } #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \ SDX3F, SDX8F, SDX4DF)\ @@ -1712,10 +1851,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->common.error.setjmp = 0; - vp9_zero(cpi->y_uv_mode_count) + vp9_zero(cpi->y_uv_mode_count); #ifdef MODE_TEST_HIT_STATS - vp9_zero(cpi->mode_test_hits) + vp9_zero(cpi->mode_test_hits); #endif return (VP9_PTR) cpi; @@ -1757,8 +1896,10 @@ void vp9_remove_compressor(VP9_PTR *ptr) { FILE *f = fopen("opsnr.stt", "a"); double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; - double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; - double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded; + double total_encode_time = (cpi->time_receive_data + + cpi->time_compress_data) / 1000.000; + double dr = (double)cpi->bytes * (double) 8 / (double)1000 + / time_encoded; if (cpi->b_calculate_psnr) { YV12_BUFFER_CONFIG *lst_yv12 = @@ -1778,20 +1919,15 @@ void vp9_remove_compressor(VP9_PTR *ptr) { dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim, total_ssimp, total_encode_time); -// fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n", -// dr, cpi->total / cpi->count, total_psnr, -// cpi->totalp / cpi->count, total_psnr2, total_ssim, -// total_encode_time, cpi->tot_recode_hits); } if (cpi->b_calculate_ssimg) { fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(ms)\n"); fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr, - cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count, - cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time); -// fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f %10ld\n", dr, -// cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count, -// cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits); + cpi->total_ssimg_y / cpi->count, + cpi->total_ssimg_u / cpi->count, + cpi->total_ssimg_v / cpi->count, + cpi->total_ssimg_all / cpi->count, total_encode_time); } fclose(f); @@ -1838,11 +1974,9 @@ void vp9_remove_compressor(VP9_PTR *ptr) { "[INTRA_MODES] =\n{\n"); for (i = 0; i < INTRA_MODES; i++) { - fprintf(fmode, " { // Above Mode : %d\n", i); for (j = 0; j < INTRA_MODES; j++) { - fprintf(fmode, " {"); for (k = 0; k < INTRA_MODES; k++) { @@ -1853,11 +1987,9 @@ void vp9_remove_compressor(VP9_PTR *ptr) { } fprintf(fmode, "}, // left_mode %d\n", j); - } fprintf(fmode, " },\n"); - } fprintf(fmode, 
"};\n"); @@ -1891,14 +2023,15 @@ void vp9_remove_compressor(VP9_PTR *ptr) { (cpi->time_receive_data + cpi->time_compress_data) / 1000); } #endif - } + free_pick_mode_context(&cpi->mb); dealloc_compressor_data(cpi); vpx_free(cpi->mb.ss); vpx_free(cpi->tok); - for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) { + for (i = 0; i < sizeof(cpi->mbgraph_stats) / + sizeof(cpi->mbgraph_stats[0]); ++i) { vpx_free(cpi->mbgraph_stats[i].mb_stats); } @@ -1925,7 +2058,6 @@ void vp9_remove_compressor(VP9_PTR *ptr) { fclose(kf_list); #endif - } @@ -2246,14 +2378,15 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { cpi->frames_since_golden = 0; // ******** Fixed Q test code only ************ - // If we are going to use the ALT reference for the next group of frames set a flag to say so. + // If we are going to use the ALT reference for the next group of frames + // set a flag to say so. if (cpi->oxcf.fixed_q >= 0 && cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) { cpi->source_alt_ref_pending = 1; cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - // TODO(ivan): for SVC encoder, GF automatic update is disabled by using a - // large GF_interval + // TODO(ivan): For SVC encoder, GF automatic update is disabled by using + // a large GF_interval. if (cpi->use_svc) { cpi->frames_till_gf_update_due = INT_MAX; } @@ -2293,12 +2426,12 @@ static int find_fp_qindex() { return i; } -static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) { +static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, + unsigned int *frame_flags) { (void) size; (void) dest; (void) frame_flags; - vp9_set_quantizer(cpi, find_fp_qindex()); vp9_first_pass(cpi); } @@ -2306,13 +2439,11 @@ static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, #define WRITE_RECON_BUFFER 0 #if WRITE_RECON_BUFFER void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { - - // write the frame FILE *yframe; int i; char filename[255]; - sprintf(filename, "cx\\y%04d.raw", this_frame); + snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->y_height; i++) @@ -2320,7 +2451,7 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { frame->y_width, 1, yframe); fclose(yframe); - sprintf(filename, "cx\\u%04d.raw", this_frame); + snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->uv_height; i++) @@ -2328,7 +2459,7 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { frame->uv_width, 1, yframe); fclose(yframe); - sprintf(filename, "cx\\v%04d.raw", this_frame); + snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame); yframe = fopen(filename, "wb"); for (i = 0; i < frame->uv_height; i++) @@ -2350,8 +2481,10 @@ static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) { for (i = 1; i < frame->y_height - 1; i++) { for (j = 1; j < frame->y_width - 1; j++) { /* Sobel hor and ver gradients */ - int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]); - int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]); + int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + + (next[1] - next[-1]); + int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + + (prev[-1] - next[-1]); h = (h < 0 ? -h : h); v = (v < 0 ? 
-v : v); if (h > EDGE_THRESH || v > EDGE_THRESH) @@ -2387,10 +2520,9 @@ static int recode_loop_test(VP9_COMP *cpi, if (((cpi->projected_frame_size > high_limit) && (q < maxq)) || ((cpi->projected_frame_size < low_limit) && (q > minq))) { force_recode = 1; - } - // Special Constrained quality tests - else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { - // Undershoot and below auto cq level + } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { + // Deal with frame undershoot and whether or not we are + // below the automatically set cq level. if (q > cpi->cq_target_quality && cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) { force_recode = 1; @@ -2551,159 +2683,81 @@ static void full_to_model_counts( } } +#if 0 && CONFIG_INTERNAL_STATS +static void output_frame_level_debug_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w"); + int recon_err; -static void encode_frame_to_data_rate(VP9_COMP *cpi, - unsigned long *size, - unsigned char *dest, - unsigned int *frame_flags) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; - TX_SIZE t; - int q; - int frame_over_shoot_limit; - int frame_under_shoot_limit; - - int loop = 0; - int loop_count; - - int q_low; - int q_high; - - int top_index; - int bottom_index; - int active_worst_qchanged = 0; - - int overshoot_seen = 0; - int undershoot_seen = 0; - - SPEED_FEATURES *sf = &cpi->sf; - unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); - struct segmentation *seg = &cm->seg; - - /* Scale the source buffer, if required */ - if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width || - cm->mi_rows * 8 != cpi->un_scaled_source->y_height) { - scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source); - cpi->Source = &cpi->scaled_source; - } else { - cpi->Source = cpi->un_scaled_source; - } - - scale_references(cpi); - - // Clear down mmx registers to allow floating point in what follows - vp9_clear_system_state(); - - - // For an alt ref frame in 2 pass we skip the call to the second - // pass function that sets the target bandwidth so must set it here - if (cpi->refresh_alt_ref_frame) { - // Per frame bit target for the alt ref frame - cpi->per_frame_bandwidth = cpi->twopass.gf_bits; - // per second target bitrate - cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * - cpi->output_framerate); - } - - // Clear zbin over-quant value and mode boost values. - cpi->zbin_mode_boost = 0; - - // Enable or disable mode based tweaking of the zbin - // For 2 Pass Only used where GF/ARF prediction quality - // is above a threshold - cpi->zbin_mode_boost = 0; - - // if (cpi->oxcf.lossless) - cpi->zbin_mode_boost_enabled = 0; - // else - // cpi->zbin_mode_boost_enabled = 1; - - // Current default encoder behaviour for the altref sign bias - cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active; - - // Check to see if a key frame is signaled - // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass. 
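The constrained-quality branch of recode_loop_test() above only forces a recode when quality headroom remains and the frame landed well under its target; the (target * 7) >> 3 bound is 87.5% of the per-frame target. A minimal restatement of that test as a standalone helper (hypothetical, not part of this patch; values as in the hunk):

static int cq_undershoot_forces_recode(int q, int cq_level,
                                       int projected_size, int target_size) {
  /* Recode only if q is still above the cq floor and the frame came in
   * under 7/8 (87.5%) of its per-frame target size. */
  return q > cq_level && projected_size < ((target_size * 7) >> 3);
}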
- if ((cm->current_video_frame == 0) || - (cm->frame_flags & FRAMEFLAGS_KEY) || - (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) { - // Key frame from VFW/auto-keyframe/first frame - cm->frame_type = KEY_FRAME; - } + vp9_clear_system_state(); // __asm emms; - // Set default state for segment based loop filter update flags - cm->lf.mode_ref_delta_update = 0; + recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); + + if (cpi->twopass.total_left_stats.coded_error != 0.0) + fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d" + "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" + "%6d %6d %5d %5d %5d %8.2f %10d %10.3f" + "%10.3f %8d %10d %10d %10d\n", + cpi->common.current_video_frame, cpi->this_frame_target, + cpi->projected_frame_size, 0, + (cpi->projected_frame_size - cpi->this_frame_target), + (int)cpi->total_target_vs_actual, + (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target), + (int)cpi->total_actual_bits, cm->base_qindex, + vp9_convert_qindex_to_q(cm->base_qindex), + (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, + vp9_convert_qindex_to_q(cpi->active_best_quality), + vp9_convert_qindex_to_q(cpi->active_worst_quality), cpi->avg_q, + vp9_convert_qindex_to_q(cpi->ni_av_qi), + vp9_convert_qindex_to_q(cpi->cq_target_quality), + cpi->refresh_last_frame, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, + cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left, + cpi->twopass.total_left_stats.coded_error, + (double)cpi->twopass.bits_left / + (1 + cpi->twopass.total_left_stats.coded_error), + cpi->tot_recode_hits, recon_err, cpi->kf_boost, cpi->kf_zeromotion_pct); - // Initialize cpi->mv_step_param to default based on max resolution - cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); - // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate. 
- if (sf->auto_mv_step_size) { - if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) { - // initialize max_mv_magnitude for use in the first INTER frame - // after a key/intra-only frame - cpi->max_mv_magnitude = max_mv_def; - } else { - if (cm->show_frame) - // allow mv_steps to correspond to twice the max mv magnitude found - // in the previous frame, capped by the default max_mv_magnitude based - // on resolution - cpi->mv_step_param = vp9_init_search_range( - cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); - cpi->max_mv_magnitude = 0; - } - } + fclose(f); - // Set various flags etc to special state if it is a key frame - if (cm->frame_type == KEY_FRAME) { - // Reset the loop filter deltas and segmentation map - setup_features(cm); + if (0) { + FILE *const fmodes = fopen("Modes.stt", "a"); + int i; - // If segmentation is enabled force a map update for key frames - if (seg->enabled) { - seg->update_map = 1; - seg->update_data = 1; - } + fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, + cm->frame_type, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame); - // The alternate reference frame cannot be active for a key frame - cpi->source_alt_ref_active = 0; + for (i = 0; i < MAX_MODES; ++i) + fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); + for (i = 0; i < MAX_REFS; ++i) + fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]); - cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0); - cm->frame_parallel_decoding_mode = - (cpi->oxcf.frame_parallel_decoding_mode != 0); - if (cm->error_resilient_mode) { - cm->frame_parallel_decoding_mode = 1; - cm->reset_frame_context = 0; - cm->refresh_frame_context = 0; - } - } + fprintf(fmodes, "\n"); - // Configure experimental use of segmentation for enhanced coding of - // static regions if indicated. - // Only allowed for now in second pass of two pass (as requires lagged coding) - // and if the relevant speed feature flag is set. - if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) { - configure_static_seg_features(cpi); + fclose(fmodes); } +} +#endif - // Decide how big to make the frame - vp9_pick_frame_size(cpi); - - vp9_clear_system_state(); - +static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, + int * bottom_index, int * top_index) { // Set an active best quality and if necessary active worst quality - q = cpi->active_worst_quality; + int q = cpi->active_worst_quality; + VP9_COMMON *const cm = &cpi->common; - if (cm->frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { #if !CONFIG_MULTIPLE_ARF - // Special case for key frames forced because we have reached - // the maximum key frame interval. Here force the Q to a range - // based on the ambient Q to reduce the risk of popping + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping.
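This patch repeatedly swaps explicit cm->frame_type == KEY_FRAME tests for frame_is_intra_only(). The helper's body is not part of this diff, but the removed condition in the old mv-step code above, (cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only, indicates exactly what it folds up; a sketch under that assumption:

static int frame_is_intra_only(const VP9_COMMON *const cm) {
  /* Key frames and intra-only non-key frames take the same paths here. */
  return cm->frame_type == KEY_FRAME || cm->intra_only;
}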
if (cpi->this_key_frame_forced) { int delta_qindex; int qindex = cpi->last_boosted_qindex; double last_boosted_q = vp9_convert_qindex_to_q(qindex); - delta_qindex = compute_qdelta(cpi, last_boosted_q, - (last_boosted_q * 0.75)); + delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q, + (last_boosted_q * 0.75)); cpi->active_best_quality = MAX(qindex + delta_qindex, cpi->best_quality); @@ -2714,18 +2768,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, double q_val; // Baseline value derived from cpi->active_worst_quality and kf boost - if (cpi->kf_boost > high) { - cpi->active_best_quality = kf_low_motion_minq[q]; - } else if (cpi->kf_boost < low) { - cpi->active_best_quality = kf_high_motion_minq[q]; - } else { - const int gap = high - low; - const int offset = high - cpi->kf_boost; - const int qdiff = kf_high_motion_minq[q] - kf_low_motion_minq[q]; - const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; - - cpi->active_best_quality = kf_low_motion_minq[q] + adjustment; - } + cpi->active_best_quality = get_active_quality(q, cpi->kf_boost, + low, high, + kf_low_motion_minq, + kf_high_motion_minq); // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { @@ -2739,84 +2785,78 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // on active_best_quality. q_val = vp9_convert_qindex_to_q(cpi->active_best_quality); cpi->active_best_quality += - compute_qdelta(cpi, q_val, (q_val * q_adj_factor)); + vp9_compute_qdelta(cpi, q_val, (q_val * q_adj_factor)); } #else double current_q; // Force the KF quantizer to be 30% of the active_worst_quality. current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality); cpi->active_best_quality = cpi->active_worst_quality - + compute_qdelta(cpi, current_q, current_q * 0.3); + + vp9_compute_qdelta(cpi, current_q, current_q * 0.3); #endif - } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { + } else if (!cpi->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { int high = 2000; int low = 400; // Use the lower of cpi->active_worst_quality and recent - // average Q as basis for GF/ARF Q limit unless last frame was + // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. if (cpi->frames_since_key > 1 && cpi->avg_frame_qindex < cpi->active_worst_quality) { q = cpi->avg_frame_qindex; } // For constrained quality dont allow Q less than the cq level - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && - q < cpi->cq_target_quality) { - q = cpi->cq_target_quality; - } - if (cpi->gfu_boost > high) { - cpi->active_best_quality = gf_low_motion_minq[q]; - } else if (cpi->gfu_boost < low) { - cpi->active_best_quality = gf_high_motion_minq[q]; - } else { - const int gap = high - low; - const int offset = high - cpi->gfu_boost; - const int qdiff = gf_high_motion_minq[q] - gf_low_motion_minq[q]; - const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; - - cpi->active_best_quality = gf_low_motion_minq[q] + adjustment; - } - - // Constrained quality use slightly lower active best. 
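get_active_quality() replaces the open-coded boost interpolation deleted in this hunk, so its behavior can be read off the removed arithmetic (a reconstruction from this diff, not the helper's actual source): a boost above the high threshold selects the low-motion minq table, a boost below the low threshold the high-motion table, and anything in between linearly interpolates, rounding to nearest.

static int get_active_quality(int q, int boost, int low, int high,
                              int *low_motion_minq, int *high_motion_minq) {
  if (boost > high) {
    return low_motion_minq[q];
  } else if (boost < low) {
    return high_motion_minq[q];
  } else {
    const int gap = high - low;
    const int offset = high - boost;
    const int qdiff = high_motion_minq[q] - low_motion_minq[q];
    /* Rounded linear interpolation between the two minq tables. */
    const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
    return low_motion_minq[q] + adjustment;
  }
}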
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) + if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { + if (q < cpi->cq_target_quality) + q = cpi->cq_target_quality; + if (cpi->frames_since_key > 1) { + cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost, + low, high, + afq_low_motion_minq, + afq_high_motion_minq); + } else { + cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost, + low, high, + gf_low_motion_minq, + gf_high_motion_minq); + } + // Constrained quality use slightly lower active best. cpi->active_best_quality = cpi->active_best_quality * 15 / 16; - // TODO(debargha): Refine the logic below - if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { if (!cpi->refresh_alt_ref_frame) { cpi->active_best_quality = cpi->cq_target_quality; } else { if (cpi->frames_since_key > 1) { - if (cpi->gfu_boost > high) { - cpi->active_best_quality = cpi->cq_target_quality * 6 / 16; - } else if (cpi->gfu_boost < low) { - cpi->active_best_quality = cpi->cq_target_quality * 11 / 16; - } else { - const int gap = high - low; - const int offset = high - cpi->gfu_boost; - const int qdiff = cpi->cq_target_quality * 5 / 16; - const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; - cpi->active_best_quality = cpi->cq_target_quality * 6 / 16 - + adjustment; - } + cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost, + low, high, + afq_low_motion_minq, + afq_high_motion_minq); + } else { + cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost, + low, high, + gf_low_motion_minq, + gf_high_motion_minq); } } + } else { + cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost, + low, high, + gf_low_motion_minq, + gf_high_motion_minq); } } else { if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { cpi->active_best_quality = cpi->cq_target_quality; } else { -#ifdef ONE_SHOT_Q_ESTIMATE -#ifdef STRICT_ONE_SHOT_Q - cpi->active_best_quality = q; -#else - cpi->active_best_quality = inter_minq[q]; -#endif -#else cpi->active_best_quality = inter_minq[q]; -#endif + // 1-pass: for now, use the average Q for the active_best, if its lower + // than active_worst. + if (cpi->pass == 0 && (cpi->avg_frame_qindex < q)) + cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex]; - // For the constant/constrained quality mode we don't want + // For the constrained quality mode we don't want // q to fall below the cq level. if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && (cpi->active_best_quality < cpi->cq_target_quality)) { @@ -2844,16 +2884,189 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cpi->active_worst_quality < cpi->active_best_quality) cpi->active_worst_quality = cpi->active_best_quality; - // Special case code to try and match quality with forced key frames + // Limit Q range for the adaptive loop. + if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) { + *top_index = + (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4; + // If this is the first (key) frame in 1-pass, active best is the user + // best-allowed, and leave the top_index to active_worst. 
+ if (cpi->pass == 0 && cpi->common.current_video_frame == 0) { + cpi->active_best_quality = cpi->oxcf.best_allowed_q; + *top_index = cpi->oxcf.worst_allowed_q; + } + } else if (!cpi->is_src_frame_alt_ref && + (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + *top_index = + (cpi->active_worst_quality + cpi->active_best_quality) / 2; + } else { + *top_index = cpi->active_worst_quality; + } + *bottom_index = cpi->active_best_quality; + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { q = cpi->active_best_quality; + // Special case code to try and match quality with forced key frames } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { q = cpi->last_boosted_qindex; } else { - // Determine initial Q to try - q = vp9_regulate_q(cpi, cpi->this_frame_target); + // Determine initial Q to try. + if (cpi->pass == 0) { + // 1-pass: for now, use per-frame-bw for target size of frame, scaled + // by |x| for key frame. + int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1; + q = vp9_regulate_q(cpi, scale * cpi->av_per_frame_bandwidth); + } else { + q = vp9_regulate_q(cpi, cpi->this_frame_target); + } + if (q > *top_index) + q = *top_index; } + return q; +} +static void encode_frame_to_data_rate(VP9_COMP *cpi, + unsigned long *size, + unsigned char *dest, + unsigned int *frame_flags) { + VP9_COMMON *const cm = &cpi->common; + TX_SIZE t; + int q; + int frame_over_shoot_limit; + int frame_under_shoot_limit; + + int loop = 0; + int loop_count; + + int q_low; + int q_high; + + int top_index; + int bottom_index; + int active_worst_qchanged = 0; + + int overshoot_seen = 0; + int undershoot_seen = 0; + + SPEED_FEATURES *const sf = &cpi->sf; + unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); + struct segmentation *const seg = &cm->seg; + + /* Scale the source buffer, if required. */ + if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width || + cm->mi_rows * 8 != cpi->un_scaled_source->y_height) { + scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source); + cpi->Source = &cpi->scaled_source; + } else { + cpi->Source = cpi->un_scaled_source; + } + scale_references(cpi); + + // Clear down mmx registers to allow floating point in what follows. + vp9_clear_system_state(); + + // For an alt ref frame in 2 pass we skip the call to the second + // pass function that sets the target bandwidth so we must set it here. + if (cpi->refresh_alt_ref_frame) { + // Set a per frame bit target for the alt ref frame. + cpi->per_frame_bandwidth = cpi->twopass.gf_bits; + // Set a per second target bitrate. + cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_framerate); + } + + // Clear zbin over-quant value and mode boost values. + cpi->zbin_mode_boost = 0; + + // Enable or disable mode based tweaking of the zbin. + // For 2 pass only used where GF/ARF prediction quality + // is above a threshold. + cpi->zbin_mode_boost = 0; + cpi->zbin_mode_boost_enabled = 0; + + // Current default encoder behavior for the altref sign bias. + cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active; + + // Check to see if a key frame is signaled. + // For two pass with auto key frame enabled cm->frame_type may already be + // set, but not for one pass. 
+ if ((cm->current_video_frame == 0) || + (cm->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && (cpi->frames_since_key % + cpi->key_frame_frequency == 0))) { + // Set frame type to key frame for the force key frame, if we exceed the + // maximum distance in an automatic keyframe selection or for the first + // frame. + cm->frame_type = KEY_FRAME; + } + + // Set default state for segment based loop filter update flags. + cm->lf.mode_ref_delta_update = 0; + + // Initialize cpi->mv_step_param to default based on max resolution. + cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); + // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate. + if (sf->auto_mv_step_size) { + if (frame_is_intra_only(&cpi->common)) { + // Initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame. + cpi->max_mv_magnitude = max_mv_def; + } else { + if (cm->show_frame) + // Allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution. + cpi->mv_step_param = vp9_init_search_range( + cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); + cpi->max_mv_magnitude = 0; + } + } + + // Set various flags etc to special state if it is a key frame. + if (frame_is_intra_only(cm)) { + vp9_setup_key_frame(cpi); + // Reset the loop filter deltas and segmentation map. + setup_features(cm); + + // If segmentation is enabled force a map update for key frames. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + } + + // The alternate reference frame cannot be active for a key frame. + cpi->source_alt_ref_active = 0; + + cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0); + cm->frame_parallel_decoding_mode = + (cpi->oxcf.frame_parallel_decoding_mode != 0); + if (cm->error_resilient_mode) { + cm->frame_parallel_decoding_mode = 1; + cm->reset_frame_context = 0; + cm->refresh_frame_context = 0; + } else if (cm->intra_only) { + // Only reset the current context. + cm->reset_frame_context = 2; + } + } + + // Configure experimental use of segmentation for enhanced coding of + // static regions if indicated. + // Only allowed in second pass of two pass (as requires lagged coding) + // and if the relevant speed feature flag is set. + if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) { + configure_static_seg_features(cpi); + } + + // Decide how big to make the frame. + vp9_pick_frame_size(cpi); + + vp9_clear_system_state(); + + q = pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index); + + q_high = top_index; + q_low = bottom_index; + vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit); @@ -2868,7 +3081,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Set quantizer steps at 10% increments. new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level))); - q = cpi->active_worst_quality + compute_qdelta(cpi, current_q, new_q); + q = cpi->active_worst_quality + vp9_compute_qdelta(cpi, current_q, new_q); bottom_index = q; top_index = q; @@ -2876,24 +3089,17 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, q_high = q; printf("frame:%d q:%d\n", cm->current_video_frame, q); - } else { -#endif - // Limit Q range for the adaptive loop. 
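compute_qdelta() becomes vp9_compute_qdelta() and is exported (the vp9_onyx_int.h hunk below declares int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget)). Callers hand it two real Q values and add the result to a quantizer index, so it presumably maps both values back to indices and returns the difference. One plausible shape, assuming a linear scan of the allowed index range with vp9_convert_qindex_to_q():

int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
  int start_index = cpi->worst_quality;
  int target_index = cpi->worst_quality;
  int i;

  /* Convert the starting real Q value to a quantizer index. */
  for (i = cpi->best_quality; i < cpi->worst_quality; ++i) {
    start_index = i;
    if (vp9_convert_qindex_to_q(i) >= qstart)
      break;
  }
  /* Convert the target real Q value to a quantizer index. */
  for (i = cpi->best_quality; i < cpi->worst_quality; ++i) {
    target_index = i;
    if (vp9_convert_qindex_to_q(i) >= qtarget)
      break;
  }
  return target_index - start_index;
}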
- bottom_index = cpi->active_best_quality; - top_index = cpi->active_worst_quality; - q_low = cpi->active_best_quality; - q_high = cpi->active_worst_quality; -#if CONFIG_MULTIPLE_ARF } #endif + loop_count = 0; vp9_zero(cpi->rd_tx_select_threshes); - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { cm->mcomp_filter_type = DEFAULT_INTERP_FILTER; /* TODO: Decide this more intelligently */ - xd->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH; - set_mvcost(&cpi->mb); + cm->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH; + set_mvcost(cpi); } #if CONFIG_VP9_POSTPROC @@ -2935,25 +3141,25 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_set_quantizer(cpi, q); if (loop_count == 0) { - - // Set up entropy depending on frame type. + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. if (cm->frame_type == KEY_FRAME) { - /* Choose which entropy context to use. When using a forward reference - * frame, it immediately follows the keyframe, and thus benefits from - * using the same entropy context established by the keyframe. - * Otherwise, use the default context 0. - */ - cm->frame_context_idx = cpi->oxcf.play_alternate; vp9_setup_key_frame(cpi); } else { - /* Choose which entropy context to use. Currently there are only two - * contexts used, one for normal frames and one for alt ref frames. - */ - cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; + if (!cm->intra_only && !cm->error_resilient_mode) { + cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; + } vp9_setup_inter_frame(cpi); } } + if (cpi->sf.variance_adaptive_quantization) { + vp9_vaq_frame_setup(cpi); + } + // transform / motion compensation build reconstruction frame vp9_encode_frame(cpi); @@ -2977,14 +3183,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, frame_over_shoot_limit = 1; active_worst_qchanged = 0; - // Special case handling for forced key frames if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { loop = 0; } else { + // Special case handling for forced key frames if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { int last_q = q; - int kf_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); + int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); int high_err_target = cpi->ambient_err; int low_err_target = cpi->ambient_err >> 1; @@ -3120,14 +3325,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // fixed interval. 
Note the reconstruction error if it is the frame before // the force key frame if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) { - cpi->ambient_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); + cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); } if (cm->frame_type == KEY_FRAME) cpi->refresh_last_frame = 1; - cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + cm->frame_to_show = get_frame_new_buffer(cm); #if WRITE_RECON_BUFFER if (cm->show_frame) @@ -3168,7 +3372,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_adapt_coef_probs(&cpi->common); } - if (cpi->common.frame_type != KEY_FRAME) { + if (!frame_is_intra_only(&cpi->common)) { FRAME_COUNTS *counts = &cpi->common.counts; vp9_copy(counts->y_mode, cpi->y_mode_count); @@ -3182,7 +3386,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (!cpi->common.error_resilient_mode && !cpi->common.frame_parallel_decoding_mode) { vp9_adapt_mode_probs(&cpi->common); - vp9_adapt_mv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); + vp9_adapt_mv_probs(&cpi->common, cpi->common.allow_high_precision_mv); } } @@ -3198,8 +3402,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->total_byte_count += (*size); cpi->projected_frame_size = (*size) << 3; + // Post encode loop adjustment of Q prediction. if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 2); + vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop) ? 2 : 0); cpi->last_q[cm->frame_type] = cm->base_qindex; @@ -3222,9 +3427,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Keep a record of ambient average Q. if (cm->frame_type != KEY_FRAME) - cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2; + cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + + cm->base_qindex) >> 2; - // Keep a record from which we can calculate the average Q excluding GF updates and key frames + // Keep a record from which we can calculate the average Q excluding GF + // updates and key frames. 
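The ambient average Q kept just above is a rounded exponential moving average, new_avg = (2 + 3 * old_avg + base_qindex) >> 2, i.e. three quarters weight on history and one quarter on the current frame. For example, old_avg = 100 and base_qindex = 60 give (2 + 300 + 60) >> 2 = 90.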
if (cm->frame_type != KEY_FRAME && !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { @@ -3242,7 +3449,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (!cm->show_frame) cpi->bits_off_target -= cpi->projected_frame_size; else - cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size; + cpi->bits_off_target += cpi->av_per_frame_bandwidth - + cpi->projected_frame_size; // Clip the buffer level at the maximum buffer size if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) @@ -3266,125 +3474,30 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->total_actual_bits += cpi->projected_frame_size; // Debug stats - cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size); + cpi->total_target_vs_actual += (cpi->this_frame_target - + cpi->projected_frame_size); cpi->buffer_level = cpi->bits_off_target; - // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames +#ifndef DISABLE_RC_LONG_TERM_MEM + // Update bits left to the kf and gf groups to account for overshoot or + // undershoot on these frames if (cm->frame_type == KEY_FRAME) { - cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size; + cpi->twopass.kf_group_bits += cpi->this_frame_target - + cpi->projected_frame_size; cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0); } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { - cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size; + cpi->twopass.gf_group_bits += cpi->this_frame_target - + cpi->projected_frame_size; cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0); } - - // Update the skip mb flag probabilities based on the distribution seen - // in this frame. - // update_base_skip_probs(cpi); - -#if CONFIG_INTERNAL_STATS - { - FILE *f = fopen("tmp.stt", cm->current_video_frame ? 
"a" : "w"); - int recon_err; - - vp9_clear_system_state(); // __asm emms; - - recon_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); - - if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d" - "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" - "%6d %6d %5d %5d %5d %8.2f %10d %10.3f" - "%10.3f %8d %10d %10d %10d\n", - cpi->common.current_video_frame, cpi->this_frame_target, - cpi->projected_frame_size, 0, //loop_size_estimate, - (cpi->projected_frame_size - cpi->this_frame_target), - (int)cpi->total_target_vs_actual, - (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target), - (int)cpi->total_actual_bits, - cm->base_qindex, - vp9_convert_qindex_to_q(cm->base_qindex), - (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, - vp9_convert_qindex_to_q(cpi->active_best_quality), - vp9_convert_qindex_to_q(cpi->active_worst_quality), - cpi->avg_q, - vp9_convert_qindex_to_q(cpi->ni_av_qi), - vp9_convert_qindex_to_q(cpi->cq_target_quality), - cpi->refresh_last_frame, - cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, - cm->frame_type, cpi->gfu_boost, - cpi->twopass.est_max_qcorrection_factor, - (int)cpi->twopass.bits_left, - cpi->twopass.total_left_stats.coded_error, - (double)cpi->twopass.bits_left / - cpi->twopass.total_left_stats.coded_error, - cpi->tot_recode_hits, recon_err, cpi->kf_boost, - cpi->kf_zeromotion_pct); - else - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d" - "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" - "%5d %5d %5d %8d %8d %8.2f %10d %10.3f" - "%8d %10d %10d %10d\n", - cpi->common.current_video_frame, - cpi->this_frame_target, cpi->projected_frame_size, - 0, //loop_size_estimate, - (cpi->projected_frame_size - cpi->this_frame_target), - (int)cpi->total_target_vs_actual, - (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target), - (int)cpi->total_actual_bits, - cm->base_qindex, - vp9_convert_qindex_to_q(cm->base_qindex), - (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, - vp9_convert_qindex_to_q(cpi->active_best_quality), - vp9_convert_qindex_to_q(cpi->active_worst_quality), - cpi->avg_q, - vp9_convert_qindex_to_q(cpi->ni_av_qi), - vp9_convert_qindex_to_q(cpi->cq_target_quality), - cpi->refresh_last_frame, - cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, - cm->frame_type, cpi->gfu_boost, - cpi->twopass.est_max_qcorrection_factor, - (int)cpi->twopass.bits_left, - cpi->twopass.total_left_stats.coded_error, - cpi->tot_recode_hits, recon_err, cpi->kf_boost, - cpi->kf_zeromotion_pct); - - fclose(f); - - if (0) { - FILE *fmodes = fopen("Modes.stt", "a"); - int i; - - fprintf(fmodes, "%6d:%1d:%1d:%1d ", - cpi->common.current_video_frame, - cm->frame_type, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame); - - for (i = 0; i < MAX_MODES; i++) - fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); - - fprintf(fmodes, "\n"); - - fclose(fmodes); - } - } - #endif #if 0 - // Debug stats for segment feature experiments. - print_seg_map(cpi); + output_frame_level_debug_stats(cpi); #endif - - // If this was a kf or Gf note the Q - if ((cm->frame_type == KEY_FRAME) - || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) - cm->last_kf_gf_q = cm->base_qindex; - if (cpi->refresh_golden_frame == 1) cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN; else @@ -3463,7 +3576,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #endif } - // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas. 
+ // Clear the one shot update flags for segmentation map and mode/ref loop + // filter deltas. cm->seg.update_map = 0; cm->seg.update_data = 0; cm->lf.mode_ref_delta_update = 0; @@ -3487,6 +3601,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->mi = cm->mip + cm->mode_info_stride + 1; cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; + cpi->mb.e_mbd.mi_8x8 = cm->mi_grid_visible; + cpi->mb.e_mbd.mi_8x8[0] = cm->mi; + // Don't increment frame counters if this was an altref buffer // update not a real frame ++cm->current_video_frame; @@ -3495,28 +3612,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // restore prev_mi cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; - - #if 0 - { - char filename[512]; - FILE *recon_file; - sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame); - recon_file = fopen(filename, "wb"); - fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc, - cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size, - 1, recon_file); - fclose(recon_file); - } -#endif -#ifdef OUTPUT_YUV_REC - vp9_write_yuv_rec_frame(cm); -#endif - } static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) { - cpi->enable_encode_breakout = 1; if (!cpi->refresh_alt_ref_frame) @@ -3533,12 +3632,14 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, if (!cpi->refresh_alt_ref_frame) { double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate; double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth - * cpi->oxcf.two_pass_vbrmin_section / 100); + * cpi->oxcf.two_pass_vbrmin_section + / 100); if (two_pass_min_rate < lower_bounds_min_rate) two_pass_min_rate = lower_bounds_min_rate; - cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.framerate); + cpi->twopass.bits_left += (int64_t)(two_pass_min_rate + / cpi->oxcf.framerate); } } @@ -3612,8 +3713,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cpi->source = NULL; - cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV; - set_mvcost(&cpi->mb); + cpi->common.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV; + set_mvcost(cpi); // Should we code an alternate reference frame. if (cpi->oxcf.play_alternate && cpi->source_alt_ref_pending) { @@ -3651,7 +3752,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, } cm->show_frame = 0; - cm->intra_only = 0; cpi->refresh_alt_ref_frame = 1; cpi->refresh_golden_frame = 0; cpi->refresh_last_frame = 0; @@ -3673,6 +3773,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, #endif if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) { cm->show_frame = 1; + cm->intra_only = 0; #if CONFIG_MULTIPLE_ARF // Is this frame the ARF overlay. 
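For reference, the bits_off_target bookkeeping reformatted a few hunks above amounts to a small leaky-bucket update; restated as a hypothetical helper (field names as in the patch), non-shown frames only drain bits, shown frames are credited one average frame's budget, and the balance is clipped at the configured maximum buffer size:

static void update_buffer_level(VP9_COMP *cpi, int show_frame) {
  if (!show_frame)
    cpi->bits_off_target -= cpi->projected_frame_size;
  else
    cpi->bits_off_target += cpi->av_per_frame_bandwidth -
                            cpi->projected_frame_size;

  /* Clip the buffer level at the maximum buffer size. */
  if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
    cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;

  cpi->buffer_level = cpi->bits_off_target;
}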
@@ -3829,7 +3930,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cm->frame_flags = *frame_flags; // Reset the frame pointers to the current frame size - vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], + vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9BORDERINPIXELS); @@ -3840,6 +3941,10 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm); + if (cpi->sf.variance_adaptive_quantization) { + vp9_vaq_init(); + } + if (cpi->pass == 1) { Pass1Encode(cpi, size, dest, frame_flags); } else if (cpi->pass == 2) { @@ -3876,7 +3981,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cpi->bytes += *size; if (cm->show_frame) { - cpi->count++; if (cpi->b_calculate_psnr) { @@ -3986,9 +4090,9 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags) { VP9_COMP *cpi = (VP9_COMP *) comp; - if (!cpi->common.show_frame) + if (!cpi->common.show_frame) { return -1; - else { + } else { int ret; #if CONFIG_VP9_POSTPROC ret = vp9_post_proc_frame(&cpi->common, dest, flags); @@ -4138,37 +4242,9 @@ int vp9_set_size_literal(VP9_PTR comp, unsigned int width, return 0; } -int vp9_switch_layer(VP9_PTR comp, int layer) { - VP9_COMP *cpi = (VP9_COMP *)comp; - - if (cpi->use_svc) { - cpi->current_layer = layer; - - // Use buffer i for layer i LST - cpi->lst_fb_idx = layer; - - // Use buffer i-1 for layer i Alt (Inter-layer prediction) - if (layer != 0) cpi->alt_fb_idx = layer - 1; - - // Use the rest for Golden - if (layer < 2 * cpi->number_spatial_layers - NUM_REF_FRAMES) - cpi->gld_fb_idx = cpi->lst_fb_idx; - else - cpi->gld_fb_idx = 2 * cpi->number_spatial_layers - 1 - layer; - - printf("Switching to layer %d:\n", layer); - printf("Using references: LST/GLD/ALT [%d|%d|%d]\n", cpi->lst_fb_idx, - cpi->gld_fb_idx, cpi->alt_fb_idx); - } else { - printf("Switching layer not supported. 
Enable SVC first \n"); - } - return 0; -} - void vp9_set_svc(VP9_PTR comp, int use_svc) { VP9_COMP *cpi = (VP9_COMP *)comp; cpi->use_svc = use_svc; - if (cpi->use_svc) printf("Enabled SVC encoder \n"); return; } diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h index 3e5796f..9429c7f 100644 --- a/libvpx/vp9/encoder/vp9_onyx_int.h +++ b/libvpx/vp9/encoder/vp9_onyx_int.h @@ -29,12 +29,7 @@ #include "vp9/common/vp9_findnearmv.h" #include "vp9/encoder/vp9_lookahead.h" -// Experimental rate control switches -#if CONFIG_ONESHOTQ -#define ONE_SHOT_Q_ESTIMATE 0 -#define STRICT_ONE_SHOT_Q 0 #define DISABLE_RC_LONG_TERM_MEM 0 -#endif // #define MODE_TEST_HIT_STATS @@ -49,7 +44,8 @@ #define KEY_FRAME_CONTEXT 5 -#define MAX_MODES 36 +#define MAX_MODES 30 +#define MAX_REFS 6 #define MIN_THRESHMULT 32 #define MAX_THRESHMULT 512 @@ -61,16 +57,11 @@ #define INTRA_ZBIN_BOOST 0 typedef struct { - nmv_context nmvc; int nmvjointcost[MV_JOINTS]; int nmvcosts[2][MV_VALS]; int nmvcosts_hp[2][MV_VALS]; vp9_prob segment_pred_probs[PREDICTION_PROBS]; - vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; - vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; - vp9_prob single_ref_prob[REF_CONTEXTS][2]; - vp9_prob comp_ref_prob[REF_CONTEXTS]; unsigned char *last_frame_seg_map_copy; @@ -79,20 +70,8 @@ typedef struct { // 0 = ZERO_MV, MV signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; - vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - - vp9_prob y_mode_prob[4][INTRA_MODES - 1]; - vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; - vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1]; - - vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1] - [SWITCHABLE_FILTERS - 1]; - int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; - - struct tx_probs tx_probs; - vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; + FRAME_CONTEXT fc; } CODING_CONTEXT; typedef struct { @@ -169,19 +148,12 @@ typedef enum { THR_COMP_NEARGA, THR_COMP_NEWGA, - THR_SPLITMV, - THR_SPLITG, - THR_SPLITA, - THR_COMP_SPLITLA, - THR_COMP_SPLITGA, - THR_ZEROMV, THR_ZEROG, THR_ZEROA, THR_COMP_ZEROLA, THR_COMP_ZEROGA, - THR_B_PRED, THR_H_PRED, THR_V_PRED, THR_D135_PRED, @@ -193,6 +165,15 @@ typedef enum { } THR_MODES; typedef enum { + THR_LAST, + THR_GOLD, + THR_ALTR, + THR_COMP_LA, + THR_COMP_GA, + THR_INTRA, +} THR_MODES_SUB8X8; + +typedef enum { DIAMOND = 0, NSTEP = 1, HEX = 2, @@ -244,8 +225,15 @@ typedef enum { #define ALL_INTRA_MODES 0x3FF #define INTRA_DC_ONLY 0x01 #define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED)) +#define INTRA_DC_H_V ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED)) #define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED)) +typedef enum { + LAST_FRAME_PARTITION_OFF = 0, + LAST_FRAME_PARTITION_LOW_MOTION = 1, + LAST_FRAME_PARTITION_ALL = 2 +} LAST_FRAME_PARTITION_METHOD; + typedef struct { int RD; SEARCH_METHODS search_method; @@ -254,21 +242,21 @@ typedef struct { SUBPEL_SEARCH_METHODS subpel_search_method; int subpel_iters_per_step; int thresh_mult[MAX_MODES]; + int thresh_mult_sub8x8[MAX_REFS]; int max_step_search_steps; int reduce_first_step_size; int auto_mv_step_size; int optimize_coefficients; int static_segmentation; + int variance_adaptive_quantization; int comp_inter_joint_search_thresh; int adaptive_rd_thresh; int skip_encode_sb; int skip_encode_frame; - int use_lastframe_partitioning; + LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning; TX_SIZE_SEARCH_METHOD 
tx_size_search_method; int use_lp32x32fdct; int use_avoid_tested_higherror; - int skip_lots_of_modes; - int partition_by_variance; int use_one_partition_size_always; int less_rectangular_check; int use_square_partition_only; @@ -276,13 +264,11 @@ typedef struct { int reference_masking; BLOCK_SIZE always_this_block_size; int auto_min_max_partition_size; - int auto_min_max_partition_interval; - int auto_min_max_partition_count; BLOCK_SIZE min_partition_size; BLOCK_SIZE max_partition_size; int adjust_partitioning_from_last_frame; int last_partitioning_redo_frequency; - int disable_splitmv; + int disable_split_mask; int using_small_partition_info; // TODO(jingning): combine the related motion search speed features int adaptive_motion_search; @@ -296,8 +282,8 @@ typedef struct { // A source variance threshold below which filter search is disabled // Choose a very large value (UINT_MAX) to use 8-tap always unsigned int disable_filter_search_var_thresh; - int intra_y_mode_mask; - int intra_uv_mode_mask; + int intra_y_mode_mask[TX_SIZES]; + int intra_uv_mode_mask[TX_SIZES]; int use_rd_breakout; int use_uv_intra_rd_estimate; int use_fast_lpf_pick; @@ -325,6 +311,7 @@ typedef struct VP9_COMP { MACROBLOCK mb; VP9_COMMON common; VP9_CONFIG oxcf; + struct rdcost_block_args rdcost_stack; struct lookahead_ctx *lookahead; struct lookahead_entry *source; @@ -339,13 +326,13 @@ typedef struct VP9_COMP { YV12_BUFFER_CONFIG scaled_source; unsigned int frames_till_alt_ref_frame; - int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref - int source_alt_ref_active; // an alt ref frame has been encoded and is usable + int source_alt_ref_pending; + int source_alt_ref_active; - int is_src_frame_alt_ref; // source of frame to encode is an exact copy of an alt ref frame + int is_src_frame_alt_ref; - int gold_is_last; // golden frame same as last frame ( short circuit gold searches) - int alt_is_last; // Alt reference frame same as last ( short circuit altref search) + int gold_is_last; // gold same as last frame ( short circuit gold searches) + int alt_is_last; // Alt same as last ( short circuit altref search) int gold_is_alt; // don't do both alt and gold search ( just do gold). int scaled_ref_idx[3]; @@ -382,19 +369,19 @@ typedef struct VP9_COMP { // Ambient reconstruction err target for force key frames int ambient_err; - unsigned int mode_check_freq[MAX_MODES]; - unsigned int mode_test_hit_counts[MAX_MODES]; unsigned int mode_chosen_counts[MAX_MODES]; + unsigned int sub8x8_mode_chosen_counts[MAX_REFS]; int64_t mode_skip_mask; int ref_frame_mask; int set_ref_frame_mask; - int rd_threshes[BLOCK_SIZES][MAX_MODES]; + int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES]; int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; + int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS]; + int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS]; int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES]; - // FIXME(rbultje) int64_t? - int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES]; + int64_t rd_prediction_type_threshes[4][NB_PREDICTION_TYPES]; unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2]; unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2]; unsigned int single_ref_count[REF_CONTEXTS][2][2]; @@ -404,9 +391,9 @@ typedef struct VP9_COMP { // FIXME(rbultje) can this overflow? 
int rd_tx_select_threshes[4][TX_MODES]; - int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1]; - int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1]; - int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS]; + int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS]; int RDMULT; int RDDIV; @@ -424,14 +411,14 @@ typedef struct VP9_COMP { double gf_rate_correction_factor; unsigned int frames_since_golden; - int frames_till_gf_update_due; // Count down till next GF + int frames_till_gf_update_due; // Count down till next GF - int gf_overspend_bits; // Total bits overspent becasue of GF boost (cumulative) + int gf_overspend_bits; // cumulative bits overspent because of GF boost - int non_gf_bitrate_adjustment; // Used in the few frames following a GF to recover the extra bits spent in that GF + int non_gf_bitrate_adjustment; // Following GF to recover extra bits spent - int kf_overspend_bits; // Extra bits spent on key frames that need to be recovered on inter frames - int kf_bitrate_adjustment; // Current number of bit s to try and recover on each inter frame. + int kf_overspend_bits; // Bits spent on key frames to be recovered on inters + int kf_bitrate_adjustment; // number of bits to recover on each inter frame. int max_gf_interval; int baseline_gf_interval; int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames @@ -439,9 +426,9 @@ typedef struct VP9_COMP { int64_t key_frame_count; int prior_key_frame_distance[KEY_FRAME_CONTEXT]; - int per_frame_bandwidth; // Current section per frame bandwidth target - int av_per_frame_bandwidth; // Average frame size target for clip - int min_frame_bandwidth; // Minimum allocation that should be used for any frame + int per_frame_bandwidth; // Current section per frame bandwidth target + int av_per_frame_bandwidth; // Average frame size target for clip + int min_frame_bandwidth; // Minimum allocation used for any frame int inter_frame_target; double output_framerate; int64_t last_time_stamp_seen; @@ -483,7 +470,7 @@ typedef struct VP9_COMP { int y_mode_count[4][INTRA_MODES]; int y_uv_mode_count[INTRA_MODES][INTRA_MODES]; - unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; + unsigned int partition_count[PARTITION_CONTEXTS][PARTITION_TYPES]; nmv_context_counts NMVcount; @@ -514,14 +501,9 @@ typedef struct VP9_COMP { int decimation_count; // for real time encoding - int avg_encode_time; // microsecond - int avg_pick_mode_time; // microsecond int speed; - unsigned int cpu_freq; // Mhz int compressor_speed; - int interquantizer; - int goldfreq; int auto_worst_q; int cpu_used; int pass; @@ -537,11 +519,6 @@ typedef struct VP9_COMP { unsigned int max_mv_magnitude; int mv_step_param; - // Data used for real time conferencing mode to help determine if it would be good to update the gf - int inter_zz_count; - int gf_bad_count; - int gf_update_recommended; - unsigned char *segmentation_map; // segment threashold for encode breakout @@ -648,10 +625,10 @@ typedef struct VP9_COMP { int dummy_packing; /* flag to indicate if packing is dummy */ - unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1] + unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS] [SWITCHABLE_FILTERS]; - unsigned int txfm_stepdown_count[TX_SIZES]; + unsigned int tx_stepdown_count[TX_SIZES]; int initial_width; int initial_height; @@ -682,6 +659,13 @@ typedef struct VP9_COMP { // Debug / test stats int64_t mode_test_hits[BLOCK_SIZES]; #endif + + /* Y,U,V,(A) */ 
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; + + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT left_seg_context[8]; } VP9_COMP; static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { @@ -714,9 +698,14 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x); void vp9_set_speed_features(VP9_COMP *cpi); -extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest); +int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); + +void vp9_alloc_compressor_data(VP9_COMP *cpi); -extern void vp9_alloc_compressor_data(VP9_COMP *cpi); +int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget); + +static int get_token_alloc(int mb_rows, int mb_cols) { + return mb_rows * mb_cols * (48 * 16 + 4); +} #endif // VP9_ENCODER_VP9_ONYX_INT_H_ diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c index 239fd6b..476ecaa 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libvpx/vp9/encoder/vp9_picklpf.c @@ -54,7 +54,8 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, src += srcoffset; dst += dstoffset; - // Loop through the Y plane raw and reconstruction data summing (square differences) + // Loop through the raw Y plane and reconstruction data summing the square + // differences. for (i = 0; i < linestocopy; i += 16) { for (j = 0; j < source->y_width; j += 16) { unsigned int sse; @@ -72,20 +73,6 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, // Enforce a minimum filter level based upon baseline Q static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) { int min_filter_level; - /*int q = (int) vp9_convert_qindex_to_q(base_qindex); - - if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame) - min_filter_level = 0; - else - { - if (q <= 10) - min_filter_level = 0; - else if (q <= 64) - min_filter_level = 1; - else - min_filter_level = (q >> 6); - } - */ min_filter_level = 0; return min_filter_level; @@ -93,11 +80,7 @@ static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) { // Enforce a maximum filter level based upon baseline Q static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) { - // PGW August 2006: Highest filter values almost always a bad idea - - // jbb chg: 20100118 - not so any more with this overquant stuff allow high values - // with lots of intra coming in. - int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4; + int max_filter_level = MAX_LOOP_FILTER; (void)base_qindex; if (cpi->twopass.section_intra_rating > 8) @@ -128,7 +111,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { int filt_best; int filt_direction = 0; - int Bias = 0; // Bias against raising loop filter and in favour of lowering it + int Bias = 0; // Bias against raising loop filter in favor of lowering it. // Make a copy of the unfiltered / processed recon buffer vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); @@ -136,7 +119,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.Sharpness; - // Start the search at the previous frame filter level unless it is now out of range. + // Start the search at the previous frame filter level unless it is now out of + // range. 
filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); // Define the initial step size @@ -153,9 +137,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); while (filter_step > 0) { - Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images + Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; - // jbb chg: 20100118 - in sections with lots of new material coming in don't bias as much to a low filter value if (cpi->twopass.section_intra_rating < 20) Bias = Bias * cpi->twopass.section_intra_rating / 20; @@ -163,8 +146,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { if (cpi->common.tx_mode != ONLY_4X4) Bias >>= 1; - filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step); - filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step); + filt_high = ((filt_mid + filter_step) > max_filter_level) + ? max_filter_level + : (filt_mid + filter_step); + filt_low = ((filt_mid - filter_step) < min_filter_level) + ? min_filter_level + : (filt_mid - filter_step); if ((filt_direction <= 0) && (filt_low != filt_mid)) { // Get Low filter error score @@ -176,7 +163,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { // Re-instate the unfiltered frame vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - // If value is close to the best so far then bias towards a lower loop filter value. + // If value is close to the best so far then bias towards a lower loop + // filter value. if ((filt_err - Bias) < best_err) { // Was it actually better than the previous best? if (filt_err < best_err) @@ -215,4 +203,3 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { lf->filter_level = filt_best; } - diff --git a/libvpx/vp9/encoder/vp9_psnr.c b/libvpx/vp9/encoder/vp9_psnr.c index 9439434..58294e1 100644 --- a/libvpx/vp9/encoder/vp9_psnr.c +++ b/libvpx/vp9/encoder/vp9_psnr.c @@ -8,9 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <math.h> #include "vpx_scale/yv12config.h" -#include "math.h" #define MAX_PSNR 100 diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index 6c8b2a0..fca7525 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -12,6 +12,7 @@ #include "vpx_mem/vpx_mem.h" #include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/common/vp9_quant_common.h" @@ -21,12 +22,14 @@ extern int enc_debug; #endif -void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, - int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, int16_t *dequant_ptr, - int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { +void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, rc, eob; int zbins[2], nzbins[2], zbin; int x, y, z, sz; @@ -85,14 +88,15 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, *eob_ptr = eob + 1; } -void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, +void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - int16_t *zbin_ptr, int16_t *round_ptr, - int16_t *quant_ptr, int16_t *quant_shift_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, - int16_t *dequant_ptr, int zbin_oq_value, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { int i, rc, eob; int zbins[2], nzbins[2]; int x, y, z, sz; @@ -173,25 +177,19 @@ static INLINE struct plane_block_idx plane_block_idx(int y_blocks, return res; } -void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, - int y_blocks) { - MACROBLOCKD *const xd = &mb->e_mbd; +void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx, + const int16_t *scan, const int16_t *iscan) { + MACROBLOCKD *const xd = &x->e_mbd; const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx); - const int16_t *scan = get_scan_4x4(tx_type); - const int16_t *iscan = get_iscan_4x4(tx_type); - - vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block), - 16, mb->skip_block, - mb->plane[pb_idx.plane].zbin, - mb->plane[pb_idx.plane].round, - mb->plane[pb_idx.plane].quant, - mb->plane[pb_idx.plane].quant_shift, - BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block), - BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block), - xd->plane[pb_idx.plane].dequant, - mb->plane[pb_idx.plane].zbin_extra, - &xd->plane[pb_idx.plane].eobs[pb_idx.block], - scan, iscan); + struct macroblock_plane* p = &x->plane[pb_idx.plane]; + struct macroblockd_plane* pd = &xd->plane[pb_idx.plane]; + + vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block), + 16, x->skip_block, + p->zbin, p->round, p->quant, p->quant_shift, + BLOCK_OFFSET(pd->qcoeff, pb_idx.block), + BLOCK_OFFSET(pd->dqcoeff, pb_idx.block), + pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan); } 
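With the new vp9_regular_quantize_b_4x4() signature the scan tables are supplied by the caller rather than derived from a TX_TYPE inside the function. A hypothetical call site, reusing the get_scan_4x4()/get_iscan_4x4() lookups that the old body performed internally:

/* x, b_idx and tx_type as in the old signature; 16 y blocks assumed. */
const int16_t *scan = get_scan_4x4(tx_type);
const int16_t *iscan = get_iscan_4x4(tx_type);
vp9_regular_quantize_b_4x4(x, 16, b_idx, scan, iscan);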
static void invert_quant(int16_t *quant, int16_t *shift, int d) { @@ -271,12 +269,15 @@ void vp9_init_quantizer(VP9_COMP *cpi) { void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { int i; + VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; int zbin_extra; - int segment_id = xd->this_mi->mbmi.segment_id; + int segment_id = xd->mi_8x8[0]->mbmi.segment_id; const int qindex = vp9_get_qindex(&cpi->common.seg, segment_id, cpi->common.base_qindex); + int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); + // Y zbin_extra = (cpi->common.y_dequant[qindex][1] * (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; @@ -315,6 +316,12 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { /* save this macroblock QIndex for vp9_update_zbin_extra() */ x->e_mbd.q_index = qindex; + + /* R/D setup */ + cpi->mb.errorperbit = rdmult >> 6; + cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); + + vp9_initialize_me_consts(cpi, xd->q_index); } void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { @@ -337,10 +344,10 @@ void vp9_frame_init_quantizer(VP9_COMP *cpi) { vp9_mb_init_quantizer(cpi, &cpi->mb); } -void vp9_set_quantizer(struct VP9_COMP *cpi, int Q) { +void vp9_set_quantizer(struct VP9_COMP *cpi, int q) { VP9_COMMON *cm = &cpi->common; - cm->base_qindex = Q; + cm->base_qindex = q; // if any of the delta_q values are changing update flag will // have to be set. diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h index 3229eaa..c078e1d 100644 --- a/libvpx/vp9/encoder/vp9_quantize.h +++ b/libvpx/vp9/encoder/vp9_quantize.h @@ -13,31 +13,19 @@ #include "vp9/encoder/vp9_block.h" -#define prototype_quantize_block(sym) \ - void (sym)(MACROBLOCK *mb, int b_idx) +void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx, + const int16_t *scan, const int16_t *iscan); -#define prototype_quantize_block_pair(sym) \ - void (sym)(MACROBLOCK *mb, int b_idx1, int b_idx2) - -#define prototype_quantize_mb(sym) \ - void (sym)(MACROBLOCK *x) - -void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2, - int y_blocks); -void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, - int y_blocks); -void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, - int y_blocks); struct VP9_COMP; -extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q); +void vp9_set_quantizer(struct VP9_COMP *cpi, int q); -extern void vp9_frame_init_quantizer(struct VP9_COMP *cpi); +void vp9_frame_init_quantizer(struct VP9_COMP *cpi); -extern void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x); +void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x); -extern void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x); +void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x); -extern void vp9_init_quantizer(struct VP9_COMP *cpi); +void vp9_init_quantizer(struct VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_QUANTIZE_H_ diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c index 2d12ba9..0aa3a68 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ratectrl.c @@ -59,9 +59,8 @@ static int kfboost_qadjust(int qindex) { int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, double correction_factor) { - const double q = vp9_convert_qindex_to_q(qindex); - int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000; + int enumerator = frame_type == KEY_FRAME ? 
3300000 : 2250000; // q based adjustment to baseline enumerator enumerator += (int)(enumerator * q) >> 12; @@ -76,35 +75,19 @@ void vp9_save_coding_context(VP9_COMP *cpi) { // restored with a call to vp9_restore_coding_context. These functions are // intended for use in a re-code loop in vp9_compress_frame where the // quantizer value is adjusted between loop iterations. - - cc->nmvc = cm->fc.nmvc; vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost); vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts); vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp); - vp9_copy(cc->inter_mode_probs, cm->fc.inter_mode_probs); - - vp9_copy(cc->y_mode_prob, cm->fc.y_mode_prob); - vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob); - vp9_copy(cc->partition_prob, cm->fc.partition_prob); - vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs); - vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob); - vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob); - vp9_copy(cc->single_ref_prob, cm->fc.single_ref_prob); - vp9_copy(cc->comp_ref_prob, cm->fc.comp_ref_prob); - vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy, cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols)); vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); - vp9_copy(cc->coef_probs, cm->fc.coef_probs); - vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob); - cc->tx_probs = cm->fc.tx_probs; - vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs); + cc->fc = cm->fc; } void vp9_restore_coding_context(VP9_COMP *cpi) { @@ -113,25 +96,12 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { // Restore key state variables to the snapshot state stored in the // previous call to vp9_save_coding_context. - - cm->fc.nmvc = cc->nmvc; vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost); vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts); vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp); - vp9_copy(cm->fc.inter_mode_probs, cc->inter_mode_probs); - - vp9_copy(cm->fc.y_mode_prob, cc->y_mode_prob); - vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob); - vp9_copy(cm->fc.partition_prob, cc->partition_prob); - vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs); - vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob); - vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob); - vp9_copy(cm->fc.single_ref_prob, cc->single_ref_prob); - vp9_copy(cm->fc.comp_ref_prob, cc->comp_ref_prob); - vpx_memcpy(cm->last_frame_seg_map, cpi->coding_context.last_frame_seg_map_copy, (cm->mi_rows * cm->mi_cols)); @@ -139,10 +109,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); - vp9_copy(cm->fc.coef_probs, cc->coef_probs); - vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob); - cm->fc.tx_probs = cc->tx_probs; - vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs); + cm->fc = cc->fc; } void vp9_setup_key_frame(VP9_COMP *cpi) { @@ -224,11 +191,12 @@ static void calc_pframe_target_size(VP9_COMP *cpi) { cpi->this_frame_target = cpi->per_frame_bandwidth; } - // Sanity check that the total sum of adjustments is not above the maximum allowed - // That is that having allowed for KF and GF penalties we have not pushed the - // current interframe target to low. If the adjustment we apply here is not capable of recovering - // all the extra bits we have spent in the KF or GF then the remainder will have to be recovered over - // a longer time span via other buffer / rate control mechanisms. 
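The save/restore hunks above collapse a long list of vp9_copy calls into the single assignment cc->fc = cm->fc, relying on the frame context being a plain struct of arrays so that '=' performs a member-wise deep copy. In miniature (the TOY_* types and field names are invented for illustration):

/* POD struct: assignment copies every member, arrays included. */
typedef struct {
  unsigned char coef_probs[4][16];
  unsigned char mode_probs[8];
} TOY_FRAME_CONTEXT;

typedef struct {
  TOY_FRAME_CONTEXT fc;   /* snapshot taken before a re-code attempt */
} TOY_CODING_CONTEXT;

static void save_context(TOY_CODING_CONTEXT *cc,
                         const TOY_FRAME_CONTEXT *fc) {
  cc->fc = *fc;           /* one assignment instead of per-field copies */
}

static void restore_context(TOY_FRAME_CONTEXT *fc,
                            const TOY_CODING_CONTEXT *cc) {
  *fc = cc->fc;
}

This also future-proofs the snapshot: new probability tables added to the frame context are saved and restored automatically instead of needing a matching vp9_copy pair.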
+ // Check that the total sum of adjustments is not above the maximum allowed. + // That is, having allowed for the KF and GF penalties, we have not pushed + // the current inter-frame target too low. If the adjustment we apply here is + // not capable of recovering all the extra bits we have spent in the KF or GF, + // then the remainder will have to be recovered over a longer time span via + // other buffer / rate control mechanisms. if (cpi->this_frame_target < min_frame_target) cpi->this_frame_target = min_frame_target; @@ -297,12 +265,12 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { rate_correction_factor); // Work out a size correction factor. - // if ( cpi->this_frame_target > 0 ) - // correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target; if (projected_size_based_on_q > 0) - correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q; + correction_factor = + (100 * cpi->projected_frame_size) / projected_size_based_on_q; - // More heavily damped adjustment used if we have been oscillating either side of target + // More heavily damped adjustment used if we have been oscillating either side + // of target. switch (damp_var) { case 0: adjustment_limit = 0.75; @@ -319,27 +287,29 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) ) if (correction_factor > 102) { // We are not already at the worst allowable quality - correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit)); - rate_correction_factor = ((rate_correction_factor * correction_factor) / 100); + correction_factor = + (int)(100 + ((correction_factor - 100) * adjustment_limit)); + rate_correction_factor = + ((rate_correction_factor * correction_factor) / 100); // Keep rate_correction_factor within limits if (rate_correction_factor > MAX_BPB_FACTOR) rate_correction_factor = MAX_BPB_FACTOR; - } - // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) ) - else if (correction_factor < 99) { + } else if (correction_factor < 99) { // We are not already at the best allowable quality - correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit)); - rate_correction_factor = ((rate_correction_factor * correction_factor) / 100); + correction_factor = + (int)(100 - ((100 - correction_factor) * adjustment_limit)); + rate_correction_factor = + ((rate_correction_factor * correction_factor) / 100); // Keep rate_correction_factor within limits if (rate_correction_factor < MIN_BPB_FACTOR) rate_correction_factor = MIN_BPB_FACTOR; } - if (cpi->common.frame_type == KEY_FRAME) + if (cpi->common.frame_type == KEY_FRAME) { cpi->key_frame_rate_correction_factor = rate_correction_factor; - else { + } else { if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) cpi->gf_rate_correction_factor = rate_correction_factor; else @@ -358,20 +328,24 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { double correction_factor; // Select the appropriate correction factor based upon type of frame. 
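The vp9_update_rate_correction_factors() hunks above amount to a damped multiplicative controller: the ratio of actual to predicted frame size nudges rate_correction_factor, with the step scaled down by adjustment_limit when the encoder has been oscillating around target. A condensed sketch, using the MIN/MAX_BPB_FACTOR bounds as defined elsewhere in vp9_ratectrl.c (the function name and parameter list here are local to the sketch):

#define MIN_BPB_FACTOR 0.005
#define MAX_BPB_FACTOR 50

/* projected_size: bits predicted for the chosen Q; actual_size: bits
 * actually produced for the frame. */
static double update_rate_correction(double rate_correction_factor,
                                     int actual_size, int projected_size,
                                     double adjustment_limit) {
  int correction_factor = 100;
  if (projected_size > 0)
    correction_factor = (100 * actual_size) / projected_size;

  if (correction_factor > 102) {
    /* Frame came out larger than predicted: raise the factor, damped. */
    correction_factor =
        (int)(100 + (correction_factor - 100) * adjustment_limit);
    rate_correction_factor = rate_correction_factor * correction_factor / 100;
    if (rate_correction_factor > MAX_BPB_FACTOR)
      rate_correction_factor = MAX_BPB_FACTOR;
  } else if (correction_factor < 99) {
    /* Frame came out smaller than predicted: lower the factor, damped. */
    correction_factor =
        (int)(100 - (100 - correction_factor) * adjustment_limit);
    rate_correction_factor = rate_correction_factor * correction_factor / 100;
    if (rate_correction_factor < MIN_BPB_FACTOR)
      rate_correction_factor = MIN_BPB_FACTOR;
  }
  return rate_correction_factor;
}

The dead band between 99 and 102 leaves the factor untouched when the prediction was already close, which is what keeps the controller from chasing noise.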
- if (cpi->common.frame_type == KEY_FRAME) + if (cpi->common.frame_type == KEY_FRAME) { correction_factor = cpi->key_frame_rate_correction_factor; - else { + } else { if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) correction_factor = cpi->gf_rate_correction_factor; else correction_factor = cpi->rate_correction_factor; } - // Calculate required scaling factor based on target frame size and size of frame produced using previous Q + // Calculate required scaling factor based on target frame size and size of + // frame produced using previous Q. if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) - target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS; // Case where we would overflow int + target_bits_per_mb = + (target_bits_per_frame / cpi->common.MBs) + << BPER_MB_NORMBITS; // Case where we would overflow int else - target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs; + target_bits_per_mb = + (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs; i = cpi->active_best_quality; @@ -437,7 +411,6 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) { } av_key_frame_frequency /= total_weight; - } return av_key_frame_frequency; } diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h index 4733176..ddda713 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/libvpx/vp9/encoder/vp9_ratectrl.h @@ -32,8 +32,8 @@ int vp9_pick_frame_size(VP9_COMP *cpi); double vp9_convert_qindex_to_q(int qindex); int vp9_gfboost_qadjust(int qindex); -extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, - double correction_factor); +int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor); void vp9_setup_inter_frame(VP9_COMP *cpi); #endif // VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index df00334..993919e 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -36,7 +36,7 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_entropy.h" -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" #include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_common.h" @@ -45,58 +45,59 @@ /* Factor to weigh the rate for switchable interp filters */ #define SWITCHABLE_INTERP_RATE_FACTOR 1 -DECLARE_ALIGNED(16, extern const uint8_t, - vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); +#define LAST_FRAME_MODE_MASK 0xFFEDCD60 +#define GOLDEN_FRAME_MODE_MASK 0xFFDA3BB0 +#define ALT_REF_MODE_MASK 0xFFC648D0 -#define LAST_FRAME_MODE_MASK 0xFFDADCD60 -#define GOLDEN_FRAME_MODE_MASK 0xFFB5A3BB0 -#define ALT_REF_MODE_MASK 0xFF8C648D0 +#define MIN_EARLY_TERM_INDEX 3 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { - {RD_NEARESTMV, LAST_FRAME, NONE}, - {RD_NEARESTMV, ALTREF_FRAME, NONE}, - {RD_NEARESTMV, GOLDEN_FRAME, NONE}, - - {RD_DC_PRED, INTRA_FRAME, NONE}, - - {RD_NEWMV, LAST_FRAME, NONE}, - {RD_NEWMV, ALTREF_FRAME, NONE}, - {RD_NEWMV, GOLDEN_FRAME, NONE}, - - {RD_NEARMV, LAST_FRAME, NONE}, - {RD_NEARMV, ALTREF_FRAME, NONE}, - {RD_NEARESTMV, LAST_FRAME, ALTREF_FRAME}, - {RD_NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {RD_TM_PRED, INTRA_FRAME, NONE}, - - {RD_NEARMV, LAST_FRAME, ALTREF_FRAME}, - {RD_NEWMV, LAST_FRAME, ALTREF_FRAME}, - {RD_NEARMV, GOLDEN_FRAME, NONE}, - {RD_NEARMV, GOLDEN_FRAME, ALTREF_FRAME}, - {RD_NEWMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {RD_SPLITMV, LAST_FRAME, NONE}, - {RD_SPLITMV, GOLDEN_FRAME, NONE}, - {RD_SPLITMV, ALTREF_FRAME, 
NONE}, - {RD_SPLITMV, LAST_FRAME, ALTREF_FRAME}, - {RD_SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {RD_ZEROMV, LAST_FRAME, NONE}, - {RD_ZEROMV, GOLDEN_FRAME, NONE}, - {RD_ZEROMV, ALTREF_FRAME, NONE}, - {RD_ZEROMV, LAST_FRAME, ALTREF_FRAME}, - {RD_ZEROMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {RD_I4X4_PRED, INTRA_FRAME, NONE}, - {RD_H_PRED, INTRA_FRAME, NONE}, - {RD_V_PRED, INTRA_FRAME, NONE}, - {RD_D135_PRED, INTRA_FRAME, NONE}, - {RD_D207_PRED, INTRA_FRAME, NONE}, - {RD_D153_PRED, INTRA_FRAME, NONE}, - {RD_D63_PRED, INTRA_FRAME, NONE}, - {RD_D117_PRED, INTRA_FRAME, NONE}, - {RD_D45_PRED, INTRA_FRAME, NONE}, + {NEARESTMV, LAST_FRAME, NONE}, + {NEARESTMV, ALTREF_FRAME, NONE}, + {NEARESTMV, GOLDEN_FRAME, NONE}, + + {DC_PRED, INTRA_FRAME, NONE}, + + {NEWMV, LAST_FRAME, NONE}, + {NEWMV, ALTREF_FRAME, NONE}, + {NEWMV, GOLDEN_FRAME, NONE}, + + {NEARMV, LAST_FRAME, NONE}, + {NEARMV, ALTREF_FRAME, NONE}, + {NEARESTMV, LAST_FRAME, ALTREF_FRAME}, + {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {TM_PRED, INTRA_FRAME, NONE}, + + {NEARMV, LAST_FRAME, ALTREF_FRAME}, + {NEWMV, LAST_FRAME, ALTREF_FRAME}, + {NEARMV, GOLDEN_FRAME, NONE}, + {NEARMV, GOLDEN_FRAME, ALTREF_FRAME}, + {NEWMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {ZEROMV, LAST_FRAME, NONE}, + {ZEROMV, GOLDEN_FRAME, NONE}, + {ZEROMV, ALTREF_FRAME, NONE}, + {ZEROMV, LAST_FRAME, ALTREF_FRAME}, + {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {H_PRED, INTRA_FRAME, NONE}, + {V_PRED, INTRA_FRAME, NONE}, + {D135_PRED, INTRA_FRAME, NONE}, + {D207_PRED, INTRA_FRAME, NONE}, + {D153_PRED, INTRA_FRAME, NONE}, + {D63_PRED, INTRA_FRAME, NONE}, + {D117_PRED, INTRA_FRAME, NONE}, + {D45_PRED, INTRA_FRAME, NONE}, +}; + +const REF_DEFINITION vp9_ref_order[MAX_REFS] = { + {LAST_FRAME, NONE}, + {GOLDEN_FRAME, NONE}, + {ALTREF_FRAME, NONE}, + {LAST_FRAME, ALTREF_FRAME}, + {GOLDEN_FRAME, ALTREF_FRAME}, + {INTRA_FRAME, NONE}, }; // The baseline rd thresholds for breaking out of the rd loop for @@ -106,8 +107,13 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { static int rd_thresh_block_size_factor[BLOCK_SIZES] = {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32}; -#define MAX_RD_THRESH_FACT 64 -#define RD_THRESH_INC 1 +#define RD_THRESH_MAX_FACT 64 +#define RD_THRESH_INC 1 +#define RD_THRESH_POW 1.25 +#define RD_MULT_EPB_RATIO 64 + +#define MV_COST_WEIGHT 108 +#define MV_COST_WEIGHT_SUB 120 static void fill_token_costs(vp9_coeff_cost *c, vp9_coeff_probs_model (*p)[BLOCK_TYPES]) { @@ -155,18 +161,26 @@ void vp9_init_me_luts() { } } -static int compute_rd_mult(int qindex) { +int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex) { const int q = vp9_dc_quant(qindex, 0); - return (11 * q * q) >> 2; + // TODO(debargha): Adjust the function below + int rdmult = 88 * q * q / 25; + if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { + if (cpi->twopass.next_iiratio > 31) + rdmult += (rdmult * rd_iifactor[31]) >> 4; + else + rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4; + } + return rdmult; } -static MB_PREDICTION_MODE rd_mode_to_mode(RD_PREDICTION_MODE rd_mode) { - if (rd_mode == RD_SPLITMV || rd_mode == RD_I4X4_PRED) { - assert(!"Invalid rd_mode"); - return MB_MODE_COUNT; - } - assert((int)rd_mode < (int)MB_MODE_COUNT); - return (MB_PREDICTION_MODE)rd_mode; +static int compute_rd_thresh_factor(int qindex) { + int q; + // TODO(debargha): Adjust the function below + q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12); + if (q < 8) + q = 8; + return q; } void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) { @@ -174,102 +188,90 @@ void 
vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) { cpi->mb.sadperbit4 = sad_per_bit4lut[qindex]; } +static void set_block_thresholds(VP9_COMP *cpi) { + int i, bsize, segment_id; + VP9_COMMON *cm = &cpi->common; -void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { - int q, i, bsize; - - vp9_clear_system_state(); // __asm emms; - - // Further tests required to see if optimum is different - // for key frames, golden frames and arf frames. - // if (cpi->common.refresh_golden_frame || - // cpi->common.refresh_alt_ref_frame) - qindex = clamp(qindex, 0, MAXQ); - - cpi->RDMULT = compute_rd_mult(qindex); - if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { - if (cpi->twopass.next_iiratio > 31) - cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4; - else - cpi->RDMULT += - (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4; - } - cpi->mb.errorperbit = cpi->RDMULT >> 6; - cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); - - vp9_set_speed_features(cpi); - - q = (int)pow(vp9_dc_quant(qindex, 0) >> 2, 1.25); - q <<= 2; - if (q < 8) - q = 8; - - if (cpi->RDMULT > 1000) { - cpi->RDDIV = 1; - cpi->RDMULT /= 100; + for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { + int q; + int segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); + segment_qindex = clamp(segment_qindex + cm->y_dc_delta_q, 0, MAXQ); + q = compute_rd_thresh_factor(segment_qindex); for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { + // Threshold here seem unecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[] + int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); + for (i = 0; i < MAX_MODES; ++i) { - // Threshold here seem unecessarily harsh but fine given actual - // range of values used for cpi->sf.thresh_mult[] - int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - - // *4 relates to the scaling of rd_thresh_block_size_factor[] - if ((int64_t)cpi->sf.thresh_mult[i] < thresh_max) { - cpi->rd_threshes[bsize][i] = - cpi->sf.thresh_mult[i] * q * - rd_thresh_block_size_factor[bsize] / (4 * 100); + if (cpi->sf.thresh_mult[i] < thresh_max) { + cpi->rd_threshes[segment_id][bsize][i] = + cpi->sf.thresh_mult[i] * q * + rd_thresh_block_size_factor[bsize] / 4; } else { - cpi->rd_threshes[bsize][i] = INT_MAX; + cpi->rd_threshes[segment_id][bsize][i] = INT_MAX; } } - } - } else { - cpi->RDDIV = 100; - - for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { - for (i = 0; i < MAX_MODES; i++) { - // Threshold here seem unecessarily harsh but fine given actual - // range of values used for cpi->sf.thresh_mult[] - int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - if (cpi->sf.thresh_mult[i] < thresh_max) { - cpi->rd_threshes[bsize][i] = - cpi->sf.thresh_mult[i] * q * - rd_thresh_block_size_factor[bsize] / 4; + for (i = 0; i < MAX_REFS; ++i) { + if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) { + cpi->rd_thresh_sub8x8[segment_id][bsize][i] = + cpi->sf.thresh_mult_sub8x8[i] * q * + rd_thresh_block_size_factor[bsize] / 4; } else { - cpi->rd_threshes[bsize][i] = INT_MAX; + cpi->rd_thresh_sub8x8[segment_id][bsize][i] = INT_MAX; } } } } +} + +void vp9_initialize_rd_consts(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int qindex, i; + + vp9_clear_system_state(); // __asm emms; + + // Further tests required to see if optimum is different + // for key frames, golden frames and arf frames. 
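The new set_block_thresholds() above scales each mode's baseline threshold by a q-dependent factor and a block-size factor, precomputing the largest safe multiplier so the product cannot overflow a 32-bit int. The core arithmetic, with the lookup tables reduced to scalar parameters (rd_thresh_factor and mode_rd_threshold are names local to this sketch; thresh_mult and block_size_factor stand in for cpi->sf.thresh_mult[mode] and rd_thresh_block_size_factor[bsize]):

#include <limits.h>
#include <math.h>

#define RD_THRESH_POW 1.25

/* q-dependent scale from the DC quantizer, as in
 * compute_rd_thresh_factor() in the hunk above. */
static int rd_thresh_factor(int dc_quant) {
  const int q = (int)(pow(dc_quant / 4.0, RD_THRESH_POW) * 5.12);
  return q < 8 ? 8 : q;
}

/* One entry of the per-segment rd_threshes tables. */
static int mode_rd_threshold(int thresh_mult, int q, int block_size_factor) {
  /* Largest thresh_mult that keeps the product inside an int. */
  const int thresh_max = INT_MAX / (q * block_size_factor);
  return thresh_mult < thresh_max
             ? thresh_mult * q * block_size_factor / 4
             : INT_MAX;   /* saturate; effectively disables the mode */
}

Computing these per segment_id is the point of the rewrite: each segment can carry its own qindex, so the pruning thresholds now track the segment's actual quantizer rather than the frame baseline.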
+ // if (cpi->common.refresh_golden_frame || + // cpi->common.refresh_alt_ref_frame) + qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ); + + cpi->RDDIV = RDDIV_BITS; // in bits (to multiply D by 128) + cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex); + + cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO; + cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); + + vp9_set_speed_features(cpi); - fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs); + set_block_thresholds(cpi); - for (i = 0; i < NUM_PARTITION_CONTEXTS; i++) - vp9_cost_tokens(cpi->mb.partition_cost[i], - cpi->common.fc.partition_prob[cpi->common.frame_type][i], + fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs); + + for (i = 0; i < PARTITION_CONTEXTS; i++) + vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i), vp9_partition_tree); /*rough estimate for costing*/ vp9_init_mode_costs(cpi); - if (cpi->common.frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { vp9_build_nmv_cost_table( cpi->mb.nmvjointcost, - cpi->mb.e_mbd.allow_high_precision_mv ? - cpi->mb.nmvcost_hp : cpi->mb.nmvcost, - &cpi->common.fc.nmvc, - cpi->mb.e_mbd.allow_high_precision_mv, 1, 1); + cm->allow_high_precision_mv ? cpi->mb.nmvcost_hp : cpi->mb.nmvcost, + &cm->fc.nmvc, + cm->allow_high_precision_mv, 1, 1); for (i = 0; i < INTER_MODE_CONTEXTS; i++) { MB_PREDICTION_MODE m; for (m = NEARESTMV; m < MB_MODE_COUNT; m++) - cpi->mb.inter_mode_cost[i][m - NEARESTMV] = + cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] = cost_token(vp9_inter_mode_tree, - cpi->common.fc.inter_mode_probs[i], - vp9_inter_mode_encodings - NEARESTMV + m); + cm->fc.inter_mode_probs[i], + &vp9_inter_mode_encodings[inter_mode_offset(m)]); } } } @@ -369,8 +371,8 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep, double s2 = (double) var / n; double x = qstep / sqrt(s2); model_rd_norm(x, &R, &D); - *rate = ((n << 8) * R + 0.5); - *dist = (var * D + 0.5); + *rate = (int)((n << 8) * R + 0.5); + *dist = (int)(var * D + 0.5); } vp9_clear_system_state(); } @@ -397,7 +399,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, pd->dequant[1] >> 3, &rate, &dist); rate_sum += rate; - dist_sum += dist; + dist_sum += (int)dist; } *out_rate_sum = rate_sum; @@ -479,13 +481,13 @@ static const int16_t band_counts[TX_SIZES][8] = { { 1, 2, 3, 4, 11, 1024 - 21, 0 }, }; -static INLINE int cost_coeffs(MACROBLOCK *mb, +static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, TX_SIZE tx_size, const int16_t *scan, const int16_t *nb) { - MACROBLOCKD *const xd = &mb->e_mbd; - MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; struct macroblockd_plane *pd = &xd->plane[plane]; const PLANE_TYPE type = pd->plane_type; const int16_t *band_count = &band_counts[tx_size][1]; @@ -493,9 +495,9 @@ static INLINE int cost_coeffs(MACROBLOCK *mb, const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); const int ref = mbmi->ref_frame[0] != INTRA_FRAME; unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = - mb->token_costs[tx_size][type][ref]; + x->token_costs[tx_size][type][ref]; const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; - uint8_t token_cache[1024]; + uint8_t *p_tok = x->token_cache; int pt = combine_entropy_contexts(above_ec, left_ec); int c, cost; @@ -514,7 +516,7 @@ static INLINE int cost_coeffs(MACROBLOCK *mb, int v = qcoeff_ptr[0]; int prev_t = vp9_dct_value_tokens_ptr[v].token; cost 
= (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; - token_cache[0] = vp9_pt_energy_class[prev_t]; + p_tok[0] = vp9_pt_energy_class[prev_t]; ++token_costs; // ac tokens @@ -524,9 +526,9 @@ static INLINE int cost_coeffs(MACROBLOCK *mb, v = qcoeff_ptr[rc]; t = vp9_dct_value_tokens_ptr[v].token; - pt = get_coef_context(nb, token_cache, c); + pt = get_coef_context(nb, p_tok, c); cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v]; - token_cache[rc] = vp9_pt_energy_class[t]; + p_tok[rc] = vp9_pt_energy_class[t]; prev_t = t; if (!--band_left) { band_left = *band_count++; @@ -536,7 +538,7 @@ static INLINE int cost_coeffs(MACROBLOCK *mb, // eob token if (band_left) { - pt = get_coef_context(nb, token_cache, c); + pt = get_coef_context(nb, p_tok, c); cost += (*token_costs)[0][pt][DCT_EOB_TOKEN]; } } @@ -547,21 +549,6 @@ static INLINE int cost_coeffs(MACROBLOCK *mb, return cost; } -struct rdcost_block_args { - MACROBLOCK *x; - ENTROPY_CONTEXT t_above[16]; - ENTROPY_CONTEXT t_left[16]; - TX_SIZE tx_size; - int bw; - int bh; - int rate; - int64_t dist; - int64_t sse; - int64_t best_rd; - int skip; - const int16_t *scan, *nb; -}; - static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { const int ss_txfrm_size = tx_size << 1; struct rdcost_block_args* args = arg; @@ -573,16 +560,16 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { int shift = args->tx_size == TX_32X32 ? 0 : 2; int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, - &this_sse) >> shift; - args->sse += this_sse >> shift; + args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, + &this_sse) >> shift; + args->sse = this_sse >> shift; if (x->skip_encode && - xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) { + xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) { // TODO(jingning): tune the model to better capture the distortion. 
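The restructured block_yrd_txfm() below compares two RD costs per transform block: coding the coefficients (rate plus reconstruction distortion) versus zeroing them out (zero rate, full sse as distortion). It takes the cheaper of the two, records the choice in zcoeff_blk, and aborts the whole plane once the running total exceeds the best RD seen so far. Schematically (the macro is a simplified form of VP9's RDCOST, rd_accum and accumulate_block_rd are names local to this sketch, and the eobs-based term of the zcoeff_blk decision is omitted):

#include <stdint.h>

/* Lagrangian cost: rate weighted by rdmult (RM) and scaled down 8 bits,
 * plus distortion shifted up by rddiv (DM). */
#define RDCOST(RM, DM, R, D) \
  ((((int64_t)(R) * (RM) + 128) >> 8) + ((int64_t)(D) << (DM)))

typedef struct {
  int64_t this_rd;   /* running cost over the blocks visited so far */
  int64_t best_rd;   /* best full cost from earlier mode candidates */
  int skip;          /* set once this candidate can no longer win */
} rd_accum;

/* Returns 1 when zeroing the block's coefficients is cheaper than
 * coding them. */
static int accumulate_block_rd(rd_accum *a, int rdmult, int rddiv,
                               int rate, int64_t dist, int64_t sse) {
  const int64_t rd_coded = RDCOST(rdmult, rddiv, rate, dist);
  const int64_t rd_zeroed = RDCOST(rdmult, rddiv, 0, sse);
  const int zero_block = rd_zeroed < rd_coded;

  a->this_rd += zero_block ? rd_zeroed : rd_coded;
  if (a->this_rd > a->best_rd)
    a->skip = 1;   /* early termination for the remaining blocks */
  return zero_block;
}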
int64_t p = (pd->dequant[1] * pd->dequant[1] * - (1 << ss_txfrm_size)) >> shift; - args->dist += p; + (1 << ss_txfrm_size)) >> (shift + 2); + args->dist += (p >> 4); args->sse += p; } } @@ -594,10 +581,9 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, int x_idx, y_idx; txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx); - args->rate += cost_coeffs(args->x, plane, block, - args->t_above + x_idx, - args->t_left + y_idx, args->tx_size, - args->scan, args->nb); + args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx, + args->t_left + y_idx, args->tx_size, + args->scan, args->nb); } static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, @@ -610,85 +596,114 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, if (args->skip) return; - rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); - rd = MIN(rd1, rd2); - if (rd > args->best_rd) { - args->skip = 1; - args->rate = INT_MAX; - args->dist = INT64_MAX; - args->sse = INT64_MAX; - return; - } - if (!is_inter_block(&xd->this_mi->mbmi)) + if (!is_inter_block(&xd->mi_8x8[0]->mbmi)) vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args); else vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args); dist_block(plane, block, tx_size, args); rate_block(plane, block, plane_bsize, tx_size, args); -} + rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); -static void txfm_rd_in_plane(MACROBLOCK *x, - int *rate, int64_t *distortion, - int *skippable, int64_t *sse, - int64_t ref_best_rd, int plane, - BLOCK_SIZE bsize, TX_SIZE tx_size) { - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs]; - int i; - struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size, - num_4x4_blocks_wide, num_4x4_blocks_high, - 0, 0, 0, ref_best_rd, 0 }; + // TODO(jingning): temporarily enabled only for luma component + rd = MIN(rd1, rd2); if (plane == 0) - xd->this_mi->mbmi.tx_size = tx_size; + x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block]; + + args->this_rate += args->rate; + args->this_dist += args->dist; + args->this_sse += args->sse; + args->this_rd += rd; + if (args->this_rd > args->best_rd) { + args->skip = 1; + return; + } +} + +void vp9_get_entropy_contexts(TX_SIZE tx_size, + ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16], + const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, + int num_4x4_w, int num_4x4_h) { + int i; switch (tx_size) { case TX_4X4: - vpx_memcpy(&args.t_above, pd->above_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide); - vpx_memcpy(&args.t_left, pd->left_context, - sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high); - get_scan_nb_4x4(get_tx_type_4x4(pd->plane_type, xd, 0), - &args.scan, &args.nb); + vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); break; case TX_8X8: - for (i = 0; i < num_4x4_blocks_wide; i += 2) - args.t_above[i] = !!*(uint16_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_blocks_high; i += 2) - args.t_left[i] = !!*(uint16_t *)&pd->left_context[i]; - get_scan_nb_8x8(get_tx_type_8x8(pd->plane_type, xd), - &args.scan, &args.nb); + for (i = 
0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; break; case TX_16X16: - for (i = 0; i < num_4x4_blocks_wide; i += 4) - args.t_above[i] = !!*(uint32_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_blocks_high; i += 4) - args.t_left[i] = !!*(uint32_t *)&pd->left_context[i]; - get_scan_nb_16x16(get_tx_type_16x16(pd->plane_type, xd), - &args.scan, &args.nb); + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; break; case TX_32X32: - for (i = 0; i < num_4x4_blocks_wide; i += 8) - args.t_above[i] = !!*(uint64_t *)&pd->above_context[i]; - for (i = 0; i < num_4x4_blocks_high; i += 8) - args.t_left[i] = !!*(uint64_t *)&pd->left_context[i]; - args.scan = vp9_default_scan_32x32; - args.nb = vp9_default_scan_32x32_neighbors; + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; break; default: - assert(0); + assert(!"Invalid transform size."); } +} + +static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size, + const int num_4x4_w, const int num_4x4_h, + const int64_t ref_rdcost, + struct rdcost_block_args *arg) { + vpx_memset(arg, 0, sizeof(struct rdcost_block_args)); + arg->x = x; + arg->tx_size = tx_size; + arg->bw = num_4x4_w; + arg->bh = num_4x4_h; + arg->best_rd = ref_rdcost; +} + +static void txfm_rd_in_plane(MACROBLOCK *x, + struct rdcost_block_args *rd_stack, + int *rate, int64_t *distortion, + int *skippable, int64_t *sse, + int64_t ref_best_rd, int plane, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[bs]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bs]; + + init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h, + ref_best_rd, rd_stack); + if (plane == 0) + xd->mi_8x8[0]->mbmi.tx_size = tx_size; - foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args); - *distortion = args.dist; - *rate = args.rate; - *sse = args.sse; - *skippable = vp9_is_skippable_in_plane(xd, bsize, plane) && (!args.skip); + vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left, + pd->above_context, pd->left_context, + num_4x4_w, num_4x4_h); + + get_scan(xd, tx_size, pd->plane_type, 0, &rd_stack->scan, &rd_stack->nb); + + foreach_transformed_block_in_plane(xd, bsize, plane, + block_yrd_txfm, rd_stack); + if (rd_stack->skip) { + *rate = INT_MAX; + *distortion = INT64_MAX; + *sse = INT64_MAX; + *skippable = 0; + } else { + *distortion = rd_stack->this_dist; + *rate = rd_stack->this_rate; + *sse = rd_stack->this_sse; + *skippable = vp9_is_skippable_in_plane(xd, bsize, plane); + } } static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, @@ -696,28 +711,18 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, int *skip, int64_t *sse, int64_t ref_best_rd, BLOCK_SIZE bs) { - const TX_SIZE max_txfm_size = max_txsize_lookup[bs]; + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; + const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; - if (max_txfm_size == TX_32X32 && - (cm->tx_mode == ALLOW_32X32 
|| - cm->tx_mode == TX_MODE_SELECT)) { - mbmi->tx_size = TX_32X32; - } else if (max_txfm_size >= TX_16X16 && - (cm->tx_mode == ALLOW_16X16 || - cm->tx_mode == ALLOW_32X32 || - cm->tx_mode == TX_MODE_SELECT)) { - mbmi->tx_size = TX_16X16; - } else if (cm->tx_mode != ONLY_4X4) { - mbmi->tx_size = TX_8X8; - } else { - mbmi->tx_size = TX_4X4; - } - txfm_rd_in_plane(x, rate, distortion, skip, + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + + mbmi->tx_size = MIN(max_tx_size, largest_tx_size); + + txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip, &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); - cpi->txfm_stepdown_count[0]++; + cpi->tx_stepdown_count[0]++; } static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, @@ -729,13 +734,13 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); int64_t rd[TX_SIZES][2]; int n, m; int s0, s1; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]); for (n = TX_4X4; n <= max_tx_size; n++) { r[n][1] = r[n][0]; @@ -811,15 +816,15 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[0]++; + cpi->tx_stepdown_count[0]++; } else if (max_tx_size >= TX_16X16 && rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_tx_size - TX_16X16]++; + cpi->tx_stepdown_count[max_tx_size - TX_16X16]++; } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_tx_size - TX_8X8]++; + cpi->tx_stepdown_count[max_tx_size - TX_8X8]++; } else { - cpi->txfm_stepdown_count[max_tx_size - TX_4X4]++; + cpi->tx_stepdown_count[max_tx_size - TX_4X4]++; } } @@ -829,10 +834,10 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, int *s, int *skip, int64_t *sse, int64_t ref_best_rd, BLOCK_SIZE bs) { - const TX_SIZE max_txfm_size = max_txsize_lookup[bs]; + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); int64_t rd[TX_SIZES][2]; int n, m; @@ -840,14 +845,14 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00}; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]); // for (n = TX_4X4; n <= max_txfm_size; n++) // r[n][0] = (r[n][0] * scale_r[n]); - for (n = TX_4X4; n <= max_txfm_size; n++) { + for (n = TX_4X4; n <= max_tx_size; n++) { r[n][1] = r[n][0]; - for (m = 0; m <= n - (n == max_txfm_size); m++) { + for (m = 0; m <= n - (n == max_tx_size); m++) { if (m == n) r[n][1] += vp9_cost_zero(tx_probs[m]); else @@ -859,7 +864,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); - for (n = TX_4X4; n <= 
max_txfm_size; n++) { + for (n = TX_4X4; n <= max_tx_size; n++) { if (s[n]) { rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); } else { @@ -867,19 +872,19 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); } } - for (n = TX_4X4; n <= max_txfm_size; n++) { - rd[n][0] = (scale_rd[n] * rd[n][0]); - rd[n][1] = (scale_rd[n] * rd[n][1]); + for (n = TX_4X4; n <= max_tx_size; n++) { + rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]); + rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]); } - if (max_txfm_size == TX_32X32 && + if (max_tx_size == TX_32X32 && (cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && rd[TX_32X32][1] <= rd[TX_16X16][1] && rd[TX_32X32][1] <= rd[TX_8X8][1] && rd[TX_32X32][1] <= rd[TX_4X4][1]))) { mbmi->tx_size = TX_32X32; - } else if (max_txfm_size >= TX_16X16 && + } else if (max_tx_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && @@ -898,22 +903,22 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, // Actually encode using the chosen mode if a model was used, but do not // update the r, d costs - txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size], - ref_best_rd, 0, bs, mbmi->tx_size); + txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip, + &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); - if (max_txfm_size == TX_32X32 && + if (max_tx_size == TX_32X32 && rd[TX_32X32][1] <= rd[TX_16X16][1] && rd[TX_32X32][1] <= rd[TX_8X8][1] && rd[TX_32X32][1] <= rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[0]++; - } else if (max_txfm_size >= TX_16X16 && + cpi->tx_stepdown_count[0]++; + } else if (max_tx_size >= TX_16X16 && rd[TX_16X16][1] <= rd[TX_8X8][1] && rd[TX_16X16][1] <= rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++; + cpi->tx_stepdown_count[max_tx_size - TX_16X16]++; } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++; + cpi->tx_stepdown_count[max_tx_size - TX_8X8]++; } else { - cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++; + cpi->tx_stepdown_count[max_tx_size - TX_4X4]++; } } @@ -925,15 +930,17 @@ static void super_block_yrd(VP9_COMP *cpi, int r[TX_SIZES][2], s[TX_SIZES]; int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack; + const int b_inter_mode = is_inter_block(mbmi); assert(bs == mbmi->sb_type); - if (mbmi->ref_frame[0] > INTRA_FRAME) + if (b_inter_mode) vp9_subtract_sby(x, bs); if (cpi->sf.tx_size_search_method == USE_LARGESTALL || (cpi->sf.tx_size_search_method != USE_FULL_RD && - mbmi->ref_frame[0] == INTRA_FRAME)) { + !b_inter_mode)) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, bs); @@ -943,7 +950,7 @@ static void super_block_yrd(VP9_COMP *cpi, } if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && - mbmi->ref_frame[0] > INTRA_FRAME) { + b_inter_mode) { if (bs >= BLOCK_32X32) model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); @@ -961,14 +968,16 @@ static void super_block_yrd(VP9_COMP *cpi, skip, sse, ref_best_rd, bs); } else { if (bs >= BLOCK_32X32) - txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], - &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32); + 
txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32], + &s[TX_32X32], &sse[TX_32X32], + ref_best_rd, 0, bs, TX_32X32); if (bs >= BLOCK_16X16) - txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], - &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16); - txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16], + &s[TX_16X16], &sse[TX_16X16], + ref_best_rd, 0, bs, TX_16X16); + txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8); - txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4); choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, bs); @@ -1022,23 +1031,23 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, ENTROPY_CONTEXT ta[2], tempa[2]; ENTROPY_CONTEXT tl[2], templ[2]; - TX_TYPE tx_type = DCT_DCT; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - int idx, idy, block; + int idx, idy; uint8_t best_dst[8 * 8]; assert(ib < 4); vpx_memcpy(ta, a, sizeof(ta)); vpx_memcpy(tl, l, sizeof(tl)); - xd->this_mi->mbmi.tx_size = TX_4X4; + xd->mi_8x8[0]->mbmi.tx_size = TX_4X4; for (mode = DC_PRED; mode <= TM_PRED; ++mode) { int64_t this_rd; int ratey = 0; - if (!(cpi->sf.intra_y_mode_mask & (1 << mode))) + if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode))) continue; // Only do the oblique modes if the best so far is @@ -1058,11 +1067,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { int64_t ssz; const int16_t *scan; + const int16_t *nb; uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride; uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride; - - block = ib + idy * 2 + idx; - xd->this_mi->bmi[block].as_mode = mode; + const int block = ib + idy * 2 + idx; + TX_TYPE tx_type; + xd->mi_8x8[0]->bmi[block].as_mode = mode; src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); coeff = BLOCK_OFFSET(x->plane[0].coeff, block); vp9_predict_intra_block(xd, block, 1, @@ -1075,29 +1085,28 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, dst, dst_stride); tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block); - if (tx_type != DCT_DCT) { + get_scan_nb_4x4(tx_type, &scan, &nb); + + if (tx_type != DCT_DCT) vp9_short_fht4x4(src_diff, coeff, 8, tx_type); - x->quantize_b_4x4(x, block, tx_type, 16); - } else { - x->fwd_txm4x4(src_diff, coeff, 16); - x->quantize_b_4x4(x, block, tx_type, 16); - } + else + x->fwd_txm4x4(src_diff, coeff, 8); + + vp9_regular_quantize_b_4x4(x, 4, block, scan, get_iscan_4x4(tx_type)); - scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block)); ratey += cost_coeffs(x, 0, block, - tempa + idx, templ + idy, TX_4X4, scan, - vp9_get_coef_neighbors_handle(scan)); + tempa + idx, templ + idy, TX_4X4, scan, nb); distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &ssz) >> 2; if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next; if (tx_type != DCT_DCT) - vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), + vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride, tx_type); else - xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), - dst, pd->dst.stride); + 
xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride, + 16); } } @@ -1138,10 +1147,10 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, int64_t best_rd) { int i, j; MACROBLOCKD *const xd = &mb->e_mbd; - MODE_INFO *const mic = xd->this_mi; + MODE_INFO *const mic = xd->mi_8x8[0]; const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride]; - const MODE_INFO *left_mi = xd->mi_8x8[-1]; - const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; + const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL; + const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; @@ -1166,9 +1175,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, i = idy * 2 + idx; if (cpi->common.frame_type == KEY_FRAME) { const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, i); - const MB_PREDICTION_MODE L = (xd->left_available || idx) ? - left_block_mode(mic, left_mi, i) : - DC_PRED; + const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, i); bmode_costs = mb->y_mode_costs[A][L]; } @@ -1212,7 +1219,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, MB_PREDICTION_MODE mode; MB_PREDICTION_MODE mode_selected = DC_PRED; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->this_mi; + MODE_INFO *const mic = xd->mi_8x8[0]; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd; TX_SIZE best_tx = TX_4X4; @@ -1227,15 +1234,14 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, for (mode = DC_PRED; mode <= TM_PRED; mode++) { int64_t local_tx_cache[TX_MODES]; MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride]; - MODE_INFO *left_mi = xd->mi_8x8[-1]; + MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL; - if (!(cpi->sf.intra_y_mode_mask & (1 << mode))) + if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode))) continue; if (cpi->common.frame_type == KEY_FRAME) { const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, 0); - const MB_PREDICTION_MODE L = xd->left_available ? - left_block_mode(mic, left_mi, 0) : DC_PRED; + const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, 0); bmode_costs = x->y_mode_costs[A][L]; } @@ -1277,12 +1283,12 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } -static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, +static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, BLOCK_SIZE bsize, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi); int plane; int pnrate = 0, pnskip = 1; @@ -1300,7 +1306,7 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, *skippable = 1; for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse, + txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, plane, bsize, uv_txfm_size); if (pnrate == INT_MAX) goto term; @@ -1332,14 +1338,15 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, // int mode_mask = (bsize <= BLOCK_8X8) // ? 
ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask; - for (mode = DC_PRED; mode <= TM_PRED; mode++) { + for (mode = DC_PRED; mode <= TM_PRED; mode ++) { // if (!(mode_mask & (1 << mode))) - if (!(cpi->sf.intra_uv_mode_mask & (1 << mode))) + if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]] + & (1 << mode))) continue; x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode; - super_block_uvrd(&cpi->common, x, &this_rate_tokenonly, + super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, &this_sse, bsize, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1370,8 +1377,8 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, int64_t this_sse; x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED; - super_block_uvrd(&cpi->common, x, rate_tokenonly, - distortion, skippable, &this_sse, bsize, INT64_MAX); + super_block_uvrd(cpi, x, rate_tokenonly, distortion, + skippable, &this_sse, bsize, INT64_MAX); *rate = *rate_tokenonly + x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); @@ -1404,12 +1411,12 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, int mode_context) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - const int segment_id = xd->this_mi->mbmi.segment_id; + const int segment_id = xd->mi_8x8[0]->mbmi.segment_id; // Don't account for mode here if segment skip is enabled. if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { assert(is_inter_mode(mode)); - return x->inter_mode_cost[mode_context][mode - NEARESTMV]; + return x->inter_mode_cost[mode_context][inter_mode_offset(mode)]; } else { return 0; } @@ -1426,10 +1433,6 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], int *rate_mv); -static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, - int mi_row, int mi_col, - int_mv *tmp_mv, int *rate_mv); static int labels2mode(MACROBLOCK *x, int i, MB_PREDICTION_MODE this_mode, @@ -1440,12 +1443,13 @@ static int labels2mode(MACROBLOCK *x, int i, int_mv *second_best_ref_mv, int *mvjcost, int *mvcost[2], VP9_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->this_mi; + MODE_INFO *const mic = xd->mi_8x8[0]; MB_MODE_INFO *mbmi = &mic->mbmi; int cost = 0, thismvcost = 0; int idx, idy; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; + const int has_second_rf = has_second_ref(mbmi); /* We have to be careful retrieving previously-encoded motion vectors. 
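The labels2mode() changes below price each candidate sub-8x8 mode: NEWMV pays explicit motion-vector bits, now weighted by the named constant MV_COST_WEIGHT_SUB instead of the magic 102, while NEARESTMV, NEARMV and ZEROMV inherit predicted vectors and pay only the mode signalling. In outline (every name here is local to the sketch; mv_bit_cost is a crude stand-in for vp9_mv_bit_cost, not its real formula):

#include <stdint.h>
#include <stdlib.h>

typedef enum { SUB_NEARESTMV, SUB_NEARMV, SUB_ZEROMV, SUB_NEWMV } sub8x8_mode;
typedef struct { int16_t row, col; } mv_t;

/* Toy bit estimate from the MV residual against its reference. */
static int mv_bit_cost(const mv_t *mv, const mv_t *ref, int weight) {
  const int d = abs(mv->row - ref->row) + abs(mv->col - ref->col);
  return (d * weight) >> 7;
}

/* Price one sub-8x8 label: mode signalling plus, for NEWMV only, the
 * bits of the searched vector. pred[0]/pred[1] are the NEARESTMV and
 * NEARMV candidates taken from neighbouring blocks. */
static int label_cost(sub8x8_mode mode, const mv_t *new_mv,
                      const mv_t *best_ref, const mv_t pred[2],
                      const int mode_cost[4], int mv_weight, mv_t *out_mv) {
  int cost = mode_cost[mode];
  switch (mode) {
    case SUB_NEWMV:
      *out_mv = *new_mv;
      cost += mv_bit_cost(new_mv, best_ref, mv_weight);
      break;
    case SUB_NEARESTMV: *out_mv = pred[0]; break;
    case SUB_NEARMV:    *out_mv = pred[1]; break;
    case SUB_ZEROMV:    out_mv->row = 0; out_mv->col = 0; break;
  }
  return cost;
}

For compound prediction the same costing runs once per reference frame, which is why the hunks below gate the second-vector work on has_second_rf rather than the old mbmi->ref_frame[1] > 0 test.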
Ones from this macroblock have to be pulled from the BLOCKD array @@ -1457,29 +1461,30 @@ static int labels2mode(MACROBLOCK *x, int i, switch (m = this_mode) { case NEWMV: this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int; - thismvcost = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost, - 102); - if (mbmi->ref_frame[1] > 0) { + thismvcost = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv, + mvjcost, mvcost, MV_COST_WEIGHT_SUB); + if (has_second_rf) { this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int; - thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv, - mvjcost, mvcost, 102); + thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv, + &second_best_ref_mv->as_mv, + mvjcost, mvcost, MV_COST_WEIGHT_SUB); } break; case NEARESTMV: this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int; - if (mbmi->ref_frame[1] > 0) + if (has_second_rf) this_second_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int; break; case NEARMV: this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int; - if (mbmi->ref_frame[1] > 0) + if (has_second_rf) this_second_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[1]].as_int; break; case ZEROMV: this_mv->as_int = 0; - if (mbmi->ref_frame[1] > 0) + if (has_second_rf) this_second_mv->as_int = 0; break; default: @@ -1490,10 +1495,11 @@ static int labels2mode(MACROBLOCK *x, int i, mbmi->mode_context[mbmi->ref_frame[0]]); mic->bmi[i].as_mv[0].as_int = this_mv->as_int; - if (mbmi->ref_frame[1] > 0) + if (has_second_rf) mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int; - x->partition_info->bmi[i].mode = m; + mic->bmi[i].as_mode = m; + for (idy = 0; idy < num_4x4_blocks_high; ++idy) for (idx = 0; idx < num_4x4_blocks_wide; ++idx) vpx_memcpy(&mic->bmi[i + idy * 2 + idx], @@ -1514,25 +1520,21 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, int k; MACROBLOCKD *xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[0]; - MODE_INFO *const mi = xd->this_mi; + struct macroblock_plane *const p = &x->plane[0]; + MODE_INFO *const mi = xd->mi_8x8[0]; const BLOCK_SIZE bsize = mi->mbmi.sb_type; const int width = plane_block_width(bsize, pd); const int height = plane_block_height(bsize, pd); int idx, idy; - const int src_stride = x->plane[0].src.stride; - uint8_t* const src = raster_block_offset_uint8(BLOCK_8X8, i, - x->plane[0].src.buf, - src_stride); - int16_t* src_diff = raster_block_offset_int16(BLOCK_8X8, i, - x->plane[0].src_diff); - int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, i); - uint8_t* const dst = raster_block_offset_uint8(BLOCK_8X8, i, + + uint8_t *const src = raster_block_offset_uint8(BLOCK_8X8, i, + p->src.buf, p->src.stride); + uint8_t *const dst = raster_block_offset_uint8(BLOCK_8X8, i, pd->dst.buf, pd->dst.stride); int64_t thisdistortion = 0, thissse = 0; - int thisrate = 0; - int ref, second_ref = has_second_ref(&mi->mbmi); - - for (ref = 0; ref < 1 + second_ref; ++ref) { + int thisrate = 0, ref; + const int is_compound = has_second_ref(&mi->mbmi); + for (ref = 0; ref < 1 + is_compound; ++ref) { const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[ref].buf, pd->pre[ref].stride); vp9_build_inter_predictor(pre, pd->pre[ref].stride, @@ -1542,20 +1544,23 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, width, height, ref, &xd->subpix, MV_PRECISION_Q3); } - vp9_subtract_block(height, width, src_diff, 8, src, src_stride, + vp9_subtract_block(height, width, + raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, + src, p->src.stride, dst, pd->dst.stride); k = i; for (idy = 
0; idy < height / 4; ++idy) { for (idx = 0; idx < width / 4; ++idx) { int64_t ssz, rd, rd1, rd2; + int16_t* coeff; k += (idy * 2 + idx); - src_diff = raster_block_offset_int16(BLOCK_8X8, k, - x->plane[0].src_diff); - coeff = BLOCK_OFFSET(x->plane[0].coeff, k); - x->fwd_txm4x4(src_diff, coeff, 16); - x->quantize_b_4x4(x, k, DCT_DCT, 16); + coeff = BLOCK_OFFSET(p->coeff, k); + x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), + coeff, 8); + vp9_regular_quantize_b_4x4(x, 4, k, get_scan_4x4(DCT_DCT), + get_iscan_4x4(DCT_DCT)); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); thissse += ssz; @@ -1571,6 +1576,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, return INT64_MAX; } } + *distortion = thisdistortion >> 2; *labelyrate = thisrate; *sse = thissse >> 2; @@ -1623,7 +1629,7 @@ static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { assert(((intptr_t)pd->pre[0].buf & 0x7) == 0); pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf, pd->pre[0].stride); - if (mbmi->ref_frame[1]) + if (has_second_ref(mbmi)) pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf, pd->pre[1].stride); } @@ -1633,19 +1639,21 @@ static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi; x->plane[0].src = orig_src; x->e_mbd.plane[0].pre[0] = orig_pre[0]; - if (mbmi->ref_frame[1]) + if (has_second_ref(mbmi)) x->e_mbd.plane[0].pre[1] = orig_pre[1]; } static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, BEST_SEG_INFO *bsi_buf, int filter_idx, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { - int i, j, br = 0, idx, idy; + int i, br = 0, idx, idy; int64_t bd = 0, block_sse = 0; MB_PREDICTION_MODE this_mode; MODE_INFO *mi = x->e_mbd.mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; + struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; const int label_count = 4; int64_t this_segment_rd = 0; int label_mv_thresh; @@ -1658,9 +1666,10 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi = bsi_buf + filter_idx; int mode_idx; int subpelmv = 1, have_ref = 0; + const int has_second_rf = has_second_ref(mbmi); - vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above)); - vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left)); + vpx_memcpy(t_above, pd->above_context, sizeof(t_above)); + vpx_memcpy(t_left, pd->left_context, sizeof(t_left)); v_fn_ptr = &cpi->fn_ptr[bsize]; @@ -1682,17 +1691,17 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, i = idy * 2 + idx; frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0; - frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, + vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile, &frame_mv[NEARESTMV][mbmi->ref_frame[0]], &frame_mv[NEARMV][mbmi->ref_frame[0]], i, 0, mi_row, mi_col); - if (mbmi->ref_frame[1] > 0) - vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, - &frame_mv[NEARESTMV][mbmi->ref_frame[1]], - &frame_mv[NEARMV][mbmi->ref_frame[1]], - i, 1, mi_row, mi_col); - + if (has_second_rf) { + frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0; + vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile, + &frame_mv[NEARESTMV][mbmi->ref_frame[1]], + &frame_mv[NEARMV][mbmi->ref_frame[1]], + i, 1, mi_row, mi_col); + } // search for the best motion vector on this segment for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { const 
struct buf_2d orig_src = x->plane[0].src; @@ -1705,7 +1714,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 && - (mbmi->ref_frame[1] <= 0 || + (!has_second_rf || frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) { int rfc = mbmi->mode_context[mbmi->ref_frame[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); @@ -1720,7 +1729,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, continue; } else { assert(this_mode == ZEROMV); - if (mbmi->ref_frame[1] <= 0) { + if (!has_second_rf) { if ((c3 >= c2 && frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) || (c3 >= c1 && @@ -1738,14 +1747,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, } } - vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre)); + vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre)); vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above, sizeof(bsi->rdstat[i][mode_idx].ta)); vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left, sizeof(bsi->rdstat[i][mode_idx].tl)); // motion search for newmv (single predictor case only) - if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV && + if (!has_second_rf && this_mode == NEWMV && seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) { int step_param = 0; int further_steps; @@ -1795,20 +1804,23 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // adjust src pointer for this block mi_buf_shift(x, i); if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full, + bestsme = vp9_hex_search(x, &mvp_full.as_mv, step_param, sadpb, 1, v_fn_ptr, 1, - bsi->ref_mv, &mode_mv[NEWMV]); + &bsi->ref_mv->as_mv, + &mode_mv[NEWMV].as_mv); } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full, + bestsme = vp9_square_search(x, &mvp_full.as_mv, step_param, sadpb, 1, v_fn_ptr, 1, - bsi->ref_mv, &mode_mv[NEWMV]); + &bsi->ref_mv->as_mv, + &mode_mv[NEWMV].as_mv); } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full, + bestsme = vp9_bigdia_search(x, &mvp_full.as_mv, step_param, sadpb, 1, v_fn_ptr, 1, - bsi->ref_mv, &mode_mv[NEWMV]); + &bsi->ref_mv->as_mv, + &mode_mv[NEWMV].as_mv); } else { bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 0, v_fn_ptr, @@ -1839,8 +1851,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int distortion; unsigned int sse; - cpi->find_fractional_mv_step(x, &mode_mv[NEWMV], - bsi->ref_mv, x->errorperbit, v_fn_ptr, + cpi->find_fractional_mv_step(x, + &mode_mv[NEWMV].as_mv, + &bsi->ref_mv->as_mv, + cpi->common.allow_high_precision_mv, + x->errorperbit, v_fn_ptr, 0, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &distortion, &sse); @@ -1856,12 +1871,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, mi_buf_restore(x, orig_src, orig_pre); } - if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV && - mbmi->interp_filter == EIGHTTAP) { + if (has_second_rf) { if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV || seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) continue; + } + if (has_second_rf && this_mode == NEWMV && + mbmi->interp_filter == EIGHTTAP) { // adjust src pointers mi_buf_shift(x, i); if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { @@ -1891,7 +1908,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (num_4x4_blocks_high > 1) bsi->rdstat[i + 
2][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int; - if (mbmi->ref_frame[1] > 0) { + if (has_second_rf) { bsi->rdstat[i][mode_idx].mvs[1].as_int = second_mode_mv[this_mode].as_int; if (num_4x4_blocks_wide > 1) @@ -1905,7 +1922,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // Trap vectors that reach beyond the UMV borders if (mv_check_bounds(x, &mode_mv[this_mode])) continue; - if (mbmi->ref_frame[1] > 0 && + if (has_second_rf && mv_check_bounds(x, &second_mode_mv[this_mode])) continue; @@ -1915,7 +1932,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, (mode_mv[this_mode].as_mv.col & 0x0f); have_ref = mode_mv[this_mode].as_int == ref_bsi->rdstat[i][mode_idx].mvs[0].as_int; - if (mbmi->ref_frame[1] > 0) { + if (has_second_rf) { subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) || (second_mode_mv[this_mode].as_mv.col & 0x0f); have_ref &= second_mode_mv[this_mode].as_int == @@ -1926,7 +1943,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, ref_bsi = bsi_buf + 1; have_ref = mode_mv[this_mode].as_int == ref_bsi->rdstat[i][mode_idx].mvs[0].as_int; - if (mbmi->ref_frame[1] > 0) { + if (has_second_rf) { have_ref &= second_mode_mv[this_mode].as_int == ref_bsi->rdstat[i][mode_idx].mvs[1].as_int; } @@ -1936,6 +1953,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx], sizeof(SEG_RDSTAT)); + if (num_4x4_blocks_wide > 1) + bsi->rdstat[i + 1][mode_idx].eobs = + ref_bsi->rdstat[i + 1][mode_idx].eobs; + if (num_4x4_blocks_high > 1) + bsi->rdstat[i + 2][mode_idx].eobs = + ref_bsi->rdstat[i + 2][mode_idx].eobs; + if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { mode_selected = this_mode; best_rd = bsi->rdstat[i][mode_idx].brdcost; @@ -1956,7 +1980,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0); bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; - bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i]; + bsi->rdstat[i][mode_idx].eobs = pd->eobs[i]; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1]; + if (num_4x4_blocks_high > 1) + bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2]; } if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { @@ -1997,15 +2025,6 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bsi->segment_rd = INT64_MAX; return; } - - for (j = 1; j < num_4x4_blocks_high; ++j) - vpx_memcpy(&x->partition_info->bmi[i + j * 2], - &x->partition_info->bmi[i], - sizeof(x->partition_info->bmi[i])); - for (j = 1; j < num_4x4_blocks_wide; ++j) - vpx_memcpy(&x->partition_info->bmi[i + j], - &x->partition_info->bmi[i], - sizeof(x->partition_info->bmi[i])); } } /* for each label */ @@ -2017,10 +2036,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // update the coding decisions for (i = 0; i < 4; ++i) - bsi->modes[i] = x->partition_info->bmi[i].mode; + bsi->modes[i] = mi->bmi[i].as_mode; } static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int_mv *best_ref_mv, int_mv *second_best_ref_mv, int64_t best_rd, @@ -2036,7 +2056,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, int i; BEST_SEG_INFO *bsi = bsi_buf + filter_idx; MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->this_mi; + MODE_INFO *mi = 
xd->mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; int mode_idx; @@ -2051,7 +2071,8 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < 4; i++) bsi->modes[i] = ZEROMV; - rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col); + rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs, + mi_row, mi_col); if (bsi->segment_rd > best_rd) return INT64_MAX; @@ -2059,10 +2080,10 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < 4; i++) { mode_idx = inter_mode_offset(bsi->modes[i]); mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int; - if (mbmi->ref_frame[1] > 0) + if (has_second_ref(mbmi)) mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int; xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs; - x->partition_info->bmi[i].mode = bsi->modes[i]; + mi->bmi[i].as_mode = bsi->modes[i]; } /* @@ -2082,7 +2103,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, BLOCK_SIZE block_size ) { MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; int_mv this_mv; int i; int zero_seen = 0; @@ -2195,22 +2216,18 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, - PARTITION_INFO *partition, int_mv *ref_mv, int_mv *second_ref_mv, int64_t comp_pred_diff[NB_PREDICTION_TYPES], int64_t tx_size_diff[TX_MODES], - int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) { + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) { MACROBLOCKD *const xd = &x->e_mbd; // Take a snapshot of the coding context so it can be // restored if we decide to encode this way ctx->skip = x->skip; ctx->best_mode_index = mode_index; - ctx->mic = *xd->this_mi; - - if (partition) - ctx->partition_info = *partition; + ctx->mic = *xd->mi_8x8[0]; ctx->best_ref_mv.as_int = ref_mv->as_int; ctx->second_best_ref_mv.as_int = second_ref_mv->as_int; @@ -2219,11 +2236,9 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY]; ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION]; - // FIXME(rbultje) does this memcpy the whole array? 
I believe sizeof() - // doesn't actually work this way - memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); - memcpy(ctx->best_filter_diff, best_filter_diff, - sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1)); + vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); + vpx_memcpy(ctx->best_filter_diff, best_filter_diff, + sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS); } static void setup_pred_block(const MACROBLOCKD *xd, @@ -2253,6 +2268,7 @@ static void setup_pred_block(const MACROBLOCKD *xd, } static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int idx, MV_REFERENCE_FRAME frame_type, BLOCK_SIZE block_size, int mi_row, int mi_col, @@ -2263,17 +2279,13 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, VP9_COMMON *cm = &cpi->common; YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]]; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; // set up scaling factors scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1]; - scale[frame_type].x_offset_q4 = - ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp, - REF_SCALE_SHIFT) & 0xf; - scale[frame_type].y_offset_q4 = - ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp, - REF_SCALE_SHIFT) & 0xf; + scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type], + mi_row * MI_SIZE, mi_col * MI_SIZE); // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this // use the UV scaling factors. @@ -2281,13 +2293,13 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, &scale[frame_type], &scale[frame_type]); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(&cpi->common, xd, xd->this_mi, + vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0], xd->last_mi, frame_type, mbmi->ref_mvs[frame_type], mi_row, mi_col); // Candidate refinement carried out at encoder and decoder - vp9_find_best_ref_mvs(xd, + vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, mbmi->ref_mvs[frame_type], &frame_nearest_mv[frame_type], &frame_near_mv[frame_type]); @@ -2295,7 +2307,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. 
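The mv_pred() refinement mentioned in the comment above boils down to ranking a handful of candidate vectors by a cheap full-block error (such as a SAD) and seeding the later search at the winner. A minimal standalone sketch of that idea; block_sad() and the flat pixel arrays here are illustrative stand-ins for the library's buf_2d structures and optimized SAD tables, not its actual API:

    #include <limits.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Full-block SAD over a w x h block; a stand-in for the optimized
     * vp9 SAD functions. */
    static unsigned int block_sad(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  int w, int h) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < h; ++r)
        for (c = 0; c < w; ++c)
          sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
      return sad;
    }

    /* Rank full-pel candidates (row/col offsets in whole pixels) and return
     * the index of the cheapest one to use as the search centre point. */
    static int pick_search_centre(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  int w, int h,
                                  const int *cand_row, const int *cand_col,
                                  int num_cand) {
      unsigned int best_sad = UINT_MAX;
      int best = 0, i;
      for (i = 0; i < num_cand; ++i) {
        const unsigned int sad =
            block_sad(src, src_stride,
                      ref + cand_row[i] * ref_stride + cand_col[i],
                      ref_stride, w, h);
        if (sad < best_sad) {
          best_sad = sad;
          best = i;
        }
      }
      return best;
    }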
- if (!vp9_is_scaled(&scale[frame_type])) + if (!vp9_is_scaled(scale[frame_type].sfc) && block_size >= BLOCK_8X8) mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride, frame_type, block_size); } @@ -2311,19 +2323,20 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { static INLINE int get_switchable_rate(const MACROBLOCK *x) { const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; const int ctx = vp9_get_pred_context_switchable_interp(xd); return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[ctx][mbmi->interp_filter]; } static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { MACROBLOCKD *xd = &x->e_mbd; VP9_COMMON *cm = &cpi->common; - MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; int bestsme = INT_MAX; int further_steps, step_param; @@ -2402,23 +2415,23 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full, + bestsme = vp9_hex_search(x, &mvp_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[block_size], 1, - &ref_mv, tmp_mv); + &ref_mv.as_mv, &tmp_mv->as_mv); } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full, + bestsme = vp9_square_search(x, &mvp_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[block_size], 1, - &ref_mv, tmp_mv); + &ref_mv.as_mv, &tmp_mv->as_mv); } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full, + bestsme = vp9_bigdia_search(x, &mvp_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[block_size], 1, - &ref_mv, tmp_mv); + &ref_mv.as_mv, &tmp_mv->as_mv); } else { bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1, @@ -2434,16 +2447,16 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; - cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv, + cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv, + cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[block_size], 0, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis, &sse); } - *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv, - x->nmvjointcost, x->mvcost, - 96); + *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) x->pred_mv[ref].as_int = tmp_mv->as_int; @@ -2463,7 +2476,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int *rate_mv) { int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize); MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1]) }; int_mv ref_mv[2]; @@ -2499,12 +2512,12 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) backup_second_yv12[i] = xd->plane[i].pre[1]; - setup_pre_planes(xd, 0, scaled_ref_frame[1], mi_row, mi_col, NULL); + setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL); } - xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0], + xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0], mi_row, mi_col); - xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1], + xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1], mi_row, mi_col); scaled_first_yv12 = xd->plane[0].pre[0]; @@ -2569,8 +2582,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, unsigned int sse; bestsme = cpi->find_fractional_mv_step_comp( - x, &tmp_mv, - &ref_mv[id], + x, &tmp_mv.as_mv, + &ref_mv[id].as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[block_size], 0, cpi->sf.subpel_iters_per_step, @@ -2602,17 +2616,18 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[1] = backup_second_yv12[i]; } - *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]], - &mbmi->ref_mvs[refs[0]][0], - x->nmvjointcost, x->mvcost, 96); - *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]], - &mbmi->ref_mvs[refs[1]][0], - x->nmvjointcost, x->mvcost, 96); + *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &mbmi->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv, + &mbmi->ref_mvs[refs[1]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); vpx_free(second_pred); } static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, BLOCK_SIZE bsize, int64_t txfm_cache[], int *rate2, int64_t *distortion, @@ -2620,7 +2635,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate_y, int64_t *distortion_y, int *rate_uv, int64_t *distortion_uv, int *mode_excluded, int *disable_skip, - INTERPOLATIONFILTERTYPE *best_filter, + INTERPOLATION_TYPE *best_filter, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], @@ -2628,8 +2643,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, const int64_t ref_best_rd) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; - const int is_comp_pred = (mbmi->ref_frame[1] > 0); + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + const int is_comp_pred = has_second_ref(mbmi); const int num_refs = is_comp_pred ? 
2 : 1; const int this_mode = mbmi->mode; int_mv *frame_mv = mode_mv[this_mode]; @@ -2647,6 +2662,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int orig_dst_stride[MAX_MB_PLANE]; int rs = 0; + if (is_comp_pred) { + if (frame_mv[refs[0]].as_int == INVALID_MV || + frame_mv[refs[1]].as_int == INVALID_MV) + return INT64_MAX; + } + if (this_mode == NEWMV) { int rate_mv; if (is_comp_pred) { @@ -2658,23 +2679,21 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, single_newmv, &rate_mv); } else { - rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]], - &mbmi->ref_mvs[refs[0]][0], - x->nmvjointcost, x->mvcost, 96); - rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]], - &mbmi->ref_mvs[refs[1]][0], - x->nmvjointcost, x->mvcost, 96); + rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, + &mbmi->ref_mvs[refs[0]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv, + &mbmi->ref_mvs[refs[1]][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } - if (frame_mv[refs[0]].as_int == INVALID_MV || - frame_mv[refs[1]].as_int == INVALID_MV) - return INT64_MAX; *rate2 += rate_mv; } else { int_mv tmp_mv; - single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); + single_motion_search(cpi, x, tile, bsize, mi_row, mi_col, + &tmp_mv, &rate_mv); *rate2 += rate_mv; frame_mv[refs[0]].as_int = - xd->this_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int; + xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int; single_newmv[refs[0]].as_int = tmp_mv.as_int; } } @@ -2904,7 +2923,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, unsigned int thresh_ac; // The encode_breakout input unsigned int encode_breakout = x->encode_breakout << 4; - int max_thresh = 36000; + unsigned int max_thresh = 36000; // Use extreme low threshold for static frames to limit skipping. 
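The encode_breakout test that continues below compares the inter predictor's error against thresholds derived from the quantizer. A hedged sketch of the general shape of such a test; the >> 3 scaling and the dual sse/variance comparison are illustrative assumptions, not the exact libvpx arithmetic:

    #include <stdint.h>

    /* Sketch of an encode-breakout test: derive an AC energy threshold from
     * the quantizer step, clamp it by the configured breakout value and a
     * hard maximum, then allow skipping when both the total prediction
     * error (sse) and its AC part (var) fall below the threshold. */
    static int breakout_ok(unsigned int qstep, unsigned int encode_breakout,
                           unsigned int max_thresh, unsigned int sse,
                           unsigned int var) {
      unsigned int thresh_ac = (qstep * qstep) >> 3;  /* grows with Q */
      if (thresh_ac > max_thresh)
        thresh_ac = max_thresh;
      if (encode_breakout && encode_breakout < thresh_ac)
        thresh_ac = encode_breakout;
      return sse < thresh_ac && var < thresh_ac;
    }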
if (cpi->enable_encode_breakout == 2) @@ -3001,7 +3020,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse)); - super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, + super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, bsize, ref_best_rd - rdcosty); if (*rate_uv == INT_MAX) { *rate2 = INT_MAX; @@ -3038,7 +3057,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 }; x->skip_encode = 0; ctx->skip = 0; - xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME; + xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME; if (bsize >= BLOCK_8X8) { if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, &y_skip, bsize, tx_cache, @@ -3070,14 +3089,19 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0); *returndist = dist_y + dist_uv; if (cpi->sf.tx_size_search_method == USE_FULL_RD) - for (i = 0; i < TX_MODES; i++) - ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode]; + for (i = 0; i < TX_MODES; i++) { + if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX) + ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode]; + else + ctx->tx_rd_diff[i] = 0; + } } - ctx->mic = *xd->this_mi; + ctx->mic = *xd->mi_8x8[0]; } int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, int mi_row, int mi_col, int *returnrate, int64_t *returndistortion, @@ -3086,10 +3110,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_rd_so_far) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; const struct segmentation *seg = &cm->seg; const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); - RD_PREDICTION_MODE this_mode; + MB_PREDICTION_MODE this_mode; MV_REFERENCE_FRAME ref_frame, second_ref_frame; unsigned char segment_id = mbmi->segment_id; int comp_pred, i; @@ -3103,13 +3127,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->gld_fb_idx, cpi->alt_fb_idx}; int64_t best_rd = best_rd_so_far; - int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise int64_t best_tx_rd[TX_MODES]; int64_t best_tx_diff[TX_MODES]; int64_t best_pred_diff[NB_PREDICTION_TYPES]; int64_t best_pred_rd[NB_PREDICTION_TYPES]; - int64_t best_filter_rd[SWITCHABLE_FILTERS + 1]; - int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; MB_MODE_INFO best_mbmode = { 0 }; int j; int mode_index, best_mode_index = 0; @@ -3118,9 +3141,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_intra_rd = INT64_MAX; int64_t best_inter_rd = INT64_MAX; MB_PREDICTION_MODE best_intra_mode = DC_PRED; - // MB_PREDICTION_MODE best_inter_mode = ZEROMV; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; - INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; + INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; @@ -3130,22 +3152,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, unsigned int mode_mask = 0; int64_t mode_distortions[MB_MODE_COUNT] = {-1}; int64_t frame_distortions[MAX_REF_FRAMES] = 
{-1}; - int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex, - cpi->common.y_dc_delta_q); - int_mv seg_mvs[4][MAX_REF_FRAMES]; - union b_mode_info best_bmodes[4]; - PARTITION_INFO best_partition; + int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q); const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; - for (i = 0; i < 4; i++) { - int j; - for (j = 0; j < MAX_REF_FRAMES; j++) - seg_mvs[i][j].as_int = INVALID_MV; - } // Everywhere the flag is set the error is much higher than its neighbors. ctx->frames_with_high_error = 0; ctx->modes_with_high_error = 0; @@ -3157,7 +3170,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_pred_rd[i] = INT64_MAX; for (i = 0; i < TX_MODES; i++) best_tx_rd[i] = INT64_MAX; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = INT64_MAX; for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX; @@ -3199,8 +3212,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size, - mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], + setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame, + block_size, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb, scale_factor); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; @@ -3251,7 +3265,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, assert(!"Invalid Reference frame"); } } - if (cpi->mode_skip_mask & (1 << mode_index)) + if (cpi->mode_skip_mask & ((int64_t)1 << mode_index)) continue; } @@ -3261,9 +3275,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; // Test best rd so far against threshold for trying this mode. - if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] * - cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 5)) || - cpi->rd_threshes[bsize][mode_index] == INT_MAX) + if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] * + cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) || + cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX) continue; // Do not allow compound prediction if the segment level reference @@ -3313,25 +3327,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, second_ref_frame != best_inter_ref_frame) continue; } - // TODO(jingning, jkoleszar): scaling reference frame not supported for - // SPLITMV. - if (ref_frame > 0 && - vp9_is_scaled(&scale_factor[ref_frame]) && - this_mode == RD_SPLITMV) - continue; - - if (second_ref_frame > 0 && - vp9_is_scaled(&scale_factor[second_ref_frame]) && - this_mode == RD_SPLITMV) - continue; - - if (bsize >= BLOCK_8X8 && - (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)) - continue; - - if (bsize < BLOCK_8X8 && - !(this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)) - continue; set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); mbmi->uv_mode = DC_PRED; @@ -3339,7 +3334,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Evaluate all sub-pel filters irrespective of whether we can use // them for this frame. 
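Evaluating all the sub-pel filters amounts to costing each one as prediction distortion plus the side-information rate of signalling the filter (the rate that get_switchable_rate() returns), and keeping the cheapest. A compact sketch of that selection; rd_cost() mimics the RDCOST macro's fixed-point form (rate weighted by rdmult with 8 fractional bits, distortion shifted by rddiv), and the three-entry filter arrays are assumptions for illustration:

    #include <stdint.h>

    #define NUM_FILTERS 3  /* eighttap, eighttap_smooth, eighttap_sharp */

    /* Mimics RDCOST(RM, DM, R, D): rate in Q8 fixed point plus shifted
     * distortion. */
    static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
      return (((int64_t)rate * rdmult + 128) >> 8) + (dist << rddiv);
    }

    /* Pick the filter whose prediction gives the lowest combined cost of
     * residual distortion plus the bits needed to signal the filter. */
    static int pick_filter(const int filter_rate[NUM_FILTERS],
                           const int64_t filter_dist[NUM_FILTERS],
                           int rdmult, int rddiv) {
      int best = 0, f;
      int64_t best_rd = INT64_MAX;
      for (f = 0; f < NUM_FILTERS; ++f) {
        const int64_t this_rd =
            rd_cost(rdmult, rddiv, filter_rate[f], filter_dist[f]);
        if (this_rd < best_rd) {
          best_rd = this_rd;
          best = f;
        }
      }
      return best;
    }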
mbmi->interp_filter = cm->mcomp_filter_type; - vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); if (comp_pred) { if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) @@ -3373,7 +3368,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && - (this_mode != RD_ZEROMV && ref_frame != INTRA_FRAME)) { + (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) { continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to @@ -3385,11 +3380,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // an unfiltered alternative. We allow near/nearest as well // because they may result in zero-zero MVs but be cheaper. if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - if ((this_mode != RD_ZEROMV && - !(this_mode == RD_NEARMV && - frame_mv[RD_NEARMV][ALTREF_FRAME].as_int == 0) && - !(this_mode == RD_NEARESTMV && - frame_mv[RD_NEARESTMV][ALTREF_FRAME].as_int == 0)) || + if ((this_mode != ZEROMV && + !(this_mode == NEARMV && + frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) && + !(this_mode == NEARESTMV && + frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) || ref_frame != ALTREF_FRAME) { continue; } @@ -3401,7 +3396,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // a representative block in the boundary ( first ) and then implement a // function that does sads when inside the border.. if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) && - this_mode == RD_NEWMV) { + this_mode == NEWMV) { continue; } @@ -3411,58 +3406,27 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->mode_test_hits[bsize]++; #endif - if (this_mode == RD_I4X4_PRED) { - int rate; - - /* - if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && - (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)) - continue; - */ - // RD_I4X4_PRED is only considered for block sizes less than 8x8. 
- mbmi->tx_size = TX_4X4; - if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, - &distortion_y, best_rd) >= best_rd) - continue; - rate2 += rate; - rate2 += intra_cost_penalty; - distortion2 += distortion_y; - - if (rate_uv_intra[TX_4X4] == INT_MAX) { - choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4], - &rate_uv_tokenonly[TX_4X4], - &dist_uv[TX_4X4], &skip_uv[TX_4X4], - &mode_uv[TX_4X4]); - } - rate2 += rate_uv_intra[TX_4X4]; - rate_uv = rate_uv_tokenonly[TX_4X4]; - distortion2 += dist_uv[TX_4X4]; - distortion_uv = dist_uv[TX_4X4]; - mbmi->uv_mode = mode_uv[TX_4X4]; - tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - for (i = 0; i < TX_MODES; ++i) - tx_cache[i] = tx_cache[ONLY_4X4]; - } else if (ref_frame == INTRA_FRAME) { + if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; // Disable intra modes other than DC_PRED for blocks with low variance // Threshold for intra skipping based on source variance // TODO(debargha): Specialize the threshold for super block sizes - static const int skip_intra_var_thresh[BLOCK_SIZES] = { + static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && - this_mode != RD_DC_PRED && + this_mode != DC_PRED && x->source_variance < skip_intra_var_thresh[mbmi->sb_type]) continue; // Only search the oblique modes if the best so far is // one of the neighboring directional modes if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && - (this_mode >= RD_D45_PRED && this_mode <= RD_TM_PRED)) { + (this_mode >= D45_PRED && this_mode <= TM_PRED)) { if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME) continue; } - mbmi->mode = rd_mode_to_mode(this_mode); + mbmi->mode = this_mode; if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { if (conditional_skipintra(mbmi->mode, best_intra_mode)) continue; @@ -3488,11 +3452,631 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->uv_mode = mode_uv[uv_tx]; rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx]; - if (this_mode != RD_DC_PRED && this_mode != RD_TM_PRED) + if (this_mode != DC_PRED && this_mode != TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; - } else if (this_mode == RD_SPLITMV) { - const int is_comp_pred = second_ref_frame > 0; + } else { + mbmi->mode = this_mode; + compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME); + this_rd = handle_inter_mode(cpi, x, tile, bsize, + tx_cache, + &rate2, &distortion2, &skippable, + &rate_y, &distortion_y, + &rate_uv, &distortion_uv, + &mode_excluded, &disable_skip, + &tmp_best_filter, frame_mv, + mi_row, mi_col, + single_newmv, &total_sse, best_rd); + if (this_rd == INT64_MAX) + continue; + } + + if (cm->comp_pred_mode == HYBRID_PREDICTION) { + rate2 += compmode_cost; + } + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + if (second_ref_frame > INTRA_FRAME) { + rate2 += ref_costs_comp[ref_frame]; + } else { + rate2 += ref_costs_single[ref_frame]; + } + + if (!disable_skip) { + // Test for the condition where skip block will be activated + // because there are no non-zero coefficients and make any + // necessary adjustment for rate. Ignore if skip is coded at + // segment level as the cost won't have been added in. + // Is Mb level skip allowed (i.e. not coded at segment level). 
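The skip test introduced by the comment above weighs two alternatives: pay for the coefficients plus a "not skipped" flag, or drop every coefficient, take the full prediction error (total_sse) as the distortion, and pay only the "skipped" flag. A self-contained sketch of that comparison, with the RDCOST arithmetic inlined and all names illustrative:

    #include <stdint.h>

    /* Choose between coding the residual and skipping it outright.
     * rate_skip0/rate_skip1 are the costs of signalling skip = 0 / 1. */
    static void maybe_skip(int rdmult, int rddiv,
                           int rate_coeffs, int rate_skip0, int rate_skip1,
                           int64_t dist, int64_t sse,
                           int *rate, int64_t *out_dist, int *skip) {
      const int64_t rd_code =
          (((int64_t)(rate_coeffs + rate_skip0) * rdmult + 128) >> 8) +
          (dist << rddiv);
      const int64_t rd_skip =
          (((int64_t)rate_skip1 * rdmult + 128) >> 8) + (sse << rddiv);
      if (rd_code <= rd_skip) {
        *rate = rate_coeffs + rate_skip0;
        *out_dist = dist;
        *skip = 0;
      } else {
        *rate = rate_skip1;   /* all coefficient bits are dropped */
        *out_dist = sse;      /* distortion becomes the raw prediction error */
        *skip = 1;
      }
    }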
+ const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id, + SEG_LVL_SKIP); + + if (skippable) { + // Back out the coefficient coding costs + rate2 -= (rate_y + rate_uv); + // for best yrd calculation + rate_uv = 0; + + if (mb_skip_allowed) { + int prob_skip_cost; + + // Cost the skip mb case + vp9_prob skip_prob = + vp9_get_pred_prob_mbskip(cm, xd); + + if (skip_prob) { + prob_skip_cost = vp9_cost_bit(skip_prob, 1); + rate2 += prob_skip_cost; + } + } + } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) { + if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < + RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { + // Add in the cost of the no skip flag. + int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), + 0); + rate2 += prob_skip_cost; + } else { + // FIXME(rbultje) make this work for splitmv also + int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), + 1); + rate2 += prob_skip_cost; + distortion2 = total_sse; + assert(total_sse >= 0); + rate2 -= (rate_y + rate_uv); + rate_y = 0; + rate_uv = 0; + this_skip2 = 1; + } + } else if (mb_skip_allowed) { + // Add in the cost of the no skip flag. + int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), + 0); + rate2 += prob_skip_cost; + } + + // Calculate the final RD estimate for this mode. + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + } + + // Keep record of best intra rd + if (!is_inter_block(&xd->mi_8x8[0]->mbmi) && + this_rd < best_intra_rd) { + best_intra_rd = this_rd; + best_intra_mode = xd->mi_8x8[0]->mbmi.mode; + } + + // Keep record of best inter rd with single reference + if (is_inter_block(&xd->mi_8x8[0]->mbmi) && + !has_second_ref(&xd->mi_8x8[0]->mbmi) && + !mode_excluded && this_rd < best_inter_rd) { + best_inter_rd = this_rd; + best_inter_ref_frame = ref_frame; + } + + if (!disable_skip && ref_frame == INTRA_FRAME) { + for (i = 0; i < NB_PREDICTION_TYPES; ++i) + best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); + } + + // Store the respective mode distortions for later use. + if (mode_distortions[this_mode] == -1 + || distortion2 < mode_distortions[this_mode]) { + mode_distortions[this_mode] = distortion2; + } + if (frame_distortions[ref_frame] == -1 + || distortion2 < frame_distortions[ref_frame]) { + frame_distortions[ref_frame] = distortion2; + } + + // Did this mode help.. i.e. 
is it the new best mode + if (this_rd < best_rd || x->skip) { + if (!mode_excluded) { + // Note index of best mode so far + best_mode_index = mode_index; + + if (ref_frame == INTRA_FRAME) { + /* required for left and above block mv */ + mbmi->mv[0].as_int = 0; + } + + *returnrate = rate2; + *returndistortion = distortion2; + best_rd = this_rd; + best_mbmode = *mbmi; + best_skip2 = this_skip2; + vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], + sizeof(uint8_t) * ctx->num_4x4_blk); + + // TODO(debargha): enhance this test with a better distortion prediction + // based on qp, activity mask and history + if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) && + (mode_index > MIN_EARLY_TERM_INDEX)) { + const int qstep = xd->plane[0].dequant[1]; + // TODO(debargha): Enhance this by specializing for each mode_index + int scale = 4; + if (x->source_variance < UINT_MAX) { + const int var_adjust = (x->source_variance < 16); + scale -= var_adjust; + } + if (ref_frame > INTRA_FRAME && + distortion2 * scale < qstep * qstep) { + early_term = 1; + } + } + } + } + + /* keep record of best compound/single-only prediction */ + if (!disable_skip && ref_frame != INTRA_FRAME) { + int single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (cm->comp_pred_mode == HYBRID_PREDICTION) { + single_rate = rate2 - compmode_cost; + hybrid_rate = rate2; + } else { + single_rate = rate2; + hybrid_rate = rate2 + compmode_cost; + } + + single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); + hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); + + if (second_ref_frame <= INTRA_FRAME && + single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) { + best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd; + } else if (second_ref_frame > INTRA_FRAME && + single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) { + best_pred_rd[COMP_PREDICTION_ONLY] = single_rd; + } + if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION]) + best_pred_rd[HYBRID_PREDICTION] = hybrid_rd; + } + + /* keep record of best filter type */ + if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME && + cm->mcomp_filter_type != BILINEAR) { + int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? + SWITCHABLE_FILTERS : cm->mcomp_filter_type]; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + int64_t adj_rd; + // In cases of poor prediction, filter_cache[] can contain really big + // values, which actually are bigger than this_rd itself. This can + // cause negative best_filter_rd[] values, which is obviously silly. + // Therefore, if filter_cache < ref, we do an adjusted calculation. + if (cpi->rd_filter_cache[i] >= ref) { + adj_rd = this_rd + cpi->rd_filter_cache[i] - ref; + } else { + // FIXME(rbultje) do this for comppsred also + // + // To prevent out-of-range computation in + // adj_rd = cpi->rd_filter_cache[i] * this_rd / ref + // cpi->rd_filter_cache[i] / ref is converted to a 256 based ratio. 
+ int tmp = cpi->rd_filter_cache[i] * 256 / ref; + adj_rd = (this_rd * tmp) >> 8; + } + best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); + } + } + + /* keep record of best txfm size */ + if (bsize < BLOCK_32X32) { + if (bsize < BLOCK_16X16) + tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8]; + + tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16]; + } + if (!mode_excluded && this_rd != INT64_MAX) { + for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) { + int64_t adj_rd = INT64_MAX; + adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode]; + + if (adj_rd < best_tx_rd[i]) + best_tx_rd[i] = adj_rd; + } + } + + if (early_term) + break; + + if (x->skip && !comp_pred) + break; + } + + if (best_rd >= best_rd_so_far) + return INT64_MAX; + + // If we used an estimate for the uv intra rd in the loop above... + if (cpi->sf.use_uv_intra_rd_estimate) { + // Do Intra UV best rd mode selection if best mode choice above was intra. + if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) { + TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size], + &rate_uv_tokenonly[uv_tx_size], + &dist_uv[uv_tx_size], + &skip_uv[uv_tx_size], + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); + } + } + + // If we are using reference masking and the set mask flag is set then + // create the reference frame mask. + if (cpi->sf.reference_masking && cpi->set_ref_frame_mask) + cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame); + + // Flag all modes that have a distortion that's > 2x the best we found at + // this level. + for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) { + if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV) + continue; + + if (mode_distortions[mode_index] > 2 * *returndistortion) { + ctx->modes_with_high_error |= (1 << mode_index); + } + } + + // Flag all ref frames that have a distortion that's > 2x the best we found + // at this level. + for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (frame_distortions[ref_frame] > 2 * *returndistortion) { + ctx->frames_with_high_error |= (1 << ref_frame); + } + } + + assert((cm->mcomp_filter_type == SWITCHABLE) || + (cm->mcomp_filter_type == best_mbmode.interp_filter) || + (best_mbmode.ref_frame[0] == INTRA_FRAME)); + + // Updating rd_thresh_freq_fact[] here means that the different + // partition/block sizes are handled independently based on the best + // choice for the current partition. It may well be better to keep a scaled + // best rd so far value and update rd_thresh_freq_fact based on the mode/size + // combination that wins out. 
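rd_thresh_freq_fact[] is the adaptive part of the mode-pruning threshold tested near the top of the loop: the winning mode's factor decays so it stays cheap to retry, while every other factor creeps up toward a clamped maximum, making rarely chosen modes progressively easier to prune. A sketch of that update; the RD_THRESH_INC and RD_THRESH_MAX_FACT values here are illustrative placeholders, not the library's tuning:

    #define RD_THRESH_INC 1        /* illustrative values */
    #define RD_THRESH_MAX_FACT 32

    static void update_thresh_freq_fact(int *fact, int num_modes,
                                        int best_index,
                                        int adaptive_strength) {
      int m;
      for (m = 0; m < num_modes; ++m) {
        if (m == best_index) {
          fact[m] -= fact[m] >> 3;             /* decay the winner by 1/8 */
        } else {
          fact[m] += RD_THRESH_INC;            /* penalize the losers */
          if (fact[m] > adaptive_strength * RD_THRESH_MAX_FACT)
            fact[m] = adaptive_strength * RD_THRESH_MAX_FACT;
        }
      }
    }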
+ if (cpi->sf.adaptive_rd_thresh) { + for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { + if (mode_index == best_mode_index) { + cpi->rd_thresh_freq_fact[bsize][mode_index] -= + (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3); + } else { + cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC; + if (cpi->rd_thresh_freq_fact[bsize][mode_index] > + (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) { + cpi->rd_thresh_freq_fact[bsize][mode_index] = + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT; + } + } + } + } + + // macroblock modes + *mbmi = best_mbmode; + x->skip |= best_skip2; + + for (i = 0; i < NB_PREDICTION_TYPES; ++i) { + if (best_pred_rd[i] == INT64_MAX) + best_pred_diff[i] = INT_MIN; + else + best_pred_diff[i] = best_rd - best_pred_rd[i]; + } + + if (!x->skip) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + if (best_filter_rd[i] == INT64_MAX) + best_filter_diff[i] = 0; + else + best_filter_diff[i] = best_rd - best_filter_rd[i]; + } + if (cm->mcomp_filter_type == SWITCHABLE) + assert(best_filter_diff[SWITCHABLE_FILTERS] == 0); + } else { + vp9_zero(best_filter_diff); + } + + if (!x->skip) { + for (i = 0; i < TX_MODES; i++) { + if (best_tx_rd[i] == INT64_MAX) + best_tx_diff[i] = 0; + else + best_tx_diff[i] = best_rd - best_tx_rd[i]; + } + } else { + vp9_zero(best_tx_diff); + } + + set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], + scale_factor); + store_coding_context(x, ctx, best_mode_index, + &mbmi->ref_mvs[mbmi->ref_frame[0]][0], + &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 : + mbmi->ref_frame[1]][0], + best_pred_diff, best_tx_diff, best_filter_diff); + + return best_rd; +} + + +int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, + int mi_row, int mi_col, + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + const struct segmentation *seg = &cm->seg; + const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); + MV_REFERENCE_FRAME ref_frame, second_ref_frame; + unsigned char segment_id = mbmi->segment_id; + int comp_pred, i; + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int idx_list[4] = {0, + cpi->lst_fb_idx, + cpi->gld_fb_idx, + cpi->alt_fb_idx}; + int64_t best_rd = best_rd_so_far; + int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise + int64_t best_tx_rd[TX_MODES]; + int64_t best_tx_diff[TX_MODES]; + int64_t best_pred_diff[NB_PREDICTION_TYPES]; + int64_t best_pred_rd[NB_PREDICTION_TYPES]; + int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + MB_MODE_INFO best_mbmode = { 0 }; + int mode_index, best_mode_index = 0; + unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; + vp9_prob comp_mode_p; + int64_t best_inter_rd = INT64_MAX; + MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; + INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE; + int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; + int64_t dist_uv[TX_SIZES]; + int skip_uv[TX_SIZES]; + MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 }; + struct scale_factors scale_factor[4]; + unsigned int ref_frame_mask = 0; + unsigned int mode_mask = 0; + int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex, + 
cpi->common.y_dc_delta_q); + int_mv seg_mvs[4][MAX_REF_FRAMES]; + b_mode_info best_bmodes[4]; + int best_skip2 = 0; + + x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; + vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4); + + for (i = 0; i < 4; i++) { + int j; + for (j = 0; j < MAX_REF_FRAMES; j++) + seg_mvs[i][j].as_int = INVALID_MV; + } + + estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp, + &comp_mode_p); + + for (i = 0; i < NB_PREDICTION_TYPES; ++i) + best_pred_rd[i] = INT64_MAX; + for (i = 0; i < TX_MODES; i++) + best_tx_rd[i] = INT64_MAX; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + best_filter_rd[i] = INT64_MAX; + for (i = 0; i < TX_SIZES; i++) + rate_uv_intra[i] = INT_MAX; + + *returnrate = INT_MAX; + + // Create a mask set to 1 for each reference frame used by a smaller + // resolution. + if (cpi->sf.use_avoid_tested_higherror) { + ref_frame_mask = 0; + mode_mask = 0; + ref_frame_mask = ~ref_frame_mask; + mode_mask = ~mode_mask; + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame, + block_size, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], + yv12_mb, scale_factor); + } + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; + frame_mv[ZEROMV][ref_frame].as_int = 0; + } + + for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) { + int mode_excluded = 0; + int64_t this_rd = INT64_MAX; + int disable_skip = 0; + int compmode_cost = 0; + int rate2 = 0, rate_y = 0, rate_uv = 0; + int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int skippable = 0; + int64_t tx_cache[TX_MODES]; + int i; + int this_skip2 = 0; + int64_t total_sse = INT_MAX; + int early_term = 0; + + for (i = 0; i < TX_MODES; ++i) + tx_cache[i] = INT64_MAX; + + x->skip = 0; + ref_frame = vp9_ref_order[mode_index].ref_frame; + second_ref_frame = vp9_ref_order[mode_index].second_ref_frame; + + // Look at the reference frame of the best mode so far and set the + // skip mask to look at a subset of the remaining modes. + if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) { + if (mode_index == 3) { + switch (vp9_ref_order[best_mode_index].ref_frame) { + case INTRA_FRAME: + cpi->mode_skip_mask = 0; + break; + case LAST_FRAME: + cpi->mode_skip_mask = 0x0010; + break; + case GOLDEN_FRAME: + cpi->mode_skip_mask = 0x0008; + break; + case ALTREF_FRAME: + cpi->mode_skip_mask = 0x0000; + break; + case NONE: + case MAX_REF_FRAMES: + assert(!"Invalid Reference frame"); + } + } + if (cpi->mode_skip_mask & ((int64_t)1 << mode_index)) + continue; + } + + // Skip if the current reference frame has been masked off + if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask && + (cpi->ref_frame_mask & (1 << ref_frame))) + continue; + + // Test best rd so far against threshold for trying this mode. + if ((best_rd < + ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] * + cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) || + cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX) + continue; + + // Do not allow compound prediction if the segment level reference + // frame feature is in use as in this case there can only be one reference. 
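The gate described in the comment above rejects compound prediction whenever the segment pins the block to a single reference frame. A minimal sketch of the condition, with a stand-in segmentation struct in place of the library's seg/SEG_LVL_REF_FRAME API:

    /* Stand-in for the segmentation state relevant to this check. */
    typedef struct {
      int ref_frame_feature_active;  /* SEG_LVL_REF_FRAME set for segment */
    } seg_state;

    /* Compound (two-reference) prediction is only legal when the segment
     * does not force a single reference frame. */
    static int compound_allowed(const seg_state *seg, int second_ref_frame) {
      const int is_compound = second_ref_frame > 0;  /* i.e. > INTRA_FRAME */
      return !(is_compound && seg->ref_frame_feature_active);
    }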
+ if ((second_ref_frame > INTRA_FRAME) && + vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + continue; + + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; + + if (!(ref_frame == INTRA_FRAME + || (cpi->ref_frame_flags & flag_list[ref_frame]))) { + continue; + } + if (!(second_ref_frame == NONE + || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) { + continue; + } + + comp_pred = second_ref_frame > INTRA_FRAME; + if (comp_pred) { + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) + if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) + continue; + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) + if (ref_frame != best_inter_ref_frame && + second_ref_frame != best_inter_ref_frame) + continue; + } + + // TODO(jingning, jkoleszar): scaling reference frame not supported for + // sub8x8 blocks. + if (ref_frame > 0 && + vp9_is_scaled(scale_factor[ref_frame].sfc)) + continue; + + if (second_ref_frame > 0 && + vp9_is_scaled(scale_factor[second_ref_frame].sfc)) + continue; + + set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); + mbmi->uv_mode = DC_PRED; + + // Evaluate all sub-pel filters irrespective of whether we can use + // them for this frame. + mbmi->interp_filter = cm->mcomp_filter_type; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + + if (comp_pred) { + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) + continue; + set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); + + mode_excluded = mode_excluded + ? mode_excluded + : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; + } else { + if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) { + mode_excluded = + mode_excluded ? + mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY; + } + } + + // Select prediction reference frames. + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) + xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != + (int)ref_frame) { + continue; + // If the segment skip feature is enabled.... + // then do nothing if the current mode is not allowed.. + } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && + ref_frame != INTRA_FRAME) { + continue; + // Disable this drop out case if the ref frame + // segment level feature is enabled for this segment. This is to + // prevent the possibility that we end up unable to pick any mode. + } else if (!vp9_segfeature_active(seg, segment_id, + SEG_LVL_REF_FRAME)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. 
+ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) + continue; + } + +#ifdef MODE_TEST_HIT_STATS + // TEST/DEBUG CODE + // Keep a record of the number of test hits at each size + cpi->mode_test_hits[bsize]++; +#endif + + if (ref_frame == INTRA_FRAME) { + int rate; + mbmi->tx_size = TX_4X4; + if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, + &distortion_y, best_rd) >= best_rd) + continue; + rate2 += rate; + rate2 += intra_cost_penalty; + distortion2 += distortion_y; + + if (rate_uv_intra[TX_4X4] == INT_MAX) { + choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4], + &rate_uv_tokenonly[TX_4X4], + &dist_uv[TX_4X4], &skip_uv[TX_4X4], + &mode_uv[TX_4X4]); + } + rate2 += rate_uv_intra[TX_4X4]; + rate_uv = rate_uv_tokenonly[TX_4X4]; + distortion2 += dist_uv[TX_4X4]; + distortion_uv = dist_uv[TX_4X4]; + mbmi->uv_mode = mode_uv[TX_4X4]; + tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + for (i = 0; i < TX_MODES; ++i) + tx_cache[i] = tx_cache[ONLY_4X4]; + } else { int rate; int64_t distortion; int64_t this_rd_thresh; @@ -3501,30 +4085,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX; int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse; int tmp_best_skippable = 0; int switchable_filter_index; - int_mv *second_ref = is_comp_pred ? - &mbmi->ref_mvs[second_ref_frame][0] : NULL; - union b_mode_info tmp_best_bmodes[16]; + int_mv *second_ref = comp_pred ? + &mbmi->ref_mvs[second_ref_frame][0] : NULL; + b_mode_info tmp_best_bmodes[16]; MB_MODE_INFO tmp_best_mbmode; - PARTITION_INFO tmp_best_partition; BEST_SEG_INFO bsi[SWITCHABLE_FILTERS]; int pred_exists = 0; int uv_skippable; - if (is_comp_pred) { - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) - if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) - continue; - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) - if (ref_frame != best_inter_ref_frame && - second_ref_frame != best_inter_ref_frame) - continue; - } this_rd_thresh = (ref_frame == LAST_FRAME) ? - cpi->rd_threshes[bsize][THR_NEWMV] : - cpi->rd_threshes[bsize][THR_NEWA]; + cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] : + cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR]; this_rd_thresh = (ref_frame == GOLDEN_FRAME) ? 
- cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh; - xd->this_mi->mbmi.tx_size = TX_4X4; + cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh; + xd->mi_8x8[0]->mbmi.tx_size = TX_4X4; cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX; if (cm->mcomp_filter_type != BILINEAR) { @@ -3542,7 +4116,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interp_filter = switchable_filter_index; vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile, &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, @@ -3578,9 +4152,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, tmp_best_sse = total_sse; tmp_best_skippable = skippable; tmp_best_mbmode = *mbmi; - tmp_best_partition = *x->partition_info; - for (i = 0; i < 4; i++) - tmp_best_bmodes[i] = xd->this_mi->bmi[i]; + for (i = 0; i < 4; i++) { + tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i]; + x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i]; + } pred_exists = 1; if (switchable_filter_index == 0 && cpi->sf.use_rd_breakout && @@ -3607,7 +4182,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!pred_exists) { // Handles the special case when a filter that is not in the // switchable list (bilinear, 6-tap) is indicated at the frame level - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile, &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, @@ -3630,9 +4205,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, distortion = tmp_best_distortion; skippable = tmp_best_skippable; *mbmi = tmp_best_mbmode; - *x->partition_info = tmp_best_partition; for (i = 0; i < 4; i++) - xd->this_mi->bmi[i] = tmp_best_bmodes[i]; + xd->mi_8x8[0]->bmi[i] = tmp_best_bmodes[i]; } rate2 += rate; @@ -3642,12 +4216,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate2 += get_switchable_rate(x); if (!mode_excluded) { - if (is_comp_pred) + if (comp_pred) mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY; else mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY; } - compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred); + compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); tmp_best_rdu = best_rd - MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2), @@ -3658,7 +4232,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // then don't bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); - super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable, + super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, &uv_sse, BLOCK_8X8, tmp_best_rdu); if (rate_uv == INT_MAX) continue; @@ -3671,20 +4245,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < TX_MODES; ++i) tx_cache[i] = tx_cache[ONLY_4X4]; } - } else { - mbmi->mode = rd_mode_to_mode(this_mode); - compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME); - this_rd = handle_inter_mode(cpi, x, bsize, - tx_cache, - &rate2, &distortion2, &skippable, - &rate_y, &distortion_y, - &rate_uv, &distortion_uv, - &mode_excluded, &disable_skip, - &tmp_best_filter, frame_mv, - mi_row, mi_col, - single_newmv, &total_sse, best_rd); - if (this_rd == INT64_MAX) - continue; } if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) { @@ -3708,25 +4268,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int mb_skip_allowed 
= !vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP); - if (skippable && bsize >= BLOCK_8X8) { - // Back out the coefficient coding costs - rate2 -= (rate_y + rate_uv); - // for best yrd calculation - rate_uv = 0; - - if (mb_skip_allowed) { - int prob_skip_cost; - - // Cost the skip mb case - vp9_prob skip_prob = - vp9_get_pred_prob_mbskip(cm, xd); - - if (skip_prob) { - prob_skip_cost = vp9_cost_bit(skip_prob, 1); - rate2 += prob_skip_cost; - } - } - } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) { + if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { // Add in the cost of the no skip flag. @@ -3756,42 +4298,22 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); } - // Keep record of best intra rd - if (xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME && - is_intra_mode(xd->this_mi->mbmi.mode) && - this_rd < best_intra_rd) { - best_intra_rd = this_rd; - best_intra_mode = xd->this_mi->mbmi.mode; - } // Keep record of best inter rd with single reference - if (xd->this_mi->mbmi.ref_frame[0] > INTRA_FRAME && - xd->this_mi->mbmi.ref_frame[1] == NONE && + if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME && + xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE && !mode_excluded && this_rd < best_inter_rd) { best_inter_rd = this_rd; best_inter_ref_frame = ref_frame; - // best_inter_mode = xd->this_mi->mbmi.mode; } if (!disable_skip && ref_frame == INTRA_FRAME) { for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); - for (i = 0; i <= SWITCHABLE_FILTERS; i++) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); } - if (this_mode != RD_I4X4_PRED && this_mode != RD_SPLITMV) { - // Store the respective mode distortions for later use. - if (mode_distortions[this_mode] == -1 - || distortion2 < mode_distortions[this_mode]) { - mode_distortions[this_mode] = distortion2; - } - if (frame_distortions[ref_frame] == -1 - || distortion2 < frame_distortions[ref_frame]) { - frame_distortions[ref_frame] = distortion2; - } - } - // Did this mode help.. i.e. is it the new best mode if (this_rd < best_rd || x->skip) { if (!mode_excluded) { @@ -3810,15 +4332,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); best_mbmode = *mbmi; best_skip2 = this_skip2; - best_partition = *x->partition_info; + vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], + sizeof(uint8_t) * ctx->num_4x4_blk); - if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV) - for (i = 0; i < 4; i++) - best_bmodes[i] = xd->this_mi->bmi[i]; + for (i = 0; i < 4; i++) + best_bmodes[i] = xd->mi_8x8[0]->bmi[i]; // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history - if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) { + if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) && + (mode_index > MIN_EARLY_TERM_INDEX)) { const int qstep = xd->plane[0].dequant[1]; // TODO(debargha): Enhance this by specializing for each mode_index int scale = 4; @@ -3865,7 +4388,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cm->mcomp_filter_type != BILINEAR) { int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? 
SWITCHABLE_FILTERS : cm->mcomp_filter_type]; - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { int64_t adj_rd; // In cases of poor prediction, filter_cache[] can contain really big // values, which actually are bigger than this_rd itself. This can @@ -3882,8 +4405,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, /* keep record of best txfm size */ if (bsize < BLOCK_32X32) { if (bsize < BLOCK_16X16) { - if (this_mode == RD_SPLITMV || this_mode == RD_I4X4_PRED) - tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4]; + tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4]; tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8]; } tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16]; @@ -3891,11 +4413,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!mode_excluded && this_rd != INT64_MAX) { for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) { int64_t adj_rd = INT64_MAX; - if (this_mode != RD_I4X4_PRED) { + if (ref_frame > INTRA_FRAME) adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode]; - } else { + else adj_rd = this_rd; - } if (adj_rd < best_tx_rd[i]) best_tx_rd[i] = adj_rd; @@ -3915,39 +4436,20 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // If we used an estimate for the uv intra rd in the loop above... if (cpi->sf.use_uv_intra_rd_estimate) { // Do Intra UV best rd mode selection if best mode choice above was intra. - if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) { + if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) { TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size], &rate_uv_tokenonly[uv_tx_size], &dist_uv[uv_tx_size], &skip_uv[uv_tx_size], - bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); + BLOCK_8X8); } } // If we are using reference masking and the set mask flag is set then // create the reference frame mask. if (cpi->sf.reference_masking && cpi->set_ref_frame_mask) - cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame); - - // Flag all modes that have a distortion thats > 2x the best we found at - // this level. - for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) { - if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV) - continue; - - if (mode_distortions[mode_index] > 2 * *returndistortion) { - ctx->modes_with_high_error |= (1 << mode_index); - } - } - - // Flag all ref frames that have a distortion thats > 2x the best we found at - // this level. - for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (frame_distortions[ref_frame] > 2 * *returndistortion) { - ctx->frames_with_high_error |= (1 << ref_frame); - } - } + cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame); if (best_rd == INT64_MAX && bsize < BLOCK_8X8) { *returnrate = INT_MAX; @@ -3965,16 +4467,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // best rd so far value and update rd_thresh_freq_fact based on the mode/size // combination that wins out. 
if (cpi->sf.adaptive_rd_thresh) { - for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { + for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) { if (mode_index == best_mode_index) { - cpi->rd_thresh_freq_fact[bsize][mode_index] -= - (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3); + cpi->rd_thresh_freq_sub8x8[bsize][mode_index] -= + (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 3); } else { - cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC; - if (cpi->rd_thresh_freq_fact[bsize][mode_index] > - (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) { - cpi->rd_thresh_freq_fact[bsize][mode_index] = - cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT; + cpi->rd_thresh_freq_sub8x8[bsize][mode_index] += RD_THRESH_INC; + if (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] > + (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) { + cpi->rd_thresh_freq_sub8x8[bsize][mode_index] = + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT; } } } @@ -3983,27 +4485,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // macroblock modes *mbmi = best_mbmode; x->skip |= best_skip2; - if (best_mbmode.ref_frame[0] == INTRA_FRAME && - best_mbmode.sb_type < BLOCK_8X8) { + if (best_mbmode.ref_frame[0] == INTRA_FRAME) { for (i = 0; i < 4; i++) - xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode; - } - - if (best_mbmode.ref_frame[0] != INTRA_FRAME && - best_mbmode.sb_type < BLOCK_8X8) { - for (i = 0; i < 4; i++) - xd->this_mi->bmi[i].as_mv[0].as_int = - best_bmodes[i].as_mv[0].as_int; - - if (mbmi->ref_frame[1] > 0) - for (i = 0; i < 4; i++) - xd->this_mi->bmi[i].as_mv[1].as_int = - best_bmodes[i].as_mv[1].as_int; - - *x->partition_info = best_partition; + xd->mi_8x8[0]->bmi[i].as_mode = best_bmodes[i].as_mode; + } else { + for (i = 0; i < 4; ++i) + vpx_memcpy(&xd->mi_8x8[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info)); - mbmi->mv[0].as_int = xd->this_mi->bmi[3].as_mv[0].as_int; - mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int; + mbmi->mv[0].as_int = xd->mi_8x8[0]->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int; } for (i = 0; i < NB_PREDICTION_TYPES; ++i) { @@ -4014,7 +4504,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (!x->skip) { - for (i = 0; i <= SWITCHABLE_FILTERS; i++) { + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { if (best_filter_rd[i] == INT64_MAX) best_filter_diff[i] = 0; else @@ -4023,7 +4513,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (cm->mcomp_filter_type == SWITCHABLE) assert(best_filter_diff[SWITCHABLE_FILTERS] == 0); } else { - vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff)); + vp9_zero(best_filter_diff); } if (!x->skip) { @@ -4034,13 +4524,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_tx_diff[i] = best_rd - best_tx_rd[i]; } } else { - vpx_memset(best_tx_diff, 0, sizeof(best_tx_diff)); + vp9_zero(best_tx_diff); } set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], scale_factor); store_coding_context(x, ctx, best_mode_index, - &best_partition, &mbmi->ref_mvs[mbmi->ref_frame[0]][0], &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 
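// Sketch: the rd_thresh_freq_sub8x8 update above is an exponential
// moving scheme. The winning ref's factor decays by an eighth of itself
// (fact -= fact >> 3) while every losing ref's factor grows by
// RD_THRESH_INC, capped at adaptive_rd_thresh * RD_THRESH_MAX_FACT, so
// refs that keep losing become progressively cheaper to skip. The two
// #define values below are illustrative stand-ins for the encoder's
// constants.
#include <stdio.h>

#define RD_THRESH_INC 1        // stand-in for the encoder's increment
#define RD_THRESH_MAX_FACT 64  // stand-in for the encoder's cap

int main(void) {
  int fact[2] = { 32, 32 };    // two competing refs, equal start
  const int adaptive_rd_thresh = 1;
  int n;
  for (n = 0; n < 100; ++n) {
    fact[0] -= fact[0] >> 3;   // ref 0 keeps winning: decay
    fact[1] += RD_THRESH_INC;  // ref 1 keeps losing: penalize, capped
    if (fact[1] > adaptive_rd_thresh * RD_THRESH_MAX_FACT)
      fact[1] = adaptive_rd_thresh * RD_THRESH_MAX_FACT;
  }
  printf("winner: %d  loser: %d\n", fact[0], fact[1]);  // prints 7 and 64
  return 0;
}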
0 : mbmi->ref_frame[1]][0], diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h index eba7df9..92fb235 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.h +++ b/libvpx/vp9/encoder/vp9_rdopt.h @@ -12,10 +12,17 @@ #ifndef VP9_ENCODER_VP9_RDOPT_H_ #define VP9_ENCODER_VP9_RDOPT_H_ -#define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) +#define RDDIV_BITS 7 + +#define RDCOST(RM, DM, R, D) \ + (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM)) #define QIDX_SKIP_THRESH 115 -void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); +struct TileInfo; + +int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex); + +void vp9_initialize_rd_consts(VP9_COMP *cpi); void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); @@ -24,13 +31,31 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int64_t best_rd); int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, + const struct TileInfo *const tile, int mi_row, int mi_col, - int *r, int64_t *d, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd); + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); + +int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, + const struct TileInfo *const tile, + int mi_row, int mi_col, + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); void vp9_init_me_luts(); void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv); +void vp9_get_entropy_contexts(TX_SIZE tx_size, + ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16], + const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, + int num_4x4_w, int num_4x4_h); + #endif // VP9_ENCODER_VP9_RDOPT_H_ diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c index 10655e8..24f011f 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libvpx/vp9/encoder/vp9_segmentation.c @@ -117,7 +117,8 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, +static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, @@ -129,9 +130,10 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - segment_id = mi_8x8[0]->mbmi.segment_id; + xd->mi_8x8 = mi_8x8; + segment_id = xd->mi_8x8[0]->mbmi.segment_id; - set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); // Count the number of hits on each segment with no prediction no_pred_segcounts[segment_id]++; @@ -156,7 +158,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, } } -static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, +static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, @@ -174,19 +177,20 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type]; if (bw == bs && bh == bs) { - count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, bs, mi_row, mi_col); } else if (bw == bs && bh < bs) { - count_segs(cpi, 
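// Sketch: the new RDCOST macro above folds rate and distortion into one
// comparable cost, ((128 + R*RM) >> 8) + (D << DM). RM is the
// Lagrange-style rate multiplier; DM (RDDIV_BITS) now scales distortion
// by a shift instead of the old per-call multiply. Standalone version
// with made-up rdmult/rate/distortion values (and an added int64_t cast
// on D for safety in isolation):
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define RDDIV_BITS 7
#define RDCOST(RM, DM, R, D) \
  (((128 + ((int64_t)(R)) * (RM)) >> 8) + (((int64_t)(D)) << (DM)))

int main(void) {
  const int rdmult = 300;  // hypothetical x->rdmult
  const int64_t cost = RDCOST(rdmult, RDDIV_BITS, 1200, 5000);
  printf("rd cost: %" PRId64 "\n", cost);  // lower wins mode selection
  return 0;
}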
mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row, mi_col); - count_segs(cpi, mi_8x8 + hbs * mis, no_pred_segcounts, + count_segs(cpi, tile, mi_8x8 + hbs * mis, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row + hbs, mi_col); } else if (bw < bs && bh == bs) { - count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col); - count_segs(cpi, mi_8x8 + hbs, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs); + count_segs(cpi, tile, mi_8x8 + hbs, + no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, + hbs, bs, mi_row, mi_col + hbs); } else { const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; int n; @@ -197,7 +201,7 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, const int mi_dc = hbs * (n & 1); const int mi_dr = hbs * (n >> 1); - count_segs_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc], + count_segs_sb(cpi, tile, &mi_8x8[mi_dr * mis + mi_dc], no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row + mi_dr, mi_col + mi_dc, subsize); @@ -233,15 +237,18 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // First of all generate stats regarding how well the last segment map // predicts this one for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) { - vp9_get_tile_col_offsets(cm, tile_col); - mi_ptr = cm->mi_grid_visible + cm->cur_tile_mi_col_start; + TileInfo tile; + + vp9_tile_init(&tile, cm, 0, tile_col); + mi_ptr = cm->mi_grid_visible + tile.mi_col_start; for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { mi = mi_ptr; - for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += 8, mi += 8) - count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, mi_row, mi_col, BLOCK_64X64); + count_segs_sb(cpi, &tile, mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, + mi_row, mi_col, BLOCK_64X64); } } @@ -251,13 +258,13 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree); // Key frames cannot use temporal prediction - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { // Work out probability tree for coding those segments not // predicted using the temporal method and the cost. calc_segtree_probs(t_unpred_seg_counts, t_pred_tree); t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree); - // Add in the cost of the signalling for each prediction context + // Add in the cost of the signaling for each prediction context. 
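// Sketch: the loop below accumulates, per prediction context, the
// entropy cost of coding the temporal-prediction flag from its hit
// counts. vp9's actual vp9_cost_bit is table-driven; this float version
// approximates the same idea in scaled-bit units (x256 here). Counts
// and the probability are hypothetical.
#include <math.h>
#include <stdio.h>

// cost of coding `bit` when P(bit == 0) = prob256/256
static int cost_bit(int prob256, int bit) {
  const double p = (bit ? 256 - prob256 : prob256) / 256.0;
  return (int)(-log2(p) * 256.0 + 0.5);
}

int main(void) {
  const int count0 = 90, count1 = 30;  // hypothetical per-context hits
  const int prob = 180;                // hypothetical P(flag==0) * 256
  const int cost = count0 * cost_bit(prob, 0) + count1 * cost_bit(prob, 1);
  printf("signaling cost: %d\n", cost);
  return 0;
}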
for (i = 0; i < PREDICTION_PROBS; i++) { const int count0 = temporal_predictor_count[i][0]; const int count1 = temporal_predictor_count[i][1]; diff --git a/libvpx/vp9/encoder/vp9_ssim.c b/libvpx/vp9/encoder/vp9_ssim.c index c155516..a5f18e6 100644 --- a/libvpx/vp9/encoder/vp9_ssim.c +++ b/libvpx/vp9/encoder/vp9_ssim.c @@ -42,8 +42,8 @@ void vp9_ssim_parms_8x8_c(uint8_t *s, int sp, uint8_t *r, int rp, } } -const static int64_t cc1 = 26634; // (64^2*(.01*255)^2 -const static int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 static double similarity(unsigned long sum_s, unsigned long sum_r, unsigned long sum_sq_s, unsigned long sum_sq_r, diff --git a/libvpx/vp9/encoder/vp9_subexp.c b/libvpx/vp9/encoder/vp9_subexp.c index 667b801..387fc90 100644 --- a/libvpx/vp9/encoder/vp9_subexp.c +++ b/libvpx/vp9/encoder/vp9_subexp.c @@ -221,7 +221,8 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, } void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp, - vp9_prob upd, unsigned int *ct) { + const unsigned int ct[2]) { + const vp9_prob upd = DIFF_UPDATE_PROB; vp9_prob newp = get_binary_prob(ct[0], ct[1]); const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp, upd); diff --git a/libvpx/vp9/encoder/vp9_subexp.h b/libvpx/vp9/encoder/vp9_subexp.h index 7acdaf6..521c777 100644 --- a/libvpx/vp9/encoder/vp9_subexp.h +++ b/libvpx/vp9/encoder/vp9_subexp.h @@ -19,7 +19,7 @@ void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp); void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp, - vp9_prob upd, unsigned int *ct); + unsigned int *ct); int vp9_prob_diff_update_savings_search(const unsigned int *ct, vp9_prob oldp, vp9_prob *bestp, diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index 63826ee..2cace03 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -29,7 +29,7 @@ #include "vpx_ports/vpx_timer.h" #define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering -#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering +#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, uint8_t *y_mb_ptr, @@ -38,14 +38,15 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, int stride, int mv_row, int mv_col, - uint8_t *pred) { + uint8_t *pred, + struct scale_factors *scale) { const int which_mv = 0; MV mv = { mv_row, mv_col }; vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, - &xd->scale_factor[which_mv], + scale, 16, 16, which_mv, &xd->subpix, MV_PRECISION_Q3); @@ -55,7 +56,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, vp9_build_inter_predictor(u_mb_ptr, stride, &pred[256], 8, &mv, - &xd->scale_factor[which_mv], + scale, 8, 8, which_mv, &xd->subpix, MV_PRECISION_Q4); @@ -63,7 +64,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, vp9_build_inter_predictor(v_mb_ptr, stride, &pred[320], 8, &mv, - &xd->scale_factor[which_mv], + scale, 8, 8, which_mv, &xd->subpix, MV_PRECISION_Q4); @@ -83,7 +84,6 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, for (i = 0, k = 0; i < block_size; i++) { for (j = 0; j < block_size; j++, k++) { - int src_byte = frame1[byte]; int pixel_value = *frame2++; @@ -151,13 +151,12 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, step_param = 
MIN(step_param, (cpi->sf.max_step_search_steps - 2)); /*cpi->sf.search_method == HEX*/ - // TODO Check that the 16x16 vf & sdf are selected here // Ignore mv costing by sending NULL pointer instead of cost arrays ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0]; - bestsme = vp9_hex_search(x, &best_ref_mv1_full, + bestsme = vp9_hex_search(x, &best_ref_mv1_full.as_mv, step_param, sadpb, 1, &cpi->fn_ptr[BLOCK_16X16], - 0, &best_ref_mv1, ref_mv); + 0, &best_ref_mv1.as_mv, &ref_mv->as_mv); #if ALT_REF_SUBPEL_ENABLED // Try sub-pixel MC? @@ -166,8 +165,9 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, int distortion; unsigned int sse; // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step(x, ref_mv, - &best_ref_mv1, + bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv, + &best_ref_mv1.as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, cpi->sf.subpel_iters_per_step, @@ -187,7 +187,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, static void temporal_filter_iterate_c(VP9_COMP *cpi, int frame_count, int alt_ref_index, - int strength) { + int strength, + struct scale_factors *scale) { int byte; int frame; int mb_col, mb_row; @@ -281,7 +282,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, cpi->frames[frame]->y_stride, mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row, mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col, - predictor); + predictor, scale); // Apply the filter (YUV) vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, @@ -375,6 +376,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead) - (num_frames_backward + 1); + struct scale_factors scale; + struct scale_factors_common scale_comm; + switch (blur_type) { case 1: // Backward Blur @@ -424,26 +428,22 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { #ifdef DEBUGFWG // DEBUG FWG - printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d" -, max_frames -, num_frames_backward -, num_frames_forward -, frames_to_blur -, frames_to_blur_backward -, frames_to_blur_forward -, cpi->source_encode_index -, cpi->last_alt_ref_sei -, start_frame); + printf( + "max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d " + "start:%d", + max_frames, num_frames_backward, num_frames_forward, frames_to_blur, + frames_to_blur_backward, frames_to_blur_forward, cpi->source_encode_index, + cpi->last_alt_ref_sei, start_frame); #endif // Setup scaling factors. 
Scaling on each of the arnr frames is not supported - vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0], - cm->yv12_fb[cm->new_fb_idx].y_crop_width, - cm->yv12_fb[cm->new_fb_idx].y_crop_height, + vp9_setup_scale_factors_for_frame(&scale, &scale_comm, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, cm->width, cm->height); // Setup frame pointers, NULL indicates frame not included in filter - vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *)); + vp9_zero(cpi->frames); for (frame = 0; frame < frames_to_blur; frame++) { int which_buffer = start_frame - frame; struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, @@ -452,7 +452,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { } temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward, - strength); + strength, &scale); } void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame, diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c index 0c9bf9d..7d4676e 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libvpx/vp9/encoder/vp9_tokenize.c @@ -21,24 +21,12 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_entropy.h" -/* Global event counters used for accumulating statistics across several - compressions, then generating vp9_context.c = initial stats. */ - -#ifdef ENTROPY_STATS -vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES]; -extern vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES]; -#endif /* ENTROPY_STATS */ - -DECLARE_ALIGNED(16, extern const uint8_t, - vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); - static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2]; const TOKENVALUE *vp9_dct_value_tokens_ptr; static int dct_value_cost[DCT_MAX_VALUE * 2]; const int *vp9_dct_value_cost_ptr; static void fill_value_tokens() { - TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE; const vp9_extra_bit *const e = vp9_extra_bits; @@ -60,9 +48,9 @@ static void fill_value_tokens() { t[i].token = --j; eb |= (a - e[j].base_val) << 1; - } else + } else { t[i].token = a; - + } t[i].extra = eb; } @@ -81,9 +69,7 @@ static void fill_value_tokens() { cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */ dct_value_cost[i + DCT_MAX_VALUE] = cost; } - } - } while (++i < DCT_MAX_VALUE); vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE; @@ -95,6 +81,7 @@ struct tokenize_b_args { MACROBLOCKD *xd; TOKENEXTRA **tp; TX_SIZE tx_size; + uint8_t *token_cache; }; static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize, @@ -113,8 +100,9 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, VP9_COMP *cpi = args->cpi; MACROBLOCKD *xd = args->xd; TOKENEXTRA **tp = args->tp; + uint8_t *token_cache = args->token_cache; struct macroblockd_plane *pd = &xd->plane[plane]; - MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; int pt; /* near block/prev token context index */ int c = 0, rc = 0; TOKENEXTRA *t = *tp; /* store tokens starting here */ @@ -127,21 +115,16 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, vp9_coeff_count *const counts = cpi->coef_counts[tx_size]; vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size]; const int ref = is_inter_block(mbmi); - uint8_t token_cache[1024]; - const uint8_t *band_translate; - ENTROPY_CONTEXT *A, *L; + const uint8_t *const band_translate = get_band_translate(tx_size); const int seg_eob = get_tx_eob(&cpi->common.seg, 
segment_id, tx_size); int aoff, loff; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); - A = pd->above_context + aoff; - L = pd->left_context + loff; - assert((!type && !plane) || (type && plane)); - pt = get_entropy_context(xd, tx_size, type, block, A, L, - &scan, &band_translate); - nb = vp9_get_coef_neighbors_handle(scan); + pt = get_entropy_context(tx_size, pd->above_context + aoff, + pd->left_context + loff); + get_scan(xd, tx_size, type, block, &scan, &nb); c = 0; do { const int band = get_coef_band(band_translate, c); @@ -210,12 +193,12 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; TOKENEXTRA *t_backup = *t; const int mb_skip_context = vp9_get_pred_context_mbskip(xd); const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size}; + struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache}; mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize); if (mbmi->skip_coeff) { @@ -236,149 +219,6 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, } } -#ifdef ENTROPY_STATS -void init_context_counters(void) { - FILE *f = fopen("context.bin", "rb"); - if (!f) { - vp9_zero(context_counters); - } else { - fread(context_counters, sizeof(context_counters), 1, f); - fclose(f); - } - - f = fopen("treeupdate.bin", "rb"); - if (!f) { - vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist)); - } else { - fread(tree_update_hist, sizeof(tree_update_hist), 1, f); - fclose(f); - } -} - -static void print_counter(FILE *f, vp9_coeff_accum *context_counters, - int block_types, const char *header) { - int type, ref, band, pt, t; - - fprintf(f, "static const vp9_coeff_count %s = {\n", header); - -#define Comma(X) (X ? "," : "") - type = 0; - do { - ref = 0; - fprintf(f, "%s\n { /* block Type %d */", Comma(type), type); - do { - fprintf(f, "%s\n { /* %s */", Comma(type), ref ? "Inter" : "Intra"); - band = 0; - do { - fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); - pt = 0; - do { - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; - do { - const int64_t x = context_counters[type][ref][band][pt][t]; - const int y = (int) x; - - assert(x == (int64_t) y); /* no overflow handling yet */ - fprintf(f, "%s %d", Comma(t), y); - } while (++t < 1 + MAX_ENTROPY_TOKENS); - fprintf(f, "}"); - } while (++pt < PREV_COEF_CONTEXTS); - fprintf(f, "\n }"); - } while (++band < COEF_BANDS); - fprintf(f, "\n }"); - } while (++ref < REF_TYPES); - fprintf(f, "\n }"); - } while (++type < block_types); - fprintf(f, "\n};\n"); -} - -static void print_probs(FILE *f, vp9_coeff_accum *context_counters, - int block_types, const char *header) { - int type, ref, band, pt, t; - - fprintf(f, "static const vp9_coeff_probs %s = {", header); - - type = 0; -#define Newline(x, spaces) (x ? " " : "\n" spaces) - do { - fprintf(f, "%s%s{ /* block Type %d */", - Comma(type), Newline(type, " "), type); - ref = 0; - do { - fprintf(f, "%s%s{ /* %s */", - Comma(band), Newline(band, " "), ref ? 
"Inter" : "Intra"); - band = 0; - do { - fprintf(f, "%s%s{ /* Coeff Band %d */", - Comma(band), Newline(band, " "), band); - pt = 0; - do { - unsigned int branch_ct[ENTROPY_NODES][2]; - unsigned int coef_counts[MAX_ENTROPY_TOKENS + 1]; - vp9_prob coef_probs[ENTROPY_NODES]; - - if (pt >= 3 && band == 0) - break; - for (t = 0; t < MAX_ENTROPY_TOKENS + 1; ++t) - coef_counts[t] = context_counters[type][ref][band][pt][t]; - vp9_tree_probs_from_distribution(vp9_coef_tree, coef_probs, - branch_ct, coef_counts, 0); - branch_ct[0][1] = coef_counts[MAX_ENTROPY_TOKENS] - branch_ct[0][0]; - coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; - do { - fprintf(f, "%s %3d", Comma(t), coef_probs[t]); - } while (++t < ENTROPY_NODES); - - fprintf(f, " }"); - } while (++pt < PREV_COEF_CONTEXTS); - fprintf(f, "\n }"); - } while (++band < COEF_BANDS); - fprintf(f, "\n }"); - } while (++ref < REF_TYPES); - fprintf(f, "\n }"); - } while (++type < block_types); - fprintf(f, "\n};\n"); -} - -void print_context_counters() { - FILE *f = fopen("vp9_context.c", "w"); - - fprintf(f, "#include \"vp9_entropy.h\"\n"); - fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n"); - - /* print counts */ - print_counter(f, context_counters[TX_4X4], BLOCK_TYPES, - "vp9_default_coef_counts_4x4[BLOCK_TYPES]"); - print_counter(f, context_counters[TX_8X8], BLOCK_TYPES, - "vp9_default_coef_counts_8x8[BLOCK_TYPES]"); - print_counter(f, context_counters[TX_16X16], BLOCK_TYPES, - "vp9_default_coef_counts_16x16[BLOCK_TYPES]"); - print_counter(f, context_counters[TX_32X32], BLOCK_TYPES, - "vp9_default_coef_counts_32x32[BLOCK_TYPES]"); - - /* print coefficient probabilities */ - print_probs(f, context_counters[TX_4X4], BLOCK_TYPES, - "default_coef_probs_4x4[BLOCK_TYPES]"); - print_probs(f, context_counters[TX_8X8], BLOCK_TYPES, - "default_coef_probs_8x8[BLOCK_TYPES]"); - print_probs(f, context_counters[TX_16X16], BLOCK_TYPES, - "default_coef_probs_16x16[BLOCK_TYPES]"); - print_probs(f, context_counters[TX_32X32], BLOCK_TYPES, - "default_coef_probs_32x32[BLOCK_TYPES]"); - - fclose(f); - - f = fopen("context.bin", "wb"); - fwrite(context_counters, sizeof(context_counters), 1, f); - fclose(f); -} -#endif - void vp9_tokenize_initialize() { fill_value_tokens(); } diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h index b78e100..e24e31b 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libvpx/vp9/encoder/vp9_tokenize.h @@ -28,9 +28,6 @@ typedef struct { uint8_t skip_eob_node; } TOKENEXTRA; -typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS + 1]; - int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize); int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane); @@ -39,13 +36,6 @@ struct VP9_COMP; void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize); -#ifdef ENTROPY_STATS -void init_context_counters(); -void print_context_counters(); - -extern vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES]; -#endif - extern const int *vp9_dct_value_cost_ptr; /* TODO: The Token field should be broken out into a separate char array to * improve cache locality, since it's needed for costing when the rest of the diff --git a/libvpx/vp9/encoder/vp9_vaq.c b/libvpx/vp9/encoder/vp9_vaq.c new file mode 100644 index 0000000..1f9cb87 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_vaq.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2013 The WebM project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <math.h> + +#include "vp9/encoder/vp9_vaq.h" + +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/common/vp9_systemdependent.h" + +#define ENERGY_MIN (-3) +#define ENERGY_MAX (3) +#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) +#define ENERGY_IN_BOUNDS(energy)\ + assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) + +static double q_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static double rdmult_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static int segment_id[MAX_SEGMENTS] = { 5, 3, 1, 0, 2, 4, 6, 7 }; + +#define Q_RATIO(i) q_ratio[(i) - ENERGY_MIN] +#define RDMULT_RATIO(i) rdmult_ratio[(i) - ENERGY_MIN] +#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN] + +DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0}; + +unsigned int vp9_vaq_segment_id(int energy) { + ENERGY_IN_BOUNDS(energy); + + return SEGMENT_ID(energy); +} + +double vp9_vaq_rdmult_ratio(int energy) { + ENERGY_IN_BOUNDS(energy); + + vp9_clear_system_state(); // __asm emms; + + return RDMULT_RATIO(energy); +} + +double vp9_vaq_inv_q_ratio(int energy) { + ENERGY_IN_BOUNDS(energy); + + vp9_clear_system_state(); // __asm emms; + + return Q_RATIO(-energy); +} + +void vp9_vaq_init() { + int i; + double base_ratio; + + assert(ENERGY_SPAN <= MAX_SEGMENTS); + + vp9_clear_system_state(); // __asm emms; + + base_ratio = 1.8; + + for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { + Q_RATIO(i) = pow(base_ratio, i/3.0); + } +} + +void vp9_vaq_frame_setup(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *seg = &cm->seg; + int base_q = vp9_convert_qindex_to_q(cm->base_qindex); + int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + + cm->y_dc_delta_q); + int i; + + vp9_enable_segmentation((VP9_PTR)cpi); + vp9_clearall_segfeatures(seg); + + seg->abs_delta = SEGMENT_DELTADATA; + + vp9_clear_system_state(); // __asm emms; + + for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { + int qindex_delta, segment_rdmult; + + if (Q_RATIO(i) == 1) { + // No need to enable SEG_LVL_ALT_Q for this segment + RDMULT_RATIO(i) = 1; + continue; + } + + qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i)); + vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); + vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); + + segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta + + cm->y_dc_delta_q); + + RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult; + } +} + + +static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs) { + MACROBLOCKD *xd = &x->e_mbd; + unsigned int var, sse; + int right_overflow = (xd->mb_to_right_edge < 0) ? + ((-xd->mb_to_right_edge) >> 3) : 0; + int bottom_overflow = (xd->mb_to_bottom_edge < 0) ? 
+ ((-xd->mb_to_bottom_edge) >> 3) : 0; + + if (right_overflow || bottom_overflow) { + const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow; + const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow; + int avg; + variance(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, bw, bh, &sse, &avg); + var = sse - (((int64_t)avg * avg) / (bw * bh)); + return (256 * var) / (bw * bh); + } else { + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, + x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); + return (256 * var) >> num_pels_log2_lookup[bs]; + } +} + +int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { + double energy; + unsigned int var = block_variance(cpi, x, bs); + + vp9_clear_system_state(); // __asm emms; + + // if (var <= 1000) + // return 0; + + energy = 0.9*(logf(var + 1) - 10.0); + return clamp(round(energy), ENERGY_MIN, ENERGY_MAX); +} diff --git a/libvpx/vp9/encoder/vp9_vaq.h b/libvpx/vp9/encoder/vp9_vaq.h new file mode 100644 index 0000000..dc18b22 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_vaq.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_ENCODER_VP9_CONFIG_VAQ_H_ +#define VP9_ENCODER_VP9_CONFIG_VAQ_H_ + +#include "vp9/encoder/vp9_onyx_int.h" + +unsigned int vp9_vaq_segment_id(int energy); +double vp9_vaq_rdmult_ratio(int energy); +double vp9_vaq_inv_q_ratio(int energy); + +void vp9_vaq_init(); +void vp9_vaq_frame_setup(VP9_COMP *cpi); + +int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); + +#endif // VP9_ENCODER_VP9_CONFIG_VAQ_H_ diff --git a/libvpx/vp9/encoder/vp9_variance.h b/libvpx/vp9/encoder/vp9_variance.h index 6e686d6..2ded97c 100644 --- a/libvpx/vp9/encoder/vp9_variance.h +++ b/libvpx/vp9/encoder/vp9_variance.h @@ -14,6 +14,15 @@ #include "vpx/vpx_integer.h" // #include "./vpx_config.h" +void variance(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + int w, + int h, + unsigned int *sse, + int *sum); + typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -67,12 +76,6 @@ typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr, unsigned int *sse, const uint8_t *second_pred); -typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r, - int rp, unsigned long *sum_s, - unsigned long *sum_r, unsigned long *sum_sq_s, - unsigned long *sum_sq_r, - unsigned long *sum_sxr); - typedef unsigned int (*vp9_getmbss_fn_t)(const short *); typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr, diff --git a/libvpx/vp9/encoder/vp9_variance_c.c b/libvpx/vp9/encoder/vp9_variance_c.c index 155ba8a..8bc3850 100644 --- a/libvpx/vp9/encoder/vp9_variance_c.c +++ b/libvpx/vp9/encoder/vp9_variance_c.c @@ -8,13 +8,150 @@ * be found in the AUTHORS file in the root of the source tree. 
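// Sketch: the VAQ mapping defined above. vp9_block_energy compresses
// block variance into the integer range [ENERGY_MIN, ENERGY_MAX] via
// 0.9 * (ln(var + 1) - 10), and vp9_vaq_init spaces the per-segment
// q ratios geometrically as 1.8^(energy/3). Quick numeric check:
#include <math.h>
#include <stdio.h>

int main(void) {
  int i;
  for (i = -3; i <= 3; ++i)
    printf("energy %+d -> q ratio %.3f\n", i, pow(1.8, i / 3.0));
  {
    const double var = 5000.0;  // hypothetical block variance
    const double e = 0.9 * (log(var + 1.0) - 10.0);
    printf("var %.0f -> energy %.2f (rounds/clamps to -1)\n", var, e);
  }
  return 0;
}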
*/ +#include "./vp9_rtcd.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_filter.h" -#include "vp9/common/vp9_subpelvar.h" -#include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" + +#include "vp9/encoder/vp9_variance.h" + +void variance(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + int w, + int h, + unsigned int *sse, + int *sum) { + int i, j; + int diff; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + diff = src_ptr[j] - ref_ptr[j]; + *sum += diff; + *sse += diff * diff; + } + + src_ptr += source_stride; + ref_ptr += recon_stride; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_first_pass + * + * INPUTS : uint8_t *src_ptr : Pointer to source block. + * uint32_t src_pixels_per_line : Stride of input block. + * uint32_t pixel_step : Offset between filter input + * samples (see notes). + * uint32_t output_height : Input block height. + * uint32_t output_width : Input block width. + * int32_t *vp9_filter : Array of 2 bi-linear filter + * taps. + * + * OUTPUTS : int32_t *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement first-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. + * Two filter taps should sum to VP9_FILTER_WEIGHT. + * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step= + * stride). + * It defines the offset required to move from one input + * to the next. + * + ****************************************************************************/ +static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + unsigned int i, j; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_second_pass + * + * INPUTS : int32_t *src_ptr : Pointer to source block. + * uint32_t src_pixels_per_line : Stride of input block. + * uint32_t pixel_step : Offset between filter input + * samples (see notes). + * uint32_t output_height : Input block height. + * uint32_t output_width : Input block width. + * int32_t *vp9_filter : Array of 2 bi-linear filter + * taps. + * + * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in + * either horizontal or vertical direction to produce the + * filtered output block. Used to implement second-pass + * of 2-D separable filter. + * + * SPECIAL NOTES : Requires 32-bit input as produced by + * filter_block2d_bil_first_pass. + * Two filter taps should sum to VP9_FILTER_WEIGHT. 
+ * pixel_step defines whether the filter is applied + * horizontally (pixel_step=1) or vertically (pixel_step= + * stride). + * It defines the offset required to move from one input + * to the next. + * + ****************************************************************************/ +static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + unsigned int i, j; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + src_ptr++; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; diff --git a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c index 95ae266..2d59775 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c @@ -27,32 +27,14 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); return _mm_unpacklo_epi64(buf0, buf1); } - -static INLINE __m128i k_cvtlo_epi16(__m128i a, __m128i mask16, __m128i kZero) { - // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers - __m128i sign_bit = _mm_and_si128(a, mask16); - __m128i b = _mm_unpacklo_epi16(a, kZero); - sign_bit = _mm_cmplt_epi16(sign_bit, kZero); - sign_bit = _mm_unpacklo_epi16(kZero, sign_bit); - return _mm_or_si128(sign_bit, b); -} - -static INLINE __m128i k_cvthi_epi16(__m128i a, __m128i mask16, __m128i kZero) { - // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers - __m128i sign_bit = _mm_and_si128(a, mask16); - __m128i b = _mm_unpackhi_epi16(a, kZero); - sign_bit = _mm_cmplt_epi16(sign_bit, kZero); - sign_bit = _mm_unpackhi_epi16(kZero, sign_bit); - return _mm_or_si128(sign_bit, b); -} #endif -void FDCT32x32_2D(int16_t *input, - int16_t *output_org, int pitch) { +void FDCT32x32_2D(const int16_t *input, + int16_t *output_org, int stride) { // Calculate pre-multiplied strides - const int str1 = pitch >> 1; - const int str2 = pitch; - const int str3 = pitch + str1; + const int str1 = stride; + const int str2 = 2 * stride; + const int str3 = 2 * stride + str1; // We need an intermediate buffer between passes. DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); // Constants @@ -111,13 +93,13 @@ void FDCT32x32_2D(int16_t *input, // Note: even though all the loads below are aligned, using the aligned // intrinsic make the code slightly slower. if (0 == pass) { - int16_t *in = &input[column_start]; + const int16_t *in = &input[column_start]; // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; // Note: the next four blocks could be in a loop. That would help the // instruction cache but is actually slower. 
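// Sketch: the two var_filter_block2d_bil_* passes above form a
// separable 2-tap bilinear interpolator; each output sample is
// ROUND_POWER_OF_TWO(s[0]*f[0] + s[pixel_step]*f[1], FILTER_BITS),
// applied horizontally (pixel_step == 1) then vertically (pixel_step ==
// row width). One tap pair in isolation, FILTER_BITS == 7 as in vp9,
// with illustrative taps:
#include <stdint.h>
#include <stdio.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  const uint8_t s[2] = { 100, 120 };  // two neighboring samples
  const int16_t f[2] = { 96, 32 };    // illustrative taps, sum == 128
  const int out = ROUND_POWER_OF_TWO(s[0] * f[0] + s[1] * f[1],
                                     FILTER_BITS);
  printf("filtered sample: %d\n", out);  // (9600 + 3840 + 64) >> 7 == 105
  return 0;
}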
{ - int16_t *ina = in + 0 * str1; - int16_t *inb = in + 31 * str1; + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; __m128i *step1a = &step1[ 0]; __m128i *step1b = &step1[31]; const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); @@ -146,8 +128,8 @@ void FDCT32x32_2D(int16_t *input, step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - int16_t *ina = in + 4 * str1; - int16_t *inb = in + 27 * str1; + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; __m128i *step1a = &step1[ 4]; __m128i *step1b = &step1[27]; const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); @@ -176,8 +158,8 @@ void FDCT32x32_2D(int16_t *input, step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - int16_t *ina = in + 8 * str1; - int16_t *inb = in + 23 * str1; + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; __m128i *step1a = &step1[ 8]; __m128i *step1b = &step1[23]; const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); @@ -206,8 +188,8 @@ void FDCT32x32_2D(int16_t *input, step1b[-0] = _mm_slli_epi16(step1b[-0], 2); } { - int16_t *ina = in + 12 * str1; - int16_t *inb = in + 19 * str1; + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; __m128i *step1a = &step1[12]; __m128i *step1b = &step1[19]; const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); @@ -1159,28 +1141,43 @@ void FDCT32x32_2D(int16_t *input, } else { __m128i lstep1[64], lstep2[64], lstep3[64]; __m128i u[32], v[32], sign[16]; - const __m128i mask16 = _mm_set1_epi32(0x80008000); const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); // start using 32-bit operations // stage 3 { // expanding to 32-bit length priori to addition operations - lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero); - lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero); - lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero); - lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero); - lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero); - lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero); - lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero); - lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero); - lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero); - lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero); - lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero); - lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero); - lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero); - lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero); - lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero); - lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero); + lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero); + lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero); + lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero); + lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero); + lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero); + lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero); + lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero); + lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero); + lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero); + lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero); + lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero); + lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero); + lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero); + lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero); + lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero); + lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero); + lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne); + lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], 
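// Sketch: the unpack+madd widening used throughout this hunk replaces
// the old k_cvtlo/k_cvthi mask-and-or conversion. _mm_unpacklo_epi16(x,
// kZero) puts pairs (x_i, 0) in each 32-bit lane, and _mm_madd_epi16
// with a vector of 16-bit ones computes x_i*1 + 0*1 as a signed
// 16x16->32 multiply-add, i.e. a sign-correct widening of x_i.
// Minimal SSE2 check:
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  const __m128i kZero = _mm_setzero_si128();
  const __m128i kOne = _mm_set1_epi16(1);
  const __m128i x = _mm_set1_epi16(-5);             // negative 16-bit lanes
  const __m128i lo = _mm_unpacklo_epi16(x, kZero);  // (-5, 0) pairs
  const __m128i w = _mm_madd_epi16(lo, kOne);       // -5*1 + 0*1 per lane
  int out[4];
  _mm_storeu_si128((__m128i *)out, w);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // -5 -5 -5 -5
  return 0;
}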
kOne); + lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne); + lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne); + lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne); + lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne); + lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne); + lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne); + lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne); + lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne); + lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); + lstep2[11] = _mm_madd_epi16(lstep2[11], kOne); + lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); + lstep2[13] = _mm_madd_epi16(lstep2[13], kOne); + lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); + lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); @@ -1231,42 +1228,75 @@ void FDCT32x32_2D(int16_t *input, lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); } { - lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero); - lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero); - lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero); - lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero); - lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero); - lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero); - lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero); - lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero); - lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero); - lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero); - lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero); - lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero); - lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero); - lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero); - lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero); - lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero); - - lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero); - lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero); - lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero); - lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero); - lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero); - lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero); - lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero); - lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero); - lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero); - lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero); - lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero); - lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero); - lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero); - lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero); - lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero); - lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero); + lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero); + lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero); + lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero); + lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero); + lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero); + lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero); + lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero); + lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero); + lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero); + lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero); + lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero); + lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero); + lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero); + lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero); + lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero); + lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero); 
+ lstep2[40] = _mm_madd_epi16(lstep2[40], kOne); + lstep2[41] = _mm_madd_epi16(lstep2[41], kOne); + lstep2[42] = _mm_madd_epi16(lstep2[42], kOne); + lstep2[43] = _mm_madd_epi16(lstep2[43], kOne); + lstep2[44] = _mm_madd_epi16(lstep2[44], kOne); + lstep2[45] = _mm_madd_epi16(lstep2[45], kOne); + lstep2[46] = _mm_madd_epi16(lstep2[46], kOne); + lstep2[47] = _mm_madd_epi16(lstep2[47], kOne); + lstep2[48] = _mm_madd_epi16(lstep2[48], kOne); + lstep2[49] = _mm_madd_epi16(lstep2[49], kOne); + lstep2[50] = _mm_madd_epi16(lstep2[50], kOne); + lstep2[51] = _mm_madd_epi16(lstep2[51], kOne); + lstep2[52] = _mm_madd_epi16(lstep2[52], kOne); + lstep2[53] = _mm_madd_epi16(lstep2[53], kOne); + lstep2[54] = _mm_madd_epi16(lstep2[54], kOne); + lstep2[55] = _mm_madd_epi16(lstep2[55], kOne); + + lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero); + lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero); + lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero); + lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero); + lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero); + lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero); + lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero); + lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero); + lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero); + lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero); + lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero); + lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero); + lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero); + lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero); + lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero); + lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero); + lstep1[32] = _mm_madd_epi16(lstep1[32], kOne); + lstep1[33] = _mm_madd_epi16(lstep1[33], kOne); + lstep1[34] = _mm_madd_epi16(lstep1[34], kOne); + lstep1[35] = _mm_madd_epi16(lstep1[35], kOne); + lstep1[36] = _mm_madd_epi16(lstep1[36], kOne); + lstep1[37] = _mm_madd_epi16(lstep1[37], kOne); + lstep1[38] = _mm_madd_epi16(lstep1[38], kOne); + lstep1[39] = _mm_madd_epi16(lstep1[39], kOne); + lstep1[56] = _mm_madd_epi16(lstep1[56], kOne); + lstep1[57] = _mm_madd_epi16(lstep1[57], kOne); + lstep1[58] = _mm_madd_epi16(lstep1[58], kOne); + lstep1[59] = _mm_madd_epi16(lstep1[59], kOne); + lstep1[60] = _mm_madd_epi16(lstep1[60], kOne); + lstep1[61] = _mm_madd_epi16(lstep1[61], kOne); + lstep1[62] = _mm_madd_epi16(lstep1[62], kOne); + lstep1[63] = _mm_madd_epi16(lstep1[63], kOne); lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); + lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); @@ -1302,14 +1332,22 @@ void FDCT32x32_2D(int16_t *input, // stage 4 { // expanding to 32-bit length priori to addition operations - lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero); - lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero); - lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero); - lstep2[19] = k_cvthi_epi16(step2[ 9], mask16, kZero); - lstep2[28] = k_cvtlo_epi16(step2[14], mask16, kZero); - lstep2[29] = k_cvthi_epi16(step2[14], mask16, kZero); - lstep2[30] = k_cvtlo_epi16(step2[15], mask16, kZero); - lstep2[31] = k_cvthi_epi16(step2[15], mask16, kZero); + lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero); + lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero); + lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero); + lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero); + lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); + lstep2[29] = 
_mm_unpackhi_epi16(step2[14], kZero); + lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); + lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero); + lstep2[16] = _mm_madd_epi16(lstep2[16], kOne); + lstep2[17] = _mm_madd_epi16(lstep2[17], kOne); + lstep2[18] = _mm_madd_epi16(lstep2[18], kOne); + lstep2[19] = _mm_madd_epi16(lstep2[19], kOne); + lstep2[28] = _mm_madd_epi16(lstep2[28], kOne); + lstep2[29] = _mm_madd_epi16(lstep2[29], kOne); + lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); + lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]); lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]); @@ -1337,41 +1375,41 @@ void FDCT32x32_2D(int16_t *input, lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); } { - // to be continued... - // - const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); - const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); - - u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); - u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); - u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); - u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); - - // TODO(jingning): manually inline k_madd_epi32_ to further hide - // instruction latency. - v[ 0] = k_madd_epi32(u[0], k32_p16_m16); - v[ 1] = k_madd_epi32(u[1], k32_p16_m16); - v[ 2] = k_madd_epi32(u[2], k32_p16_m16); - v[ 3] = k_madd_epi32(u[3], k32_p16_m16); - v[ 4] = k_madd_epi32(u[0], k32_p16_p16); - v[ 5] = k_madd_epi32(u[1], k32_p16_p16); - v[ 6] = k_madd_epi32(u[2], k32_p16_p16); - v[ 7] = k_madd_epi32(u[3], k32_p16_p16); - - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - - lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + // to be continued... + // + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. 
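// Sketch: the madd / add-rounding / arithmetic-shift sequence above is
// the usual fixed-point butterfly y = round((a*c0 + b*c1) / 2^14), with
// the cospi constants in Q14. Scalar equivalent (cospi_16_64 is 11585
// in vp9, i.e. round(cos(pi/4) * 2^14)):
#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

static int32_t dct_const_round_shift(int64_t input) {
  return (int32_t)((input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}

int main(void) {
  const int cospi_16_64 = 11585;
  const int a = 100, b = 50;  // arbitrary inputs to one butterfly
  printf("%d\n", dct_const_round_shift((int64_t)(a + b) * cospi_16_64));
  return 0;  // prints 106
}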
+ v[ 0] = k_madd_epi32(u[0], k32_p16_m16); + v[ 1] = k_madd_epi32(u[1], k32_p16_m16); + v[ 2] = k_madd_epi32(u[2], k32_p16_m16); + v[ 3] = k_madd_epi32(u[3], k32_p16_m16); + v[ 4] = k_madd_epi32(u[0], k32_p16_p16); + v[ 5] = k_madd_epi32(u[1], k32_p16_p16); + v[ 6] = k_madd_epi32(u[2], k32_p16_p16); + v[ 7] = k_madd_epi32(u[3], k32_p16_p16); + + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); } { const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); @@ -2647,4 +2685,4 @@ void FDCT32x32_2D(int16_t *input, } } } -} +} // NOLINT diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index eb271fe..dc11501 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -12,14 +12,13 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" -void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { +void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we tranpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). - const int stride = pitch >> 1; int pass; // Constants // When we use them, in one case, they are all the same. 
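// Sketch: the fdct functions in this file all use the separable
// decomposition described in the comment above vp9_fdct4x4_sse2: run a
// 1-D transform along one axis, transpose, run it again, transpose
// back. Control-flow skeleton with a stand-in sum/difference stage in
// place of the real DCT butterflies:
#include <stdint.h>
#include <stdio.h>

static void stage_1d(int16_t m[4][4]) {  // stand-in 1-D transform per row
  int r;
  for (r = 0; r < 4; ++r) {
    const int16_t a = m[r][0] + m[r][3], b = m[r][1] + m[r][2];
    const int16_t c = m[r][1] - m[r][2], d = m[r][0] - m[r][3];
    m[r][0] = a + b; m[r][2] = a - b; m[r][1] = d; m[r][3] = c;
  }
}

static void transpose4(int16_t m[4][4]) {
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = r + 1; c < 4; ++c) {
      const int16_t t = m[r][c]; m[r][c] = m[c][r]; m[c][r] = t;
    }
}

int main(void) {
  int16_t m[4][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 },
                      { 9, 10, 11, 12 }, { 13, 14, 15, 16 } };
  stage_1d(m); transpose4(m);  // pass 1: results left transposed
  stage_1d(m); transpose4(m);  // pass 2: back in row order
  printf("DC-like term: %d\n", m[0][0]);  // sum of all inputs: 136
  return 0;
}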
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index eb271fe..dc11501 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,14 +12,13 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
 
-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
   // as the first pass results are transposed, we tranpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
-  const int stride = pitch >> 1;
   int pass;
   // Constants
   //    When we use them, in one case, they are all the same. In all others
@@ -112,12 +111,8 @@ void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
-void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {
-  vp9_short_fdct4x4_sse2(input, output, pitch);
-  vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
-}
-
-static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride) {
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
   __m128i mask;
@@ -171,22 +166,21 @@ static INLINE void transpose_4x4(__m128i *res) {
 void fdct4_1d_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   __m128i u[4], v[4];
 
-  u[0] = _mm_add_epi16(in[0], in[3]);
-  u[1] = _mm_add_epi16(in[1], in[2]);
-  u[2] = _mm_sub_epi16(in[1], in[2]);
-  u[3] = _mm_sub_epi16(in[0], in[3]);
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[3], in[2]);
+
+  v[0] = _mm_add_epi16(u[0], u[1]);
+  v[1] = _mm_sub_epi16(u[0], u[1]);
 
-  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
-  v[1] = _mm_unpacklo_epi16(u[2], u[3]);
   u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
   u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
-  u[2] = _mm_madd_epi16(v[1], k__cospi_p24_p08);  // 1
-  u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24);  // 3
+  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
 
   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
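Note (not part of the patch): the fdct4_1d_sse2 rewrite interleaves the inputs first, so the odd rotation now sees (in0 - in3, in1 - in2) in that lane order rather than (in1 - in2, in0 - in3); that is why the cosine pairs flip from (p24, p08)/(m08, p24) to (p08, p24)/(p24, m08). A rough scalar reference of the rotation being computed (helper name invented, constants per vp9/common/vp9_idct.h):

static const int cospi_8_64  = 15137;  /* 16384 * cos( 8 * pi / 64) */
static const int cospi_24_64 = 6270;   /* 16384 * cos(24 * pi / 64) */

static void fdct4_ref(const int16_t in[4], int16_t out[4]) {
  const int s0 = in[0] + in[3], s1 = in[1] + in[2];   /* even half */
  const int s2 = in[1] - in[2], s3 = in[0] - in[3];   /* odd half */
  out[0] = (int16_t)(((s0 + s1) * cospi_16_64 + DCT_CONST_ROUNDING)
                     >> DCT_CONST_BITS);
  out[2] = (int16_t)(((s0 - s1) * cospi_16_64 + DCT_CONST_ROUNDING)
                     >> DCT_CONST_BITS);
  /* With operands ordered (s3, s2), the pairs (cospi_8, cospi_24) and
   * (cospi_24, -cospi_8) produce the same outputs as the old code. */
  out[1] = (int16_t)((s3 * cospi_8_64 + s2 * cospi_24_64 + DCT_CONST_ROUNDING)
                     >> DCT_CONST_BITS);
  out[3] = (int16_t)((s3 * cospi_24_64 - s2 * cospi_8_64 + DCT_CONST_ROUNDING)
                     >> DCT_CONST_BITS);
}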
@@ -249,7 +243,7 @@ void fadst4_1d_sse2(__m128i *in) {
   transpose_4x4(in);
 }
 
-void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
                            int stride, int tx_type) {
   __m128i in[4];
   load_buffer_4x4(input, in, stride);
@@ -277,8 +271,7 @@ void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
   write_buffer_4x4(output, in);
 }
 
-void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int stride = pitch >> 1;
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
   int pass;
   // Constants
   //    When we use them, in one case, they are all the same. In all others
@@ -535,15 +528,16 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
 }
 
 // load 8x8 array
-static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
-  in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
-  in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
-  in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
-  in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
-  in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
-  in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
-  in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
-  in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+                                   int stride) {
+  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
 
   in[0] = _mm_slli_epi16(in[0], 2);
   in[1] = _mm_slli_epi16(in[1], 2);
@@ -1033,7 +1027,7 @@ void fadst8_1d_sse2(__m128i *in) {
   array_transpose_8x8(in, in);
 }
 
-void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
                            int stride, int tx_type) {
   __m128i in[8];
   load_buffer_8x8(input, in, stride);
@@ -1062,18 +1056,17 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
   write_buffer_8x8(output, in, 8);
 }
 
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
   // as the first pass results are transposed, we tranpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
-  const int stride = pitch >> 1;
   int pass;
   // We need an intermediate buffer between passes.
   DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
-  int16_t *in = input;
+  const int16_t *in = input;
   int16_t *out = intermediate;
   // Constants
   //    When we use them, in one case, they are all the same. In all others
@@ -1688,7 +1681,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
-static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
                                      __m128i *in1, int stride) {
   // load first 8 columns
   load_buffer_8x8(input, in0, stride);
@@ -2540,7 +2533,7 @@ void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
 }
 
-void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
                              int stride, int tx_type) {
   __m128i in0[16], in1[16];
   load_buffer_16x16(input, in0, in1, stride);
@@ -2572,13 +2565,13 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
   write_buffer_16x16(output, in0, in1, 16);
 }
 
-#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
+#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
 #define FDCT32x32_HIGH_PRECISION 0
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
 #undef FDCT32x32_2D
 #undef FDCT32x32_HIGH_PRECISION
 
-#define FDCT32x32_2D vp9_short_fdct32x32_sse2
+#define FDCT32x32_2D vp9_fdct32x32_sse2
 #define FDCT32x32_HIGH_PRECISION 1
 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
 #undef FDCT32x32_2D
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
index d141560..a3d0114 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c
@@ -8,12 +8,12 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/common/vp9_pragmas.h"
 #include "vpx_ports/mem.h"
 
-extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp9_get_mb_ss_mmx(const int16_t *src_ptr);
 extern unsigned int vp9_get8x8var_mmx
 (
   const unsigned char *src_ptr,
@@ -45,7 +45,6 @@ unsigned int vp9_variance4x4_mmx(
   vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
   *sse = var;
   return (var - (((unsigned int)avg * avg) >> 4));
-
 }
 
 unsigned int vp9_variance8x8_mmx(
@@ -61,7 +60,6 @@ unsigned int vp9_variance8x8_mmx(
   *sse = var;
 
   return (var - (((unsigned int)avg * avg) >> 6));
-
 }
 
 unsigned int vp9_mse16x16_mmx(
@@ -74,10 +72,14 @@ unsigned int vp9_mse16x16_mmx(
   int sum0, sum1, sum2, sum3;
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+                    &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+                    ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+                    ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
 
   var = sse0 + sse1 + sse2 + sse3;
   *sse = var;
@@ -94,11 +96,14 @@ unsigned int vp9_variance16x16_mmx(
   unsigned int sse0, sse1, sse2, sse3, var;
   int sum0, sum1, sum2, sum3, avg;
 
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+                    &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+                    ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+                    ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
 
   var = sse0 + sse1 + sse2 + sse3;
   avg = sum0 + sum1 + sum2 + sum3;
@@ -115,14 +120,15 @@ unsigned int vp9_variance16x8_mmx(
   unsigned int sse0, sse1, var;
   int sum0, sum1, avg;
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+                    &sse1, &sum1);
 
   var = sse0 + sse1;
   avg = sum0 + sum1;
   *sse = var;
 
   return (var - (((unsigned int)avg * avg) >> 7));
-
 }
 
 
@@ -135,13 +141,14 @@ unsigned int vp9_variance8x16_mmx(
   unsigned int sse0, sse1, var;
   int sum0, sum1, avg;
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                    &sum0);
+  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+                    ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
 
   var = sse0 + sse1;
   avg = sum0 + sum1;
   *sse = var;
 
   return (var - (((unsigned int)avg * avg) >> 7));
-
 }
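Note (not part of the patch): each vp9_varianceWxH_mmx wrapper above combines per-8x8 partial results as variance = sse - sum^2 / (w * h), with the division done as a shift because w * h is a power of two (hence >> 4 for 4x4, >> 6 for 8x8, >> 7 for 16x8 and 8x16). A rough scalar model of what the assembly helpers plus wrapper compute (helper name invented):

static unsigned int variance_ref(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 int w, int h, int shift /* log2(w * h) */,
                                 unsigned int *sse) {
  unsigned int sq = 0;
  int sum = 0, i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += d;          /* sum of differences */
      sq += d * d;       /* sum of squared differences */
    }
  }
  *sse = sq;
  /* Same cast-then-multiply idiom as the source above. */
  return sq - (((unsigned int)sum * sum) >> shift);
}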
diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index cea934d..79e42c4 100644
--- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -8,7 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/common/vp9_pragmas.h"
@@ -26,7 +26,7 @@ extern unsigned int vp9_get4x4var_mmx
 
 unsigned int vp9_get_mb_ss_sse2
 (
-  const short *src_ptr
+  const int16_t *src_ptr
 );
 unsigned int vp9_get16x16var_sse2
 (
@@ -250,7 +250,6 @@ unsigned int vp9_mse16x16_sse2(
   const unsigned char *ref_ptr,
   int recon_stride,
   unsigned int *sse) {
-
   unsigned int sse0;
   int sum0;
 
   vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
@@ -407,12 +406,12 @@ FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1,); \
-FN(8, 16, 8, 3, 4, opt1,); \
-FN(8, 8, 8, 3, 3, opt1,); \
-FN(8, 4, 8, 3, 2, opt1,); \
-FN(4, 8, 4, 2, 3, opt2,); \
-FN(4, 4, 4, 2, 2, opt2,)
+FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
+FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
+FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
+FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
+FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
+FN(4, 4, 4, 2, 2, opt2, (unsigned int))
 
 FNS(sse2, sse);
 FNS(ssse3, ssse3);
@@ -487,12 +486,12 @@ FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1,); \
-FN(8, 16, 8, 3, 4, opt1,); \
-FN(8, 8, 8, 3, 3, opt1,); \
-FN(8, 4, 8, 3, 2, opt1,); \
-FN(4, 8, 4, 2, 3, opt2,); \
-FN(4, 4, 4, 2, 2, opt2,)
+FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
+FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
+FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
+FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
+FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
+FN(4, 4, 4, 2, 2, opt2, (unsigned int))
 
 FNS(sse2, sse);
 FNS(ssse3, ssse3);
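Note (not part of the patch): the last FN() argument casts the sum^2 correction term, and the bound |sum| <= 255 * w * h explains the per-size choice. For a 16x16 block, (255 * 256)^2 = 4,261,478,400, which still fits in 32 unsigned bits (2^32 = 4,294,967,296), so (unsigned int) suffices there and for everything smaller; for 32x32 and wider, (255 * 1024)^2 is roughly 6.8e10, which overflows 32 bits and needs the (int64_t) cast kept on the larger sizes.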
diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk
index 687fb48..0badb08 100644
--- a/libvpx/vp9/vp9_common.mk
+++ b/libvpx/vp9/vp9_common.mk
@@ -48,7 +48,6 @@ VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
 VP9_COMMON_SRCS-yes += common/vp9_rtcd.c
 VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh
 VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h
-VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
 VP9_COMMON_SRCS-yes += common/vp9_scale.h
 VP9_COMMON_SRCS-yes += common/vp9_scale.c
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
@@ -69,13 +68,17 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
 VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
 VP9_COMMON_SRCS-yes += common/vp9_common_data.c
 VP9_COMMON_SRCS-yes += common/vp9_common_data.h
+VP9_COMMON_SRCS-yes += common/vp9_scan.c
+VP9_COMMON_SRCS-yes += common/vp9_scan.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
@@ -88,11 +91,28 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
 endif
 
+# common (c)
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_vert_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_dspr2.c
+
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
@@ -109,5 +129,6 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_save_reg_neon$(ASM)
 
 $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh))
diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c
index 48866d2..1942039 100644
--- a/libvpx/vp9/vp9_cx_iface.c
+++ b/libvpx/vp9/vp9_cx_iface.c
@@ -8,30 +8,30 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <stdlib.h>
+#include <string.h>
 #include "vpx/vpx_codec.h"
 #include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
+#include "./vpx_version.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vpx/vp8cx.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/common/vp9_onyx.h"
 #include "vp9/vp9_iface_common.h"
-#include <stdlib.h>
-#include <string.h>
 
 struct vp9_extracfg {
   struct vpx_codec_pkt_list *pkt_list;
-  int cpu_used;  /** available cpu percentage in 1/16*/
-  unsigned int enable_auto_alt_ref;  /** if encoder decides to uses alternate reference frame */
+  int cpu_used;  /* available cpu percentage in 1/16 */
+  unsigned int enable_auto_alt_ref;
   unsigned int noise_sensitivity;
   unsigned int Sharpness;
   unsigned int static_thresh;
   unsigned int tile_columns;
   unsigned int tile_rows;
-  unsigned int arnr_max_frames;  /* alt_ref Noise Reduction Max Frame Count */
-  unsigned int arnr_strength;  /* alt_ref Noise Reduction Strength */
-  unsigned int arnr_type;  /* alt_ref filter type */
+  unsigned int arnr_max_frames;
+  unsigned int arnr_strength;
+  unsigned int arnr_type;
   unsigned int experimental;
   vp8e_tuning tuning;
   unsigned int cq_level;  /* constrained quality level */
@@ -48,7 +48,7 @@ struct extraconfig_map {
 static const struct extraconfig_map extracfg_map[] = {
   {
     0,
-    {
+    {  // NOLINT
       NULL,
       0,  /* cpu_used */
       1,  /* enable_auto_alt_ref */
@@ -85,11 +85,11 @@ struct vpx_codec_alg_priv {
   uint32_t pending_frame_magnitude;
   vpx_image_t preview_img;
   vp8_postproc_cfg_t preview_ppcfg;
-  vpx_codec_pkt_list_decl(64) pkt_list;  // changed to accomendate the maximum number of lagged frames allowed
+  vpx_codec_pkt_list_decl(64) pkt_list;
   unsigned int fixed_kf_cntr;
 };
 
-static const VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
+static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
   switch (frame) {
     case VP8_LAST_FRAME:
       return VP9_LAST_FLAG;
@@ -120,26 +120,26 @@ update_error_state(vpx_codec_alg_priv_t *ctx,
 #define ERROR(str) do {\
     ctx->base.err_detail = str;\
     return VPX_CODEC_INVALID_PARAM;\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK(p,memb,lo,hi) do {\
-    if(!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
+#define RANGE_CHECK(p, memb, lo, hi) do {\
+    if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
       ERROR(#memb " out of range ["#lo".."#hi"]");\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK_HI(p,memb,hi) do {\
-    if(!((p)->memb <= (hi))) \
+#define RANGE_CHECK_HI(p, memb, hi) do {\
+    if (!((p)->memb <= (hi))) \
      ERROR(#memb " out of range [.."#hi"]");\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK_LO(p,memb,lo) do {\
-    if(!((p)->memb >= (lo))) \
+#define RANGE_CHECK_LO(p, memb, lo) do {\
+    if (!((p)->memb >= (lo))) \
       ERROR(#memb " out of range ["#lo"..]");\
-  } while(0)
+  } while (0)
 
-#define RANGE_CHECK_BOOL(p,memb) do {\
-    if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
-  } while(0)
+#define RANGE_CHECK_BOOL(p, memb) do {\
+    if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+  } while (0)
 
 static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
                                        const vpx_codec_enc_cfg_t *cfg,
@@ -247,7 +247,8 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
   oxcf->width = cfg.g_w;
   oxcf->height = cfg.g_h;
   /* guess a frame rate if out of whack, use 30 */
-  oxcf->framerate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+  oxcf->framerate = (double)(cfg.g_timebase.den)
+                    / (double)(cfg.g_timebase.num);
 
   if (oxcf->framerate > 180) {
     oxcf->framerate = 30;
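Note (not part of the patch): the guess above simply inverts the timebase. For example, g_timebase = {1, 30} gives 30 fps and {1001, 30000} gives about 29.97 fps, while a fine-grained timebase such as {1, 100000} would give 100000 fps, tripping the > 180 sanity check and falling back to 30.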
@@ -255,7 +256,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
 
   switch (cfg.g_pass) {
     case VPX_RC_ONE_PASS:
-      oxcf->Mode = MODE_BESTQUALITY;
+      oxcf->Mode = MODE_GOODQUALITY;
       break;
     case VPX_RC_FIRST_PASS:
       oxcf->Mode = MODE_FIRSTPASS;
@@ -266,25 +267,25 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
   }
 
   if (cfg.g_pass == VPX_RC_FIRST_PASS) {
-    oxcf->allow_lag     = 0;
-    oxcf->lag_in_frames = 0;
+    oxcf->allow_lag = 0;
+    oxcf->lag_in_frames = 0;
   } else {
-    oxcf->allow_lag     = (cfg.g_lag_in_frames) > 0;
-    oxcf->lag_in_frames =  cfg.g_lag_in_frames;
+    oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
+    oxcf->lag_in_frames = cfg.g_lag_in_frames;
   }
 
   // VBR only supported for now.
   // CBR code has been deprectated for experimental phase.
   // CQ mode not yet tested
   oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
-  /*
   if (cfg.rc_end_usage == VPX_CQ)
     oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
-  */
-  if (cfg.rc_end_usage == VPX_Q)
+  else if (cfg.rc_end_usage == VPX_Q)
     oxcf->end_usage = USAGE_CONSTANT_QUALITY;
+  else if (cfg.rc_end_usage == VPX_CBR)
+    oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
 
-  oxcf->target_bandwidth       = cfg.rc_target_bitrate;
+  oxcf->target_bandwidth = cfg.rc_target_bitrate;
   oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
 
   oxcf->best_allowed_q = cfg.rc_min_quantizer;
@@ -299,7 +300,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
   oxcf->starting_buffer_level = cfg.rc_buf_initial_sz;
   oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz;
 
-  oxcf->two_pass_vbrbias        = cfg.rc_2pass_vbr_bias_pct;
+  oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
   oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct;
   oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct;
@@ -315,23 +316,23 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
   oxcf->encode_breakout = vp8_cfg.static_thresh;
   oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
   oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
-  oxcf->Sharpness         = vp8_cfg.Sharpness;
+  oxcf->Sharpness = vp8_cfg.Sharpness;
 
-  oxcf->two_pass_stats_in  = cfg.rc_twopass_stats_in;
-  oxcf->output_pkt_list    = vp8_cfg.pkt_list;
+  oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
+  oxcf->output_pkt_list = vp8_cfg.pkt_list;
 
   oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
-  oxcf->arnr_strength   = vp8_cfg.arnr_strength;
-  oxcf->arnr_type       = vp8_cfg.arnr_type;
+  oxcf->arnr_strength = vp8_cfg.arnr_strength;
+  oxcf->arnr_type = vp8_cfg.arnr_type;
 
   oxcf->tuning = vp8_cfg.tuning;
 
   oxcf->tile_columns = vp8_cfg.tile_columns;
-  oxcf->tile_rows    = vp8_cfg.tile_rows;
+  oxcf->tile_rows = vp8_cfg.tile_rows;
 
   oxcf->lossless = vp8_cfg.lossless;
 
-  oxcf->error_resilient_mode         = cfg.g_error_resilient;
+  oxcf->error_resilient_mode = cfg.g_error_resilient;
   oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode;
 
   oxcf->ss_number_layers = cfg.ss_number_layers;
@@ -441,8 +442,6 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
       MAP(VP8E_SET_ARNR_TYPE,             xcfg.arnr_type);
       MAP(VP8E_SET_TUNING,                xcfg.tuning);
       MAP(VP8E_SET_CQ_LEVEL,              xcfg.cq_level);
-      MAP(VP9E_SET_MAX_Q,                 ctx->cfg.rc_max_quantizer);
-      MAP(VP9E_SET_MIN_Q,                 ctx->cfg.rc_min_quantizer);
      MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
      MAP(VP9E_SET_LOSSLESS,              xcfg.lossless);
      MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode);
@@ -500,7 +499,7 @@ static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx,
      */
     for (i = 0;
          extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-         i++);
+         i++) {}
 
     priv->vp8_cfg = extracfg_map[i].cfg;
     priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
@@ -555,7 +554,6 @@ static vpx_codec_err_t vp9e_exp_init(vpx_codec_ctx_t *ctx,
 
 static vpx_codec_err_t vp9e_destroy(vpx_codec_alg_priv_t *ctx) {
-
   free(ctx->cx_data);
   vp9_remove_compressor(&ctx->cpi);
   free(ctx);
@@ -589,7 +587,8 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
 
 static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {
   uint8_t marker = 0xc0;
-  int mag, mask, index_sz;
+  unsigned int mask;
+  int mag, index_sz;
 
   assert(ctx->pending_frame_count);
   assert(ctx->pending_frame_count <= 8);
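Note (not part of the patch): a rough sketch of the index layout write_superframe_index() emits, per the VP9 superframe format; the helper and buffer names here are invented. The index is one marker byte, then n little-endian frame sizes of mag + 1 bytes each, then the marker byte repeated; the marker is 110xxyyy with xx = mag and yyy = n - 1, which is where the 0xc0 above comes from.

#include <stddef.h>
#include <stdint.h>

static size_t write_superframe_index_sketch(uint8_t *dst,
                                            const uint32_t *sizes, int n) {
  uint8_t marker = 0xc0;  /* 0b110xxyyy: xx = mag, yyy = n - 1; n is 1..8 */
  uint32_t mask;
  int mag, i, j;
  size_t off = 0;
  /* Smallest mag (0..3) such that every frame size fits in mag + 1 bytes. */
  for (mag = 0, mask = 0xff; mag < 3; ++mag, mask = (mask << 8) | 0xff) {
    int fits = 1;
    for (i = 0; i < n; ++i)
      fits &= (sizes[i] & ~mask) == 0;
    if (fits)
      break;
  }
  marker |= (uint8_t)((mag << 3) | (n - 1));
  dst[off++] = marker;
  for (i = 0; i < n; ++i)            /* frame sizes, little-endian */
    for (j = 0; j <= mag; ++j)
      dst[off++] = (uint8_t)(sizes[i] >> (8 * j));
  dst[off++] = marker;               /* marker repeated at the end */
  return off;                        /* index_sz = 2 + n * (mag + 1) */
}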
@@ -713,8 +712,10 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx,
     lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
     /* vp8 use 10,000,000 ticks/second as time stamp */
-    dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
-    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+    dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num
+                     / ctx->cfg.g_timebase.den;
+    dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num /
+                         ctx->cfg.g_timebase.den;
 
     if (img != NULL) {
       res = image2yuvconfig(img, &sd);
@@ -768,7 +769,7 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx,
         }
 
         /* Add the frame packet to the list of returned packets. */
-        round = 1000000 * ctx->cfg.g_timebase.num / 2 - 1;
+        round = (vpx_codec_pts_t)1000000 * ctx->cfg.g_timebase.num / 2 - 1;
         delta = (dst_end_time_stamp - dst_time_stamp);
         pkt.kind = VPX_CODEC_CX_FRAME_PKT;
         pkt.data.frame.pts =
@@ -840,8 +841,6 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx,
           cx_data += size;
           cx_data_sz -= size;
         }
-
-        // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
       }
     }
   }
@@ -868,15 +867,14 @@ static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx,
     vp9_set_reference_enc(ctx->cpi,
                           ref_frame_to_vp9_reframe(frame->frame_type), &sd);
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 }
 
 static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
                                            int ctr_id,
                                            va_list args) {
-
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
   if (data) {
@@ -887,8 +885,9 @@ static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
     vp9_copy_reference_enc(ctx->cpi,
                            ref_frame_to_vp9_reframe(frame->frame_type), &sd);
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
@@ -917,8 +916,9 @@ static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
   if (data) {
     ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
     return VPX_CODEC_OK;
-  } else
+  } else {
    return VPX_CODEC_INVALID_PARAM;
+  }
 #else
   (void)ctx;
   (void)ctr_id;
@@ -929,7 +929,6 @@ static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
 
 static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
-
   YV12_BUFFER_CONFIG sd;
   vp9_ppflags_t flags = {0};
 
@@ -942,8 +941,9 @@ static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
   if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
     yuvconfig2image(&ctx->preview_img, &sd, NULL);
     return &ctx->preview_img;
-  } else
+  } else {
     return NULL;
+  }
 }
 
 static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
@@ -952,7 +952,6 @@ static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
   int update = va_arg(args, int);
   vp9_update_entropy(ctx->cpi, update);
   return VPX_CODEC_OK;
-
 }
 
 static vpx_codec_err_t vp9e_update_reference(vpx_codec_alg_priv_t *ctx,
@@ -974,64 +973,30 @@ static vpx_codec_err_t vp9e_use_reference(vpx_codec_alg_priv_t *ctx,
 
 static vpx_codec_err_t vp9e_set_roi_map(vpx_codec_alg_priv_t *ctx,
                                         int ctr_id, va_list args) {
-  vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
-
-  if (data) {
-    vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
-    if (!vp9_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
-                        roi->delta_q, roi->delta_lf, roi->static_threshold))
-      return VPX_CODEC_OK;
-    else
-      return VPX_CODEC_INVALID_PARAM;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
+  // TODO(yaowu): Need to re-implement and test for VP9.
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id, va_list args) {
-  vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
-
-  if (data) {
-
-    vpx_active_map_t *map = (vpx_active_map_t *)data;
-
-    if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
-      return VPX_CODEC_OK;
-    else
-      return VPX_CODEC_INVALID_PARAM;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
+  // TODO(yaowu): Need to re-implement and test for VP9.
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id, va_list args) {
-
   vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *);
 
   if (data) {
     int res;
     vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
-    res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
-                                scalemode.v_scaling_mode);
-
-    if (!res) {
-      return VPX_CODEC_OK;
-    } else
-      return VPX_CODEC_INVALID_PARAM;
-  } else
-    return VPX_CODEC_INVALID_PARAM;
-}
+    res = vp9_set_internal_size(ctx->cpi,
+                                (VPX_SCALING)scalemode.h_scaling_mode,
+                                (VPX_SCALING)scalemode.v_scaling_mode);
 
-static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id,
-                                      va_list args) {
-  unsigned int *data = va_arg(args, unsigned int *);
-  if (data) {
-    int res;
-    res = vp9_set_size_literal(ctx->cpi, *data, 0);
     if (!res) {
       return VPX_CODEC_OK;
     } else {
@@ -1042,50 +1007,40 @@ static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id,
   }
 }
 
-static vpx_codec_err_t vp9e_set_height(vpx_codec_alg_priv_t *ctx,
-                                       int ctr_id,
-                                       va_list args) {
-  unsigned int *data = va_arg(args, unsigned int *);
+static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id,
+                                    va_list args) {
+  int data = va_arg(args, int);
+  vp9_set_svc(ctx->cpi, data);
+  return VPX_CODEC_OK;
+}
 
-  if (data) {
-    int res;
-    res = vp9_set_size_literal(ctx->cpi, 0, *data);
+static vpx_codec_err_t vp9e_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
+                                               int ctr_id, va_list args) {
+  vpx_svc_parameters_t *data = va_arg(args, vpx_svc_parameters_t *);
+  VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
+  vpx_svc_parameters_t params;
 
-    if (!res) {
-      return VPX_CODEC_OK;
-    } else {
-      return VPX_CODEC_INVALID_PARAM;
-    }
-  } else {
+  if (data == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
-}
-
-static vpx_codec_err_t vp9e_set_layer(vpx_codec_alg_priv_t *ctx,
-                                      int ctr_id,
-                                      va_list args) {
-  unsigned int *data = va_arg(args, unsigned int *);
 
-  if (data) {
-    int res;
-    res = 0;
+  params = *(vpx_svc_parameters_t *)data;
 
-    res = vp9_switch_layer(ctx->cpi, *data);
+  cpi->current_layer = params.layer;
+  cpi->lst_fb_idx = params.lst_fb_idx;
+  cpi->gld_fb_idx = params.gld_fb_idx;
+  cpi->alt_fb_idx = params.alt_fb_idx;
 
-    if (!res) {
-      return VPX_CODEC_OK;
-    } else {
-      return VPX_CODEC_INVALID_PARAM;
-    }
-  } else {
+  if (vp9_set_size_literal(ctx->cpi, params.width, params.height) != 0) {
     return VPX_CODEC_INVALID_PARAM;
   }
-}
 
-static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id,
-                                    va_list args) {
-  int data = va_arg(args, int);
-  vp9_set_svc(ctx->cpi, data);
+  ctx->cfg.rc_max_quantizer = params.max_quantizer;
+  ctx->cfg.rc_min_quantizer = params.min_quantizer;
+
+  set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+  vp9_change_config(ctx->cpi, &ctx->oxcf);
+
   return VPX_CODEC_OK;
 }
 
@@ -1113,23 +1068,19 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = {
   {VP8E_SET_ARNR_TYPE,                set_param},
   {VP8E_SET_TUNING,                   set_param},
   {VP8E_SET_CQ_LEVEL,                 set_param},
-  {VP9E_SET_MAX_Q,                    set_param},
-  {VP9E_SET_MIN_Q,                    set_param},
   {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_param},
   {VP9E_SET_LOSSLESS,                 set_param},
   {VP9E_SET_FRAME_PARALLEL_DECODING,  set_param},
   {VP9_GET_REFERENCE,                 get_reference},
-  {VP9E_SET_WIDTH,                    vp9e_set_width},
-  {VP9E_SET_HEIGHT,                   vp9e_set_height},
-  {VP9E_SET_LAYER,                    vp9e_set_layer},
   {VP9E_SET_SVC,                      vp9e_set_svc},
+  {VP9E_SET_SVC_PARAMETERS,           vp9e_set_svc_parameters},
   { -1, NULL},
 };
 
 static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = {
   {
     0,
-    {
+    {  // NOLINT
       0,  /* g_usage */
      0,  /* g_threads */
      0,  /* g_profile */
@@ -1198,13 +1149,13 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = {
   vp9e_ctf_maps,    /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
   NOT_IMPLEMENTED,  /* vpx_codec_get_mmap_fn_t get_mmap; */
   NOT_IMPLEMENTED,  /* vpx_codec_set_mmap_fn_t set_mmap; */
-  {
+  {  // NOLINT
     NOT_IMPLEMENTED,  /* vpx_codec_peek_si_fn_t peek_si; */
     NOT_IMPLEMENTED,  /* vpx_codec_get_si_fn_t get_si; */
    NOT_IMPLEMENTED,  /* vpx_codec_decode_fn_t decode; */
    NOT_IMPLEMENTED,  /* vpx_codec_frame_get_fn_t frame_get; */
   },
-  {
+  {  // NOLINT
     vp9e_usage_cfg_map,  /* vpx_codec_enc_cfg_map_t peek_si; */
     vp9e_encode,         /* vpx_codec_encode_fn_t encode; */
     vp9e_get_cxdata,     /* vpx_codec_get_cx_data_fn_t frame_get; */
@@ -1227,13 +1178,13 @@ CODEC_INTERFACE(vpx_codec_vp9x_cx) = {
   vp9e_ctf_maps,    /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
   NOT_IMPLEMENTED,  /* vpx_codec_get_mmap_fn_t get_mmap; */
   NOT_IMPLEMENTED,  /* vpx_codec_set_mmap_fn_t set_mmap; */
-  {
+  {  // NOLINT
    NOT_IMPLEMENTED,  /* vpx_codec_peek_si_fn_t peek_si; */
    NOT_IMPLEMENTED,  /* vpx_codec_get_si_fn_t get_si; */
    NOT_IMPLEMENTED,  /* vpx_codec_decode_fn_t decode; */
    NOT_IMPLEMENTED,  /* vpx_codec_frame_get_fn_t frame_get; */
   },
-  {
+  {  // NOLINT
    vp9e_usage_cfg_map,  /* vpx_codec_enc_cfg_map_t peek_si; */
    vp9e_encode,         /* vpx_codec_encode_fn_t encode; */
    vp9e_get_cxdata,     /* vpx_codec_get_cx_data_fn_t frame_get; */
diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c
index 10b3238..5dacab4 100644
--- a/libvpx/vp9/vp9_dx_iface.c
+++ b/libvpx/vp9/vp9_dx_iface.c
@@ -14,7 +14,7 @@
 #include "vpx/vpx_decoder.h"
 #include "vpx/vp8dx.h"
 #include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
+#include "./vpx_version.h"
 #include "vp9/decoder/vp9_onyxd.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
@@ -172,9 +172,9 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data,
     rb.bit_offset += 1;  // show frame
     rb.bit_offset += 1;  // error resilient
 
-    if (vp9_rb_read_literal(&rb, 8) != SYNC_CODE_0 ||
-        vp9_rb_read_literal(&rb, 8) != SYNC_CODE_1 ||
-        vp9_rb_read_literal(&rb, 8) != SYNC_CODE_2) {
+    if (vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_0 ||
+        vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_1 ||
+        vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_2) {
       return VPX_CODEC_UNSUP_BITSTREAM;
     }
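Note (not part of the patch): for a keyframe the bits read before this check (frame marker, profile, show_existing_frame, frame type, show_frame, error_resilient) total eight, so the sync code lands on whole bytes. A minimal byte-level sketch of the same check, without the decoder's bit reader (constant values per vp9/common/vp9_common.h):

#include <stdint.h>

#define VP9_SYNC_CODE_0 0x49
#define VP9_SYNC_CODE_1 0x83
#define VP9_SYNC_CODE_2 0x42

/* Returns nonzero when bytes 1..3 of a keyframe header carry the sync code. */
static int has_vp9_sync_code(const uint8_t *data) {
  return data[1] == VP9_SYNC_CODE_0 &&
         data[2] == VP9_SYNC_CODE_1 &&
         data[3] == VP9_SYNC_CODE_2;
}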
@@ -205,7 +206,6 @@
 static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx,
                                   vpx_codec_stream_info_t *si) {
-
   unsigned int sz;
 
   if (si->sz >= sizeof(vp9_stream_info_t))
@@ -323,15 +322,20 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
     vp9_ppflags_t flags = {0};
 
     if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) {
-      flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
+      flags.post_proc_flag =
 #if CONFIG_POSTPROC_VISUALIZER
-
-                             | ((ctx->dbg_color_ref_frame_flag != 0) ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
-                             | ((ctx->dbg_color_mb_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
-                             | ((ctx->dbg_color_b_modes_flag != 0) ? VP9D_DEBUG_CLR_BLK_MODES : 0)
-                             | ((ctx->dbg_display_mv_flag != 0) ? VP9D_DEBUG_DRAW_MV : 0)
+          ((ctx->dbg_color_ref_frame_flag != 0) ?
+              VP9D_DEBUG_CLR_FRM_REF_BLKS : 0)
+          | ((ctx->dbg_color_mb_modes_flag != 0) ?
+              VP9D_DEBUG_CLR_BLK_MODES : 0)
+          | ((ctx->dbg_color_b_modes_flag != 0) ?
+              VP9D_DEBUG_CLR_BLK_MODES : 0)
+          | ((ctx->dbg_display_mv_flag != 0) ?
+              VP9D_DEBUG_DRAW_MV : 0)
+          |
 #endif
-;
+          ctx->postproc_cfg.post_proc_flag;
+
       flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
       flags.noise_level = ctx->postproc_cfg.noise_level;
 #if CONFIG_POSTPROC_VISUALIZER
@@ -496,8 +500,9 @@ static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
       mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
 
       res = VPX_CODEC_OK;
-    } else
+    } else {
       res = VPX_CODEC_LIST_END;
+    }
   } while (!mmap->sz && res != VPX_CODEC_LIST_END);
 
   return res;
@@ -542,7 +547,6 @@ static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx,
 static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
                                      int ctr_id,
                                      va_list args) {
-
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
   if (data) {
@@ -553,15 +557,14 @@ static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx,
     return vp9_set_reference_dec(ctx->pbi,
                                  (VP9_REFFRAME)frame->frame_type, &sd);
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 }
 
 static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
                                       int ctr_id,
                                       va_list args) {
-
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
   if (data) {
@@ -572,9 +575,9 @@ static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx,
     return vp9_copy_reference_dec(ctx->pbi,
                                   (VP9_REFFRAME)frame->frame_type, &sd);
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 }
 
 static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
@@ -603,9 +606,9 @@ static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx,
     ctx->postproc_cfg_set = 1;
     ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 #else
   return VPX_CODEC_INCAPABLE;
 #endif
@@ -642,25 +645,27 @@ static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
     *update_info = pbi->refresh_frame_flags;
 
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
+  }
 }
 
 static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
                                            int ctrl_id,
                                            va_list args) {
-
   int *corrupted = va_arg(args, int *);
 
   if (corrupted) {
     VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
-    *corrupted = pbi->common.frame_to_show->corrupted;
-
+    if (pbi)
+      *corrupted = pbi->common.frame_to_show->corrupted;
+    else
+      return VPX_CODEC_ERROR;
     return VPX_CODEC_OK;
-  } else
+  } else {
     return VPX_CODEC_INVALID_PARAM;
-
+  }
 }
 
 static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
@@ -699,13 +704,13 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
   ctf_maps,          /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
   vp9_xma_get_mmap,  /* vpx_codec_get_mmap_fn_t get_mmap; */
   vp9_xma_set_mmap,  /* vpx_codec_set_mmap_fn_t set_mmap; */
-  {
+  {  // NOLINT
     vp9_peek_si,    /* vpx_codec_peek_si_fn_t peek_si; */
     vp9_get_si,     /* vpx_codec_get_si_fn_t get_si; */
     vp9_decode,     /* vpx_codec_decode_fn_t decode; */
     vp9_get_frame,  /* vpx_codec_frame_get_fn_t frame_get; */
   },
-  {
+  {  // NOLINT
     /* encoder functions */
     NOT_IMPLEMENTED,
     NOT_IMPLEMENTED,
diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk
index 9fbf100..0993c6c 100644
--- a/libvpx/vp9/vp9cx.mk
+++ b/libvpx/vp9/vp9cx.mk
@@ -20,6 +20,7 @@ VP9_CX_SRCS-yes += vp9_cx_iface.c
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c
 VP9_CX_SRCS-yes += encoder/vp9_dct.c
+VP9_CX_SRCS-yes += encoder/vp9_dct.h
 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
 VP9_CX_SRCS-yes += encoder/vp9_encodeintra.c
@@ -64,6 +65,8 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
 VP9_CX_SRCS-yes += encoder/vp9_variance_c.c
+VP9_CX_SRCS-yes += encoder/vp9_vaq.c
+VP9_CX_SRCS-yes += encoder/vp9_vaq.h
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk
index be3afe8..3a27cdd 100644
--- a/libvpx/vp9/vp9dx.mk
+++ b/libvpx/vp9/vp9dx.mk
@@ -32,12 +32,7 @@ VP9_DX_SRCS-yes += decoder/vp9_thread.c
 VP9_DX_SRCS-yes += decoder/vp9_thread.h
 VP9_DX_SRCS-yes += decoder/vp9_treereader.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
-VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c
-VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
 
 VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
-
-VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
-VP9_DX_SRCS-$(HAVE_NEON) += decoder/arm/neon/vp9_add_constant_residual_neon$(ASM)